From 5009065d38c95455bd2d27c2838313e3dd0c5bc7 Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Thu, 10 Nov 2011 11:32:25 +0200 Subject: [PATCH 01/53] iommu/core: stop converting bytes to page order back and forth Express sizes in bytes rather than in page order, to eliminate the size->order->size conversions we have whenever the IOMMU API is calling the low level drivers' map/unmap methods. Adopt all existing drivers. Signed-off-by: Ohad Ben-Cohen Cc: David Brown Cc: David Woodhouse Cc: Joerg Roedel Cc: Stepan Moskovchenko Cc: KyongHo Cho Cc: Hiroshi DOYU Cc: Laurent Pinchart Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 13 +++++-------- drivers/iommu/intel-iommu.c | 11 ++++------- drivers/iommu/iommu.c | 8 +++++--- drivers/iommu/msm_iommu.c | 19 +++++++------------ drivers/iommu/omap-iommu.c | 14 +++++--------- include/linux/iommu.h | 6 +++--- 6 files changed, 29 insertions(+), 42 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 4ee277a8521a..a3b7072e86e2 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2702,9 +2702,8 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, } static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, - phys_addr_t paddr, int gfp_order, int iommu_prot) + phys_addr_t paddr, size_t page_size, int iommu_prot) { - unsigned long page_size = 0x1000UL << gfp_order; struct protection_domain *domain = dom->priv; int prot = 0; int ret; @@ -2721,13 +2720,11 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, return ret; } -static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, - int gfp_order) +static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, + size_t page_size) { struct protection_domain *domain = dom->priv; - unsigned long page_size, unmap_size; - - page_size = 0x1000UL << gfp_order; + size_t unmap_size; mutex_lock(&domain->api_lock); unmap_size = iommu_unmap_page(domain, iova, page_size); @@ -2735,7 +2732,7 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, domain_flush_tlb_pde(domain); - return get_order(unmap_size); + return unmap_size; } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index c0c7820d4c46..2a165010a1c1 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3979,12 +3979,11 @@ static void intel_iommu_detach_device(struct iommu_domain *domain, static int intel_iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t hpa, - int gfp_order, int iommu_prot) + size_t size, int iommu_prot) { struct dmar_domain *dmar_domain = domain->priv; u64 max_addr; int prot = 0; - size_t size; int ret; if (iommu_prot & IOMMU_READ) @@ -3994,7 +3993,6 @@ static int intel_iommu_map(struct iommu_domain *domain, if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) prot |= DMA_PTE_SNP; - size = PAGE_SIZE << gfp_order; max_addr = iova + size; if (dmar_domain->max_addr < max_addr) { u64 end; @@ -4017,11 +4015,10 @@ static int intel_iommu_map(struct iommu_domain *domain, return ret; } -static int intel_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, int gfp_order) +static size_t intel_iommu_unmap(struct iommu_domain *domain, + unsigned long iova, size_t size) { struct dmar_domain *dmar_domain = domain->priv; - size_t size = PAGE_SIZE << gfp_order; int order; order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT, @@ -4030,7 +4027,7 @@ static int intel_iommu_unmap(struct iommu_domain *domain, if (dmar_domain->max_addr == iova + size) dmar_domain->max_addr = iova; - return order; + return PAGE_SIZE << order; } static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2fb2963df553..7a2953d8f12e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -168,13 +168,13 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, BUG_ON(!IS_ALIGNED(iova | paddr, size)); - return domain->ops->map(domain, iova, paddr, gfp_order, prot); + return domain->ops->map(domain, iova, paddr, size, prot); } EXPORT_SYMBOL_GPL(iommu_map); int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order) { - size_t size; + size_t size, unmapped; if (unlikely(domain->ops->unmap == NULL)) return -ENODEV; @@ -183,6 +183,8 @@ int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order) BUG_ON(!IS_ALIGNED(iova, size)); - return domain->ops->unmap(domain, iova, gfp_order); + unmapped = domain->ops->unmap(domain, iova, size); + + return get_order(unmapped); } EXPORT_SYMBOL_GPL(iommu_unmap); diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 5865dd2e28f9..13718d958da8 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -352,7 +352,7 @@ fail: } static int msm_iommu_map(struct iommu_domain *domain, unsigned long va, - phys_addr_t pa, int order, int prot) + phys_addr_t pa, size_t len, int prot) { struct msm_priv *priv; unsigned long flags; @@ -363,7 +363,6 @@ static int msm_iommu_map(struct iommu_domain *domain, unsigned long va, unsigned long *sl_pte; unsigned long sl_offset; unsigned int pgprot; - size_t len = 0x1000UL << order; int ret = 0, tex, sh; spin_lock_irqsave(&msm_iommu_lock, flags); @@ -463,8 +462,8 @@ fail: return ret; } -static int msm_iommu_unmap(struct iommu_domain *domain, unsigned long va, - int order) +static size_t msm_iommu_unmap(struct iommu_domain *domain, unsigned long va, + size_t len) { struct msm_priv *priv; unsigned long flags; @@ -474,7 +473,6 @@ static int msm_iommu_unmap(struct iommu_domain *domain, unsigned long va, unsigned long *sl_table; unsigned long *sl_pte; unsigned long sl_offset; - size_t len = 0x1000UL << order; int i, ret = 0; spin_lock_irqsave(&msm_iommu_lock, flags); @@ -544,15 +542,12 @@ static int msm_iommu_unmap(struct iommu_domain *domain, unsigned long va, ret = __flush_iotlb(domain); - /* - * the IOMMU API requires us to return the order of the unmapped - * page (on success). - */ - if (!ret) - ret = order; fail: spin_unlock_irqrestore(&msm_iommu_lock, flags); - return ret; + + /* the IOMMU API requires us to return how many bytes were unmapped */ + len = ret ? 0 : len; + return len; } static phys_addr_t msm_iommu_iova_to_phys(struct iommu_domain *domain, diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 8f32b2bf7587..ad80b1d0d099 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1019,12 +1019,11 @@ static void iopte_cachep_ctor(void *iopte) } static int omap_iommu_map(struct iommu_domain *domain, unsigned long da, - phys_addr_t pa, int order, int prot) + phys_addr_t pa, size_t bytes, int prot) { struct omap_iommu_domain *omap_domain = domain->priv; struct omap_iommu *oiommu = omap_domain->iommu_dev; struct device *dev = oiommu->dev; - size_t bytes = PAGE_SIZE << order; struct iotlb_entry e; int omap_pgsz; u32 ret, flags; @@ -1049,19 +1048,16 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da, return ret; } -static int omap_iommu_unmap(struct iommu_domain *domain, unsigned long da, - int order) +static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da, + size_t size) { struct omap_iommu_domain *omap_domain = domain->priv; struct omap_iommu *oiommu = omap_domain->iommu_dev; struct device *dev = oiommu->dev; - size_t unmap_size; - dev_dbg(dev, "unmapping da 0x%lx order %d\n", da, order); + dev_dbg(dev, "unmapping da 0x%lx size %u\n", da, size); - unmap_size = iopgtable_clear_entry(oiommu, da); - - return unmap_size ? get_order(unmap_size) : -EINVAL; + return iopgtable_clear_entry(oiommu, da); } static int diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 432acc4c054d..d5ebf3f4dd53 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -54,9 +54,9 @@ struct iommu_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev); void (*detach_dev)(struct iommu_domain *domain, struct device *dev); int (*map)(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, int gfp_order, int prot); - int (*unmap)(struct iommu_domain *domain, unsigned long iova, - int gfp_order); + phys_addr_t paddr, size_t size, int prot); + size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, + size_t size); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, unsigned long iova); int (*domain_has_cap)(struct iommu_domain *domain, From 7d3002cc8c160dbda0e6ab9cd66dc6eb401b8b70 Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Thu, 10 Nov 2011 11:32:26 +0200 Subject: [PATCH 02/53] iommu/core: split mapping to page sizes as supported by the hardware When mapping a memory region, split it to page sizes as supported by the iommu hardware. Always prefer bigger pages, when possible, in order to reduce the TLB pressure. The logic to do that is now added to the IOMMU core, so neither the iommu drivers themselves nor users of the IOMMU API have to duplicate it. This allows a more lenient granularity of mappings; traditionally the IOMMU API took 'order' (of a page) as a mapping size, and directly let the low level iommu drivers handle the mapping, but now that the IOMMU core can split arbitrary memory regions into pages, we can remove this limitation, so users don't have to split those regions by themselves. Currently the supported page sizes are advertised once and they then remain static. That works well for OMAP and MSM but it would probably not fly well with intel's hardware, where the page size capabilities seem to have the potential to be different between several DMA remapping devices. register_iommu() currently sets a default pgsize behavior, so we can convert the IOMMU drivers in subsequent patches. After all the drivers are converted, the temporary default settings will be removed. Mainline users of the IOMMU API (kvm and omap-iovmm) are adopted to deal with bytes instead of page order. Many thanks to Joerg Roedel for significant review! Signed-off-by: Ohad Ben-Cohen Cc: David Brown Cc: David Woodhouse Cc: Joerg Roedel Cc: Stepan Moskovchenko Cc: KyongHo Cho Cc: Hiroshi DOYU Cc: Laurent Pinchart Cc: kvm@vger.kernel.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 125 +++++++++++++++++++++++++++++++++---- drivers/iommu/omap-iovmm.c | 17 ++--- include/linux/iommu.h | 20 +++++- virt/kvm/iommu.c | 8 +-- 4 files changed, 141 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 7a2953d8f12e..b278458d5816 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -16,6 +16,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "%s: " fmt, __func__ + #include #include #include @@ -47,6 +49,16 @@ int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops) if (bus->iommu_ops != NULL) return -EBUSY; + /* + * Set the default pgsize values, which retain the existing + * IOMMU API behavior: drivers will be called to map + * regions that are sized/aligned to order of 4KiB pages. + * + * This will be removed once all drivers are migrated. + */ + if (!ops->pgsize_bitmap) + ops->pgsize_bitmap = ~0xFFFUL; + bus->iommu_ops = ops; /* Do IOMMU specific setup for this bus-type */ @@ -157,34 +169,125 @@ int iommu_domain_has_cap(struct iommu_domain *domain, EXPORT_SYMBOL_GPL(iommu_domain_has_cap); int iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, int gfp_order, int prot) + phys_addr_t paddr, size_t size, int prot) { - size_t size; + unsigned long orig_iova = iova; + unsigned int min_pagesz; + size_t orig_size = size; + int ret = 0; if (unlikely(domain->ops->map == NULL)) return -ENODEV; - size = PAGE_SIZE << gfp_order; + /* find out the minimum page size supported */ + min_pagesz = 1 << __ffs(domain->ops->pgsize_bitmap); - BUG_ON(!IS_ALIGNED(iova | paddr, size)); + /* + * both the virtual address and the physical one, as well as + * the size of the mapping, must be aligned (at least) to the + * size of the smallest page supported by the hardware + */ + if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx pa 0x%lx size 0x%lx min_pagesz " + "0x%x\n", iova, (unsigned long)paddr, + (unsigned long)size, min_pagesz); + return -EINVAL; + } - return domain->ops->map(domain, iova, paddr, size, prot); + pr_debug("map: iova 0x%lx pa 0x%lx size 0x%lx\n", iova, + (unsigned long)paddr, (unsigned long)size); + + while (size) { + unsigned long pgsize, addr_merge = iova | paddr; + unsigned int pgsize_idx; + + /* Max page size that still fits into 'size' */ + pgsize_idx = __fls(size); + + /* need to consider alignment requirements ? */ + if (likely(addr_merge)) { + /* Max page size allowed by both iova and paddr */ + unsigned int align_pgsize_idx = __ffs(addr_merge); + + pgsize_idx = min(pgsize_idx, align_pgsize_idx); + } + + /* build a mask of acceptable page sizes */ + pgsize = (1UL << (pgsize_idx + 1)) - 1; + + /* throw away page sizes not supported by the hardware */ + pgsize &= domain->ops->pgsize_bitmap; + + /* make sure we're still sane */ + BUG_ON(!pgsize); + + /* pick the biggest page */ + pgsize_idx = __fls(pgsize); + pgsize = 1UL << pgsize_idx; + + pr_debug("mapping: iova 0x%lx pa 0x%lx pgsize %lu\n", iova, + (unsigned long)paddr, pgsize); + + ret = domain->ops->map(domain, iova, paddr, pgsize, prot); + if (ret) + break; + + iova += pgsize; + paddr += pgsize; + size -= pgsize; + } + + /* unroll mapping in case something went wrong */ + if (ret) + iommu_unmap(domain, orig_iova, orig_size - size); + + return ret; } EXPORT_SYMBOL_GPL(iommu_map); -int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order) +size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) { - size_t size, unmapped; + size_t unmapped_page, unmapped = 0; + unsigned int min_pagesz; if (unlikely(domain->ops->unmap == NULL)) return -ENODEV; - size = PAGE_SIZE << gfp_order; + /* find out the minimum page size supported */ + min_pagesz = 1 << __ffs(domain->ops->pgsize_bitmap); - BUG_ON(!IS_ALIGNED(iova, size)); + /* + * The virtual address, as well as the size of the mapping, must be + * aligned (at least) to the size of the smallest page supported + * by the hardware + */ + if (!IS_ALIGNED(iova | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx size 0x%lx min_pagesz 0x%x\n", + iova, (unsigned long)size, min_pagesz); + return -EINVAL; + } - unmapped = domain->ops->unmap(domain, iova, size); + pr_debug("unmap this: iova 0x%lx size 0x%lx\n", iova, + (unsigned long)size); - return get_order(unmapped); + /* + * Keep iterating until we either unmap 'size' bytes (or more) + * or we hit an area that isn't mapped. + */ + while (unmapped < size) { + size_t left = size - unmapped; + + unmapped_page = domain->ops->unmap(domain, iova, left); + if (!unmapped_page) + break; + + pr_debug("unmapped: iova 0x%lx size %lx\n", iova, + (unsigned long)unmapped_page); + + iova += unmapped_page; + unmapped += unmapped_page; + } + + return unmapped; } EXPORT_SYMBOL_GPL(iommu_unmap); diff --git a/drivers/iommu/omap-iovmm.c b/drivers/iommu/omap-iovmm.c index e8fdb8830f69..0b7b14cb030b 100644 --- a/drivers/iommu/omap-iovmm.c +++ b/drivers/iommu/omap-iovmm.c @@ -409,7 +409,6 @@ static int map_iovm_area(struct iommu_domain *domain, struct iovm_struct *new, unsigned int i, j; struct scatterlist *sg; u32 da = new->da_start; - int order; if (!domain || !sgt) return -EINVAL; @@ -428,12 +427,10 @@ static int map_iovm_area(struct iommu_domain *domain, struct iovm_struct *new, if (bytes_to_iopgsz(bytes) < 0) goto err_out; - order = get_order(bytes); - pr_debug("%s: [%d] %08x %08x(%x)\n", __func__, i, da, pa, bytes); - err = iommu_map(domain, da, pa, order, flags); + err = iommu_map(domain, da, pa, bytes, flags); if (err) goto err_out; @@ -448,10 +445,9 @@ err_out: size_t bytes; bytes = sg->length + sg->offset; - order = get_order(bytes); /* ignore failures.. we're already handling one */ - iommu_unmap(domain, da, order); + iommu_unmap(domain, da, bytes); da += bytes; } @@ -466,7 +462,8 @@ static void unmap_iovm_area(struct iommu_domain *domain, struct omap_iommu *obj, size_t total = area->da_end - area->da_start; const struct sg_table *sgt = area->sgt; struct scatterlist *sg; - int i, err; + int i; + size_t unmapped; BUG_ON(!sgtable_ok(sgt)); BUG_ON((!total) || !IS_ALIGNED(total, PAGE_SIZE)); @@ -474,13 +471,11 @@ static void unmap_iovm_area(struct iommu_domain *domain, struct omap_iommu *obj, start = area->da_start; for_each_sg(sgt->sgl, sg, sgt->nents, i) { size_t bytes; - int order; bytes = sg->length + sg->offset; - order = get_order(bytes); - err = iommu_unmap(domain, start, order); - if (err < 0) + unmapped = iommu_unmap(domain, start, bytes); + if (unmapped < bytes) break; dev_dbg(obj->dev, "%s: unmap %08x(%x) %08x\n", diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d5ebf3f4dd53..cc26f89c4ee6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -48,6 +48,19 @@ struct iommu_domain { #ifdef CONFIG_IOMMU_API +/** + * struct iommu_ops - iommu ops and capabilities + * @domain_init: init iommu domain + * @domain_destroy: destroy iommu domain + * @attach_dev: attach device to an iommu domain + * @detach_dev: detach device from an iommu domain + * @map: map a physically contiguous memory region to an iommu domain + * @unmap: unmap a physically contiguous memory region from an iommu domain + * @iova_to_phys: translate iova to physical address + * @domain_has_cap: domain capabilities query + * @commit: commit iommu domain + * @pgsize_bitmap: bitmap of supported page sizes + */ struct iommu_ops { int (*domain_init)(struct iommu_domain *domain); void (*domain_destroy)(struct iommu_domain *domain); @@ -61,6 +74,7 @@ struct iommu_ops { unsigned long iova); int (*domain_has_cap)(struct iommu_domain *domain, unsigned long cap); + unsigned long pgsize_bitmap; }; extern int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops); @@ -72,9 +86,9 @@ extern int iommu_attach_device(struct iommu_domain *domain, extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, int gfp_order, int prot); -extern int iommu_unmap(struct iommu_domain *domain, unsigned long iova, - int gfp_order); + phys_addr_t paddr, size_t size, int prot); +extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, + size_t size); extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, unsigned long iova); extern int iommu_domain_has_cap(struct iommu_domain *domain, diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index a195c07fa829..304d7e5717e9 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -113,7 +113,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) /* Map into IO address space */ r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), - get_order(page_size), flags); + page_size, flags); if (r) { printk(KERN_ERR "kvm_iommu_map_address:" "iommu failed to map pfn=%llx\n", pfn); @@ -292,15 +292,15 @@ static void kvm_iommu_put_pages(struct kvm *kvm, while (gfn < end_gfn) { unsigned long unmap_pages; - int order; + size_t size; /* Get physical address */ phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); pfn = phys >> PAGE_SHIFT; /* Unmap address from IO address space */ - order = iommu_unmap(domain, gfn_to_gpa(gfn), 0); - unmap_pages = 1ULL << order; + size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE); + unmap_pages = 1ULL << get_order(size); /* Unpin all pages we just unmapped to not leak any memory */ kvm_unpin_pages(kvm, pfn, unmap_pages); From 66bc8cf3b1f70227a7847c88c24a36b4886bb3c3 Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Thu, 10 Nov 2011 11:32:27 +0200 Subject: [PATCH 03/53] iommu/omap: announce supported page sizes Let the IOMMU core know we support 4KiB, 64KiB, 1MiB and 16MiB page sizes. This way the IOMMU core can split any arbitrary-sized physically contiguous regions (that it needs to map) as needed. Signed-off-by: Ohad Ben-Cohen Cc: Hiroshi DOYU Signed-off-by: Joerg Roedel --- drivers/iommu/omap-iommu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index ad80b1d0d099..08cf7ec5b4a5 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -33,6 +33,9 @@ (__i < (n)) && (cr = __iotlb_read_cr((obj), __i), true); \ __i++) +/* bitmap of the page sizes currently supported */ +#define OMAP_IOMMU_PGSIZES (SZ_4K | SZ_64K | SZ_1M | SZ_16M) + /** * struct omap_iommu_domain - omap iommu domain * @pgtable: the page table @@ -1207,6 +1210,7 @@ static struct iommu_ops omap_iommu_ops = { .unmap = omap_iommu_unmap, .iova_to_phys = omap_iommu_iova_to_phys, .domain_has_cap = omap_iommu_domain_has_cap, + .pgsize_bitmap = OMAP_IOMMU_PGSIZES, }; static int __init omap_iommu_init(void) From 83427275546a6e36076d4f1a0545335b1bb2afc2 Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Thu, 10 Nov 2011 11:32:28 +0200 Subject: [PATCH 04/53] iommu/msm: announce supported page sizes Let the IOMMU core know we support 4KiB, 64KiB, 1MiB and 16MiB page sizes. This way the IOMMU core can split any arbitrary-sized physically contiguous regions (that it needs to map) as needed. Signed-off-by: Ohad Ben-Cohen Acked-by: David Brown Cc: Stepan Moskovchenko Signed-off-by: Joerg Roedel --- drivers/iommu/msm_iommu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 13718d958da8..08a90b88e40d 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -42,6 +42,9 @@ __asm__ __volatile__ ( \ #define RCP15_PRRR(reg) MRC(reg, p15, 0, c10, c2, 0) #define RCP15_NMRR(reg) MRC(reg, p15, 0, c10, c2, 1) +/* bitmap of the page sizes currently supported */ +#define MSM_IOMMU_PGSIZES (SZ_4K | SZ_64K | SZ_1M | SZ_16M) + static int msm_iommu_tex_class[4]; DEFINE_SPINLOCK(msm_iommu_lock); @@ -679,7 +682,8 @@ static struct iommu_ops msm_iommu_ops = { .map = msm_iommu_map, .unmap = msm_iommu_unmap, .iova_to_phys = msm_iommu_iova_to_phys, - .domain_has_cap = msm_iommu_domain_has_cap + .domain_has_cap = msm_iommu_domain_has_cap, + .pgsize_bitmap = MSM_IOMMU_PGSIZES, }; static int __init get_tex_class(int icp, int ocp, int mt, int nos) From aa3de9c05051ac26355276944789217040e38207 Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Thu, 10 Nov 2011 11:32:29 +0200 Subject: [PATCH 05/53] iommu/amd: announce supported page sizes Let the IOMMU core know we support arbitrary page sizes (as long as they're an order of 4KiB). This way the IOMMU core will retain the existing behavior we're used to; it will let us map regions that: - their size is an order of 4KiB - they are naturally aligned Signed-off-by: Ohad Ben-Cohen Cc: Joerg Roedel Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index a3b7072e86e2..341573821864 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -41,6 +41,24 @@ #define LOOP_TIMEOUT 100000 +/* + * This bitmap is used to advertise the page sizes our hardware support + * to the IOMMU core, which will then use this information to split + * physically contiguous memory regions it is mapping into page sizes + * that we support. + * + * Traditionally the IOMMU core just handed us the mappings directly, + * after making sure the size is an order of a 4KiB page and that the + * mapping has natural alignment. + * + * To retain this behavior, we currently advertise that we support + * all page sizes that are an order of 4KiB. + * + * If at some point we'd like to utilize the IOMMU core's new behavior, + * we could change this to advertise the real page sizes we support. + */ +#define AMD_IOMMU_PGSIZES (~0xFFFUL) + static DEFINE_RWLOCK(amd_iommu_devtable_lock); /* A list of preallocated protection domains */ @@ -2779,6 +2797,7 @@ static struct iommu_ops amd_iommu_ops = { .unmap = amd_iommu_unmap, .iova_to_phys = amd_iommu_iova_to_phys, .domain_has_cap = amd_iommu_domain_has_cap, + .pgsize_bitmap = AMD_IOMMU_PGSIZES, }; /***************************************************************************** From 6d1c56a9db48977942602a50e88eeb61a3e625eb Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Thu, 10 Nov 2011 11:32:30 +0200 Subject: [PATCH 06/53] iommu/intel: announce supported page sizes Let the IOMMU core know we support arbitrary page sizes (as long as they're an order of 4KiB). This way the IOMMU core will retain the existing behavior we're used to; it will let us map regions that: - their size is an order of 4KiB - they are naturally aligned Note: Intel IOMMU hardware doesn't support arbitrary page sizes, but the driver does (it splits arbitrary-sized mappings into the pages supported by the hardware). To make everything simpler for now, though, this patch effectively tells the IOMMU core to keep giving this driver the same memory regions it did before, so nothing is changed as far as it's concerned. At this point, the page sizes announced remain static within the IOMMU core. To correctly utilize the pgsize-splitting of the IOMMU core by this driver, it seems that some core changes should still be done, because Intel's IOMMU page size capabilities seem to have the potential to be different between different DMA remapping devices. Signed-off-by: Ohad Ben-Cohen Cc: David Woodhouse Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 2a165010a1c1..4c780efff169 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -78,6 +78,24 @@ #define LEVEL_STRIDE (9) #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) +/* + * This bitmap is used to advertise the page sizes our hardware support + * to the IOMMU core, which will then use this information to split + * physically contiguous memory regions it is mapping into page sizes + * that we support. + * + * Traditionally the IOMMU core just handed us the mappings directly, + * after making sure the size is an order of a 4KiB page and that the + * mapping has natural alignment. + * + * To retain this behavior, we currently advertise that we support + * all page sizes that are an order of 4KiB. + * + * If at some point we'd like to utilize the IOMMU core's new behavior, + * we could change this to advertise the real page sizes we support. + */ +#define INTEL_IOMMU_PGSIZES (~0xFFFUL) + static inline int agaw_to_level(int agaw) { return agaw + 2; @@ -4066,6 +4084,7 @@ static struct iommu_ops intel_iommu_ops = { .unmap = intel_iommu_unmap, .iova_to_phys = intel_iommu_iova_to_phys, .domain_has_cap = intel_iommu_domain_has_cap, + .pgsize_bitmap = INTEL_IOMMU_PGSIZES, }; static void __devinit quirk_iommu_rwbf(struct pci_dev *dev) From 6c274d1cd5b3aa0834e9f0c3f58038f42278ff8c Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Thu, 10 Nov 2011 11:32:31 +0200 Subject: [PATCH 07/53] iommu/core: remove the temporary pgsize settings Now that all IOMMU drivers are exporting their supported pgsizes, we can remove the default pgsize settings in register_iommu(). Signed-off-by: Ohad Ben-Cohen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b278458d5816..84cdd8ac81f1 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -49,16 +49,6 @@ int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops) if (bus->iommu_ops != NULL) return -EBUSY; - /* - * Set the default pgsize values, which retain the existing - * IOMMU API behavior: drivers will be called to map - * regions that are sized/aligned to order of 4KiB pages. - * - * This will be removed once all drivers are migrated. - */ - if (!ops->pgsize_bitmap) - ops->pgsize_bitmap = ~0xFFFUL; - bus->iommu_ops = ops; /* Do IOMMU specific setup for this bus-type */ From 1460432cb513f0c16136ed132c20ecfbf8ccf942 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Oct 2011 15:56:05 -0400 Subject: [PATCH 08/53] iommu: Add iommu_device_group callback and iommu_group sysfs entry An IOMMU group is a set of devices for which the IOMMU cannot distinguish transactions. For PCI devices, a group often occurs when a PCI bridge is involved. Transactions from any device behind the bridge appear to be sourced from the bridge itself. We leave it to the IOMMU driver to define the grouping restraints for their platform. Using this new interface, the group for a device can be retrieved using the iommu_device_group() callback. Users will compare the value returned against the value returned for other devices to determine whether they are part of the same group. Devices with no group are not translated by the IOMMU. There should be no expectations about the group numbers as they may be arbitrarily assigned by the IOMMU driver and may not be persistent across boots. We also provide a sysfs interface to the group numbers here so that userspace can understand IOMMU dependencies between devices for managing safe, userspace drivers. [Some code changes by Joerg Roedel ] Signed-off-by: Alex Williamson Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 60 +++++++++++++++++++++++++++++++++++++++++++ include/linux/iommu.h | 7 +++++ 2 files changed, 67 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2fb2963df553..9c35be4b333f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -25,8 +25,59 @@ #include #include +static ssize_t show_iommu_group(struct device *dev, + struct device_attribute *attr, char *buf) +{ + unsigned int groupid; + + if (iommu_device_group(dev, &groupid)) + return 0; + + return sprintf(buf, "%u", groupid); +} +static DEVICE_ATTR(iommu_group, S_IRUGO, show_iommu_group, NULL); + +static int add_iommu_group(struct device *dev, void *data) +{ + unsigned int groupid; + + if (iommu_device_group(dev, &groupid) == 0) + return device_create_file(dev, &dev_attr_iommu_group); + + return 0; +} + +static int remove_iommu_group(struct device *dev) +{ + unsigned int groupid; + + if (iommu_device_group(dev, &groupid) == 0) + device_remove_file(dev, &dev_attr_iommu_group); + + return 0; +} + +static int iommu_device_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + + if (action == BUS_NOTIFY_ADD_DEVICE) + return add_iommu_group(dev, NULL); + else if (action == BUS_NOTIFY_DEL_DEVICE) + return remove_iommu_group(dev); + + return 0; +} + +static struct notifier_block iommu_device_nb = { + .notifier_call = iommu_device_notifier, +}; + static void iommu_bus_init(struct bus_type *bus, struct iommu_ops *ops) { + bus_register_notifier(bus, &iommu_device_nb); + bus_for_each_dev(bus, NULL, NULL, add_iommu_group); } /** @@ -186,3 +237,12 @@ int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order) return domain->ops->unmap(domain, iova, gfp_order); } EXPORT_SYMBOL_GPL(iommu_unmap); + +int iommu_device_group(struct device *dev, unsigned int *groupid) +{ + if (iommu_present(dev->bus) && dev->bus->iommu_ops->device_group) + return dev->bus->iommu_ops->device_group(dev, groupid); + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(iommu_device_group); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 432acc4c054d..93617e7779a1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -61,6 +61,7 @@ struct iommu_ops { unsigned long iova); int (*domain_has_cap)(struct iommu_domain *domain, unsigned long cap); + int (*device_group)(struct device *dev, unsigned int *groupid); }; extern int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops); @@ -81,6 +82,7 @@ extern int iommu_domain_has_cap(struct iommu_domain *domain, unsigned long cap); extern void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler); +extern int iommu_device_group(struct device *dev, unsigned int *groupid); /** * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework @@ -179,6 +181,11 @@ static inline void iommu_set_fault_handler(struct iommu_domain *domain, { } +static inline int iommu_device_group(struct device *dev, unsigned int *groupid); +{ + return -ENODEV; +} + #endif /* CONFIG_IOMMU_API */ #endif /* __LINUX_IOMMU_H */ From 70ae6f0d55bd216b2f773fa5fa5018c0490a9e50 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Oct 2011 15:56:11 -0400 Subject: [PATCH 09/53] iommu/intel: Implement iommu_device_group We generally have BDF granularity for devices, so we just need to make sure devices aren't hidden behind PCIe-to-PCI bridges. We can then make up a group number that's simply the concatenated seg|bus|dev|fn so we don't have to track them (not that users should depend on that). Signed-off-by: Alex Williamson Acked-By: David Woodhouse Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 46 +++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index c0c7820d4c46..39ca6bb17e13 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4060,6 +4060,51 @@ static int intel_iommu_domain_has_cap(struct iommu_domain *domain, return 0; } +/* + * Group numbers are arbitrary. Device with the same group number + * indicate the iommu cannot differentiate between them. To avoid + * tracking used groups we just use the seg|bus|devfn of the lowest + * level we're able to differentiate devices + */ +static int intel_iommu_device_group(struct device *dev, unsigned int *groupid) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct pci_dev *bridge; + union { + struct { + u8 devfn; + u8 bus; + u16 segment; + } pci; + u32 group; + } id; + + if (iommu_no_mapping(dev)) + return -ENODEV; + + id.pci.segment = pci_domain_nr(pdev->bus); + id.pci.bus = pdev->bus->number; + id.pci.devfn = pdev->devfn; + + if (!device_to_iommu(id.pci.segment, id.pci.bus, id.pci.devfn)) + return -ENODEV; + + bridge = pci_find_upstream_pcie_bridge(pdev); + if (bridge) { + if (pci_is_pcie(bridge)) { + id.pci.bus = bridge->subordinate->number; + id.pci.devfn = 0; + } else { + id.pci.bus = bridge->bus->number; + id.pci.devfn = bridge->devfn; + } + } + + *groupid = id.group; + + return 0; +} + static struct iommu_ops intel_iommu_ops = { .domain_init = intel_iommu_domain_init, .domain_destroy = intel_iommu_domain_destroy, @@ -4069,6 +4114,7 @@ static struct iommu_ops intel_iommu_ops = { .unmap = intel_iommu_unmap, .iova_to_phys = intel_iommu_iova_to_phys, .domain_has_cap = intel_iommu_domain_has_cap, + .device_group = intel_iommu_device_group, }; static void __devinit quirk_iommu_rwbf(struct pci_dev *dev) From 8fbdce659549d93dfb257ec4eabacf63a188e506 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Oct 2011 15:56:18 -0400 Subject: [PATCH 10/53] iommu/amd: Implement iommu_device_group Just use the amd_iommu_alias_table directly. Signed-off-by: Alex Williamson Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 4ee277a8521a..1d82b631d09c 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2773,6 +2773,18 @@ static int amd_iommu_domain_has_cap(struct iommu_domain *domain, return 0; } +static int amd_iommu_device_group(struct device *dev, unsigned int *groupid) +{ + struct iommu_dev_data *dev_data = dev->archdata.iommu; + + if (!dev_data) + return -ENODEV; + + *groupid = amd_iommu_alias_table[dev_data->devid]; + + return 0; +} + static struct iommu_ops amd_iommu_ops = { .domain_init = amd_iommu_domain_init, .domain_destroy = amd_iommu_domain_destroy, @@ -2782,6 +2794,7 @@ static struct iommu_ops amd_iommu_ops = { .unmap = amd_iommu_unmap, .iova_to_phys = amd_iommu_iova_to_phys, .domain_has_cap = amd_iommu_domain_has_cap, + .device_group = amd_iommu_device_group, }; /***************************************************************************** From bcb71abe7d4c5a0d0368c67da0a7def4fc73497a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Oct 2011 15:56:24 -0400 Subject: [PATCH 11/53] iommu: Add option to group multi-function devices The option iommu=group_mf indicates the that the iommu driver should expose all functions of a multi-function PCI device as the same iommu_device_group. This is useful for disallowing individual functions being exposed as independent devices to userspace as there are often hidden dependencies. Virtual functions are not affected by this option. Signed-off-by: Alex Williamson Signed-off-by: Joerg Roedel --- Documentation/kernel-parameters.txt | 4 +++- arch/ia64/include/asm/iommu.h | 2 ++ arch/ia64/kernel/pci-dma.c | 1 + arch/x86/include/asm/iommu.h | 1 + arch/x86/kernel/pci-dma.c | 11 +++++++++++ drivers/iommu/amd_iommu.c | 10 +++++++++- drivers/iommu/intel-iommu.c | 3 +++ 7 files changed, 30 insertions(+), 2 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a0c5c5f4fce6..e1b6e4469845 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1059,7 +1059,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nomerge forcesac soft - pt [x86, IA-64] + pt [x86, IA-64] + group_mf [x86, IA-64] + io7= [HW] IO7 for Marvel based alpha systems See comment before marvel_specify_io7 in diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h index 105c93b00b1b..b6a809fa2995 100644 --- a/arch/ia64/include/asm/iommu.h +++ b/arch/ia64/include/asm/iommu.h @@ -11,10 +11,12 @@ extern void no_iommu_init(void); extern int force_iommu, no_iommu; extern int iommu_pass_through; extern int iommu_detected; +extern int iommu_group_mf; #else #define iommu_pass_through (0) #define no_iommu (1) #define iommu_detected (0) +#define iommu_group_mf (0) #endif extern void iommu_dma_init(void); extern void machvec_init(const char *name); diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c index c16162c70860..eb1175720050 100644 --- a/arch/ia64/kernel/pci-dma.c +++ b/arch/ia64/kernel/pci-dma.c @@ -33,6 +33,7 @@ int force_iommu __read_mostly; #endif int iommu_pass_through; +int iommu_group_mf; /* Dummy device used for NULL arguments (normally ISA). Better would be probably a smaller DMA mask, but this is bug-to-bug compatible diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 345c99cef152..dffc38ee6255 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -5,6 +5,7 @@ extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; extern int iommu_pass_through; +extern int iommu_group_mf; /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 80dc793b3f63..1c4d769e21ea 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -45,6 +45,15 @@ int iommu_detected __read_mostly = 0; */ int iommu_pass_through __read_mostly; +/* + * Group multi-function PCI devices into a single device-group for the + * iommu_device_group interface. This tells the iommu driver to pretend + * it cannot distinguish between functions of a device, exposing only one + * group for the device. Useful for disallowing use of individual PCI + * functions from userspace drivers. + */ +int iommu_group_mf __read_mostly; + extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; /* Dummy device used for NULL arguments (normally ISA). */ @@ -169,6 +178,8 @@ static __init int iommu_setup(char *p) #endif if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; + if (!strncmp(p, "group_mf", 8)) + iommu_group_mf = 1; gart_parse_options(p); diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 1d82b631d09c..6f7553684c1e 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2776,11 +2776,19 @@ static int amd_iommu_domain_has_cap(struct iommu_domain *domain, static int amd_iommu_device_group(struct device *dev, unsigned int *groupid) { struct iommu_dev_data *dev_data = dev->archdata.iommu; + struct pci_dev *pdev = to_pci_dev(dev); + u16 devid; if (!dev_data) return -ENODEV; - *groupid = amd_iommu_alias_table[dev_data->devid]; + if (pdev->is_virtfn || !iommu_group_mf) + devid = dev_data->devid; + else + devid = calc_devid(pdev->bus->number, + PCI_DEVFN(PCI_SLOT(pdev->devfn), 0)); + + *groupid = amd_iommu_alias_table[devid]; return 0; } diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 39ca6bb17e13..9ef16d664a99 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4100,6 +4100,9 @@ static int intel_iommu_device_group(struct device *dev, unsigned int *groupid) } } + if (!pdev->is_virtfn && iommu_group_mf) + id.pci.devfn = PCI_DEVFN(PCI_SLOT(id.pci.devfn), 0); + *groupid = id.group; return 0; From 95bdaf71ccf2cb4bba0c9a3d2baea0e7916f466b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 15 Nov 2011 12:48:29 +0100 Subject: [PATCH 12/53] iommu: Fix compile error with !IOMMU_API Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 93617e7779a1..0f318fd549be 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -181,7 +181,7 @@ static inline void iommu_set_fault_handler(struct iommu_domain *domain, { } -static inline int iommu_device_group(struct device *dev, unsigned int *groupid); +static inline int iommu_device_group(struct device *dev, unsigned int *groupid) { return -ENODEV; } From 1fa02ac9536c9a5278e999e40e274bba62c23a4d Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Fri, 23 Sep 2011 16:37:27 +0300 Subject: [PATCH 13/53] ARM: OMAP: iommu: declare a private iommu binding struct Declare an omap iommu private struct, which binds an iommu user to its iommu device. This struct should be placed at the iommu user's dev_archdata so generic IOMMU API can be used without having to utilize omap-specific plumbing anymore. While at it, provide an accessor method to ease the retrieval of the omap_iommu handle from a user device. Signed-off-by: Ohad Ben-Cohen Acked-by: Tony Lindgren Cc: Hiroshi Doyu --- arch/arm/plat-omap/include/plat/iommu.h | 26 +++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/arm/plat-omap/include/plat/iommu.h b/arch/arm/plat-omap/include/plat/iommu.h index a1d79ee19250..fa11ee26aca9 100644 --- a/arch/arm/plat-omap/include/plat/iommu.h +++ b/arch/arm/plat-omap/include/plat/iommu.h @@ -111,6 +111,32 @@ struct iommu_platform_data { u32 da_end; }; +/** + * struct iommu_arch_data - omap iommu private data + * @name: name of the iommu device + * @iommu_dev: handle of the iommu device + * + * This is an omap iommu private data object, which binds an iommu user + * to its iommu device. This object should be placed at the iommu user's + * dev_archdata so generic IOMMU API can be used without having to + * utilize omap-specific plumbing anymore. + */ +struct omap_iommu_arch_data { + const char *name; + struct omap_iommu *iommu_dev; +}; + +/** + * dev_to_omap_iommu() - retrieves an omap iommu object from a user device + * @dev: iommu client device + */ +static inline struct omap_iommu *dev_to_omap_iommu(struct device *dev) +{ + struct omap_iommu_arch_data *arch_data = dev->archdata.iommu; + + return arch_data->iommu_dev; +} + /* IOMMU errors */ #define OMAP_IOMMU_ERR_TLB_MISS (1 << 0) #define OMAP_IOMMU_ERR_TRANS_FAULT (1 << 1) From c8eaab3b74f1326c989e1db8d7c0c14981556e4e Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Fri, 23 Sep 2011 16:44:57 +0300 Subject: [PATCH 14/53] ARM: OMAP3: bind omap3isp_device to its iommu device Bind OMAP3's isp device to the isp's dedicated iommu, by setting the device's archdata iommu member. This way omap3isp will be able to use the generic IOMMU API without having to call any omap-specific binding method. Signed-off-by: Ohad Ben-Cohen Acked-by: Laurent Pinchart Acked-by: Tony Lindgren --- arch/arm/mach-omap2/devices.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm/mach-omap2/devices.c b/arch/arm/mach-omap2/devices.c index c15cfada5f13..9ceabf02eeb7 100644 --- a/arch/arm/mach-omap2/devices.c +++ b/arch/arm/mach-omap2/devices.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -211,9 +212,15 @@ static struct platform_device omap3isp_device = { .resource = omap3isp_resources, }; +static struct omap_iommu_arch_data omap3_isp_iommu = { + .name = "isp", +}; + int omap3_init_camera(struct isp_platform_data *pdata) { omap3isp_device.dev.platform_data = pdata; + omap3isp_device.dev.archdata.iommu = &omap3_isp_iommu; + return platform_device_register(&omap3isp_device); } From fabdbca8c991dfa0ea1ff26214ae7d18e5740cc3 Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Tue, 11 Oct 2011 00:18:33 +0200 Subject: [PATCH 15/53] iommu/omap: eliminate the public omap_find_iommu_device() method Eliminate the public omap_find_iommu_device() method, and don't expect clients to provide the omap_iommu handle anymore. Instead, OMAP's iommu driver now utilizes dev_archdata's private iommu extension to be able to access the required iommu information. This way OMAP IOMMU users are now able to use the generic IOMMU API without having to call any omap-specific binding method. Update omap3isp appropriately. Signed-off-by: Ohad Ben-Cohen Acked-by: Laurent Pinchart Acked-by: Tony Lindgren Cc: Hiroshi Doyu --- arch/arm/plat-omap/include/plat/iommu.h | 5 +-- arch/arm/plat-omap/include/plat/iovmm.h | 12 ++--- drivers/iommu/omap-iommu.c | 58 +++++++++++-------------- drivers/iommu/omap-iovmm.c | 31 ++++++++----- drivers/media/video/omap3isp/isp.c | 30 +++---------- drivers/media/video/omap3isp/isp.h | 2 - drivers/media/video/omap3isp/ispccdc.c | 18 ++++---- drivers/media/video/omap3isp/ispstat.c | 8 ++-- drivers/media/video/omap3isp/ispvideo.c | 4 +- 9 files changed, 74 insertions(+), 94 deletions(-) diff --git a/arch/arm/plat-omap/include/plat/iommu.h b/arch/arm/plat-omap/include/plat/iommu.h index fa11ee26aca9..88be3e628b33 100644 --- a/arch/arm/plat-omap/include/plat/iommu.h +++ b/arch/arm/plat-omap/include/plat/iommu.h @@ -189,8 +189,8 @@ extern int omap_iommu_set_isr(const char *name, void *priv), void *isr_priv); -extern void omap_iommu_save_ctx(struct omap_iommu *obj); -extern void omap_iommu_restore_ctx(struct omap_iommu *obj); +extern void omap_iommu_save_ctx(struct device *dev); +extern void omap_iommu_restore_ctx(struct device *dev); extern int omap_install_iommu_arch(const struct iommu_functions *ops); extern void omap_uninstall_iommu_arch(const struct iommu_functions *ops); @@ -202,6 +202,5 @@ extern ssize_t omap_iommu_dump_ctx(struct omap_iommu *obj, char *buf, ssize_t len); extern size_t omap_dump_tlb_entries(struct omap_iommu *obj, char *buf, ssize_t len); -struct device *omap_find_iommu_device(const char *name); #endif /* __MACH_IOMMU_H */ diff --git a/arch/arm/plat-omap/include/plat/iovmm.h b/arch/arm/plat-omap/include/plat/iovmm.h index 6af1a91c0f36..498e57cda6cd 100644 --- a/arch/arm/plat-omap/include/plat/iovmm.h +++ b/arch/arm/plat-omap/include/plat/iovmm.h @@ -72,18 +72,18 @@ struct iovm_struct { #define IOVMF_DA_FIXED (1 << (4 + IOVMF_SW_SHIFT)) -extern struct iovm_struct *omap_find_iovm_area(struct omap_iommu *obj, u32 da); +extern struct iovm_struct *omap_find_iovm_area(struct device *dev, u32 da); extern u32 -omap_iommu_vmap(struct iommu_domain *domain, struct omap_iommu *obj, u32 da, +omap_iommu_vmap(struct iommu_domain *domain, struct device *dev, u32 da, const struct sg_table *sgt, u32 flags); extern struct sg_table *omap_iommu_vunmap(struct iommu_domain *domain, - struct omap_iommu *obj, u32 da); + struct device *dev, u32 da); extern u32 -omap_iommu_vmalloc(struct iommu_domain *domain, struct omap_iommu *obj, +omap_iommu_vmalloc(struct iommu_domain *domain, struct device *dev, u32 da, size_t bytes, u32 flags); extern void -omap_iommu_vfree(struct iommu_domain *domain, struct omap_iommu *obj, +omap_iommu_vfree(struct iommu_domain *domain, struct device *dev, const u32 da); -extern void *omap_da_to_va(struct omap_iommu *obj, u32 da); +extern void *omap_da_to_va(struct device *dev, u32 da); #endif /* __IOMMU_MMAP_H */ diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 8f32b2bf7587..b7f863d72c08 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -86,20 +86,24 @@ EXPORT_SYMBOL_GPL(omap_uninstall_iommu_arch); /** * omap_iommu_save_ctx - Save registers for pm off-mode support - * @obj: target iommu + * @dev: client device **/ -void omap_iommu_save_ctx(struct omap_iommu *obj) +void omap_iommu_save_ctx(struct device *dev) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); + arch_iommu->save_ctx(obj); } EXPORT_SYMBOL_GPL(omap_iommu_save_ctx); /** * omap_iommu_restore_ctx - Restore registers for pm off-mode support - * @obj: target iommu + * @dev: client device **/ -void omap_iommu_restore_ctx(struct omap_iommu *obj) +void omap_iommu_restore_ctx(struct device *dev) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); + arch_iommu->restore_ctx(obj); } EXPORT_SYMBOL_GPL(omap_iommu_restore_ctx); @@ -819,36 +823,24 @@ static int device_match_by_alias(struct device *dev, void *data) return strcmp(obj->name, name) == 0; } -/** - * omap_find_iommu_device() - find an omap iommu device by name - * @name: name of the iommu device - * - * The generic iommu API requires the caller to provide the device - * he wishes to attach to a certain iommu domain. - * - * Drivers generally should not bother with this as it should just - * be taken care of by the DMA-API using dev_archdata. - * - * This function is provided as an interim solution until the latter - * materializes, and omap3isp is fully migrated to the DMA-API. - */ -struct device *omap_find_iommu_device(const char *name) -{ - return driver_find_device(&omap_iommu_driver.driver, NULL, - (void *)name, - device_match_by_alias); -} -EXPORT_SYMBOL_GPL(omap_find_iommu_device); - /** * omap_iommu_attach() - attach iommu device to an iommu domain - * @dev: target omap iommu device + * @name: name of target omap iommu device * @iopgd: page table **/ -static struct omap_iommu *omap_iommu_attach(struct device *dev, u32 *iopgd) +static struct omap_iommu *omap_iommu_attach(const char *name, u32 *iopgd) { int err = -ENOMEM; - struct omap_iommu *obj = to_iommu(dev); + struct device *dev; + struct omap_iommu *obj; + + dev = driver_find_device(&omap_iommu_driver.driver, NULL, + (void *)name, + device_match_by_alias); + if (!dev) + return NULL; + + obj = to_iommu(dev); spin_lock(&obj->iommu_lock); @@ -1069,6 +1061,7 @@ omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) { struct omap_iommu_domain *omap_domain = domain->priv; struct omap_iommu *oiommu; + struct omap_iommu_arch_data *arch_data = dev->archdata.iommu; int ret = 0; spin_lock(&omap_domain->lock); @@ -1081,14 +1074,14 @@ omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) } /* get a handle to and enable the omap iommu */ - oiommu = omap_iommu_attach(dev, omap_domain->pgtable); + oiommu = omap_iommu_attach(arch_data->name, omap_domain->pgtable); if (IS_ERR(oiommu)) { ret = PTR_ERR(oiommu); dev_err(dev, "can't get omap iommu: %d\n", ret); goto out; } - omap_domain->iommu_dev = oiommu; + omap_domain->iommu_dev = arch_data->iommu_dev = oiommu; oiommu->domain = domain; out: @@ -1100,7 +1093,8 @@ static void omap_iommu_detach_dev(struct iommu_domain *domain, struct device *dev) { struct omap_iommu_domain *omap_domain = domain->priv; - struct omap_iommu *oiommu = to_iommu(dev); + struct omap_iommu_arch_data *arch_data = dev->archdata.iommu; + struct omap_iommu *oiommu = dev_to_omap_iommu(dev); spin_lock(&omap_domain->lock); @@ -1114,7 +1108,7 @@ static void omap_iommu_detach_dev(struct iommu_domain *domain, omap_iommu_detach(oiommu); - omap_domain->iommu_dev = NULL; + omap_domain->iommu_dev = arch_data->iommu_dev = NULL; out: spin_unlock(&omap_domain->lock); diff --git a/drivers/iommu/omap-iovmm.c b/drivers/iommu/omap-iovmm.c index 46be456fcc00..23655945e30c 100644 --- a/drivers/iommu/omap-iovmm.c +++ b/drivers/iommu/omap-iovmm.c @@ -231,12 +231,14 @@ static struct iovm_struct *__find_iovm_area(struct omap_iommu *obj, /** * omap_find_iovm_area - find iovma which includes @da + * @dev: client device * @da: iommu device virtual address * * Find the existing iovma starting at @da */ -struct iovm_struct *omap_find_iovm_area(struct omap_iommu *obj, u32 da) +struct iovm_struct *omap_find_iovm_area(struct device *dev, u32 da) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); struct iovm_struct *area; mutex_lock(&obj->mmap_lock); @@ -343,14 +345,15 @@ static void free_iovm_area(struct omap_iommu *obj, struct iovm_struct *area) /** * omap_da_to_va - convert (d) to (v) - * @obj: objective iommu + * @dev: client device * @da: iommu device virtual address * @va: mpu virtual address * * Returns mpu virtual addr which corresponds to a given device virtual addr */ -void *omap_da_to_va(struct omap_iommu *obj, u32 da) +void *omap_da_to_va(struct device *dev, u32 da) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); void *va = NULL; struct iovm_struct *area; @@ -582,16 +585,18 @@ __iommu_vmap(struct iommu_domain *domain, struct omap_iommu *obj, /** * omap_iommu_vmap - (d)-(p)-(v) address mapper - * @obj: objective iommu + * @domain: iommu domain + * @dev: client device * @sgt: address of scatter gather table * @flags: iovma and page property * * Creates 1-n-1 mapping with given @sgt and returns @da. * All @sgt element must be io page size aligned. */ -u32 omap_iommu_vmap(struct iommu_domain *domain, struct omap_iommu *obj, u32 da, +u32 omap_iommu_vmap(struct iommu_domain *domain, struct device *dev, u32 da, const struct sg_table *sgt, u32 flags) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); size_t bytes; void *va = NULL; @@ -622,15 +627,17 @@ EXPORT_SYMBOL_GPL(omap_iommu_vmap); /** * omap_iommu_vunmap - release virtual mapping obtained by 'omap_iommu_vmap()' - * @obj: objective iommu + * @domain: iommu domain + * @dev: client device * @da: iommu device virtual address * * Free the iommu virtually contiguous memory area starting at * @da, which was returned by 'omap_iommu_vmap()'. */ struct sg_table * -omap_iommu_vunmap(struct iommu_domain *domain, struct omap_iommu *obj, u32 da) +omap_iommu_vunmap(struct iommu_domain *domain, struct device *dev, u32 da) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); struct sg_table *sgt; /* * 'sgt' is allocated before 'omap_iommu_vmalloc()' is called. @@ -647,7 +654,7 @@ EXPORT_SYMBOL_GPL(omap_iommu_vunmap); /** * omap_iommu_vmalloc - (d)-(p)-(v) address allocator and mapper - * @obj: objective iommu + * @dev: client device * @da: contiguous iommu virtual memory * @bytes: allocation size * @flags: iovma and page property @@ -656,9 +663,10 @@ EXPORT_SYMBOL_GPL(omap_iommu_vunmap); * @da again, which might be adjusted if 'IOVMF_DA_FIXED' is not set. */ u32 -omap_iommu_vmalloc(struct iommu_domain *domain, struct omap_iommu *obj, u32 da, +omap_iommu_vmalloc(struct iommu_domain *domain, struct device *dev, u32 da, size_t bytes, u32 flags) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); void *va; struct sg_table *sgt; @@ -698,15 +706,16 @@ EXPORT_SYMBOL_GPL(omap_iommu_vmalloc); /** * omap_iommu_vfree - release memory allocated by 'omap_iommu_vmalloc()' - * @obj: objective iommu + * @dev: client device * @da: iommu device virtual address * * Frees the iommu virtually continuous memory area starting at * @da, as obtained from 'omap_iommu_vmalloc()'. */ -void omap_iommu_vfree(struct iommu_domain *domain, struct omap_iommu *obj, +void omap_iommu_vfree(struct iommu_domain *domain, struct device *dev, const u32 da) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); struct sg_table *sgt; sgt = unmap_vm_area(domain, obj, da, vfree, diff --git a/drivers/media/video/omap3isp/isp.c b/drivers/media/video/omap3isp/isp.c index b818cacf420f..d4c48ef227fb 100644 --- a/drivers/media/video/omap3isp/isp.c +++ b/drivers/media/video/omap3isp/isp.c @@ -80,13 +80,6 @@ #include "isph3a.h" #include "isphist.h" -/* - * this is provided as an interim solution until omap3isp doesn't need - * any omap-specific iommu API - */ -#define to_iommu(dev) \ - (struct omap_iommu *)platform_get_drvdata(to_platform_device(dev)) - static unsigned int autoidle; module_param(autoidle, int, 0444); MODULE_PARM_DESC(autoidle, "Enable OMAP3ISP AUTOIDLE support"); @@ -1114,8 +1107,7 @@ isp_restore_context(struct isp_device *isp, struct isp_reg *reg_list) static void isp_save_ctx(struct isp_device *isp) { isp_save_context(isp, isp_reg_list); - if (isp->iommu) - omap_iommu_save_ctx(isp->iommu); + omap_iommu_save_ctx(isp->dev); } /* @@ -1128,8 +1120,7 @@ static void isp_save_ctx(struct isp_device *isp) static void isp_restore_ctx(struct isp_device *isp) { isp_restore_context(isp, isp_reg_list); - if (isp->iommu) - omap_iommu_restore_ctx(isp->iommu); + omap_iommu_restore_ctx(isp->dev); omap3isp_ccdc_restore_context(isp); omap3isp_preview_restore_context(isp); } @@ -1983,7 +1974,7 @@ static int isp_remove(struct platform_device *pdev) isp_cleanup_modules(isp); omap3isp_get(isp); - iommu_detach_device(isp->domain, isp->iommu_dev); + iommu_detach_device(isp->domain, &pdev->dev); iommu_domain_free(isp->domain); omap3isp_put(isp); @@ -2131,17 +2122,6 @@ static int isp_probe(struct platform_device *pdev) } } - /* IOMMU */ - isp->iommu_dev = omap_find_iommu_device("isp"); - if (!isp->iommu_dev) { - dev_err(isp->dev, "omap_find_iommu_device failed\n"); - ret = -ENODEV; - goto error_isp; - } - - /* to be removed once iommu migration is complete */ - isp->iommu = to_iommu(isp->iommu_dev); - isp->domain = iommu_domain_alloc(pdev->dev.bus); if (!isp->domain) { dev_err(isp->dev, "can't alloc iommu domain\n"); @@ -2149,7 +2129,7 @@ static int isp_probe(struct platform_device *pdev) goto error_isp; } - ret = iommu_attach_device(isp->domain, isp->iommu_dev); + ret = iommu_attach_device(isp->domain, &pdev->dev); if (ret) { dev_err(&pdev->dev, "can't attach iommu device: %d\n", ret); goto free_domain; @@ -2188,7 +2168,7 @@ error_modules: error_irq: free_irq(isp->irq_num, isp); detach_dev: - iommu_detach_device(isp->domain, isp->iommu_dev); + iommu_detach_device(isp->domain, &pdev->dev); free_domain: iommu_domain_free(isp->domain); error_isp: diff --git a/drivers/media/video/omap3isp/isp.h b/drivers/media/video/omap3isp/isp.h index 705946ef4d60..d96603eb0d17 100644 --- a/drivers/media/video/omap3isp/isp.h +++ b/drivers/media/video/omap3isp/isp.h @@ -212,9 +212,7 @@ struct isp_device { unsigned int sbl_resources; unsigned int subclk_resources; - struct omap_iommu *iommu; struct iommu_domain *domain; - struct device *iommu_dev; struct isp_platform_callback platform_cb; }; diff --git a/drivers/media/video/omap3isp/ispccdc.c b/drivers/media/video/omap3isp/ispccdc.c index b0b0fa5a3572..8748e0855c61 100644 --- a/drivers/media/video/omap3isp/ispccdc.c +++ b/drivers/media/video/omap3isp/ispccdc.c @@ -366,7 +366,7 @@ static void ccdc_lsc_free_request(struct isp_ccdc_device *ccdc, dma_unmap_sg(isp->dev, req->iovm->sgt->sgl, req->iovm->sgt->nents, DMA_TO_DEVICE); if (req->table) - omap_iommu_vfree(isp->domain, isp->iommu, req->table); + omap_iommu_vfree(isp->domain, isp->dev, req->table); kfree(req); } @@ -438,7 +438,7 @@ static int ccdc_lsc_config(struct isp_ccdc_device *ccdc, req->enable = 1; - req->table = omap_iommu_vmalloc(isp->domain, isp->iommu, 0, + req->table = omap_iommu_vmalloc(isp->domain, isp->dev, 0, req->config.size, IOMMU_FLAG); if (IS_ERR_VALUE(req->table)) { req->table = 0; @@ -446,7 +446,7 @@ static int ccdc_lsc_config(struct isp_ccdc_device *ccdc, goto done; } - req->iovm = omap_find_iovm_area(isp->iommu, req->table); + req->iovm = omap_find_iovm_area(isp->dev, req->table); if (req->iovm == NULL) { ret = -ENOMEM; goto done; @@ -462,7 +462,7 @@ static int ccdc_lsc_config(struct isp_ccdc_device *ccdc, dma_sync_sg_for_cpu(isp->dev, req->iovm->sgt->sgl, req->iovm->sgt->nents, DMA_TO_DEVICE); - table = omap_da_to_va(isp->iommu, req->table); + table = omap_da_to_va(isp->dev, req->table); if (copy_from_user(table, config->lsc, req->config.size)) { ret = -EFAULT; goto done; @@ -734,15 +734,15 @@ static int ccdc_config(struct isp_ccdc_device *ccdc, * already done by omap_iommu_vmalloc(). */ size = ccdc->fpc.fpnum * 4; - table_new = omap_iommu_vmalloc(isp->domain, isp->iommu, + table_new = omap_iommu_vmalloc(isp->domain, isp->dev, 0, size, IOMMU_FLAG); if (IS_ERR_VALUE(table_new)) return -ENOMEM; - if (copy_from_user(omap_da_to_va(isp->iommu, table_new), + if (copy_from_user(omap_da_to_va(isp->dev, table_new), (__force void __user *) ccdc->fpc.fpcaddr, size)) { - omap_iommu_vfree(isp->domain, isp->iommu, + omap_iommu_vfree(isp->domain, isp->dev, table_new); return -EFAULT; } @@ -753,7 +753,7 @@ static int ccdc_config(struct isp_ccdc_device *ccdc, ccdc_configure_fpc(ccdc); if (table_old != 0) - omap_iommu_vfree(isp->domain, isp->iommu, table_old); + omap_iommu_vfree(isp->domain, isp->dev, table_old); } return ccdc_lsc_config(ccdc, ccdc_struct); @@ -2309,7 +2309,7 @@ void omap3isp_ccdc_cleanup(struct isp_device *isp) ccdc_lsc_free_queue(ccdc, &ccdc->lsc.free_queue); if (ccdc->fpc.fpcaddr != 0) - omap_iommu_vfree(isp->domain, isp->iommu, ccdc->fpc.fpcaddr); + omap_iommu_vfree(isp->domain, isp->dev, ccdc->fpc.fpcaddr); mutex_destroy(&ccdc->ioctl_lock); } diff --git a/drivers/media/video/omap3isp/ispstat.c b/drivers/media/video/omap3isp/ispstat.c index 68d539456c55..9f41aa3d00a5 100644 --- a/drivers/media/video/omap3isp/ispstat.c +++ b/drivers/media/video/omap3isp/ispstat.c @@ -366,7 +366,7 @@ static void isp_stat_bufs_free(struct ispstat *stat) dma_unmap_sg(isp->dev, buf->iovm->sgt->sgl, buf->iovm->sgt->nents, DMA_FROM_DEVICE); - omap_iommu_vfree(isp->domain, isp->iommu, + omap_iommu_vfree(isp->domain, isp->dev, buf->iommu_addr); } else { if (!buf->virt_addr) @@ -400,7 +400,7 @@ static int isp_stat_bufs_alloc_iommu(struct ispstat *stat, unsigned int size) struct iovm_struct *iovm; WARN_ON(buf->dma_addr); - buf->iommu_addr = omap_iommu_vmalloc(isp->domain, isp->iommu, 0, + buf->iommu_addr = omap_iommu_vmalloc(isp->domain, isp->dev, 0, size, IOMMU_FLAG); if (IS_ERR((void *)buf->iommu_addr)) { dev_err(stat->isp->dev, @@ -410,7 +410,7 @@ static int isp_stat_bufs_alloc_iommu(struct ispstat *stat, unsigned int size) return -ENOMEM; } - iovm = omap_find_iovm_area(isp->iommu, buf->iommu_addr); + iovm = omap_find_iovm_area(isp->dev, buf->iommu_addr); if (!iovm || !dma_map_sg(isp->dev, iovm->sgt->sgl, iovm->sgt->nents, DMA_FROM_DEVICE)) { @@ -419,7 +419,7 @@ static int isp_stat_bufs_alloc_iommu(struct ispstat *stat, unsigned int size) } buf->iovm = iovm; - buf->virt_addr = omap_da_to_va(stat->isp->iommu, + buf->virt_addr = omap_da_to_va(stat->isp->dev, (u32)buf->iommu_addr); buf->empty = 1; dev_dbg(stat->isp->dev, "%s: buffer[%d] allocated." diff --git a/drivers/media/video/omap3isp/ispvideo.c b/drivers/media/video/omap3isp/ispvideo.c index d1000723c5ae..9974cfc09090 100644 --- a/drivers/media/video/omap3isp/ispvideo.c +++ b/drivers/media/video/omap3isp/ispvideo.c @@ -452,7 +452,7 @@ ispmmu_vmap(struct isp_device *isp, const struct scatterlist *sglist, int sglen) sgt->nents = sglen; sgt->orig_nents = sglen; - da = omap_iommu_vmap(isp->domain, isp->iommu, 0, sgt, IOMMU_FLAG); + da = omap_iommu_vmap(isp->domain, isp->dev, 0, sgt, IOMMU_FLAG); if (IS_ERR_VALUE(da)) kfree(sgt); @@ -468,7 +468,7 @@ static void ispmmu_vunmap(struct isp_device *isp, dma_addr_t da) { struct sg_table *sgt; - sgt = omap_iommu_vunmap(isp->domain, isp->iommu, (u32)da); + sgt = omap_iommu_vunmap(isp->domain, isp->dev, (u32)da); kfree(sgt); } From b50cac55bf859d5b2fdcc1803a553a251b703456 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 6 Oct 2011 14:08:18 -0400 Subject: [PATCH 16/53] PCI/sysfs: add per pci device msi[x] irq listing (v5) This patch adds a per-pci-device subdirectory in sysfs called: /sys/bus/pci/devices//msi_irqs This sub-directory exports the set of msi vectors allocated by a given pci device, by creating a numbered sub-directory for each vector beneath msi_irqs. For each vector various attributes can be exported. Currently the only attribute is called mode, which tracks the operational mode of that vector (msi vs. msix) Acked-by: Greg Kroah-Hartman Signed-off-by: Jesse Barnes --- Documentation/ABI/testing/sysfs-bus-pci | 18 ++++ drivers/pci/msi.c | 111 ++++++++++++++++++++++++ include/linux/msi.h | 3 + include/linux/pci.h | 1 + 4 files changed, 133 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 349ecf26ce10..34f51100f029 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -66,6 +66,24 @@ Description: re-discover previously removed devices. Depends on CONFIG_HOTPLUG. +What: /sys/bus/pci/devices/.../msi_irqs/ +Date: September, 2011 +Contact: Neil Horman +Description: + The /sys/devices/.../msi_irqs directory contains a variable set + of sub-directories, with each sub-directory being named after a + corresponding msi irq vector allocated to that device. Each + numbered sub-directory N contains attributes of that irq. + Note that this directory is not created for device drivers which + do not support msi irqs + +What: /sys/bus/pci/devices/.../msi_irqs//mode +Date: September 2011 +Contact: Neil Horman +Description: + This attribute indicates the mode that the irq vector named by + the parent directory is in (msi vs. msix) + What: /sys/bus/pci/devices/.../remove Date: January 2009 Contact: Linux PCI developers diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 0e6d04d7ba4f..e6b6b9c67023 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -323,6 +323,8 @@ static void free_msi_irqs(struct pci_dev *dev) if (list_is_last(&entry->list, &dev->msi_list)) iounmap(entry->mask_base); } + kobject_del(&entry->kobj); + kobject_put(&entry->kobj); list_del(&entry->list); kfree(entry); } @@ -403,6 +405,98 @@ void pci_restore_msi_state(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_restore_msi_state); + +#define to_msi_attr(obj) container_of(obj, struct msi_attribute, attr) +#define to_msi_desc(obj) container_of(obj, struct msi_desc, kobj) + +struct msi_attribute { + struct attribute attr; + ssize_t (*show)(struct msi_desc *entry, struct msi_attribute *attr, + char *buf); + ssize_t (*store)(struct msi_desc *entry, struct msi_attribute *attr, + const char *buf, size_t count); +}; + +static ssize_t show_msi_mode(struct msi_desc *entry, struct msi_attribute *atr, + char *buf) +{ + return sprintf(buf, "%s\n", entry->msi_attrib.is_msix ? "msix" : "msi"); +} + +static ssize_t msi_irq_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct msi_attribute *attribute = to_msi_attr(attr); + struct msi_desc *entry = to_msi_desc(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(entry, attribute, buf); +} + +static const struct sysfs_ops msi_irq_sysfs_ops = { + .show = msi_irq_attr_show, +}; + +static struct msi_attribute mode_attribute = + __ATTR(mode, S_IRUGO, show_msi_mode, NULL); + + +struct attribute *msi_irq_default_attrs[] = { + &mode_attribute.attr, + NULL +}; + +void msi_kobj_release(struct kobject *kobj) +{ + struct msi_desc *entry = to_msi_desc(kobj); + + pci_dev_put(entry->dev); +} + +static struct kobj_type msi_irq_ktype = { + .release = msi_kobj_release, + .sysfs_ops = &msi_irq_sysfs_ops, + .default_attrs = msi_irq_default_attrs, +}; + +static int populate_msi_sysfs(struct pci_dev *pdev) +{ + struct msi_desc *entry; + struct kobject *kobj; + int ret; + int count = 0; + + pdev->msi_kset = kset_create_and_add("msi_irqs", NULL, &pdev->dev.kobj); + if (!pdev->msi_kset) + return -ENOMEM; + + list_for_each_entry(entry, &pdev->msi_list, list) { + kobj = &entry->kobj; + kobj->kset = pdev->msi_kset; + pci_dev_get(pdev); + ret = kobject_init_and_add(kobj, &msi_irq_ktype, NULL, + "%u", entry->irq); + if (ret) + goto out_unroll; + + count++; + } + + return 0; + +out_unroll: + list_for_each_entry(entry, &pdev->msi_list, list) { + if (!count) + break; + kobject_del(&entry->kobj); + kobject_put(&entry->kobj); + count--; + } + return ret; +} + /** * msi_capability_init - configure device's MSI capability structure * @dev: pointer to the pci_dev data structure of MSI device function @@ -454,6 +548,13 @@ static int msi_capability_init(struct pci_dev *dev, int nvec) return ret; } + ret = populate_msi_sysfs(dev); + if (ret) { + msi_mask_irq(entry, mask, ~mask); + free_msi_irqs(dev); + return ret; + } + /* Set MSI enabled bits */ pci_intx_for_msi(dev, 0); msi_set_enable(dev, pos, 1); @@ -574,6 +675,12 @@ static int msix_capability_init(struct pci_dev *dev, msix_program_entries(dev, entries); + ret = populate_msi_sysfs(dev); + if (ret) { + ret = 0; + goto error; + } + /* Set MSI-X enabled bits and unmask the function */ pci_intx_for_msi(dev, 0); dev->msix_enabled = 1; @@ -732,6 +839,8 @@ void pci_disable_msi(struct pci_dev *dev) pci_msi_shutdown(dev); free_msi_irqs(dev); + kset_unregister(dev->msi_kset); + dev->msi_kset = NULL; } EXPORT_SYMBOL(pci_disable_msi); @@ -830,6 +939,8 @@ void pci_disable_msix(struct pci_dev *dev) pci_msix_shutdown(dev); free_msi_irqs(dev); + kset_unregister(dev->msi_kset); + dev->msi_kset = NULL; } EXPORT_SYMBOL(pci_disable_msix); diff --git a/include/linux/msi.h b/include/linux/msi.h index 05acced439a3..ce93a341337d 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -1,6 +1,7 @@ #ifndef LINUX_MSI_H #define LINUX_MSI_H +#include #include struct msi_msg { @@ -44,6 +45,8 @@ struct msi_desc { /* Last set MSI message */ struct msi_msg msg; + + struct kobject kobj; }; /* diff --git a/include/linux/pci.h b/include/linux/pci.h index 7cda65b5f798..84225c756bd1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -336,6 +336,7 @@ struct pci_dev { struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */ #ifdef CONFIG_PCI_MSI struct list_head msi_list; + struct kset *msi_kset; #endif struct pci_vpd *vpd; #ifdef CONFIG_PCI_ATS From 69166fbf02c7a21745013f2de037bf7af26e4279 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 2 Nov 2011 14:07:15 -0600 Subject: [PATCH 17/53] PCI: Fix PRI and PASID consistency These are extended capabilities, rename and move to proper group for consistency. Signed-off-by: Alex Williamson Signed-off-by: Jesse Barnes --- drivers/pci/ats.c | 20 ++++++++++---------- include/linux/pci_regs.h | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 7ec56fb0bd78..831e1920386c 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -174,7 +174,7 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs) u32 max_requests; int pos; - pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return -EINVAL; @@ -205,7 +205,7 @@ void pci_disable_pri(struct pci_dev *pdev) u16 control; int pos; - pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return; @@ -226,7 +226,7 @@ bool pci_pri_enabled(struct pci_dev *pdev) u16 control; int pos; - pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return false; @@ -248,7 +248,7 @@ int pci_reset_pri(struct pci_dev *pdev) u16 control; int pos; - pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return -EINVAL; @@ -281,7 +281,7 @@ bool pci_pri_stopped(struct pci_dev *pdev) u16 control, status; int pos; - pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return true; @@ -310,7 +310,7 @@ int pci_pri_status(struct pci_dev *pdev) u16 status, control; int pos; - pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return -EINVAL; @@ -341,7 +341,7 @@ int pci_enable_pasid(struct pci_dev *pdev, int features) u16 control, supported; int pos; - pos = pci_find_ext_capability(pdev, PCI_PASID_CAP); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); if (!pos) return -EINVAL; @@ -375,7 +375,7 @@ void pci_disable_pasid(struct pci_dev *pdev) u16 control = 0; int pos; - pos = pci_find_ext_capability(pdev, PCI_PASID_CAP); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); if (!pos) return; @@ -399,7 +399,7 @@ int pci_pasid_features(struct pci_dev *pdev) u16 supported; int pos; - pos = pci_find_ext_capability(pdev, PCI_PASID_CAP); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); if (!pos) return -EINVAL; @@ -425,7 +425,7 @@ int pci_max_pasids(struct pci_dev *pdev) u16 supported; int pos; - pos = pci_find_ext_capability(pdev, PCI_PASID_CAP); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); if (!pos) return -EINVAL; diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index b5d9657f3100..090d3a9f5b26 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -537,7 +537,9 @@ #define PCI_EXT_CAP_ID_ARI 14 #define PCI_EXT_CAP_ID_ATS 15 #define PCI_EXT_CAP_ID_SRIOV 16 +#define PCI_EXT_CAP_ID_PRI 19 #define PCI_EXT_CAP_ID_LTR 24 +#define PCI_EXT_CAP_ID_PASID 27 /* Advanced Error Reporting */ #define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */ @@ -664,7 +666,6 @@ #define PCI_ATS_MIN_STU 12 /* shift of minimum STU block */ /* Page Request Interface */ -#define PCI_PRI_CAP 0x13 /* PRI capability ID */ #define PCI_PRI_CONTROL_OFF 0x04 /* Offset of control register */ #define PCI_PRI_STATUS_OFF 0x06 /* Offset of status register */ #define PCI_PRI_ENABLE 0x0001 /* Enable mask */ @@ -676,7 +677,6 @@ #define PCI_PRI_ALLOC_REQ_OFF 0x0c /* Cap offset for max reqs allowed */ /* PASID capability */ -#define PCI_PASID_CAP 0x1b /* PASID capability ID */ #define PCI_PASID_CAP_OFF 0x04 /* PASID feature register */ #define PCI_PASID_CONTROL_OFF 0x06 /* PASID control register */ #define PCI_PASID_ENABLE 0x01 /* Enable/Supported bit */ From 3c076351c4027a56d5005a39a0b518a4ba393ce2 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 10 Nov 2011 16:38:33 -0500 Subject: [PATCH 18/53] PCI: Rework ASPM disable code Right now we forcibly clear ASPM state on all devices if the BIOS indicates that the feature isn't supported. Based on the Microsoft presentation "PCI Express In Depth for Windows Vista and Beyond", I'm starting to think that this may be an error. The implication is that unless the platform grants full control via _OSC, Windows will not touch any PCIe features - including ASPM. In that case clearing ASPM state would be an error unless the platform has granted us that control. This patch reworks the ASPM disabling code such that the actual clearing of state is triggered by a successful handoff of PCIe control to the OS. The general ASPM code undergoes some changes in order to ensure that the ability to clear the bits isn't overridden by ASPM having already been disabled. Further, this theoretically now allows for situations where only a subset of PCIe roots hand over control, leaving the others in the BIOS state. It's difficult to know for sure that this is the right thing to do - there's zero public documentation on the interaction between all of these components. But enough vendors enable ASPM on platforms and then set this bit that it seems likely that they're expecting the OS to leave them alone. Measured to save around 5W on an idle Thinkpad X220. Signed-off-by: Matthew Garrett Signed-off-by: Jesse Barnes --- drivers/acpi/pci_root.c | 7 +++++ drivers/pci/pci-acpi.c | 1 - drivers/pci/pcie/aspm.c | 58 +++++++++++++++++++++++++--------------- include/linux/pci-aspm.h | 4 +-- 4 files changed, 46 insertions(+), 24 deletions(-) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 2672c798272f..7aff6312ce7c 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -596,6 +596,13 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device) if (ACPI_SUCCESS(status)) { dev_info(root->bus->bridge, "ACPI _OSC control (0x%02x) granted\n", flags); + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_ASPM) { + /* + * We have ASPM control, but the FADT indicates + * that it's unsupported. Clear it. + */ + pcie_clear_aspm(root->bus); + } } else { dev_info(root->bus->bridge, "ACPI _OSC request failed (%s), " diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 4ecb6408b0d6..c8e75851a314 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -395,7 +395,6 @@ static int __init acpi_pci_init(void) if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_ASPM) { printk(KERN_INFO"ACPI FADT declares the system doesn't support PCIe ASPM, so disable it\n"); - pcie_clear_aspm(); pcie_no_aspm(); } diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index cbfbab18be91..1cfbf228fbb1 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -68,7 +68,7 @@ struct pcie_link_state { struct aspm_latency acceptable[8]; }; -static int aspm_disabled, aspm_force, aspm_clear_state; +static int aspm_disabled, aspm_force; static bool aspm_support_enabled = true; static DEFINE_MUTEX(aspm_lock); static LIST_HEAD(link_list); @@ -500,9 +500,6 @@ static int pcie_aspm_sanity_check(struct pci_dev *pdev) int pos; u32 reg32; - if (aspm_clear_state) - return -EINVAL; - /* * Some functions in a slot might not all be PCIe functions, * very strange. Disable ASPM for the whole slot @@ -574,9 +571,6 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev) pdev->pcie_type != PCI_EXP_TYPE_DOWNSTREAM) return; - if (aspm_disabled && !aspm_clear_state) - return; - /* VIA has a strange chipset, root port is under a bridge */ if (pdev->pcie_type == PCI_EXP_TYPE_ROOT_PORT && pdev->bus->self) @@ -608,7 +602,7 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev) * the BIOS's expectation, we'll do so once pci_enable_device() is * called. */ - if (aspm_policy != POLICY_POWERSAVE || aspm_clear_state) { + if (aspm_policy != POLICY_POWERSAVE) { pcie_config_aspm_path(link); pcie_set_clkpm(link, policy_to_clkpm_state(link)); } @@ -649,8 +643,7 @@ void pcie_aspm_exit_link_state(struct pci_dev *pdev) struct pci_dev *parent = pdev->bus->self; struct pcie_link_state *link, *root, *parent_link; - if ((aspm_disabled && !aspm_clear_state) || !pci_is_pcie(pdev) || - !parent || !parent->link_state) + if (!pci_is_pcie(pdev) || !parent || !parent->link_state) return; if ((parent->pcie_type != PCI_EXP_TYPE_ROOT_PORT) && (parent->pcie_type != PCI_EXP_TYPE_DOWNSTREAM)) @@ -734,13 +727,18 @@ void pcie_aspm_powersave_config_link(struct pci_dev *pdev) * pci_disable_link_state - disable pci device's link state, so the link will * never enter specific states */ -static void __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem) +static void __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem, + bool force) { struct pci_dev *parent = pdev->bus->self; struct pcie_link_state *link; - if (aspm_disabled || !pci_is_pcie(pdev)) + if (aspm_disabled && !force) return; + + if (!pci_is_pcie(pdev)) + return; + if (pdev->pcie_type == PCI_EXP_TYPE_ROOT_PORT || pdev->pcie_type == PCI_EXP_TYPE_DOWNSTREAM) parent = pdev; @@ -768,16 +766,31 @@ static void __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem) void pci_disable_link_state_locked(struct pci_dev *pdev, int state) { - __pci_disable_link_state(pdev, state, false); + __pci_disable_link_state(pdev, state, false, false); } EXPORT_SYMBOL(pci_disable_link_state_locked); void pci_disable_link_state(struct pci_dev *pdev, int state) { - __pci_disable_link_state(pdev, state, true); + __pci_disable_link_state(pdev, state, true, false); } EXPORT_SYMBOL(pci_disable_link_state); +void pcie_clear_aspm(struct pci_bus *bus) +{ + struct pci_dev *child; + + /* + * Clear any ASPM setup that the firmware has carried out on this bus + */ + list_for_each_entry(child, &bus->devices, bus_list) { + __pci_disable_link_state(child, PCIE_LINK_STATE_L0S | + PCIE_LINK_STATE_L1 | + PCIE_LINK_STATE_CLKPM, + false, true); + } +} + static int pcie_aspm_set_policy(const char *val, struct kernel_param *kp) { int i; @@ -935,6 +948,7 @@ void pcie_aspm_remove_sysfs_dev_files(struct pci_dev *pdev) static int __init pcie_aspm_disable(char *str) { if (!strcmp(str, "off")) { + aspm_policy = POLICY_DEFAULT; aspm_disabled = 1; aspm_support_enabled = false; printk(KERN_INFO "PCIe ASPM is disabled\n"); @@ -947,16 +961,18 @@ static int __init pcie_aspm_disable(char *str) __setup("pcie_aspm=", pcie_aspm_disable); -void pcie_clear_aspm(void) -{ - if (!aspm_force) - aspm_clear_state = 1; -} - void pcie_no_aspm(void) { - if (!aspm_force) + /* + * Disabling ASPM is intended to prevent the kernel from modifying + * existing hardware state, not to clear existing state. To that end: + * (a) set policy to POLICY_DEFAULT in order to avoid changing state + * (b) prevent userspace from changing policy + */ + if (!aspm_force) { + aspm_policy = POLICY_DEFAULT; aspm_disabled = 1; + } } /** diff --git a/include/linux/pci-aspm.h b/include/linux/pci-aspm.h index 7cea7b6c1413..c8320144fe79 100644 --- a/include/linux/pci-aspm.h +++ b/include/linux/pci-aspm.h @@ -29,7 +29,7 @@ extern void pcie_aspm_pm_state_change(struct pci_dev *pdev); extern void pcie_aspm_powersave_config_link(struct pci_dev *pdev); extern void pci_disable_link_state(struct pci_dev *pdev, int state); extern void pci_disable_link_state_locked(struct pci_dev *pdev, int state); -extern void pcie_clear_aspm(void); +extern void pcie_clear_aspm(struct pci_bus *bus); extern void pcie_no_aspm(void); #else static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) @@ -47,7 +47,7 @@ static inline void pcie_aspm_powersave_config_link(struct pci_dev *pdev) static inline void pci_disable_link_state(struct pci_dev *pdev, int state) { } -static inline void pcie_clear_aspm(void) +static inline void pcie_clear_aspm(struct pci_bus *bus) { } static inline void pcie_no_aspm(void) From 027e8d52abdd44bc00e405af83cd2fbfb96c0824 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Mon, 7 Nov 2011 20:55:46 +0900 Subject: [PATCH 19/53] PCI: pciehp: Fix wrong workqueue cleanup Fix improper workqueue cleanup. In the current pciehp, pcied_cleanup() calls destroy_workqueue() before calling pcie_port_service_unregister(). This causes kernel oops because flush_workqueue() is called in the pcie_port_service_unregister() code path after the workqueue was destroyed. So pcied_cleanup() must call pcie_port_service_unregister() first before calling destroy_workqueue(). Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 7ac8358df8fd..a13ad1308adb 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -366,9 +366,9 @@ static int __init pcied_init(void) static void __exit pcied_cleanup(void) { dbg("unload_pciehpd()\n"); + pcie_port_service_unregister(&hpdriver_portdrv); destroy_workqueue(pciehp_ordered_wq); destroy_workqueue(pciehp_wq); - pcie_port_service_unregister(&hpdriver_portdrv); info(DRIVER_DESC " version: " DRIVER_VERSION " unloaded\n"); } From 486b10b9f43500741cd63a878d0ef23cd87fc66d Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Mon, 7 Nov 2011 20:56:50 +0900 Subject: [PATCH 20/53] PCI: pciehp: Handle push button event asynchronously Use non-ordered workqueue for attention button events. Attention button events on each slot can be handled asynchronously. So we should use non-ordered workqueue. This patch also removes ordered workqueue in pciehp as a result. Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp.h | 1 - drivers/pci/hotplug/pciehp_core.c | 9 --------- drivers/pci/hotplug/pciehp_ctrl.c | 4 ++-- drivers/pci/hotplug/pciehp_hpc.c | 1 - 4 files changed, 2 insertions(+), 13 deletions(-) diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 838f571027b7..9a33fdde2d16 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -45,7 +45,6 @@ extern int pciehp_poll_time; extern int pciehp_debug; extern int pciehp_force; extern struct workqueue_struct *pciehp_wq; -extern struct workqueue_struct *pciehp_ordered_wq; #define dbg(format, arg...) \ do { \ diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index a13ad1308adb..b8c99d35ac97 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -43,7 +43,6 @@ int pciehp_poll_mode; int pciehp_poll_time; int pciehp_force; struct workqueue_struct *pciehp_wq; -struct workqueue_struct *pciehp_ordered_wq; #define DRIVER_VERSION "0.4" #define DRIVER_AUTHOR "Dan Zink , Greg Kroah-Hartman , Dely Sy " @@ -345,18 +344,11 @@ static int __init pcied_init(void) if (!pciehp_wq) return -ENOMEM; - pciehp_ordered_wq = alloc_ordered_workqueue("pciehp_ordered", 0); - if (!pciehp_ordered_wq) { - destroy_workqueue(pciehp_wq); - return -ENOMEM; - } - pciehp_firmware_init(); retval = pcie_port_service_register(&hpdriver_portdrv); dbg("pcie_port_service_register = %d\n", retval); info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); if (retval) { - destroy_workqueue(pciehp_ordered_wq); destroy_workqueue(pciehp_wq); dbg("Failure to register service\n"); } @@ -367,7 +359,6 @@ static void __exit pcied_cleanup(void) { dbg("unload_pciehpd()\n"); pcie_port_service_unregister(&hpdriver_portdrv); - destroy_workqueue(pciehp_ordered_wq); destroy_workqueue(pciehp_wq); info(DRIVER_DESC " version: " DRIVER_VERSION " unloaded\n"); } diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c index 085dbb5fc168..27f44295a657 100644 --- a/drivers/pci/hotplug/pciehp_ctrl.c +++ b/drivers/pci/hotplug/pciehp_ctrl.c @@ -344,7 +344,7 @@ void pciehp_queue_pushbutton_work(struct work_struct *work) kfree(info); goto out; } - queue_work(pciehp_ordered_wq, &info->work); + queue_work(pciehp_wq, &info->work); out: mutex_unlock(&p_slot->lock); } @@ -439,7 +439,7 @@ static void handle_surprise_event(struct slot *p_slot) else p_slot->state = POWERON_STATE; - queue_work(pciehp_ordered_wq, &info->work); + queue_work(pciehp_wq, &info->work); } static void interrupt_event_handler(struct work_struct *work) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 7b1414810ae3..bcdbb1643621 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -806,7 +806,6 @@ static void pcie_cleanup_slot(struct controller *ctrl) struct slot *slot = ctrl->slot; cancel_delayed_work(&slot->work); flush_workqueue(pciehp_wq); - flush_workqueue(pciehp_ordered_wq); kfree(slot); } From d90116ea38f7768dac0349f01ffbc2663d63b7e9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 6 Nov 2011 23:11:28 +0100 Subject: [PATCH 21/53] PCI/ACPI: Make acpiphp ignore root bridges using SHPC native hotplug If the kernel has requested control of the SHPC native hotplug feature for a given root bridge, the acpiphp driver should not try to handle that root bridge and it should leave it to shpchp. Failing to do so causes problems to happen if shpchp is loaded and unloaded before loading acpiphp (ACPI-based hotplug won't work in that case anyway). To address this issue make find_root_bridges() ignore PCI root bridges with SHPC native hotplug enabled and make add_bridge() return error code if SHPC native hotplug is enabled for the given root bridge. This causes acpiphp to refuse to load if SHPC native hotplug is enabled for all root bridges and to refuse binding to the root bridges with SHPC native hotplug enabled. Reviewed-by: Kenji Kaneshige Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/acpiphp_glue.c | 4 ++-- include/linux/acpi.h | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index fce1c54a0c8d..ba43c037de80 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -467,7 +467,7 @@ static int add_bridge(acpi_handle handle) * granted by the BIOS for it. */ root = acpi_pci_find_root(handle); - if (root && (root->osc_control_set & OSC_PCI_EXPRESS_NATIVE_HP_CONTROL)) + if (root && (root->osc_control_set & OSC_PCI_NATIVE_HOTPLUG)) return -ENODEV; /* if the bridge doesn't have _STA, we assume it is always there */ @@ -1395,7 +1395,7 @@ find_root_bridges(acpi_handle handle, u32 lvl, void *context, void **rv) if (!root) return AE_OK; - if (root->osc_control_set & OSC_PCI_EXPRESS_NATIVE_HP_CONTROL) + if (root->osc_control_set & OSC_PCI_NATIVE_HOTPLUG) return AE_OK; (*count)++; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6001b4da39dd..627a3a42e4d8 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -302,6 +302,10 @@ extern bool osc_sb_apei_support_acked; OSC_PCI_EXPRESS_PME_CONTROL | \ OSC_PCI_EXPRESS_AER_CONTROL | \ OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL) + +#define OSC_PCI_NATIVE_HOTPLUG (OSC_PCI_EXPRESS_NATIVE_HP_CONTROL | \ + OSC_SHPC_NATIVE_HP_CONTROL) + extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 req); extern void acpi_early_init(void); From a424948dde8421089826e2f782d0efe9e565707e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 6 Nov 2011 22:21:46 +0100 Subject: [PATCH 22/53] PCI/ACPI/PM: Avoid resuming devices that don't signal PME Modify pci_acpi_wake_dev() to avoid resuming PME-capable devices whose PME Status bits are not set, which may happen currently if several devices are associated with the same wakeup GPE and all of them are notified whenever at least one of them signals PME. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pci-acpi.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index c8e75851a314..060fd22a1103 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -45,16 +45,20 @@ static void pci_acpi_wake_dev(acpi_handle handle, u32 event, void *context) { struct pci_dev *pci_dev = context; - if (event == ACPI_NOTIFY_DEVICE_WAKE && pci_dev) { + if (event != ACPI_NOTIFY_DEVICE_WAKE || !pci_dev) + return; + + if (!pci_dev->pm_cap || !pci_dev->pme_support + || pci_check_pme_status(pci_dev)) { if (pci_dev->pme_poll) pci_dev->pme_poll = false; pci_wakeup_event(pci_dev); - pci_check_pme_status(pci_dev); pm_runtime_resume(&pci_dev->dev); - if (pci_dev->subordinate) - pci_pme_wakeup_bus(pci_dev->subordinate); } + + if (pci_dev->subordinate) + pci_pme_wakeup_bus(pci_dev->subordinate); } /** From a776c491ca5e38c26d9f66923ff574d041e747f4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 17 Oct 2011 11:46:06 -0700 Subject: [PATCH 23/53] PCI: msi: Disable msi interrupts when we initialize a pci device I traced a nasty kexec on panic boot failure to the fact that we had screaming msi interrupts and we were not disabling the msi messages at kernel startup. The booting kernel had not enabled those interupts so was not prepared to handle them. I can see no reason why we would ever want to leave the msi interrupts enabled at boot if something else has enabled those interrupts. The pci spec specifies that msi interrupts should be off by default. Drivers are expected to enable the msi interrupts if they want to use them. Our interrupt handling code reprograms the interrupt handlers at boot and will not be be able to do anything useful with an unexpected interrupt. This patch applies cleanly all of the way back to 2.6.32 where I noticed the problem. Cc: stable@kernel.org Signed-off-by: Eric W. Biederman Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index e6b6b9c67023..337e16ab4a92 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -981,5 +981,15 @@ EXPORT_SYMBOL(pci_msi_enabled); void pci_msi_init_pci_dev(struct pci_dev *dev) { + int pos; INIT_LIST_HEAD(&dev->msi_list); + + /* Disable the msi hardware to avoid screaming interrupts + * during boot. This is the power on reset default so + * usually this should be a noop. + */ + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + if (pos) + msi_set_enable(dev, pos, 0); + msix_set_enable(dev, 0); } From 60fe823837d10673500ff685c01eb2f896fe5849 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 11 Nov 2011 10:06:56 -0700 Subject: [PATCH 24/53] PCI: Enable is not exposed as a PASID capability The PASID ECN indicates bit 0 is reserved in the capability register. Switch pci_enable_pasid() to error if PASID is already enabled and don't expose enable as a feature in pci_pasid_features(). Reviewed-by: Joerg Roedel Tested-by: Joerg Roedel Signed-off-by: Alex Williamson Signed-off-by: Jesse Barnes --- drivers/pci/ats.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 831e1920386c..8e95a123d37a 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -348,7 +348,7 @@ int pci_enable_pasid(struct pci_dev *pdev, int features) pci_read_config_word(pdev, pos + PCI_PASID_CONTROL_OFF, &control); pci_read_config_word(pdev, pos + PCI_PASID_CAP_OFF, &supported); - if (!(supported & PCI_PASID_ENABLE)) + if (control & PCI_PASID_ENABLE) return -EINVAL; supported &= PCI_PASID_EXEC | PCI_PASID_PRIV; @@ -390,7 +390,6 @@ EXPORT_SYMBOL_GPL(pci_disable_pasid); * Returns a negative value when no PASI capability is present. * Otherwise is returns a bitmask with supported features. Current * features reported are: - * PCI_PASID_ENABLE - PASID capability can be enabled * PCI_PASID_EXEC - Execute permission supported * PCI_PASID_PRIV - Priviledged mode supported */ @@ -405,7 +404,7 @@ int pci_pasid_features(struct pci_dev *pdev) pci_read_config_word(pdev, pos + PCI_PASID_CAP_OFF, &supported); - supported &= PCI_PASID_ENABLE | PCI_PASID_EXEC | PCI_PASID_PRIV; + supported &= PCI_PASID_EXEC | PCI_PASID_PRIV; return supported; } From 91f57d5e1be3db1e079c8696f1eab214f1c7922d Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 11 Nov 2011 10:07:36 -0700 Subject: [PATCH 25/53] PCI: More PRI/PASID cleanup More consistency cleanups. Drop the _OFF, separate and indent CTRL/CAP/STATUS bit definitions. This helped find the previous mis-use of bit 0 in the PASID capability register. Reviewed-by: Joerg Roedel Tested-by: Joerg Roedel Signed-off-by: Alex Williamson Signed-off-by: Jesse Barnes --- drivers/pci/ats.c | 69 ++++++++++++++++++++-------------------- include/linux/pci_regs.h | 30 +++++++++-------- 2 files changed, 51 insertions(+), 48 deletions(-) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 8e95a123d37a..2df49af6cc90 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -178,17 +178,18 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs) if (!pos) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); - pci_read_config_word(pdev, pos + PCI_PRI_STATUS_OFF, &status); - if ((control & PCI_PRI_ENABLE) || !(status & PCI_PRI_STATUS_STOPPED)) + pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); + pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status); + if ((control & PCI_PRI_CTRL_ENABLE) || + !(status & PCI_PRI_STATUS_STOPPED)) return -EBUSY; - pci_read_config_dword(pdev, pos + PCI_PRI_MAX_REQ_OFF, &max_requests); + pci_read_config_dword(pdev, pos + PCI_PRI_MAX_REQ, &max_requests); reqs = min(max_requests, reqs); - pci_write_config_dword(pdev, pos + PCI_PRI_ALLOC_REQ_OFF, reqs); + pci_write_config_dword(pdev, pos + PCI_PRI_ALLOC_REQ, reqs); - control |= PCI_PRI_ENABLE; - pci_write_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, control); + control |= PCI_PRI_CTRL_ENABLE; + pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); return 0; } @@ -209,9 +210,9 @@ void pci_disable_pri(struct pci_dev *pdev) if (!pos) return; - pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); - control &= ~PCI_PRI_ENABLE; - pci_write_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, control); + pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); + control &= ~PCI_PRI_CTRL_ENABLE; + pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); } EXPORT_SYMBOL_GPL(pci_disable_pri); @@ -230,9 +231,9 @@ bool pci_pri_enabled(struct pci_dev *pdev) if (!pos) return false; - pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); + pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); - return (control & PCI_PRI_ENABLE) ? true : false; + return (control & PCI_PRI_CTRL_ENABLE) ? true : false; } EXPORT_SYMBOL_GPL(pci_pri_enabled); @@ -252,13 +253,13 @@ int pci_reset_pri(struct pci_dev *pdev) if (!pos) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); - if (control & PCI_PRI_ENABLE) + pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); + if (control & PCI_PRI_CTRL_ENABLE) return -EBUSY; - control |= PCI_PRI_RESET; + control |= PCI_PRI_CTRL_RESET; - pci_write_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, control); + pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); return 0; } @@ -285,10 +286,10 @@ bool pci_pri_stopped(struct pci_dev *pdev) if (!pos) return true; - pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); - pci_read_config_word(pdev, pos + PCI_PRI_STATUS_OFF, &status); + pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); + pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status); - if (control & PCI_PRI_ENABLE) + if (control & PCI_PRI_CTRL_ENABLE) return false; return (status & PCI_PRI_STATUS_STOPPED) ? true : false; @@ -314,11 +315,11 @@ int pci_pri_status(struct pci_dev *pdev) if (!pos) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); - pci_read_config_word(pdev, pos + PCI_PRI_STATUS_OFF, &status); + pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); + pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status); /* Stopped bit is undefined when enable == 1, so clear it */ - if (control & PCI_PRI_ENABLE) + if (control & PCI_PRI_CTRL_ENABLE) status &= ~PCI_PRI_STATUS_STOPPED; return status; @@ -345,21 +346,21 @@ int pci_enable_pasid(struct pci_dev *pdev, int features) if (!pos) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PASID_CONTROL_OFF, &control); - pci_read_config_word(pdev, pos + PCI_PASID_CAP_OFF, &supported); + pci_read_config_word(pdev, pos + PCI_PASID_CTRL, &control); + pci_read_config_word(pdev, pos + PCI_PASID_CAP, &supported); - if (control & PCI_PASID_ENABLE) + if (control & PCI_PASID_CTRL_ENABLE) return -EINVAL; - supported &= PCI_PASID_EXEC | PCI_PASID_PRIV; + supported &= PCI_PASID_CAP_EXEC | PCI_PASID_CAP_PRIV; /* User wants to enable anything unsupported? */ if ((supported & features) != features) return -EINVAL; - control = PCI_PASID_ENABLE | features; + control = PCI_PASID_CTRL_ENABLE | features; - pci_write_config_word(pdev, pos + PCI_PASID_CONTROL_OFF, control); + pci_write_config_word(pdev, pos + PCI_PASID_CTRL, control); return 0; } @@ -379,7 +380,7 @@ void pci_disable_pasid(struct pci_dev *pdev) if (!pos) return; - pci_write_config_word(pdev, pos + PCI_PASID_CONTROL_OFF, control); + pci_write_config_word(pdev, pos + PCI_PASID_CTRL, control); } EXPORT_SYMBOL_GPL(pci_disable_pasid); @@ -390,8 +391,8 @@ EXPORT_SYMBOL_GPL(pci_disable_pasid); * Returns a negative value when no PASI capability is present. * Otherwise is returns a bitmask with supported features. Current * features reported are: - * PCI_PASID_EXEC - Execute permission supported - * PCI_PASID_PRIV - Priviledged mode supported + * PCI_PASID_CAP_EXEC - Execute permission supported + * PCI_PASID_CAP_PRIV - Priviledged mode supported */ int pci_pasid_features(struct pci_dev *pdev) { @@ -402,9 +403,9 @@ int pci_pasid_features(struct pci_dev *pdev) if (!pos) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PASID_CAP_OFF, &supported); + pci_read_config_word(pdev, pos + PCI_PASID_CAP, &supported); - supported &= PCI_PASID_EXEC | PCI_PASID_PRIV; + supported &= PCI_PASID_CAP_EXEC | PCI_PASID_CAP_PRIV; return supported; } @@ -428,7 +429,7 @@ int pci_max_pasids(struct pci_dev *pdev) if (!pos) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PASID_CAP_OFF, &supported); + pci_read_config_word(pdev, pos + PCI_PASID_CAP, &supported); supported = (supported & PASID_NUMBER_MASK) >> PASID_NUMBER_SHIFT; diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index 090d3a9f5b26..28fe380cb19d 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -666,22 +666,24 @@ #define PCI_ATS_MIN_STU 12 /* shift of minimum STU block */ /* Page Request Interface */ -#define PCI_PRI_CONTROL_OFF 0x04 /* Offset of control register */ -#define PCI_PRI_STATUS_OFF 0x06 /* Offset of status register */ -#define PCI_PRI_ENABLE 0x0001 /* Enable mask */ -#define PCI_PRI_RESET 0x0002 /* Reset bit mask */ -#define PCI_PRI_STATUS_RF 0x0001 /* Request Failure */ -#define PCI_PRI_STATUS_UPRGI 0x0002 /* Unexpected PRG index */ -#define PCI_PRI_STATUS_STOPPED 0x0100 /* PRI Stopped */ -#define PCI_PRI_MAX_REQ_OFF 0x08 /* Cap offset for max reqs supported */ -#define PCI_PRI_ALLOC_REQ_OFF 0x0c /* Cap offset for max reqs allowed */ +#define PCI_PRI_CTRL 0x04 /* PRI control register */ +#define PCI_PRI_CTRL_ENABLE 0x01 /* Enable */ +#define PCI_PRI_CTRL_RESET 0x02 /* Reset */ +#define PCI_PRI_STATUS 0x06 /* PRI status register */ +#define PCI_PRI_STATUS_RF 0x001 /* Response Failure */ +#define PCI_PRI_STATUS_UPRGI 0x002 /* Unexpected PRG index */ +#define PCI_PRI_STATUS_STOPPED 0x100 /* PRI Stopped */ +#define PCI_PRI_MAX_REQ 0x08 /* PRI max reqs supported */ +#define PCI_PRI_ALLOC_REQ 0x0c /* PRI max reqs allowed */ /* PASID capability */ -#define PCI_PASID_CAP_OFF 0x04 /* PASID feature register */ -#define PCI_PASID_CONTROL_OFF 0x06 /* PASID control register */ -#define PCI_PASID_ENABLE 0x01 /* Enable/Supported bit */ -#define PCI_PASID_EXEC 0x02 /* Exec permissions Enable/Supported */ -#define PCI_PASID_PRIV 0x04 /* Priviledge Mode Enable/Support */ +#define PCI_PASID_CAP 0x04 /* PASID feature register */ +#define PCI_PASID_CAP_EXEC 0x02 /* Exec permissions Supported */ +#define PCI_PASID_CAP_PRIV 0x04 /* Priviledge Mode Supported */ +#define PCI_PASID_CTRL 0x06 /* PASID control register */ +#define PCI_PASID_CTRL_ENABLE 0x01 /* Enable bit */ +#define PCI_PASID_CTRL_EXEC 0x02 /* Exec permissions Enable */ +#define PCI_PASID_CTRL_PRIV 0x04 /* Priviledge Mode Enable */ /* Single Root I/O Virtualization */ #define PCI_SRIOV_CAP 0x04 /* SR-IOV Capabilities */ From 1a36ea815a3557c03819ec7c90a6b2fb128385ca Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Tue, 6 Dec 2011 15:22:10 +0200 Subject: [PATCH 26/53] iommu/omap: be verbose when omap_iommu_iova_to_phys fails An omap_iommu_iova_to_phys failure usually means that iova wasn't mapped. When that happens, it's helpful to know the value of iova, so add it to the error message. Signed-off-by: Ohad Ben-Cohen Signed-off-by: Joerg Roedel --- drivers/iommu/omap-iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index b7f863d72c08..cbcbf31ec3be 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1177,14 +1177,14 @@ static phys_addr_t omap_iommu_iova_to_phys(struct iommu_domain *domain, else if (iopte_is_large(*pte)) ret = omap_iommu_translate(*pte, da, IOLARGE_MASK); else - dev_err(dev, "bogus pte 0x%x", *pte); + dev_err(dev, "bogus pte 0x%x, da 0x%lx", *pte, da); } else { if (iopgd_is_section(*pgd)) ret = omap_iommu_translate(*pgd, da, IOSECTION_MASK); else if (iopgd_is_super(*pgd)) ret = omap_iommu_translate(*pgd, da, IOSUPER_MASK); else - dev_err(dev, "bogus pgd 0x%x", *pgd); + dev_err(dev, "bogus pgd 0x%x, da 0x%lx", *pgd, da); } return ret; From ee6c28684585a64fd79c5a56e849af58ebdc5948 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 9 Nov 2011 12:06:03 +0100 Subject: [PATCH 27/53] iommu/amd: Convert dev_table_entry to u64 Convert the contents of 'struct dev_table_entry' to u64 to allow updating the DTE wit 64bit writes as required by the spec. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 18 ++++++++++-------- drivers/iommu/amd_iommu_init.c | 12 ++++++------ drivers/iommu/amd_iommu_types.h | 4 ++-- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 4ee277a8521a..661e2bb4ac15 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -365,8 +365,8 @@ static void dump_dte_entry(u16 devid) { int i; - for (i = 0; i < 8; ++i) - pr_err("AMD-Vi: DTE[%d]: %08x\n", i, + for (i = 0; i < 4; ++i) + pr_err("AMD-Vi: DTE[%d]: %016llx\n", i, amd_iommu_dev_table[devid].data[i]); } @@ -1583,19 +1583,22 @@ static bool dma_ops_domain(struct protection_domain *domain) static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) { u64 pte_root = virt_to_phys(domain->pt_root); - u32 flags = 0; + u64 flags = 0; pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; + flags = amd_iommu_dev_table[devid].data[1]; + if (ats) flags |= DTE_FLAG_IOTLB; - amd_iommu_dev_table[devid].data[3] |= flags; - amd_iommu_dev_table[devid].data[2] = domain->id; - amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); - amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); + flags &= ~(0xffffUL); + flags |= domain->id; + + amd_iommu_dev_table[devid].data[1] = flags; + amd_iommu_dev_table[devid].data[0] = pte_root; } static void clear_dte_entry(u16 devid) @@ -1603,7 +1606,6 @@ static void clear_dte_entry(u16 devid) /* remove entry from the device table seen by the hardware */ amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; amd_iommu_dev_table[devid].data[1] = 0; - amd_iommu_dev_table[devid].data[2] = 0; amd_iommu_apply_erratum_63(devid); } diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 82d2410f4205..17e0f77c7dad 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -584,18 +584,18 @@ static void __init free_event_buffer(struct amd_iommu *iommu) /* sets a specific bit in the device table entry. */ static void set_dev_entry_bit(u16 devid, u8 bit) { - int i = (bit >> 5) & 0x07; - int _bit = bit & 0x1f; + int i = (bit >> 6) & 0x03; + int _bit = bit & 0x3f; - amd_iommu_dev_table[devid].data[i] |= (1 << _bit); + amd_iommu_dev_table[devid].data[i] |= (1UL << _bit); } static int get_dev_entry_bit(u16 devid, u8 bit) { - int i = (bit >> 5) & 0x07; - int _bit = bit & 0x1f; + int i = (bit >> 6) & 0x03; + int _bit = bit & 0x3f; - return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit; + return (amd_iommu_dev_table[devid].data[i] & (1UL << _bit)) >> _bit; } diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 5b9c5075e81a..f8dd9ae693d1 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -230,7 +230,7 @@ #define IOMMU_PTE_IR (1ULL << 61) #define IOMMU_PTE_IW (1ULL << 62) -#define DTE_FLAG_IOTLB 0x01 +#define DTE_FLAG_IOTLB (0x01UL << 32) #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) @@ -484,7 +484,7 @@ extern struct list_head amd_iommu_pd_list; * Structure defining one entry in the device table */ struct dev_table_entry { - u32 data[8]; + u64 data[4]; }; /* From 62f71abbc64d686064a4caa10a3249c26776995e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 10 Nov 2011 14:41:57 +0100 Subject: [PATCH 28/53] iommu/amd: Get the maximum number of PASIDs supported Read the number of PASIDs supported by each IOMMU in the system and take the smallest number as the maximum value supported by the IOMMU driver. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 13 +++++++++++++ drivers/iommu/amd_iommu_types.h | 6 ++++++ 2 files changed, 19 insertions(+) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 17e0f77c7dad..fb4afd6d357d 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -141,6 +141,8 @@ int amd_iommus_present; bool amd_iommu_np_cache __read_mostly; bool amd_iommu_iotlb_sup __read_mostly = true; +u32 amd_iommu_max_pasids __read_mostly = ~0; + /* * The ACPI table parsing functions set this variable on an error */ @@ -699,6 +701,17 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) iommu->features = ((u64)high << 32) | low; + if (iommu_feature(iommu, FEATURE_GT)) { + u32 pasids; + u64 shift; + + shift = iommu->features & FEATURE_PASID_MASK; + shift >>= FEATURE_PASID_SHIFT; + pasids = (1 << shift); + + amd_iommu_max_pasids = min(amd_iommu_max_pasids, pasids); + } + if (!is_rd890_iommu(iommu->dev)) return; diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index f8dd9ae693d1..0d984ca925be 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -87,6 +87,9 @@ #define FEATURE_HE (1ULL<<8) #define FEATURE_PC (1ULL<<9) +#define FEATURE_PASID_SHIFT 32 +#define FEATURE_PASID_MASK (0x1fULL << FEATURE_PASID_SHIFT) + /* MMIO status bits */ #define MMIO_STATUS_COM_WAIT_INT_MASK 0x04 @@ -549,6 +552,9 @@ extern unsigned long *amd_iommu_pd_alloc_bitmap; */ extern bool amd_iommu_unmap_flush; +/* Smallest number of PASIDs supported by any IOMMU in the system */ +extern u32 amd_iommu_max_pasids; + /* takes bus and device/function and returns the device id * FIXME: should that be in generic PCI code? */ static inline u16 calc_devid(u8 bus, u8 devfn) From 1a29ac014a68e5da8f96d5d8d142b5956eb00b7c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 10 Nov 2011 15:41:40 +0100 Subject: [PATCH 29/53] iommu/amd: Setup PPR log when supported by IOMMU Allocate and enable a log buffer for peripheral page faults when the IOMMU supports this feature. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 51 +++++++++++++++++++++++++++++++++ drivers/iommu/amd_iommu_types.h | 14 +++++++++ 2 files changed, 65 insertions(+) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index fb4afd6d357d..60716cefa7af 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -583,6 +583,46 @@ static void __init free_event_buffer(struct amd_iommu *iommu) free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE)); } +/* allocates the memory where the IOMMU will log its events to */ +static u8 * __init alloc_ppr_log(struct amd_iommu *iommu) +{ + iommu->ppr_log = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(PPR_LOG_SIZE)); + + if (iommu->ppr_log == NULL) + return NULL; + + return iommu->ppr_log; +} + +static void iommu_enable_ppr_log(struct amd_iommu *iommu) +{ + u64 entry; + + if (iommu->ppr_log == NULL) + return; + + entry = (u64)virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512; + + memcpy_toio(iommu->mmio_base + MMIO_PPR_LOG_OFFSET, + &entry, sizeof(entry)); + + /* set head and tail to zero manually */ + writel(0x00, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); + + iommu_feature_enable(iommu, CONTROL_PPFLOG_EN); + iommu_feature_enable(iommu, CONTROL_PPR_EN); +} + +static void __init free_ppr_log(struct amd_iommu *iommu) +{ + if (iommu->ppr_log == NULL) + return; + + free_pages((unsigned long)iommu->ppr_log, get_order(PPR_LOG_SIZE)); +} + /* sets a specific bit in the device table entry. */ static void set_dev_entry_bit(u16 devid, u8 bit) { @@ -914,6 +954,7 @@ static void __init free_iommu_one(struct amd_iommu *iommu) { free_command_buffer(iommu); free_event_buffer(iommu); + free_ppr_log(iommu); iommu_unmap_mmio_space(iommu); } @@ -977,6 +1018,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) init_iommu_from_acpi(iommu, h); init_iommu_devices(iommu); + if (iommu_feature(iommu, FEATURE_PPR)) { + iommu->ppr_log = alloc_ppr_log(iommu); + if (!iommu->ppr_log) + return -ENOMEM; + } + if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) amd_iommu_np_cache = true; @@ -1063,6 +1110,9 @@ static int iommu_setup_msi(struct amd_iommu *iommu) iommu->int_enabled = true; iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); + if (iommu->ppr_log != NULL) + iommu_feature_enable(iommu, CONTROL_PPFINT_EN); + return 0; } @@ -1287,6 +1337,7 @@ static void enable_iommus(void) iommu_set_device_table(iommu); iommu_enable_command_buffer(iommu); iommu_enable_event_buffer(iommu); + iommu_enable_ppr_log(iommu); iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); iommu_enable(iommu); diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 0d984ca925be..4dc230904fe9 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -69,11 +69,14 @@ #define MMIO_EXCL_BASE_OFFSET 0x0020 #define MMIO_EXCL_LIMIT_OFFSET 0x0028 #define MMIO_EXT_FEATURES 0x0030 +#define MMIO_PPR_LOG_OFFSET 0x0038 #define MMIO_CMD_HEAD_OFFSET 0x2000 #define MMIO_CMD_TAIL_OFFSET 0x2008 #define MMIO_EVT_HEAD_OFFSET 0x2010 #define MMIO_EVT_TAIL_OFFSET 0x2018 #define MMIO_STATUS_OFFSET 0x2020 +#define MMIO_PPR_HEAD_OFFSET 0x2030 +#define MMIO_PPR_TAIL_OFFSET 0x2038 /* Extended Feature Bits */ @@ -125,6 +128,7 @@ #define CONTROL_CMDBUF_EN 0x0cULL #define CONTROL_PPFLOG_EN 0x0dULL #define CONTROL_PPFINT_EN 0x0eULL +#define CONTROL_PPR_EN 0x0fULL /* command specific defines */ #define CMD_COMPL_WAIT 0x01 @@ -168,6 +172,13 @@ #define EVT_BUFFER_SIZE 8192 /* 512 entries */ #define EVT_LEN_MASK (0x9ULL << 56) +/* Constants for PPR Log handling */ +#define PPR_LOG_ENTRIES 512 +#define PPR_LOG_SIZE_SHIFT 56 +#define PPR_LOG_SIZE_512 (0x9ULL << PPR_LOG_SIZE_SHIFT) +#define PPR_ENTRY_SIZE 16 +#define PPR_LOG_SIZE (PPR_ENTRY_SIZE * PPR_LOG_ENTRIES) + #define PAGE_MODE_NONE 0x00 #define PAGE_MODE_1_LEVEL 0x01 #define PAGE_MODE_2_LEVEL 0x02 @@ -434,6 +445,9 @@ struct amd_iommu { /* MSI number for event interrupt */ u16 evt_msi_num; + /* Base of the PPR log, if present */ + u8 *ppr_log; + /* true if interrupts for this IOMMU are already enabled */ bool int_enabled; From cbc33a9085995e21f52a66380d108d64916b6787 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Nov 2011 11:41:31 +0100 Subject: [PATCH 30/53] iommu/amd: Enable GT mode when supported by IOMMU This feature needs to be enabled before IOMMUv2 DTEs can be set up. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 9 +++++++++ drivers/iommu/amd_iommu_types.h | 1 + 2 files changed, 10 insertions(+) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 60716cefa7af..2c25ae306e7c 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -623,6 +623,14 @@ static void __init free_ppr_log(struct amd_iommu *iommu) free_pages((unsigned long)iommu->ppr_log, get_order(PPR_LOG_SIZE)); } +static void iommu_enable_gt(struct amd_iommu *iommu) +{ + if (!iommu_feature(iommu, FEATURE_GT)) + return; + + iommu_feature_enable(iommu, CONTROL_GT_EN); +} + /* sets a specific bit in the device table entry. */ static void set_dev_entry_bit(u16 devid, u8 bit) { @@ -1338,6 +1346,7 @@ static void enable_iommus(void) iommu_enable_command_buffer(iommu); iommu_enable_event_buffer(iommu); iommu_enable_ppr_log(iommu); + iommu_enable_gt(iommu); iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); iommu_enable(iommu); diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 4dc230904fe9..a6e1dc616afe 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -129,6 +129,7 @@ #define CONTROL_PPFLOG_EN 0x0dULL #define CONTROL_PPFINT_EN 0x0eULL #define CONTROL_PPR_EN 0x0fULL +#define CONTROL_GT_EN 0x10ULL /* command specific defines */ #define CMD_COMPL_WAIT 0x01 From 400a28a05f2cc1a311acb4ff6ac64d8402d21678 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 28 Nov 2011 15:11:02 +0100 Subject: [PATCH 31/53] iommu/amd: Add iommuv2 flag to struct amd_iommu In mixed IOMMU setups this flag inidicates whether an IOMMU supports the v2 features or not. This patch also adds a global flag together with a function to query that flag from other code. The flag shows if at least one IOMMUv2 is in the system. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 15 +++++++++++++++ drivers/iommu/amd_iommu_proto.h | 3 +++ drivers/iommu/amd_iommu_types.h | 5 +++++ 3 files changed, 23 insertions(+) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 2c25ae306e7c..d1e5067852d4 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -143,6 +144,8 @@ bool amd_iommu_iotlb_sup __read_mostly = true; u32 amd_iommu_max_pasids __read_mostly = ~0; +bool amd_iommu_v2_present __read_mostly; + /* * The ACPI table parsing functions set this variable on an error */ @@ -760,6 +763,12 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) amd_iommu_max_pasids = min(amd_iommu_max_pasids, pasids); } + if (iommu_feature(iommu, FEATURE_GT) && + iommu_feature(iommu, FEATURE_PPR)) { + iommu->is_iommu_v2 = true; + amd_iommu_v2_present = true; + } + if (!is_rd890_iommu(iommu->dev)) return; @@ -1645,3 +1654,9 @@ IOMMU_INIT_FINISH(amd_iommu_detect, gart_iommu_hole_init, 0, 0); + +bool amd_iommu_v2_supported(void) +{ + return amd_iommu_v2_present; +} +EXPORT_SYMBOL(amd_iommu_v2_supported); diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index 7ffaa64410b0..3a46c300dffb 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -31,6 +31,9 @@ extern int amd_iommu_init_devices(void); extern void amd_iommu_uninit_devices(void); extern void amd_iommu_init_notifier(void); extern void amd_iommu_init_api(void); + +extern bool amd_iommu_v2_supported(void); + #ifndef CONFIG_AMD_IOMMU_STATS static inline void amd_iommu_stats_init(void) { } diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index a6e1dc616afe..7e81094ab73b 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -414,6 +414,9 @@ struct amd_iommu { /* Extended features */ u64 features; + /* IOMMUv2 */ + bool is_iommu_v2; + /* * Capability pointer. There could be more than one IOMMU per PCI * device function if there are more than one AMD IOMMU capability @@ -570,6 +573,8 @@ extern bool amd_iommu_unmap_flush; /* Smallest number of PASIDs supported by any IOMMU in the system */ extern u32 amd_iommu_max_pasids; +extern bool amd_iommu_v2_present; + /* takes bus and device/function and returns the device id * FIXME: should that be in generic PCI code? */ static inline u16 calc_devid(u8 bus, u8 devfn) From 5abcdba4fa535c29f736455e37229ee97e0e7f5d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 1 Dec 2011 15:49:45 +0100 Subject: [PATCH 32/53] iommu/amd: Put IOMMUv2 capable devices in pt_domain If the device starts to use IOMMUv2 features the dma handles need to stay valid. The only sane way to do this is to use a identity mapping for the device and not translate it by the iommu. This is implemented with this patch. Since this lifts the device-isolation there is also a new kernel parameter which allows to disable that feature. Signed-off-by: Joerg Roedel --- Documentation/kernel-parameters.txt | 5 ++ drivers/iommu/amd_iommu.c | 94 ++++++++++++++++++++++++----- drivers/iommu/amd_iommu_init.c | 4 ++ drivers/iommu/amd_iommu_types.h | 4 ++ 4 files changed, 91 insertions(+), 16 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 81c287fad79d..22152ed148a5 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -329,6 +329,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. is a lot of faster off - do not initialize any AMD IOMMU found in the system + force_isolation - Force device isolation for all + devices. The IOMMU driver is not + allowed anymore to lift isolation + requirements as needed. This option + does not override iommu=pt amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 661e2bb4ac15..7ccfc80ceb7a 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -67,6 +67,7 @@ struct iommu_cmd { }; static void update_domain(struct protection_domain *domain); +static int __init alloc_passthrough_domain(void); /**************************************************************************** * @@ -147,6 +148,24 @@ static struct iommu_dev_data *get_dev_data(struct device *dev) return dev->archdata.iommu; } +static bool pci_iommuv2_capable(struct pci_dev *pdev) +{ + static const int caps[] = { + PCI_EXT_CAP_ID_ATS, + PCI_PRI_CAP, + PCI_PASID_CAP, + }; + int i, pos; + + for (i = 0; i < 3; ++i) { + pos = pci_find_ext_capability(pdev, caps[i]); + if (pos == 0) + return false; + } + + return true; +} + /* * In this function the list of preallocated protection domains is traversed to * find the domain for a specific device @@ -204,6 +223,7 @@ static bool check_device(struct device *dev) static int iommu_init_device(struct device *dev) { + struct pci_dev *pdev = to_pci_dev(dev); struct iommu_dev_data *dev_data; u16 alias; @@ -228,6 +248,13 @@ static int iommu_init_device(struct device *dev) dev_data->alias_data = alias_data; } + if (pci_iommuv2_capable(pdev)) { + struct amd_iommu *iommu; + + iommu = amd_iommu_rlookup_table[dev_data->devid]; + dev_data->iommu_v2 = iommu->is_iommu_v2; + } + dev->archdata.iommu = dev_data; return 0; @@ -1762,7 +1789,7 @@ static void __detach_device(struct iommu_dev_data *dev_data) * passthrough domain if it is detached from any other domain. * Make sure we can deassign from the pt_domain itself. */ - if (iommu_pass_through && + if (dev_data->passthrough && (dev_data->domain == NULL && domain != pt_domain)) __attach_device(dev_data, pt_domain); } @@ -1820,18 +1847,20 @@ static struct protection_domain *domain_for_device(struct device *dev) static int device_change_notifier(struct notifier_block *nb, unsigned long action, void *data) { - struct device *dev = data; - u16 devid; - struct protection_domain *domain; struct dma_ops_domain *dma_domain; + struct protection_domain *domain; + struct iommu_dev_data *dev_data; + struct device *dev = data; struct amd_iommu *iommu; unsigned long flags; + u16 devid; if (!check_device(dev)) return 0; - devid = get_device_id(dev); - iommu = amd_iommu_rlookup_table[devid]; + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); switch (action) { case BUS_NOTIFY_UNBOUND_DRIVER: @@ -1840,7 +1869,7 @@ static int device_change_notifier(struct notifier_block *nb, if (!domain) goto out; - if (iommu_pass_through) + if (dev_data->passthrough) break; detach_device(dev); break; @@ -2436,8 +2465,9 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask) */ static void prealloc_protection_domains(void) { - struct pci_dev *dev = NULL; + struct iommu_dev_data *dev_data; struct dma_ops_domain *dma_dom; + struct pci_dev *dev = NULL; u16 devid; for_each_pci_dev(dev) { @@ -2446,6 +2476,16 @@ static void prealloc_protection_domains(void) if (!check_device(&dev->dev)) continue; + dev_data = get_dev_data(&dev->dev); + if (!amd_iommu_force_isolation && dev_data->iommu_v2) { + /* Make sure passthrough domain is allocated */ + alloc_passthrough_domain(); + dev_data->passthrough = true; + attach_device(&dev->dev, pt_domain); + pr_info("AMD-Vi: Using passthough domain for device %s\n", + dev_name(&dev->dev)); + } + /* Is there already any domain for it? */ if (domain_for_device(&dev->dev)) continue; @@ -2476,6 +2516,7 @@ static struct dma_map_ops amd_iommu_dma_ops = { static unsigned device_dma_ops_init(void) { + struct iommu_dev_data *dev_data; struct pci_dev *pdev = NULL; unsigned unhandled = 0; @@ -2485,7 +2526,12 @@ static unsigned device_dma_ops_init(void) continue; } - pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops; + dev_data = get_dev_data(&pdev->dev); + + if (!dev_data->passthrough) + pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops; + else + pdev->dev.archdata.dma_ops = &nommu_dma_ops; } return unhandled; @@ -2612,6 +2658,20 @@ out_err: return NULL; } +static int __init alloc_passthrough_domain(void) +{ + if (pt_domain != NULL) + return 0; + + /* allocate passthrough domain */ + pt_domain = protection_domain_alloc(); + if (!pt_domain) + return -ENOMEM; + + pt_domain->mode = PAGE_MODE_NONE; + + return 0; +} static int amd_iommu_domain_init(struct iommu_domain *dom) { struct protection_domain *domain; @@ -2798,21 +2858,23 @@ static struct iommu_ops amd_iommu_ops = { int __init amd_iommu_init_passthrough(void) { - struct amd_iommu *iommu; + struct iommu_dev_data *dev_data; struct pci_dev *dev = NULL; + struct amd_iommu *iommu; u16 devid; + int ret; - /* allocate passthrough domain */ - pt_domain = protection_domain_alloc(); - if (!pt_domain) - return -ENOMEM; - - pt_domain->mode |= PAGE_MODE_NONE; + ret = alloc_passthrough_domain(); + if (ret) + return ret; for_each_pci_dev(dev) { if (!check_device(&dev->dev)) continue; + dev_data = get_dev_data(&dev->dev); + dev_data->passthrough = true; + devid = get_device_id(&dev->dev); iommu = amd_iommu_rlookup_table[devid]; diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index d1e5067852d4..7c3fd572a23b 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -146,6 +146,8 @@ u32 amd_iommu_max_pasids __read_mostly = ~0; bool amd_iommu_v2_present __read_mostly; +bool amd_iommu_force_isolation __read_mostly; + /* * The ACPI table parsing functions set this variable on an error */ @@ -1642,6 +1644,8 @@ static int __init parse_amd_iommu_options(char *str) amd_iommu_unmap_flush = true; if (strncmp(str, "off", 3) == 0) amd_iommu_disabled = true; + if (strncmp(str, "force_isolation", 15) == 0) + amd_iommu_force_isolation = true; } return 1; diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 7e81094ab73b..96c652fae0e5 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -330,6 +330,8 @@ struct iommu_dev_data { struct protection_domain *domain; /* Domain the device is bound to */ atomic_t bind; /* Domain attach reverent count */ u16 devid; /* PCI Device ID */ + bool iommu_v2; /* Device can make use of IOMMUv2 */ + bool passthrough; /* Default for device is pt_domain */ struct { bool enabled; int qdep; @@ -575,6 +577,8 @@ extern u32 amd_iommu_max_pasids; extern bool amd_iommu_v2_present; +extern bool amd_iommu_force_isolation; + /* takes bus and device/function and returns the device id * FIXME: should that be in generic PCI code? */ static inline u16 calc_devid(u8 bus, u8 devfn) From 72e1dcc4192288ad5e37888aa1dbb23b3ef4aa9a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 10 Nov 2011 19:13:51 +0100 Subject: [PATCH 33/53] iommu/amd: Implement notifier for PPR faults Add a notifer at which a module can attach to get informed about incoming PPR faults. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 90 ++++++++++++++++++++++++++++++++- drivers/iommu/amd_iommu_proto.h | 3 ++ drivers/iommu/amd_iommu_types.h | 34 ++++++++++++- 3 files changed, 125 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 7ccfc80ceb7a..db9b788c28ba 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -17,6 +17,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include #include @@ -28,6 +29,8 @@ #include #include #include +#include +#include #include #include #include @@ -59,6 +62,8 @@ static struct protection_domain *pt_domain; static struct iommu_ops amd_iommu_ops; +static ATOMIC_NOTIFIER_HEAD(ppr_notifier); + /* * general struct to manage commands send to an IOMMU */ @@ -488,12 +493,82 @@ static void iommu_poll_events(struct amd_iommu *iommu) spin_unlock_irqrestore(&iommu->lock, flags); } +static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u32 head) +{ + struct amd_iommu_fault fault; + volatile u64 *raw; + int i; + + raw = (u64 *)(iommu->ppr_log + head); + + /* + * Hardware bug: Interrupt may arrive before the entry is written to + * memory. If this happens we need to wait for the entry to arrive. + */ + for (i = 0; i < LOOP_TIMEOUT; ++i) { + if (PPR_REQ_TYPE(raw[0]) != 0) + break; + udelay(1); + } + + if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) { + pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n"); + return; + } + + fault.address = raw[1]; + fault.pasid = PPR_PASID(raw[0]); + fault.device_id = PPR_DEVID(raw[0]); + fault.tag = PPR_TAG(raw[0]); + fault.flags = PPR_FLAGS(raw[0]); + + /* + * To detect the hardware bug we need to clear the entry + * to back to zero. + */ + raw[0] = raw[1] = 0; + + atomic_notifier_call_chain(&ppr_notifier, 0, &fault); +} + +static void iommu_poll_ppr_log(struct amd_iommu *iommu) +{ + unsigned long flags; + u32 head, tail; + + if (iommu->ppr_log == NULL) + return; + + spin_lock_irqsave(&iommu->lock, flags); + + head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); + + while (head != tail) { + + /* Handle PPR entry */ + iommu_handle_ppr_entry(iommu, head); + + /* Update and refresh ring-buffer state*/ + head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE; + writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); + } + + /* enable ppr interrupts again */ + writel(MMIO_STATUS_PPR_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET); + + spin_unlock_irqrestore(&iommu->lock, flags); +} + irqreturn_t amd_iommu_int_thread(int irq, void *data) { struct amd_iommu *iommu; - for_each_iommu(iommu) + for_each_iommu(iommu) { iommu_poll_events(iommu); + iommu_poll_ppr_log(iommu); + } return IRQ_HANDLED; } @@ -2888,3 +2963,16 @@ int __init amd_iommu_init_passthrough(void) return 0; } + +/* IOMMUv2 specific functions */ +int amd_iommu_register_ppr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&ppr_notifier, nb); +} +EXPORT_SYMBOL(amd_iommu_register_ppr_notifier); + +int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&ppr_notifier, nb); +} +EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier); diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index 3a46c300dffb..cfe2dfc64523 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -32,7 +32,10 @@ extern void amd_iommu_uninit_devices(void); extern void amd_iommu_init_notifier(void); extern void amd_iommu_init_api(void); +/* IOMMUv2 specific functions */ extern bool amd_iommu_v2_supported(void); +extern int amd_iommu_register_ppr_notifier(struct notifier_block *nb); +extern int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb); #ifndef CONFIG_AMD_IOMMU_STATS diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 96c652fae0e5..c9e080cf595f 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -94,7 +94,8 @@ #define FEATURE_PASID_MASK (0x1fULL << FEATURE_PASID_SHIFT) /* MMIO status bits */ -#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04 +#define MMIO_STATUS_COM_WAIT_INT_MASK (1 << 2) +#define MMIO_STATUS_PPR_INT_MASK (1 << 6) /* event logging constants */ #define EVENT_ENTRY_SIZE 0x10 @@ -180,6 +181,16 @@ #define PPR_ENTRY_SIZE 16 #define PPR_LOG_SIZE (PPR_ENTRY_SIZE * PPR_LOG_ENTRIES) +#define PPR_REQ_TYPE(x) (((x) >> 60) & 0xfULL) +#define PPR_FLAGS(x) (((x) >> 48) & 0xfffULL) +#define PPR_DEVID(x) ((x) & 0xffffULL) +#define PPR_TAG(x) (((x) >> 32) & 0x3ffULL) +#define PPR_PASID1(x) (((x) >> 16) & 0xffffULL) +#define PPR_PASID2(x) (((x) >> 42) & 0xfULL) +#define PPR_PASID(x) ((PPR_PASID2(x) << 16) | PPR_PASID1(x)) + +#define PPR_REQ_FAULT 0x01 + #define PAGE_MODE_NONE 0x00 #define PAGE_MODE_1_LEVEL 0x01 #define PAGE_MODE_2_LEVEL 0x02 @@ -300,6 +311,27 @@ extern bool amd_iommu_iotlb_sup; #define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT) #define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL) + +/* + * This struct is used to pass information about + * incoming PPR faults around. + */ +struct amd_iommu_fault { + u64 address; /* IO virtual address of the fault*/ + u32 pasid; /* Address space identifier */ + u16 device_id; /* Originating PCI device id */ + u16 tag; /* PPR tag */ + u16 flags; /* Fault flags */ + +}; + +#define PPR_FAULT_EXEC (1 << 1) +#define PPR_FAULT_READ (1 << 2) +#define PPR_FAULT_WRITE (1 << 5) +#define PPR_FAULT_USER (1 << 6) +#define PPR_FAULT_RSVD (1 << 7) +#define PPR_FAULT_GN (1 << 8) + /* * This structure contains generic data for IOMMU protection domains * independent of their use. From 132bd68f180dd5de9176e20532910503f6393f14 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 17 Nov 2011 14:18:46 +0100 Subject: [PATCH 34/53] iommu/amd: Add amd_iommu_domain_direct_map function This function can be used to switch a domain into paging-mode 0. In this mode all devices can access physical system memory directly without any remapping. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 38 +++++++++++++++++++++++++++++++-- drivers/iommu/amd_iommu_proto.h | 3 +++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index db9b788c28ba..6ed536769102 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1684,9 +1684,12 @@ static bool dma_ops_domain(struct protection_domain *domain) static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) { - u64 pte_root = virt_to_phys(domain->pt_root); + u64 pte_root = 0; u64 flags = 0; + if (domain->mode != PAGE_MODE_NONE) + pte_root = virt_to_phys(domain->pt_root); + pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; @@ -2782,7 +2785,8 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) BUG_ON(domain->dev_cnt != 0); - free_pagetable(domain); + if (domain->mode != PAGE_MODE_NONE) + free_pagetable(domain); protection_domain_free(domain); @@ -2846,6 +2850,9 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, int prot = 0; int ret; + if (domain->mode == PAGE_MODE_NONE) + return -EINVAL; + if (iommu_prot & IOMMU_READ) prot |= IOMMU_PROT_IR; if (iommu_prot & IOMMU_WRITE) @@ -2864,6 +2871,9 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, struct protection_domain *domain = dom->priv; unsigned long page_size, unmap_size; + if (domain->mode == PAGE_MODE_NONE) + return -EINVAL; + page_size = 0x1000UL << gfp_order; mutex_lock(&domain->api_lock); @@ -2883,6 +2893,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, phys_addr_t paddr; u64 *pte, __pte; + if (domain->mode == PAGE_MODE_NONE) + return iova; + pte = fetch_pte(domain, iova); if (!pte || !IOMMU_PTE_PRESENT(*pte)) @@ -2976,3 +2989,24 @@ int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb) return atomic_notifier_chain_unregister(&ppr_notifier, nb); } EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier); + +void amd_iommu_domain_direct_map(struct iommu_domain *dom) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + + spin_lock_irqsave(&domain->lock, flags); + + /* Update data structure */ + domain->mode = PAGE_MODE_NONE; + domain->updated = true; + + /* Make changes visible to IOMMUs */ + update_domain(domain); + + /* Page-table is not visible to IOMMU anymore, so free it */ + free_pagetable(domain); + + spin_unlock_irqrestore(&domain->lock, flags); +} +EXPORT_SYMBOL(amd_iommu_domain_direct_map); diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index cfe2dfc64523..2c4554e963c7 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -33,9 +33,12 @@ extern void amd_iommu_init_notifier(void); extern void amd_iommu_init_api(void); /* IOMMUv2 specific functions */ +struct iommu_domain; + extern bool amd_iommu_v2_supported(void); extern int amd_iommu_register_ppr_notifier(struct notifier_block *nb); extern int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb); +extern void amd_iommu_domain_direct_map(struct iommu_domain *dom); #ifndef CONFIG_AMD_IOMMU_STATS From 52815b75682e25db45545911fd2b09ef5856e695 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 17 Nov 2011 17:24:28 +0100 Subject: [PATCH 35/53] iommu/amd: Add support for IOMMUv2 domain mode This patch adds support for protection domains that implement two-level paging for devices. Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 4 +- drivers/iommu/amd_iommu.c | 144 +++++++++++++++++++++++++++++++- drivers/iommu/amd_iommu_init.c | 9 ++ drivers/iommu/amd_iommu_proto.h | 1 + drivers/iommu/amd_iommu_types.h | 27 ++++++ 5 files changed, 180 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 5414253b185a..220dfc24b311 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -34,7 +34,9 @@ config AMD_IOMMU bool "AMD IOMMU support" select SWIOTLB select PCI_MSI - select PCI_IOV + select PCI_ATS + select PCI_PRI + select PCI_PASID select IOMMU_API depends on X86_64 && PCI && ACPI ---help--- diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 6ed536769102..7dda0d4a8f8c 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -63,6 +63,7 @@ static struct protection_domain *pt_domain; static struct iommu_ops amd_iommu_ops; static ATOMIC_NOTIFIER_HEAD(ppr_notifier); +int amd_iommu_max_glx_val = -1; /* * general struct to manage commands send to an IOMMU @@ -1598,6 +1599,11 @@ static void free_pagetable(struct protection_domain *domain) domain->pt_root = NULL; } +static void free_gcr3_table(struct protection_domain *domain) +{ + free_page((unsigned long)domain->gcr3_tbl); +} + /* * Free a domain, only used if something went wrong in the * allocation path and we need to free an already allocated page table @@ -1699,6 +1705,32 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) if (ats) flags |= DTE_FLAG_IOTLB; + if (domain->flags & PD_IOMMUV2_MASK) { + u64 gcr3 = __pa(domain->gcr3_tbl); + u64 glx = domain->glx; + u64 tmp; + + pte_root |= DTE_FLAG_GV; + pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT; + + /* First mask out possible old values for GCR3 table */ + tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B; + flags &= ~tmp; + + tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C; + flags &= ~tmp; + + /* Encode GCR3 table into DTE */ + tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A; + pte_root |= tmp; + + tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B; + flags |= tmp; + + tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C; + flags |= tmp; + } + flags &= ~(0xffffUL); flags |= domain->id; @@ -1803,6 +1835,46 @@ out_unlock: return ret; } + +static void pdev_iommuv2_disable(struct pci_dev *pdev) +{ + pci_disable_ats(pdev); + pci_disable_pri(pdev); + pci_disable_pasid(pdev); +} + +static int pdev_iommuv2_enable(struct pci_dev *pdev) +{ + int ret; + + /* Only allow access to user-accessible pages */ + ret = pci_enable_pasid(pdev, 0); + if (ret) + goto out_err; + + /* First reset the PRI state of the device */ + ret = pci_reset_pri(pdev); + if (ret) + goto out_err; + + /* FIXME: Hardcode number of outstanding requests for now */ + ret = pci_enable_pri(pdev, 32); + if (ret) + goto out_err; + + ret = pci_enable_ats(pdev, PAGE_SHIFT); + if (ret) + goto out_err; + + return 0; + +out_err: + pci_disable_pri(pdev); + pci_disable_pasid(pdev); + + return ret; +} + /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware @@ -1817,7 +1889,17 @@ static int attach_device(struct device *dev, dev_data = get_dev_data(dev); - if (amd_iommu_iotlb_sup && pci_enable_ats(pdev, PAGE_SHIFT) == 0) { + if (domain->flags & PD_IOMMUV2_MASK) { + if (!dev_data->iommu_v2 || !dev_data->passthrough) + return -EINVAL; + + if (pdev_iommuv2_enable(pdev) != 0) + return -EINVAL; + + dev_data->ats.enabled = true; + dev_data->ats.qdep = pci_ats_queue_depth(pdev); + } else if (amd_iommu_iotlb_sup && + pci_enable_ats(pdev, PAGE_SHIFT) == 0) { dev_data->ats.enabled = true; dev_data->ats.qdep = pci_ats_queue_depth(pdev); } @@ -1877,20 +1959,24 @@ static void __detach_device(struct iommu_dev_data *dev_data) */ static void detach_device(struct device *dev) { + struct protection_domain *domain; struct iommu_dev_data *dev_data; unsigned long flags; dev_data = get_dev_data(dev); + domain = dev_data->domain; /* lock device table */ write_lock_irqsave(&amd_iommu_devtable_lock, flags); __detach_device(dev_data); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - if (dev_data->ats.enabled) { + if (domain->flags & PD_IOMMUV2_MASK) + pdev_iommuv2_disable(to_pci_dev(dev)); + else if (dev_data->ats.enabled) pci_disable_ats(to_pci_dev(dev)); - dev_data->ats.enabled = false; - } + + dev_data->ats.enabled = false; } /* @@ -2788,6 +2874,9 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) if (domain->mode != PAGE_MODE_NONE) free_pagetable(domain); + if (domain->flags & PD_IOMMUV2_MASK) + free_gcr3_table(domain); + protection_domain_free(domain); dom->priv = NULL; @@ -3010,3 +3099,50 @@ void amd_iommu_domain_direct_map(struct iommu_domain *dom) spin_unlock_irqrestore(&domain->lock, flags); } EXPORT_SYMBOL(amd_iommu_domain_direct_map); + +int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int levels, ret; + + if (pasids <= 0 || pasids > (PASID_MASK + 1)) + return -EINVAL; + + /* Number of GCR3 table levels required */ + for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9) + levels += 1; + + if (levels > amd_iommu_max_glx_val) + return -EINVAL; + + spin_lock_irqsave(&domain->lock, flags); + + /* + * Save us all sanity checks whether devices already in the + * domain support IOMMUv2. Just force that the domain has no + * devices attached when it is switched into IOMMUv2 mode. + */ + ret = -EBUSY; + if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK) + goto out; + + ret = -ENOMEM; + domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC); + if (domain->gcr3_tbl == NULL) + goto out; + + domain->glx = levels; + domain->flags |= PD_IOMMUV2_MASK; + domain->updated = true; + + update_domain(domain); + + ret = 0; + +out: + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_domain_enable_v2); diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 7c3fd572a23b..c7a5d7e14547 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -755,6 +755,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) iommu->features = ((u64)high << 32) | low; if (iommu_feature(iommu, FEATURE_GT)) { + int glxval; u32 pasids; u64 shift; @@ -763,6 +764,14 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) pasids = (1 << shift); amd_iommu_max_pasids = min(amd_iommu_max_pasids, pasids); + + glxval = iommu->features & FEATURE_GLXVAL_MASK; + glxval >>= FEATURE_GLXVAL_SHIFT; + + if (amd_iommu_max_glx_val == -1) + amd_iommu_max_glx_val = glxval; + else + amd_iommu_max_glx_val = min(amd_iommu_max_glx_val, glxval); } if (iommu_feature(iommu, FEATURE_GT) && diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index 2c4554e963c7..d207b1d951b2 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -39,6 +39,7 @@ extern bool amd_iommu_v2_supported(void); extern int amd_iommu_register_ppr_notifier(struct notifier_block *nb); extern int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb); extern void amd_iommu_domain_direct_map(struct iommu_domain *dom); +extern int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids); #ifndef CONFIG_AMD_IOMMU_STATS diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index c9e080cf595f..b7583cb5ad66 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -93,6 +93,11 @@ #define FEATURE_PASID_SHIFT 32 #define FEATURE_PASID_MASK (0x1fULL << FEATURE_PASID_SHIFT) +#define FEATURE_GLXVAL_SHIFT 14 +#define FEATURE_GLXVAL_MASK (0x03ULL << FEATURE_GLXVAL_SHIFT) + +#define PASID_MASK 0x000fffff + /* MMIO status bits */ #define MMIO_STATUS_COM_WAIT_INT_MASK (1 << 2) #define MMIO_STATUS_PPR_INT_MASK (1 << 6) @@ -257,6 +262,22 @@ #define IOMMU_PTE_IW (1ULL << 62) #define DTE_FLAG_IOTLB (0x01UL << 32) +#define DTE_FLAG_GV (0x01ULL << 55) +#define DTE_GLX_SHIFT (56) +#define DTE_GLX_MASK (3) + +#define DTE_GCR3_VAL_A(x) (((x) >> 12) & 0x00007ULL) +#define DTE_GCR3_VAL_B(x) (((x) >> 15) & 0x0ffffULL) +#define DTE_GCR3_VAL_C(x) (((x) >> 31) & 0xfffffULL) + +#define DTE_GCR3_INDEX_A 0 +#define DTE_GCR3_INDEX_B 1 +#define DTE_GCR3_INDEX_C 1 + +#define DTE_GCR3_SHIFT_A 58 +#define DTE_GCR3_SHIFT_B 16 +#define DTE_GCR3_SHIFT_C 43 + #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) @@ -283,6 +304,7 @@ domain for an IOMMU */ #define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page translation */ +#define PD_IOMMUV2_MASK (1UL << 3) /* domain has gcr3 table */ extern bool amd_iommu_dump; #define DUMP_printk(format, arg...) \ @@ -344,6 +366,8 @@ struct protection_domain { u16 id; /* the domain id written to the device table */ int mode; /* paging mode (0-6 levels) */ u64 *pt_root; /* page table root pointer */ + int glx; /* Number of levels for GCR3 table */ + u64 *gcr3_tbl; /* Guest CR3 table */ unsigned long flags; /* flags to find out type of domain */ bool updated; /* complete domain flush required */ unsigned dev_cnt; /* devices assigned to this domain */ @@ -611,6 +635,9 @@ extern bool amd_iommu_v2_present; extern bool amd_iommu_force_isolation; +/* Max levels of glxval supported */ +extern int amd_iommu_max_glx_val; + /* takes bus and device/function and returns the device id * FIXME: should that be in generic PCI code? */ static inline u16 calc_devid(u8 bus, u8 devfn) From 22e266c79b5bd5441243863c89ea237e6e845295 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 21 Nov 2011 15:59:08 +0100 Subject: [PATCH 36/53] iommu/amd: Implement IOMMUv2 TLB flushing routines The functions added with this patch allow to manage the IOMMU and the device TLBs for all devices in an IOMMUv2 domain. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 136 ++++++++++++++++++++++++++++++++ drivers/iommu/amd_iommu_proto.h | 3 + drivers/iommu/amd_iommu_types.h | 1 + 3 files changed, 140 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 7dda0d4a8f8c..b0861739a27d 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -698,6 +698,44 @@ static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; } +static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid, + u64 address, bool size) +{ + memset(cmd, 0, sizeof(*cmd)); + + address &= ~(0xfffULL); + + cmd->data[0] = pasid & PASID_MASK; + cmd->data[1] = domid; + cmd->data[2] = lower_32_bits(address); + cmd->data[3] = upper_32_bits(address); + cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; + if (size) + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); +} + +static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid, + int qdep, u64 address, bool size) +{ + memset(cmd, 0, sizeof(*cmd)); + + address &= ~(0xfffULL); + + cmd->data[0] = devid; + cmd->data[0] |= (pasid & 0xff) << 16; + cmd->data[0] |= (qdep & 0xff) << 24; + cmd->data[1] = devid; + cmd->data[1] |= ((pasid >> 8) & 0xfff) << 16; + cmd->data[2] = lower_32_bits(address); + cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; + cmd->data[3] = upper_32_bits(address); + if (size) + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); +} + static void build_inv_all(struct iommu_cmd *cmd) { memset(cmd, 0, sizeof(*cmd)); @@ -3146,3 +3184,101 @@ out: return ret; } EXPORT_SYMBOL(amd_iommu_domain_enable_v2); + +static int __flush_pasid(struct protection_domain *domain, int pasid, + u64 address, bool size) +{ + struct iommu_dev_data *dev_data; + struct iommu_cmd cmd; + int i, ret; + + if (!(domain->flags & PD_IOMMUV2_MASK)) + return -EINVAL; + + build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size); + + /* + * IOMMU TLB needs to be flushed before Device TLB to + * prevent device TLB refill from IOMMU TLB + */ + for (i = 0; i < amd_iommus_present; ++i) { + if (domain->dev_iommu[i] == 0) + continue; + + ret = iommu_queue_command(amd_iommus[i], &cmd); + if (ret != 0) + goto out; + } + + /* Wait until IOMMU TLB flushes are complete */ + domain_flush_complete(domain); + + /* Now flush device TLBs */ + list_for_each_entry(dev_data, &domain->dev_list, list) { + struct amd_iommu *iommu; + int qdep; + + BUG_ON(!dev_data->ats.enabled); + + qdep = dev_data->ats.qdep; + iommu = amd_iommu_rlookup_table[dev_data->devid]; + + build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid, + qdep, address, size); + + ret = iommu_queue_command(iommu, &cmd); + if (ret != 0) + goto out; + } + + /* Wait until all device TLBs are flushed */ + domain_flush_complete(domain); + + ret = 0; + +out: + + return ret; +} + +static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid, + u64 address) +{ + return __flush_pasid(domain, pasid, address, false); +} + +int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, + u64 address) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int ret; + + spin_lock_irqsave(&domain->lock, flags); + ret = __amd_iommu_flush_page(domain, pasid, address); + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_flush_page); + +static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid) +{ + return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + true); +} + +int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int ret; + + spin_lock_irqsave(&domain->lock, flags); + ret = __amd_iommu_flush_tlb(domain, pasid); + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_flush_tlb); + diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index d207b1d951b2..a92dc6117e2f 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -40,6 +40,9 @@ extern int amd_iommu_register_ppr_notifier(struct notifier_block *nb); extern int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb); extern void amd_iommu_domain_direct_map(struct iommu_domain *dom); extern int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids); +extern int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, + u64 address); +extern int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid); #ifndef CONFIG_AMD_IOMMU_STATS diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index b7583cb5ad66..ff1dfe9ad579 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -148,6 +148,7 @@ #define CMD_COMPL_WAIT_INT_MASK 0x02 #define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01 #define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02 +#define CMD_INV_IOMMU_PAGES_GN_MASK 0x04 #define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL From b16137b11b4b4d4bb27b61bba7e05f5fda5968f4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 21 Nov 2011 16:50:23 +0100 Subject: [PATCH 37/53] iommu/amd: Implement functions to manage GCR3 table This patch adds functions necessary to set and clear the GCR3 values associated with a particular PASID in an IOMMUv2 domain. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 130 ++++++++++++++++++++++++++++++++ drivers/iommu/amd_iommu_proto.h | 4 + drivers/iommu/amd_iommu_types.h | 1 + 3 files changed, 135 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index b0861739a27d..65a118ce626e 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1637,8 +1637,45 @@ static void free_pagetable(struct protection_domain *domain) domain->pt_root = NULL; } +static void free_gcr3_tbl_level1(u64 *tbl) +{ + u64 *ptr; + int i; + + for (i = 0; i < 512; ++i) { + if (!(tbl[i] & GCR3_VALID)) + continue; + + ptr = __va(tbl[i] & PAGE_MASK); + + free_page((unsigned long)ptr); + } +} + +static void free_gcr3_tbl_level2(u64 *tbl) +{ + u64 *ptr; + int i; + + for (i = 0; i < 512; ++i) { + if (!(tbl[i] & GCR3_VALID)) + continue; + + ptr = __va(tbl[i] & PAGE_MASK); + + free_gcr3_tbl_level1(ptr); + } +} + static void free_gcr3_table(struct protection_domain *domain) { + if (domain->glx == 2) + free_gcr3_tbl_level2(domain->gcr3_tbl); + else if (domain->glx == 1) + free_gcr3_tbl_level1(domain->gcr3_tbl); + else if (domain->glx != 0) + BUG(); + free_page((unsigned long)domain->gcr3_tbl); } @@ -3282,3 +3319,96 @@ int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid) } EXPORT_SYMBOL(amd_iommu_flush_tlb); +static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc) +{ + int index; + u64 *pte; + + while (true) { + + index = (pasid >> (9 * level)) & 0x1ff; + pte = &root[index]; + + if (level == 0) + break; + + if (!(*pte & GCR3_VALID)) { + if (!alloc) + return NULL; + + root = (void *)get_zeroed_page(GFP_ATOMIC); + if (root == NULL) + return NULL; + + *pte = __pa(root) | GCR3_VALID; + } + + root = __va(*pte & PAGE_MASK); + + level -= 1; + } + + return pte; +} + +static int __set_gcr3(struct protection_domain *domain, int pasid, + unsigned long cr3) +{ + u64 *pte; + + if (domain->mode != PAGE_MODE_NONE) + return -EINVAL; + + pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true); + if (pte == NULL) + return -ENOMEM; + + *pte = (cr3 & PAGE_MASK) | GCR3_VALID; + + return __amd_iommu_flush_tlb(domain, pasid); +} + +static int __clear_gcr3(struct protection_domain *domain, int pasid) +{ + u64 *pte; + + if (domain->mode != PAGE_MODE_NONE) + return -EINVAL; + + pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false); + if (pte == NULL) + return 0; + + *pte = 0; + + return __amd_iommu_flush_tlb(domain, pasid); +} + +int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, + unsigned long cr3) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int ret; + + spin_lock_irqsave(&domain->lock, flags); + ret = __set_gcr3(domain, pasid, cr3); + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_domain_set_gcr3); + +int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int ret; + + spin_lock_irqsave(&domain->lock, flags); + ret = __clear_gcr3(domain, pasid); + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3); diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index a92dc6117e2f..a951a7090e35 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -43,6 +43,10 @@ extern int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids); extern int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, u64 address); extern int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid); +extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, + unsigned long cr3); +extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid); + #ifndef CONFIG_AMD_IOMMU_STATS diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index ff1dfe9ad579..060724e02e9f 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -279,6 +279,7 @@ #define DTE_GCR3_SHIFT_B 16 #define DTE_GCR3_SHIFT_C 43 +#define GCR3_VALID 0x01ULL #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) From c99afa25b67339b5fa7ef3767398878be9f60e1f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 21 Nov 2011 18:19:25 +0100 Subject: [PATCH 38/53] iommu/amd: Implement function to send PPR completions To send completions for PPR requests this patch adds a function which can be used by the IOMMUv2 driver. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 51 +++++++++++++++++++++++++++++++++ drivers/iommu/amd_iommu_proto.h | 6 ++++ drivers/iommu/amd_iommu_types.h | 6 ++++ 3 files changed, 63 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 65a118ce626e..9a7e64b245a7 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -736,6 +736,22 @@ static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid, CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); } +static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, int pasid, + int status, int tag, bool gn) +{ + memset(cmd, 0, sizeof(*cmd)); + + cmd->data[0] = devid; + if (gn) { + cmd->data[1] = pasid & PASID_MASK; + cmd->data[2] = CMD_INV_IOMMU_PAGES_GN_MASK; + } + cmd->data[3] = tag & 0x1ff; + cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT; + + CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR); +} + static void build_inv_all(struct iommu_cmd *cmd) { memset(cmd, 0, sizeof(*cmd)); @@ -1950,6 +1966,23 @@ out_err: return ret; } +/* FIXME: Move this to PCI code */ +#define PCI_PRI_TLP_OFF (1 << 2) + +bool pci_pri_tlp_required(struct pci_dev *pdev) +{ + u16 control; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + if (!pos) + return false; + + pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); + + return (control & PCI_PRI_TLP_OFF) ? true : false; +} + /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware @@ -1973,6 +2006,7 @@ static int attach_device(struct device *dev, dev_data->ats.enabled = true; dev_data->ats.qdep = pci_ats_queue_depth(pdev); + dev_data->pri_tlp = pci_pri_tlp_required(pdev); } else if (amd_iommu_iotlb_sup && pci_enable_ats(pdev, PAGE_SHIFT) == 0) { dev_data->ats.enabled = true; @@ -3412,3 +3446,20 @@ int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid) return ret; } EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3); + +int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, + int status, int tag) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + struct iommu_cmd cmd; + + dev_data = get_dev_data(&pdev->dev); + iommu = amd_iommu_rlookup_table[dev_data->devid]; + + build_complete_ppr(&cmd, dev_data->devid, pasid, status, + tag, dev_data->pri_tlp); + + return iommu_queue_command(iommu, &cmd); +} +EXPORT_SYMBOL(amd_iommu_complete_ppr); diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index a951a7090e35..bb5ecfecfa44 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -47,6 +47,12 @@ extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, unsigned long cr3); extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid); +#define PPR_SUCCESS 0x0 +#define PPR_INVALID 0x1 +#define PPR_FAILURE 0xf + +extern int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, + int status, int tag); #ifndef CONFIG_AMD_IOMMU_STATS diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 060724e02e9f..8e1a6460f683 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -142,6 +142,7 @@ #define CMD_INV_DEV_ENTRY 0x02 #define CMD_INV_IOMMU_PAGES 0x03 #define CMD_INV_IOTLB_PAGES 0x04 +#define CMD_COMPLETE_PPR 0x07 #define CMD_INV_ALL 0x08 #define CMD_COMPL_WAIT_STORE_MASK 0x01 @@ -150,6 +151,9 @@ #define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02 #define CMD_INV_IOMMU_PAGES_GN_MASK 0x04 +#define PPR_STATUS_MASK 0xf +#define PPR_STATUS_SHIFT 12 + #define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL /* macros and definitions for device table entries */ @@ -394,6 +398,8 @@ struct iommu_dev_data { bool enabled; int qdep; } ats; /* ATS state */ + bool pri_tlp; /* PASID TLB required for + PPR completions */ }; /* From f3572db823decfd747e6afd4c4ddfd67e8af8b6d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 23 Nov 2011 12:36:25 +0100 Subject: [PATCH 39/53] iommu/amd: Add function to get IOMMUv2 domain for pdev The AMD IOMMUv2 driver needs to get the IOMMUv2 domain associated with a particular device. This patch adds a function to get this information. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 18 ++++++++++++++++++ drivers/iommu/amd_iommu_proto.h | 1 + drivers/iommu/amd_iommu_types.h | 4 ++++ 3 files changed, 23 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 9a7e64b245a7..71773d0fb769 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2958,6 +2958,8 @@ static int amd_iommu_domain_init(struct iommu_domain *dom) if (!domain->pt_root) goto out_free; + domain->iommu_domain = dom; + dom->priv = domain; return 0; @@ -3463,3 +3465,19 @@ int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, return iommu_queue_command(iommu, &cmd); } EXPORT_SYMBOL(amd_iommu_complete_ppr); + +struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev) +{ + struct protection_domain *domain; + + domain = get_domain(&pdev->dev); + if (IS_ERR(domain)) + return NULL; + + /* Only return IOMMUv2 domains */ + if (!(domain->flags & PD_IOMMUV2_MASK)) + return NULL; + + return domain->iommu_domain; +} +EXPORT_SYMBOL(amd_iommu_get_v2_domain); diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index bb5ecfecfa44..1a7f41c6cc66 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -46,6 +46,7 @@ extern int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid); extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, unsigned long cr3); extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid); +extern struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev); #define PPR_SUCCESS 0x0 #define PPR_INVALID 0x1 diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 8e1a6460f683..c39988fbcbbb 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -360,6 +360,8 @@ struct amd_iommu_fault { #define PPR_FAULT_RSVD (1 << 7) #define PPR_FAULT_GN (1 << 8) +struct iommu_domain; + /* * This structure contains generic data for IOMMU protection domains * independent of their use. @@ -379,6 +381,8 @@ struct protection_domain { unsigned dev_cnt; /* devices assigned to this domain */ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ void *priv; /* private data */ + struct iommu_domain *iommu_domain; /* Pointer to generic + domain structure */ }; From 6a113ddc03bcc32d3d440dce42b445868d5be093 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 1 Dec 2011 12:04:58 +0100 Subject: [PATCH 40/53] iommu/amd: Add device errata handling Add infrastructure for errata-handling and handle two known erratas in the IOMMUv2 code. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 57 +++++++++++++++++++++++++++++++-- drivers/iommu/amd_iommu_types.h | 1 + include/linux/amd-iommu.h | 18 +++++++++++ 3 files changed, 73 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 71773d0fb769..e453bbd09445 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -172,6 +172,15 @@ static bool pci_iommuv2_capable(struct pci_dev *pdev) return true; } +static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum) +{ + struct iommu_dev_data *dev_data; + + dev_data = get_dev_data(&pdev->dev); + + return dev_data->errata & (1 << erratum) ? true : false; +} + /* * In this function the list of preallocated protection domains is traversed to * find the domain for a specific device @@ -1934,9 +1943,33 @@ static void pdev_iommuv2_disable(struct pci_dev *pdev) pci_disable_pasid(pdev); } +/* FIXME: Change generic reset-function to do the same */ +static int pri_reset_while_enabled(struct pci_dev *pdev) +{ + u16 control; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + if (!pos) + return -EINVAL; + + pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); + control |= PCI_PRI_RESET; + pci_write_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, control); + + return 0; +} + static int pdev_iommuv2_enable(struct pci_dev *pdev) { - int ret; + bool reset_enable; + int reqs, ret; + + /* FIXME: Hardcode number of outstanding requests for now */ + reqs = 32; + if (pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE)) + reqs = 1; + reset_enable = pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_ENABLE_RESET); /* Only allow access to user-accessible pages */ ret = pci_enable_pasid(pdev, 0); @@ -1948,11 +1981,17 @@ static int pdev_iommuv2_enable(struct pci_dev *pdev) if (ret) goto out_err; - /* FIXME: Hardcode number of outstanding requests for now */ - ret = pci_enable_pri(pdev, 32); + /* Enable PRI */ + ret = pci_enable_pri(pdev, reqs); if (ret) goto out_err; + if (reset_enable) { + ret = pri_reset_while_enabled(pdev); + if (ret) + goto out_err; + } + ret = pci_enable_ats(pdev, PAGE_SHIFT); if (ret) goto out_err; @@ -3481,3 +3520,15 @@ struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev) return domain->iommu_domain; } EXPORT_SYMBOL(amd_iommu_get_v2_domain); + +void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum) +{ + struct iommu_dev_data *dev_data; + + if (!amd_iommu_v2_supported()) + return; + + dev_data = get_dev_data(&pdev->dev); + dev_data->errata |= (1 << erratum); +} +EXPORT_SYMBOL(amd_iommu_enable_device_erratum); diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index c39988fbcbbb..6ad8b10b3130 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -404,6 +404,7 @@ struct iommu_dev_data { } ats; /* ATS state */ bool pri_tlp; /* PASID TLB required for PPR completions */ + u32 errata; /* Bitmap for errata to apply */ }; /* diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index a6863a2dec1f..4152c3073db4 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -26,6 +26,24 @@ extern int amd_iommu_detect(void); + +/** + * amd_iommu_enable_device_erratum() - Enable erratum workaround for device + * in the IOMMUv2 driver + * @pdev: The PCI device the workaround is necessary for + * @erratum: The erratum workaround to enable + * + * Possible values for the erratum number are for now: + * - AMD_PRI_DEV_ERRATUM_ENABLE_RESET - Reset PRI capability when PRI + * is enabled + * - AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE - Limit number of outstanding PRI + * requests to one + */ +#define AMD_PRI_DEV_ERRATUM_ENABLE_RESET 0 +#define AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE 1 + +extern void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum); + #else static inline int amd_iommu_detect(void) { return -ENODEV; } From 399be2f51979aaf4e3391d98d11e3aeaac7633a4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 1 Dec 2011 16:53:47 +0100 Subject: [PATCH 41/53] iommu/amd: Add stat counter for IOMMUv2 events Add some interesting statistic counters for events when IOMMUv2 is active. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index e453bbd09445..2a46b1d7a601 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -359,6 +359,11 @@ DECLARE_STATS_COUNTER(domain_flush_single); DECLARE_STATS_COUNTER(domain_flush_all); DECLARE_STATS_COUNTER(alloced_io_mem); DECLARE_STATS_COUNTER(total_map_requests); +DECLARE_STATS_COUNTER(complete_ppr); +DECLARE_STATS_COUNTER(invalidate_iotlb); +DECLARE_STATS_COUNTER(invalidate_iotlb_all); +DECLARE_STATS_COUNTER(pri_requests); + static struct dentry *stats_dir; static struct dentry *de_fflush; @@ -393,6 +398,10 @@ static void amd_iommu_stats_init(void) amd_iommu_stats_add(&domain_flush_all); amd_iommu_stats_add(&alloced_io_mem); amd_iommu_stats_add(&total_map_requests); + amd_iommu_stats_add(&complete_ppr); + amd_iommu_stats_add(&invalidate_iotlb); + amd_iommu_stats_add(&invalidate_iotlb_all); + amd_iommu_stats_add(&pri_requests); } #endif @@ -509,6 +518,8 @@ static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u32 head) volatile u64 *raw; int i; + INC_STATS_COUNTER(pri_requests); + raw = (u64 *)(iommu->ppr_log + head); /* @@ -3356,6 +3367,8 @@ out: static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid, u64 address) { + INC_STATS_COUNTER(invalidate_iotlb); + return __flush_pasid(domain, pasid, address, false); } @@ -3376,6 +3389,8 @@ EXPORT_SYMBOL(amd_iommu_flush_page); static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid) { + INC_STATS_COUNTER(invalidate_iotlb_all); + return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, true); } @@ -3495,6 +3510,8 @@ int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, struct amd_iommu *iommu; struct iommu_cmd cmd; + INC_STATS_COUNTER(complete_ppr); + dev_data = get_dev_data(&pdev->dev); iommu = amd_iommu_rlookup_table[dev_data->devid]; From e3c495c74bc008a5c3671bf3411bc7f5382fa58f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 9 Nov 2011 12:31:15 +0100 Subject: [PATCH 42/53] iommu/amd: Add driver stub for AMD IOMMUv2 support Add a Kconfig option for the optional driver. Since it is optional it can be compiled as a module and will only be loaded when required by another driver. Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 8 ++++++++ drivers/iommu/Makefile | 1 + drivers/iommu/amd_iommu_v2.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+) create mode 100644 drivers/iommu/amd_iommu_v2.c diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 220dfc24b311..e608a36bb0b1 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -60,6 +60,14 @@ config AMD_IOMMU_STATS information to userspace via debugfs. If unsure, say N. +config AMD_IOMMU_V2 + tristate "AMD IOMMU Version 2 driver (EXPERIMENTAL)" + depends on AMD_IOMMU && EXPERIMENTAL + ---help--- + This option enables support for the AMD IOMMUv2 features of the IOMMU + hardware. Select this option if you want to use devices that support + the the PCI PRI and PASID interface. + # Intel IOMMU support config DMAR_TABLE bool diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 2f4448794bc7..0e36b4934aff 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_IOMMU_API) += iommu.o obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o +obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o obj-$(CONFIG_DMAR_TABLE) += dmar.o obj-$(CONFIG_INTEL_IOMMU) += iova.o intel-iommu.o obj-$(CONFIG_IRQ_REMAP) += intr_remapping.o diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c new file mode 100644 index 000000000000..a19e07d11d12 --- /dev/null +++ b/drivers/iommu/amd_iommu_v2.c @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2010-2012 Advanced Micro Devices, Inc. + * Author: Joerg Roedel + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joerg Roedel "); + +static int __init amd_iommu_v2_init(void) +{ + pr_info("AMD IOMMUv2 driver by Joerg Roedel "); + + return 0; +} + +static void __exit amd_iommu_v2_exit(void) +{ +} + +module_init(amd_iommu_v2_init); +module_exit(amd_iommu_v2_exit); From ed96f228ba9725edf69385bffdc19ee5bb0ec641 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 23 Nov 2011 17:30:39 +0100 Subject: [PATCH 43/53] iommu/amd: Implement device aquisition code for IOMMUv2 This patch adds the amd_iommu_init_device() and amd_iommu_free_device() functions which make a device and the IOMMU ready for IOMMUv2 usage. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_v2.c | 210 +++++++++++++++++++++++++++++++++++ include/linux/amd-iommu.h | 23 +++- 2 files changed, 232 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index a19e07d11d12..bfceed25c186 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -16,20 +16,230 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include +#include #include +#include +#include +#include + +#include "amd_iommu_proto.h" MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Joerg Roedel "); +#define MAX_DEVICES 0x10000 +#define PRI_QUEUE_SIZE 512 + +struct pri_queue { + atomic_t inflight; + bool finish; +}; + +struct pasid_state { + struct list_head list; /* For global state-list */ + atomic_t count; /* Reference count */ + struct task_struct *task; /* Task bound to this PASID */ + struct mm_struct *mm; /* mm_struct for the faults */ + struct pri_queue pri[PRI_QUEUE_SIZE]; /* PRI tag states */ + struct device_state *device_state; /* Link to our device_state */ + int pasid; /* PASID index */ +}; + +struct device_state { + atomic_t count; + struct pci_dev *pdev; + struct pasid_state **states; + struct iommu_domain *domain; + int pasid_levels; + int max_pasids; + spinlock_t lock; +}; + +struct device_state **state_table; +static spinlock_t state_lock; + +/* List and lock for all pasid_states */ +static LIST_HEAD(pasid_state_list); + +static u16 device_id(struct pci_dev *pdev) +{ + u16 devid; + + devid = pdev->bus->number; + devid = (devid << 8) | pdev->devfn; + + return devid; +} + +static struct device_state *get_device_state(u16 devid) +{ + struct device_state *dev_state; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + dev_state = state_table[devid]; + if (dev_state != NULL) + atomic_inc(&dev_state->count); + spin_unlock_irqrestore(&state_lock, flags); + + return dev_state; +} + +static void free_device_state(struct device_state *dev_state) +{ + iommu_detach_device(dev_state->domain, &dev_state->pdev->dev); + iommu_domain_free(dev_state->domain); + kfree(dev_state); +} + +static void put_device_state(struct device_state *dev_state) +{ + if (atomic_dec_and_test(&dev_state->count)) + free_device_state(dev_state); +} + +int amd_iommu_init_device(struct pci_dev *pdev, int pasids) +{ + struct device_state *dev_state; + unsigned long flags; + int ret, tmp; + u16 devid; + + might_sleep(); + + if (!amd_iommu_v2_supported()) + return -ENODEV; + + if (pasids <= 0 || pasids > (PASID_MASK + 1)) + return -EINVAL; + + devid = device_id(pdev); + + dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL); + if (dev_state == NULL) + return -ENOMEM; + + spin_lock_init(&dev_state->lock); + dev_state->pdev = pdev; + + tmp = pasids; + for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9) + dev_state->pasid_levels += 1; + + atomic_set(&dev_state->count, 1); + dev_state->max_pasids = pasids; + + ret = -ENOMEM; + dev_state->states = (void *)get_zeroed_page(GFP_KERNEL); + if (dev_state->states == NULL) + goto out_free_dev_state; + + dev_state->domain = iommu_domain_alloc(&pci_bus_type); + if (dev_state->domain == NULL) + goto out_free_states; + + amd_iommu_domain_direct_map(dev_state->domain); + + ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids); + if (ret) + goto out_free_domain; + + ret = iommu_attach_device(dev_state->domain, &pdev->dev); + if (ret != 0) + goto out_free_domain; + + spin_lock_irqsave(&state_lock, flags); + + if (state_table[devid] != NULL) { + spin_unlock_irqrestore(&state_lock, flags); + ret = -EBUSY; + goto out_free_domain; + } + + state_table[devid] = dev_state; + + spin_unlock_irqrestore(&state_lock, flags); + + return 0; + +out_free_domain: + iommu_domain_free(dev_state->domain); + +out_free_states: + free_page((unsigned long)dev_state->states); + +out_free_dev_state: + kfree(dev_state); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_init_device); + +void amd_iommu_free_device(struct pci_dev *pdev) +{ + struct device_state *dev_state; + unsigned long flags; + u16 devid; + + if (!amd_iommu_v2_supported()) + return; + + devid = device_id(pdev); + + spin_lock_irqsave(&state_lock, flags); + + dev_state = state_table[devid]; + if (dev_state == NULL) { + spin_unlock_irqrestore(&state_lock, flags); + return; + } + + state_table[devid] = NULL; + + spin_unlock_irqrestore(&state_lock, flags); + + put_device_state(dev_state); +} +EXPORT_SYMBOL(amd_iommu_free_device); + static int __init amd_iommu_v2_init(void) { + size_t state_table_size; + pr_info("AMD IOMMUv2 driver by Joerg Roedel "); + spin_lock_init(&state_lock); + + state_table_size = MAX_DEVICES * sizeof(struct device_state *); + state_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(state_table_size)); + if (state_table == NULL) + return -ENOMEM; + return 0; } static void __exit amd_iommu_v2_exit(void) { + struct device_state *dev_state; + size_t state_table_size; + int i; + + for (i = 0; i < MAX_DEVICES; ++i) { + dev_state = get_device_state(i); + + if (dev_state == NULL) + continue; + + WARN_ON_ONCE(1); + + amd_iommu_free_device(dev_state->pdev); + put_device_state(dev_state); + } + + state_table_size = MAX_DEVICES * sizeof(struct device_state *); + free_pages((unsigned long)state_table, get_order(state_table_size)); } module_init(amd_iommu_v2_init); diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 4152c3073db4..e8c7a2ec86b3 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -20,10 +20,12 @@ #ifndef _ASM_X86_AMD_IOMMU_H #define _ASM_X86_AMD_IOMMU_H -#include +#include #ifdef CONFIG_AMD_IOMMU +struct pci_dev; + extern int amd_iommu_detect(void); @@ -33,6 +35,7 @@ extern int amd_iommu_detect(void); * @pdev: The PCI device the workaround is necessary for * @erratum: The erratum workaround to enable * + * The function needs to be called before amd_iommu_init_device(). * Possible values for the erratum number are for now: * - AMD_PRI_DEV_ERRATUM_ENABLE_RESET - Reset PRI capability when PRI * is enabled @@ -44,6 +47,24 @@ extern int amd_iommu_detect(void); extern void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum); +/** + * amd_iommu_init_device() - Init device for use with IOMMUv2 driver + * @pdev: The PCI device to initialize + * @pasids: Number of PASIDs to support for this device + * + * This function does all setup for the device pdev so that it can be + * used with IOMMUv2. + * Returns 0 on success or negative value on error. + */ +extern int amd_iommu_init_device(struct pci_dev *pdev, int pasids); + +/** + * amd_iommu_free_device() - Free all IOMMUv2 related device resources + * and disable IOMMUv2 usage for this device + * @pdev: The PCI device to disable IOMMUv2 usage for' + */ +extern void amd_iommu_free_device(struct pci_dev *pdev); + #else static inline int amd_iommu_detect(void) { return -ENODEV; } From 2d5503b624736abfe0e0bad281f9b8d8a705b930 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 24 Nov 2011 10:41:57 +0100 Subject: [PATCH 44/53] iommu/amd: Add routines to bind/unbind a pasid This patch adds routines to bind a specific process address-space to a given PASID. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_v2.c | 306 +++++++++++++++++++++++++++++++++++ include/linux/amd-iommu.h | 26 +++ 2 files changed, 332 insertions(+) diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index bfceed25c186..b5ee09ece651 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -61,6 +62,10 @@ static spinlock_t state_lock; /* List and lock for all pasid_states */ static LIST_HEAD(pasid_state_list); +static DEFINE_SPINLOCK(ps_lock); + +static void free_pasid_states(struct device_state *dev_state); +static void unbind_pasid(struct device_state *dev_state, int pasid); static u16 device_id(struct pci_dev *pdev) { @@ -88,8 +93,16 @@ static struct device_state *get_device_state(u16 devid) static void free_device_state(struct device_state *dev_state) { + /* + * First detach device from domain - No more PRI requests will arrive + * from that device after it is unbound from the IOMMUv2 domain. + */ iommu_detach_device(dev_state->domain, &dev_state->pdev->dev); + + /* Everything is down now, free the IOMMUv2 domain */ iommu_domain_free(dev_state->domain); + + /* Finally get rid of the device-state */ kfree(dev_state); } @@ -99,6 +112,296 @@ static void put_device_state(struct device_state *dev_state) free_device_state(dev_state); } +static void link_pasid_state(struct pasid_state *pasid_state) +{ + spin_lock(&ps_lock); + list_add_tail(&pasid_state->list, &pasid_state_list); + spin_unlock(&ps_lock); +} + +static void __unlink_pasid_state(struct pasid_state *pasid_state) +{ + list_del(&pasid_state->list); +} + +static void unlink_pasid_state(struct pasid_state *pasid_state) +{ + spin_lock(&ps_lock); + __unlink_pasid_state(pasid_state); + spin_unlock(&ps_lock); +} + +/* Must be called under dev_state->lock */ +static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state, + int pasid, bool alloc) +{ + struct pasid_state **root, **ptr; + int level, index; + + level = dev_state->pasid_levels; + root = dev_state->states; + + while (true) { + + index = (pasid >> (9 * level)) & 0x1ff; + ptr = &root[index]; + + if (level == 0) + break; + + if (*ptr == NULL) { + if (!alloc) + return NULL; + + *ptr = (void *)get_zeroed_page(GFP_ATOMIC); + if (*ptr == NULL) + return NULL; + } + + root = (struct pasid_state **)*ptr; + level -= 1; + } + + return ptr; +} + +static int set_pasid_state(struct device_state *dev_state, + struct pasid_state *pasid_state, + int pasid) +{ + struct pasid_state **ptr; + unsigned long flags; + int ret; + + spin_lock_irqsave(&dev_state->lock, flags); + ptr = __get_pasid_state_ptr(dev_state, pasid, true); + + ret = -ENOMEM; + if (ptr == NULL) + goto out_unlock; + + ret = -ENOMEM; + if (*ptr != NULL) + goto out_unlock; + + *ptr = pasid_state; + + ret = 0; + +out_unlock: + spin_unlock_irqrestore(&dev_state->lock, flags); + + return ret; +} + +static void clear_pasid_state(struct device_state *dev_state, int pasid) +{ + struct pasid_state **ptr; + unsigned long flags; + + spin_lock_irqsave(&dev_state->lock, flags); + ptr = __get_pasid_state_ptr(dev_state, pasid, true); + + if (ptr == NULL) + goto out_unlock; + + *ptr = NULL; + +out_unlock: + spin_unlock_irqrestore(&dev_state->lock, flags); +} + +static struct pasid_state *get_pasid_state(struct device_state *dev_state, + int pasid) +{ + struct pasid_state **ptr, *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev_state->lock, flags); + ptr = __get_pasid_state_ptr(dev_state, pasid, false); + + if (ptr == NULL) + goto out_unlock; + + ret = *ptr; + if (ret) + atomic_inc(&ret->count); + +out_unlock: + spin_unlock_irqrestore(&dev_state->lock, flags); + + return ret; +} + +static void free_pasid_state(struct pasid_state *pasid_state) +{ + kfree(pasid_state); +} + +static void put_pasid_state(struct pasid_state *pasid_state) +{ + if (atomic_dec_and_test(&pasid_state->count)) { + put_device_state(pasid_state->device_state); + mmput(pasid_state->mm); + free_pasid_state(pasid_state); + } +} + +static void unbind_pasid(struct device_state *dev_state, int pasid) +{ + struct pasid_state *pasid_state; + + pasid_state = get_pasid_state(dev_state, pasid); + if (pasid_state == NULL) + return; + + unlink_pasid_state(pasid_state); + + amd_iommu_domain_clear_gcr3(dev_state->domain, pasid); + clear_pasid_state(dev_state, pasid); + + put_pasid_state(pasid_state); /* Reference taken in this function */ + put_pasid_state(pasid_state); /* Reference taken in bind() function */ +} + +static void free_pasid_states_level1(struct pasid_state **tbl) +{ + int i; + + for (i = 0; i < 512; ++i) { + if (tbl[i] == NULL) + continue; + + free_page((unsigned long)tbl[i]); + } +} + +static void free_pasid_states_level2(struct pasid_state **tbl) +{ + struct pasid_state **ptr; + int i; + + for (i = 0; i < 512; ++i) { + if (tbl[i] == NULL) + continue; + + ptr = (struct pasid_state **)tbl[i]; + free_pasid_states_level1(ptr); + } +} + +static void free_pasid_states(struct device_state *dev_state) +{ + struct pasid_state *pasid_state; + int i; + + for (i = 0; i < dev_state->max_pasids; ++i) { + pasid_state = get_pasid_state(dev_state, i); + if (pasid_state == NULL) + continue; + + unbind_pasid(dev_state, i); + put_pasid_state(pasid_state); + } + + if (dev_state->pasid_levels == 2) + free_pasid_states_level2(dev_state->states); + else if (dev_state->pasid_levels == 1) + free_pasid_states_level1(dev_state->states); + else if (dev_state->pasid_levels != 0) + BUG(); + + free_page((unsigned long)dev_state->states); +} + +int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, + struct task_struct *task) +{ + struct pasid_state *pasid_state; + struct device_state *dev_state; + u16 devid; + int ret; + + might_sleep(); + + if (!amd_iommu_v2_supported()) + return -ENODEV; + + devid = device_id(pdev); + dev_state = get_device_state(devid); + + if (dev_state == NULL) + return -EINVAL; + + ret = -EINVAL; + if (pasid < 0 || pasid >= dev_state->max_pasids) + goto out; + + ret = -ENOMEM; + pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL); + if (pasid_state == NULL) + goto out; + + atomic_set(&pasid_state->count, 1); + pasid_state->task = task; + pasid_state->mm = get_task_mm(task); + pasid_state->device_state = dev_state; + pasid_state->pasid = pasid; + + if (pasid_state->mm == NULL) + goto out_free; + + ret = set_pasid_state(dev_state, pasid_state, pasid); + if (ret) + goto out_free; + + ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid, + __pa(pasid_state->mm->pgd)); + if (ret) + goto out_clear_state; + + link_pasid_state(pasid_state); + + return 0; + +out_clear_state: + clear_pasid_state(dev_state, pasid); + +out_free: + put_pasid_state(pasid_state); + +out: + put_device_state(dev_state); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_bind_pasid); + +void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid) +{ + struct device_state *dev_state; + u16 devid; + + might_sleep(); + + if (!amd_iommu_v2_supported()) + return; + + devid = device_id(pdev); + dev_state = get_device_state(devid); + if (dev_state == NULL) + return; + + if (pasid < 0 || pasid >= dev_state->max_pasids) + goto out; + + unbind_pasid(dev_state, pasid); + +out: + put_device_state(dev_state); +} +EXPORT_SYMBOL(amd_iommu_unbind_pasid); + int amd_iommu_init_device(struct pci_dev *pdev, int pasids) { struct device_state *dev_state; @@ -199,6 +502,9 @@ void amd_iommu_free_device(struct pci_dev *pdev) spin_unlock_irqrestore(&state_lock, flags); + /* Get rid of any remaining pasid states */ + free_pasid_states(dev_state); + put_device_state(dev_state); } EXPORT_SYMBOL(amd_iommu_free_device); diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index e8c7a2ec86b3..23e21e15dfab 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -24,9 +24,13 @@ #ifdef CONFIG_AMD_IOMMU +struct task_struct; struct pci_dev; extern int amd_iommu_detect(void); +extern int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, + struct task_struct *task); +extern void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid); /** @@ -65,6 +69,28 @@ extern int amd_iommu_init_device(struct pci_dev *pdev, int pasids); */ extern void amd_iommu_free_device(struct pci_dev *pdev); +/** + * amd_iommu_bind_pasid() - Bind a given task to a PASID on a device + * @pdev: The PCI device to bind the task to + * @pasid: The PASID on the device the task should be bound to + * @task: the task to bind + * + * The function returns 0 on success or a negative value on error. + */ +extern int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, + struct task_struct *task); + +/** + * amd_iommu_unbind_pasid() - Unbind a PASID from its task on + * a device + * @pdev: The device of the PASID + * @pasid: The PASID to unbind + * + * When this function returns the device is no longer using the PASID + * and the PASID is no longer bound to its task. + */ +extern void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid); + #else static inline int amd_iommu_detect(void) { return -ENODEV; } From 028eeacc412a8bebf6711e58629b0cab56a9ba87 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 24 Nov 2011 12:48:13 +0100 Subject: [PATCH 45/53] iommu/amd: Implement IO page-fault handler Register the notifier for PPR faults and handle them as necessary. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_v2.c | 204 +++++++++++++++++++++++++++++++++-- 1 file changed, 196 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index b5ee09ece651..8804b2247694 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -21,9 +21,11 @@ #include #include #include +#include #include #include +#include "amd_iommu_types.h" #include "amd_iommu_proto.h" MODULE_LICENSE("GPL v2"); @@ -35,6 +37,7 @@ MODULE_AUTHOR("Joerg Roedel "); struct pri_queue { atomic_t inflight; bool finish; + int status; }; struct pasid_state { @@ -45,6 +48,8 @@ struct pasid_state { struct pri_queue pri[PRI_QUEUE_SIZE]; /* PRI tag states */ struct device_state *device_state; /* Link to our device_state */ int pasid; /* PASID index */ + spinlock_t lock; /* Protect pri_queues */ + wait_queue_head_t wq; /* To wait for count == 0 */ }; struct device_state { @@ -55,6 +60,20 @@ struct device_state { int pasid_levels; int max_pasids; spinlock_t lock; + wait_queue_head_t wq; +}; + +struct fault { + struct work_struct work; + struct device_state *dev_state; + struct pasid_state *state; + struct mm_struct *mm; + u64 address; + u16 devid; + u16 pasid; + u16 tag; + u16 finish; + u16 flags; }; struct device_state **state_table; @@ -64,6 +83,8 @@ static spinlock_t state_lock; static LIST_HEAD(pasid_state_list); static DEFINE_SPINLOCK(ps_lock); +static struct workqueue_struct *iommu_wq; + static void free_pasid_states(struct device_state *dev_state); static void unbind_pasid(struct device_state *dev_state, int pasid); @@ -109,9 +130,20 @@ static void free_device_state(struct device_state *dev_state) static void put_device_state(struct device_state *dev_state) { if (atomic_dec_and_test(&dev_state->count)) - free_device_state(dev_state); + wake_up(&dev_state->wq); } +static void put_device_state_wait(struct device_state *dev_state) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&dev_state->wq, &wait, TASK_UNINTERRUPTIBLE); + if (!atomic_dec_and_test(&dev_state->count)) + schedule(); + finish_wait(&dev_state->wq, &wait); + + free_device_state(dev_state); +} static void link_pasid_state(struct pasid_state *pasid_state) { spin_lock(&ps_lock); @@ -242,11 +274,26 @@ static void put_pasid_state(struct pasid_state *pasid_state) { if (atomic_dec_and_test(&pasid_state->count)) { put_device_state(pasid_state->device_state); - mmput(pasid_state->mm); - free_pasid_state(pasid_state); + wake_up(&pasid_state->wq); } } +static void put_pasid_state_wait(struct pasid_state *pasid_state) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&pasid_state->wq, &wait, TASK_UNINTERRUPTIBLE); + + if (atomic_dec_and_test(&pasid_state->count)) + put_device_state(pasid_state->device_state); + else + schedule(); + + finish_wait(&pasid_state->wq, &wait); + mmput(pasid_state->mm); + free_pasid_state(pasid_state); +} + static void unbind_pasid(struct device_state *dev_state, int pasid) { struct pasid_state *pasid_state; @@ -261,7 +308,7 @@ static void unbind_pasid(struct device_state *dev_state, int pasid) clear_pasid_state(dev_state, pasid); put_pasid_state(pasid_state); /* Reference taken in this function */ - put_pasid_state(pasid_state); /* Reference taken in bind() function */ + put_pasid_state_wait(pasid_state); /* Reference from bind() function */ } static void free_pasid_states_level1(struct pasid_state **tbl) @@ -300,8 +347,8 @@ static void free_pasid_states(struct device_state *dev_state) if (pasid_state == NULL) continue; - unbind_pasid(dev_state, i); put_pasid_state(pasid_state); + unbind_pasid(dev_state, i); } if (dev_state->pasid_levels == 2) @@ -314,6 +361,120 @@ static void free_pasid_states(struct device_state *dev_state) free_page((unsigned long)dev_state->states); } +static void set_pri_tag_status(struct pasid_state *pasid_state, + u16 tag, int status) +{ + unsigned long flags; + + spin_lock_irqsave(&pasid_state->lock, flags); + pasid_state->pri[tag].status = status; + spin_unlock_irqrestore(&pasid_state->lock, flags); +} + +static void finish_pri_tag(struct device_state *dev_state, + struct pasid_state *pasid_state, + u16 tag) +{ + unsigned long flags; + + spin_lock_irqsave(&pasid_state->lock, flags); + if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) && + pasid_state->pri[tag].finish) { + amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid, + pasid_state->pri[tag].status, tag); + pasid_state->pri[tag].finish = false; + pasid_state->pri[tag].status = PPR_SUCCESS; + } + spin_unlock_irqrestore(&pasid_state->lock, flags); +} + +static void do_fault(struct work_struct *work) +{ + struct fault *fault = container_of(work, struct fault, work); + int npages, write; + struct page *page; + + write = !!(fault->flags & PPR_FAULT_WRITE); + + npages = get_user_pages(fault->state->task, fault->state->mm, + fault->address, 1, write, 0, &page, NULL); + + if (npages == 1) + put_page(page); + else + set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); + + finish_pri_tag(fault->dev_state, fault->state, fault->tag); + + put_pasid_state(fault->state); + + kfree(fault); +} + +static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data) +{ + struct amd_iommu_fault *iommu_fault; + struct pasid_state *pasid_state; + struct device_state *dev_state; + unsigned long flags; + struct fault *fault; + bool finish; + u16 tag; + int ret; + + iommu_fault = data; + tag = iommu_fault->tag & 0x1ff; + finish = (iommu_fault->tag >> 9) & 1; + + ret = NOTIFY_DONE; + dev_state = get_device_state(iommu_fault->device_id); + if (dev_state == NULL) + goto out; + + pasid_state = get_pasid_state(dev_state, iommu_fault->pasid); + if (pasid_state == NULL) { + /* We know the device but not the PASID -> send INVALID */ + amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid, + PPR_INVALID, tag); + goto out_drop_state; + } + + spin_lock_irqsave(&pasid_state->lock, flags); + atomic_inc(&pasid_state->pri[tag].inflight); + if (finish) + pasid_state->pri[tag].finish = true; + spin_unlock_irqrestore(&pasid_state->lock, flags); + + fault = kzalloc(sizeof(*fault), GFP_ATOMIC); + if (fault == NULL) { + /* We are OOM - send success and let the device re-fault */ + finish_pri_tag(dev_state, pasid_state, tag); + goto out_drop_state; + } + + fault->dev_state = dev_state; + fault->address = iommu_fault->address; + fault->state = pasid_state; + fault->tag = tag; + fault->finish = finish; + fault->flags = iommu_fault->flags; + INIT_WORK(&fault->work, do_fault); + + queue_work(iommu_wq, &fault->work); + + ret = NOTIFY_OK; + +out_drop_state: + put_device_state(dev_state); + +out: + return ret; +} + +static struct notifier_block ppr_nb = { + .notifier_call = ppr_notifier, +}; + int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, struct task_struct *task) { @@ -343,6 +504,7 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, goto out; atomic_set(&pasid_state->count, 1); + init_waitqueue_head(&pasid_state->wq); pasid_state->task = task; pasid_state->mm = get_task_mm(task); pasid_state->device_state = dev_state; @@ -368,7 +530,7 @@ out_clear_state: clear_pasid_state(dev_state, pasid); out_free: - put_pasid_state(pasid_state); + free_pasid_state(pasid_state); out: put_device_state(dev_state); @@ -424,6 +586,7 @@ int amd_iommu_init_device(struct pci_dev *pdev, int pasids) return -ENOMEM; spin_lock_init(&dev_state->lock); + init_waitqueue_head(&dev_state->wq); dev_state->pdev = pdev; tmp = pasids; @@ -505,13 +668,14 @@ void amd_iommu_free_device(struct pci_dev *pdev) /* Get rid of any remaining pasid states */ free_pasid_states(dev_state); - put_device_state(dev_state); + put_device_state_wait(dev_state); } EXPORT_SYMBOL(amd_iommu_free_device); static int __init amd_iommu_v2_init(void) { size_t state_table_size; + int ret; pr_info("AMD IOMMUv2 driver by Joerg Roedel "); @@ -523,7 +687,21 @@ static int __init amd_iommu_v2_init(void) if (state_table == NULL) return -ENOMEM; + ret = -ENOMEM; + iommu_wq = create_workqueue("amd_iommu_v2"); + if (iommu_wq == NULL) { + ret = -ENOMEM; + goto out_free; + } + + amd_iommu_register_ppr_notifier(&ppr_nb); + return 0; + +out_free: + free_pages((unsigned long)state_table, get_order(state_table_size)); + + return ret; } static void __exit amd_iommu_v2_exit(void) @@ -532,6 +710,14 @@ static void __exit amd_iommu_v2_exit(void) size_t state_table_size; int i; + amd_iommu_unregister_ppr_notifier(&ppr_nb); + + flush_workqueue(iommu_wq); + + /* + * The loop below might call flush_workqueue(), so call + * destroy_workqueue() after it + */ for (i = 0; i < MAX_DEVICES; ++i) { dev_state = get_device_state(i); @@ -540,10 +726,12 @@ static void __exit amd_iommu_v2_exit(void) WARN_ON_ONCE(1); - amd_iommu_free_device(dev_state->pdev); put_device_state(dev_state); + amd_iommu_free_device(dev_state->pdev); } + destroy_workqueue(iommu_wq); + state_table_size = MAX_DEVICES * sizeof(struct device_state *); free_pages((unsigned long)state_table, get_order(state_table_size)); } From 8736b2c331030733c5d619170dc6e9ef211a4039 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 24 Nov 2011 16:21:52 +0100 Subject: [PATCH 46/53] iommu/amd: Implement notifiers for IOMMUv2 Since pages are not pinned anymore we need notifications when the VMM changes the page-tables. Use mmu_notifiers for that. Also use the task_exit notifier from the profiling subsystem to shutdown all contexts related to this task. Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 3 +- drivers/iommu/amd_iommu_v2.c | 186 +++++++++++++++++++++++++++++++++-- 2 files changed, 178 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index e608a36bb0b1..6bea6962f8ee 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -62,7 +62,8 @@ config AMD_IOMMU_STATS config AMD_IOMMU_V2 tristate "AMD IOMMU Version 2 driver (EXPERIMENTAL)" - depends on AMD_IOMMU && EXPERIMENTAL + depends on AMD_IOMMU && PROFILING && EXPERIMENTAL + select MMU_NOTIFIER ---help--- This option enables support for the AMD IOMMUv2 features of the IOMMU hardware. Select this option if you want to use devices that support diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 8804b2247694..abdb8396f89a 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -16,8 +16,10 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include +#include #include #include #include @@ -45,6 +47,7 @@ struct pasid_state { atomic_t count; /* Reference count */ struct task_struct *task; /* Task bound to this PASID */ struct mm_struct *mm; /* mm_struct for the faults */ + struct mmu_notifier mn; /* mmu_otifier handle */ struct pri_queue pri[PRI_QUEUE_SIZE]; /* PRI tag states */ struct device_state *device_state; /* Link to our device_state */ int pasid; /* PASID index */ @@ -85,8 +88,16 @@ static DEFINE_SPINLOCK(ps_lock); static struct workqueue_struct *iommu_wq; +/* + * Empty page table - Used between + * mmu_notifier_invalidate_range_start and + * mmu_notifier_invalidate_range_end + */ +static u64 *empty_page_table; + static void free_pasid_states(struct device_state *dev_state); static void unbind_pasid(struct device_state *dev_state, int pasid); +static int task_exit(struct notifier_block *nb, unsigned long e, void *data); static u16 device_id(struct pci_dev *pdev) { @@ -144,6 +155,11 @@ static void put_device_state_wait(struct device_state *dev_state) free_device_state(dev_state); } + +static struct notifier_block profile_nb = { + .notifier_call = task_exit, +}; + static void link_pasid_state(struct pasid_state *pasid_state) { spin_lock(&ps_lock); @@ -294,6 +310,23 @@ static void put_pasid_state_wait(struct pasid_state *pasid_state) free_pasid_state(pasid_state); } +static void __unbind_pasid(struct pasid_state *pasid_state) +{ + struct iommu_domain *domain; + + domain = pasid_state->device_state->domain; + + amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid); + clear_pasid_state(pasid_state->device_state, pasid_state->pasid); + + /* Make sure no more pending faults are in the queue */ + flush_workqueue(iommu_wq); + + mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm); + + put_pasid_state(pasid_state); /* Reference taken in bind() function */ +} + static void unbind_pasid(struct device_state *dev_state, int pasid) { struct pasid_state *pasid_state; @@ -303,12 +336,8 @@ static void unbind_pasid(struct device_state *dev_state, int pasid) return; unlink_pasid_state(pasid_state); - - amd_iommu_domain_clear_gcr3(dev_state->domain, pasid); - clear_pasid_state(dev_state, pasid); - - put_pasid_state(pasid_state); /* Reference taken in this function */ - put_pasid_state_wait(pasid_state); /* Reference from bind() function */ + __unbind_pasid(pasid_state); + put_pasid_state_wait(pasid_state); /* Reference taken in this function */ } static void free_pasid_states_level1(struct pasid_state **tbl) @@ -361,6 +390,83 @@ static void free_pasid_states(struct device_state *dev_state) free_page((unsigned long)dev_state->states); } +static struct pasid_state *mn_to_state(struct mmu_notifier *mn) +{ + return container_of(mn, struct pasid_state, mn); +} + +static void __mn_flush_page(struct mmu_notifier *mn, + unsigned long address) +{ + struct pasid_state *pasid_state; + struct device_state *dev_state; + + pasid_state = mn_to_state(mn); + dev_state = pasid_state->device_state; + + amd_iommu_flush_page(dev_state->domain, pasid_state->pasid, address); +} + +static int mn_clear_flush_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + __mn_flush_page(mn, address); + + return 0; +} + +static void mn_change_pte(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address, + pte_t pte) +{ + __mn_flush_page(mn, address); +} + +static void mn_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + __mn_flush_page(mn, address); +} + +static void mn_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct pasid_state *pasid_state; + struct device_state *dev_state; + + pasid_state = mn_to_state(mn); + dev_state = pasid_state->device_state; + + amd_iommu_domain_set_gcr3(dev_state->domain, pasid_state->pasid, + __pa(empty_page_table)); +} + +static void mn_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct pasid_state *pasid_state; + struct device_state *dev_state; + + pasid_state = mn_to_state(mn); + dev_state = pasid_state->device_state; + + amd_iommu_domain_set_gcr3(dev_state->domain, pasid_state->pasid, + __pa(pasid_state->mm->pgd)); +} + +static struct mmu_notifier_ops iommu_mn = { + .clear_flush_young = mn_clear_flush_young, + .change_pte = mn_change_pte, + .invalidate_page = mn_invalidate_page, + .invalidate_range_start = mn_invalidate_range_start, + .invalidate_range_end = mn_invalidate_range_end, +}; + static void set_pri_tag_status(struct pasid_state *pasid_state, u16 tag, int status) { @@ -475,6 +581,50 @@ static struct notifier_block ppr_nb = { .notifier_call = ppr_notifier, }; +static int task_exit(struct notifier_block *nb, unsigned long e, void *data) +{ + struct pasid_state *pasid_state; + struct task_struct *task; + + task = data; + + /* + * Using this notifier is a hack - but there is no other choice + * at the moment. What I really want is a sleeping notifier that + * is called when an MM goes down. But such a notifier doesn't + * exist yet. The notifier needs to sleep because it has to make + * sure that the device does not use the PASID and the address + * space anymore before it is destroyed. This includes waiting + * for pending PRI requests to pass the workqueue. The + * MMU-Notifiers would be a good fit, but they use RCU and so + * they are not allowed to sleep. Lets see how we can solve this + * in a more intelligent way in the future. + */ +again: + spin_lock(&ps_lock); + list_for_each_entry(pasid_state, &pasid_state_list, list) { + struct device_state *dev_state; + int pasid; + + if (pasid_state->task != task) + continue; + + /* Drop Lock and unbind */ + spin_unlock(&ps_lock); + + dev_state = pasid_state->device_state; + pasid = pasid_state->pasid; + + unbind_pasid(dev_state, pasid); + + /* Task may be in the list multiple times */ + goto again; + } + spin_unlock(&ps_lock); + + return NOTIFY_OK; +} + int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, struct task_struct *task) { @@ -509,13 +659,16 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, pasid_state->mm = get_task_mm(task); pasid_state->device_state = dev_state; pasid_state->pasid = pasid; + pasid_state->mn.ops = &iommu_mn; if (pasid_state->mm == NULL) goto out_free; + mmu_notifier_register(&pasid_state->mn, pasid_state->mm); + ret = set_pasid_state(dev_state, pasid_state, pasid); if (ret) - goto out_free; + goto out_unregister; ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid, __pa(pasid_state->mm->pgd)); @@ -529,6 +682,9 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, out_clear_state: clear_pasid_state(dev_state, pasid); +out_unregister: + mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm); + out_free: free_pasid_state(pasid_state); @@ -689,15 +845,22 @@ static int __init amd_iommu_v2_init(void) ret = -ENOMEM; iommu_wq = create_workqueue("amd_iommu_v2"); - if (iommu_wq == NULL) { - ret = -ENOMEM; + if (iommu_wq == NULL) goto out_free; - } + + ret = -ENOMEM; + empty_page_table = (u64 *)get_zeroed_page(GFP_KERNEL); + if (empty_page_table == NULL) + goto out_destroy_wq; amd_iommu_register_ppr_notifier(&ppr_nb); + profile_event_register(PROFILE_TASK_EXIT, &profile_nb); return 0; +out_destroy_wq: + destroy_workqueue(iommu_wq); + out_free: free_pages((unsigned long)state_table, get_order(state_table_size)); @@ -710,6 +873,7 @@ static void __exit amd_iommu_v2_exit(void) size_t state_table_size; int i; + profile_event_unregister(PROFILE_TASK_EXIT, &profile_nb); amd_iommu_unregister_ppr_notifier(&ppr_nb); flush_workqueue(iommu_wq); @@ -734,6 +898,8 @@ static void __exit amd_iommu_v2_exit(void) state_table_size = MAX_DEVICES * sizeof(struct device_state *); free_pages((unsigned long)state_table, get_order(state_table_size)); + + free_page((unsigned long)empty_page_table); } module_init(amd_iommu_v2_init); From 175d6146738b3d04e1adcaa4a971a3b2b0dbd8af Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 28 Nov 2011 14:36:36 +0100 Subject: [PATCH 47/53] iommu/amd: Add invalid_ppr callback This callback can be used to change the PRI response code sent to a device when a PPR fault fails. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_v2.c | 57 ++++++++++++++++++++++++++++++++++-- include/linux/amd-iommu.h | 34 +++++++++++++++++++-- 2 files changed, 86 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index abdb8396f89a..fe812e2a0474 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -62,6 +62,7 @@ struct device_state { struct iommu_domain *domain; int pasid_levels; int max_pasids; + amd_iommu_invalid_ppr_cb inv_ppr_cb; spinlock_t lock; wait_queue_head_t wq; }; @@ -505,10 +506,31 @@ static void do_fault(struct work_struct *work) npages = get_user_pages(fault->state->task, fault->state->mm, fault->address, 1, write, 0, &page, NULL); - if (npages == 1) + if (npages == 1) { put_page(page); - else + } else if (fault->dev_state->inv_ppr_cb) { + int status; + + status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, + fault->pasid, + fault->address, + fault->flags); + switch (status) { + case AMD_IOMMU_INV_PRI_RSP_SUCCESS: + set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); + break; + case AMD_IOMMU_INV_PRI_RSP_INVALID: + set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); + break; + case AMD_IOMMU_INV_PRI_RSP_FAIL: + set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); + break; + default: + BUG(); + } + } else { set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); + } finish_pri_tag(fault->dev_state, fault->state, fault->tag); @@ -828,6 +850,37 @@ void amd_iommu_free_device(struct pci_dev *pdev) } EXPORT_SYMBOL(amd_iommu_free_device); +int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev, + amd_iommu_invalid_ppr_cb cb) +{ + struct device_state *dev_state; + unsigned long flags; + u16 devid; + int ret; + + if (!amd_iommu_v2_supported()) + return -ENODEV; + + devid = device_id(pdev); + + spin_lock_irqsave(&state_lock, flags); + + ret = -EINVAL; + dev_state = state_table[devid]; + if (dev_state == NULL) + goto out_unlock; + + dev_state->inv_ppr_cb = cb; + + ret = 0; + +out_unlock: + spin_unlock_irqrestore(&state_lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb); + static int __init amd_iommu_v2_init(void) { size_t state_table_size; diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 23e21e15dfab..06688c42167d 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -28,9 +28,6 @@ struct task_struct; struct pci_dev; extern int amd_iommu_detect(void); -extern int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, - struct task_struct *task); -extern void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid); /** @@ -91,6 +88,37 @@ extern int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, */ extern void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid); +/** + * amd_iommu_set_invalid_ppr_cb() - Register a call-back for failed + * PRI requests + * @pdev: The PCI device the call-back should be registered for + * @cb: The call-back function + * + * The IOMMUv2 driver invokes this call-back when it is unable to + * successfully handle a PRI request. The device driver can then decide + * which PRI response the device should see. Possible return values for + * the call-back are: + * + * - AMD_IOMMU_INV_PRI_RSP_SUCCESS - Send SUCCESS back to the device + * - AMD_IOMMU_INV_PRI_RSP_INVALID - Send INVALID back to the device + * - AMD_IOMMU_INV_PRI_RSP_FAIL - Send Failure back to the device, + * the device is required to disable + * PRI when it receives this response + * + * The function returns 0 on success or negative value on error. + */ +#define AMD_IOMMU_INV_PRI_RSP_SUCCESS 0 +#define AMD_IOMMU_INV_PRI_RSP_INVALID 1 +#define AMD_IOMMU_INV_PRI_RSP_FAIL 2 + +typedef int (*amd_iommu_invalid_ppr_cb)(struct pci_dev *pdev, + int pasid, + unsigned long address, + u16); + +extern int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev, + amd_iommu_invalid_ppr_cb cb); + #else static inline int amd_iommu_detect(void) { return -ENODEV; } From 46277b75da1b6c57159496d536acc2e9352a7ee0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 7 Dec 2011 14:34:02 +0100 Subject: [PATCH 48/53] iommu/amd: Adapt IOMMU driver to PCI register name changes The symbolic register names for PCI and PASID changed in PCI code. This patch adapts the AMD IOMMU driver to these changes. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index a7cbcd46af9e..d5074f428423 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -176,8 +176,8 @@ static bool pci_iommuv2_capable(struct pci_dev *pdev) { static const int caps[] = { PCI_EXT_CAP_ID_ATS, - PCI_PRI_CAP, - PCI_PASID_CAP, + PCI_EXT_CAP_ID_PRI, + PCI_EXT_CAP_ID_PASID, }; int i, pos; @@ -1978,13 +1978,13 @@ static int pri_reset_while_enabled(struct pci_dev *pdev) u16 control; int pos; - pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); - control |= PCI_PRI_RESET; - pci_write_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, control); + pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); + control |= PCI_PRI_CTRL_RESET; + pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); return 0; } @@ -2042,11 +2042,11 @@ bool pci_pri_tlp_required(struct pci_dev *pdev) u16 control; int pos; - pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return false; - pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); + pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); return (control & PCI_PRI_TLP_OFF) ? true : false; } From 52efdb89d60a0f19949129a08af3437a7aab70be Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 7 Dec 2011 12:01:36 +0100 Subject: [PATCH 49/53] iommu/amd: Add amd_iommu_device_info() function This function can be used to find out which features necessary for IOMMUv2 usage are available on a given device. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 43 +++++++++++++++++++++++++++++++++++++++ include/linux/amd-iommu.h | 26 +++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index d5074f428423..03944e76b700 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3565,3 +3565,46 @@ void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum) dev_data->errata |= (1 << erratum); } EXPORT_SYMBOL(amd_iommu_enable_device_erratum); + +int amd_iommu_device_info(struct pci_dev *pdev, + struct amd_iommu_device_info *info) +{ + int max_pasids; + int pos; + + if (pdev == NULL || info == NULL) + return -EINVAL; + + if (!amd_iommu_v2_supported()) + return -EINVAL; + + memset(info, 0, sizeof(*info)); + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS); + if (pos) + info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); + if (pos) + info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); + if (pos) { + int features; + + max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1)); + max_pasids = min(max_pasids, (1 << 20)); + + info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; + info->max_pasids = min(pci_max_pasids(pdev), max_pasids); + + features = pci_pasid_features(pdev); + if (features & PCI_PASID_CAP_EXEC) + info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; + if (features & PCI_PASID_CAP_PRIV) + info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; + } + + return 0; +} +EXPORT_SYMBOL(amd_iommu_device_info); diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 06688c42167d..c03c281ae6ee 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -119,6 +119,32 @@ typedef int (*amd_iommu_invalid_ppr_cb)(struct pci_dev *pdev, extern int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev, amd_iommu_invalid_ppr_cb cb); +/** + * amd_iommu_device_info() - Get information about IOMMUv2 support of a + * PCI device + * @pdev: PCI device to query information from + * @info: A pointer to an amd_iommu_device_info structure which will contain + * the information about the PCI device + * + * Returns 0 on success, negative value on error + */ + +#define AMD_IOMMU_DEVICE_FLAG_ATS_SUP 0x1 /* ATS feature supported */ +#define AMD_IOMMU_DEVICE_FLAG_PRI_SUP 0x2 /* PRI feature supported */ +#define AMD_IOMMU_DEVICE_FLAG_PASID_SUP 0x4 /* PASID context supported */ +#define AMD_IOMMU_DEVICE_FLAG_EXEC_SUP 0x8 /* Device may request execution + on memory pages */ +#define AMD_IOMMU_DEVICE_FLAG_PRIV_SUP 0x10 /* Device may request + super-user privileges */ + +struct amd_iommu_device_info { + int max_pasids; + u32 flags; +}; + +extern int amd_iommu_device_info(struct pci_dev *pdev, + struct amd_iommu_device_info *info); + #else static inline int amd_iommu_detect(void) { return -ENODEV; } From bc21662f729cd17d2af93e149f4eccafc7b10181 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 7 Dec 2011 12:24:42 +0100 Subject: [PATCH 50/53] iommu/amd: Add invalidate-context call-back This call-back is invoked when the task that is bound to a pasid is about to exit. The driver can use it to shutdown all context related to that context in a safe way. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_v2.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/amd-iommu.h | 17 +++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index fe812e2a0474..8add9f125d3e 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -63,6 +63,7 @@ struct device_state { int pasid_levels; int max_pasids; amd_iommu_invalid_ppr_cb inv_ppr_cb; + amd_iommu_invalidate_ctx inv_ctx_cb; spinlock_t lock; wait_queue_head_t wq; }; @@ -637,6 +638,9 @@ again: dev_state = pasid_state->device_state; pasid = pasid_state->pasid; + if (pasid_state->device_state->inv_ctx_cb) + dev_state->inv_ctx_cb(dev_state->pdev, pasid); + unbind_pasid(dev_state, pasid); /* Task may be in the list multiple times */ @@ -881,6 +885,37 @@ out_unlock: } EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb); +int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev, + amd_iommu_invalidate_ctx cb) +{ + struct device_state *dev_state; + unsigned long flags; + u16 devid; + int ret; + + if (!amd_iommu_v2_supported()) + return -ENODEV; + + devid = device_id(pdev); + + spin_lock_irqsave(&state_lock, flags); + + ret = -EINVAL; + dev_state = state_table[devid]; + if (dev_state == NULL) + goto out_unlock; + + dev_state->inv_ctx_cb = cb; + + ret = 0; + +out_unlock: + spin_unlock_irqrestore(&state_lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb); + static int __init amd_iommu_v2_init(void) { size_t state_table_size; diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index c03c281ae6ee..ef00610837d4 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -145,6 +145,23 @@ struct amd_iommu_device_info { extern int amd_iommu_device_info(struct pci_dev *pdev, struct amd_iommu_device_info *info); +/** + * amd_iommu_set_invalidate_ctx_cb() - Register a call-back for invalidating + * a pasid context. This call-back is + * invoked when the IOMMUv2 driver needs to + * invalidate a PASID context, for example + * because the task that is bound to that + * context is about to exit. + * + * @pdev: The PCI device the call-back should be registered for + * @cb: The call-back function + */ + +typedef void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, int pasid); + +extern int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev, + amd_iommu_invalidate_ctx cb); + #else static inline int amd_iommu_detect(void) { return -ENODEV; } From 31342b58b72b2b8480f12ffee648c8ba3297dee1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 22 Dec 2011 12:18:45 +0100 Subject: [PATCH 51/53] iommu/amd: Remove unnecessary cache flushes in amd_iommu_resume The caches are already flushed in enable_iommus(), so this flush is not necessary. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index c7a5d7e14547..38784733cbb4 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -1396,13 +1396,6 @@ static void amd_iommu_resume(void) /* re-load the hardware */ enable_iommus(); - - /* - * we have to flush after the IOMMUs are enabled because a - * disabled IOMMU will never execute the commands we send - */ - for_each_iommu(iommu) - iommu_flush_all_caches(iommu); } static int amd_iommu_suspend(void) From 2655d7a29703f29d4b6b8e4ee1f4d682b3b28f9c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 22 Dec 2011 12:35:38 +0100 Subject: [PATCH 52/53] iommu/amd: Init stats for iommu=pt The IOMMUv2 driver added a few statistic counter which are interesting in the iommu=pt mode too. So initialize the statistic counter for that mode too. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 03944e76b700..edd291070b0b 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3238,6 +3238,8 @@ int __init amd_iommu_init_passthrough(void) attach_device(&dev->dev, pt_domain); } + amd_iommu_stats_init(); + pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); return 0; From 1456e9d2c4667a6e9221eda27b9648fb3bcc1e8e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 22 Dec 2011 14:51:53 +0100 Subject: [PATCH 53/53] iommu/amd: Set IOTLB invalidation timeout To protect the command buffer from hanging when a device does not respond to an IOTLB invalidation, set a timeout of 1s for outstanding IOTLB invalidations. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 13 +++++++++++++ drivers/iommu/amd_iommu_types.h | 10 ++++++++++ 2 files changed, 23 insertions(+) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 38784733cbb4..bdea288dc185 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -306,6 +306,16 @@ static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } +static void iommu_set_inv_tlb_timeout(struct amd_iommu *iommu, int timeout) +{ + u32 ctrl; + + ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); + ctrl &= ~CTRL_INV_TO_MASK; + ctrl |= (timeout << CONTROL_INV_TIMEOUT) & CTRL_INV_TO_MASK; + writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); +} + /* Function to enable the hardware */ static void iommu_enable(struct amd_iommu *iommu) { @@ -1300,6 +1310,9 @@ static void iommu_init_flags(struct amd_iommu *iommu) * make IOMMU memory accesses cache coherent */ iommu_feature_enable(iommu, CONTROL_COHERENT_EN); + + /* Set IOTLB invalidation timeout to 1s */ + iommu_set_inv_tlb_timeout(iommu, CTRL_INV_TO_1S); } static void iommu_apply_resume_quirks(struct amd_iommu *iommu) diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 6ad8b10b3130..2452f3b71736 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -127,6 +127,7 @@ #define CONTROL_EVT_LOG_EN 0x02ULL #define CONTROL_EVT_INT_EN 0x03ULL #define CONTROL_COMWAIT_EN 0x04ULL +#define CONTROL_INV_TIMEOUT 0x05ULL #define CONTROL_PASSPW_EN 0x08ULL #define CONTROL_RESPASSPW_EN 0x09ULL #define CONTROL_COHERENT_EN 0x0aULL @@ -137,6 +138,15 @@ #define CONTROL_PPR_EN 0x0fULL #define CONTROL_GT_EN 0x10ULL +#define CTRL_INV_TO_MASK (7 << CONTROL_INV_TIMEOUT) +#define CTRL_INV_TO_NONE 0 +#define CTRL_INV_TO_1MS 1 +#define CTRL_INV_TO_10MS 2 +#define CTRL_INV_TO_100MS 3 +#define CTRL_INV_TO_1S 4 +#define CTRL_INV_TO_10S 5 +#define CTRL_INV_TO_100S 6 + /* command specific defines */ #define CMD_COMPL_WAIT 0x01 #define CMD_INV_DEV_ENTRY 0x02