From 6b5c19c55266f6efd10ffac0e9f9f2b7aa420a58 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:47 +1100 Subject: [PATCH 01/22] powerpc/mmu: Add real mode support for IOMMU preregistered memory This makes mm_iommu_lookup() able to work in realmode by replacing list_for_each_entry_rcu() (which can do debug stuff which can fail in real mode) with list_for_each_entry_lockless(). This adds realmode version of mm_iommu_ua_to_hpa() which adds explicit vmalloc'd-to-linear address conversion. Unlike mm_iommu_ua_to_hpa(), mm_iommu_ua_to_hpa_rm() can fail. This changes mm_iommu_preregistered() to receive @mm as in real mode @current does not always have a correct pointer. This adds realmode version of mm_iommu_lookup() which receives @mm (for the same reason as for mm_iommu_preregistered()) and uses lockless version of list_for_each_entry_rcu(). Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mmu_context.h | 4 +++ arch/powerpc/mm/mmu_context_iommu.c | 39 ++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index b9e3f0aca261..c70c8272523d 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -29,10 +29,14 @@ extern void mm_iommu_init(struct mm_struct *mm); extern void mm_iommu_cleanup(struct mm_struct *mm); extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, unsigned long ua, unsigned long size); +extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm( + struct mm_struct *mm, unsigned long ua, unsigned long size); extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, unsigned long ua, unsigned long entries); extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned long *hpa); +extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, + unsigned long ua, unsigned long *hpa); extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); #endif diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index 497130c5c742..fc67bd766eaf 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -314,6 +314,25 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(mm_iommu_lookup); +struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, + unsigned long ua, unsigned long size) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list, + next) { + if ((mem->ua <= ua) && + (ua + size <= mem->ua + + (mem->entries << PAGE_SHIFT))) { + ret = mem; + break; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm); + struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, unsigned long ua, unsigned long entries) { @@ -345,6 +364,26 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, } EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); +long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, + unsigned long ua, unsigned long *hpa) +{ + const long entry = (ua - mem->ua) >> PAGE_SHIFT; + void *va = &mem->hpas[entry]; + unsigned long *pa; + + if (entry >= mem->entries) + return -EFAULT; + + pa = (void *) vmalloc_to_phys(va); + if (!pa) + return 
-EFAULT; + + *hpa = *pa | (ua & ~PAGE_MASK); + + return 0; +} +EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm); + long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem) { if (atomic64_inc_not_zero(&mem->mapped)) From a540aa56ba3d29084f28710c8b93cc9c3c422943 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:48 +1100 Subject: [PATCH 02/22] powerpc/powernv/iommu: Add real mode version of iommu_table_ops::exchange() In real mode, TCE tables are invalidated using special cache-inhibited store instructions which are not available in virtual mode This defines and implements exchange_rm() callback. This does not define set_rm/clear_rm/flush_rm callbacks as there is no user for those - exchange/exchange_rm are only to be used by KVM for VFIO. The exchange_rm callback is defined for IODA1/IODA2 powernv platforms. This replaces list_for_each_entry_rcu with its lockless version as from now on pnv_pci_ioda2_tce_invalidate() can be called in the real mode too. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 7 ++++++ arch/powerpc/kernel/iommu.c | 25 ++++++++++++++++++++++ arch/powerpc/platforms/powernv/pci-ioda.c | 26 ++++++++++++++++++++++- 3 files changed, 57 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 2c1d50792944..4554699aec02 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -64,6 +64,11 @@ struct iommu_table_ops { long index, unsigned long *hpa, enum dma_data_direction *direction); + /* Real mode */ + int (*exchange_rm)(struct iommu_table *tbl, + long index, + unsigned long *hpa, + enum dma_data_direction *direction); #endif void (*clear)(struct iommu_table *tbl, long index, long npages); @@ -208,6 +213,8 @@ extern void iommu_del_device(struct device *dev); extern int __init tce_iommu_bus_notifier_init(void); extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, enum dma_data_direction *direction); +extern long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry, + unsigned long *hpa, enum dma_data_direction *direction); #else static inline void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 5f202a566ec5..685a4767b722 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1004,6 +1004,31 @@ long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, } EXPORT_SYMBOL_GPL(iommu_tce_xchg); +#ifdef CONFIG_PPC_BOOK3S_64 +long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry, + unsigned long *hpa, enum dma_data_direction *direction) +{ + long ret; + + ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); + + if (!ret && ((*direction == DMA_FROM_DEVICE) || + (*direction == DMA_BIDIRECTIONAL))) { + struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT); + + if (likely(pg)) { + SetPageDirty(pg); + } else { + tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); + ret = -EFAULT; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm); +#endif + int iommu_take_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index e36738291c32..572e9c9f1ea0 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ 
b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1860,6 +1860,17 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, return ret; } + +static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + long ret = pnv_tce_xchg(tbl, index, hpa, direction); + + if (!ret) + pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true); + + return ret; +} #endif static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, @@ -1874,6 +1885,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { .set = pnv_ioda1_tce_build, #ifdef CONFIG_IOMMU_API .exchange = pnv_ioda1_tce_xchg, + .exchange_rm = pnv_ioda1_tce_xchg_rm, #endif .clear = pnv_ioda1_tce_free, .get = pnv_tce_get, @@ -1948,7 +1960,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, { struct iommu_table_group_link *tgl; - list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { + list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) { struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); struct pnv_phb *phb = pe->phb; @@ -2004,6 +2016,17 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, return ret; } + +static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + long ret = pnv_tce_xchg(tbl, index, hpa, direction); + + if (!ret) + pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true); + + return ret; +} #endif static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, @@ -2024,6 +2047,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, #ifdef CONFIG_IOMMU_API .exchange = pnv_ioda2_tce_xchg, + .exchange_rm = pnv_ioda2_tce_xchg_rm, #endif .clear = pnv_ioda2_tce_free, .get = pnv_tce_get, From 11edf116e3a6352cfee6b1437d41603c9aff79c9 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:49 +1100 Subject: [PATCH 03/22] powerpc/iommu/vfio_spapr_tce: Cleanup iommu_table disposal At the moment iommu_table can be disposed by either calling iommu_table_free() directly or it_ops::free(); the only implementation of free() is in IODA2 - pnv_ioda2_table_free() - and it calls iommu_table_free() anyway. As we are going to have reference counting on tables, we need an unified way of disposing tables. This moves it_ops::free() call into iommu_free_table() and makes use of the latter. The free() callback now handles only platform-specific data. As from now on the iommu_free_table() calls it_ops->free(), we need to have it_ops initialized before calling iommu_free_table() so this moves this initialization in pnv_pci_ioda2_create_table(). This should cause no behavioral change. 
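As a rough sketch of the unified disposal path this change aims for (illustrative only; see the hunks below for the exact code):

	void iommu_free_table(struct iommu_table *tbl, const char *node_name)
	{
		if (!tbl)
			return;

		/* platform-specific teardown, e.g. freeing IODA2 TCE pages */
		if (tbl->it_ops->free)
			tbl->it_ops->free(tbl);

		/* ... generic bitmap checks and kfree(tbl), unchanged ... */
	}

so callers (including the VFIO SPAPR TCE driver) always go through iommu_free_table(), and the free() callback is reduced to platform-specific data.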
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Acked-by: Alex Williamson Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/iommu.c | 4 ++++ arch/powerpc/platforms/powernv/pci-ioda.c | 10 ++++------ drivers/vfio/vfio_iommu_spapr_tce.c | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 685a4767b722..a3689fdedd4a 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -719,6 +719,9 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) if (!tbl) return; + if (tbl->it_ops->free) + tbl->it_ops->free(tbl); + if (!tbl->it_map) { kfree(tbl); return; @@ -745,6 +748,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) /* free table */ kfree(tbl); } +EXPORT_SYMBOL_GPL(iommu_free_table); /* Creates TCEs for a user provided buffer. The user buffer must be * contiguous real kernel storage (not vmalloc). The address passed here diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 572e9c9f1ea0..5dae54cb11e3 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1424,7 +1424,6 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe iommu_group_put(pe->table_group.group); BUG_ON(pe->table_group.group); } - pnv_pci_ioda2_table_free_pages(tbl); iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); } @@ -2040,7 +2039,6 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, static void pnv_ioda2_table_free(struct iommu_table *tbl) { pnv_pci_ioda2_table_free_pages(tbl); - iommu_free_table(tbl, "pnv"); } static struct iommu_table_ops pnv_ioda2_iommu_ops = { @@ -2317,6 +2315,8 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, if (!tbl) return -ENOMEM; + tbl->it_ops = &pnv_ioda2_iommu_ops; + ret = pnv_pci_ioda2_table_alloc_pages(nid, bus_offset, page_shift, window_size, levels, tbl); @@ -2325,8 +2325,6 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, return ret; } - tbl->it_ops = &pnv_ioda2_iommu_ops; - *ptbl = tbl; return 0; @@ -2367,7 +2365,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) if (rc) { pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", rc); - pnv_ioda2_table_free(tbl); + iommu_free_table(tbl, ""); return rc; } @@ -2455,7 +2453,7 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (pe->pbus) pnv_ioda_setup_bus_dma(pe, pe->pbus, false); - pnv_ioda2_table_free(tbl); + iommu_free_table(tbl, "pnv"); } static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index cf3de91fbfe7..fbec7348a7e5 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -680,7 +680,7 @@ static void tce_iommu_free_table(struct tce_container *container, unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; tce_iommu_userspace_view_free(tbl, container->mm); - tbl->it_ops->free(tbl); + iommu_free_table(tbl, ""); decrement_locked_vm(container->mm, pages); } From e5afdf9dd515a9446c009f44f99f9bc2f91b89a7 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:50 +1100 Subject: [PATCH 04/22] powerpc/vfio_spapr_tce: Add reference counting to iommu_table So far iommu_table 
obejcts were only used in virtual mode and had a single owner. We are going to change this by implementing in-kernel acceleration of DMA mapping requests. The proposed acceleration will handle requests in real mode and KVM will keep references to tables. This adds a kref to iommu_table and defines new helpers to update it. This replaces iommu_free_table() with iommu_tce_table_put() and makes iommu_free_table() static. iommu_tce_table_get() is not used in this patch but it will be in the following patch. Since this touches prototypes, this also removes @node_name parameter as it has never been really useful on powernv and carrying it for the pseries platform code to iommu_free_table() seems to be quite useless as well. This should cause no behavioral change. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Acked-by: Alex Williamson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 5 +++-- arch/powerpc/kernel/iommu.c | 27 ++++++++++++++++++----- arch/powerpc/platforms/powernv/pci-ioda.c | 14 ++++++------ arch/powerpc/platforms/powernv/pci.c | 1 + arch/powerpc/platforms/pseries/iommu.c | 3 ++- arch/powerpc/platforms/pseries/vio.c | 2 +- drivers/vfio/vfio_iommu_spapr_tce.c | 2 +- 7 files changed, 37 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 4554699aec02..d96142572e6d 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -119,6 +119,7 @@ struct iommu_table { struct list_head it_group_list;/* List of iommu_table_group_link */ unsigned long *it_userspace; /* userspace view of the table */ struct iommu_table_ops *it_ops; + struct kref it_kref; }; #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ @@ -151,8 +152,8 @@ static inline void *get_iommu_table_base(struct device *dev) extern int dma_iommu_dma_supported(struct device *dev, u64 mask); -/* Frees table for an individual device node */ -extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); +extern struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl); +extern int iommu_tce_table_put(struct iommu_table *tbl); /* Initializes an iommu_table based in values set in the passed-in * structure diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index a3689fdedd4a..5a3231fedf08 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -711,13 +711,13 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) return tbl; } -void iommu_free_table(struct iommu_table *tbl, const char *node_name) +static void iommu_table_free(struct kref *kref) { unsigned long bitmap_sz; unsigned int order; + struct iommu_table *tbl; - if (!tbl) - return; + tbl = container_of(kref, struct iommu_table, it_kref); if (tbl->it_ops->free) tbl->it_ops->free(tbl); @@ -736,7 +736,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) /* verify that table contains no entries */ if (!bitmap_empty(tbl->it_map, tbl->it_size)) - pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name); + pr_warn("%s: Unexpected TCEs\n", __func__); /* calculate bitmap size in bytes */ bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); @@ -748,7 +748,24 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) /* free table */ kfree(tbl); } -EXPORT_SYMBOL_GPL(iommu_free_table); + +struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl) +{ + if (kref_get_unless_zero(&tbl->it_kref)) + return tbl; + + return NULL; +} 
+EXPORT_SYMBOL_GPL(iommu_tce_table_get); + +int iommu_tce_table_put(struct iommu_table *tbl) +{ + if (WARN_ON(!tbl)) + return 0; + + return kref_put(&tbl->it_kref, iommu_table_free); +} +EXPORT_SYMBOL_GPL(iommu_tce_table_put); /* Creates TCEs for a user provided buffer. The user buffer must be * contiguous real kernel storage (not vmalloc). The address passed here diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 5dae54cb11e3..ee4cdb5b893f 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1424,7 +1424,7 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe iommu_group_put(pe->table_group.group); BUG_ON(pe->table_group.group); } - iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); + iommu_tce_table_put(tbl); } static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) @@ -2225,7 +2225,7 @@ found: __free_pages(tce_mem, get_order(tce32_segsz * segs)); if (tbl) { pnv_pci_unlink_table_and_group(tbl, &pe->table_group); - iommu_free_table(tbl, "pnv"); + iommu_tce_table_put(tbl); } } @@ -2321,7 +2321,7 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, bus_offset, page_shift, window_size, levels, tbl); if (ret) { - iommu_free_table(tbl, "pnv"); + iommu_tce_table_put(tbl); return ret; } @@ -2365,7 +2365,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) if (rc) { pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", rc); - iommu_free_table(tbl, ""); + iommu_tce_table_put(tbl); return rc; } @@ -2453,7 +2453,7 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (pe->pbus) pnv_ioda_setup_bus_dma(pe, pe->pbus, false); - iommu_free_table(tbl, "pnv"); + iommu_tce_table_put(tbl); } static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) @@ -3428,7 +3428,7 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe) } free_pages(tbl->it_base, get_order(tbl->it_size << 3)); - iommu_free_table(tbl, "pnv"); + iommu_tce_table_put(tbl); } static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) @@ -3455,7 +3455,7 @@ static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) } pnv_pci_ioda2_table_free_pages(tbl); - iommu_free_table(tbl, "pnv"); + iommu_tce_table_put(tbl); } static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe, diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index eb835e977e33..204a829ff506 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -767,6 +767,7 @@ struct iommu_table *pnv_pci_table_alloc(int nid) tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid); INIT_LIST_HEAD_RCU(&tbl->it_group_list); + kref_init(&tbl->it_kref); return tbl; } diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 4d757eaa46bf..7ce5db209abf 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -74,6 +74,7 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) goto fail_exit; INIT_LIST_HEAD_RCU(&tbl->it_group_list); + kref_init(&tbl->it_kref); tgl->table_group = table_group; list_add_rcu(&tgl->next, &tbl->it_group_list); @@ -115,7 +116,7 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group, BUG_ON(table_group->group); } #endif - iommu_free_table(tbl, node_name); 
+ iommu_tce_table_put(tbl); kfree(table_group); } diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 720493932486..28b09fd797ec 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1318,7 +1318,7 @@ static void vio_dev_release(struct device *dev) struct iommu_table *tbl = get_iommu_table_base(dev); if (tbl) - iommu_free_table(tbl, of_node_full_name(dev->of_node)); + iommu_tce_table_put(tbl); of_node_put(dev->of_node); kfree(to_vio_dev(dev)); } diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index fbec7348a7e5..8031d3a55a17 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -680,7 +680,7 @@ static void tce_iommu_free_table(struct tce_container *container, unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; tce_iommu_userspace_view_free(tbl, container->mm); - iommu_free_table(tbl, ""); + iommu_tce_table_put(tbl); decrement_locked_vm(container->mm, pages); } From 307d927967007acef98cfd3f0639c7a4bf234ede Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 22 Mar 2017 21:02:08 +1100 Subject: [PATCH 05/22] KVM: PPC: Provide functions for queueing up FP/VEC/VSX unavailable interrupts This provides functions that can be used for generating interrupts indicating that a given functional unit (floating point, vector, or VSX) is unavailable. These functions will be used in instruction emulation code. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 3 +++ arch/powerpc/kvm/book3s.c | 18 ++++++++++++++++++ arch/powerpc/kvm/booke.c | 5 +++++ 3 files changed, 26 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index dd11c4c8c56a..4f1f22fc0ea1 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -132,6 +132,9 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu); extern int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags); +extern void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu); +extern void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu); +extern void kvmppc_core_queue_vsx_unavail(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index b6b5c185bd92..0ff0d07c0757 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -197,6 +197,24 @@ void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) } EXPORT_SYMBOL_GPL(kvmppc_core_queue_program); +void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu) +{ + /* might as well deliver this straight away */ + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, 0); +} + +void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu) +{ + /* might as well deliver this straight away */ + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_ALTIVEC, 0); +} + +void kvmppc_core_queue_vsx_unavail(struct kvm_vcpu *vcpu) +{ + /* might as well deliver this straight away */ + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_VSX, 0); +} + void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) { kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); diff --git a/arch/powerpc/kvm/booke.c 
b/arch/powerpc/kvm/booke.c index 0514cbd4e533..3c296c2eacf8 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -300,6 +300,11 @@ void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags) kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM); } +void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu) +{ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL); +} + void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) { kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DECREMENTER); From 6f63e81bda98cbb549b01faf978884692ded438d Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Tue, 21 Feb 2017 21:12:36 +0800 Subject: [PATCH 06/22] KVM: PPC: Book3S: Add MMIO emulation for FP and VSX instructions This patch provides the MMIO load/store emulation for instructions of 'double & vector unsigned char & vector signed char & vector unsigned short & vector signed short & vector unsigned int & vector signed int & vector double '. The instructions that this adds emulation for are: - ldx, ldux, lwax, - lfs, lfsx, lfsu, lfsux, lfd, lfdx, lfdu, lfdux, - stfs, stfsx, stfsu, stfsux, stfd, stfdx, stfdu, stfdux, stfiwx, - lxsdx, lxsspx, lxsiwax, lxsiwzx, lxvd2x, lxvw4x, lxvdsx, - stxsdx, stxsspx, stxsiwx, stxvd2x, stxvw4x [paulus@ozlabs.org - some cleanups, fixes and rework, make it compile for Book E, fix build when PR KVM is built in] Signed-off-by: Bin Lu Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/disassemble.h | 5 + arch/powerpc/include/asm/kvm_host.h | 23 ++ arch/powerpc/include/asm/kvm_ppc.h | 7 + arch/powerpc/include/asm/ppc-opcode.h | 50 ++++ arch/powerpc/kvm/emulate_loadstore.c | 335 ++++++++++++++++++++++++- arch/powerpc/kvm/powerpc.c | 318 ++++++++++++++++++++++- 6 files changed, 731 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h index 4852e849128b..c0a55050f70f 100644 --- a/arch/powerpc/include/asm/disassemble.h +++ b/arch/powerpc/include/asm/disassemble.h @@ -87,6 +87,11 @@ static inline unsigned int get_oc(u32 inst) return (inst >> 11) & 0x7fff; } +static inline unsigned int get_tx_or_sx(u32 inst) +{ + return (inst) & 0x1; +} + #define IS_XFORM(inst) (get_op(inst) == 31) #define IS_DSFORM(inst) (get_op(inst) >= 56) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 01d05c76f1c7..2acc691ed9d0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -438,6 +438,11 @@ struct mmio_hpte_cache { unsigned int index; }; +#define KVMPPC_VSX_COPY_NONE 0 +#define KVMPPC_VSX_COPY_WORD 1 +#define KVMPPC_VSX_COPY_DWORD 2 +#define KVMPPC_VSX_COPY_DWORD_LOAD_DUMP 3 + struct openpic; struct kvm_vcpu_arch { @@ -641,6 +646,21 @@ struct kvm_vcpu_arch { u8 io_gpr; /* GPR used as IO source/target */ u8 mmio_host_swabbed; u8 mmio_sign_extend; + /* conversion between single and double precision */ + u8 mmio_sp64_extend; + /* + * Number of simulations for vsx. + * If we use 2*8bytes to simulate 1*16bytes, + * then the number should be 2 and + * mmio_vsx_copy_type=KVMPPC_VSX_COPY_DWORD. + * If we use 4*4bytes to simulate 1*16bytes, + * the number should be 4 and + * mmio_vsx_copy_type=KVMPPC_VSX_COPY_WORD. 
+ */ + u8 mmio_vsx_copy_nums; + u8 mmio_vsx_offset; + u8 mmio_vsx_copy_type; + u8 mmio_vsx_tx_sx_enabled; u8 osi_needed; u8 osi_enabled; u8 papr_enabled; @@ -729,6 +749,8 @@ struct kvm_vcpu_arch { }; #define VCPU_FPR(vcpu, i) (vcpu)->arch.fp.fpr[i][TS_FPROFFSET] +#define VCPU_VSX_FPR(vcpu, i, j) ((vcpu)->arch.fp.fpr[i][j]) +#define VCPU_VSX_VR(vcpu, i) ((vcpu)->arch.vr.vr[i]) /* Values for vcpu->arch.state */ #define KVMPPC_VCPU_NOTREADY 0 @@ -742,6 +764,7 @@ struct kvm_vcpu_arch { #define KVM_MMIO_REG_FPR 0x0020 #define KVM_MMIO_REG_QPR 0x0040 #define KVM_MMIO_REG_FQPR 0x0060 +#define KVM_MMIO_REG_VSX 0x0080 #define __KVM_HAVE_ARCH_WQP #define __KVM_HAVE_CREATE_DEVICE diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 4f1f22fc0ea1..bbecec4e753a 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -78,9 +78,15 @@ extern int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, extern int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int rt, unsigned int bytes, int is_default_endian); +extern int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int rt, unsigned int bytes, + int is_default_endian, int mmio_sign_extend); extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian); +extern int kvmppc_handle_vsx_store(struct kvm_run *run, struct kvm_vcpu *vcpu, + int rs, unsigned int bytes, + int is_default_endian); extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type, u32 *inst); @@ -243,6 +249,7 @@ union kvmppc_one_reg { u64 dval; vector128 vval; u64 vsxval[2]; + u32 vsx32val[4]; struct { u64 addr; u64 length; diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index e7d6d86563ee..1e37c3c0733a 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -86,7 +86,9 @@ #define OP_TRAP_64 2 #define OP_31_XOP_TRAP 4 +#define OP_31_XOP_LDX 21 #define OP_31_XOP_LWZX 23 +#define OP_31_XOP_LDUX 53 #define OP_31_XOP_DCBST 54 #define OP_31_XOP_LWZUX 55 #define OP_31_XOP_TRAP_64 68 @@ -99,6 +101,7 @@ #define OP_31_XOP_LHZX 279 #define OP_31_XOP_LHZUX 311 #define OP_31_XOP_MFSPR 339 +#define OP_31_XOP_LWAX 341 #define OP_31_XOP_LHAX 343 #define OP_31_XOP_LHAUX 375 #define OP_31_XOP_STHX 407 @@ -108,10 +111,46 @@ #define OP_31_XOP_LWBRX 534 #define OP_31_XOP_TLBSYNC 566 #define OP_31_XOP_STWBRX 662 +#define OP_31_XOP_STFSX 663 +#define OP_31_XOP_STFSUX 695 +#define OP_31_XOP_STFDX 727 +#define OP_31_XOP_STFDUX 759 #define OP_31_XOP_LHBRX 790 #define OP_31_XOP_STHBRX 918 +#define OP_31_XOP_STFIWX 983 + +/* VSX Scalar Load Instructions */ +#define OP_31_XOP_LXSDX 588 +#define OP_31_XOP_LXSSPX 524 +#define OP_31_XOP_LXSIWAX 76 +#define OP_31_XOP_LXSIWZX 12 + +/* VSX Scalar Store Instructions */ +#define OP_31_XOP_STXSDX 716 +#define OP_31_XOP_STXSSPX 652 +#define OP_31_XOP_STXSIWX 140 + +/* VSX Vector Load Instructions */ +#define OP_31_XOP_LXVD2X 844 +#define OP_31_XOP_LXVW4X 780 + +/* VSX Vector Load and Splat Instruction */ +#define OP_31_XOP_LXVDSX 332 + +/* VSX Vector Store Instructions */ +#define OP_31_XOP_STXVD2X 972 +#define OP_31_XOP_STXVW4X 908 + +#define OP_31_XOP_LFSX 535 +#define OP_31_XOP_LFSUX 567 +#define OP_31_XOP_LFDX 599 +#define OP_31_XOP_LFDUX 631 #define OP_LWZ 32 +#define OP_STFS 52 +#define OP_STFSU 53 +#define OP_STFD 54 +#define OP_STFDU 55 #define OP_LD 58 #define 
OP_LWZU 33 #define OP_LBZ 34 @@ -127,6 +166,17 @@ #define OP_LHAU 43 #define OP_STH 44 #define OP_STHU 45 +#define OP_LMW 46 +#define OP_STMW 47 +#define OP_LFS 48 +#define OP_LFSU 49 +#define OP_LFD 50 +#define OP_LFDU 51 +#define OP_STFS 52 +#define OP_STFSU 53 +#define OP_STFD 54 +#define OP_STFDU 55 +#define OP_LQ 56 /* sorted alphabetically */ #define PPC_INST_BHRBE 0x7c00025c diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 6d3c0ee1d744..9cda1b9dbc51 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -34,6 +34,30 @@ #include "timing.h" #include "trace.h" +#ifdef CONFIG_PPC_FPU +static bool kvmppc_check_fp_disabled(struct kvm_vcpu *vcpu) +{ + if (!(kvmppc_get_msr(vcpu) & MSR_FP)) { + kvmppc_core_queue_fpunavail(vcpu); + return true; + } + + return false; +} +#endif /* CONFIG_PPC_FPU */ + +#ifdef CONFIG_VSX +static bool kvmppc_check_vsx_disabled(struct kvm_vcpu *vcpu) +{ + if (!(kvmppc_get_msr(vcpu) & MSR_VSX)) { + kvmppc_core_queue_vsx_unavail(vcpu); + return true; + } + + return false; +} +#endif /* CONFIG_VSX */ + /* XXX to do: * lhax * lhaux @@ -66,6 +90,19 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) rs = get_rs(inst); rt = get_rt(inst); + /* + * if mmio_vsx_tx_sx_enabled == 0, copy data between + * VSR[0..31] and memory + * if mmio_vsx_tx_sx_enabled == 1, copy data between + * VSR[32..63] and memory + */ + vcpu->arch.mmio_vsx_tx_sx_enabled = get_tx_or_sx(inst); + vcpu->arch.mmio_vsx_copy_nums = 0; + vcpu->arch.mmio_vsx_offset = 0; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_NONE; + vcpu->arch.mmio_sp64_extend = 0; + vcpu->arch.mmio_sign_extend = 0; + switch (get_op(inst)) { case 31: switch (get_xop(inst)) { @@ -157,6 +194,230 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) 2, 0); break; + case OP_31_XOP_LDX: + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + break; + + case OP_31_XOP_LDUX: + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_LWAX: + emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1); + break; + +#ifdef CONFIG_PPC_FPU + case OP_31_XOP_LFSX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + break; + + case OP_31_XOP_LFSUX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_LFDX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 8, 1); + break; + + case OP_31_XOP_LFDUX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_STFSX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), 4, 1); + break; + + case OP_31_XOP_STFSUX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_STFDX: + if 
(kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 8, 1); + break; + + case OP_31_XOP_STFDUX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_STFIWX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 4, 1); + break; +#endif + +#ifdef CONFIG_VSX + case OP_31_XOP_LXSDX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 8, 1, 0); + break; + + case OP_31_XOP_LXSSPX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 4, 1, 0); + break; + + case OP_31_XOP_LXSIWAX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 4, 1, 1); + break; + + case OP_31_XOP_LXSIWZX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 4, 1, 0); + break; + + case OP_31_XOP_LXVD2X: + /* + * In this case, the official load/store process is like this: + * Step1, exit from vm by page fault isr, then kvm save vsr. + * Please see guest_exit_cont->store_fp_state->SAVE_32VSRS + * as reference. + * + * Step2, copy data between memory and VCPU + * Notice: for LXVD2X/STXVD2X/LXVW4X/STXVW4X, we use + * 2copies*8bytes or 4copies*4bytes + * to simulate one copy of 16bytes. + * Also there is an endian issue here, we should notice the + * layout of memory. + * Please see MARCO of LXVD2X_ROT/STXVD2X_ROT as more reference. + * If host is little-endian, kvm will call XXSWAPD for + * LXVD2X_ROT/STXVD2X_ROT. + * So, if host is little-endian, + * the postion of memeory should be swapped. + * + * Step3, return to guest, kvm reset register. + * Please see kvmppc_hv_entry->load_fp_state->REST_32VSRS + * as reference. 
+ */ + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 2; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 8, 1, 0); + break; + + case OP_31_XOP_LXVW4X: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 4; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_WORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 4, 1, 0); + break; + + case OP_31_XOP_LXVDSX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = + KVMPPC_VSX_COPY_DWORD_LOAD_DUMP; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 8, 1, 0); + break; + + case OP_31_XOP_STXSDX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 8, 1); + break; + + case OP_31_XOP_STXSSPX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 4, 1); + break; + + case OP_31_XOP_STXSIWX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_offset = 1; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_WORD; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 4, 1); + break; + + case OP_31_XOP_STXVD2X: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 2; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 8, 1); + break; + + case OP_31_XOP_STXVW4X: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 4; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_WORD; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 4, 1); + break; +#endif /* CONFIG_VSX */ default: emulated = EMULATE_FAIL; break; @@ -167,7 +428,45 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); break; - /* TBD: Add support for other 64 bit load variants like ldu, ldux, ldx etc. */ +#ifdef CONFIG_PPC_FPU + case OP_STFS: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 4, 1); + break; + + case OP_STFSU: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_STFD: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 8, 1); + break; + + case OP_STFDU: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; +#endif + + /* TBD: Add support for other 64 bit load variants like ldu etc. 
*/ case OP_LD: rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); @@ -252,6 +551,40 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; +#ifdef CONFIG_PPC_FPU + case OP_LFS: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + break; + + case OP_LFSU: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_LFD: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 8, 1); + break; + + case OP_LFDU: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; +#endif + default: emulated = EMULATE_FAIL; break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0e42aa8a279f..6c7244879bfe 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "timing.h" #include "irq.h" #include "../mm/mmu_decl.h" @@ -801,6 +802,129 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod); } +#ifdef CONFIG_VSX +static inline int kvmppc_get_vsr_dword_offset(int index) +{ + int offset; + + if ((index != 0) && (index != 1)) + return -1; + +#ifdef __BIG_ENDIAN + offset = index; +#else + offset = 1 - index; +#endif + + return offset; +} + +static inline int kvmppc_get_vsr_word_offset(int index) +{ + int offset; + + if ((index > 3) || (index < 0)) + return -1; + +#ifdef __BIG_ENDIAN + offset = index; +#else + offset = 3 - index; +#endif + return offset; +} + +static inline void kvmppc_set_vsr_dword(struct kvm_vcpu *vcpu, + u64 gpr) +{ + union kvmppc_one_reg val; + int offset = kvmppc_get_vsr_dword_offset(vcpu->arch.mmio_vsx_offset); + int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK; + + if (offset == -1) + return; + + if (vcpu->arch.mmio_vsx_tx_sx_enabled) { + val.vval = VCPU_VSX_VR(vcpu, index); + val.vsxval[offset] = gpr; + VCPU_VSX_VR(vcpu, index) = val.vval; + } else { + VCPU_VSX_FPR(vcpu, index, offset) = gpr; + } +} + +static inline void kvmppc_set_vsr_dword_dump(struct kvm_vcpu *vcpu, + u64 gpr) +{ + union kvmppc_one_reg val; + int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK; + + if (vcpu->arch.mmio_vsx_tx_sx_enabled) { + val.vval = VCPU_VSX_VR(vcpu, index); + val.vsxval[0] = gpr; + val.vsxval[1] = gpr; + VCPU_VSX_VR(vcpu, index) = val.vval; + } else { + VCPU_VSX_FPR(vcpu, index, 0) = gpr; + VCPU_VSX_FPR(vcpu, index, 1) = gpr; + } +} + +static inline void kvmppc_set_vsr_word(struct kvm_vcpu *vcpu, + u32 gpr32) +{ + union kvmppc_one_reg val; + int offset = kvmppc_get_vsr_word_offset(vcpu->arch.mmio_vsx_offset); + int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK; + int dword_offset, word_offset; + + if (offset == -1) + return; + + if (vcpu->arch.mmio_vsx_tx_sx_enabled) { + val.vval = VCPU_VSX_VR(vcpu, index); + val.vsx32val[offset] = gpr32; + VCPU_VSX_VR(vcpu, index) = val.vval; + } else { + dword_offset = offset / 2; + word_offset = offset % 2; + val.vsxval[0] = VCPU_VSX_FPR(vcpu, index, dword_offset); + val.vsx32val[word_offset] = gpr32; + 
VCPU_VSX_FPR(vcpu, index, dword_offset) = val.vsxval[0]; + } +} +#endif /* CONFIG_VSX */ + +#ifdef CONFIG_PPC_FPU +static inline u64 sp_to_dp(u32 fprs) +{ + u64 fprd; + + preempt_disable(); + enable_kernel_fp(); + asm ("lfs%U1%X1 0,%1; stfd%U0%X0 0,%0" : "=m" (fprd) : "m" (fprs) + : "fr0"); + preempt_enable(); + return fprd; +} + +static inline u32 dp_to_sp(u64 fprd) +{ + u32 fprs; + + preempt_disable(); + enable_kernel_fp(); + asm ("lfd%U1%X1 0,%1; stfs%U0%X0 0,%0" : "=m" (fprs) : "m" (fprd) + : "fr0"); + preempt_enable(); + return fprs; +} + +#else +#define sp_to_dp(x) (x) +#define dp_to_sp(x) (x) +#endif /* CONFIG_PPC_FPU */ + static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { @@ -827,6 +951,10 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, } } + /* conversion between single and double precision */ + if ((vcpu->arch.mmio_sp64_extend) && (run->mmio.len == 4)) + gpr = sp_to_dp(gpr); + if (vcpu->arch.mmio_sign_extend) { switch (run->mmio.len) { #ifdef CONFIG_PPC64 @@ -843,8 +971,6 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, } } - kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); - switch (vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) { case KVM_MMIO_REG_GPR: kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); @@ -860,6 +986,17 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr; vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; +#endif +#ifdef CONFIG_VSX + case KVM_MMIO_REG_VSX: + if (vcpu->arch.mmio_vsx_copy_type == KVMPPC_VSX_COPY_DWORD) + kvmppc_set_vsr_dword(vcpu, gpr); + else if (vcpu->arch.mmio_vsx_copy_type == KVMPPC_VSX_COPY_WORD) + kvmppc_set_vsr_word(vcpu, gpr); + else if (vcpu->arch.mmio_vsx_copy_type == + KVMPPC_VSX_COPY_DWORD_LOAD_DUMP) + kvmppc_set_vsr_dword_dump(vcpu, gpr); + break; #endif default: BUG(); @@ -927,6 +1064,35 @@ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, return __kvmppc_handle_load(run, vcpu, rt, bytes, is_default_endian, 1); } +#ifdef CONFIG_VSX +int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int rt, unsigned int bytes, + int is_default_endian, int mmio_sign_extend) +{ + enum emulation_result emulated = EMULATE_DONE; + + /* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */ + if ( (vcpu->arch.mmio_vsx_copy_nums > 4) || + (vcpu->arch.mmio_vsx_copy_nums < 0) ) { + return EMULATE_FAIL; + } + + while (vcpu->arch.mmio_vsx_copy_nums) { + emulated = __kvmppc_handle_load(run, vcpu, rt, bytes, + is_default_endian, mmio_sign_extend); + + if (emulated != EMULATE_DONE) + break; + + vcpu->arch.paddr_accessed += run->mmio.len; + + vcpu->arch.mmio_vsx_copy_nums--; + vcpu->arch.mmio_vsx_offset++; + } + return emulated; +} +#endif /* CONFIG_VSX */ + int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian) { @@ -952,6 +1118,9 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->mmio_needed = 1; vcpu->mmio_is_write = 1; + if ((vcpu->arch.mmio_sp64_extend) && (bytes == 4)) + val = dp_to_sp(val); + /* Store the value at the lowest bytes in 'data'. 
*/ if (!host_swabbed) { switch (bytes) { @@ -985,6 +1154,129 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvmppc_handle_store); +#ifdef CONFIG_VSX +static inline int kvmppc_get_vsr_data(struct kvm_vcpu *vcpu, int rs, u64 *val) +{ + u32 dword_offset, word_offset; + union kvmppc_one_reg reg; + int vsx_offset = 0; + int copy_type = vcpu->arch.mmio_vsx_copy_type; + int result = 0; + + switch (copy_type) { + case KVMPPC_VSX_COPY_DWORD: + vsx_offset = + kvmppc_get_vsr_dword_offset(vcpu->arch.mmio_vsx_offset); + + if (vsx_offset == -1) { + result = -1; + break; + } + + if (!vcpu->arch.mmio_vsx_tx_sx_enabled) { + *val = VCPU_VSX_FPR(vcpu, rs, vsx_offset); + } else { + reg.vval = VCPU_VSX_VR(vcpu, rs); + *val = reg.vsxval[vsx_offset]; + } + break; + + case KVMPPC_VSX_COPY_WORD: + vsx_offset = + kvmppc_get_vsr_word_offset(vcpu->arch.mmio_vsx_offset); + + if (vsx_offset == -1) { + result = -1; + break; + } + + if (!vcpu->arch.mmio_vsx_tx_sx_enabled) { + dword_offset = vsx_offset / 2; + word_offset = vsx_offset % 2; + reg.vsxval[0] = VCPU_VSX_FPR(vcpu, rs, dword_offset); + *val = reg.vsx32val[word_offset]; + } else { + reg.vval = VCPU_VSX_VR(vcpu, rs); + *val = reg.vsx32val[vsx_offset]; + } + break; + + default: + result = -1; + break; + } + + return result; +} + +int kvmppc_handle_vsx_store(struct kvm_run *run, struct kvm_vcpu *vcpu, + int rs, unsigned int bytes, int is_default_endian) +{ + u64 val; + enum emulation_result emulated = EMULATE_DONE; + + vcpu->arch.io_gpr = rs; + + /* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */ + if ( (vcpu->arch.mmio_vsx_copy_nums > 4) || + (vcpu->arch.mmio_vsx_copy_nums < 0) ) { + return EMULATE_FAIL; + } + + while (vcpu->arch.mmio_vsx_copy_nums) { + if (kvmppc_get_vsr_data(vcpu, rs, &val) == -1) + return EMULATE_FAIL; + + emulated = kvmppc_handle_store(run, vcpu, + val, bytes, is_default_endian); + + if (emulated != EMULATE_DONE) + break; + + vcpu->arch.paddr_accessed += run->mmio.len; + + vcpu->arch.mmio_vsx_copy_nums--; + vcpu->arch.mmio_vsx_offset++; + } + + return emulated; +} + +static int kvmppc_emulate_mmio_vsx_loadstore(struct kvm_vcpu *vcpu, + struct kvm_run *run) +{ + enum emulation_result emulated = EMULATE_FAIL; + int r; + + vcpu->arch.paddr_accessed += run->mmio.len; + + if (!vcpu->mmio_is_write) { + emulated = kvmppc_handle_vsx_load(run, vcpu, vcpu->arch.io_gpr, + run->mmio.len, 1, vcpu->arch.mmio_sign_extend); + } else { + emulated = kvmppc_handle_vsx_store(run, vcpu, + vcpu->arch.io_gpr, run->mmio.len, 1); + } + + switch (emulated) { + case EMULATE_DO_MMIO: + run->exit_reason = KVM_EXIT_MMIO; + r = RESUME_HOST; + break; + case EMULATE_FAIL: + pr_info("KVM: MMIO emulation failed (VSX repeat)\n"); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + r = RESUME_HOST; + break; + default: + r = RESUME_GUEST; + break; + } + return r; +} +#endif /* CONFIG_VSX */ + int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) { int r = 0; @@ -1087,13 +1379,24 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) int r; sigset_t sigsaved; - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); - if (vcpu->mmio_needed) { + vcpu->mmio_needed = 0; if (!vcpu->mmio_is_write) kvmppc_complete_mmio_load(vcpu, run); - vcpu->mmio_needed = 0; +#ifdef CONFIG_VSX + if (vcpu->arch.mmio_vsx_copy_nums > 0) { + vcpu->arch.mmio_vsx_copy_nums--; + vcpu->arch.mmio_vsx_offset++; + } + + if 
(vcpu->arch.mmio_vsx_copy_nums > 0) { + r = kvmppc_emulate_mmio_vsx_loadstore(vcpu, run); + if (r == RESUME_HOST) { + vcpu->mmio_needed = 1; + return r; + } + } +#endif } else if (vcpu->arch.osi_needed) { u64 *gprs = run->osi.gprs; int i; @@ -1115,6 +1418,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) #endif } + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + if (run->immediate_exit) r = -EINTR; else From 91242fd1a3eb96e4efe43bdf96c2fcec97fdf4ff Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 17 Mar 2017 19:31:38 +1100 Subject: [PATCH 07/22] KVM: PPC: Add MMIO emulation for stdx (store doubleword indexed) This adds missing stdx emulation for emulated MMIO accesses by KVM guests. This allows the Mellanox mlx5_core driver from recent kernels to work when MMIO emulation is enforced by userspace. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/ppc-opcode.h | 1 + arch/powerpc/kvm/emulate_loadstore.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 1e37c3c0733a..94e7df22630e 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -94,6 +94,7 @@ #define OP_31_XOP_TRAP_64 68 #define OP_31_XOP_DCBF 86 #define OP_31_XOP_LBZX 87 +#define OP_31_XOP_STDX 149 #define OP_31_XOP_STWX 151 #define OP_31_XOP_STBX 215 #define OP_31_XOP_LBZUX 119 diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 9cda1b9dbc51..a0f27a30379d 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -119,6 +119,12 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; + case OP_31_XOP_STDX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 8, 1); + break; + case OP_31_XOP_STWX: emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), From ceba57df43a25ede55a96a795703c46022b1d1d0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 21 Mar 2017 15:43:47 +1100 Subject: [PATCH 08/22] KVM: PPC: Emulation for more integer loads and stores This adds emulation for the following integer loads and stores, thus enabling them to be used in a guest for accessing emulated MMIO locations. - lhaux - lwaux - lwzux - ldu - lwa - stdux - stwux - stdu - ldbrx - stdbrx Previously, most of these would cause an emulation failure exit to userspace, though ldu and lwa got treated incorrectly as ld, and stdu got treated incorrectly as std. This also tidies up some of the formatting and updates the comment listing instructions that still need to be implemented. With this, all integer loads and stores that are defined in the Power ISA v2.07 are emulated, except for those that are permitted to trap when used on cache-inhibited or write-through mappings (and which do in fact trap on POWER8), that is, lmw/stmw, lswi/stswi, lswx/stswx, lq/stq, and l[bhwdq]arx/st[bhwdq]cx. 
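All of the update-form ("u"/"ux") variants added here follow the same pattern: perform the access, then write the effective address back into RA. A sketch of one of the new cases (see the hunks below for the full list):

	case OP_31_XOP_LWZUX:
		emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
		/* update form: effective address is written back into RA */
		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
		break;

The byte-reversed forms (ldbrx/stdbrx) reuse the same helpers with is_default_endian = 0.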
Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/ppc-opcode.h | 5 + arch/powerpc/kvm/emulate_loadstore.c | 135 ++++++++++++++++---------- 2 files changed, 91 insertions(+), 49 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 94e7df22630e..738bac164f8b 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -96,6 +96,8 @@ #define OP_31_XOP_LBZX 87 #define OP_31_XOP_STDX 149 #define OP_31_XOP_STWX 151 +#define OP_31_XOP_STDUX 181 +#define OP_31_XOP_STWUX 183 #define OP_31_XOP_STBX 215 #define OP_31_XOP_LBZUX 119 #define OP_31_XOP_STBUX 247 @@ -104,13 +106,16 @@ #define OP_31_XOP_MFSPR 339 #define OP_31_XOP_LWAX 341 #define OP_31_XOP_LHAX 343 +#define OP_31_XOP_LWAUX 373 #define OP_31_XOP_LHAUX 375 #define OP_31_XOP_STHX 407 #define OP_31_XOP_STHUX 439 #define OP_31_XOP_MTSPR 467 #define OP_31_XOP_DCBI 470 +#define OP_31_XOP_LDBRX 532 #define OP_31_XOP_LWBRX 534 #define OP_31_XOP_TLBSYNC 566 +#define OP_31_XOP_STDBRX 660 #define OP_31_XOP_STWBRX 662 #define OP_31_XOP_STFSX 663 #define OP_31_XOP_STFSUX 695 diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index a0f27a30379d..f10ba0c2c202 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -58,18 +58,14 @@ static bool kvmppc_check_vsx_disabled(struct kvm_vcpu *vcpu) } #endif /* CONFIG_VSX */ -/* XXX to do: - * lhax - * lhaux - * lswx - * lswi - * stswx - * stswi - * lha - * lhau - * lmw - * stmw +/* + * XXX to do: + * lfiwax, lfiwzx + * vector loads and stores * + * Instructions that trap when used on cache-inhibited mappings + * are not emulated here: multiple and string instructions, + * lq/stq, and the load-reserve/store-conditional instructions. 
*/ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) { @@ -110,6 +106,11 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); break; + case OP_31_XOP_LWZUX: + emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + case OP_31_XOP_LBZX: emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); break; @@ -121,26 +122,34 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) case OP_31_XOP_STDX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 8, 1); + kvmppc_get_gpr(vcpu, rs), 8, 1); + break; + + case OP_31_XOP_STDUX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_31_XOP_STWX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 4, 1); + kvmppc_get_gpr(vcpu, rs), 4, 1); + break; + + case OP_31_XOP_STWUX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_31_XOP_STBX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 1, 1); + kvmppc_get_gpr(vcpu, rs), 1, 1); break; case OP_31_XOP_STBUX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 1, 1); + kvmppc_get_gpr(vcpu, rs), 1, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; @@ -148,6 +157,11 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); break; + case OP_31_XOP_LHAUX: + emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + case OP_31_XOP_LHZX: emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); break; @@ -159,14 +173,12 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) case OP_31_XOP_STHX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 1); + kvmppc_get_gpr(vcpu, rs), 2, 1); break; case OP_31_XOP_STHUX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 1); + kvmppc_get_gpr(vcpu, rs), 2, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; @@ -186,8 +198,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) case OP_31_XOP_STWBRX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 4, 0); + kvmppc_get_gpr(vcpu, rs), 4, 0); break; case OP_31_XOP_LHBRX: @@ -196,8 +207,16 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) case OP_31_XOP_STHBRX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 0); + kvmppc_get_gpr(vcpu, rs), 2, 0); + break; + + case OP_31_XOP_LDBRX: + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 0); + break; + + case OP_31_XOP_STDBRX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 8, 0); break; case OP_31_XOP_LDX: @@ -213,6 +232,11 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1); break; + case OP_31_XOP_LWAUX: + emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + #ifdef CONFIG_PPC_FPU case OP_31_XOP_LFSX: if (kvmppc_check_fp_disabled(vcpu)) @@ -267,16 +291,14 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) if (kvmppc_check_fp_disabled(vcpu)) return EMULATE_DONE; emulated = kvmppc_handle_store(run, vcpu, - VCPU_FPR(vcpu, rs), - 8, 1); + VCPU_FPR(vcpu, rs), 8, 1); break; case OP_31_XOP_STFDUX: if (kvmppc_check_fp_disabled(vcpu)) 
return EMULATE_DONE; emulated = kvmppc_handle_store(run, vcpu, - VCPU_FPR(vcpu, rs), - 8, 1); + VCPU_FPR(vcpu, rs), 8, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; @@ -284,8 +306,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) if (kvmppc_check_fp_disabled(vcpu)) return EMULATE_DONE; emulated = kvmppc_handle_store(run, vcpu, - VCPU_FPR(vcpu, rs), - 4, 1); + VCPU_FPR(vcpu, rs), 4, 1); break; #endif @@ -472,10 +493,22 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) break; #endif - /* TBD: Add support for other 64 bit load variants like ldu etc. */ case OP_LD: rt = get_rt(inst); - emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + switch (inst & 3) { + case 0: /* ld */ + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + break; + case 1: /* ldu */ + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + case 2: /* lwa */ + emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1); + break; + default: + emulated = EMULATE_FAIL; + } break; case OP_LWZU: @@ -498,31 +531,37 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) 4, 1); break; - /* TBD: Add support for other 64 bit store variants like stdu, stdux, stdx etc. */ case OP_STD: rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 8, 1); + switch (inst & 3) { + case 0: /* std */ + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 8, 1); + break; + case 1: /* stdu */ + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + default: + emulated = EMULATE_FAIL; + } break; case OP_STWU: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 4, 1); + kvmppc_get_gpr(vcpu, rs), 4, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_STB: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 1, 1); + kvmppc_get_gpr(vcpu, rs), 1, 1); break; case OP_STBU: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 1, 1); + kvmppc_get_gpr(vcpu, rs), 1, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; @@ -546,14 +585,12 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) case OP_STH: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 1); + kvmppc_get_gpr(vcpu, rs), 2, 1); break; case OP_STHU: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 1); + kvmppc_get_gpr(vcpu, rs), 2, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; From 9b5ab0051306033b0b1e83dd72e1f84a0c8dd4bf Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 23 Mar 2017 11:55:16 +1100 Subject: [PATCH 09/22] KVM: PPC: Add MMIO emulation for remaining floating-point instructions For completeness, this adds emulation of the lfiwax and lfiwzx instructions. With this, all floating-point load and store instructions as of Power ISA V2.07 are emulated. 
Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/ppc-opcode.h | 2 ++ arch/powerpc/kvm/emulate_loadstore.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 738bac164f8b..73f06f4dddc7 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -122,6 +122,8 @@ #define OP_31_XOP_STFDX 727 #define OP_31_XOP_STFDUX 759 #define OP_31_XOP_LHBRX 790 +#define OP_31_XOP_LFIWAX 855 +#define OP_31_XOP_LFIWZX 887 #define OP_31_XOP_STHBRX 918 #define OP_31_XOP_STFIWX 983 diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index f10ba0c2c202..af833531af31 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -270,6 +270,20 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; + case OP_31_XOP_LFIWAX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_loads(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + break; + + case OP_31_XOP_LFIWZX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + break; + case OP_31_XOP_STFSX: if (kvmppc_check_fp_disabled(vcpu)) return EMULATE_DONE; From a1c52e1c7cb9fb8714b89b7bb5773c498e39e38b Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 20 Jan 2017 11:00:08 +0100 Subject: [PATCH 10/22] KVM: PPC: Book3S HV: Use common error handling code in kvmppc_clr_passthru_irq() Add a jump target so that a bit of exception handling can be better reused at the end of this function. Signed-off-by: Markus Elfring Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1ec86d9e2a82..06b7d8ae27e5 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3624,11 +3624,9 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) return -EIO; mutex_lock(&kvm->lock); + if (!kvm->arch.pimap) + goto unlock; - if (kvm->arch.pimap == NULL) { - mutex_unlock(&kvm->lock); - return 0; - } pimap = kvm->arch.pimap; for (i = 0; i < pimap->n_mapped; i++) { @@ -3650,7 +3648,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) * We don't free this structure even when the count goes to * zero. The structure is freed when we destroy the VM. */ - + unlock: mutex_unlock(&kvm->lock); return 0; } From 37655490db757a8ef9141830428485395b86e8fa Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 20 Jan 2017 16:30:18 +0100 Subject: [PATCH 11/22] KVM: PPC: e500: Use kcalloc() in e500_mmu_host_init() * A multiplication for the size determination of a memory allocation indicated that an array data structure should be processed. Thus use the corresponding function "kcalloc". This issue was detected by using the Coccinelle software. * Replace the specification of a data type by a pointer dereference to make the corresponding size determination a bit safer according to the Linux coding style convention. 
Signed-off-by: Markus Elfring Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/e500_mmu_host.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 0fda4230f6c0..77fd043b3ecc 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -797,9 +797,8 @@ int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500) host_tlb_params[0].sets = host_tlb_params[0].entries / host_tlb_params[0].ways; host_tlb_params[1].sets = 1; - - vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) * - host_tlb_params[1].entries, + vcpu_e500->h2g_tlb1_rmap = kcalloc(host_tlb_params[1].entries, + sizeof(*vcpu_e500->h2g_tlb1_rmap), GFP_KERNEL); if (!vcpu_e500->h2g_tlb1_rmap) return -EINVAL; From 9eecec126e006768c21ba2b4588ce5b0e23fda0b Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 24 Mar 2017 17:47:13 +1100 Subject: [PATCH 12/22] KVM: PPC: Book3S PR: Get rid of unused local variable @is_mmio has never been used since introduction in commit 2f4cf5e42d13 ("Add book3s.c") from 2009. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_pr.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index d4dfc0ca2a44..633502f52bbb 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -538,7 +538,6 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, int relocated; int page_found = 0; struct kvmppc_pte pte; - bool is_mmio = false; bool dr = (kvmppc_get_msr(vcpu) & MSR_DR) ? true : false; bool ir = (kvmppc_get_msr(vcpu) & MSR_IR) ? true : false; u64 vsid; @@ -616,8 +615,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Page not found in guest SLB */ kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); - } else if (!is_mmio && - kvmppc_visible_gpa(vcpu, pte.raddr)) { + } else if (kvmppc_visible_gpa(vcpu, pte.raddr)) { if (data && !(vcpu->arch.fault_dsisr & DSISR_NOHPTE)) { /* * There is already a host HPTE there, presumably From bd9166ffe624000140fc6b606b256df01fc0d060 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 24 Mar 2017 17:48:10 +1100 Subject: [PATCH 13/22] KVM: PPC: Book3S PR: Exit KVM on failed mapping At the moment kvmppc_mmu_map_page() returns -1 if mmu_hash_ops.hpte_insert() fails for any reason so the page fault handler resumes the guest and it faults on the same address again. This adds distinction to kvmppc_mmu_map_page() to return -EIO if mmu_hash_ops.hpte_insert() failed for a reason other than full pteg. At the moment only pSeries_lpar_hpte_insert() returns -2 if plpar_pte_enter() failed with a code other than H_PTEG_FULL. Other mmu_hash_ops.hpte_insert() instances can only fail with -1 "full pteg". With this change, if PR KVM fails to update HPT, it can signal the userspace about this instead of returning to guest and having the very same page fault over and over again. 
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_host.c | 5 ++++- arch/powerpc/kvm/book3s_pr.c | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index a587e8f4fd26..4b4e927c4822 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -177,12 +177,15 @@ map_again: ret = mmu_hash_ops.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags, hpsize, hpsize, MMU_SEGSIZE_256M); - if (ret < 0) { + if (ret == -1) { /* If we couldn't map a primary PTE, try a secondary */ hash = ~hash; vflags ^= HPTE_V_SECONDARY; attempt++; goto map_again; + } else if (ret < 0) { + r = -EIO; + goto out_unlock; } else { trace_kvm_book3s_64_mmu_map(rflags, hpteg, vpn, hpaddr, orig_pte); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 633502f52bbb..ce437b98477e 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -625,7 +625,11 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_mmu_unmap_page(vcpu, &pte); } /* The guest's PTE is not mapped yet. Map on the host */ - kvmppc_mmu_map_page(vcpu, &pte, iswrite); + if (kvmppc_mmu_map_page(vcpu, &pte, iswrite) == -EIO) { + /* Exit KVM if mapping failed */ + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } if (data) vcpu->stat.sp_storage++; else if (vcpu->arch.mmu.is_dcbz32(vcpu) && From 96df2267695199b9377bd641c7eb68c393b81b0b Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 24 Mar 2017 17:49:22 +1100 Subject: [PATCH 14/22] KVM: PPC: Book3S PR: Preserve storage control bits PR KVM page fault handler performs eaddr to pte translation for a guest, however kvmppc_mmu_book3s_64_xlate() does not preserve WIMG bits (storage control) in the kvmppc_pte struct. If PR KVM is running as a second level guest under HV KVM, and PR KVM tries inserting HPT entry, this fails in HV KVM if it already has this mapping. This preserves WIMG bits between kvmppc_mmu_book3s_64_xlate() and kvmppc_mmu_map_page(). 
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_64_mmu.c | 1 + arch/powerpc/kvm/book3s_64_mmu_host.c | 2 ++ arch/powerpc/kvm/book3s_pr.c | 2 +- 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 2acc691ed9d0..0f3ac09cbfe0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -342,6 +342,7 @@ struct kvmppc_pte { bool may_read : 1; bool may_write : 1; bool may_execute : 1; + unsigned long wimg; u8 page_size; /* MMU_PAGE_xxx */ }; diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 70153578131a..29ebe2fd5867 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -319,6 +319,7 @@ do_second: gpte->may_execute = true; gpte->may_read = false; gpte->may_write = false; + gpte->wimg = r & HPTE_R_WIMG; switch (pp) { case 0: diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 4b4e927c4822..145a61892c48 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -145,6 +145,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, else kvmppc_mmu_flush_icache(pfn); + rflags = (rflags & ~HPTE_R_WIMG) | orig_pte->wimg; + /* * Use 64K pages if possible; otherwise, on 64K page kernels, * we need to transfer 4 more bits from guest real to host real addr. diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index ce437b98477e..f026b062c0ed 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -537,7 +537,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, int r = RESUME_GUEST; int relocated; int page_found = 0; - struct kvmppc_pte pte; + struct kvmppc_pte pte = { 0 }; bool dr = (kvmppc_get_msr(vcpu) & MSR_DR) ? true : false; bool ir = (kvmppc_get_msr(vcpu) & MSR_IR) ? true : false; u64 vsid; From 3762d45aa71d546cc0afa29da2f1cd360c099484 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 13 Apr 2017 17:04:20 +1000 Subject: [PATCH 15/22] KVM: PPC: Align the table size to system page size At the moment userspace can request a table smaller than a page size and this value will be stored as kvmppc_spapr_tce_table::size. However the actual allocated size will still be aligned to the system page size as alloc_page() is used there. This aligns the table size up to the system page size. It should not change the existing behaviour, but when the in-kernel TCE acceleration patchset reaches the upstream kernel, this will allow small TCE tables to be accelerated as well: the PCI IODA iommu_table allocator already aligns the size and, without this patch, an IOMMU group won't attach to a LIOBN due to the mismatching table size.
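For reference (not part of the patch): the PAGE_SIZE >> 3 value in the hunk below follows from the 8-byte TCE entry size, so a system page holds PAGE_SIZE / 8 entries and rounding the requested entry count up to that boundary makes the table span a whole number of pages. A minimal stand-alone sketch of the arithmetic, with align_tce_entries() as an illustrative helper rather than a kernel function:

/* Illustrative only: mirrors size = _ALIGN_UP(args->size, PAGE_SIZE >> 3) */
#include <stdint.h>

static unsigned long align_tce_entries(unsigned long entries,
				       unsigned long page_size)
{
	/* each TCE is a u64, so page_size / 8 entries fill one page */
	unsigned long tces_per_page = page_size / sizeof(uint64_t);

	/* assumes page_size is a power of two, as on Linux */
	return (entries + tces_per_page - 1) & ~(tces_per_page - 1);
}

For example, align_tce_entries(512, 65536) yields 8192 on a 64K-page host, i.e. one full page worth of TCEs.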
Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 3e26cd4979f9..ab20a47a8821 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -164,7 +164,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return -EBUSY; } - size = args->size; + size = _ALIGN_UP(args->size, PAGE_SIZE >> 3); npages = kvmppc_tce_pages(size); ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); if (ret) { From 4898d3f49b5b156c33f0ae0f49ede417ab86195e Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:51 +1100 Subject: [PATCH 16/22] KVM: PPC: Reserve KVM_CAP_SPAPR_TCE_VFIO capability number This adds a capability number for in-kernel support for VFIO on the SPAPR platform. The capability tells user space whether the in-kernel H_PUT_TCE handlers can handle VFIO-targeted requests or not. If not, user space must not attempt to allocate a TCE table in the host kernel via the KVM_CREATE_SPAPR_TCE KVM ioctl, because in that case TCE requests would not be passed to user space, which is the desired action in that situation. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6180ea50e9ef..7b488eae61b8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -892,6 +892,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_MIPS_64BIT 139 #define KVM_CAP_S390_GS 140 #define KVM_CAP_S390_AIS 141 +#define KVM_CAP_SPAPR_TCE_VFIO 142 #ifdef KVM_CAP_IRQ_ROUTING From e91aa8e6ecd5e8193df6dec109f9d5deda5c7317 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:52 +1100 Subject: [PATCH 17/22] KVM: PPC: Enable IOMMU_API for KVM_BOOK3S_64 permanently It does not make much sense to have KVM on book3s-64 without the IOMMU bits for PCI pass-through support, as they cost little and allow VFIO to function on book3s KVM. Having IOMMU_API always enabled makes it unnecessary to have a lot of "#ifdef IOMMU_API" in arch/powerpc/kvm/book3s_64_vio*. With those ifdefs, only user space emulated devices could be accelerated (but not VFIO), which does not seem very useful. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 029be26b5a17..65a471de96de 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -67,6 +67,7 @@ config KVM_BOOK3S_64 select KVM_BOOK3S_64_HANDLER select KVM select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE + select SPAPR_TCE_IOMMU if IOMMU_SUPPORT ---help--- Support running unmodified book3s_64 and book3s_32 guest kernels in virtual machines on book3s_64 host processors. From 503bfcbe18576a79be0bc5173b23b530845e704a Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:53 +1100 Subject: [PATCH 18/22] KVM: PPC: Pass kvm* to kvmppc_find_table() The guest view TCE tables are per KVM anyway (not per VCPU), so pass kvm* there. This will be used in the following patches where we will be attaching VFIO containers to LIOBNs via ioctl() to KVM (rather than to VCPU).
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/kvm/book3s_64_vio.c | 7 ++++--- arch/powerpc/kvm/book3s_64_vio_hv.c | 13 +++++++------ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index bbecec4e753a..2e66800a870b 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -177,7 +177,7 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_table( - struct kvm_vcpu *vcpu, unsigned long liobn); + struct kvm *kvm, unsigned long liobn); extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, unsigned long ioba, unsigned long npages); extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index ab20a47a8821..d507d94e020c 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -214,12 +214,13 @@ fail: long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { - struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + struct kvmppc_spapr_tce_table *stt; long ret; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -247,7 +248,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, u64 __user *tces; u64 tce; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -301,7 +302,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, struct kvmppc_spapr_tce_table *stt; long i, ret; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index e4c4ea973e57..918af76ab2b6 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -48,10 +48,9 @@ * WARNING: This will be called in real or virtual mode on HV KVM and virtual * mode on PR KVM */ -struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, +struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, unsigned long liobn) { - struct kvm *kvm = vcpu->kvm; struct kvmppc_spapr_tce_table *stt; list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list) @@ -182,12 +181,13 @@ EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { - struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + struct kvmppc_spapr_tce_table *stt; long ret; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -240,7 +240,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long tces, entry, ua = 0; unsigned long *rmap = NULL; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -301,7 +301,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, struct kvmppc_spapr_tce_table *stt; long i, ret; - stt = kvmppc_find_table(vcpu, 
liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); + if (!stt) + return H_TOO_HARD; @@ -322,12 +322,13 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba) { - struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + struct kvmppc_spapr_tce_table *stt; long ret; unsigned long idx; struct page *page; u64 *tbl; + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; From da6f59e19233efdda58f196afbae8e05f6030d7f Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:54 +1100 Subject: [PATCH 19/22] KVM: PPC: Use preregistered memory API to access TCE list VFIO on sPAPR already implements guest memory pre-registration when the entire guest RAM gets pinned. This can be used to translate the physical address of a guest page containing the TCE list from H_PUT_TCE_INDIRECT. This makes use of the pre-registered memory API to access TCE list pages, which avoids unnecessary locking on the KVM memory reverse map: since all of guest memory is pinned, we have a flat array mapping GPA to HPA, and it is simpler and quicker to index into that array (even with looking up the kernel page tables in vmalloc_to_phys) than to find the memslot, lock the rmap entry, look up the user page tables, and unlock the rmap entry. Note that the rmap pointer is initialized to NULL where declared (not in this patch). If a requested chunk of memory has not been preregistered, this falls back to the non-preregistered case and locks the rmap. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio_hv.c | 58 +++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 918af76ab2b6..0f145fc7a3a5 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -239,6 +239,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, long i, ret = H_SUCCESS; unsigned long tces, entry, ua = 0; unsigned long *rmap = NULL; + bool prereg = false; stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) @@ -259,23 +260,47 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) return ret; - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) - return H_TOO_HARD; + if (mm_iommu_preregistered(vcpu->kvm->mm)) { + /* + * We get here if guest memory was pre-registered which + * is normally VFIO case and gpa->hpa translation does not + * depend on hpt. + */ + struct mm_iommu_table_group_mem_t *mem; - rmap = (void *) vmalloc_to_phys(rmap); + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) + return H_TOO_HARD; - /* - * Synchronize with the MMU notifier callbacks in - * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). - * While we have the rmap lock, code running on other CPUs - * cannot finish unmapping the host real page that backs - * this guest real page, so we are OK to access the host - * real page. - */ - lock_rmap(rmap); - if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { - ret = H_TOO_HARD; - goto unlock_exit; + mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); + if (mem) + prereg = mm_iommu_ua_to_hpa_rm(mem, ua, &tces) == 0; + } + + if (!prereg) { + /* + * This is usually a case of a guest with emulated devices only + * when TCE list is not in preregistered memory.
+ * We do not require memory to be preregistered in this case + * so lock rmap and do __find_linux_pte_or_hugepte(). + */ + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) + return H_TOO_HARD; + + rmap = (void *) vmalloc_to_phys(rmap); + + /* + * Synchronize with the MMU notifier callbacks in + * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). + * While we have the rmap lock, code running on other CPUs + * cannot finish unmapping the host real page that backs + * this guest real page, so we are OK to access the host + * real page. + */ + lock_rmap(rmap); + if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { + ret = H_TOO_HARD; + goto unlock_exit; + } } for (i = 0; i < npages; ++i) { @@ -289,7 +314,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, } unlock_exit: - unlock_rmap(rmap); + if (rmap) + unlock_rmap(rmap); return ret; } From b1af23d836f811137d504d14d4cbdd01929dec34 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:55 +1100 Subject: [PATCH 20/22] KVM: PPC: iommu: Unify TCE checking This reworks helpers for checking TCE update parameters in way they can be used in KVM. This should cause no behavioral change. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Acked-by: Michael Ellerman Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/iommu.h | 20 +++++++++++---- arch/powerpc/include/asm/kvm_ppc.h | 6 +++-- arch/powerpc/kernel/iommu.c | 39 +++++++++++------------------ arch/powerpc/kvm/book3s_64_vio_hv.c | 31 ++++++----------------- 4 files changed, 40 insertions(+), 56 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index d96142572e6d..8a8ce220d7d0 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -296,11 +296,21 @@ static inline void iommu_restore(void) #endif /* The API to support IOMMU operations for VFIO */ -extern int iommu_tce_clear_param_check(struct iommu_table *tbl, - unsigned long ioba, unsigned long tce_value, - unsigned long npages); -extern int iommu_tce_put_param_check(struct iommu_table *tbl, - unsigned long ioba, unsigned long tce); +extern int iommu_tce_check_ioba(unsigned long page_shift, + unsigned long offset, unsigned long size, + unsigned long ioba, unsigned long npages); +extern int iommu_tce_check_gpa(unsigned long page_shift, + unsigned long gpa); + +#define iommu_tce_clear_param_check(tbl, ioba, tce_value, npages) \ + (iommu_tce_check_ioba((tbl)->it_page_shift, \ + (tbl)->it_offset, (tbl)->it_size, \ + (ioba), (npages)) || (tce_value)) +#define iommu_tce_put_param_check(tbl, ioba, gpa) \ + (iommu_tce_check_ioba((tbl)->it_page_shift, \ + (tbl)->it_offset, (tbl)->it_size, \ + (ioba), 1) || \ + iommu_tce_check_gpa((tbl)->it_page_shift, (gpa))) extern void iommu_flush_tce(struct iommu_table *tbl); extern int iommu_take_ownership(struct iommu_table *tbl); diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2e66800a870b..4d079a29eae2 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -178,8 +178,10 @@ extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_table( struct kvm *kvm, unsigned long liobn); -extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, - unsigned long ioba, unsigned long npages); +#define kvmppc_ioba_validate(stt, ioba, npages) \ + (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \ + (stt)->size, (ioba), (npages)) 
? \ + H_PARAMETER : H_SUCCESS) extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, unsigned long tce); extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 5a3231fedf08..f2b724cd9e64 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -963,47 +963,36 @@ void iommu_flush_tce(struct iommu_table *tbl) } EXPORT_SYMBOL_GPL(iommu_flush_tce); -int iommu_tce_clear_param_check(struct iommu_table *tbl, - unsigned long ioba, unsigned long tce_value, - unsigned long npages) +int iommu_tce_check_ioba(unsigned long page_shift, + unsigned long offset, unsigned long size, + unsigned long ioba, unsigned long npages) { - /* tbl->it_ops->clear() does not support any value but 0 */ - if (tce_value) + unsigned long mask = (1UL << page_shift) - 1; + + if (ioba & mask) return -EINVAL; - if (ioba & ~IOMMU_PAGE_MASK(tbl)) + ioba >>= page_shift; + if (ioba < offset) return -EINVAL; - ioba >>= tbl->it_page_shift; - if (ioba < tbl->it_offset) - return -EINVAL; - - if ((ioba + npages) > (tbl->it_offset + tbl->it_size)) + if ((ioba + 1) > (offset + size)) return -EINVAL; return 0; } -EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check); +EXPORT_SYMBOL_GPL(iommu_tce_check_ioba); -int iommu_tce_put_param_check(struct iommu_table *tbl, - unsigned long ioba, unsigned long tce) +int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) { - if (tce & ~IOMMU_PAGE_MASK(tbl)) - return -EINVAL; + unsigned long mask = (1UL << page_shift) - 1; - if (ioba & ~IOMMU_PAGE_MASK(tbl)) - return -EINVAL; - - ioba >>= tbl->it_page_shift; - if (ioba < tbl->it_offset) - return -EINVAL; - - if ((ioba + 1) > (tbl->it_offset + tbl->it_size)) + if (gpa & mask) return -EINVAL; return 0; } -EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); +EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, enum dma_data_direction *direction) diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 0f145fc7a3a5..440d3ab5dc32 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -61,27 +61,6 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvmppc_find_table); -/* - * Validates IO address. - * - * WARNING: This will be called in real-mode on HV KVM and virtual - * mode on PR KVM - */ -long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, - unsigned long ioba, unsigned long npages) -{ - unsigned long mask = (1ULL << stt->page_shift) - 1; - unsigned long idx = ioba >> stt->page_shift; - - if ((ioba & mask) || (idx < stt->offset) || - (idx - stt->offset + npages > stt->size) || - (idx + npages < idx)) - return H_PARAMETER; - - return H_SUCCESS; -} -EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); - /* * Validates TCE address. * At the moment flags and page mask are validated. 
@@ -95,10 +74,14 @@ EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); */ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) { - unsigned long page_mask = ~((1ULL << stt->page_shift) - 1); - unsigned long mask = ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ); + unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE); + enum dma_data_direction dir = iommu_tce_direction(tce); - if (tce & mask) + /* Allow userspace to poison TCE table */ + if (dir == DMA_NONE) + return H_SUCCESS; + + if (iommu_tce_check_gpa(stt->page_shift, gpa)) return H_PARAMETER; return H_SUCCESS; From 121f80ba68f1a5779a36d7b3247206e60e0a7418 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:56 +1100 Subject: [PATCH 21/22] KVM: PPC: VFIO: Add in-kernel acceleration for VFIO This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT and H_STUFF_TCE requests targeted at an IOMMU TCE table used for VFIO without passing them to user space, which saves time on switching to user space and back. This adds H_PUT_TCE/H_PUT_TCE_INDIRECT/H_STUFF_TCE handlers to KVM. KVM first tries to handle a TCE request in real mode; if that fails, it passes the request to virtual mode to complete the operation. If the virtual mode handler fails as well, the request is passed to user space; this is not expected to happen though. To avoid dealing with page use counters (which is tricky in real mode), this only accelerates SPAPR TCE IOMMU v2 clients, which are required to pre-register the userspace memory. The very first TCE request will be handled in the VFIO SPAPR TCE driver anyway as the userspace view of the TCE table (iommu_table::it_userspace) is not allocated till the very first mapping happens and we cannot call vmalloc in real mode. If we fail to update a hardware IOMMU table for an unexpected reason, we just clear it and move on as there is nothing really we can do about it - for example, if we hot plug a VFIO device to a guest, existing TCE tables will be mirrored automatically to the hardware and there is no interface to report to the guest about possible failures. This adds a new attribute, KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, to the VFIO KVM device. It takes a VFIO group fd and an SPAPR TCE table fd and associates a physical IOMMU table with the SPAPR TCE table (which is a guest view of the hardware IOMMU table). The iommu_table object is cached and referenced so we do not have to look it up in real mode. This does not implement the UNSET counterpart as there is no use for it - once the acceleration is enabled, the existing userspace won't disable it unless a VFIO container is destroyed; this adds the necessary cleanup to the KVM_DEV_VFIO_GROUP_DEL handler. This advertises the new KVM_CAP_SPAPR_TCE_VFIO capability to user space. This adds a real mode version of WARN_ON_ONCE() as the generic version causes problems with rcu_sched. Since we test what vmalloc_to_phys() returns in the code, this also adds a check for the already existing vmalloc_to_phys() call in kvmppc_rm_h_put_tce_indirect(). This finally makes use of vfio_external_user_iommu_id() which was introduced quite some time ago and was considered for removal. Tests show that this patch increases transmission speed from 220MB/s to 750..1020MB/s on a 10Gb network (Chelsio CXGB3 10Gb ethernet card).
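For reference (not part of the patch): a minimal user-space sketch of how a VMM would be expected to enable the acceleration, assuming it has already created the KVM VFIO device with KVM_CREATE_DEVICE (KVM_DEV_TYPE_VFIO), added the VFIO group with KVM_DEV_VFIO_GROUP_ADD, and obtained tce_fd from KVM_CREATE_SPAPR_TCE_64; a VMM would typically also check KVM_CHECK_EXTENSION for KVM_CAP_SPAPR_TCE_VFIO on the VM fd first. Error handling is omitted.

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* vfio_dev_fd: fd returned by KVM_CREATE_DEVICE for KVM_DEV_TYPE_VFIO */
static int kvm_vfio_set_spapr_tce(int vfio_dev_fd, int group_fd, int tce_fd)
{
	struct kvm_vfio_spapr_tce param = {
		.groupfd = group_fd,	/* VFIO group file descriptor */
		.tablefd = tce_fd,	/* fd from KVM_CREATE_SPAPR_TCE{,_64} */
	};
	struct kvm_device_attr attr = {
		.group = KVM_DEV_VFIO_GROUP,
		.attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
		.addr = (__u64)(unsigned long)&param,
	};

	/* associate the group's IOMMU table with the guest view TCE table */
	return ioctl(vfio_dev_fd, KVM_SET_DEVICE_ATTR, &attr);
}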
Signed-off-by: Alexey Kardashevskiy Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/devices/vfio.txt | 18 +- arch/powerpc/include/asm/kvm_host.h | 8 + arch/powerpc/include/asm/kvm_ppc.h | 4 + arch/powerpc/kvm/book3s_64_vio.c | 306 ++++++++++++++++++++- arch/powerpc/kvm/book3s_64_vio_hv.c | 201 +++++++++++++- arch/powerpc/kvm/powerpc.c | 2 + include/uapi/linux/kvm.h | 6 + virt/kvm/vfio.c | 105 +++++++ 8 files changed, 645 insertions(+), 5 deletions(-) diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt index ef51740c67ca..528c77c8022c 100644 --- a/Documentation/virtual/kvm/devices/vfio.txt +++ b/Documentation/virtual/kvm/devices/vfio.txt @@ -16,7 +16,21 @@ Groups: KVM_DEV_VFIO_GROUP attributes: KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. + KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table + allocated by sPAPR KVM. + kvm_device_attr.addr points to a struct: -For each, kvm_device_attr.addr points to an int32_t file descriptor -for the VFIO group. + struct kvm_vfio_spapr_tce { + __s32 groupfd; + __s32 tablefd; + }; + + where + @groupfd is a file descriptor for a VFIO group; + @tablefd is a file descriptor for a TCE table allocated via + KVM_CREATE_SPAPR_TCE. diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0f3ac09cbfe0..77c60826d145 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -188,6 +188,13 @@ struct kvmppc_pginfo { atomic_t refcnt; }; +struct kvmppc_spapr_tce_iommu_table { + struct rcu_head rcu; + struct list_head next; + struct iommu_table *tbl; + struct kref kref; +}; + struct kvmppc_spapr_tce_table { struct list_head list; struct kvm *kvm; @@ -196,6 +203,7 @@ struct kvmppc_spapr_tce_table { u32 page_shift; u64 offset; /* in pages */ u64 size; /* window size in pages */ + struct list_head iommu_tables; struct page *pages[0]; }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 4d079a29eae2..5885d327c025 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -173,6 +173,10 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm, extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, unsigned long porder); extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); +extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, + struct iommu_group *grp); +extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, + struct iommu_group *grp); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args); diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index d507d94e020c..a160c14304eb 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include @@ -40,6 +42,7 @@ #include #include #include +#include static unsigned long kvmppc_tce_pages(unsigned long iommu_pages) { @@ -91,6 +94,137 @@ static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc) return ret; } +static void kvm_spapr_tce_iommu_table_free(struct rcu_head 
*head) +{ + struct kvmppc_spapr_tce_iommu_table *stit = container_of(head, + struct kvmppc_spapr_tce_iommu_table, rcu); + + iommu_tce_table_put(stit->tbl); + + kfree(stit); +} + +static void kvm_spapr_tce_liobn_put(struct kref *kref) +{ + struct kvmppc_spapr_tce_iommu_table *stit = container_of(kref, + struct kvmppc_spapr_tce_iommu_table, kref); + + list_del_rcu(&stit->next); + + call_rcu(&stit->rcu, kvm_spapr_tce_iommu_table_free); +} + +extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, + struct iommu_group *grp) +{ + int i; + struct kvmppc_spapr_tce_table *stt; + struct kvmppc_spapr_tce_iommu_table *stit, *tmp; + struct iommu_table_group *table_group = NULL; + + list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) { + + table_group = iommu_group_get_iommudata(grp); + if (WARN_ON(!table_group)) + continue; + + list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) { + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + if (table_group->tables[i] != stit->tbl) + continue; + + kref_put(&stit->kref, kvm_spapr_tce_liobn_put); + return; + } + } + } +} + +extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, + struct iommu_group *grp) +{ + struct kvmppc_spapr_tce_table *stt = NULL; + bool found = false; + struct iommu_table *tbl = NULL; + struct iommu_table_group *table_group; + long i; + struct kvmppc_spapr_tce_iommu_table *stit; + struct fd f; + + f = fdget(tablefd); + if (!f.file) + return -EBADF; + + list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) { + if (stt == f.file->private_data) { + found = true; + break; + } + } + + fdput(f); + + if (!found) + return -EINVAL; + + table_group = iommu_group_get_iommudata(grp); + if (WARN_ON(!table_group)) + return -EFAULT; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbltmp = table_group->tables[i]; + + if (!tbltmp) + continue; + /* + * Make sure hardware table parameters are exactly the same; + * this is used in the TCE handlers where boundary checks + * use only the first attached table. + */ + if ((tbltmp->it_page_shift == stt->page_shift) && + (tbltmp->it_offset == stt->offset) && + (tbltmp->it_size == stt->size)) { + /* + * Reference the table to avoid races with + * add/remove DMA windows. + */ + tbl = iommu_tce_table_get(tbltmp); + break; + } + } + if (!tbl) + return -EINVAL; + + list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { + if (tbl != stit->tbl) + continue; + + if (!kref_get_unless_zero(&stit->kref)) { + /* stit is being destroyed */ + iommu_tce_table_put(tbl); + return -ENOTTY; + } + /* + * The table is already known to this KVM, we just increased + * its KVM reference counter and can return. 
+ */ + return 0; + } + + stit = kzalloc(sizeof(*stit), GFP_KERNEL); + if (!stit) { + iommu_tce_table_put(tbl); + return -ENOMEM; + } + + stit->tbl = tbl; + kref_init(&stit->kref); + + list_add_rcu(&stit->next, &stt->iommu_tables); + + return 0; +} + static void release_spapr_tce_table(struct rcu_head *head) { struct kvmppc_spapr_tce_table *stt = container_of(head, @@ -130,9 +264,18 @@ static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma) static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) { struct kvmppc_spapr_tce_table *stt = filp->private_data; + struct kvmppc_spapr_tce_iommu_table *stit, *tmp; list_del_rcu(&stt->list); + list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) { + WARN_ON(!kref_read(&stit->kref)); + while (1) { + if (kref_put(&stit->kref, kvm_spapr_tce_liobn_put)) + break; + } + } + kvm_put_kvm(stt->kvm); kvmppc_account_memlimit( @@ -183,6 +326,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, stt->offset = args->offset; stt->size = size; stt->kvm = kvm; + INIT_LIST_HEAD_RCU(&stt->iommu_tables); for (i = 0; i < npages; i++) { stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); @@ -211,11 +355,101 @@ fail: return ret; } +static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry) +{ + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; + + iommu_tce_xchg(tbl, entry, &hpa, &dir); +} + +static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + struct mm_iommu_table_group_mem_t *mem = NULL; + const unsigned long pgsize = 1ULL << tbl->it_page_shift; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup(kvm->mm, *pua, pgsize); + if (!mem) + return H_TOO_HARD; + + mm_iommu_mapped_dec(mem); + + *pua = 0; + + return H_SUCCESS; +} + +static long kvmppc_tce_iommu_unmap(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + enum dma_data_direction dir = DMA_NONE; + unsigned long hpa = 0; + long ret; + + if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir))) + return H_HARDWARE; + + if (dir == DMA_NONE) + return H_SUCCESS; + + ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); + if (ret != H_SUCCESS) + iommu_tce_xchg(tbl, entry, &hpa, &dir); + + return ret; +} + +long kvmppc_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl, + unsigned long entry, unsigned long ua, + enum dma_data_direction dir) +{ + long ret; + unsigned long hpa, *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + struct mm_iommu_table_group_mem_t *mem; + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup(kvm->mm, ua, 1ULL << tbl->it_page_shift); + if (!mem) + /* This only handles v2 IOMMU type, v1 is handled via ioctl() */ + return H_TOO_HARD; + + if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, &hpa))) + return H_HARDWARE; + + if (mm_iommu_mapped_inc(mem)) + return H_CLOSED; + + ret = iommu_tce_xchg(tbl, entry, &hpa, &dir); + if (WARN_ON_ONCE(ret)) { + mm_iommu_mapped_dec(mem); + return H_HARDWARE; + } + + if (dir != DMA_NONE) + kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); + + *pua = ua; + + return 0; +} + long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { struct kvmppc_spapr_tce_table *stt; - long ret; + long ret, idx; + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long entry, ua = 0; + enum dma_data_direction dir; /* 
udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ @@ -232,7 +466,35 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); + dir = iommu_tce_direction(tce); + if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) + return H_PARAMETER; + + entry = ioba >> stt->page_shift; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + if (dir == DMA_NONE) { + ret = kvmppc_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry); + } else { + idx = srcu_read_lock(&vcpu->kvm->srcu); + ret = kvmppc_tce_iommu_map(vcpu->kvm, stit->tbl, + entry, ua, dir); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + + kvmppc_tce_put(stt, entry, tce); return H_SUCCESS; } @@ -247,6 +509,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long entry, ua = 0; u64 __user *tces; u64 tce; + struct kvmppc_spapr_tce_iommu_table *stit; stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) @@ -285,6 +548,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) goto unlock_exit; + if (kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), + &ua, NULL)) + return H_PARAMETER; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + ret = kvmppc_tce_iommu_map(vcpu->kvm, + stit->tbl, entry + i, ua, + iommu_tce_direction(tce)); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + goto unlock_exit; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + kvmppc_tce_put(stt, entry + i, tce); } @@ -301,6 +584,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, { struct kvmppc_spapr_tce_table *stt; long i, ret; + struct kvmppc_spapr_tce_iommu_table *stit; stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) @@ -314,6 +598,24 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) return H_PARAMETER; + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + unsigned long entry = ioba >> stit->tbl->it_page_shift; + + for (i = 0; i < npages; ++i) { + ret = kvmppc_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry + i); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + } + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 440d3ab5dc32..eda0a8f6fae8 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -40,6 +40,31 @@ #include #include +#ifdef CONFIG_BUG + +#define WARN_ON_ONCE_RM(condition) ({ \ + static bool __section(.data.unlikely) __warned; \ + int __ret_warn_once = !!(condition); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + __warned = true; \ + pr_err("WARN_ON_ONCE_RM: (%s) at %s:%u\n", \ + __stringify(condition), \ + __func__, __LINE__); \ + dump_stack(); \ + } \ + unlikely(__ret_warn_once); \ +}) + +#else + +#define WARN_ON_ONCE_RM(condition) ({ \ + int __ret_warn_on = !!(condition); \ + unlikely(__ret_warn_on); \ +}) + +#endif + #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) /* @@ -161,11 +186,117 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, 
EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +static void kvmppc_rm_clear_tce(struct iommu_table *tbl, unsigned long entry) +{ + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; + + iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); +} + +static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + struct mm_iommu_table_group_mem_t *mem = NULL; + const unsigned long pgsize = 1ULL << tbl->it_page_shift; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + pua = (void *) vmalloc_to_phys(pua); + if (WARN_ON_ONCE_RM(!pua)) + return H_HARDWARE; + + mem = mm_iommu_lookup_rm(kvm->mm, *pua, pgsize); + if (!mem) + return H_TOO_HARD; + + mm_iommu_mapped_dec(mem); + + *pua = 0; + + return H_SUCCESS; +} + +static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + enum dma_data_direction dir = DMA_NONE; + unsigned long hpa = 0; + long ret; + + if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir)) + /* + * real mode xchg can fail if struct page crosses + * a page boundary + */ + return H_TOO_HARD; + + if (dir == DMA_NONE) + return H_SUCCESS; + + ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); + if (ret) + iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + + return ret; +} + +static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl, + unsigned long entry, unsigned long ua, + enum dma_data_direction dir) +{ + long ret; + unsigned long hpa = 0; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + struct mm_iommu_table_group_mem_t *mem; + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup_rm(kvm->mm, ua, 1ULL << tbl->it_page_shift); + if (!mem) + return H_TOO_HARD; + + if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa))) + return H_HARDWARE; + + pua = (void *) vmalloc_to_phys(pua); + if (WARN_ON_ONCE_RM(!pua)) + return H_HARDWARE; + + if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) + return H_CLOSED; + + ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + if (ret) { + mm_iommu_mapped_dec(mem); + /* + * real mode xchg can fail if struct page crosses + * a page boundary + */ + return H_TOO_HARD; + } + + if (dir != DMA_NONE) + kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); + + *pua = ua; + + return 0; +} + long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { struct kvmppc_spapr_tce_table *stt; long ret; + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long entry, ua = 0; + enum dma_data_direction dir; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ @@ -182,7 +313,32 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); + dir = iommu_tce_direction(tce); + if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) + return H_PARAMETER; + + entry = ioba >> stt->page_shift; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + if (dir == DMA_NONE) + ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry); + else + ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, + stit->tbl, entry, ua, dir); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE_RM(1); + 
kvmppc_rm_clear_tce(stit->tbl, entry); + } + + kvmppc_tce_put(stt, entry, tce); return H_SUCCESS; } @@ -223,6 +379,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long tces, entry, ua = 0; unsigned long *rmap = NULL; bool prereg = false; + struct kvmppc_spapr_tce_iommu_table *stit; stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) @@ -270,6 +427,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, return H_TOO_HARD; rmap = (void *) vmalloc_to_phys(rmap); + if (WARN_ON_ONCE_RM(!rmap)) + return H_HARDWARE; /* * Synchronize with the MMU notifier callbacks in @@ -293,6 +452,27 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) goto unlock_exit; + ua = 0; + if (kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), + &ua, NULL)) + return H_PARAMETER; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, + stit->tbl, entry + i, ua, + iommu_tce_direction(tce)); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + goto unlock_exit; + + WARN_ON_ONCE_RM(1); + kvmppc_rm_clear_tce(stit->tbl, entry); + } + kvmppc_tce_put(stt, entry + i, tce); } @@ -309,6 +489,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, { struct kvmppc_spapr_tce_table *stt; long i, ret; + struct kvmppc_spapr_tce_iommu_table *stit; stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) @@ -322,6 +503,24 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) return H_PARAMETER; + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + unsigned long entry = ioba >> stit->tbl->it_page_shift; + + for (i = 0; i < npages; ++i) { + ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry + i); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE_RM(1); + kvmppc_rm_clear_tce(stit->tbl, entry); + } + } + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6c7244879bfe..cf725c580fc5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -534,6 +534,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: case KVM_CAP_SPAPR_TCE_64: + /* fallthrough */ + case KVM_CAP_SPAPR_TCE_VFIO: case KVM_CAP_PPC_RTAS: case KVM_CAP_PPC_FIXUP_HCALL: case KVM_CAP_PPC_ENABLE_HCALL: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7b488eae61b8..3c168b6fd74b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1097,6 +1097,7 @@ struct kvm_device_attr { #define KVM_DEV_VFIO_GROUP 1 #define KVM_DEV_VFIO_GROUP_ADD 1 #define KVM_DEV_VFIO_GROUP_DEL 2 +#define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE 3 enum kvm_device_type { KVM_DEV_TYPE_FSL_MPIC_20 = 1, @@ -1118,6 +1119,11 @@ enum kvm_device_type { KVM_DEV_TYPE_MAX, }; +struct kvm_vfio_spapr_tce { + __s32 groupfd; + __s32 tablefd; +}; + /* * ioctls for VM fds */ diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index d32f239eb471..37d9118fd84b 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -20,6 +20,10 @@ #include #include "vfio.h" +#ifdef CONFIG_SPAPR_TCE_IOMMU +#include +#endif + struct kvm_vfio_group { struct list_head node; struct vfio_group *vfio_group; @@ -89,6 +93,47 @@ static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group) return ret > 0; } +#ifdef CONFIG_SPAPR_TCE_IOMMU +static int 
kvm_vfio_external_user_iommu_id(struct vfio_group *vfio_group) +{ + int (*fn)(struct vfio_group *); + int ret = -EINVAL; + + fn = symbol_get(vfio_external_user_iommu_id); + if (!fn) + return ret; + + ret = fn(vfio_group); + + symbol_put(vfio_external_user_iommu_id); + + return ret; +} + +static struct iommu_group *kvm_vfio_group_get_iommu_group( + struct vfio_group *group) +{ + int group_id = kvm_vfio_external_user_iommu_id(group); + + if (group_id < 0) + return NULL; + + return iommu_group_get_by_id(group_id); +} + +static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, + struct vfio_group *vfio_group) +{ + struct iommu_group *grp = kvm_vfio_group_get_iommu_group(vfio_group); + + if (WARN_ON_ONCE(!grp)) + return; + + kvm_spapr_tce_release_iommu_group(kvm, grp); + iommu_group_put(grp); +} +#endif + /* * Groups can use the same or different IOMMU domains. If the same then * adding a new group may change the coherency of groups we've previously @@ -211,6 +256,9 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) mutex_unlock(&kv->lock); +#ifdef CONFIG_SPAPR_TCE_IOMMU + kvm_spapr_tce_release_vfio_group(dev->kvm, vfio_group); +#endif kvm_vfio_group_set_kvm(vfio_group, NULL); kvm_vfio_group_put_external_user(vfio_group); @@ -218,6 +266,57 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) kvm_vfio_update_coherency(dev); return ret; + +#ifdef CONFIG_SPAPR_TCE_IOMMU + case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: { + struct kvm_vfio_spapr_tce param; + struct kvm_vfio *kv = dev->private; + struct vfio_group *vfio_group; + struct kvm_vfio_group *kvg; + struct fd f; + struct iommu_group *grp; + + if (copy_from_user(¶m, (void __user *)arg, + sizeof(struct kvm_vfio_spapr_tce))) + return -EFAULT; + + f = fdget(param.groupfd); + if (!f.file) + return -EBADF; + + vfio_group = kvm_vfio_group_get_external_user(f.file); + fdput(f); + + if (IS_ERR(vfio_group)) + return PTR_ERR(vfio_group); + + grp = kvm_vfio_group_get_iommu_group(vfio_group); + if (WARN_ON_ONCE(!grp)) { + kvm_vfio_group_put_external_user(vfio_group); + return -EIO; + } + + ret = -ENOENT; + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + if (kvg->vfio_group != vfio_group) + continue; + + ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, + param.tablefd, grp); + break; + } + + mutex_unlock(&kv->lock); + + iommu_group_put(grp); + kvm_vfio_group_put_external_user(vfio_group); + + return ret; + } +#endif /* CONFIG_SPAPR_TCE_IOMMU */ } return -ENXIO; @@ -242,6 +341,9 @@ static int kvm_vfio_has_attr(struct kvm_device *dev, switch (attr->attr) { case KVM_DEV_VFIO_GROUP_ADD: case KVM_DEV_VFIO_GROUP_DEL: +#ifdef CONFIG_SPAPR_TCE_IOMMU + case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: +#endif return 0; } @@ -257,6 +359,9 @@ static void kvm_vfio_destroy(struct kvm_device *dev) struct kvm_vfio_group *kvg, *tmp; list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) { +#ifdef CONFIG_SPAPR_TCE_IOMMU + kvm_spapr_tce_release_vfio_group(dev->kvm, kvg->vfio_group); +#endif kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); kvm_vfio_group_put_external_user(kvg->vfio_group); list_del(&kvg->node); From feafd13c96d6e2998d3da81bb55db379fcf02499 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 5 Apr 2017 15:58:51 +0200 Subject: [PATCH 22/22] KVM: PPC: Book3S PR: Do not fail emulation with mtspr/mfspr for unknown SPRs According to the PowerISA 2.07, mtspr and mfspr should not always generate an illegal instruction exception when being used with an undefined SPR, but rather treat the 
instruction as a NOP or inject a privilege exception in some cases, too - depending on the SPR number. Also turn the printk here into a ratelimited print statement, so that the guest cannot flood the dmesg log of the host by issuing lots of illegal mtspr/mfspr instructions. Signed-off-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_emulate.c | 34 +++++++++++++++++++++++-------- arch/powerpc/kvm/emulate.c | 8 ++++++++ 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 8359752b3efc..68d68983948e 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -503,10 +503,18 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) break; unprivileged: default: - printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn); -#ifndef DEBUG_SPR - emulated = EMULATE_FAIL; -#endif + pr_info_ratelimited("KVM: invalid SPR write: %d\n", sprn); + if (sprn & 0x10) { + if (kvmppc_get_msr(vcpu) & MSR_PR) { + kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV); + emulated = EMULATE_AGAIN; + } + } else { + if ((kvmppc_get_msr(vcpu) & MSR_PR) || sprn == 0) { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + emulated = EMULATE_AGAIN; + } + } break; } @@ -648,10 +656,20 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val break; default: unprivileged: - printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn); -#ifndef DEBUG_SPR - emulated = EMULATE_FAIL; -#endif + pr_info_ratelimited("KVM: invalid SPR read: %d\n", sprn); + if (sprn & 0x10) { + if (kvmppc_get_msr(vcpu) & MSR_PR) { + kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV); + emulated = EMULATE_AGAIN; + } + } else { + if ((kvmppc_get_msr(vcpu) & MSR_PR) || sprn == 0 || + sprn == 4 || sprn == 5 || sprn == 6) { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + emulated = EMULATE_AGAIN; + } + } + break; } diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index b379146de55b..c873ffe55362 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -259,10 +259,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case OP_31_XOP_MFSPR: emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt); + if (emulated == EMULATE_AGAIN) { + emulated = EMULATE_DONE; + advance = 0; + } break; case OP_31_XOP_MTSPR: emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs); + if (emulated == EMULATE_AGAIN) { + emulated = EMULATE_DONE; + advance = 0; + } break; case OP_31_XOP_TLBSYNC: