From e76779339ba9c3d56f1a39ff0bfb1dfe7614b062 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 11 May 2011 08:30:51 -0400
Subject: [PATCH 001/143] KVM: Document KVM_GET_LAPIC, KVM_SET_LAPIC ioctl

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 32 +++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 42542eb802ca..67cc0f5b9974 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1291,6 +1291,38 @@ Returns the tsc frequency of the guest. The unit of the return value is
 KHz. If the host has unstable tsc this ioctl returns -EIO instead as an
 error.
 
+4.56 KVM_GET_LAPIC
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_lapic_state (out)
+Returns: 0 on success, -1 on error
+
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+	char regs[KVM_APIC_REG_SIZE];
+};
+
+Reads the Local APIC registers and copies them into the input argument.  The
+data format and layout are the same as documented in the architecture manual.
+
+4.57 KVM_SET_LAPIC
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_lapic_state (in)
+Returns: 0 on success, -1 on error
+
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+	char regs[KVM_APIC_REG_SIZE];
+};
+
+Copies the input argument into the Local APIC registers.  The data format
+and layout are the same as documented in the architecture manual.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
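
For orientation, a minimal userspace sketch (not part of the patch) of how
the two ioctls documented above are typically driven; it assumes a vCPU fd
obtained via KVM_CREATE_VCPU on a VM that has an in-kernel irqchip
(KVM_CAP_IRQCHIP):

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* vcpu_fd: file descriptor returned by the KVM_CREATE_VCPU ioctl */
  static int save_and_restore_lapic(int vcpu_fd)
  {
          struct kvm_lapic_state lapic;

          if (ioctl(vcpu_fd, KVM_GET_LAPIC, &lapic) < 0)  /* read all APIC regs */
                  return -1;
          /* ... save, inspect or migrate lapic.regs here ... */
          return ioctl(vcpu_fd, KVM_SET_LAPIC, &lapic);   /* write them back */
  }

Both calls move the whole KVM_APIC_REG_SIZE register block at once, which is
why kvm_lapic_state is a flat byte array in the architectural layout.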

From 67cbc90db5c0f03aa5e54204c388403ec8c25862 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Sun, 15 May 2011 00:54:58 +0900
Subject: [PATCH 002/143] KVM: x86 emulator: Place insn_fetch helpers together

The two macros need special care to use:
  they assume that rc, ctxt, ops and done exist in the caller's scope,
  and they can goto done from inside.

Since these are used only in the decode functions, moving them right
after do_insn_fetch() improves readability.

We also rename do_fetch_insn_byte() to do_insn_fetch_byte() to be
consistent.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
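A hedged illustration (not part of the patch) of the contract described
above: any caller of insn_fetch()/insn_fetch_arr() must provide rc, ctxt,
ops and a done label in its own scope, because the macros reference them
directly and may jump to done on failure:

  /* Hypothetical decode helper, only to illustrate the macro contract. */
  static int decode_example_byte(struct x86_emulate_ctxt *ctxt,
                                 struct x86_emulate_ops *ops, u8 *byte)
  {
          struct decode_cache *c = &ctxt->decode;
          int rc = X86EMUL_CONTINUE;

          *byte = insn_fetch(u8, 1, c->eip);      /* may "goto done" */
  done:
          return rc;
  }

(The next patch drops the ops argument from these macros; the fetch helpers
then reach it through ctxt->ops.)
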
 arch/x86/kvm/emulate.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index adc98675cda0..6e4722c3dcdb 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -407,23 +407,6 @@ struct gprefix {
 		}							\
 	} while (0)
 
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip)                                  \
-({	unsigned long _x;						\
-	rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));		\
-	if (rc != X86EMUL_CONTINUE)					\
-		goto done;						\
-	(_eip) += (_size);						\
-	(_type)_x;							\
-})
-
-#define insn_fetch_arr(_arr, _size, _eip)				\
-({	rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size));		\
-	if (rc != X86EMUL_CONTINUE)					\
-		goto done;						\
-	(_eip) += (_size);						\
-})
-
 static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
 				    enum x86_intercept intercept,
 				    enum x86_intercept_stage stage)
@@ -671,7 +654,7 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
 	return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
 }
 
-static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
+static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
 			      struct x86_emulate_ops *ops,
 			      unsigned long eip, u8 *dest)
 {
@@ -707,13 +690,30 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 	if (eip + size - ctxt->eip > 15)
 		return X86EMUL_UNHANDLEABLE;
 	while (size--) {
-		rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
+		rc = do_insn_fetch_byte(ctxt, ops, eip++, dest++);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 	}
 	return X86EMUL_CONTINUE;
 }
 
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _size, _eip)					\
+({	unsigned long _x;						\
+	rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));		\
+	if (rc != X86EMUL_CONTINUE)					\
+		goto done;						\
+	(_eip) += (_size);						\
+	(_type)_x;							\
+})
+
+#define insn_fetch_arr(_arr, _size, _eip)				\
+({	rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size));		\
+	if (rc != X86EMUL_CONTINUE)					\
+		goto done;						\
+	(_eip) += (_size);						\
+})
+
 /*
  * Given the 'reg' portion of a ModRM byte, and a register block, return a
  * pointer into the block that addresses the relevant register.

From ef5d75cc9af2bca7c525158666b5f9696846ffb6 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Sun, 15 May 2011 00:57:43 +0900
Subject: [PATCH 003/143] KVM: x86 emulator: Stop passing ctxt->ops as arg of
 decode helpers

Dereference it in the actual users: only do_insn_fetch_byte().

This is consistent with the way __linearize() dereferences it.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6e4722c3dcdb..df9082c811e0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -655,7 +655,6 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
 }
 
 static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
-			      struct x86_emulate_ops *ops,
 			      unsigned long eip, u8 *dest)
 {
 	struct fetch_cache *fc = &ctxt->decode.fetch;
@@ -670,8 +669,8 @@ static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
 		rc = __linearize(ctxt, addr, size, false, true, &linear);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
-		rc = ops->fetch(ctxt, linear, fc->data + cur_size,
-				size, &ctxt->exception);
+		rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
+				      size, &ctxt->exception);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		fc->end += size;
@@ -681,7 +680,6 @@ static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
 }
 
 static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
-			 struct x86_emulate_ops *ops,
 			 unsigned long eip, void *dest, unsigned size)
 {
 	int rc;
@@ -690,7 +688,7 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 	if (eip + size - ctxt->eip > 15)
 		return X86EMUL_UNHANDLEABLE;
 	while (size--) {
-		rc = do_insn_fetch_byte(ctxt, ops, eip++, dest++);
+		rc = do_insn_fetch_byte(ctxt, eip++, dest++);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 	}
@@ -700,7 +698,7 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch(_type, _size, _eip)					\
 ({	unsigned long _x;						\
-	rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));		\
+	rc = do_insn_fetch(ctxt, (_eip), &_x, (_size));			\
 	if (rc != X86EMUL_CONTINUE)					\
 		goto done;						\
 	(_eip) += (_size);						\
@@ -708,7 +706,7 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 })
 
 #define insn_fetch_arr(_arr, _size, _eip)				\
-({	rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size));		\
+({	rc = do_insn_fetch(ctxt, (_eip), _arr, (_size));		\
 	if (rc != X86EMUL_CONTINUE)					\
 		goto done;						\
 	(_eip) += (_size);						\
@@ -887,7 +885,6 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 }
 
 static int decode_modrm(struct x86_emulate_ctxt *ctxt,
-			struct x86_emulate_ops *ops,
 			struct operand *op)
 {
 	struct decode_cache *c = &ctxt->decode;
@@ -1014,7 +1011,6 @@ done:
 }
 
 static int decode_abs(struct x86_emulate_ctxt *ctxt,
-		      struct x86_emulate_ops *ops,
 		      struct operand *op)
 {
 	struct decode_cache *c = &ctxt->decode;
@@ -3327,7 +3323,6 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
 		      unsigned size, bool sign_extension)
 {
 	struct decode_cache *c = &ctxt->decode;
-	struct x86_emulate_ops *ops = ctxt->ops;
 	int rc = X86EMUL_CONTINUE;
 
 	op->type = OP_IMM;
@@ -3362,10 +3357,8 @@ done:
 	return rc;
 }
 
-int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	int mode = ctxt->mode;
@@ -3531,11 +3524,11 @@ done_prefixes:
 
 	/* ModRM and SIB bytes. */
 	if (c->d & ModRM) {
-		rc = decode_modrm(ctxt, ops, &memop);
+		rc = decode_modrm(ctxt, &memop);
 		if (!c->has_seg_override)
 			set_seg_override(c, c->modrm_seg);
 	} else if (c->d & MemAbs)
-		rc = decode_abs(ctxt, ops, &memop);
+		rc = decode_abs(ctxt, &memop);
 	if (rc != X86EMUL_CONTINUE)
 		goto done;
 

From 7b105ca2903b84f023c49965d9a511c5e55256dc Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Sun, 15 May 2011 01:00:52 +0900
Subject: [PATCH 004/143] KVM: x86 emulator: Stop passing ctxt->ops as arg of
 emul functions

Dereference it in the actual users.

This not only cleans up the emulator but also makes it easy to convert
the old emulation functions to the new em_xxx() form later.

Note: Remove some inline keywords to let the compiler decide inlining.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
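A hedged sketch (not part of the patch) of the em_xxx() shape this cleanup
is working towards: a handler takes only the emulation context and reaches
ops and the decode cache through it, rather than taking ops as a parameter:

  /* Illustrative only; em_example() is not a handler added by this series. */
  static int em_example(struct x86_emulate_ctxt *ctxt)
  {
          struct decode_cache *c = &ctxt->decode;

          /* operate on the pre-decoded operands, e.g. a register move */
          c->dst.val = c->src.val;
          return X86EMUL_CONTINUE;        /* or an X86EMUL_* failure code */
  }
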
 arch/x86/include/asm/kvm_emulate.h |   3 +-
 arch/x86/kvm/emulate.c             | 259 +++++++++++++----------------
 arch/x86/kvm/x86.c                 |   2 +-
 3 files changed, 118 insertions(+), 146 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0049211959c0..ab09ba290db3 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -373,6 +373,5 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 			 u16 tss_selector, int reason,
 			 bool has_error_code, u32 error_code);
-int emulate_int_real(struct x86_emulate_ctxt *ctxt,
-		     struct x86_emulate_ops *ops, int irq);
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index df9082c811e0..d3f4466cd19c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -475,13 +475,12 @@ static void set_seg_override(struct decode_cache *c, int seg)
 	c->seg_override = seg;
 }
 
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
-			      struct x86_emulate_ops *ops, int seg)
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
 {
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
 		return 0;
 
-	return ops->get_cached_segment_base(ctxt, seg);
+	return ctxt->ops->get_cached_segment_base(ctxt, seg);
 }
 
 static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
@@ -570,7 +569,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 	u16 sel;
 	unsigned cpl, rpl;
 
-	la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
+	la = seg_base(ctxt, addr.seg) + addr.ea;
 	switch (ctxt->mode) {
 	case X86EMUL_MODE_REAL:
 		break;
@@ -1052,7 +1051,6 @@ static void fetch_bit_operand(struct decode_cache *c)
 }
 
 static int read_emulated(struct x86_emulate_ctxt *ctxt,
-			 struct x86_emulate_ops *ops,
 			 unsigned long addr, void *dest, unsigned size)
 {
 	int rc;
@@ -1064,8 +1062,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 		if (mc->pos < mc->end)
 			goto read_cached;
 
-		rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
-					&ctxt->exception);
+		rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
+					      &ctxt->exception);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		mc->end += n;
@@ -1090,7 +1088,7 @@ static int segmented_read(struct x86_emulate_ctxt *ctxt,
 	rc = linearize(ctxt, addr, size, false, &linear);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	return read_emulated(ctxt, ctxt->ops, linear, data, size);
+	return read_emulated(ctxt, linear, data, size);
 }
 
 static int segmented_write(struct x86_emulate_ctxt *ctxt,
@@ -1124,7 +1122,6 @@ static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
 }
 
 static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
-			   struct x86_emulate_ops *ops,
 			   unsigned int size, unsigned short port,
 			   void *dest)
 {
@@ -1143,7 +1140,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 		if (n == 0)
 			n = 1;
 		rc->pos = rc->end = 0;
-		if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n))
+		if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
 			return 0;
 		rc->end = n * size;
 	}
@@ -1154,9 +1151,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 }
 
 static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
-				     struct x86_emulate_ops *ops,
 				     u16 selector, struct desc_ptr *dt)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
+
 	if (selector & 1 << 2) {
 		struct desc_struct desc;
 		u16 sel;
@@ -1173,48 +1171,42 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 
 /* allowed just for 8 bytes segments */
 static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				   struct x86_emulate_ops *ops,
 				   u16 selector, struct desc_struct *desc)
 {
 	struct desc_ptr dt;
 	u16 index = selector >> 3;
-	int ret;
 	ulong addr;
 
-	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+	get_descriptor_table_ptr(ctxt, selector, &dt);
 
 	if (dt.size < index * 8 + 7)
 		return emulate_gp(ctxt, selector & 0xfffc);
-	addr = dt.address + index * 8;
-	ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
 
-       return ret;
+	addr = dt.address + index * 8;
+	return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+				   &ctxt->exception);
 }
 
 /* allowed just for 8 bytes segments */
 static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				    struct x86_emulate_ops *ops,
 				    u16 selector, struct desc_struct *desc)
 {
 	struct desc_ptr dt;
 	u16 index = selector >> 3;
 	ulong addr;
-	int ret;
 
-	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+	get_descriptor_table_ptr(ctxt, selector, &dt);
 
 	if (dt.size < index * 8 + 7)
 		return emulate_gp(ctxt, selector & 0xfffc);
 
 	addr = dt.address + index * 8;
-	ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
-
-	return ret;
+	return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
+				    &ctxt->exception);
 }
 
 /* Does not support long mode */
 static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				   struct x86_emulate_ops *ops,
 				   u16 selector, int seg)
 {
 	struct desc_struct seg_desc;
@@ -1249,7 +1241,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (null_selector) /* for NULL selector skip all following checks */
 		goto load;
 
-	ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
+	ret = read_segment_descriptor(ctxt, selector, &seg_desc);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -1267,7 +1259,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
 	rpl = selector & 3;
 	dpl = seg_desc.dpl;
-	cpl = ops->cpl(ctxt);
+	cpl = ctxt->ops->cpl(ctxt);
 
 	switch (seg) {
 	case VCPU_SREG_SS:
@@ -1318,12 +1310,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (seg_desc.s) {
 		/* mark segment as accessed */
 		seg_desc.type |= 1;
-		ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
+		ret = write_segment_descriptor(ctxt, selector, &seg_desc);
 		if (ret != X86EMUL_CONTINUE)
 			return ret;
 	}
 load:
-	ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
+	ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
 	return X86EMUL_CONTINUE;
 exception:
 	emulate_exception(ctxt, err_vec, err_code, true);
@@ -1424,13 +1416,12 @@ static int em_pop(struct x86_emulate_ctxt *ctxt)
 }
 
 static int emulate_popf(struct x86_emulate_ctxt *ctxt,
-		       struct x86_emulate_ops *ops,
-		       void *dest, int len)
+			void *dest, int len)
 {
 	int rc;
 	unsigned long val, change_mask;
 	int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-	int cpl = ops->cpl(ctxt);
+	int cpl = ctxt->ops->cpl(ctxt);
 
 	rc = emulate_pop(ctxt, &val, len);
 	if (rc != X86EMUL_CONTINUE)
@@ -1471,11 +1462,10 @@ static int em_popf(struct x86_emulate_ctxt *ctxt)
 	c->dst.type = OP_REG;
 	c->dst.addr.reg = &ctxt->eflags;
 	c->dst.bytes = c->op_bytes;
-	return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
+	return emulate_popf(ctxt, &c->dst.val, c->op_bytes);
 }
 
-static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops *ops, int seg)
+static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
 {
 	struct decode_cache *c = &ctxt->decode;
 
@@ -1484,8 +1474,7 @@ static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
 	return em_push(ctxt);
 }
 
-static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops *ops, int seg)
+static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg)
 {
 	struct decode_cache *c = &ctxt->decode;
 	unsigned long selector;
@@ -1495,7 +1484,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
+	rc = load_segment_descriptor(ctxt, (u16)selector, seg);
 	return rc;
 }
 
@@ -1549,10 +1538,10 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
 	return rc;
 }
 
-int emulate_int_real(struct x86_emulate_ctxt *ctxt,
-			       struct x86_emulate_ops *ops, int irq)
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
 {
 	struct decode_cache *c = &ctxt->decode;
+	struct x86_emulate_ops *ops = ctxt->ops;
 	int rc;
 	struct desc_ptr dt;
 	gva_t cs_addr;
@@ -1590,7 +1579,7 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
+	rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
@@ -1599,12 +1588,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
-static int emulate_int(struct x86_emulate_ctxt *ctxt,
-		       struct x86_emulate_ops *ops, int irq)
+static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
 {
 	switch(ctxt->mode) {
 	case X86EMUL_MODE_REAL:
-		return emulate_int_real(ctxt, ops, irq);
+		return emulate_int_real(ctxt, irq);
 	case X86EMUL_MODE_VM86:
 	case X86EMUL_MODE_PROT16:
 	case X86EMUL_MODE_PROT32:
@@ -1615,8 +1603,7 @@ static int emulate_int(struct x86_emulate_ctxt *ctxt,
 	}
 }
 
-static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops *ops)
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
@@ -1648,7 +1635,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+	rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
 
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -1669,12 +1656,11 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
-static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
-				    struct x86_emulate_ops* ops)
+static int emulate_iret(struct x86_emulate_ctxt *ctxt)
 {
 	switch(ctxt->mode) {
 	case X86EMUL_MODE_REAL:
-		return emulate_iret_real(ctxt, ops);
+		return emulate_iret_real(ctxt);
 	case X86EMUL_MODE_VM86:
 	case X86EMUL_MODE_PROT16:
 	case X86EMUL_MODE_PROT32:
@@ -1693,7 +1679,7 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 
 	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
 
-	rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS);
+	rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
@@ -1830,8 +1816,7 @@ static int em_grp9(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
-static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
-			   struct x86_emulate_ops *ops)
+static int emulate_ret_far(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
@@ -1845,12 +1830,11 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
 	rc = emulate_pop(ctxt, &cs, c->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+	rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
 	return rc;
 }
 
-static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
-			   struct x86_emulate_ops *ops, int seg)
+static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg)
 {
 	struct decode_cache *c = &ctxt->decode;
 	unsigned short sel;
@@ -1858,7 +1842,7 @@ static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
 
 	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
 
-	rc = load_segment_descriptor(ctxt, ops, sel, seg);
+	rc = load_segment_descriptor(ctxt, sel, seg);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
@@ -1866,15 +1850,14 @@ static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
-static inline void
+static void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
-			struct x86_emulate_ops *ops, struct desc_struct *cs,
-			struct desc_struct *ss)
+			struct desc_struct *cs, struct desc_struct *ss)
 {
 	u16 selector;
 
 	memset(cs, 0, sizeof(struct desc_struct));
-	ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
+	ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
 	memset(ss, 0, sizeof(struct desc_struct));
 
 	cs->l = 0;		/* will be adjusted later */
@@ -1897,10 +1880,10 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 	ss->p = 1;
 }
 
-static int
-emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static int emulate_syscall(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct cs, ss;
 	u64 msr_data;
 	u16 cs_sel, ss_sel;
@@ -1912,7 +1895,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		return emulate_ud(ctxt);
 
 	ops->get_msr(ctxt, MSR_EFER, &efer);
-	setup_syscalls_segments(ctxt, ops, &cs, &ss);
+	setup_syscalls_segments(ctxt, &cs, &ss);
 
 	ops->get_msr(ctxt, MSR_STAR, &msr_data);
 	msr_data >>= 32;
@@ -1950,16 +1933,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	return X86EMUL_CONTINUE;
 }
 
-static int
-emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static int emulate_sysenter(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct cs, ss;
 	u64 msr_data;
 	u16 cs_sel, ss_sel;
 	u64 efer = 0;
 
-	ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+	ops->get_msr(ctxt, MSR_EFER, &efer);
 	/* inject #GP if in real mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL)
 		return emulate_gp(ctxt, 0);
@@ -1970,7 +1953,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	if (ctxt->mode == X86EMUL_MODE_PROT64)
 		return emulate_ud(ctxt);
 
-	setup_syscalls_segments(ctxt, ops, &cs, &ss);
+	setup_syscalls_segments(ctxt, &cs, &ss);
 
 	ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
 	switch (ctxt->mode) {
@@ -2006,10 +1989,10 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	return X86EMUL_CONTINUE;
 }
 
-static int
-emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static int emulate_sysexit(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct cs, ss;
 	u64 msr_data;
 	int usermode;
@@ -2020,7 +2003,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	    ctxt->mode == X86EMUL_MODE_VM86)
 		return emulate_gp(ctxt, 0);
 
-	setup_syscalls_segments(ctxt, ops, &cs, &ss);
+	setup_syscalls_segments(ctxt, &cs, &ss);
 
 	if ((c->rex_prefix & 0x8) != 0x0)
 		usermode = X86EMUL_MODE_PROT64;
@@ -2058,8 +2041,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	return X86EMUL_CONTINUE;
 }
 
-static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
-			      struct x86_emulate_ops *ops)
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
 {
 	int iopl;
 	if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -2067,13 +2049,13 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
 	if (ctxt->mode == X86EMUL_MODE_VM86)
 		return true;
 	iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-	return ops->cpl(ctxt) > iopl;
+	return ctxt->ops->cpl(ctxt) > iopl;
 }
 
 static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
-					    struct x86_emulate_ops *ops,
 					    u16 port, u16 len)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct tr_seg;
 	u32 base3;
 	int r;
@@ -2104,14 +2086,13 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
 }
 
 static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
-				 struct x86_emulate_ops *ops,
 				 u16 port, u16 len)
 {
 	if (ctxt->perm_ok)
 		return true;
 
-	if (emulator_bad_iopl(ctxt, ops))
-		if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
+	if (emulator_bad_iopl(ctxt))
+		if (!emulator_io_port_access_allowed(ctxt, port, len))
 			return false;
 
 	ctxt->perm_ok = true;
@@ -2120,7 +2101,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
 }
 
 static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
-				struct x86_emulate_ops *ops,
 				struct tss_segment_16 *tss)
 {
 	struct decode_cache *c = &ctxt->decode;
@@ -2144,7 +2124,6 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
 }
 
 static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
-				 struct x86_emulate_ops *ops,
 				 struct tss_segment_16 *tss)
 {
 	struct decode_cache *c = &ctxt->decode;
@@ -2175,19 +2154,19 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
 	 * Now load segment descriptors. If fault happenes at this stage
 	 * it is handled in a context of new task
 	 */
-	ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
+	ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+	ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+	ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+	ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+	ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -2195,10 +2174,10 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
 }
 
 static int task_switch_16(struct x86_emulate_ctxt *ctxt,
-			  struct x86_emulate_ops *ops,
 			  u16 tss_selector, u16 old_tss_sel,
 			  ulong old_tss_base, struct desc_struct *new_desc)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct tss_segment_16 tss_seg;
 	int ret;
 	u32 new_tss_base = get_desc_base(new_desc);
@@ -2209,7 +2188,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 		/* FIXME: need to provide precise fault address */
 		return ret;
 
-	save_state_to_tss16(ctxt, ops, &tss_seg);
+	save_state_to_tss16(ctxt, &tss_seg);
 
 	ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
 			     &ctxt->exception);
@@ -2235,16 +2214,15 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			return ret;
 	}
 
-	return load_state_from_tss16(ctxt, ops, &tss_seg);
+	return load_state_from_tss16(ctxt, &tss_seg);
 }
 
 static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
-				struct x86_emulate_ops *ops,
 				struct tss_segment_32 *tss)
 {
 	struct decode_cache *c = &ctxt->decode;
 
-	tss->cr3 = ops->get_cr(ctxt, 3);
+	tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
 	tss->eip = c->eip;
 	tss->eflags = ctxt->eflags;
 	tss->eax = c->regs[VCPU_REGS_RAX];
@@ -2266,13 +2244,12 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
 }
 
 static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
-				 struct x86_emulate_ops *ops,
 				 struct tss_segment_32 *tss)
 {
 	struct decode_cache *c = &ctxt->decode;
 	int ret;
 
-	if (ops->set_cr(ctxt, 3, tss->cr3))
+	if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
 		return emulate_gp(ctxt, 0);
 	c->eip = tss->eip;
 	ctxt->eflags = tss->eflags | 2;
@@ -2301,25 +2278,25 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	 * Now load segment descriptors. If fault happenes at this stage
 	 * it is handled in a context of new task
 	 */
-	ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
+	ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+	ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+	ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+	ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+	ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
+	ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
+	ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -2327,10 +2304,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 }
 
 static int task_switch_32(struct x86_emulate_ctxt *ctxt,
-			  struct x86_emulate_ops *ops,
 			  u16 tss_selector, u16 old_tss_sel,
 			  ulong old_tss_base, struct desc_struct *new_desc)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct tss_segment_32 tss_seg;
 	int ret;
 	u32 new_tss_base = get_desc_base(new_desc);
@@ -2341,7 +2318,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 		/* FIXME: need to provide precise fault address */
 		return ret;
 
-	save_state_to_tss32(ctxt, ops, &tss_seg);
+	save_state_to_tss32(ctxt, &tss_seg);
 
 	ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
 			     &ctxt->exception);
@@ -2367,14 +2344,14 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			return ret;
 	}
 
-	return load_state_from_tss32(ctxt, ops, &tss_seg);
+	return load_state_from_tss32(ctxt, &tss_seg);
 }
 
 static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
-				   struct x86_emulate_ops *ops,
 				   u16 tss_selector, int reason,
 				   bool has_error_code, u32 error_code)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct curr_tss_desc, next_tss_desc;
 	int ret;
 	u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
@@ -2384,10 +2361,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
 	/* FIXME: old_tss_base == ~0 ? */
 
-	ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
+	ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
+	ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -2409,8 +2386,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
 	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
 		curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
-		write_segment_descriptor(ctxt, ops, old_tss_sel,
-					 &curr_tss_desc);
+		write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
 	}
 
 	if (reason == TASK_SWITCH_IRET)
@@ -2422,10 +2398,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 		old_tss_sel = 0xffff;
 
 	if (next_tss_desc.type & 8)
-		ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
+		ret = task_switch_32(ctxt, tss_selector, old_tss_sel,
 				     old_tss_base, &next_tss_desc);
 	else
-		ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
+		ret = task_switch_16(ctxt, tss_selector, old_tss_sel,
 				     old_tss_base, &next_tss_desc);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
@@ -2435,8 +2411,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
 	if (reason != TASK_SWITCH_IRET) {
 		next_tss_desc.type |= (1 << 1); /* set busy flag */
-		write_segment_descriptor(ctxt, ops, tss_selector,
-					 &next_tss_desc);
+		write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
 	}
 
 	ops->set_cr(ctxt, 0,  ops->get_cr(ctxt, 0) | X86_CR0_TS);
@@ -2458,14 +2433,13 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 			 u16 tss_selector, int reason,
 			 bool has_error_code, u32 error_code)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
 	c->eip = ctxt->eip;
 	c->dst.type = OP_NONE;
 
-	rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
+	rc = emulator_do_task_switch(ctxt, tss_selector, reason,
 				     has_error_code, error_code);
 
 	if (rc == X86EMUL_CONTINUE)
@@ -2535,7 +2509,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
 	old_eip = c->eip;
 
 	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
-	if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
+	if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS))
 		return X86EMUL_CONTINUE;
 
 	c->eip = 0;
@@ -2973,7 +2947,7 @@ static int check_perm_in(struct x86_emulate_ctxt *ctxt)
 	struct decode_cache *c = &ctxt->decode;
 
 	c->dst.bytes = min(c->dst.bytes, 4u);
-	if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes))
+	if (!emulator_io_permited(ctxt, c->src.val, c->dst.bytes))
 		return emulate_gp(ctxt, 0);
 
 	return X86EMUL_CONTINUE;
@@ -2984,7 +2958,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 	struct decode_cache *c = &ctxt->decode;
 
 	c->src.bytes = min(c->src.bytes, 4u);
-	if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes))
+	if (!emulator_io_permited(ctxt, c->dst.val, c->src.bytes))
 		return emulate_gp(ctxt, 0);
 
 	return X86EMUL_CONTINUE;
@@ -3724,8 +3698,7 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 	return false;
 }
 
-int
-x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
 	u64 msr_data;
@@ -3854,25 +3827,25 @@ special_insn:
 
 	switch (c->b) {
 	case 0x06:		/* push es */
-		rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
+		rc = emulate_push_sreg(ctxt, VCPU_SREG_ES);
 		break;
 	case 0x07:		/* pop es */
-		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
+		rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES);
 		break;
 	case 0x0e:		/* push cs */
-		rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
+		rc = emulate_push_sreg(ctxt, VCPU_SREG_CS);
 		break;
 	case 0x16:		/* push ss */
-		rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
+		rc = emulate_push_sreg(ctxt, VCPU_SREG_SS);
 		break;
 	case 0x17:		/* pop ss */
-		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
+		rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS);
 		break;
 	case 0x1e:		/* push ds */
-		rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
+		rc = emulate_push_sreg(ctxt, VCPU_SREG_DS);
 		break;
 	case 0x1f:		/* pop ds */
-		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
+		rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS);
 		break;
 	case 0x40 ... 0x47: /* inc r16/r32 */
 		emulate_1op("inc", c->dst, ctxt->eflags);
@@ -3938,7 +3911,7 @@ special_insn:
 		if (c->modrm_reg == VCPU_SREG_SS)
 			ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
 
-		rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
+		rc = load_segment_descriptor(ctxt, sel, c->modrm_reg);
 
 		c->dst.type = OP_NONE;  /* Disable writeback. */
 		break;
@@ -3969,13 +3942,13 @@ special_insn:
 		rc = em_pop(ctxt);
 		break;
 	case 0xc4:		/* les */
-		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
+		rc = emulate_load_segment(ctxt, VCPU_SREG_ES);
 		break;
 	case 0xc5:		/* lds */
-		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
+		rc = emulate_load_segment(ctxt, VCPU_SREG_DS);
 		break;
 	case 0xcb:		/* ret far */
-		rc = emulate_ret_far(ctxt, ops);
+		rc = emulate_ret_far(ctxt);
 		break;
 	case 0xcc:		/* int3 */
 		irq = 3;
@@ -3983,7 +3956,7 @@ special_insn:
 	case 0xcd:		/* int n */
 		irq = c->src.val;
 	do_interrupt:
-		rc = emulate_int(ctxt, ops, irq);
+		rc = emulate_int(ctxt, irq);
 		break;
 	case 0xce:		/* into */
 		if (ctxt->eflags & EFLG_OF) {
@@ -3992,7 +3965,7 @@ special_insn:
 		}
 		break;
 	case 0xcf:		/* iret */
-		rc = emulate_iret(ctxt, ops);
+		rc = emulate_iret(ctxt);
 		break;
 	case 0xd0 ... 0xd1:	/* Grp2 */
 		rc = em_grp2(ctxt);
@@ -4037,7 +4010,7 @@ special_insn:
 	case 0xec: /* in al,dx */
 	case 0xed: /* in (e/r)ax,dx */
 	do_io_in:
-		if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
+		if (!pio_in_emulated(ctxt, c->dst.bytes, c->src.val,
 				     &c->dst.val))
 			goto done; /* IO is needed */
 		break;
@@ -4065,14 +4038,14 @@ special_insn:
 		ctxt->eflags |= EFLG_CF;
 		break;
 	case 0xfa: /* cli */
-		if (emulator_bad_iopl(ctxt, ops)) {
+		if (emulator_bad_iopl(ctxt)) {
 			rc = emulate_gp(ctxt, 0);
 			goto done;
 		} else
 			ctxt->eflags &= ~X86_EFLAGS_IF;
 		break;
 	case 0xfb: /* sti */
-		if (emulator_bad_iopl(ctxt, ops)) {
+		if (emulator_bad_iopl(ctxt)) {
 			rc = emulate_gp(ctxt, 0);
 			goto done;
 		} else {
@@ -4154,7 +4127,7 @@ done:
 twobyte_insn:
 	switch (c->b) {
 	case 0x05: 		/* syscall */
-		rc = emulate_syscall(ctxt, ops);
+		rc = emulate_syscall(ctxt);
 		break;
 	case 0x06:
 		rc = em_clts(ctxt);
@@ -4216,10 +4189,10 @@ twobyte_insn:
 		rc = X86EMUL_CONTINUE;
 		break;
 	case 0x34:		/* sysenter */
-		rc = emulate_sysenter(ctxt, ops);
+		rc = emulate_sysenter(ctxt);
 		break;
 	case 0x35:		/* sysexit */
-		rc = emulate_sysexit(ctxt, ops);
+		rc = emulate_sysexit(ctxt);
 		break;
 	case 0x40 ... 0x4f:	/* cmov */
 		c->dst.val = c->dst.orig_val = c->src.val;
@@ -4234,10 +4207,10 @@ twobyte_insn:
 		c->dst.val = test_cc(c->b, ctxt->eflags);
 		break;
 	case 0xa0:	  /* push fs */
-		rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
+		rc = emulate_push_sreg(ctxt, VCPU_SREG_FS);
 		break;
 	case 0xa1:	 /* pop fs */
-		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
+		rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS);
 		break;
 	case 0xa3:
 	      bt:		/* bt */
@@ -4251,10 +4224,10 @@ twobyte_insn:
 		emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
 		break;
 	case 0xa8:	/* push gs */
-		rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
+		rc = emulate_push_sreg(ctxt, VCPU_SREG_GS);
 		break;
 	case 0xa9:	/* pop gs */
-		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
+		rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS);
 		break;
 	case 0xab:
 	      bts:		/* bts */
@@ -4284,17 +4257,17 @@ twobyte_insn:
 		}
 		break;
 	case 0xb2:		/* lss */
-		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
+		rc = emulate_load_segment(ctxt, VCPU_SREG_SS);
 		break;
 	case 0xb3:
 	      btr:		/* btr */
 		emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
 		break;
 	case 0xb4:		/* lfs */
-		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
+		rc = emulate_load_segment(ctxt, VCPU_SREG_FS);
 		break;
 	case 0xb5:		/* lgs */
-		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
+		rc = emulate_load_segment(ctxt, VCPU_SREG_GS);
 		break;
 	case 0xb6 ... 0xb7:	/* movzx */
 		c->dst.bytes = c->op_bytes;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 77c9d8673dc4..51df0b6c891a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4513,7 +4513,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 	vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
 	vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip +
 								 inc_eip;
-	ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
+	ret = emulate_int_real(&vcpu->arch.emulate_ctxt, irq);
 
 	if (ret != X86EMUL_CONTINUE)
 		return EMULATE_FAIL;

From 5e520e62787afd8fc28626fd8d4f77491135119d Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 15 May 2011 10:13:12 -0400
Subject: [PATCH 005/143] KVM: VMX: Move VMREAD cleanup to exception handler

We clean up a failed VMREAD by clearing the output register.  Do
it in the exception handler instead of unconditionally.  This is
worthwhile since there are more than a hundred call sites.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 6 +++++-
 arch/x86/kvm/vmx.c              | 8 +++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d2ac8e2ee897..db4b6543b830 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -830,11 +830,12 @@ enum {
 asmlinkage void kvm_spurious_fault(void);
 extern bool kvm_rebooting;
 
-#define __kvm_handle_fault_on_reboot(insn) \
+#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn)	\
 	"666: " insn "\n\t" \
 	"668: \n\t"                           \
 	".pushsection .fixup, \"ax\" \n" \
 	"667: \n\t" \
+	cleanup_insn "\n\t"		      \
 	"cmpb $0, kvm_rebooting \n\t"	      \
 	"jne 668b \n\t"      		      \
 	__ASM_SIZE(push) " $666b \n\t"	      \
@@ -844,6 +845,9 @@ extern bool kvm_rebooting;
 	_ASM_PTR " 666b, 667b \n\t" \
 	".popsection"
 
+#define __kvm_handle_fault_on_reboot(insn)		\
+	____kvm_handle_fault_on_reboot(insn, "")
+
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d48ec60ea421..7d22d6c6734d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -43,6 +43,8 @@
 #include "trace.h"
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
+#define __ex_clear(x, reg) \
+	____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
 
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
@@ -587,10 +589,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
 
 static unsigned long vmcs_readl(unsigned long field)
 {
-	unsigned long value = 0;
+	unsigned long value;
 
-	asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
-		      : "+a"(value) : "d"(field) : "cc");
+	asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
+		      : "=a"(value) : "d"(field) : "cc");
 	return value;
 }
 

From 96304217a783a65c0923a26f54141cfe7a2a71b5 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 15 May 2011 10:13:13 -0400
Subject: [PATCH 006/143] KVM: VMX: always_inline VMREADs

vmcs_readl() and friends are really short, but gcc thinks they are long because of
the out-of-line exception handlers.  Mark them always_inline to clear the
misunderstanding.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7d22d6c6734d..3365e5dd3360 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -587,7 +587,7 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
 	}
 }
 
-static unsigned long vmcs_readl(unsigned long field)
+static __always_inline unsigned long vmcs_readl(unsigned long field)
 {
 	unsigned long value;
 
@@ -596,17 +596,17 @@ static unsigned long vmcs_readl(unsigned long field)
 	return value;
 }
 
-static u16 vmcs_read16(unsigned long field)
+static __always_inline u16 vmcs_read16(unsigned long field)
 {
 	return vmcs_readl(field);
 }
 
-static u32 vmcs_read32(unsigned long field)
+static __always_inline u32 vmcs_read32(unsigned long field)
 {
 	return vmcs_readl(field);
 }
 
-static u64 vmcs_read64(unsigned long field)
+static __always_inline u64 vmcs_read64(unsigned long field)
 {
 #ifdef CONFIG_X86_64
 	return vmcs_readl(field);

From 332b207d65c1d7982489dbb83e5071c95e19eb75 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sun, 15 May 2011 23:20:27 +0800
Subject: [PATCH 007/143] KVM: MMU: optimize pte write path if don't have
 protected sp

Simply return from the kvm_mmu_pte_write path if no shadow page is
write-protected; this avoids walking all shadow pages and holding
mmu_lock.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/mmu.c              | 9 +++++++++
 2 files changed, 10 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index db4b6543b830..387780eb97bb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -441,6 +441,7 @@ struct kvm_arch {
 	unsigned int n_used_mmu_pages;
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_max_mmu_pages;
+	unsigned int indirect_shadow_pages;
 	atomic_t invlpg_counter;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index aee38623b768..b4ae7afa6b3b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -498,6 +498,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 		linfo = lpage_info_slot(gfn, slot, i);
 		linfo->write_count += 1;
 	}
+	kvm->arch.indirect_shadow_pages++;
 }
 
 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -513,6 +514,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 		linfo->write_count -= 1;
 		WARN_ON(linfo->write_count < 0);
 	}
+	kvm->arch.indirect_shadow_pages--;
 }
 
 static int has_wrprotected_page(struct kvm *kvm,
@@ -3233,6 +3235,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int level, npte, invlpg_counter, r, flooded = 0;
 	bool remote_flush, local_flush, zap_page;
 
+	/*
+	 * If we don't have indirect shadow pages, it means no page is
+	 * write-protected, so we can exit simply.
+	 */
+	if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+		return;
+
 	zap_page = remote_flush = local_flush = false;
 	offset = offset_in_page(gpa);
 

From 8b0cedff040b652f3d36b1368778667581b0c140 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sun, 15 May 2011 23:22:04 +0800
Subject: [PATCH 008/143] KVM: use __copy_to_user/__clear_user to write guest
 page

Simply use __copy_to_user()/__clear_user() to write the guest page, since
the user address was already verified when the memslot was set up.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
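A hedged sketch (not part of the patch) of the distinction being relied on:
copy_to_user() re-validates the destination with access_ok(), while
__copy_to_user() trusts the caller.  KVM validated the hva when the memslot
was registered, so the recheck is redundant on these paths:

  /* Not the kernel's actual implementation; roughly what the checked
   * variant adds on top of the unchecked one in kernels of this era. */
  static inline unsigned long
  checked_copy_to_user(void __user *to, const void *from, unsigned long n)
  {
          if (!access_ok(VERIFY_WRITE, to, n))
                  return n;                       /* nothing copied */
          return __copy_to_user(to, from, n);     /* returns bytes NOT copied */
  }
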
 arch/x86/kvm/x86.c  | 4 ++--
 virt/kvm/kvm_main.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 51df0b6c891a..0b089860e950 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1388,7 +1388,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 			return 1;
 		kvm_x86_ops->patch_hypercall(vcpu, instructions);
 		((unsigned char *)instructions)[3] = 0xc3; /* ret */
-		if (copy_to_user((void __user *)addr, instructions, 4))
+		if (__copy_to_user((void __user *)addr, instructions, 4))
 			return 1;
 		kvm->arch.hv_hypercall = data;
 		break;
@@ -1415,7 +1415,7 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 				  HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
 		if (kvm_is_error_hva(addr))
 			return 1;
-		if (clear_user((void __user *)addr, PAGE_SIZE))
+		if (__clear_user((void __user *)addr, PAGE_SIZE))
 			return 1;
 		vcpu->arch.hv_vapic = data;
 		break;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 96ebc0679415..087b3f8ca46e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1345,7 +1345,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
 	addr = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
-	r = copy_to_user((void __user *)addr + offset, data, len);
+	r = __copy_to_user((void __user *)addr + offset, data, len);
 	if (r)
 		return -EFAULT;
 	mark_page_dirty(kvm, gfn);
@@ -1405,7 +1405,7 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 	if (kvm_is_error_hva(ghc->hva))
 		return -EFAULT;
 
-	r = copy_to_user((void __user *)ghc->hva, data, len);
+	r = __copy_to_user((void __user *)ghc->hva, data, len);
 	if (r)
 		return -EFAULT;
 	mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);

From 1249b96e72533ffdb2fa25b5d7471918b065ccc8 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sun, 15 May 2011 23:25:10 +0800
Subject: [PATCH 009/143] KVM: fix uninitialized warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix:

 warning: ‘cs_sel’ may be used uninitialized in this function
 warning: ‘ss_sel’ may be used uninitialized in this function

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d3f4466cd19c..bc916bd22e86 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1996,7 +1996,7 @@ static int emulate_sysexit(struct x86_emulate_ctxt *ctxt)
 	struct desc_struct cs, ss;
 	u64 msr_data;
 	int usermode;
-	u16 cs_sel, ss_sel;
+	u16 cs_sel = 0, ss_sel = 0;
 
 	/* inject #GP if in real mode or Virtual 8086 mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL ||

From 53c07b18787d564a105e1aa678795d67eeb27447 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sun, 15 May 2011 23:26:20 +0800
Subject: [PATCH 010/143] KVM: MMU: abstract the operation of rmap

Abstract the rmap operations into a generic pte_list, so that it can also
be used for the reverse mapping of parent ptes in a later patch.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
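A hedged sketch (not part of the patch) of how a consumer walks one of the
new pte_list chains with the pte_list_next() helper added below: passing
NULL yields the first spte, and passing the previous spte yields the next:

  /* Hypothetical caller, for illustration only. */
  static int count_pte_list_sptes(unsigned long *pte_list)
  {
          u64 *spte = pte_list_next(pte_list, NULL);
          int nr = 0;

          while (spte) {
                  nr++;
                  spte = pte_list_next(pte_list, spte);
          }
          return nr;
  }
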
 arch/x86/include/asm/kvm_host.h |   2 +-
 arch/x86/kvm/mmu.c              | 296 +++++++++++++++++---------------
 2 files changed, 158 insertions(+), 140 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 387780eb97bb..e6a4a57e142b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -347,7 +347,7 @@ struct kvm_vcpu_arch {
 	struct kvm_pv_mmu_op_buffer mmu_op_buffer;
 
 	struct kvm_mmu_memory_cache mmu_pte_chain_cache;
-	struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+	struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
 	struct kvm_mmu_memory_cache mmu_page_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b4ae7afa6b3b..a6811cbdbf0d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,7 @@ module_param(oos_shadow, bool, 0644);
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
 			| PT64_NX_MASK)
 
-#define RMAP_EXT 4
+#define PTE_LIST_EXT 4
 
 #define ACC_EXEC_MASK    1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
@@ -164,9 +164,9 @@ module_param(oos_shadow, bool, 0644);
 
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
-struct kvm_rmap_desc {
-	u64 *sptes[RMAP_EXT];
-	struct kvm_rmap_desc *more;
+struct pte_list_desc {
+	u64 *sptes[PTE_LIST_EXT];
+	struct pte_list_desc *more;
 };
 
 struct kvm_shadow_walk_iterator {
@@ -185,7 +185,7 @@ struct kvm_shadow_walk_iterator {
 typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
 
 static struct kmem_cache *pte_chain_cache;
-static struct kmem_cache *rmap_desc_cache;
+static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
 
@@ -401,8 +401,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 				   pte_chain_cache, 4);
 	if (r)
 		goto out;
-	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
-				   rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
+	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+				   pte_list_desc_cache, 4 + PTE_PREFETCH_NUM);
 	if (r)
 		goto out;
 	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -416,8 +416,10 @@ out:
 
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
-	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
-	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
+	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
+				pte_chain_cache);
+	mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+				pte_list_desc_cache);
 	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
 	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
 				mmu_page_header_cache);
@@ -444,15 +446,15 @@ static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
 	kmem_cache_free(pte_chain_cache, pc);
 }
 
-static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
 {
-	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
-				      sizeof(struct kvm_rmap_desc));
+	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
+				      sizeof(struct pte_list_desc));
 }
 
-static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
+static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
 {
-	kmem_cache_free(rmap_desc_cache, rd);
+	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
 }
 
 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
@@ -589,10 +591,139 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 	return level - 1;
 }
 
+/*
+ * Pte mapping structures:
+ *
+ * If pte_list bit zero is zero, then pte_list points to the spte.
+ *
+ * If pte_list bit zero is one, then (pte_list & ~1) points to a struct
+ * pte_list_desc containing more mappings.
+ *
+ * Returns the number of pte entries before the spte was added or zero if
+ * the spte was not added.
+ *
+ */
+static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
+			unsigned long *pte_list)
+{
+	struct pte_list_desc *desc;
+	int i, count = 0;
+
+	if (!*pte_list) {
+		rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
+		*pte_list = (unsigned long)spte;
+	} else if (!(*pte_list & 1)) {
+		rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
+		desc = mmu_alloc_pte_list_desc(vcpu);
+		desc->sptes[0] = (u64 *)*pte_list;
+		desc->sptes[1] = spte;
+		*pte_list = (unsigned long)desc | 1;
+		++count;
+	} else {
+		rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
+		desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+		while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
+			desc = desc->more;
+			count += PTE_LIST_EXT;
+		}
+		if (desc->sptes[PTE_LIST_EXT-1]) {
+			desc->more = mmu_alloc_pte_list_desc(vcpu);
+			desc = desc->more;
+		}
+		for (i = 0; desc->sptes[i]; ++i)
+			++count;
+		desc->sptes[i] = spte;
+	}
+	return count;
+}
+
+static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
+{
+	struct pte_list_desc *desc;
+	u64 *prev_spte;
+	int i;
+
+	if (!*pte_list)
+		return NULL;
+	else if (!(*pte_list & 1)) {
+		if (!spte)
+			return (u64 *)*pte_list;
+		return NULL;
+	}
+	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+	prev_spte = NULL;
+	while (desc) {
+		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
+			if (prev_spte == spte)
+				return desc->sptes[i];
+			prev_spte = desc->sptes[i];
+		}
+		desc = desc->more;
+	}
+	return NULL;
+}
+
+static void
+pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
+			   int i, struct pte_list_desc *prev_desc)
+{
+	int j;
+
+	for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
+		;
+	desc->sptes[i] = desc->sptes[j];
+	desc->sptes[j] = NULL;
+	if (j != 0)
+		return;
+	if (!prev_desc && !desc->more)
+		*pte_list = (unsigned long)desc->sptes[0];
+	else
+		if (prev_desc)
+			prev_desc->more = desc->more;
+		else
+			*pte_list = (unsigned long)desc->more | 1;
+	mmu_free_pte_list_desc(desc);
+}
+
+static void pte_list_remove(u64 *spte, unsigned long *pte_list)
+{
+	struct pte_list_desc *desc;
+	struct pte_list_desc *prev_desc;
+	int i;
+
+	if (!*pte_list) {
+		printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
+		BUG();
+	} else if (!(*pte_list & 1)) {
+		rmap_printk("pte_list_remove:  %p 1->0\n", spte);
+		if ((u64 *)*pte_list != spte) {
+			printk(KERN_ERR "pte_list_remove:  %p 1->BUG\n", spte);
+			BUG();
+		}
+		*pte_list = 0;
+	} else {
+		rmap_printk("pte_list_remove:  %p many->many\n", spte);
+		desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+		prev_desc = NULL;
+		while (desc) {
+			for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
+				if (desc->sptes[i] == spte) {
+					pte_list_desc_remove_entry(pte_list,
+							       desc, i,
+							       prev_desc);
+					return;
+				}
+			prev_desc = desc;
+			desc = desc->more;
+		}
+		pr_err("pte_list_remove: %p many->many\n", spte);
+		BUG();
+	}
+}
+
 /*
  * Take gfn and return the reverse mapping to it.
  */
-
 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 {
 	struct kvm_memory_slot *slot;
@@ -607,122 +738,35 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 	return &linfo->rmap_pde;
 }
 
-/*
- * Reverse mapping data structures:
- *
- * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
- * that points to page_address(page).
- *
- * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
- * containing more mappings.
- *
- * Returns the number of rmap entries before the spte was added or zero if
- * the spte was not added.
- *
- */
 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 {
 	struct kvm_mmu_page *sp;
-	struct kvm_rmap_desc *desc;
 	unsigned long *rmapp;
-	int i, count = 0;
 
 	if (!is_rmap_spte(*spte))
-		return count;
+		return 0;
+
 	sp = page_header(__pa(spte));
 	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
-	if (!*rmapp) {
-		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
-		*rmapp = (unsigned long)spte;
-	} else if (!(*rmapp & 1)) {
-		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
-		desc = mmu_alloc_rmap_desc(vcpu);
-		desc->sptes[0] = (u64 *)*rmapp;
-		desc->sptes[1] = spte;
-		*rmapp = (unsigned long)desc | 1;
-		++count;
-	} else {
-		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
-		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-		while (desc->sptes[RMAP_EXT-1] && desc->more) {
-			desc = desc->more;
-			count += RMAP_EXT;
-		}
-		if (desc->sptes[RMAP_EXT-1]) {
-			desc->more = mmu_alloc_rmap_desc(vcpu);
-			desc = desc->more;
-		}
-		for (i = 0; desc->sptes[i]; ++i)
-			++count;
-		desc->sptes[i] = spte;
-	}
-	return count;
+	return pte_list_add(vcpu, spte, rmapp);
 }
 
-static void rmap_desc_remove_entry(unsigned long *rmapp,
-				   struct kvm_rmap_desc *desc,
-				   int i,
-				   struct kvm_rmap_desc *prev_desc)
+static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 {
-	int j;
-
-	for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
-		;
-	desc->sptes[i] = desc->sptes[j];
-	desc->sptes[j] = NULL;
-	if (j != 0)
-		return;
-	if (!prev_desc && !desc->more)
-		*rmapp = (unsigned long)desc->sptes[0];
-	else
-		if (prev_desc)
-			prev_desc->more = desc->more;
-		else
-			*rmapp = (unsigned long)desc->more | 1;
-	mmu_free_rmap_desc(desc);
+	return pte_list_next(rmapp, spte);
 }
 
 static void rmap_remove(struct kvm *kvm, u64 *spte)
 {
-	struct kvm_rmap_desc *desc;
-	struct kvm_rmap_desc *prev_desc;
 	struct kvm_mmu_page *sp;
 	gfn_t gfn;
 	unsigned long *rmapp;
-	int i;
 
 	sp = page_header(__pa(spte));
 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
 	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
-	if (!*rmapp) {
-		printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
-		BUG();
-	} else if (!(*rmapp & 1)) {
-		rmap_printk("rmap_remove:  %p 1->0\n", spte);
-		if ((u64 *)*rmapp != spte) {
-			printk(KERN_ERR "rmap_remove:  %p 1->BUG\n", spte);
-			BUG();
-		}
-		*rmapp = 0;
-	} else {
-		rmap_printk("rmap_remove:  %p many->many\n", spte);
-		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-		prev_desc = NULL;
-		while (desc) {
-			for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
-				if (desc->sptes[i] == spte) {
-					rmap_desc_remove_entry(rmapp,
-							       desc, i,
-							       prev_desc);
-					return;
-				}
-			prev_desc = desc;
-			desc = desc->more;
-		}
-		pr_err("rmap_remove: %p many->many\n", spte);
-		BUG();
-	}
+	pte_list_remove(spte, rmapp);
 }
 
 static int set_spte_track_bits(u64 *sptep, u64 new_spte)
@@ -752,32 +796,6 @@ static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
 		rmap_remove(kvm, sptep);
 }
 
-static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
-{
-	struct kvm_rmap_desc *desc;
-	u64 *prev_spte;
-	int i;
-
-	if (!*rmapp)
-		return NULL;
-	else if (!(*rmapp & 1)) {
-		if (!spte)
-			return (u64 *)*rmapp;
-		return NULL;
-	}
-	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-	prev_spte = NULL;
-	while (desc) {
-		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
-			if (prev_spte == spte)
-				return desc->sptes[i];
-			prev_spte = desc->sptes[i];
-		}
-		desc = desc->more;
-	}
-	return NULL;
-}
-
 static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
 	unsigned long *rmapp;
@@ -3601,8 +3619,8 @@ static void mmu_destroy_caches(void)
 {
 	if (pte_chain_cache)
 		kmem_cache_destroy(pte_chain_cache);
-	if (rmap_desc_cache)
-		kmem_cache_destroy(rmap_desc_cache);
+	if (pte_list_desc_cache)
+		kmem_cache_destroy(pte_list_desc_cache);
 	if (mmu_page_header_cache)
 		kmem_cache_destroy(mmu_page_header_cache);
 }
@@ -3614,10 +3632,10 @@ int kvm_mmu_module_init(void)
 					    0, 0, NULL);
 	if (!pte_chain_cache)
 		goto nomem;
-	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
-					    sizeof(struct kvm_rmap_desc),
+	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
+					    sizeof(struct pte_list_desc),
 					    0, 0, NULL);
-	if (!rmap_desc_cache)
+	if (!pte_list_desc_cache)
 		goto nomem;
 
 	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",

From 67052b3508f09956427d6476fd35e8fddde6c618 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sun, 15 May 2011 23:27:08 +0800
Subject: [PATCH 011/143] KVM: MMU: remove the arithmetic of parent pte rmap

Parent pte rmaps and page rmaps are very similar, so use the same code for
both of them (see the sketch below).
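
The pte_list functions added in the previous patch encode a reverse-mapping
list in a single unsigned long: bit 0 clear means the word is a lone spte
pointer, bit 0 set means it points to a chained struct pte_list_desc.  A
minimal stand-alone sketch of that convention follows (types are simplified,
PTE_LIST_EXT is given a small demo value, and the harness is illustrative
only, not kernel code):

    #include <stdio.h>

    #define PTE_LIST_EXT 4    /* small value, just for the demo */

    struct pte_list_desc {
            unsigned long *sptes[PTE_LIST_EXT];
            struct pte_list_desc *more;
    };

    /* 0 -> empty, bit 0 clear -> single spte pointer, bit 0 set -> chain */
    static int pte_list_count(unsigned long pte_list)
    {
            struct pte_list_desc *desc;
            int i, count = 0;

            if (!pte_list)
                    return 0;
            if (!(pte_list & 1))
                    return 1;
            desc = (struct pte_list_desc *)(pte_list & ~1ul);
            while (desc) {
                    for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
                            ++count;
                    desc = desc->more;
            }
            return count;
    }

    int main(void)
    {
            unsigned long spte = 0;
            /* pointer alignment keeps bit 0 clear, as the kernel relies on */
            struct pte_list_desc desc = { { &spte, &spte, NULL, NULL }, NULL };
            unsigned long pte_list;

            pte_list = (unsigned long)&spte;     /* one mapping: raw pointer */
            printf("single: %d entries\n", pte_list_count(pte_list));

            pte_list = (unsigned long)&desc | 1; /* many mappings: tagged chain */
            printf("chain:  %d entries\n", pte_list_count(pte_list));
            return 0;
    }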

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   7 +-
 arch/x86/kvm/mmu.c              | 191 ++++++++------------------------
 2 files changed, 47 insertions(+), 151 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e6a4a57e142b..ff17deb6e98b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -227,14 +227,10 @@ struct kvm_mmu_page {
 	 * in this shadow page.
 	 */
 	DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
-	bool multimapped;         /* More than one parent_pte? */
 	bool unsync;
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
-	union {
-		u64 *parent_pte;               /* !multimapped */
-		struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
-	};
+	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
 	DECLARE_BITMAP(unsync_child_bitmap, 512);
 };
 
@@ -346,7 +342,6 @@ struct kvm_vcpu_arch {
 	 * put it here to avoid allocation */
 	struct kvm_pv_mmu_op_buffer mmu_op_buffer;
 
-	struct kvm_mmu_memory_cache mmu_pte_chain_cache;
 	struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
 	struct kvm_mmu_memory_cache mmu_page_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a6811cbdbf0d..9eaca1c739a6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -182,9 +182,6 @@ struct kvm_shadow_walk_iterator {
 	     shadow_walk_okay(&(_walker));			\
 	     shadow_walk_next(&(_walker)))
 
-typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
-
-static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
@@ -397,12 +394,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 {
 	int r;
 
-	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
-				   pte_chain_cache, 4);
-	if (r)
-		goto out;
 	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
-				   pte_list_desc_cache, 4 + PTE_PREFETCH_NUM);
+				   pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
 	if (r)
 		goto out;
 	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -416,8 +409,6 @@ out:
 
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
-	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
-				pte_chain_cache);
 	mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
 				pte_list_desc_cache);
 	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
@@ -435,17 +426,6 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
 	return p;
 }
 
-static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
-{
-	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
-				      sizeof(struct kvm_pte_chain));
-}
-
-static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
-{
-	kmem_cache_free(pte_chain_cache, pc);
-}
-
 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
 {
 	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
@@ -721,6 +701,26 @@ static void pte_list_remove(u64 *spte, unsigned long *pte_list)
 	}
 }
 
+typedef void (*pte_list_walk_fn) (u64 *spte);
+static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
+{
+	struct pte_list_desc *desc;
+	int i;
+
+	if (!*pte_list)
+		return;
+
+	if (!(*pte_list & 1))
+		return fn((u64 *)*pte_list);
+
+	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+	while (desc) {
+		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
+			fn(desc->sptes[i]);
+		desc = desc->more;
+	}
+}
+
 /*
  * Take gfn and return the reverse mapping to it.
  */
@@ -1069,12 +1069,27 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
 	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
 }
 
+static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+				    struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+	if (!parent_pte)
+		return;
+
+	pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
+}
+
+static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
+				       u64 *parent_pte)
+{
+	pte_list_remove(parent_pte, &sp->parent_ptes);
+}
+
 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 					       u64 *parent_pte, int direct)
 {
 	struct kvm_mmu_page *sp;
-
-	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
+	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
+					sizeof *sp);
 	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
 	if (!direct)
 		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
@@ -1082,121 +1097,24 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
-	sp->multimapped = 0;
-	sp->parent_pte = parent_pte;
+	sp->parent_ptes = 0;
+	mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
 	return sp;
 }
 
-static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
-				    struct kvm_mmu_page *sp, u64 *parent_pte)
-{
-	struct kvm_pte_chain *pte_chain;
-	struct hlist_node *node;
-	int i;
-
-	if (!parent_pte)
-		return;
-	if (!sp->multimapped) {
-		u64 *old = sp->parent_pte;
-
-		if (!old) {
-			sp->parent_pte = parent_pte;
-			return;
-		}
-		sp->multimapped = 1;
-		pte_chain = mmu_alloc_pte_chain(vcpu);
-		INIT_HLIST_HEAD(&sp->parent_ptes);
-		hlist_add_head(&pte_chain->link, &sp->parent_ptes);
-		pte_chain->parent_ptes[0] = old;
-	}
-	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
-		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
-			continue;
-		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
-			if (!pte_chain->parent_ptes[i]) {
-				pte_chain->parent_ptes[i] = parent_pte;
-				return;
-			}
-	}
-	pte_chain = mmu_alloc_pte_chain(vcpu);
-	BUG_ON(!pte_chain);
-	hlist_add_head(&pte_chain->link, &sp->parent_ptes);
-	pte_chain->parent_ptes[0] = parent_pte;
-}
-
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
-				       u64 *parent_pte)
-{
-	struct kvm_pte_chain *pte_chain;
-	struct hlist_node *node;
-	int i;
-
-	if (!sp->multimapped) {
-		BUG_ON(sp->parent_pte != parent_pte);
-		sp->parent_pte = NULL;
-		return;
-	}
-	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
-		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-			if (!pte_chain->parent_ptes[i])
-				break;
-			if (pte_chain->parent_ptes[i] != parent_pte)
-				continue;
-			while (i + 1 < NR_PTE_CHAIN_ENTRIES
-				&& pte_chain->parent_ptes[i + 1]) {
-				pte_chain->parent_ptes[i]
-					= pte_chain->parent_ptes[i + 1];
-				++i;
-			}
-			pte_chain->parent_ptes[i] = NULL;
-			if (i == 0) {
-				hlist_del(&pte_chain->link);
-				mmu_free_pte_chain(pte_chain);
-				if (hlist_empty(&sp->parent_ptes)) {
-					sp->multimapped = 0;
-					sp->parent_pte = NULL;
-				}
-			}
-			return;
-		}
-	BUG();
-}
-
-static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
-{
-	struct kvm_pte_chain *pte_chain;
-	struct hlist_node *node;
-	struct kvm_mmu_page *parent_sp;
-	int i;
-
-	if (!sp->multimapped && sp->parent_pte) {
-		parent_sp = page_header(__pa(sp->parent_pte));
-		fn(parent_sp, sp->parent_pte);
-		return;
-	}
-
-	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
-		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-			u64 *spte = pte_chain->parent_ptes[i];
-
-			if (!spte)
-				break;
-			parent_sp = page_header(__pa(spte));
-			fn(parent_sp, spte);
-		}
-}
-
-static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void mark_unsync(u64 *spte);
 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
 {
-	mmu_parent_walk(sp, mark_unsync);
+	pte_list_walk(&sp->parent_ptes, mark_unsync);
 }
 
-static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
+static void mark_unsync(u64 *spte)
 {
+	struct kvm_mmu_page *sp;
 	unsigned int index;
 
+	sp = page_header(__pa(spte));
 	index = spte - sp->spt;
 	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
 		return;
@@ -1694,17 +1612,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	u64 *parent_pte;
 
-	while (sp->multimapped || sp->parent_pte) {
-		if (!sp->multimapped)
-			parent_pte = sp->parent_pte;
-		else {
-			struct kvm_pte_chain *chain;
-
-			chain = container_of(sp->parent_ptes.first,
-					     struct kvm_pte_chain, link);
-			parent_pte = chain->parent_ptes[0];
-		}
-		BUG_ON(!parent_pte);
+	while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) {
 		kvm_mmu_put_page(sp, parent_pte);
 		__set_spte(parent_pte, shadow_trap_nonpresent_pte);
 	}
@@ -3617,8 +3525,6 @@ static struct shrinker mmu_shrinker = {
 
 static void mmu_destroy_caches(void)
 {
-	if (pte_chain_cache)
-		kmem_cache_destroy(pte_chain_cache);
 	if (pte_list_desc_cache)
 		kmem_cache_destroy(pte_list_desc_cache);
 	if (mmu_page_header_cache)
@@ -3627,11 +3533,6 @@ static void mmu_destroy_caches(void)
 
 int kvm_mmu_module_init(void)
 {
-	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
-					    sizeof(struct kvm_pte_chain),
-					    0, 0, NULL);
-	if (!pte_chain_cache)
-		goto nomem;
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 					    sizeof(struct pte_list_desc),
 					    0, 0, NULL);

From 38e3b2b28c5f8fe7914172f4ba631ef4552824d6 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sun, 15 May 2011 23:27:52 +0800
Subject: [PATCH 012/143] KVM: MMU: cleanup for kvm_mmu_page_unlink_children

Clean up the operation duplicated between kvm_mmu_page_unlink_children and
mmu_pte_write_zap_pte by factoring it into a common helper.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 66 ++++++++++++++++------------------------------
 1 file changed, 23 insertions(+), 43 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9eaca1c739a6..71eddc4c9810 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1566,32 +1566,33 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	}
 }
 
+static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+			     u64 *spte)
+{
+	u64 pte;
+	struct kvm_mmu_page *child;
+
+	pte = *spte;
+	if (is_shadow_present_pte(pte)) {
+		if (is_last_spte(pte, sp->role.level))
+			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+		else {
+			child = page_header(pte & PT64_BASE_ADDR_MASK);
+			mmu_page_remove_parent_pte(child, spte);
+		}
+	}
+	__set_spte(spte, shadow_trap_nonpresent_pte);
+	if (is_large_pte(pte))
+		--kvm->stat.lpages;
+}
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 					 struct kvm_mmu_page *sp)
 {
 	unsigned i;
-	u64 *pt;
-	u64 ent;
 
-	pt = sp->spt;
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-		ent = pt[i];
-
-		if (is_shadow_present_pte(ent)) {
-			if (!is_last_spte(ent, sp->role.level)) {
-				ent &= PT64_BASE_ADDR_MASK;
-				mmu_page_remove_parent_pte(page_header(ent),
-							   &pt[i]);
-			} else {
-				if (is_large_pte(ent))
-					--kvm->stat.lpages;
-				drop_spte(kvm, &pt[i],
-					  shadow_trap_nonpresent_pte);
-			}
-		}
-		pt[i] = shadow_trap_nonpresent_pte;
-	}
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+		mmu_page_zap_pte(kvm, sp, sp->spt + i);
 }
 
 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -3069,27 +3070,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
-static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu_page *sp,
-				  u64 *spte)
-{
-	u64 pte;
-	struct kvm_mmu_page *child;
-
-	pte = *spte;
-	if (is_shadow_present_pte(pte)) {
-		if (is_last_spte(pte, sp->role.level))
-			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
-		else {
-			child = page_header(pte & PT64_BASE_ADDR_MASK);
-			mmu_page_remove_parent_pte(child, spte);
-		}
-	}
-	__set_spte(spte, shadow_trap_nonpresent_pte);
-	if (is_large_pte(pte))
-		--vcpu->kvm->stat.lpages;
-}
-
 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *sp, u64 *spte,
 				  const void *new)
@@ -3271,7 +3251,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		spte = &sp->spt[page_offset / sizeof(*spte)];
 		while (npte--) {
 			entry = *spte;
-			mmu_pte_write_zap_pte(vcpu, sp, spte);
+			mmu_page_zap_pte(vcpu->kvm, sp, spte);
 			if (gentry &&
 			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
 			      & mask.word))

From bcdd9a93c57571652c887cb522176db741893581 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sun, 15 May 2011 23:28:29 +0800
Subject: [PATCH 013/143] KVM: MMU: cleanup for dropping parent pte

Introduce drop_parent_pte to remove a parent pte from the rmap and
clear that parent pte in one place.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 71eddc4c9810..15afa1e1eaf9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1084,6 +1084,13 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 	pte_list_remove(parent_pte, &sp->parent_ptes);
 }
 
+static void drop_parent_pte(struct kvm_mmu_page *sp,
+			    u64 *parent_pte)
+{
+	mmu_page_remove_parent_pte(sp, parent_pte);
+	__set_spte(parent_pte, shadow_trap_nonpresent_pte);
+}
+
 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 					       u64 *parent_pte, int direct)
 {
@@ -1560,8 +1567,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		if (child->role.access == direct_access)
 			return;
 
-		mmu_page_remove_parent_pte(child, sptep);
-		__set_spte(sptep, shadow_trap_nonpresent_pte);
+		drop_parent_pte(child, sptep);
 		kvm_flush_remote_tlbs(vcpu->kvm);
 	}
 }
@@ -1578,7 +1584,7 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 		else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
-			mmu_page_remove_parent_pte(child, spte);
+			drop_parent_pte(child, spte);
 		}
 	}
 	__set_spte(spte, shadow_trap_nonpresent_pte);
@@ -1613,10 +1619,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	u64 *parent_pte;
 
-	while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) {
-		kvm_mmu_put_page(sp, parent_pte);
-		__set_spte(parent_pte, shadow_trap_nonpresent_pte);
-	}
+	while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
+		drop_parent_pte(sp, parent_pte);
 }
 
 static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -2046,8 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			u64 pte = *sptep;
 
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
-			mmu_page_remove_parent_pte(child, sptep);
-			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			drop_parent_pte(child, sptep);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else if (pfn != spte_to_pfn(*sptep)) {
 			pgprintk("hfn old %llx new %llx\n",

From 24c82e576b7860a4f02a21103e9df39e11e97006 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 May 2011 05:56:07 -0400
Subject: [PATCH 014/143] KVM: Sanitize cpuid

Instead of blacklisting known-unsupported cpuid leaves, whitelist known-
supported leaves.  This is more conservative and prevents us from reporting
features we don't support.  Also whitelist a few more leaves while at it.
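
For the 0xd (xsave) leaf this whitelisting boils down to a plain bit-mask
test, mirroring the supported_xcr0_bit() helper the patch adds.  A
stand-alone sketch (the XSTATE_* values are the architectural bit
assignments; host_xcr0 here is a made-up example, not a real host value):

    #include <stdio.h>

    #define XSTATE_FP   0x1ULL
    #define XSTATE_SSE  0x2ULL
    #define XSTATE_YMM  0x4ULL

    /* pretend the host supports FP and SSE states only */
    static unsigned long long host_xcr0 = XSTATE_FP | XSTATE_SSE;

    /* a leaf-0xd sub-leaf is exposed only if both KVM and the host support it */
    static int supported_xcr0_bit(unsigned bit)
    {
            unsigned long long mask = 1ULL << bit;

            return !!(mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0);
    }

    int main(void)
    {
            unsigned bit;

            for (bit = 0; bit < 4; ++bit)
                    printf("xcr0 bit %u: %s\n", bit,
                           supported_xcr0_bit(bit) ? "whitelisted" : "hidden");
            return 0;
    }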

Signed-off-by: Avi Kivity <avi@redhat.com>
Acked-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b089860e950..da48622d170f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2283,6 +2283,13 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	entry->flags = 0;
 }
 
+static bool supported_xcr0_bit(unsigned bit)
+{
+	u64 mask = ((u64)1 << bit);
+
+	return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
+}
+
 #define F(x) bit(X86_FEATURE_##x)
 
 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
@@ -2393,6 +2400,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		}
 		break;
 	}
+	case 9:
+		break;
 	case 0xb: {
 		int i, level_type;
 
@@ -2414,7 +2423,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		for (i = 1; *nent < maxnent && i < 64; ++i) {
-			if (entry[i].eax == 0)
+			if (entry[i].eax == 0 || !supported_xcr0_bit(i))
 				continue;
 			do_cpuid_1_ent(&entry[i], function, i);
 			entry[i].flags |=
@@ -2451,6 +2460,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->ecx &= kvm_supported_word6_x86_features;
 		cpuid_mask(&entry->ecx, 6);
 		break;
+	case 0x80000008: {
+		unsigned g_phys_as = (entry->eax >> 16) & 0xff;
+		unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
+		unsigned phys_as = entry->eax & 0xff;
+
+		if (!g_phys_as)
+			g_phys_as = phys_as;
+		entry->eax = g_phys_as | (virt_as << 8);
+		entry->ebx = entry->edx = 0;
+		break;
+	}
+	case 0x80000019:
+		entry->ecx = entry->edx = 0;
+		break;
+	case 0x8000001a:
+		break;
+	case 0x8000001d:
+		break;
 	/*Add support for Centaur's CPUID instruction*/
 	case 0xC0000000:
 		/*Just support up to 0xC0000004 now*/
@@ -2460,10 +2487,16 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->edx &= kvm_supported_word5_x86_features;
 		cpuid_mask(&entry->edx, 5);
 		break;
+	case 3: /* Processor serial number */
+	case 5: /* MONITOR/MWAIT */
+	case 6: /* Thermal management */
+	case 0xA: /* Architectural Performance Monitoring */
+	case 0x80000007: /* Advanced power management */
 	case 0xC0000002:
 	case 0xC0000003:
 	case 0xC0000004:
-		/*Now nothing to do, reserved for the future*/
+	default:
+		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
 		break;
 	}
 

From d462b8192368f10e979250377930f9695a4039d0 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@math.technion.ac.il>
Date: Tue, 24 May 2011 15:26:10 +0300
Subject: [PATCH 015/143] KVM: VMX: Keep list of loaded VMCSs, instead of vcpus

In VMX, before we bring down a CPU we must VMCLEAR all VMCSs loaded on it
because (at least in theory) the processor might not have written all of its
content back to memory. Since a patch from June 26, 2008, this is done using
a per-cpu "vcpus_on_cpu" linked list of vcpus loaded on each CPU.

The problem is that with nested VMX, we no longer have the concept of a
vcpu being loaded on a cpu: A vcpu has multiple VMCSs (one for L1, a pool for
L2s), and each of those may have been last loaded on a different cpu.

So instead of linking the vcpus, we link the VMCSs, using a new structure
loaded_vmcs. This structure contains the VMCS, and the information pertaining
to its loading on a specific cpu (namely, the cpu number, and whether it
was already launched on this cpu once). In nested VMX we will also use the same
structure to hold L2 VMCSs, and vmx->loaded_vmcs is a pointer to the
currently active VMCS.
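
For intuition, here is a stand-alone sketch of the bookkeeping this
introduces: every loaded VMCS sits on a per-CPU list, and bringing a CPU
down walks that list instead of a list of vcpus.  The names echo the patch,
but the plain pointers below stand in for the kernel's list_head/per-cpu
machinery, so this is an illustration only:

    #include <stdio.h>

    #define NR_CPUS 2    /* tiny, just for the demo */

    struct loaded_vmcs {
            int cpu;                        /* -1: not loaded on any cpu */
            int launched;
            struct loaded_vmcs *next;       /* stand-in for loaded_vmcss_on_cpu_link */
    };

    /* stand-in for the per_cpu(loaded_vmcss_on_cpu, cpu) list heads */
    static struct loaded_vmcs *loaded_vmcss_on_cpu[NR_CPUS];

    static void loaded_vmcs_load(struct loaded_vmcs *v, int cpu)
    {
            if (v->cpu == cpu)
                    return;
            v->cpu = cpu;
            v->next = loaded_vmcss_on_cpu[cpu];
            loaded_vmcss_on_cpu[cpu] = v;
    }

    /* roughly what vmclear_local_loaded_vmcss() does when a cpu goes down */
    static void cpu_going_down(int cpu)
    {
            struct loaded_vmcs *v = loaded_vmcss_on_cpu[cpu];

            while (v) {
                    struct loaded_vmcs *next = v->next;

                    printf("VMCLEAR a vmcs last loaded on cpu %d\n", v->cpu);
                    v->cpu = -1;
                    v->launched = 0;
                    v->next = NULL;
                    v = next;
            }
            loaded_vmcss_on_cpu[cpu] = NULL;
    }

    int main(void)
    {
            struct loaded_vmcs vmcs01 = { -1, 0, NULL }; /* L1's VMCS */
            struct loaded_vmcs vmcs02 = { -1, 0, NULL }; /* an L2 VMCS of the same vcpu */

            loaded_vmcs_load(&vmcs01, 0);
            loaded_vmcs_load(&vmcs02, 0);
            cpu_going_down(0);
            return 0;
    }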

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 150 ++++++++++++++++++++++++++-------------------
 1 file changed, 86 insertions(+), 64 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3365e5dd3360..1444e4149c4d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -118,6 +118,18 @@ struct vmcs {
 	char data[0];
 };
 
+/*
+ * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
+ * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
+ * loaded on this CPU (so we can clear them if the CPU goes down).
+ */
+struct loaded_vmcs {
+	struct vmcs *vmcs;
+	int cpu;
+	int launched;
+	struct list_head loaded_vmcss_on_cpu_link;
+};
+
 struct shared_msr_entry {
 	unsigned index;
 	u64 data;
@@ -126,9 +138,7 @@ struct shared_msr_entry {
 
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
-	struct list_head      local_vcpus_link;
 	unsigned long         host_rsp;
-	int                   launched;
 	u8                    fail;
 	u8                    cpl;
 	bool                  nmi_known_unmasked;
@@ -142,7 +152,14 @@ struct vcpu_vmx {
 	u64 		      msr_host_kernel_gs_base;
 	u64 		      msr_guest_kernel_gs_base;
 #endif
-	struct vmcs          *vmcs;
+	/*
+	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
+	 * non-nested (L1) guest, it always points to vmcs01. For a nested
+	 * guest (L2), it points to a different VMCS.
+	 */
+	struct loaded_vmcs    vmcs01;
+	struct loaded_vmcs   *loaded_vmcs;
+	bool                  __launched; /* temporary, used in vmx_vcpu_run */
 	struct msr_autoload {
 		unsigned nr;
 		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
@@ -202,7 +219,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
-static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+/*
+ * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
+ * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
+ */
+static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 
 static unsigned long *vmx_io_bitmap_a;
@@ -503,6 +524,13 @@ static void vmcs_clear(struct vmcs *vmcs)
 		       vmcs, phys_addr);
 }
 
+static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
+{
+	vmcs_clear(loaded_vmcs->vmcs);
+	loaded_vmcs->cpu = -1;
+	loaded_vmcs->launched = 0;
+}
+
 static void vmcs_load(struct vmcs *vmcs)
 {
 	u64 phys_addr = __pa(vmcs);
@@ -516,25 +544,24 @@ static void vmcs_load(struct vmcs *vmcs)
 		       vmcs, phys_addr);
 }
 
-static void __vcpu_clear(void *arg)
+static void __loaded_vmcs_clear(void *arg)
 {
-	struct vcpu_vmx *vmx = arg;
+	struct loaded_vmcs *loaded_vmcs = arg;
 	int cpu = raw_smp_processor_id();
 
-	if (vmx->vcpu.cpu == cpu)
-		vmcs_clear(vmx->vmcs);
-	if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
+	if (loaded_vmcs->cpu != cpu)
+		return; /* vcpu migration can race with cpu offline */
+	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
 		per_cpu(current_vmcs, cpu) = NULL;
-	list_del(&vmx->local_vcpus_link);
-	vmx->vcpu.cpu = -1;
-	vmx->launched = 0;
+	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+	loaded_vmcs_init(loaded_vmcs);
 }
 
-static void vcpu_clear(struct vcpu_vmx *vmx)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 {
-	if (vmx->vcpu.cpu == -1)
-		return;
-	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
+	if (loaded_vmcs->cpu != -1)
+		smp_call_function_single(
+			loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
 }
 
 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -973,22 +1000,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	if (!vmm_exclusive)
 		kvm_cpu_vmxon(phys_addr);
-	else if (vcpu->cpu != cpu)
-		vcpu_clear(vmx);
+	else if (vmx->loaded_vmcs->cpu != cpu)
+		loaded_vmcs_clear(vmx->loaded_vmcs);
 
-	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
-		per_cpu(current_vmcs, cpu) = vmx->vmcs;
-		vmcs_load(vmx->vmcs);
+	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+		vmcs_load(vmx->loaded_vmcs->vmcs);
 	}
 
-	if (vcpu->cpu != cpu) {
+	if (vmx->loaded_vmcs->cpu != cpu) {
 		struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 		unsigned long sysenter_esp;
 
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 		local_irq_disable();
-		list_add(&vmx->local_vcpus_link,
-			 &per_cpu(vcpus_on_cpu, cpu));
+		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
+			 &per_cpu(loaded_vmcss_on_cpu, cpu));
 		local_irq_enable();
 
 		/*
@@ -1000,6 +1027,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+		vmx->loaded_vmcs->cpu = cpu;
 	}
 }
 
@@ -1007,7 +1035,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	__vmx_load_host_state(to_vmx(vcpu));
 	if (!vmm_exclusive) {
-		__vcpu_clear(to_vmx(vcpu));
+		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
+		vcpu->cpu = -1;
 		kvm_cpu_vmxoff();
 	}
 }
@@ -1471,7 +1500,7 @@ static int hardware_enable(void *garbage)
 	if (read_cr4() & X86_CR4_VMXE)
 		return -EBUSY;
 
-	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
+	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 
 	test_bits = FEATURE_CONTROL_LOCKED;
@@ -1495,14 +1524,14 @@ static int hardware_enable(void *garbage)
 	return 0;
 }
 
-static void vmclear_local_vcpus(void)
+static void vmclear_local_loaded_vmcss(void)
 {
 	int cpu = raw_smp_processor_id();
-	struct vcpu_vmx *vmx, *n;
+	struct loaded_vmcs *v, *n;
 
-	list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
-				 local_vcpus_link)
-		__vcpu_clear(vmx);
+	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+				 loaded_vmcss_on_cpu_link)
+		__loaded_vmcs_clear(v);
 }
 
 
@@ -1517,7 +1546,7 @@ static void kvm_cpu_vmxoff(void)
 static void hardware_disable(void *garbage)
 {
 	if (vmm_exclusive) {
-		vmclear_local_vcpus();
+		vmclear_local_loaded_vmcss();
 		kvm_cpu_vmxoff();
 	}
 	write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -1698,6 +1727,18 @@ static void free_vmcs(struct vmcs *vmcs)
 	free_pages((unsigned long)vmcs, vmcs_config.order);
 }
 
+/*
+ * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
+ */
+static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+	if (!loaded_vmcs->vmcs)
+		return;
+	loaded_vmcs_clear(loaded_vmcs);
+	free_vmcs(loaded_vmcs->vmcs);
+	loaded_vmcs->vmcs = NULL;
+}
+
 static void free_kvm_area(void)
 {
 	int cpu;
@@ -4169,6 +4210,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmx_set_interrupt_shadow(vcpu, 0);
 
+	vmx->__launched = vmx->loaded_vmcs->launched;
 	asm(
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
@@ -4239,7 +4281,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		"pop  %%"R"bp; pop  %%"R"dx \n\t"
 		"setbe %c[fail](%0) \n\t"
 	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
-		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
+		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
 		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
 		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
 		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
@@ -4279,7 +4321,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
-	vmx->launched = 1;
+	vmx->loaded_vmcs->launched = 1;
 
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
@@ -4291,41 +4333,17 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #undef R
 #undef Q
 
-static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (vmx->vmcs) {
-		vcpu_clear(vmx);
-		free_vmcs(vmx->vmcs);
-		vmx->vmcs = NULL;
-	}
-}
-
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	free_vpid(vmx);
-	vmx_free_vmcs(vcpu);
+	free_loaded_vmcs(vmx->loaded_vmcs);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 }
 
-static inline void vmcs_init(struct vmcs *vmcs)
-{
-	u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
-
-	if (!vmm_exclusive)
-		kvm_cpu_vmxon(phys_addr);
-
-	vmcs_clear(vmcs);
-
-	if (!vmm_exclusive)
-		kvm_cpu_vmxoff();
-}
-
 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 {
 	int err;
@@ -4347,11 +4365,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		goto uninit_vcpu;
 	}
 
-	vmx->vmcs = alloc_vmcs();
-	if (!vmx->vmcs)
+	vmx->loaded_vmcs = &vmx->vmcs01;
+	vmx->loaded_vmcs->vmcs = alloc_vmcs();
+	if (!vmx->loaded_vmcs->vmcs)
 		goto free_msrs;
-
-	vmcs_init(vmx->vmcs);
+	if (!vmm_exclusive)
+		kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
+	loaded_vmcs_init(vmx->loaded_vmcs);
+	if (!vmm_exclusive)
+		kvm_cpu_vmxoff();
 
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4380,7 +4402,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	return &vmx->vcpu;
 
 free_vmcs:
-	free_vmcs(vmx->vmcs);
+	free_vmcs(vmx->loaded_vmcs->vmcs);
 free_msrs:
 	kfree(vmx->guest_msrs);
 uninit_vcpu:

From d780592b99d7d8a5ff905f6bacca519d4a342c76 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 23 May 2011 10:33:05 +0200
Subject: [PATCH 016/143] KVM: Clean up error handling during VCPU creation

So far kvm_arch_vcpu_setup is responsible for freeing the vcpu struct if
it fails. Move this confusing responsibility back into the hands of
kvm_vm_ioctl_create_vcpu. Only the x86 kvm_arch_vcpu_setup is affected;
all other archs cannot fail.
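
The resulting error paths follow the usual split-label pattern: failures
that happen before the lock is taken jump past the unlock.  A stand-alone
sketch of just that control flow (every function here is a placeholder,
not a KVM API):

    #include <stdio.h>

    static int setup(void)    { return 0; }
    static int check(void)    { return -1; }   /* pretend this step fails */
    static void lock(void)    { printf("lock\n"); }
    static void unlock(void)  { printf("unlock\n"); }
    static void destroy(void) { printf("destroy\n"); }

    static int create(void)
    {
            int r;

            r = setup();                    /* may fail before the lock is taken */
            if (r)
                    goto vcpu_destroy;

            lock();
            r = check();                    /* may fail with the lock held */
            if (r)
                    goto unlock_vcpu_destroy;

            unlock();
            return 0;

    unlock_vcpu_destroy:
            unlock();
    vcpu_destroy:
            destroy();
            return r;
    }

    int main(void)
    {
            printf("create() = %d\n", create());
            return 0;
    }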

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c  |  5 -----
 virt/kvm/kvm_main.c | 11 ++++++-----
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index da48622d170f..aaa3735004bc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6126,12 +6126,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	if (r == 0)
 		r = kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
-	if (r < 0)
-		goto free_vcpu;
 
-	return 0;
-free_vcpu:
-	kvm_x86_ops->vcpu_free(vcpu);
 	return r;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 087b3f8ca46e..fa2321ac77cc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1615,18 +1615,18 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 
 	r = kvm_arch_vcpu_setup(vcpu);
 	if (r)
-		return r;
+		goto vcpu_destroy;
 
 	mutex_lock(&kvm->lock);
 	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
 		r = -EINVAL;
-		goto vcpu_destroy;
+		goto unlock_vcpu_destroy;
 	}
 
 	kvm_for_each_vcpu(r, v, kvm)
 		if (v->vcpu_id == id) {
 			r = -EEXIST;
-			goto vcpu_destroy;
+			goto unlock_vcpu_destroy;
 		}
 
 	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
@@ -1636,7 +1636,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 	r = create_vcpu_fd(vcpu);
 	if (r < 0) {
 		kvm_put_kvm(kvm);
-		goto vcpu_destroy;
+		goto unlock_vcpu_destroy;
 	}
 
 	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
@@ -1650,8 +1650,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 	mutex_unlock(&kvm->lock);
 	return r;
 
-vcpu_destroy:
+unlock_vcpu_destroy:
 	mutex_unlock(&kvm->lock);
+vcpu_destroy:
 	kvm_arch_vcpu_destroy(vcpu);
 	return r;
 }

From adf52235b4082e67f31bf1fba36f1dce312633d6 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 25 May 2011 11:06:16 +0900
Subject: [PATCH 017/143] KVM: x86 emulator: Clean up init_emulate_ctxt()

Use a local pointer to the emulate_ctxt for simplicity.  Then, arrange
the hard-to-read mode selection lines neatly.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index aaa3735004bc..ae2353c50208 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4508,7 +4508,8 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 
 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 {
-	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+	struct decode_cache *c = &ctxt->decode;
 	int cs_db, cs_l;
 
 	/*
@@ -4521,15 +4522,15 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 
 	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
-	vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
-	vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
-	vcpu->arch.emulate_ctxt.mode =
-		(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
-		(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-		? X86EMUL_MODE_VM86 : cs_l
-		? X86EMUL_MODE_PROT64 :	cs_db
-		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-	vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu);
+	ctxt->eflags = kvm_get_rflags(vcpu);
+	ctxt->eip = kvm_rip_read(vcpu);
+	ctxt->mode = (!is_protmode(vcpu))		? X86EMUL_MODE_REAL :
+		     (ctxt->eflags & X86_EFLAGS_VM)	? X86EMUL_MODE_VM86 :
+		     cs_l				? X86EMUL_MODE_PROT64 :
+		     cs_db				? X86EMUL_MODE_PROT32 :
+							  X86EMUL_MODE_PROT16;
+	ctxt->guest_mode = is_guest_mode(vcpu);
+
 	memset(c, 0, sizeof(struct decode_cache));
 	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
 	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;

From b5c9ff731f3cee5a2f2d7154f48f8006b48eb66d Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 25 May 2011 11:09:38 +0900
Subject: [PATCH 018/143] KVM: x86 emulator: Avoid clearing the whole
 decode_cache

While tracing the emulator, we noticed that init_emulate_ctxt()
sometimes took a bit longer than we expected.

This patch mitigates the problem to some degree.

By looking into the function, we soon notice that it clears the whole
decode_cache whose size is about 2.5K bytes now.  Furthermore, most of
the bytes are taken for the two read_cache arrays, which are used only
by a few instructions.

Since we do not assume the cache arrays have been cleared when we store
actual data, we do not need to clear them: a 2K-byte saving.  In addition,
we can avoid clearing the fetch_cache and regs arrays.

This patch changes the initialization not to clear the arrays.

On our 64-bit host, init_emulate_ctxt() becomes 0.3 to 0.5us faster with
this patch applied.
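
The change relies only on field ordering plus a partial memset up to the
first field that need not be cleared, the same offsetof() trick visible in
init_decode_cache() in the diff below.  A stand-alone sketch with a made-up
structure:

    #include <stdio.h>
    #include <string.h>
    #include <stddef.h>

    /* made-up structure; only the layout trick matters */
    struct cache {
            int a;
            int b;
            /* Fields above big[] are cleared together. */
            unsigned long big[256];     /* callers overwrite this before reading it */
    };

    static void init_cache(struct cache *c)
    {
            /* clear only the prefix instead of the whole structure */
            memset(c, 0, offsetof(struct cache, big));
    }

    int main(void)
    {
            static struct cache c;

            c.a = 42;
            c.big[0] = 7;
            init_cache(&c);
            printf("a=%d big[0]=%lu, cleared %zu of %zu bytes\n",
                   c.a, c.big[0], offsetof(struct cache, big), sizeof(c));
            return 0;
    }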

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  5 +++--
 arch/x86/kvm/x86.c                 | 17 +++++++++++++++--
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index ab09ba290db3..c0f77e09ebce 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -246,8 +246,6 @@ struct decode_cache {
 	unsigned int d;
 	int (*execute)(struct x86_emulate_ctxt *ctxt);
 	int (*check_perm)(struct x86_emulate_ctxt *ctxt);
-	unsigned long regs[NR_VCPU_REGS];
-	unsigned long eip;
 	/* modrm */
 	u8 modrm;
 	u8 modrm_mod;
@@ -255,6 +253,9 @@ struct decode_cache {
 	u8 modrm_rm;
 	u8 modrm_seg;
 	bool rip_relative;
+	unsigned long eip;
+	/* Fields above regs are cleared together. */
+	unsigned long regs[NR_VCPU_REGS];
 	struct fetch_cache fetch;
 	struct read_cache io_read;
 	struct read_cache mem_read;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ae2353c50208..d88de565d0c0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4506,6 +4506,20 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 		kvm_queue_exception(vcpu, ctxt->exception.vector);
 }
 
+static void init_decode_cache(struct decode_cache *c,
+			      const unsigned long *regs)
+{
+	memset(c, 0, offsetof(struct decode_cache, regs));
+	memcpy(c->regs, regs, sizeof(c->regs));
+
+	c->fetch.start = 0;
+	c->fetch.end = 0;
+	c->io_read.pos = 0;
+	c->io_read.end = 0;
+	c->mem_read.pos = 0;
+	c->mem_read.end = 0;
+}
+
 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
@@ -4531,8 +4545,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 							  X86EMUL_MODE_PROT16;
 	ctxt->guest_mode = is_guest_mode(vcpu);
 
-	memset(c, 0, sizeof(struct decode_cache));
-	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+	init_decode_cache(c, vcpu->arch.regs);
 	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
 }
 

From 801d342432190947928e18f893f073fd87cd8bdf Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:02:23 +0300
Subject: [PATCH 019/143] KVM: nVMX: Add "nested" module option to kvm_intel

This patch adds to kvm_intel a module option "nested". This option controls
whether the guest can use VMX instructions, i.e., whether we allow nested
virtualization. A similar, but separate, option already exists for the
SVM module.

This option currently defaults to 0, meaning that nested VMX must be
explicitly enabled by giving nested=1. When nested VMX matures, it should
probably be enabled by default, just as nested SVM already is.
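
As a usage note (this is just the standard module parameter mechanics,
nothing specific to this patch): nested VMX is turned on with
"modprobe kvm_intel nested=1", or with kvm_intel.nested=1 on the kernel
command line when kvm_intel is built in, and the current value can be read
back from /sys/module/kvm_intel/parameters/nested.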

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1444e4149c4d..263be2debde8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -74,6 +74,14 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static int __read_mostly yield_on_hlt = 1;
 module_param(yield_on_hlt, bool, S_IRUGO);
 
+/*
+ * If nested=1, nested virtualization is supported, i.e., guests may use
+ * VMX and be a hypervisor for its own guests. If nested=0, guests may not
+ * use VMX instructions.
+ */
+static int __read_mostly nested = 0;
+module_param(nested, bool, S_IRUGO);
+
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\
 	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
 #define KVM_GUEST_CR0_MASK						\
@@ -1292,6 +1300,23 @@ static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 	return target_tsc - native_read_tsc();
 }
 
+static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
+}
+
+/*
+ * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
+ * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
+ * all guests if the "nested" module option is off, and can also be disabled
+ * for a single guest by disabling its VMX cpuid bit.
+ */
+static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
+{
+	return nested && guest_cpuid_has_vmx(vcpu);
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.

From ec378aeef9dfc7c4ba72e9bd6cd4bd6f7d5fd0cc Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:02:54 +0300
Subject: [PATCH 020/143] KVM: nVMX: Implement VMXON and VMXOFF

This patch allows a guest to use the VMXON and VMXOFF instructions, and
emulates them accordingly. Basically this amounts to checking some
prerequisites, and then remembering whether the guest has enabled or disabled
VMX operation.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 110 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 108 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 263be2debde8..3a727ca02f24 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -144,6 +144,15 @@ struct shared_msr_entry {
 	u64 mask;
 };
 
+/*
+ * The nested_vmx structure is part of vcpu_vmx, and holds information we need
+ * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
+ */
+struct nested_vmx {
+	/* Has the level1 guest done vmxon? */
+	bool vmxon;
+};
+
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	unsigned long         host_rsp;
@@ -203,6 +212,9 @@ struct vcpu_vmx {
 	u32 exit_reason;
 
 	bool rdtscp_enabled;
+
+	/* Support for a guest hypervisor (nested VMX) */
+	struct nested_vmx nested;
 };
 
 enum segment_cache_field {
@@ -3933,6 +3945,99 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/*
+ * Emulate the VMXON instruction.
+ * Currently, we just remember that VMX is active, and do not save or even
+ * inspect the argument to VMXON (the so-called "VMXON pointer") because we
+ * do not currently need to store anything in that guest-allocated memory
+ * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
+ * argument is different from the VMXON pointer (which the spec says they do).
+ */
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/* The Intel VMX Instruction Reference lists a bunch of bits that
+	 * are prerequisite to running VMXON, most notably cr4.VMXE must be
+	 * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
+	 * Otherwise, we should fail with #UD. We test these now:
+	 */
+	if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
+	    !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
+	    (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+	if (is_long_mode(vcpu) && !cs.l) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	if (vmx_get_cpl(vcpu)) {
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	vmx->nested.vmxon = true;
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+/*
+ * Intel's VMX Instruction Reference specifies a common set of prerequisites
+ * for running VMX instructions (except VMXON, whose prerequisites are
+ * slightly different). It also specifies what exception to inject otherwise.
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.vmxon) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+	if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+	    (is_long_mode(vcpu) && !cs.l)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	if (vmx_get_cpl(vcpu)) {
+		kvm_inject_gp(vcpu, 0);
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Free whatever needs to be freed from vmx->nested when L1 goes down, or
+ * just stops using VMX.
+ */
+static void free_nested(struct vcpu_vmx *vmx)
+{
+	if (!vmx->nested.vmxon)
+		return;
+	vmx->nested.vmxon = false;
+}
+
+/* Emulate the VMXOFF instruction */
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+	free_nested(to_vmx(vcpu));
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -3961,8 +4066,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
 	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
-	[EXIT_REASON_VMOFF]                   = handle_vmx_insn,
-	[EXIT_REASON_VMON]                    = handle_vmx_insn,
+	[EXIT_REASON_VMOFF]                   = handle_vmoff,
+	[EXIT_REASON_VMON]                    = handle_vmon,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
@@ -4363,6 +4468,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	free_vpid(vmx);
+	free_nested(vmx);
 	free_loaded_vmcs(vmx->loaded_vmcs);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);

From 5e1746d6205d1efa3193cc0c67aa2d15e54799bd Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:03:24 +0300
Subject: [PATCH 021/143] KVM: nVMX: Allow setting the VMXE bit in CR4

This patch allows the guest to enable the VMXE bit in CR4, which is a
prerequisite to running VMXON.

Whether to allow setting the VMXE bit now depends on the architecture (svm
or vmx), so its checking has moved to kvm_x86_ops->set_cr4(). This function
now returns an int: If kvm_x86_ops->set_cr4() returns 1, __kvm_set_cr4()
will also return 1, and this will cause kvm_set_cr4() to throw a #GP.

Turning on the VMXE bit is allowed only when the nested VMX feature is
enabled, and turning it off is forbidden after a vmxon.
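
Stripped of the vendor plumbing, the new rule is small: setting VMXE is
allowed only when nested VMX is possible, and clearing it is refused once
vmxon has been done.  A stand-alone sketch of that check (the two flags
stand in for nested_vmx_allowed() and nested.vmxon; returning 1 means the
write is rejected with #GP):

    #include <stdio.h>

    #define X86_CR4_VMXE (1UL << 13)    /* architectural bit position of CR4.VMXE */

    /* returns 1 if the cr4 write must be rejected (#GP), 0 if it is accepted */
    static int cr4_vmxe_check(unsigned long cr4, int nested_allowed, int vmxon)
    {
            if (cr4 & X86_CR4_VMXE)
                    return !nested_allowed;
            return vmxon;               /* VMXE cannot be cleared after vmxon */
    }

    int main(void)
    {
            printf("set VMXE, nested off  : %d\n", cr4_vmxe_check(X86_CR4_VMXE, 0, 0));
            printf("set VMXE, nested on   : %d\n", cr4_vmxe_check(X86_CR4_VMXE, 1, 0));
            printf("clear VMXE after vmxon: %d\n", cr4_vmxe_check(0, 1, 1));
            return 0;
    }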

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/svm.c              |  6 +++++-
 arch/x86/kvm/vmx.c              | 17 +++++++++++++++--
 arch/x86/kvm/x86.c              |  4 +---
 4 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ff17deb6e98b..d167039ecdf4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -555,7 +555,7 @@ struct kvm_x86_ops {
 	void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
 	void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
-	void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+	int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
 	void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
 	void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 	void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 506e4fe23adc..475d1c948501 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1496,11 +1496,14 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	update_cr0_intercept(svm);
 }
 
-static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
 	unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
 
+	if (cr4 & X86_CR4_VMXE)
+		return 1;
+
 	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
 		svm_flush_tlb(vcpu);
 
@@ -1510,6 +1513,7 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	cr4 |= host_cr4_mce;
 	to_svm(vcpu)->vmcb->save.cr4 = cr4;
 	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+	return 0;
 }
 
 static void svm_set_segment(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3a727ca02f24..eda2cb619c25 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2121,7 +2121,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
 		  (unsigned long *)&vcpu->arch.regs_dirty);
 }
 
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 
 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
 					unsigned long cr0,
@@ -2219,11 +2219,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	vmcs_writel(GUEST_CR3, guest_cr3);
 }
 
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
 		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
+	if (cr4 & X86_CR4_VMXE) {
+		/*
+		 * To use VMXON (and later other VMX instructions), a guest
+		 * must first be able to turn on cr4.VMXE (see handle_vmon()).
+		 * So basically the check on whether to allow nested VMX
+		 * is here.
+		 */
+		if (!nested_vmx_allowed(vcpu))
+			return 1;
+	} else if (to_vmx(vcpu)->nested.vmxon)
+		return 1;
+
 	vcpu->arch.cr4 = cr4;
 	if (enable_ept) {
 		if (!is_paging(vcpu)) {
@@ -2236,6 +2248,7 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 
 	vmcs_writel(CR4_READ_SHADOW, cr4);
 	vmcs_writel(GUEST_CR4, hw_cr4);
+	return 0;
 }
 
 static void vmx_get_segment(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d88de565d0c0..460932b62c5b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -615,11 +615,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 				   kvm_read_cr3(vcpu)))
 		return 1;
 
-	if (cr4 & X86_CR4_VMXE)
+	if (kvm_x86_ops->set_cr4(vcpu, cr4))
 		return 1;
 
-	kvm_x86_ops->set_cr4(vcpu, cr4);
-
 	if ((cr4 ^ old_cr4) & pdptr_bits)
 		kvm_mmu_reset_context(vcpu);
 

From a9d30f33dd21b67b2f4db09f3dfe63a7c390d1b3 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:03:55 +0300
Subject: [PATCH 022/143] KVM: nVMX: Introduce vmcs12: a VMCS structure for L1

An implementation of VMX needs to define a VMCS structure. This structure
is kept in guest memory, but is opaque to the guest (who can only read or
write it with VMX instructions).

This patch starts to define the VMCS structure which our nested VMX
implementation will present to L1. We call it "vmcs12", as it is the VMCS
that L1 keeps for its L2 guest. We will add more content to this structure
in later patches.

This patch also adds the notion (as required by the VMX spec) of L1's "current
VMCS", and finally includes utility functions for mapping the guest-allocated
VMCSs in host memory.
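
As a rough illustration of why the structure is packed and starts with a
revision id: L1 hands us guest memory, and the fixed layout plus the id are
what let us recognize that memory as a vmcs12 across hosts.  The check below
is only a sketch of the idea; the real VMPTRLD emulation that performs it
arrives in later patches:

    #include <stdio.h>
    #include <stdint.h>

    #define VMCS12_REVISION 0x11e57ed0  /* value taken from the patch */

    /* first two fields of any VMCS region, per the VMX spec */
    struct __attribute__((packed)) vmcs12 {
            uint32_t revision_id;
            uint32_t abort;
            /* implementation-specific fields are added in later patches */
    };

    /* a guest-supplied region is only usable if its revision id matches */
    static int vmcs12_revision_ok(const struct vmcs12 *v)
    {
            return v->revision_id == VMCS12_REVISION;
    }

    int main(void)
    {
            struct vmcs12 good = { VMCS12_REVISION, 0 };
            struct vmcs12 bad  = { 0xdeadbeef, 0 };

            printf("good: %d  bad: %d\n",
                   vmcs12_revision_ok(&good), vmcs12_revision_ok(&bad));
            return 0;
    }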

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index eda2cb619c25..914dc4e9b37f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -144,6 +144,41 @@ struct shared_msr_entry {
 	u64 mask;
 };
 
+/*
+ * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
+ * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
+ * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
+ * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
+ * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
+ * More than one of these structures may exist, if L1 runs multiple L2 guests.
+ * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * underlying hardware which will be used to run L2.
+ * This structure is packed to ensure that its layout is identical across
+ * machines (necessary for live migration).
+ * If there are changes in this struct, VMCS12_REVISION must be changed.
+ */
+struct __packed vmcs12 {
+	/* According to the Intel spec, a VMCS region must start with the
+	 * following two fields. Then follow implementation-specific data.
+	 */
+	u32 revision_id;
+	u32 abort;
+};
+
+/*
+ * VMCS12_REVISION is an arbitrary id that should be changed if the content or
+ * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
+ * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ */
+#define VMCS12_REVISION 0x11e57ed0
+
+/*
+ * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
+ * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used by the
+ * current implementation, 4K are reserved to avoid future complications.
+ */
+#define VMCS12_SIZE 0x1000
+
 /*
  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -151,6 +186,12 @@ struct shared_msr_entry {
 struct nested_vmx {
 	/* Has the level1 guest done vmxon? */
 	bool vmxon;
+
+	/* The guest-physical address of the current VMCS L1 keeps for L2 */
+	gpa_t current_vmptr;
+	/* The host-usable pointer to the above */
+	struct page *current_vmcs12_page;
+	struct vmcs12 *current_vmcs12;
 };
 
 struct vcpu_vmx {
@@ -231,6 +272,31 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.current_vmcs12;
+}
+
+static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+	struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
+	if (is_error_page(page)) {
+		kvm_release_page_clean(page);
+		return NULL;
+	}
+	return page;
+}
+
+static void nested_release_page(struct page *page)
+{
+	kvm_release_page_dirty(page);
+}
+
+static void nested_release_page_clean(struct page *page)
+{
+	kvm_release_page_clean(page);
+}
+
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
@@ -4039,6 +4105,12 @@ static void free_nested(struct vcpu_vmx *vmx)
 	if (!vmx->nested.vmxon)
 		return;
 	vmx->nested.vmxon = false;
+	if (vmx->nested.current_vmptr != -1ull) {
+		kunmap(vmx->nested.current_vmcs12_page);
+		nested_release_page(vmx->nested.current_vmcs12_page);
+		vmx->nested.current_vmptr = -1ull;
+		vmx->nested.current_vmcs12 = NULL;
+	}
 }
 
 /* Emulate the VMXOFF instruction */
@@ -4543,6 +4615,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 			goto free_vmcs;
 	}
 
+	vmx->nested.current_vmptr = -1ull;
+	vmx->nested.current_vmcs12 = NULL;
+
 	return &vmx->vcpu;
 
 free_vmcs:

From b87a51ae2893a5907f796eadb4beb60747a69209 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:04:25 +0300
Subject: [PATCH 023/143] KVM: nVMX: Implement reading and writing of VMX MSRs

When the guest can use VMX instructions (when the "nested" module option is
on), it should also be able to read and write VMX MSRs, e.g., to query about
VMX capabilities. This patch adds this support.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/msr-index.h |  12 ++
 arch/x86/kvm/vmx.c               | 219 +++++++++++++++++++++++++++++++
 2 files changed, 231 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 485b4f1f079b..e3022ccff33b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -438,6 +438,18 @@
 #define MSR_IA32_VMX_VMCS_ENUM          0x0000048a
 #define MSR_IA32_VMX_PROCBASED_CTLS2    0x0000048b
 #define MSR_IA32_VMX_EPT_VPID_CAP       0x0000048c
+#define MSR_IA32_VMX_TRUE_PINBASED_CTLS  0x0000048d
+#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
+#define MSR_IA32_VMX_TRUE_EXIT_CTLS      0x0000048f
+#define MSR_IA32_VMX_TRUE_ENTRY_CTLS     0x00000490
+
+/* VMX_BASIC bits and bitmasks */
+#define VMX_BASIC_VMCS_SIZE_SHIFT	32
+#define VMX_BASIC_64		0x0001000000000000LLU
+#define VMX_BASIC_MEM_TYPE_SHIFT	50
+#define VMX_BASIC_MEM_TYPE_MASK	0x003c000000000000LLU
+#define VMX_BASIC_MEM_TYPE_WB	6LLU
+#define VMX_BASIC_INOUT		0x0040000000000000LLU
 
 /* AMD-V MSRs */
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 914dc4e9b37f..487952b20217 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1395,6 +1395,218 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
 	return nested && guest_cpuid_has_vmx(vcpu);
 }
 
+/*
+ * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
+ * returned for the various VMX controls MSRs when nested VMX is enabled.
+ * The same values should also be used to verify that vmcs12 control fields are
+ * valid during nested entry from L1 to L2.
+ * Each of these control msrs has a low and high 32-bit half: A low bit is on
+ * if the corresponding bit in the (32-bit) control field *must* be on, and a
+ * bit in the high half is on if the corresponding bit in the control field
+ * may be on. See also vmx_control_verify().
+ * TODO: allow these variables to be modified (downgraded) by module options
+ * or other means.
+ */
+static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
+static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
+static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
+static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
+static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static __init void nested_vmx_setup_ctls_msrs(void)
+{
+	/*
+	 * Note that as a general rule, the high half of the MSRs (bits in
+	 * the control fields which may be 1) should be initialized by the
+	 * intersection of the underlying hardware's MSR (i.e., features which
+	 * can be supported) and the list of features we want to expose -
+	 * because they are known to be properly supported in our code.
+	 * Also, usually, the low half of the MSRs (bits which must be 1) can
+	 * be set to 0, meaning that L1 may turn off any of these bits. The
+	 * reason is that if one of these bits is necessary, it will appear
+	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
+	 * fields of vmcs01 and vmcs12, will turn these bits on - and
+	 * nested_vmx_exit_handled() will not pass related exits to L1.
+	 * These rules have exceptions below.
+	 */
+
+	/* pin-based controls */
+	/*
+	 * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
+	 * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
+	 */
+	nested_vmx_pinbased_ctls_low = 0x16;
+	nested_vmx_pinbased_ctls_high = 0x16 |
+		PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
+		PIN_BASED_VIRTUAL_NMIS;
+
+	/* exit controls */
+	nested_vmx_exit_ctls_low = 0;
+#ifdef CONFIG_X86_64
+	nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#else
+	nested_vmx_exit_ctls_high = 0;
+#endif
+
+	/* entry controls */
+	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
+		nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
+	nested_vmx_entry_ctls_low = 0;
+	nested_vmx_entry_ctls_high &=
+		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
+
+	/* cpu-based controls */
+	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
+		nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
+	nested_vmx_procbased_ctls_low = 0;
+	nested_vmx_procbased_ctls_high &=
+		CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
+		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
+		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
+		CPU_BASED_CR3_STORE_EXITING |
+#ifdef CONFIG_X86_64
+		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
+#endif
+		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
+		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
+		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+	/*
+	 * We can allow some features even when not supported by the
+	 * hardware. For example, L1 can specify an MSR bitmap - and we
+	 * can use it to avoid exits to L1 - even when L0 runs L2
+	 * without MSR bitmaps.
+	 */
+	nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
+
+	/* secondary cpu-based controls */
+	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+		nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
+	nested_vmx_secondary_ctls_low = 0;
+	nested_vmx_secondary_ctls_high &=
+		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
+{
+	/*
+	 * Bits that are 0 in high must be 0 in control; bits 1 in low must be 1.
+	 */
+	return ((control & high) | low) == control;
+}
+
+static inline u64 vmx_control_msr(u32 low, u32 high)
+{
+	return low | ((u64)high << 32);
+}
+
+/*
+ * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
+ * also let it use VMX-specific MSRs.
+ * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
+ * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
+ * like all other MSRs).
+ */
+static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+	if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
+		     msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
+		/*
+		 * According to the spec, processors which do not support VMX
+		 * should throw a #GP(0) when VMX capability MSRs are read.
+		 */
+		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+		return 1;
+	}
+
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_BASIC:
+		/*
+		 * This MSR reports some information about VMX support. We
+		 * should return information about the VMX we emulate for the
+		 * guest, and the VMCS structure we give it - not about the
+		 * VMX support of the underlying hardware.
+		 */
+		*pdata = VMCS12_REVISION |
+			   ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
+			   (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+		break;
+	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+	case MSR_IA32_VMX_PINBASED_CTLS:
+		*pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
+					nested_vmx_pinbased_ctls_high);
+		break;
+	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+		*pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
+					nested_vmx_procbased_ctls_high);
+		break;
+	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+	case MSR_IA32_VMX_EXIT_CTLS:
+		*pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
+					nested_vmx_exit_ctls_high);
+		break;
+	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+	case MSR_IA32_VMX_ENTRY_CTLS:
+		*pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
+					nested_vmx_entry_ctls_high);
+		break;
+	case MSR_IA32_VMX_MISC:
+		*pdata = 0;
+		break;
+	/*
+	 * These MSRs specify bits which the guest must keep fixed (on or off)
+	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
+	 * We picked the standard core2 setting.
+	 */
+#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
+	case MSR_IA32_VMX_CR0_FIXED0:
+		*pdata = VMXON_CR0_ALWAYSON;
+		break;
+	case MSR_IA32_VMX_CR0_FIXED1:
+		*pdata = -1ULL;
+		break;
+	case MSR_IA32_VMX_CR4_FIXED0:
+		*pdata = VMXON_CR4_ALWAYSON;
+		break;
+	case MSR_IA32_VMX_CR4_FIXED1:
+		*pdata = -1ULL;
+		break;
+	case MSR_IA32_VMX_VMCS_ENUM:
+		*pdata = 0x1f;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		*pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
+					nested_vmx_secondary_ctls_high);
+		break;
+	case MSR_IA32_VMX_EPT_VPID_CAP:
+		/* Currently, no nested ept or nested vpid */
+		*pdata = 0;
+		break;
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+	if (!nested_vmx_allowed(vcpu))
+		return 0;
+
+	if (msr_index == MSR_IA32_FEATURE_CONTROL)
+		/* TODO: the right thing. */
+		return 1;
+	/*
+	 * No need to treat VMX capability MSRs specially: If we don't handle
+	 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
+	 */
+	return 0;
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -1443,6 +1655,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		/* Otherwise falls through */
 	default:
 		vmx_load_host_state(to_vmx(vcpu));
+		if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
+			return 0;
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
 			vmx_load_host_state(to_vmx(vcpu));
@@ -1514,6 +1728,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 			return 1;
 		/* Otherwise falls through */
 	default:
+		if (vmx_set_vmx_msr(vcpu, msr_index, data))
+			break;
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
 			vmx_load_host_state(vmx);
@@ -1902,6 +2118,9 @@ static __init int hardware_setup(void)
 	if (!cpu_has_vmx_ple())
 		ple_gap = 0;
 
+	if (nested)
+		nested_vmx_setup_ctls_msrs();
+
 	return alloc_kvm_area();
 }
 

From 064aea774768749c6fd308b37818ea3a9600583d Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:04:56 +0300
Subject: [PATCH 024/143] KVM: nVMX: Decoding memory operands of VMX
 instructions

This patch includes a utility function for decoding pointer operands of VMX
instructions issued by L1 (a guest hypervisor).

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c |  3 ++-
 arch/x86/kvm/x86.h |  4 ++++
 3 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 487952b20217..74afd8660930 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4342,6 +4342,59 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/*
+ * Decode the memory-address operand of a vmx instruction, as recorded on an
+ * exit caused by such an instruction (run by a guest hypervisor).
+ * On success, returns 0. When the operand is invalid, returns 1 and throws
+ * #UD or #GP.
+ */
+static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
+				 unsigned long exit_qualification,
+				 u32 vmx_instruction_info, gva_t *ret)
+{
+	/*
+	 * According to Vol. 3B, "Information for VM Exits Due to Instruction
+	 * Execution", on an exit, vmx_instruction_info holds most of the
+	 * addressing components of the operand. Only the displacement part
+	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
+	 * For how an actual address is calculated from all these components,
+	 * refer to Vol. 1, "Operand Addressing".
+	 */
+	int  scaling = vmx_instruction_info & 3;
+	int  addr_size = (vmx_instruction_info >> 7) & 7;
+	bool is_reg = vmx_instruction_info & (1u << 10);
+	int  seg_reg = (vmx_instruction_info >> 15) & 7;
+	int  index_reg = (vmx_instruction_info >> 18) & 0xf;
+	bool index_is_valid = !(vmx_instruction_info & (1u << 22));
+	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
+	bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
+
+	if (is_reg) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	/* Addr = segment_base + offset */
+	/* offset = base + [index * scale] + displacement */
+	*ret = vmx_get_segment_base(vcpu, seg_reg);
+	if (base_is_valid)
+		*ret += kvm_register_read(vcpu, base_reg);
+	if (index_is_valid)
+		*ret += kvm_register_read(vcpu, index_reg) << scaling;
+	*ret += exit_qualification; /* holds the displacement */
+
+	if (addr_size == 1) /* 32 bit */
+		*ret &= 0xffffffff;
+
+	/*
+	 * TODO: throw #GP (and return 1) in various cases that the VM*
+	 * instructions require it - e.g., offset beyond segment limit,
+	 * unusable or unreadable/unwritable segment, non-canonical 64-bit
+	 * address, and so on. Currently these are not checked.
+	 */
+	return 0;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 460932b62c5b..27d12a3b1890 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3848,7 +3848,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
 					  exception);
 }
 
-static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
+int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 			       gva_t addr, void *val, unsigned int bytes,
 			       struct x86_exception *exception)
 {
@@ -3858,6 +3858,7 @@ static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
 					  exception);
 }
+EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 
 static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 				      gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e407ed3df817..6f150539b262 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -81,4 +81,8 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
 
+int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
+	gva_t addr, void *val, unsigned int bytes,
+	struct x86_exception *exception);
+
 #endif

From ff2f6fe9618806d365f76a6f11a4f37881919412 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:05:27 +0300
Subject: [PATCH 025/143] KVM: nVMX: Introduce vmcs02: VMCS used to run L2

We saw in a previous patch that L1 controls its L2 guest with a vmcs12.
L0 needs to create a real VMCS for running L2. We call that "vmcs02".
A later patch will contain the code, prepare_vmcs02(), for filling the vmcs02
fields. This patch only contains code for allocating vmcs02.

In this version, prepare_vmcs02() sets *all* of vmcs02's fields each time we
enter from L1 to L2, so keeping just one vmcs02 for the vcpu is enough: It can
be reused even when L1 runs multiple L2 guests. However, in future versions
we'll probably want to add an optimization where vmcs02 fields that rarely
change will not be set each time. For that, we may want to keep around several
vmcs02s of L2 guests that have recently run, so that potentially we could run
these L2s again more quickly because fewer vmwrites to vmcs02 will be needed.

This patch adds to each vcpu a vmcs02 pool, vmx->nested.vmcs02_pool,
which remembers the vmcs02s last used to run up to VMCS02_POOL_SIZE L2s.
As explained above, in the current version we choose VMCS02_POOL_SIZE=1,
i.e., one vmcs02 is allocated (and loaded onto the processor), and it is
reused to enter any L2 guest. In the future, when prepare_vmcs02() is
optimized not to set all fields every time, VMCS02_POOL_SIZE should be
increased.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 100 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 74afd8660930..03b29aca6e7f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -119,6 +119,7 @@ static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
 module_param(ple_window, int, S_IRUGO);
 
 #define NR_AUTOLOAD_MSRS 1
+#define VMCS02_POOL_SIZE 1
 
 struct vmcs {
 	u32 revision_id;
@@ -179,6 +180,13 @@ struct __packed vmcs12 {
  */
 #define VMCS12_SIZE 0x1000
 
+/* Used to remember the last vmcs02 used for some recently used vmcs12s */
+struct vmcs02_list {
+	struct list_head list;
+	gpa_t vmptr;
+	struct loaded_vmcs vmcs02;
+};
+
 /*
  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -192,6 +200,10 @@ struct nested_vmx {
 	/* The host-usable pointer to the above */
 	struct page *current_vmcs12_page;
 	struct vmcs12 *current_vmcs12;
+
+	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
+	struct list_head vmcs02_pool;
+	int vmcs02_num;
 };
 
 struct vcpu_vmx {
@@ -4243,6 +4255,89 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/*
+ * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
+ * We could reuse a single VMCS for all the L2 guests, but we also want the
+ * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
+ * allows keeping them loaded on the processor, and in the future will allow
+ * optimizations where prepare_vmcs02 doesn't need to set all the fields on
+ * every entry if they never change.
+ * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
+ * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first.
+ *
+ * The following functions allocate and free a vmcs02 in this pool.
+ */
+
+/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
+static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
+{
+	struct vmcs02_list *item;
+	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+		if (item->vmptr == vmx->nested.current_vmptr) {
+			list_move(&item->list, &vmx->nested.vmcs02_pool);
+			return &item->vmcs02;
+		}
+
+	if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
+		/* Recycle the least recently used VMCS. */
+		item = list_entry(vmx->nested.vmcs02_pool.prev,
+			struct vmcs02_list, list);
+		item->vmptr = vmx->nested.current_vmptr;
+		list_move(&item->list, &vmx->nested.vmcs02_pool);
+		return &item->vmcs02;
+	}
+
+	/* Create a new VMCS */
+	item = (struct vmcs02_list *)
+		kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+	if (!item)
+		return NULL;
+	item->vmcs02.vmcs = alloc_vmcs();
+	if (!item->vmcs02.vmcs) {
+		kfree(item);
+		return NULL;
+	}
+	loaded_vmcs_init(&item->vmcs02);
+	item->vmptr = vmx->nested.current_vmptr;
+	list_add(&(item->list), &(vmx->nested.vmcs02_pool));
+	vmx->nested.vmcs02_num++;
+	return &item->vmcs02;
+}
+
+/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
+static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+	struct vmcs02_list *item;
+	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+		if (item->vmptr == vmptr) {
+			free_loaded_vmcs(&item->vmcs02);
+			list_del(&item->list);
+			kfree(item);
+			vmx->nested.vmcs02_num--;
+			return;
+		}
+}
+
+/*
+ * Free all VMCSs saved for this vcpu, except the one pointed by
+ * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
+ * currently used, if running L2), and vmcs01 when running L2.
+ */
+static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
+{
+	struct vmcs02_list *item, *n;
+	list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
+		if (vmx->loaded_vmcs != &item->vmcs02)
+			free_loaded_vmcs(&item->vmcs02);
+		list_del(&item->list);
+		kfree(item);
+	}
+	vmx->nested.vmcs02_num = 0;
+
+	if (vmx->loaded_vmcs != &vmx->vmcs01)
+		free_loaded_vmcs(&vmx->vmcs01);
+}
+
 /*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
@@ -4279,6 +4374,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
+	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
+	vmx->nested.vmcs02_num = 0;
+
 	vmx->nested.vmxon = true;
 
 	skip_emulated_instruction(vcpu);
@@ -4330,6 +4428,8 @@ static void free_nested(struct vcpu_vmx *vmx)
 		vmx->nested.current_vmptr = -1ull;
 		vmx->nested.current_vmcs12 = NULL;
 	}
+
+	nested_free_all_saved_vmcss(vmx);
 }
 
 /* Emulate the VMXOFF instruction */

From 22bd035868b06a614debf7352c09fb3efdc7c269 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:05:57 +0300
Subject: [PATCH 026/143] KVM: nVMX: Add VMCS fields to the vmcs12

In this patch we add to vmcs12 (the VMCS that L1 keeps for L2) all the
standard VMCS fields.

Later patches will enable L1 to read and write these fields using VMREAD/
VMWRITE, and they will be used during a VMLAUNCH/VMRESUME in preparing vmcs02,
a hardware VMCS for running L2.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 281 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 281 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 03b29aca6e7f..33476eb152ea 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -158,12 +158,150 @@ struct shared_msr_entry {
  * machines (necessary for live migration).
  * If there are changes in this struct, VMCS12_REVISION must be changed.
  */
+typedef u64 natural_width;
 struct __packed vmcs12 {
 	/* According to the Intel spec, a VMCS region must start with the
 	 * following two fields. Then follow implementation-specific data.
 	 */
 	u32 revision_id;
 	u32 abort;
+
+	u64 io_bitmap_a;
+	u64 io_bitmap_b;
+	u64 msr_bitmap;
+	u64 vm_exit_msr_store_addr;
+	u64 vm_exit_msr_load_addr;
+	u64 vm_entry_msr_load_addr;
+	u64 tsc_offset;
+	u64 virtual_apic_page_addr;
+	u64 apic_access_addr;
+	u64 ept_pointer;
+	u64 guest_physical_address;
+	u64 vmcs_link_pointer;
+	u64 guest_ia32_debugctl;
+	u64 guest_ia32_pat;
+	u64 guest_ia32_efer;
+	u64 guest_ia32_perf_global_ctrl;
+	u64 guest_pdptr0;
+	u64 guest_pdptr1;
+	u64 guest_pdptr2;
+	u64 guest_pdptr3;
+	u64 host_ia32_pat;
+	u64 host_ia32_efer;
+	u64 host_ia32_perf_global_ctrl;
+	u64 padding64[8]; /* room for future expansion */
+	/*
+	 * To allow migration of L1 (complete with its L2 guests) between
+	 * machines of different natural widths (32 or 64 bit), we cannot have
+	 * unsigned long fields with no explicit size. We use u64 (aliased
+	 * natural_width) instead. Luckily, x86 is little-endian.
+	 */
+	natural_width cr0_guest_host_mask;
+	natural_width cr4_guest_host_mask;
+	natural_width cr0_read_shadow;
+	natural_width cr4_read_shadow;
+	natural_width cr3_target_value0;
+	natural_width cr3_target_value1;
+	natural_width cr3_target_value2;
+	natural_width cr3_target_value3;
+	natural_width exit_qualification;
+	natural_width guest_linear_address;
+	natural_width guest_cr0;
+	natural_width guest_cr3;
+	natural_width guest_cr4;
+	natural_width guest_es_base;
+	natural_width guest_cs_base;
+	natural_width guest_ss_base;
+	natural_width guest_ds_base;
+	natural_width guest_fs_base;
+	natural_width guest_gs_base;
+	natural_width guest_ldtr_base;
+	natural_width guest_tr_base;
+	natural_width guest_gdtr_base;
+	natural_width guest_idtr_base;
+	natural_width guest_dr7;
+	natural_width guest_rsp;
+	natural_width guest_rip;
+	natural_width guest_rflags;
+	natural_width guest_pending_dbg_exceptions;
+	natural_width guest_sysenter_esp;
+	natural_width guest_sysenter_eip;
+	natural_width host_cr0;
+	natural_width host_cr3;
+	natural_width host_cr4;
+	natural_width host_fs_base;
+	natural_width host_gs_base;
+	natural_width host_tr_base;
+	natural_width host_gdtr_base;
+	natural_width host_idtr_base;
+	natural_width host_ia32_sysenter_esp;
+	natural_width host_ia32_sysenter_eip;
+	natural_width host_rsp;
+	natural_width host_rip;
+	natural_width paddingl[8]; /* room for future expansion */
+	u32 pin_based_vm_exec_control;
+	u32 cpu_based_vm_exec_control;
+	u32 exception_bitmap;
+	u32 page_fault_error_code_mask;
+	u32 page_fault_error_code_match;
+	u32 cr3_target_count;
+	u32 vm_exit_controls;
+	u32 vm_exit_msr_store_count;
+	u32 vm_exit_msr_load_count;
+	u32 vm_entry_controls;
+	u32 vm_entry_msr_load_count;
+	u32 vm_entry_intr_info_field;
+	u32 vm_entry_exception_error_code;
+	u32 vm_entry_instruction_len;
+	u32 tpr_threshold;
+	u32 secondary_vm_exec_control;
+	u32 vm_instruction_error;
+	u32 vm_exit_reason;
+	u32 vm_exit_intr_info;
+	u32 vm_exit_intr_error_code;
+	u32 idt_vectoring_info_field;
+	u32 idt_vectoring_error_code;
+	u32 vm_exit_instruction_len;
+	u32 vmx_instruction_info;
+	u32 guest_es_limit;
+	u32 guest_cs_limit;
+	u32 guest_ss_limit;
+	u32 guest_ds_limit;
+	u32 guest_fs_limit;
+	u32 guest_gs_limit;
+	u32 guest_ldtr_limit;
+	u32 guest_tr_limit;
+	u32 guest_gdtr_limit;
+	u32 guest_idtr_limit;
+	u32 guest_es_ar_bytes;
+	u32 guest_cs_ar_bytes;
+	u32 guest_ss_ar_bytes;
+	u32 guest_ds_ar_bytes;
+	u32 guest_fs_ar_bytes;
+	u32 guest_gs_ar_bytes;
+	u32 guest_ldtr_ar_bytes;
+	u32 guest_tr_ar_bytes;
+	u32 guest_interruptibility_info;
+	u32 guest_activity_state;
+	u32 guest_sysenter_cs;
+	u32 host_ia32_sysenter_cs;
+	u32 padding32[8]; /* room for future expansion */
+	u16 virtual_processor_id;
+	u16 guest_es_selector;
+	u16 guest_cs_selector;
+	u16 guest_ss_selector;
+	u16 guest_ds_selector;
+	u16 guest_fs_selector;
+	u16 guest_gs_selector;
+	u16 guest_ldtr_selector;
+	u16 guest_tr_selector;
+	u16 host_es_selector;
+	u16 host_cs_selector;
+	u16 host_ss_selector;
+	u16 host_ds_selector;
+	u16 host_fs_selector;
+	u16 host_gs_selector;
+	u16 host_tr_selector;
 };
 
 /*
@@ -284,6 +422,149 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
+#define FIELD(number, name)	[number] = VMCS12_OFFSET(name)
+#define FIELD64(number, name)	[number] = VMCS12_OFFSET(name), \
+				[number##_HIGH] = VMCS12_OFFSET(name)+4
+
+static unsigned short vmcs_field_to_offset_table[] = {
+	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
+	FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
+	FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
+	FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
+	FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
+	FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
+	FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
+	FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+	FIELD(HOST_ES_SELECTOR, host_es_selector),
+	FIELD(HOST_CS_SELECTOR, host_cs_selector),
+	FIELD(HOST_SS_SELECTOR, host_ss_selector),
+	FIELD(HOST_DS_SELECTOR, host_ds_selector),
+	FIELD(HOST_FS_SELECTOR, host_fs_selector),
+	FIELD(HOST_GS_SELECTOR, host_gs_selector),
+	FIELD(HOST_TR_SELECTOR, host_tr_selector),
+	FIELD64(IO_BITMAP_A, io_bitmap_a),
+	FIELD64(IO_BITMAP_B, io_bitmap_b),
+	FIELD64(MSR_BITMAP, msr_bitmap),
+	FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
+	FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
+	FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
+	FIELD64(TSC_OFFSET, tsc_offset),
+	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
+	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+	FIELD64(EPT_POINTER, ept_pointer),
+	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
+	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
+	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
+	FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
+	FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
+	FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
+	FIELD64(GUEST_PDPTR0, guest_pdptr0),
+	FIELD64(GUEST_PDPTR1, guest_pdptr1),
+	FIELD64(GUEST_PDPTR2, guest_pdptr2),
+	FIELD64(GUEST_PDPTR3, guest_pdptr3),
+	FIELD64(HOST_IA32_PAT, host_ia32_pat),
+	FIELD64(HOST_IA32_EFER, host_ia32_efer),
+	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
+	FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
+	FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
+	FIELD(EXCEPTION_BITMAP, exception_bitmap),
+	FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
+	FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
+	FIELD(CR3_TARGET_COUNT, cr3_target_count),
+	FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
+	FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
+	FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
+	FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
+	FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
+	FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
+	FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
+	FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
+	FIELD(TPR_THRESHOLD, tpr_threshold),
+	FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
+	FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
+	FIELD(VM_EXIT_REASON, vm_exit_reason),
+	FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
+	FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
+	FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
+	FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
+	FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
+	FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
+	FIELD(GUEST_ES_LIMIT, guest_es_limit),
+	FIELD(GUEST_CS_LIMIT, guest_cs_limit),
+	FIELD(GUEST_SS_LIMIT, guest_ss_limit),
+	FIELD(GUEST_DS_LIMIT, guest_ds_limit),
+	FIELD(GUEST_FS_LIMIT, guest_fs_limit),
+	FIELD(GUEST_GS_LIMIT, guest_gs_limit),
+	FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
+	FIELD(GUEST_TR_LIMIT, guest_tr_limit),
+	FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
+	FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
+	FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
+	FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
+	FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
+	FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
+	FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
+	FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
+	FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
+	FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
+	FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
+	FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
+	FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
+	FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
+	FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
+	FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
+	FIELD(CR0_READ_SHADOW, cr0_read_shadow),
+	FIELD(CR4_READ_SHADOW, cr4_read_shadow),
+	FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
+	FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
+	FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
+	FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
+	FIELD(EXIT_QUALIFICATION, exit_qualification),
+	FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
+	FIELD(GUEST_CR0, guest_cr0),
+	FIELD(GUEST_CR3, guest_cr3),
+	FIELD(GUEST_CR4, guest_cr4),
+	FIELD(GUEST_ES_BASE, guest_es_base),
+	FIELD(GUEST_CS_BASE, guest_cs_base),
+	FIELD(GUEST_SS_BASE, guest_ss_base),
+	FIELD(GUEST_DS_BASE, guest_ds_base),
+	FIELD(GUEST_FS_BASE, guest_fs_base),
+	FIELD(GUEST_GS_BASE, guest_gs_base),
+	FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
+	FIELD(GUEST_TR_BASE, guest_tr_base),
+	FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
+	FIELD(GUEST_IDTR_BASE, guest_idtr_base),
+	FIELD(GUEST_DR7, guest_dr7),
+	FIELD(GUEST_RSP, guest_rsp),
+	FIELD(GUEST_RIP, guest_rip),
+	FIELD(GUEST_RFLAGS, guest_rflags),
+	FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
+	FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
+	FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
+	FIELD(HOST_CR0, host_cr0),
+	FIELD(HOST_CR3, host_cr3),
+	FIELD(HOST_CR4, host_cr4),
+	FIELD(HOST_FS_BASE, host_fs_base),
+	FIELD(HOST_GS_BASE, host_gs_base),
+	FIELD(HOST_TR_BASE, host_tr_base),
+	FIELD(HOST_GDTR_BASE, host_gdtr_base),
+	FIELD(HOST_IDTR_BASE, host_idtr_base),
+	FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
+	FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
+	FIELD(HOST_RSP, host_rsp),
+	FIELD(HOST_RIP, host_rip),
+};
+static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
+
+static inline short vmcs_field_to_offset(unsigned long field)
+{
+	if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
+		return -1;
+	return vmcs_field_to_offset_table[field];
+}
+
 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 {
 	return to_vmx(vcpu)->nested.current_vmcs12;

From 0140caea3b9972f826416a796271f17b42cbe827 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:06:28 +0300
Subject: [PATCH 027/143] KVM: nVMX: Success/failure of VMX instructions.

VMX instructions specify success or failure by setting certain RFLAGS bits.
This patch contains common functions to do this, and they will be used in
the following patches which emulate the various VMX instructions.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/vmx.h | 31 +++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx.c         | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 84471b810460..37690bd580c8 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -426,4 +426,35 @@ struct vmx_msr_entry {
 	u64 value;
 } __aligned(16);
 
+/*
+ * VM-instruction error numbers
+ */
+enum vm_instruction_error_number {
+	VMXERR_VMCALL_IN_VMX_ROOT_OPERATION = 1,
+	VMXERR_VMCLEAR_INVALID_ADDRESS = 2,
+	VMXERR_VMCLEAR_VMXON_POINTER = 3,
+	VMXERR_VMLAUNCH_NONCLEAR_VMCS = 4,
+	VMXERR_VMRESUME_NONLAUNCHED_VMCS = 5,
+	VMXERR_VMRESUME_AFTER_VMXOFF = 6,
+	VMXERR_ENTRY_INVALID_CONTROL_FIELD = 7,
+	VMXERR_ENTRY_INVALID_HOST_STATE_FIELD = 8,
+	VMXERR_VMPTRLD_INVALID_ADDRESS = 9,
+	VMXERR_VMPTRLD_VMXON_POINTER = 10,
+	VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID = 11,
+	VMXERR_UNSUPPORTED_VMCS_COMPONENT = 12,
+	VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT = 13,
+	VMXERR_VMXON_IN_VMX_ROOT_OPERATION = 15,
+	VMXERR_ENTRY_INVALID_EXECUTIVE_VMCS_POINTER = 16,
+	VMXERR_ENTRY_NONLAUNCHED_EXECUTIVE_VMCS = 17,
+	VMXERR_ENTRY_EXECUTIVE_VMCS_POINTER_NOT_VMXON_POINTER = 18,
+	VMXERR_VMCALL_NONCLEAR_VMCS = 19,
+	VMXERR_VMCALL_INVALID_VM_EXIT_CONTROL_FIELDS = 20,
+	VMXERR_VMCALL_INCORRECT_MSEG_REVISION_ID = 22,
+	VMXERR_VMXOFF_UNDER_DUAL_MONITOR_TREATMENT_OF_SMIS_AND_SMM = 23,
+	VMXERR_VMCALL_INVALID_SMM_MONITOR_FEATURES = 24,
+	VMXERR_ENTRY_INVALID_VM_EXECUTION_CONTROL_FIELDS_IN_EXECUTIVE_VMCS = 25,
+	VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS = 26,
+	VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
+};
+
 #endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 33476eb152ea..8432448fe356 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4776,6 +4776,44 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction, as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions".
+ */
+static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+{
+	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
+			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+}
+
+static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+{
+	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+			    X86_EFLAGS_SF | X86_EFLAGS_OF))
+			| X86_EFLAGS_CF);
+}
+
+static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
+					u32 vm_instruction_error)
+{
+	if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
+		/*
+		 * failValid writes the error number to the current VMCS, which
+		 * can't be done when there isn't a current VMCS.
+		 */
+		nested_vmx_failInvalid(vcpu);
+		return;
+	}
+	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+			    X86_EFLAGS_SF | X86_EFLAGS_OF))
+			| X86_EFLAGS_ZF);
+	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs

From 27d6c865211662721e6cf305706e4a3da35f12b4 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:06:59 +0300
Subject: [PATCH 028/143] KVM: nVMX: Implement VMCLEAR

This patch implements the VMCLEAR instruction.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 65 +++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c |  1 +
 2 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8432448fe356..c7600009dacb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -166,6 +166,9 @@ struct __packed vmcs12 {
 	u32 revision_id;
 	u32 abort;
 
+	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
+	u32 padding[7]; /* room for future expansion */
+
 	u64 io_bitmap_a;
 	u64 io_bitmap_b;
 	u64 msr_bitmap;
@@ -4814,6 +4817,66 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
 	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
 }
 
+/* Emulate the VMCLEAR instruction */
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	gva_t gva;
+	gpa_t vmptr;
+	struct vmcs12 *vmcs12;
+	struct page *page;
+	struct x86_exception e;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+			vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
+		return 1;
+
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
+				sizeof(vmptr), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+
+	if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
+		nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+
+	if (vmptr == vmx->nested.current_vmptr) {
+		kunmap(vmx->nested.current_vmcs12_page);
+		nested_release_page(vmx->nested.current_vmcs12_page);
+		vmx->nested.current_vmptr = -1ull;
+		vmx->nested.current_vmcs12 = NULL;
+	}
+
+	page = nested_get_page(vcpu, vmptr);
+	if (page == NULL) {
+		/*
+		 * For accurate processor emulation, VMCLEAR beyond available
+		 * physical memory should do nothing at all. However, it is
+		 * possible that a nested vmx bug, not a guest hypervisor bug,
+		 * resulted in this case, so let's shut down before doing any
+		 * more damage:
+		 */
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+		return 1;
+	}
+	vmcs12 = kmap(page);
+	vmcs12->launch_state = 0;
+	kunmap(page);
+	nested_release_page(page);
+
+	nested_free_vmcs02(vmx, vmptr);
+
+	skip_emulated_instruction(vcpu);
+	nested_vmx_succeed(vcpu);
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -4835,7 +4898,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_INVD]		      = handle_invd,
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
-	[EXIT_REASON_VMCLEAR]	              = handle_vmx_insn,
+	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
 	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 27d12a3b1890..c1b5a1817e43 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -347,6 +347,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 	vcpu->arch.cr2 = fault->address;
 	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
 }
+EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 
 void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 {

From 63846663eac783eabbc1ab7c325e41a3627d986f Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:07:29 +0300
Subject: [PATCH 029/143] KVM: nVMX: Implement VMPTRLD

This patch implements the VMPTRLD instruction.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 62 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c7600009dacb..54866f538c6f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4877,6 +4877,66 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/* Emulate the VMPTRLD instruction */
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	gva_t gva;
+	gpa_t vmptr;
+	struct x86_exception e;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+			vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
+		return 1;
+
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
+				sizeof(vmptr), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+
+	if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
+		nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+
+	if (vmx->nested.current_vmptr != vmptr) {
+		struct vmcs12 *new_vmcs12;
+		struct page *page;
+		page = nested_get_page(vcpu, vmptr);
+		if (page == NULL) {
+			nested_vmx_failInvalid(vcpu);
+			skip_emulated_instruction(vcpu);
+			return 1;
+		}
+		new_vmcs12 = kmap(page);
+		if (new_vmcs12->revision_id != VMCS12_REVISION) {
+			kunmap(page);
+			nested_release_page_clean(page);
+			nested_vmx_failValid(vcpu,
+				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+			skip_emulated_instruction(vcpu);
+			return 1;
+		}
+		if (vmx->nested.current_vmptr != -1ull) {
+			kunmap(vmx->nested.current_vmcs12_page);
+			nested_release_page(vmx->nested.current_vmcs12_page);
+		}
+
+		vmx->nested.current_vmptr = vmptr;
+		vmx->nested.current_vmcs12 = new_vmcs12;
+		vmx->nested.current_vmcs12_page = page;
+	}
+
+	nested_vmx_succeed(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -4900,7 +4960,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
-	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
+	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
 	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,

From 6a4d7550601b5b17df227959bdbec208384f729c Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:08:00 +0300
Subject: [PATCH 030/143] KVM: nVMX: Implement VMPTRST

This patch implements the VMPTRST instruction.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 28 +++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c |  3 ++-
 arch/x86/kvm/x86.h |  4 ++++
 3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 54866f538c6f..2bc521c9dabe 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4937,6 +4937,32 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/* Emulate the VMPTRST instruction */
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	gva_t vmcs_gva;
+	struct x86_exception e;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (get_vmx_mem_address(vcpu, exit_qualification,
+			vmx_instruction_info, &vmcs_gva))
+		return 1;
+	/* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
+	if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
+				 (void *)&to_vmx(vcpu)->nested.current_vmptr,
+				 sizeof(u64), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+	nested_vmx_succeed(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -4961,7 +4987,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
-	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
+	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
 	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c1b5a1817e43..de262a086863 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3869,7 +3869,7 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
 }
 
-static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
+int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 				       gva_t addr, void *val,
 				       unsigned int bytes,
 				       struct x86_exception *exception)
@@ -3901,6 +3901,7 @@ static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 out:
 	return r;
 }
+EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 
 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
 				  unsigned long addr,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6f150539b262..256da82856bd 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -85,4 +85,8 @@ int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 	gva_t addr, void *val, unsigned int bytes,
 	struct x86_exception *exception);
 
+int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
+	gva_t addr, void *val, unsigned int bytes,
+	struct x86_exception *exception);
+
 #endif

From 49f705c5324aa13bb5623b392c23996e23eabc23 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:08:30 +0300
Subject: [PATCH 031/143] KVM: nVMX: Implement VMREAD and VMWRITE

Implement the VMREAD and VMWRITE instructions. With these instructions, L1
can read from and write to the VMCS it is holding. The values are read or written
to the fields of the vmcs12 structure introduced in a previous patch.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 193 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 191 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2bc521c9dabe..84d9c93fde05 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4877,6 +4877,195 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+enum vmcs_field_type {
+	VMCS_FIELD_TYPE_U16 = 0,
+	VMCS_FIELD_TYPE_U64 = 1,
+	VMCS_FIELD_TYPE_U32 = 2,
+	VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
+};
+
+static inline int vmcs_field_type(unsigned long field)
+{
+	if (0x1 & field)	/* the *_HIGH fields are all 32 bit */
+		return VMCS_FIELD_TYPE_U32;
+	return (field >> 13) & 0x3;
+}
+
+static inline int vmcs_field_readonly(unsigned long field)
+{
+	return (((field >> 10) & 0x3) == 1);
+}
+
+/*
+ * Read a vmcs12 field. Since these can have varying lengths and we return
+ * one type, we chose the biggest type (u64) and zero-extend the return value
+ * to that size. Note that the caller, handle_vmread, might need to use only
+ * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
+ * 64-bit fields are to be returned).
+ */
+static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
+					unsigned long field, u64 *ret)
+{
+	short offset = vmcs_field_to_offset(field);
+	char *p;
+
+	if (offset < 0)
+		return 0;
+
+	p = ((char *)(get_vmcs12(vcpu))) + offset;
+
+	switch (vmcs_field_type(field)) {
+	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+		*ret = *((natural_width *)p);
+		return 1;
+	case VMCS_FIELD_TYPE_U16:
+		*ret = *((u16 *)p);
+		return 1;
+	case VMCS_FIELD_TYPE_U32:
+		*ret = *((u32 *)p);
+		return 1;
+	case VMCS_FIELD_TYPE_U64:
+		*ret = *((u64 *)p);
+		return 1;
+	default:
+		return 0; /* can never happen. */
+	}
+}
+
+/*
+ * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
+ * used before) all generate the same failure when it is missing.
+ */
+static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	if (vmx->nested.current_vmptr == -1ull) {
+		nested_vmx_failInvalid(vcpu);
+		skip_emulated_instruction(vcpu);
+		return 0;
+	}
+	return 1;
+}
+
+static int handle_vmread(struct kvm_vcpu *vcpu)
+{
+	unsigned long field;
+	u64 field_value;
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	gva_t gva = 0;
+
+	if (!nested_vmx_check_permission(vcpu) ||
+	    !nested_vmx_check_vmcs12(vcpu))
+		return 1;
+
+	/* Decode instruction info and find the field to read */
+	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+	/* Read the field, zero-extended to a u64 field_value */
+	if (!vmcs12_read_any(vcpu, field, &field_value)) {
+		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+	/*
+	 * Now copy part of this value to register or memory, as requested.
+	 * Note that the number of bits actually copied is 32 or 64 depending
+	 * on the guest's mode (32 or 64 bit), not on the given field's length.
+	 */
+	if (vmx_instruction_info & (1u << 10)) {
+		kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
+			field_value);
+	} else {
+		if (get_vmx_mem_address(vcpu, exit_qualification,
+				vmx_instruction_info, &gva))
+			return 1;
+		/* _system ok, as nested_vmx_check_permission verified cpl=0 */
+		kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
+			     &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
+	}
+
+	nested_vmx_succeed(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+
+static int handle_vmwrite(struct kvm_vcpu *vcpu)
+{
+	unsigned long field;
+	gva_t gva;
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	char *p;
+	short offset;
+	/* The value to write might be 32 or 64 bits, depending on L1's long
+	 * mode, and eventually we need to write that into a field of several
+	 * possible lengths. The code below first zero-extends the value to 64
+	 * bit (field_value), and then copies only the appropriate number of
+	 * bits into the vmcs12 field.
+	 */
+	u64 field_value = 0;
+	struct x86_exception e;
+
+	if (!nested_vmx_check_permission(vcpu) ||
+	    !nested_vmx_check_vmcs12(vcpu))
+		return 1;
+
+	if (vmx_instruction_info & (1u << 10))
+		field_value = kvm_register_read(vcpu,
+			(((vmx_instruction_info) >> 3) & 0xf));
+	else {
+		if (get_vmx_mem_address(vcpu, exit_qualification,
+				vmx_instruction_info, &gva))
+			return 1;
+		if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
+			   &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
+			kvm_inject_page_fault(vcpu, &e);
+			return 1;
+		}
+	}
+
+
+	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+	if (vmcs_field_readonly(field)) {
+		nested_vmx_failValid(vcpu,
+			VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+
+	offset = vmcs_field_to_offset(field);
+	if (offset < 0) {
+		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+	p = ((char *) get_vmcs12(vcpu)) + offset;
+
+	switch (vmcs_field_type(field)) {
+	case VMCS_FIELD_TYPE_U16:
+		*(u16 *)p = field_value;
+		break;
+	case VMCS_FIELD_TYPE_U32:
+		*(u32 *)p = field_value;
+		break;
+	case VMCS_FIELD_TYPE_U64:
+		*(u64 *)p = field_value;
+		break;
+	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+		*(natural_width *)p = field_value;
+		break;
+	default:
+		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+
+	nested_vmx_succeed(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 /* Emulate the VMPTRLD instruction */
 static int handle_vmptrld(struct kvm_vcpu *vcpu)
 {
@@ -4988,9 +5177,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
 	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
-	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
+	[EXIT_REASON_VMREAD]                  = handle_vmread,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
-	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
+	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
 	[EXIT_REASON_VMOFF]                   = handle_vmoff,
 	[EXIT_REASON_VMON]                    = handle_vmon,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,

From a3a8ff8ebf87cbd828f54276d902c3c8ee0c4781 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:09:01 +0300
Subject: [PATCH 032/143] KVM: nVMX: Move host-state field setup to a function

Move the setting of constant host-state fields (fields that do not change
throughout the life of the guest) from vmx_vcpu_setup to a new common function
vmx_set_constant_host_state(). This function will also be used to set the
host state when running L2 guests.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 74 ++++++++++++++++++++++++++--------------------
 1 file changed, 42 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 84d9c93fde05..4b97161b2625 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3394,18 +3394,52 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
 }
 
+/*
+ * Set up the vmcs's constant host-state fields, i.e., host-state fields that
+ * will not change in the lifetime of the guest.
+ * Note that host-state that does change is set elsewhere. E.g., host-state
+ * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
+ */
+static void vmx_set_constant_host_state(void)
+{
+	u32 low32, high32;
+	unsigned long tmpl;
+	struct desc_ptr dt;
+
+	vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */
+	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
+	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
+
+	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
+	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
+
+	native_store_idt(&dt);
+	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
+
+	asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
+	vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
+
+	rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
+	vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
+	rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
+	vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
+
+	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+		rdmsr(MSR_IA32_CR_PAT, low32, high32);
+		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
+	}
+}
+
 /*
  * Sets up the vmcs for emulated real mode.
  */
 static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 {
-	u32 host_sysenter_cs, msr_low, msr_high;
-	u32 junk;
-	u64 host_pat;
 	unsigned long a;
-	struct desc_ptr dt;
 	int i;
-	unsigned long kvm_vmx_return;
 	u32 exec_control;
 
 	/* I/O */
@@ -3462,16 +3496,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
 	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
 
-	vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */
-	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
-	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
-
-	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
-	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
 	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
 	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
-	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+	vmx_set_constant_host_state();
 #ifdef CONFIG_X86_64
 	rdmsrl(MSR_FS_BASE, a);
 	vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
@@ -3482,32 +3509,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
 #endif
 
-	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
-
-	native_store_idt(&dt);
-	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
-
-	asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
-	vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
 	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
 	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
 	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
 
-	rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
-	vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
-	rdmsrl(MSR_IA32_SYSENTER_ESP, a);
-	vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
-	rdmsrl(MSR_IA32_SYSENTER_EIP, a);
-	vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
-
-	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
-		rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
-		host_pat = msr_low | ((u64) msr_high << 32);
-		vmcs_write64(HOST_IA32_PAT, host_pat);
-	}
 	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+		u32 msr_low, msr_high;
+		u64 host_pat;
 		rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
 		host_pat = msr_low | ((u64) msr_high << 32);
 		/* Write the default value follow host pat */

From bf8179a011d40e7322d34440039b5e86a0d03aed Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:09:31 +0300
Subject: [PATCH 033/143] KVM: nVMX: Move control field setup to functions

Move some of the control field setup to common functions. These functions will
also be needed for running L2 guests - L0's desires (expressed in these
functions) will be appropriately merged with L1's desires.
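
As a rough standalone illustration of the merge mentioned above (plain C, not
part of this patch; the macro values are stand-ins for VMX execution-control
bits): a control that either L0 or L1 wants ends up set in the control word
that actually runs L2, ignoring the handful of bits that the later
prepare_vmcs02 patch forces on or off.

#include <assert.h>
#include <stdint.h>

#define CR3_LOAD_EXITING (1u << 15)	/* stand-ins for VMX control bits */
#define HLT_EXITING      (1u << 7)

/* Sketch: L2 traps anything that L0 *or* L1 asked to trap. */
static uint32_t merge_exec_control(uint32_t l0_wants, uint32_t l1_wants)
{
	return l0_wants | l1_wants;
}

int main(void)
{
	uint32_t l0 = CR3_LOAD_EXITING;	/* L0 needs CR3 exits, e.g. without EPT */
	uint32_t l1 = HLT_EXITING;	/* L1 asked to trap HLT in its vmcs12   */
	uint32_t vmcs02 = merge_exec_control(l0, l1);

	assert(vmcs02 & CR3_LOAD_EXITING);
	assert(vmcs02 & HLT_EXITING);
	return 0;
}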

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 80 +++++++++++++++++++++++++++-------------------
 1 file changed, 47 insertions(+), 33 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4b97161b2625..3134638e9e90 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3433,6 +3433,49 @@ static void vmx_set_constant_host_state(void)
 	}
 }
 
+static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
+{
+	vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
+	if (enable_ept)
+		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+}
+
+static u32 vmx_exec_control(struct vcpu_vmx *vmx)
+{
+	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+	if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
+		exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+		exec_control |= CPU_BASED_CR8_STORE_EXITING |
+				CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	}
+	if (!enable_ept)
+		exec_control |= CPU_BASED_CR3_STORE_EXITING |
+				CPU_BASED_CR3_LOAD_EXITING  |
+				CPU_BASED_INVLPG_EXITING;
+	return exec_control;
+}
+
+static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
+{
+	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+	if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+	if (vmx->vpid == 0)
+		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+	if (!enable_ept) {
+		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+		enable_unrestricted_guest = 0;
+	}
+	if (!enable_unrestricted_guest)
+		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+	if (!ple_gap)
+		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+	return exec_control;
+}
+
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -3440,7 +3483,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 {
 	unsigned long a;
 	int i;
-	u32 exec_control;
 
 	/* I/O */
 	vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
@@ -3455,36 +3497,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
 		vmcs_config.pin_based_exec_ctrl);
 
-	exec_control = vmcs_config.cpu_based_exec_ctrl;
-	if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
-		exec_control &= ~CPU_BASED_TPR_SHADOW;
-#ifdef CONFIG_X86_64
-		exec_control |= CPU_BASED_CR8_STORE_EXITING |
-				CPU_BASED_CR8_LOAD_EXITING;
-#endif
-	}
-	if (!enable_ept)
-		exec_control |= CPU_BASED_CR3_STORE_EXITING |
-				CPU_BASED_CR3_LOAD_EXITING  |
-				CPU_BASED_INVLPG_EXITING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
 	if (cpu_has_secondary_exec_ctrls()) {
-		exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
-		if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
-			exec_control &=
-				~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-		if (vmx->vpid == 0)
-			exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
-		if (!enable_ept) {
-			exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
-			enable_unrestricted_guest = 0;
-		}
-		if (!enable_unrestricted_guest)
-			exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
-		if (!ple_gap)
-			exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
-		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+				vmx_secondary_exec_control(vmx));
 	}
 
 	if (ple_gap) {
@@ -3547,10 +3564,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
 
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
-	vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
-	if (enable_ept)
-		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
-	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+	set_cr4_guest_host_mask(vmx);
 
 	kvm_write_tsc(&vmx->vcpu, 0);
 

From fe3ef05c7572d68721c1ddd4d36009611f565ba2 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:10:02 +0300
Subject: [PATCH 034/143] KVM: nVMX: Prepare vmcs02 from vmcs01 and vmcs12

This patch contains code to prepare the VMCS which can be used to actually
run the L2 guest, vmcs02. prepare_vmcs02 appropriately merges the information
in vmcs12 (the vmcs that L1 built for L2) and in vmcs01 (our desires for our
own guests).
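
For context, a standalone sketch (plain C, hypothetical parameter names, not
part of this patch) of the nested_read_cr0() helper introduced below: bits L1
chose to shadow are read from cr0_read_shadow, everything else comes from the
real guest_cr0.

#include <assert.h>

/* Sketch of nested_read_cr0(): the cr0 value L2 observes. */
static unsigned long nested_read_cr0(unsigned long guest_cr0,
				     unsigned long cr0_read_shadow,
				     unsigned long cr0_guest_host_mask)
{
	return (guest_cr0 & ~cr0_guest_host_mask) |
	       (cr0_read_shadow & cr0_guest_host_mask);
}

int main(void)
{
	/* L1 shadows only CR0.TS (bit 3): L2 reads the shadowed TS value. */
	unsigned long mask = 0x8, guest_cr0 = 0x80000031, shadow = 0x8;

	assert(nested_read_cr0(guest_cr0, shadow, mask) == 0x80000039);
	return 0;
}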

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 279 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 279 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3134638e9e90..506c9144074a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -345,6 +345,12 @@ struct nested_vmx {
 	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
 	struct list_head vmcs02_pool;
 	int vmcs02_num;
+	u64 vmcs01_tsc_offset;
+	/*
+	 * Guest pages referred to in vmcs02 with host-physical pointers, so
+	 * we must keep them pinned while L2 runs.
+	 */
+	struct page *apic_access_page;
 };
 
 struct vcpu_vmx {
@@ -847,6 +853,18 @@ static inline bool report_flexpriority(void)
 	return flexpriority_enabled;
 }
 
+static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
+{
+	return vmcs12->cpu_based_vm_exec_control & bit;
+}
+
+static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
+{
+	return (vmcs12->cpu_based_vm_exec_control &
+			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+		(vmcs12->secondary_vm_exec_control & bit);
+}
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -1441,6 +1459,22 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
 
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
 
+/*
+ * Return the cr0 value that a nested guest would read. This is a combination
+ * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
+ * its hypervisor (cr0_read_shadow).
+ */
+static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
+{
+	return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
+		(fields->cr0_read_shadow & fields->cr0_guest_host_mask);
+}
+static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
+{
+	return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
+		(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
+}
+
 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
 	vmx_decache_cr0_guest_bits(vcpu);
@@ -3438,6 +3472,9 @@ static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
 	vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
 	if (enable_ept)
 		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+	if (is_guest_mode(&vmx->vcpu))
+		vmx->vcpu.arch.cr4_guest_owned_bits &=
+			~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
 	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 }
 
@@ -4736,6 +4773,11 @@ static void free_nested(struct vcpu_vmx *vmx)
 		vmx->nested.current_vmptr = -1ull;
 		vmx->nested.current_vmcs12 = NULL;
 	}
+	/* Unpin physical memory we referred to in current vmcs02 */
+	if (vmx->nested.apic_access_page) {
+		nested_release_page(vmx->nested.apic_access_page);
+		vmx->nested.apic_access_page = 0;
+	}
 
 	nested_free_all_saved_vmcss(vmx);
 }
@@ -5810,6 +5852,243 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 {
 }
 
+/*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
+ * guest in a way that will both be appropriate to L1's requests, and our
+ * needs. In addition to modifying the active vmcs (which is vmcs02), this
+ * function also has additional necessary side-effects, like setting various
+ * vcpu->arch fields.
+ */
+static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 exec_control;
+
+	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
+	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
+	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
+	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
+	vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
+	vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
+	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
+	vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
+	vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+	vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
+	vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
+	vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
+	vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
+	vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
+	vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
+	vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
+	vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+	vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
+	vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+	vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+	vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
+	vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
+	vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
+	vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
+	vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
+	vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+	vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+	vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
+	vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
+	vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
+	vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
+	vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
+	vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
+	vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
+	vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+	vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+		vmcs12->vm_entry_intr_info_field);
+	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+		vmcs12->vm_entry_exception_error_code);
+	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+		vmcs12->vm_entry_instruction_len);
+	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+		vmcs12->guest_interruptibility_info);
+	vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
+	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+	vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
+	vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
+	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+		vmcs12->guest_pending_dbg_exceptions);
+	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+	vmcs_write64(VMCS_LINK_POINTER, -1ull);
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+		(vmcs_config.pin_based_exec_ctrl |
+		 vmcs12->pin_based_vm_exec_control));
+
+	/*
+	 * Whether page-faults are trapped is determined by a combination of
+	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
+	 * If enable_ept, L0 doesn't care about page faults and we should
+	 * set all of these to L1's desires. However, if !enable_ept, L0 does
+	 * care about (at least some) page faults, and because it is not easy
+	 * (if at all possible?) to merge L0 and L1's desires, we simply ask
+	 * to exit on each and every L2 page fault. This is done by setting
+	 * MASK=MATCH=0 and (see below) EB.PF=1.
+	 * Note that below we don't need special code to set EB.PF beyond the
+	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+	 *
+	 * A problem with this approach (when !enable_ept) is that L1 may be
+	 * injected with more page faults than it asked for. This could have
+	 * caused problems, but in practice existing hypervisors don't care.
+	 * To fix this, we will need to emulate the PFEC checking (on the L1
+	 * page tables), using walk_addr(), when injecting PFs to L1.
+	 */
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		enable_ept ? vmcs12->page_fault_error_code_match : 0);
+
+	if (cpu_has_secondary_exec_ctrls()) {
+		u32 exec_control = vmx_secondary_exec_control(vmx);
+		if (!vmx->rdtscp_enabled)
+			exec_control &= ~SECONDARY_EXEC_RDTSCP;
+		/* Take the following fields only from vmcs12 */
+		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		if (nested_cpu_has(vmcs12,
+				CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
+			exec_control |= vmcs12->secondary_vm_exec_control;
+
+		if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
+			/*
+			 * Translate L1 physical address to host physical
+			 * address for vmcs02. Keep the page pinned, so this
+			 * physical address remains valid. We keep a reference
+			 * to it so we can release it later.
+			 */
+			if (vmx->nested.apic_access_page) /* shouldn't happen */
+				nested_release_page(vmx->nested.apic_access_page);
+			vmx->nested.apic_access_page =
+				nested_get_page(vcpu, vmcs12->apic_access_addr);
+			/*
+			 * If translation failed, no matter: This feature asks
+			 * to exit when accessing the given address, and if it
+			 * can never be accessed, this feature won't do
+			 * anything anyway.
+			 */
+			if (!vmx->nested.apic_access_page)
+				exec_control &=
+				  ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+			else
+				vmcs_write64(APIC_ACCESS_ADDR,
+				  page_to_phys(vmx->nested.apic_access_page));
+		}
+
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+	}
+
+
+	/*
+	 * Set host-state according to L0's settings (vmcs12 is irrelevant here)
+	 * Some constant fields are set here by vmx_set_constant_host_state().
+	 * Other fields are different per CPU, and will be set later when
+	 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
+	 */
+	vmx_set_constant_host_state();
+
+	/*
+	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
+	 * entry, but only if the current (host) sp changed from the value
+	 * we wrote last (vmx->host_rsp). This cache is no longer relevant
+	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
+	 * here we just force the write to happen on entry.
+	 */
+	vmx->host_rsp = 0;
+
+	exec_control = vmx_exec_control(vmx); /* L0's desires */
+	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+	exec_control &= ~CPU_BASED_TPR_SHADOW;
+	exec_control |= vmcs12->cpu_based_vm_exec_control;
+	/*
+	 * Merging of IO and MSR bitmaps not currently supported.
+	 * Rather, exit every time.
+	 */
+	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
+	 * bitwise-or of what L1 wants to trap for L2, and what we want to
+	 * trap. Note that CR0.TS also needs updating - we do this later.
+	 */
+	update_exception_bitmap(vcpu);
+	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
+	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+	/* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
+	vmcs_write32(VM_EXIT_CONTROLS,
+		vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
+	vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
+
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
+	else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+
+
+	set_cr4_guest_host_mask(vmx);
+
+	vmcs_write64(TSC_OFFSET,
+		vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+
+	if (enable_vpid) {
+		/*
+		 * Trivially support vpid by letting L2s share their parent
+		 * L1's vpid. TODO: move to a more elaborate solution, giving
+		 * each L2 its own vpid and exposing the vpid feature to L1.
+		 */
+		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+		vmx_flush_tlb(vcpu);
+	}
+
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
+		vcpu->arch.efer = vmcs12->guest_ia32_efer;
+	if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+	else
+		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+	/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+	vmx_set_efer(vcpu, vcpu->arch.efer);
+
+	/*
+	 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
+	 * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
+	 * The CR0_READ_SHADOW is what L2 should have expected to read given
+	 * the specifications by L1; it's not enough to take
+	 * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may have
+	 * more bits than L1 expected.
+	 */
+	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
+	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+
+	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
+	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
+
+	/* shadow page tables on either EPT or shadow page tables */
+	kvm_set_cr3(vcpu, vmcs12->guest_cr3);
+	kvm_mmu_reset_context(vcpu);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
+}
+
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage)

From cd232ad02f00286c3f8c9df30948da17212ef905 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:10:33 +0300
Subject: [PATCH 035/143] KVM: nVMX: Implement VMLAUNCH and VMRESUME

Implement the VMLAUNCH and VMRESUME instructions, allowing a guest
hypervisor to run its own guests.

This patch does not include some of the necessary validity checks on
vmcs12 fields before the entry. These will appear in a separate patch
below.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 65 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 63 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 506c9144074a..1a8b32867975 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4943,6 +4943,21 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
+
+/* Emulate the VMLAUNCH instruction */
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
+{
+	return nested_vmx_run(vcpu, true);
+}
+
+/* Emulate the VMRESUME instruction */
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+
+	return nested_vmx_run(vcpu, false);
+}
+
 enum vmcs_field_type {
 	VMCS_FIELD_TYPE_U16 = 0,
 	VMCS_FIELD_TYPE_U64 = 1,
@@ -5240,11 +5255,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
-	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
+	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
 	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 	[EXIT_REASON_VMREAD]                  = handle_vmread,
-	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
+	[EXIT_REASON_VMRESUME]                = handle_vmresume,
 	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
 	[EXIT_REASON_VMOFF]                   = handle_vmoff,
 	[EXIT_REASON_VMON]                    = handle_vmon,
@@ -6089,6 +6104,52 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
 }
 
+/*
+ * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
+ * for running an L2 nested guest.
+ */
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+{
+	struct vmcs12 *vmcs12;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int cpu;
+	struct loaded_vmcs *vmcs02;
+
+	if (!nested_vmx_check_permission(vcpu) ||
+	    !nested_vmx_check_vmcs12(vcpu))
+		return 1;
+
+	skip_emulated_instruction(vcpu);
+	vmcs12 = get_vmcs12(vcpu);
+
+	vmcs02 = nested_get_current_vmcs02(vmx);
+	if (!vmcs02)
+		return -ENOMEM;
+
+	enter_guest_mode(vcpu);
+
+	vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
+
+	cpu = get_cpu();
+	vmx->loaded_vmcs = vmcs02;
+	vmx_vcpu_put(vcpu);
+	vmx_vcpu_load(vcpu, cpu);
+	vcpu->cpu = cpu;
+	put_cpu();
+
+	vmcs12->launch_state = 1;
+
+	prepare_vmcs02(vcpu, vmcs12);
+
+	/*
+	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
+	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
+	 * returned as far as L1 is concerned. It will only return (and set
+	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
+	 */
+	return 1;
+}
+
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage)

From 99e65e805dea4df061aa4038211112aa96416412 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:11:03 +0300
Subject: [PATCH 036/143] KVM: nVMX: No need for handle_vmx_insn function any
 more

Before nested VMX support, the exit handler for a guest executing a VMX
instruction (vmclear, vmlaunch, vmptrld, vmptrst, vmread, vmresume,
vmwrite, vmon, vmoff), was handle_vmx_insn(). This handler simply threw a #UD
exception. Now that all these exit reasons are properly handled (and the
respective VMX instructions emulated), nothing calls this dummy handler and it can
be removed.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1a8b32867975..f8dccb9df34b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4309,12 +4309,6 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-static int handle_vmx_insn(struct kvm_vcpu *vcpu)
-{
-	kvm_queue_exception(vcpu, UD_VECTOR);
-	return 1;
-}
-
 static int handle_invd(struct kvm_vcpu *vcpu)
 {
 	return emulate_instruction(vcpu, 0) == EMULATE_DONE;

From 4704d0befb0721274bda863192c4782febb6b94c Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:11:34 +0300
Subject: [PATCH 037/143] KVM: nVMX: Exiting from L2 to L1

This patch implements nested_vmx_vmexit(), called when the nested L2 guest
exits and we want to run its L1 parent and let it handle this exit.

Note that this will not necessarily be called on every L2 exit. L0 may decide
to handle a particular exit on its own, without L1's involvement; in that
case, L0 will handle the exit, and resume running L2, without running L1 and
without calling nested_vmx_vmexit(). The logic for deciding whether to handle
a particular exit in L1 or in L0, i.e., whether to call nested_vmx_vmexit(),
will appear in a separate patch below.
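
The trickiest part of this exit path is reconstructing vmcs12.guest_cr0. As a
standalone sketch (plain C, with parameters standing in for the vmcs02/vmcs12
fields; not part of this patch), the three-case merge done by the
vmcs12_guest_cr0() helper added below looks like this: bits L2 owned come from
vmcs02's GUEST_CR0, bits L1 trapped keep their previous vmcs12 value, and bits
only L0 trapped are taken from vmcs02's CR0_READ_SHADOW.

#include <assert.h>

/* Sketch of vmcs12_guest_cr0(); see the three-case comment in the patch. */
static unsigned long vmcs12_guest_cr0(unsigned long vmcs02_guest_cr0,
				      unsigned long vmcs02_cr0_read_shadow,
				      unsigned long old_vmcs12_guest_cr0,
				      unsigned long l1_mask,         /* cr0_guest_host_mask  */
				      unsigned long guest_owned_bits)/* cr0_guest_owned_bits */
{
	return (vmcs02_guest_cr0 & guest_owned_bits) |	/* 1: neither trapped */
	       (old_vmcs12_guest_cr0 & l1_mask) |	/* 2: L1 trapped it   */
	       (vmcs02_cr0_read_shadow &		/* 3: only L0 trapped */
		~(l1_mask | guest_owned_bits));
}

int main(void)
{
	/* L2 owns bit 0, L1 traps bit 1, only L0 traps bit 2. */
	assert(vmcs12_guest_cr0(0x1 /* GUEST_CR0 */, 0x4 /* CR0_READ_SHADOW */,
				0x2 /* old vmcs12 guest_cr0 */,
				0x2 /* l1_mask */, 0x1 /* guest_owned */) == 0x7);
	return 0;
}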

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/vmx.h |   4 +
 arch/x86/kvm/vmx.c         | 262 +++++++++++++++++++++++++++++++++++++
 2 files changed, 266 insertions(+)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 37690bd580c8..b747773cf83b 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -132,6 +132,8 @@ enum vmcs_field {
 	GUEST_IA32_PAT_HIGH		= 0x00002805,
 	GUEST_IA32_EFER			= 0x00002806,
 	GUEST_IA32_EFER_HIGH		= 0x00002807,
+	GUEST_IA32_PERF_GLOBAL_CTRL	= 0x00002808,
+	GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
 	GUEST_PDPTR0                    = 0x0000280a,
 	GUEST_PDPTR0_HIGH               = 0x0000280b,
 	GUEST_PDPTR1                    = 0x0000280c,
@@ -144,6 +146,8 @@ enum vmcs_field {
 	HOST_IA32_PAT_HIGH		= 0x00002c01,
 	HOST_IA32_EFER			= 0x00002c02,
 	HOST_IA32_EFER_HIGH		= 0x00002c03,
+	HOST_IA32_PERF_GLOBAL_CTRL	= 0x00002c04,
+	HOST_IA32_PERF_GLOBAL_CTRL_HIGH	= 0x00002c05,
 	PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
 	CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
 	EXCEPTION_BITMAP                = 0x00004004,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f8dccb9df34b..7d778e4976ea 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6144,6 +6144,268 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	return 1;
 }
 
+/*
+ * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
+ * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
+ * This function returns the new value we should put in vmcs12.guest_cr0.
+ * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
+ *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
+ *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
+ *     didn't trap the bit, because if L1 did, so would L0).
+ *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
+ *     been modified by L2, and L1 knows it. So just leave the old value of
+ *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
+ *     isn't relevant, because if L0 traps this bit it can set it to anything.
+ *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
+ *     changed these bits, and therefore they need to be updated, but L0
+ *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
+ *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
+ */
+static inline unsigned long
+vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	return
+	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
+	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
+	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
+			vcpu->arch.cr0_guest_owned_bits));
+}
+
+static inline unsigned long
+vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	return
+	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
+	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
+	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
+			vcpu->arch.cr4_guest_owned_bits));
+}
+
+/*
+ * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
+ * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
+ * and this function updates it to reflect the changes to the guest state while
+ * L2 was running (and perhaps made some exits which were handled directly by L0
+ * without going back to L1), and to reflect the exit reason.
+ * Note that we do not have to copy here all VMCS fields, just those that
+ * could have been changed by the L2 guest or the exit - i.e., the guest-state and
+ * exit-information fields only. Other fields are modified by L1 with VMWRITE,
+ * which already writes to vmcs12 directly.
+ */
+void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	/* update guest state fields: */
+	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
+	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+
+	kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+	vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+	vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
+	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+
+	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+
+	vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+	vmcs12->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	vmcs12->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+
+	/* TODO: These cannot have changed unless we have MSR bitmaps and
+	 * the relevant bit asks not to trap the change */
+	vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT)
+		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+
+	/* update exit information fields: */
+
+	vmcs12->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+	vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+	vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	vmcs12->idt_vectoring_info_field =
+		vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	vmcs12->idt_vectoring_error_code =
+		vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
+	/* clear vm-entry fields which are to be cleared on exit */
+	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
+}
+
+/*
+ * A part of what we need to do when the nested L2 guest exits and we want to
+ * run its L1 parent, is to reset L1's guest state to the host state specified
+ * in vmcs12.
+ * This function is to be called not only on normal nested exit, but also on
+ * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
+ * Failures During or After Loading Guest State").
+ * This function should be called when the active VMCS is L1's (vmcs01).
+ */
+void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
+		vcpu->arch.efer = vmcs12->host_ia32_efer;
+	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+	else
+		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+	vmx_set_efer(vcpu, vcpu->arch.efer);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
+	/*
+	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
+	 * actually changed, because it depends on the current state of
+	 * fpu_active (which may have changed).
+	 * Note that vmx_set_cr0 refers to efer set above.
+	 */
+	kvm_set_cr0(vcpu, vmcs12->host_cr0);
+	/*
+	 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
+	 * to apply the same changes to L1's vmcs. We just set cr0 correctly,
+	 * but we also need to update cr0_guest_host_mask and exception_bitmap.
+	 */
+	update_exception_bitmap(vcpu);
+	vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
+	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+	/*
+	 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
+	 * (KVM doesn't change it) - no reason to call set_cr4_guest_host_mask();
+	 */
+	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+	kvm_set_cr4(vcpu, vmcs12->host_cr4);
+
+	/* shadow page tables on either EPT or shadow page tables */
+	kvm_set_cr3(vcpu, vmcs12->host_cr3);
+	kvm_mmu_reset_context(vcpu);
+
+	if (enable_vpid) {
+		/*
+		 * Trivially support vpid by letting L2s share their parent
+		 * L1's vpid. TODO: move to a more elaborate solution, giving
+		 * each L2 its own vpid and exposing the vpid feature to L1.
+		 */
+		vmx_flush_tlb(vcpu);
+	}
+
+
+	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
+	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
+	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
+	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
+	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+	vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
+	vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
+	vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
+	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
+	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
+	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
+	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
+	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
+	vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
+	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
+
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
+			vmcs12->host_ia32_perf_global_ctrl);
+}
+
+/*
+ * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
+ * and modify vmcs12 to make it see what it would expect to see there if
+ * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
+ */
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int cpu;
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	leave_guest_mode(vcpu);
+	prepare_vmcs12(vcpu, vmcs12);
+
+	cpu = get_cpu();
+	vmx->loaded_vmcs = &vmx->vmcs01;
+	vmx_vcpu_put(vcpu);
+	vmx_vcpu_load(vcpu, cpu);
+	vcpu->cpu = cpu;
+	put_cpu();
+
+	/* if no vmcs02 cache requested, remove the one we used */
+	if (VMCS02_POOL_SIZE == 0)
+		nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
+
+	load_vmcs12_host_state(vcpu, vmcs12);
+
+	/* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
+	vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+
+	/* This is needed for same reason as it was needed in prepare_vmcs02 */
+	vmx->host_rsp = 0;
+
+	/* Unpin physical memory we referred to in vmcs02 */
+	if (vmx->nested.apic_access_page) {
+		nested_release_page(vmx->nested.apic_access_page);
+		vmx->nested.apic_access_page = 0;
+	}
+
+	/*
+	 * Exiting from L2 to L1, we're now back to L1 which thinks it just
+	 * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
+	 * success or failure flag accordingly.
+	 */
+	if (unlikely(vmx->fail)) {
+		vmx->fail = 0;
+		nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
+	} else
+		nested_vmx_succeed(vcpu);
+}
+
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage)

From 7c1779384a2b2479722e90778721c40811e1b7a7 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:12:04 +0300
Subject: [PATCH 038/143] KVM: nVMX: vmcs12 checks on nested entry

This patch adds a bunch of tests of the validity of the vmcs12 fields,
according to what the VMX spec and our implementation allow. If fields
we cannot (or don't want to) honor are discovered, an entry failure is
emulated.

According to the spec, there are two types of entry failures: If the problem
was in vmcs12's host state or control fields, the VMLAUNCH instruction simply
fails. But if a problem is found in the guest state, the behavior is more
similar to that of an exit.
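
Schematically (illustrative C, not part of this patch; the enum and function
names below are not kernel APIs), the two failure paths look like this: bad
control or host-state fields make the instruction itself fail with a
VM-instruction error, while bad guest state is reported to L1 as a failed VM
entry via nested_vmx_entry_failure().

#include <stdio.h>

/* Illustrative only; the real checks live in nested_vmx_run(). */
enum entry_check_result { CHECK_OK, BAD_CONTROL_OR_HOST_STATE, BAD_GUEST_STATE };

static void nested_entry(enum entry_check_result r)
{
	switch (r) {
	case BAD_CONTROL_OR_HOST_STATE:
		/* VMLAUNCH/VMRESUME itself fails: nested_vmx_failValid(...). */
		puts("instruction fails with a VM-instruction error");
		break;
	case BAD_GUEST_STATE:
		/* Emulated failed entry: reason | VMX_EXIT_REASONS_FAILED_VMENTRY. */
		puts("nested_vmx_entry_failure(): reported like an exit");
		break;
	case CHECK_OK:
		puts("proceed with the nested entry");
		break;
	}
}

int main(void)
{
	nested_entry(BAD_CONTROL_OR_HOST_STATE);
	nested_entry(BAD_GUEST_STATE);
	nested_entry(CHECK_OK);
	return 0;
}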

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/vmx.h |   8 +++
 arch/x86/kvm/vmx.c         | 101 +++++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index b747773cf83b..2caf290e9895 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -430,6 +430,14 @@ struct vmx_msr_entry {
 	u64 value;
 } __aligned(16);
 
+/*
+ * Exit Qualifications for entry failure during or after loading guest state
+ */
+#define ENTRY_FAIL_DEFAULT		0
+#define ENTRY_FAIL_PDPTE		2
+#define ENTRY_FAIL_NMI			3
+#define ENTRY_FAIL_VMCS_LINK_PTR	4
+
 /*
  * VM-instruction error numbers
  */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7d778e4976ea..ee25b9fdfa82 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -865,6 +865,10 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
 		(vmcs12->secondary_vm_exec_control & bit);
 }
 
+static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
+			struct vmcs12 *vmcs12,
+			u32 reason, unsigned long qualification);
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -6116,6 +6120,86 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	skip_emulated_instruction(vcpu);
 	vmcs12 = get_vmcs12(vcpu);
 
+	/*
+	 * The nested entry process starts with enforcing various prerequisites
+	 * on vmcs12 as required by the Intel SDM, and acting appropriately when
+	 * they fail: As the SDM explains, some conditions should cause the
+	 * instruction to fail, while others will cause the instruction to seem
+	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
+	 * To speed up the normal (success) code path, we should avoid checking
+	 * for misconfigurations which will anyway be caught by the processor
+	 * when using the merged vmcs02.
+	 */
+	if (vmcs12->launch_state == launch) {
+		nested_vmx_failValid(vcpu,
+			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
+			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
+		return 1;
+	}
+
+	if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
+			!IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
+		/*TODO: Also verify bits beyond physical address width are 0*/
+		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		return 1;
+	}
+
+	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+			!IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) {
+		/*TODO: Also verify bits beyond physical address width are 0*/
+		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		return 1;
+	}
+
+	if (vmcs12->vm_entry_msr_load_count > 0 ||
+	    vmcs12->vm_exit_msr_load_count > 0 ||
+	    vmcs12->vm_exit_msr_store_count > 0) {
+		if (printk_ratelimit())
+			printk(KERN_WARNING
+			  "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
+		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		return 1;
+	}
+
+	if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
+	      nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) ||
+	    !vmx_control_verify(vmcs12->secondary_vm_exec_control,
+	      nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
+	    !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
+	      nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
+	    !vmx_control_verify(vmcs12->vm_exit_controls,
+	      nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
+	    !vmx_control_verify(vmcs12->vm_entry_controls,
+	      nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
+	{
+		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		return 1;
+	}
+
+	if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
+	    ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
+		nested_vmx_failValid(vcpu,
+			VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+		return 1;
+	}
+
+	if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
+	    ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
+		nested_vmx_entry_failure(vcpu, vmcs12,
+			EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+		return 1;
+	}
+	if (vmcs12->vmcs_link_pointer != -1ull) {
+		nested_vmx_entry_failure(vcpu, vmcs12,
+			EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
+		return 1;
+	}
+
+	/*
+	 * We're finally done with prerequisite checking, and can start with
+	 * the nested entry.
+	 */
+
 	vmcs02 = nested_get_current_vmcs02(vmx);
 	if (!vmcs02)
 		return -ENOMEM;
@@ -6406,6 +6490,23 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 		nested_vmx_succeed(vcpu);
 }
 
+/*
+ * L1's failure to enter L2 is a subset of a normal exit, as explained in
+ * 23.7 "VM-entry failures during or after loading guest state" (this also
+ * lists the acceptable exit-reason and exit-qualification parameters).
+ * It should only be called before L2 actually succeeded to run, and when
+ * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss).
+ */
+static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
+			struct vmcs12 *vmcs12,
+			u32 reason, unsigned long qualification)
+{
+	load_vmcs12_host_state(vcpu, vmcs12);
+	vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
+	vmcs12->exit_qualification = qualification;
+	nested_vmx_succeed(vcpu);
+}
+
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage)

From 644d711aa0e16111d8aba6d289caebec013e26ea Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:12:35 +0300
Subject: [PATCH 039/143] KVM: nVMX: Deciding if L0 or L1 should handle an L2
 exit

This patch contains the logic of whether an L2 exit should be handled by L0
and then L2 should be resumed, or whether L1 should be run to handle this
exit (using the nested_vmx_vmexit() function of the previous patch).

The basic idea is to let L1 handle the exit only if it actually asked to
trap this sort of event. For example, when L2 exits on a change to CR0,
we check L1's CR0_GUEST_HOST_MASK to see if L1 expressed interest in any
bit which changed; if it did, we exit to L1. But if it didn't, it means that
it is we (L0) that wished to trap this event, so we handle it ourselves.

The next two patches add additional logic of what to do when an interrupt or
exception is injected: Does L0 need to do it, should we exit to L1 to do it,
or should we resume L2 and keep the exception to be injected later.

We keep a new flag, "nested_run_pending", which can override the decision of
which should run next, L1 or L2. nested_run_pending=1 means that we *must* run
L2 next, not L1. This is necessary in particular when L1 did a VMLAUNCH of L2
and therefore expects L2 to be run (and perhaps be injected with an event it
specified, etc.). Nested_run_pending is especially intended to avoid switching
to L1 in the injection decision-point described above.
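
The CR0 example above, as a standalone sketch (plain C, hypothetical parameter
names, not part of this patch) of the "mov to cr0" case in
nested_vmx_exit_handled_cr(): the exit is forwarded to L1 only if a bit L1
claimed in its CR0_GUEST_HOST_MASK differs between the new value and L1's
CR0_READ_SHADOW.

#include <assert.h>

/* Sketch of the "mov to cr0" decision in nested_vmx_exit_handled_cr(). */
static int l1_should_handle_cr0_write(unsigned long new_val,
				      unsigned long cr0_read_shadow,
				      unsigned long cr0_guest_host_mask)
{
	/* Exit to L1 iff a bit L1 asked to own is being changed by L2. */
	return (cr0_guest_host_mask & (new_val ^ cr0_read_shadow)) != 0;
}

int main(void)
{
	/* Suppose L1 owns only CR0.TS (bit 3). */
	assert(l1_should_handle_cr0_write(0x8, 0x0, 0x8));	/* TS flips: go to L1   */
	assert(!l1_should_handle_cr0_write(0x1, 0x0, 0x8));	/* PE flips: L0 handles */
	return 0;
}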

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 253 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 252 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ee25b9fdfa82..7f62dc36af9b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -346,6 +346,8 @@ struct nested_vmx {
 	struct list_head vmcs02_pool;
 	int vmcs02_num;
 	u64 vmcs01_tsc_offset;
+	/* L2 must run next, and mustn't decide to exit to L1. */
+	bool nested_run_pending;
 	/*
 	 * Guest pages referred to in vmcs02 with host-physical pointers, so
 	 * we must keep them pinned while L2 runs.
@@ -865,6 +867,19 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
 		(vmcs12->secondary_vm_exec_control & bit);
 }
 
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
+	struct kvm_vcpu *vcpu)
+{
+	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
+}
+
+static inline bool is_exception(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
 			struct vmcs12 *vmcs12,
 			u32 reason, unsigned long qualification);
@@ -5277,6 +5292,229 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 static const int kvm_vmx_max_exit_handlers =
 	ARRAY_SIZE(kvm_vmx_exit_handlers);
 
+/*
+ * Return 1 if we should exit from L2 to L1 to handle an MSR access,
+ * rather than handle it ourselves in L0. I.e., check whether L1 expressed
+ * disinterest in the current event (read or write a specific MSR) by using an
+ * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
+ */
+static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
+	struct vmcs12 *vmcs12, u32 exit_reason)
+{
+	u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+	gpa_t bitmap;
+
+	if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
+		return 1;
+
+	/*
+	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
+	 * for the four combinations of read/write and low/high MSR numbers.
+	 * First we need to figure out which of the four to use:
+	 */
+	bitmap = vmcs12->msr_bitmap;
+	if (exit_reason == EXIT_REASON_MSR_WRITE)
+		bitmap += 2048;
+	if (msr_index >= 0xc0000000) {
+		msr_index -= 0xc0000000;
+		bitmap += 1024;
+	}
+
+	/* Then read the msr_index'th bit from this bitmap: */
+	if (msr_index < 1024*8) {
+		unsigned char b;
+		kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
+		return 1 & (b >> (msr_index & 7));
+	} else
+		return 1; /* let L1 handle the wrong parameter */
+}
+
+/*
+ * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
+ * rather than handle it ourselves in L0. I.e., check if L1 wanted to
+ * intercept (via guest_host_mask etc.) the current event.
+ */
+static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
+	struct vmcs12 *vmcs12)
+{
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	int cr = exit_qualification & 15;
+	int reg = (exit_qualification >> 8) & 15;
+	unsigned long val = kvm_register_read(vcpu, reg);
+
+	switch ((exit_qualification >> 4) & 3) {
+	case 0: /* mov to cr */
+		switch (cr) {
+		case 0:
+			if (vmcs12->cr0_guest_host_mask &
+			    (val ^ vmcs12->cr0_read_shadow))
+				return 1;
+			break;
+		case 3:
+			if ((vmcs12->cr3_target_count >= 1 &&
+					vmcs12->cr3_target_value0 == val) ||
+				(vmcs12->cr3_target_count >= 2 &&
+					vmcs12->cr3_target_value1 == val) ||
+				(vmcs12->cr3_target_count >= 3 &&
+					vmcs12->cr3_target_value2 == val) ||
+				(vmcs12->cr3_target_count >= 4 &&
+					vmcs12->cr3_target_value3 == val))
+				return 0;
+			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
+				return 1;
+			break;
+		case 4:
+			if (vmcs12->cr4_guest_host_mask &
+			    (vmcs12->cr4_read_shadow ^ val))
+				return 1;
+			break;
+		case 8:
+			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
+				return 1;
+			break;
+		}
+		break;
+	case 2: /* clts */
+		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
+		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
+			return 1;
+		break;
+	case 1: /* mov from cr */
+		switch (cr) {
+		case 3:
+			if (vmcs12->cpu_based_vm_exec_control &
+			    CPU_BASED_CR3_STORE_EXITING)
+				return 1;
+			break;
+		case 8:
+			if (vmcs12->cpu_based_vm_exec_control &
+			    CPU_BASED_CR8_STORE_EXITING)
+				return 1;
+			break;
+		}
+		break;
+	case 3: /* lmsw */
+		/*
+		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
+		 * cr0. Other attempted changes are ignored, with no exit.
+		 */
+		if (vmcs12->cr0_guest_host_mask & 0xe &
+		    (val ^ vmcs12->cr0_read_shadow))
+			return 1;
+		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
+		    !(vmcs12->cr0_read_shadow & 0x1) &&
+		    (val & 0x1))
+			return 1;
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
+ * should handle it ourselves in L0 (and then continue L2). Only call this
+ * when in is_guest_mode (L2).
+ */
+static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
+{
+	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
+	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	if (vmx->nested.nested_run_pending)
+		return 0;
+
+	if (unlikely(vmx->fail)) {
+		printk(KERN_INFO "%s failed vm entry %x\n",
+		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+		return 1;
+	}
+
+	switch (exit_reason) {
+	case EXIT_REASON_EXCEPTION_NMI:
+		if (!is_exception(intr_info))
+			return 0;
+		else if (is_page_fault(intr_info))
+			return enable_ept;
+		return vmcs12->exception_bitmap &
+				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
+	case EXIT_REASON_EXTERNAL_INTERRUPT:
+		return 0;
+	case EXIT_REASON_TRIPLE_FAULT:
+		return 1;
+	case EXIT_REASON_PENDING_INTERRUPT:
+	case EXIT_REASON_NMI_WINDOW:
+		/*
+		 * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
+		 * (aka Interrupt Window Exiting) only when L1 turned it on,
+		 * so if we got a PENDING_INTERRUPT exit, this must be for L1.
+		 * Same for NMI Window Exiting.
+		 */
+		return 1;
+	case EXIT_REASON_TASK_SWITCH:
+		return 1;
+	case EXIT_REASON_CPUID:
+		return 1;
+	case EXIT_REASON_HLT:
+		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
+	case EXIT_REASON_INVD:
+		return 1;
+	case EXIT_REASON_INVLPG:
+		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
+	case EXIT_REASON_RDPMC:
+		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
+	case EXIT_REASON_RDTSC:
+		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
+	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
+	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
+	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
+	case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
+	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+		/*
+		 * VMX instructions trap unconditionally. This allows L1 to
+		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
+		 */
+		return 1;
+	case EXIT_REASON_CR_ACCESS:
+		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
+	case EXIT_REASON_DR_ACCESS:
+		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
+	case EXIT_REASON_IO_INSTRUCTION:
+		/* TODO: support IO bitmaps */
+		return 1;
+	case EXIT_REASON_MSR_READ:
+	case EXIT_REASON_MSR_WRITE:
+		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
+	case EXIT_REASON_INVALID_STATE:
+		return 1;
+	case EXIT_REASON_MWAIT_INSTRUCTION:
+		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
+	case EXIT_REASON_MONITOR_INSTRUCTION:
+		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
+	case EXIT_REASON_PAUSE_INSTRUCTION:
+		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
+			nested_cpu_has2(vmcs12,
+				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
+	case EXIT_REASON_MCE_DURING_VMENTRY:
+		return 0;
+	case EXIT_REASON_TPR_BELOW_THRESHOLD:
+		return 1;
+	case EXIT_REASON_APIC_ACCESS:
+		return nested_cpu_has2(vmcs12,
+			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+	case EXIT_REASON_EPT_VIOLATION:
+	case EXIT_REASON_EPT_MISCONFIG:
+		return 0;
+	case EXIT_REASON_WBINVD:
+		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
+	case EXIT_REASON_XSETBV:
+		return 1;
+	default:
+		return 1;
+	}
+}
+
 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 {
 	*info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -5299,6 +5537,17 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	if (vmx->emulation_required && emulate_invalid_guest_state)
 		return handle_invalid_guest_state(vcpu);
 
+	if (exit_reason == EXIT_REASON_VMLAUNCH ||
+	    exit_reason == EXIT_REASON_VMRESUME)
+		vmx->nested.nested_run_pending = 1;
+	else
+		vmx->nested.nested_run_pending = 0;
+
+	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
+		nested_vmx_vmexit(vcpu);
+		return 1;
+	}
+
 	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -5321,7 +5570,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		       "(0x%x) and exit reason is 0x%x\n",
 		       __func__, vectoring_info, exit_reason);
 
-	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
+	    !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
+	                                get_vmcs12(vcpu), vcpu)))) {
 		if (vmx_interrupt_allowed(vcpu)) {
 			vmx->soft_vnmi_blocked = 0;
 		} else if (vmx->vnmi_blocked_time > 1000000000LL &&

From b6f1250edb4462e38d72c7f6cce35911df21d31b Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:13:06 +0300
Subject: [PATCH 040/143] KVM: nVMX: Correct handling of interrupt injection

The code in this patch correctly emulates external-interrupt injection
while a nested guest L2 is running.

Because this code is relatively non-obvious, I include here a longer-than-usual
justification for what it does - much longer than the code itself ;-)

To understand how to correctly emulate interrupt injection while L2 is
running, let's first look at what we need to emulate: how would things look
if the extra L0 hypervisor layer were removed, and instead of L0 injecting
an interrupt, we had hardware delivering one?

Now we have L1 running on bare metal with a guest L2, and the hardware
generates an interrupt. Assuming that L1 set PIN_BASED_EXT_INTR_MASK to 1, and
VM_EXIT_ACK_INTR_ON_EXIT to 0 (we'll revisit these assumptions below), what
happens now is this: The processor exits from L2 to L1, with an external-
interrupt exit reason but without an interrupt vector. L1 runs, with
interrupts disabled, and it doesn't yet know what the interrupt was. Soon
after, it enables interrupts, and only at that moment does it get the interrupt
from the processor. When L1 is KVM, Linux handles this interrupt.

Now we need exactly the same thing to happen when that L1->L2 system runs
on top of L0, instead of real hardware. This is how we do it:

When L0 wants to inject an interrupt, it needs to exit from L2 to L1, with
external-interrupt exit reason (with an invalid interrupt vector), and run L1.
Just like in the bare metal case, it likely can't deliver the interrupt to
L1 now because L1 is running with interrupts disabled, in which case it turns
on the interrupt window when running L1 after the exit. L1 will soon enable
interrupts, and at that point L0 will gain control again and inject the
interrupt to L1.

Finally, there is an extra complication in the code: when nested_run_pending
is set, we cannot return to L1 now and must launch L2. We need to remember the
interrupt we wanted to inject (and not clear it now), and do it on the
next exit.

The above explanation shows that the relative strangeness of the nested
interrupt injection code in this patch, and the extra interrupt-window
exit incurred, are in fact necessary for accurate emulation, and are not
just an unoptimized implementation.

Let's revisit now the two assumptions made above:

If L1 turns off PIN_BASED_EXT_INTR_MASK (no hypervisor that I know
does, by the way), things are simple: L0 may inject the interrupt directly
to the L2 guest - using the normal code path that injects to any guest.
We support this case in the code below.

If L1 turns on VM_EXIT_ACK_INTR_ON_EXIT, things look very different from the
description above: L1 expects to see an exit from L2 with the interrupt vector
already filled in the exit information, and does not expect to be interrupted
again with this interrupt. The current code does not (yet) support this case,
so we do not allow the VM_EXIT_ACK_INTR_ON_EXIT exit-control to be turned on
by L1.
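
The decision flow described above can be condensed into a small standalone
sketch. This is an illustration only, not the kernel code; the enum and the
parameter names are invented for the example:

	/* Illustration only - not the kernel code. */
	enum intr_action { INJECT_TO_L2, EXIT_TO_L1_NO_VECTOR, DEFER_UNTIL_NEXT_EXIT };

	/*
	 * How should L0 deliver a pending external interrupt while L2 runs?
	 * 'l1_wants_intr_exits' stands for vmcs12's PIN_BASED_EXT_INTR_MASK
	 * bit; 'nested_run_pending' mirrors the flag of that name.
	 */
	static enum intr_action decide_intr_delivery(int l1_wants_intr_exits,
						     int nested_run_pending)
	{
		if (!l1_wants_intr_exits)
			return INJECT_TO_L2;          /* normal injection path  */
		if (nested_run_pending)
			return DEFER_UNTIL_NEXT_EXIT; /* must run L2 once first */
		/* exit to L1 with EXIT_REASON_EXTERNAL_INTERRUPT, no vector */
		return EXIT_TO_L1_NO_VECTOR;
	}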

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7f62dc36af9b..ab218da8b956 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1790,6 +1790,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 
 	/* exit controls */
 	nested_vmx_exit_ctls_low = 0;
+	/* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
 #ifdef CONFIG_X86_64
 	nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #else
@@ -3744,9 +3745,25 @@ out:
 	return ret;
 }
 
+/*
+ * In nested virtualization, check if L1 asked to exit on external interrupts.
+ * For most existing hypervisors, this will always return true.
+ */
+static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
+{
+	return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+		PIN_BASED_EXT_INTR_MASK;
+}
+
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
+	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+		/* We can get here when nested_run_pending caused
+		 * vmx_interrupt_allowed() to return false. In this case, do
+		 * nothing - the interrupt will be injected later.
+		 */
+		return;
 
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -3869,6 +3886,17 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
+	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+		struct vmcs12 *vmcs12;
+		if (to_vmx(vcpu)->nested.nested_run_pending)
+			return 0;
+		nested_vmx_vmexit(vcpu);
+		vmcs12 = get_vmcs12(vcpu);
+		vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
+		vmcs12->vm_exit_intr_info = 0;
+		/* fall through to normal code, but now in L1, not L2 */
+	}
+
 	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -5537,6 +5565,14 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	if (vmx->emulation_required && emulate_invalid_guest_state)
 		return handle_invalid_guest_state(vcpu);
 
+	/*
+	 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
+	 * we did not inject a still-pending event to L1 now because of
+	 * nested_run_pending, we need to re-enable this bit.
+	 */
+	if (vmx->nested.nested_run_pending)
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	if (exit_reason == EXIT_REASON_VMLAUNCH ||
 	    exit_reason == EXIT_REASON_VMRESUME)
 		vmx->nested.nested_run_pending = 1;

From 0b6ac343fc8e120b7d32fd2d51a8f81354086fa0 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:13:36 +0300
Subject: [PATCH 041/143] KVM: nVMX: Correct handling of exception injection

Similar to the previous patch, but concerning injection of exceptions rather
than external interrupts.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ab218da8b956..9604af7675e8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1585,6 +1585,25 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
 		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
 }
 
+/*
+ * KVM wants to inject page-faults which it got to the guest. This function
+ * checks whether in a nested guest, we need to inject them to L1 or L2.
+ * This function assumes it is called with the exit reason in vmcs02 being
+ * a #PF exception (this is the only case in which KVM injects a #PF when L2
+ * is running).
+ */
+static int nested_pf_handled(struct kvm_vcpu *vcpu)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	/* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
+	if (!(vmcs12->exception_bitmap & PF_VECTOR))
+		return 0;
+
+	nested_vmx_vmexit(vcpu);
+	return 1;
+}
+
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code,
 				bool reinject)
@@ -1592,6 +1611,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
+	if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
+		nested_pf_handled(vcpu))
+		return;
+
 	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -3820,6 +3843,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (is_guest_mode(vcpu))
+		return;
+
 	if (!cpu_has_virtual_nmis()) {
 		/*
 		 * Tracking the NMI-blocked state in software is built upon

From 66c78ae40cd0a7258d01ef433ede74e33e4adbbe Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:14:07 +0300
Subject: [PATCH 042/143] KVM: nVMX: Correct handling of idt vectoring info

This patch adds correct handling of IDT_VECTORING_INFO_FIELD for the nested
case.

When a guest exits while delivering an interrupt or exception, we get this
information in IDT_VECTORING_INFO_FIELD in the VMCS. When L2 exits to L1,
there's nothing we need to do, because L1 will see this field in vmcs12, and
handle it itself. However, when L2 exits and L0 handles the exit itself and
plans to return to L2, L0 must inject this event to L2.

In the normal, non-nested case, the idt_vectoring_info is discovered after
the exit, and the decision to inject (though not the injection itself) is made
at that point. However, in the nested case the decision of whether to return
to L2 or L1 also happens during the injection phase (see the previous
patches), so we can only decide what to do about the idt_vectoring_info right
after the injection, i.e., at the beginning of vmx_vcpu_run, which is the
first time we know for sure whether we're staying in L2.

Therefore, when we exit L2 (is_guest_mode(vcpu)), we disable the regular
vmx_complete_interrupts() code which queues the idt_vectoring_info for
injection on the next entry - because such injection would not be appropriate
if we later decide to exit to L1. Rather, we just save the idt_vectoring_info
and related fields in vmcs12 (which is a convenient place to save these
fields). On the next entry in vmx_vcpu_run (*after* the injection phase,
potentially exiting to L1 to inject an event requested by user space), if
we find ourselves in L1 we don't need to do anything with those values
we saved (as explained above). But if we find that we're in L2, or rather
*still* in L2 (nested_run_pending is not set - that flag would mean this is
the first round of L2 running right after L1 launched it), we need to inject
the event saved in those fields - by writing the appropriate VMCS fields.
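
Condensed into a single predicate, the rule being implemented looks roughly
like the sketch below (illustration only; the parameter names are invented):

	/*
	 * Illustration: should the idt_vectoring_info saved in vmcs12 be
	 * written back into the VM-entry fields at the top of vmx_vcpu_run?
	 * Only if we are *still* running L2 and L1 did not just launch it.
	 */
	static int replay_saved_vectoring(int in_guest_mode,
					  int nested_run_pending,
					  unsigned int saved_info,
					  unsigned int valid_mask)
	{
		return in_guest_mode && !nested_run_pending &&
		       (saved_info & valid_mask);
	}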

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9604af7675e8..1e6bd69ae433 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5797,6 +5797,8 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
 
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
+	if (is_guest_mode(&vmx->vcpu))
+		return;
 	__vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
 				  VM_EXIT_INSTRUCTION_LEN,
 				  IDT_VECTORING_ERROR_CODE);
@@ -5804,6 +5806,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 
 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 {
+	if (is_guest_mode(vcpu))
+		return;
 	__vmx_complete_interrupts(to_vmx(vcpu),
 				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
 				  VM_ENTRY_INSTRUCTION_LEN,
@@ -5824,6 +5828,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+		if (vmcs12->idt_vectoring_info_field &
+				VECTORING_INFO_VALID_MASK) {
+			vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+				vmcs12->idt_vectoring_info_field);
+			vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+				vmcs12->vm_exit_instruction_len);
+			if (vmcs12->idt_vectoring_info_field &
+					VECTORING_INFO_DELIVER_CODE_MASK)
+				vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+					vmcs12->idt_vectoring_error_code);
+		}
+	}
+
 	/* Record the guest's net vcpu time for enforced NMI injections. */
 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
 		vmx->entry_time = ktime_get();
@@ -5956,6 +5975,17 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
+	if (is_guest_mode(vcpu)) {
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+		vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
+		if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
+			vmcs12->idt_vectoring_error_code =
+				vmcs_read32(IDT_VECTORING_ERROR_CODE);
+			vmcs12->vm_exit_instruction_len =
+				vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+		}
+	}
+
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->loaded_vmcs->launched = 1;
 

From eeadf9e7558ce2c34c0d91985d26047a6e2245e7 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:14:38 +0300
Subject: [PATCH 043/143] KVM: nVMX: Handling of CR0 and CR4 modifying
 instructions

When L2 tries to modify CR0 or CR4 (with mov or clts), and modifies a bit
which L1 asked to shadow (via CR[04]_GUEST_HOST_MASK), we already do the right
thing: we let L1 handle the trap (see nested_vmx_exit_handled_cr() in a
previous patch).
When L2 modifies bits that L1 doesn't care about, we let it think (via
CR[04]_READ_SHADOW) that it did these modifications, while only changing
(in GUEST_CR[04]) the bits that L0 doesn't shadow.

This is needed for correct handling of CR0.TS for lazy FPU loading: L0 may
want to leave TS on, while pretending to allow the guest to change it.
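
The splitting described above is plain mask arithmetic. Below is a minimal
standalone illustration; the values are arbitrary, and only the formula
mirrors what handle_set_cr0() in this patch does:

	#include <stdio.h>

	int main(void)
	{
		unsigned long guest_owned = ~0x8ul;     /* L0 shadows only CR0.TS (lazy FPU) */
		unsigned long current_cr0 = 0x8000003b; /* L0 keeps TS set in the real CR0   */
		unsigned long l2_written  = 0x80000033; /* L2 tries to clear TS              */

		/* Only guest-owned bits take effect in the real CR0 ...              */
		unsigned long effective = (l2_written & guest_owned) |
					  (current_cr0 & ~guest_owned);
		/* ... while the read shadow makes L2 believe the whole write stuck. */
		unsigned long read_shadow = l2_written;

		printf("effective cr0 %#lx, read shadow %#lx\n", effective, read_shadow);
		return 0;
	}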

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 58 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 55 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1e6bd69ae433..6e9bebcdd5a6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4164,6 +4164,58 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 	hypercall[2] = 0xc1;
 }
 
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
+static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	if (to_vmx(vcpu)->nested.vmxon &&
+	    ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
+		return 1;
+
+	if (is_guest_mode(vcpu)) {
+		/*
+		 * We get here when L2 changed cr0 in a way that did not change
+		 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
+		 * but did change L0 shadowed bits. This can currently happen
+		 * with the TS bit: L0 may want to leave TS on (for lazy fpu
+		 * loading) while pretending to allow the guest to change it.
+		 */
+		if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) |
+			 (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits)))
+			return 1;
+		vmcs_writel(CR0_READ_SHADOW, val);
+		return 0;
+	} else
+		return kvm_set_cr0(vcpu, val);
+}
+
+static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	if (is_guest_mode(vcpu)) {
+		if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) |
+			 (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits)))
+			return 1;
+		vmcs_writel(CR4_READ_SHADOW, val);
+		return 0;
+	} else
+		return kvm_set_cr4(vcpu, val);
+}
+
+/* called to set cr0 as appropriate for a clts instruction exit. */
+static void handle_clts(struct kvm_vcpu *vcpu)
+{
+	if (is_guest_mode(vcpu)) {
+		/*
+		 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
+		 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
+		 * just pretend it's off (also in arch.cr0 for fpu_activate).
+		 */
+		vmcs_writel(CR0_READ_SHADOW,
+			vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
+		vcpu->arch.cr0 &= ~X86_CR0_TS;
+	} else
+		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+}
+
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification, val;
@@ -4180,7 +4232,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		trace_kvm_cr_write(cr, val);
 		switch (cr) {
 		case 0:
-			err = kvm_set_cr0(vcpu, val);
+			err = handle_set_cr0(vcpu, val);
 			kvm_complete_insn_gp(vcpu, err);
 			return 1;
 		case 3:
@@ -4188,7 +4240,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 			kvm_complete_insn_gp(vcpu, err);
 			return 1;
 		case 4:
-			err = kvm_set_cr4(vcpu, val);
+			err = handle_set_cr4(vcpu, val);
 			kvm_complete_insn_gp(vcpu, err);
 			return 1;
 		case 8: {
@@ -4206,7 +4258,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		};
 		break;
 	case 2: /* clts */
-		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+		handle_clts(vcpu);
 		trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
 		skip_emulated_instruction(vcpu);
 		vmx_fpu_activate(vcpu);

From 36cf24e01e9eba8c9ea201202762081ced2f8cdf Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:15:08 +0300
Subject: [PATCH 044/143] KVM: nVMX: Further fixes for lazy FPU loading

KVM's "Lazy FPU loading" means that sometimes L0 needs to set CR0.TS, even
if a guest didn't set it. Moreover, L0 must also trap CR0.TS changes and
NM exceptions, even if we have a guest hypervisor (L1) who didn't want these
traps. And of course, conversely: If L1 wanted to trap these events, we
must let it, even if L0 is not interested in them.

This patch fixes some existing KVM code (in update_exception_bitmap(),
vmx_fpu_activate(), vmx_fpu_deactivate()) to do the correct merging of L0's
and L1's needs. Note that handle_cr() was already fixed in the above patch,
and that new code introduced in previous patches already handles CR0
correctly (see prepare_vmcs02(), prepare_vmcs12(), and nested_vmx_vmexit()).
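
The merge rules boil down to two one-liners; a sketch (not the kernel code,
function names invented):

	/* EXCEPTION_BITMAP while L2 runs: trap what L0 needs *or* L1 asked for. */
	static unsigned int merged_exception_bitmap(unsigned int l0_eb,
						    unsigned int l1_eb)
	{
		return l0_eb | l1_eb;
	}

	/*
	 * CR0 guest-owned bits while L2 runs: the guest may own only bits that
	 * both L0 and L1 leave unshadowed (cr0_guest_host_mask marks the bits
	 * L1 shadows).
	 */
	static unsigned long merged_cr0_guest_owned(unsigned long l0_owned,
						    unsigned long l1_guest_host_mask)
	{
		return l0_owned & ~l1_guest_host_mask;
	}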

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6e9bebcdd5a6..10b22619e7ec 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1179,6 +1179,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 	if (vcpu->fpu_active)
 		eb &= ~(1u << NM_VECTOR);
+
+	/* When we are running a nested L2 guest and L1 specified for it a
+	 * certain exception bitmap, we must trap the same exceptions and pass
+	 * them to L1. When running L2, we will only handle the exceptions
+	 * specified above if L1 did not want them.
+	 */
+	if (is_guest_mode(vcpu))
+		eb |= get_vmcs12(vcpu)->exception_bitmap;
+
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
@@ -1473,6 +1482,9 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
 	vmcs_writel(GUEST_CR0, cr0);
 	update_exception_bitmap(vcpu);
 	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+	if (is_guest_mode(vcpu))
+		vcpu->arch.cr0_guest_owned_bits &=
+			~get_vmcs12(vcpu)->cr0_guest_host_mask;
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 }
 
@@ -1496,12 +1508,29 @@ static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
 
 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
+	/* Note that there is no vcpu->fpu_active = 0 here. The caller must
+	 * set this *before* calling this function.
+	 */
 	vmx_decache_cr0_guest_bits(vcpu);
 	vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
 	update_exception_bitmap(vcpu);
 	vcpu->arch.cr0_guest_owned_bits = 0;
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-	vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+	if (is_guest_mode(vcpu)) {
+		/*
+		 * L1's specified read shadow might not contain the TS bit,
+		 * so now that we turned on shadowing of this bit, we need to
+		 * set this bit of the shadow. Like in nested_vmx_run we need
+		 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
+		 * up-to-date here because we just decached cr0.TS (and we'll
+		 * only update vmcs12->guest_cr0 on nested exit).
+		 */
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+		vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
+			(vcpu->arch.cr0 & X86_CR0_TS);
+		vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+	} else
+		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
 }
 
 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)

From 7991825b8558a719eb7cfb93c4458d767ae1f2eb Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:15:39 +0300
Subject: [PATCH 045/143] KVM: nVMX: Additional TSC-offset handling

In the unlikely case that L1 does not trap writes to MSR_IA32_TSC, L0 needs to
emulate such an MSR write by L2 by modifying vmcs02.tsc_offset. We also need to
set vmcs12.tsc_offset, for this change to survive the next nested entry (see
prepare_vmcs02()).
Additionally, we need to modify vmx_adjust_tsc_offset: the semantics of this
function are that the TSCs of all guests on this vcpu, L1 and possibly
several L2s, need to be adjusted. To do this, we need to adjust vmcs01's
tsc_offset (this offset will also apply to each L2 we enter). We can't set
vmcs01 now, so we have to remember this adjustment and apply it when we
later exit to L1.
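
As a worked example of the offset arithmetic (assuming vmcs02's offset is the
sum of vmcs01's and vmcs12's offsets, which is what the new
vmx_write_tsc_offset() code implies; the numbers are arbitrary):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		int64_t vmcs01_tsc_offset = 1000;  /* L0's offset for L1 */
		int64_t vmcs12_tsc_offset = 500;   /* L1's offset for L2 */

		/* While L2 runs, vmcs02 carries the sum of both offsets. */
		int64_t vmcs02_tsc_offset = vmcs01_tsc_offset + vmcs12_tsc_offset;

		/*
		 * If L2 writes the TSC MSR and L1 does not trap it, L0 computes
		 * a new vmcs02 offset and derives the vmcs12 part from it, as
		 * vmx_write_tsc_offset() in this patch does:
		 */
		int64_t new_vmcs02_offset = 1800;
		vmcs12_tsc_offset = new_vmcs02_offset - vmcs01_tsc_offset; /* 800 */

		printf("old vmcs02 %lld, new vmcs12 %lld\n",
		       (long long)vmcs02_tsc_offset, (long long)vmcs12_tsc_offset);
		return 0;
	}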

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 10b22619e7ec..afc1f06907ce 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1766,12 +1766,24 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
 	vmcs_write64(TSC_OFFSET, offset);
+	if (is_guest_mode(vcpu))
+		/*
+		 * We're here if L1 chose not to trap the TSC MSR. Since
+		 * prepare_vmcs12() does not copy tsc_offset, we need to also
+		 * set the vmcs12 field here.
+		 */
+		get_vmcs12(vcpu)->tsc_offset = offset -
+			to_vmx(vcpu)->nested.vmcs01_tsc_offset;
 }
 
 static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
 {
 	u64 offset = vmcs_read64(TSC_OFFSET);
 	vmcs_write64(TSC_OFFSET, offset + adjustment);
+	if (is_guest_mode(vcpu)) {
+		/* Even when running L2, the adjustment needs to apply to L1 */
+		to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
+	}
 }
 
 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)

From 7b8050f570a03718d21fc8662c54586192ea2dac Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:16:10 +0300
Subject: [PATCH 046/143] KVM: nVMX: Add VMX to list of supported cpuid
 features

If the "nested" module option is enabled, add the "VMX" CPU feature to the
list of CPU features KVM advertises with the KVM_GET_SUPPORTED_CPUID ioctl.

Qemu uses this ioctl, and intersects KVM's list with its own list of desired
cpu features (depending on the -cpu option given to qemu) to determine the
final list of features presented to the guest.
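
A rough sketch of that intersection (this is not qemu's actual code, just the
shape of the computation):

	/*
	 * Illustration only: a feature reaches the guest only if both KVM and
	 * qemu list it, so X86_FEATURE_VMX shows up only when nested=1 made
	 * KVM advertise it in KVM_GET_SUPPORTED_CPUID.
	 */
	static unsigned int guest_cpuid_ecx(unsigned int kvm_supported_ecx,
					    unsigned int qemu_requested_ecx)
	{
		return kvm_supported_ecx & qemu_requested_ecx;
	}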

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index afc1f06907ce..a600fd7795f1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6299,6 +6299,8 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 {
+	if (func == 1 && nested)
+		entry->ecx |= bit(X86_FEATURE_VMX);
 }
 
 /*

From 2844d8490523c5768cc37f21e065c76c45232724 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:16:40 +0300
Subject: [PATCH 047/143] KVM: nVMX: Miscellaneous small corrections

Small corrections of KVM (spelling, etc.) not directly related to nested VMX.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a600fd7795f1..54e732d74fb1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -959,7 +959,7 @@ static void vmcs_load(struct vmcs *vmcs)
 			: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
 			: "cc", "memory");
 	if (error)
-		printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+		printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
 		       vmcs, phys_addr);
 }
 

From 823e396558e509b7c3225cd76806f3d6643ff5f8 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Wed, 25 May 2011 23:17:11 +0300
Subject: [PATCH 048/143] KVM: nVMX: Documentation

This patch includes a brief introduction to the nested vmx feature in the
Documentation/kvm directory. The document also includes a copy of the
vmcs12 structure, as requested by Avi Kivity.

[marcelo: move to Documentation/virtual/kvm]

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/virtual/kvm/nested-vmx.txt | 251 +++++++++++++++++++++++
 1 file changed, 251 insertions(+)
 create mode 100644 Documentation/virtual/kvm/nested-vmx.txt

diff --git a/Documentation/virtual/kvm/nested-vmx.txt b/Documentation/virtual/kvm/nested-vmx.txt
new file mode 100644
index 000000000000..8ed937de1163
--- /dev/null
+++ b/Documentation/virtual/kvm/nested-vmx.txt
@@ -0,0 +1,251 @@
+Nested VMX
+==========
+
+Overview
+---------
+
+On Intel processors, KVM uses Intel's VMX (Virtual-Machine eXtensions)
+to easily and efficiently run guest operating systems. Normally, these guests
+*cannot* themselves be hypervisors running their own guests, because in VMX,
+guests cannot use VMX instructions.
+
+The "Nested VMX" feature adds this missing capability - of running guest
+hypervisors (which use VMX) with their own nested guests. It does so by
+allowing a guest to use VMX instructions, and correctly and efficiently
+emulating them using the single level of VMX available in the hardware.
+
+We describe in much greater detail the theory behind the nested VMX feature,
+its implementation and its performance characteristics, in the OSDI 2010 paper
+"The Turtles Project: Design and Implementation of Nested Virtualization",
+available at:
+
+	http://www.usenix.org/events/osdi10/tech/full_papers/Ben-Yehuda.pdf
+
+
+Terminology
+-----------
+
+Single-level virtualization has two levels - the host (KVM) and the guests.
+In nested virtualization, we have three levels: The host (KVM), which we call
+L0, the guest hypervisor, which we call L1, and its nested guest, which we
+call L2.
+
+
+Known limitations
+-----------------
+
+The current code supports running Linux guests under KVM guests.
+Only 64-bit guest hypervisors are supported.
+
+Additional patches for running Windows under guest KVM, and Linux under
+guest VMware server, and support for nested EPT, are currently running in
+the lab, and will be sent as follow-on patchsets.
+
+
+Running nested VMX
+------------------
+
+The nested VMX feature is disabled by default. It can be enabled by giving
+the "nested=1" option to the kvm-intel module.
+
+No modifications are required to user space (qemu). However, qemu's default
+emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be
+explicitly enabled, by giving qemu one of the following options:
+
+     -cpu host              (emulated CPU has all features of the real CPU)
+
+     -cpu qemu64,+vmx       (add just the vmx feature to a named CPU type)
+
+
+ABIs
+----
+
+Nested VMX aims to present a standard and (eventually) fully-functional VMX
+implementation for a guest hypervisor to use. As such, the official
+specification of the ABI that it provides is Intel's VMX specification,
+namely volume 3B of their "Intel 64 and IA-32 Architectures Software
+Developer's Manual". Not all of VMX's features are currently fully supported,
+but the goal is to eventually support them all, starting with the VMX features
+which are used in practice by popular hypervisors (KVM and others).
+
+As a VMX implementation, nested VMX presents a VMCS structure to L1.
+As mandated by the spec, other than the two fields revision_id and abort,
+this structure is *opaque* to its user, who is not supposed to know or care
+about its internal structure. Rather, the structure is accessed through the
+VMREAD and VMWRITE instructions.
+Still, for debugging purposes, KVM developers might be interested in the
+internals of this structure; this is struct vmcs12 from arch/x86/kvm/vmx.c.
+
+The name "vmcs12" refers to the VMCS that L1 builds for L2. In the code we
+also have "vmcs01", the VMCS that L0 built for L1, and "vmcs02" is the VMCS
+which L0 builds to actually run L2 - how this is done is explained in the
+aforementioned paper.
+
+For convenience, we repeat the content of struct vmcs12 here. If the internals
+of this structure change, live migration across KVM versions can break.
+VMCS12_REVISION (from vmx.c) should be changed if struct vmcs12 or its inner
+struct shadow_vmcs is ever changed.
+
+	typedef u64 natural_width;
+	struct __packed vmcs12 {
+		/* According to the Intel spec, a VMCS region must start with
+		 * these two user-visible fields */
+		u32 revision_id;
+		u32 abort;
+
+		u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
+		u32 padding[7]; /* room for future expansion */
+
+		u64 io_bitmap_a;
+		u64 io_bitmap_b;
+		u64 msr_bitmap;
+		u64 vm_exit_msr_store_addr;
+		u64 vm_exit_msr_load_addr;
+		u64 vm_entry_msr_load_addr;
+		u64 tsc_offset;
+		u64 virtual_apic_page_addr;
+		u64 apic_access_addr;
+		u64 ept_pointer;
+		u64 guest_physical_address;
+		u64 vmcs_link_pointer;
+		u64 guest_ia32_debugctl;
+		u64 guest_ia32_pat;
+		u64 guest_ia32_efer;
+		u64 guest_pdptr0;
+		u64 guest_pdptr1;
+		u64 guest_pdptr2;
+		u64 guest_pdptr3;
+		u64 host_ia32_pat;
+		u64 host_ia32_efer;
+		u64 padding64[8]; /* room for future expansion */
+		natural_width cr0_guest_host_mask;
+		natural_width cr4_guest_host_mask;
+		natural_width cr0_read_shadow;
+		natural_width cr4_read_shadow;
+		natural_width cr3_target_value0;
+		natural_width cr3_target_value1;
+		natural_width cr3_target_value2;
+		natural_width cr3_target_value3;
+		natural_width exit_qualification;
+		natural_width guest_linear_address;
+		natural_width guest_cr0;
+		natural_width guest_cr3;
+		natural_width guest_cr4;
+		natural_width guest_es_base;
+		natural_width guest_cs_base;
+		natural_width guest_ss_base;
+		natural_width guest_ds_base;
+		natural_width guest_fs_base;
+		natural_width guest_gs_base;
+		natural_width guest_ldtr_base;
+		natural_width guest_tr_base;
+		natural_width guest_gdtr_base;
+		natural_width guest_idtr_base;
+		natural_width guest_dr7;
+		natural_width guest_rsp;
+		natural_width guest_rip;
+		natural_width guest_rflags;
+		natural_width guest_pending_dbg_exceptions;
+		natural_width guest_sysenter_esp;
+		natural_width guest_sysenter_eip;
+		natural_width host_cr0;
+		natural_width host_cr3;
+		natural_width host_cr4;
+		natural_width host_fs_base;
+		natural_width host_gs_base;
+		natural_width host_tr_base;
+		natural_width host_gdtr_base;
+		natural_width host_idtr_base;
+		natural_width host_ia32_sysenter_esp;
+		natural_width host_ia32_sysenter_eip;
+		natural_width host_rsp;
+		natural_width host_rip;
+		natural_width paddingl[8]; /* room for future expansion */
+		u32 pin_based_vm_exec_control;
+		u32 cpu_based_vm_exec_control;
+		u32 exception_bitmap;
+		u32 page_fault_error_code_mask;
+		u32 page_fault_error_code_match;
+		u32 cr3_target_count;
+		u32 vm_exit_controls;
+		u32 vm_exit_msr_store_count;
+		u32 vm_exit_msr_load_count;
+		u32 vm_entry_controls;
+		u32 vm_entry_msr_load_count;
+		u32 vm_entry_intr_info_field;
+		u32 vm_entry_exception_error_code;
+		u32 vm_entry_instruction_len;
+		u32 tpr_threshold;
+		u32 secondary_vm_exec_control;
+		u32 vm_instruction_error;
+		u32 vm_exit_reason;
+		u32 vm_exit_intr_info;
+		u32 vm_exit_intr_error_code;
+		u32 idt_vectoring_info_field;
+		u32 idt_vectoring_error_code;
+		u32 vm_exit_instruction_len;
+		u32 vmx_instruction_info;
+		u32 guest_es_limit;
+		u32 guest_cs_limit;
+		u32 guest_ss_limit;
+		u32 guest_ds_limit;
+		u32 guest_fs_limit;
+		u32 guest_gs_limit;
+		u32 guest_ldtr_limit;
+		u32 guest_tr_limit;
+		u32 guest_gdtr_limit;
+		u32 guest_idtr_limit;
+		u32 guest_es_ar_bytes;
+		u32 guest_cs_ar_bytes;
+		u32 guest_ss_ar_bytes;
+		u32 guest_ds_ar_bytes;
+		u32 guest_fs_ar_bytes;
+		u32 guest_gs_ar_bytes;
+		u32 guest_ldtr_ar_bytes;
+		u32 guest_tr_ar_bytes;
+		u32 guest_interruptibility_info;
+		u32 guest_activity_state;
+		u32 guest_sysenter_cs;
+		u32 host_ia32_sysenter_cs;
+		u32 padding32[8]; /* room for future expansion */
+		u16 virtual_processor_id;
+		u16 guest_es_selector;
+		u16 guest_cs_selector;
+		u16 guest_ss_selector;
+		u16 guest_ds_selector;
+		u16 guest_fs_selector;
+		u16 guest_gs_selector;
+		u16 guest_ldtr_selector;
+		u16 guest_tr_selector;
+		u16 host_es_selector;
+		u16 host_cs_selector;
+		u16 host_ss_selector;
+		u16 host_ds_selector;
+		u16 host_fs_selector;
+		u16 host_gs_selector;
+		u16 host_tr_selector;
+	};
+
+
+Authors
+-------
+
+These patches were written by:
+     Abel Gordon, abelg <at> il.ibm.com
+     Nadav Har'El, nyh <at> il.ibm.com
+     Orit Wasserman, oritw <at> il.ibm.com
+     Ben-Ami Yassor, benami <at> il.ibm.com
+     Muli Ben-Yehuda, muli <at> il.ibm.com
+
+With contributions by:
+     Anthony Liguori, aliguori <at> us.ibm.com
+     Mike Day, mdday <at> us.ibm.com
+     Michael Factor, factor <at> il.ibm.com
+     Zvi Dubitzky, dubi <at> il.ibm.com
+
+And valuable reviews by:
+     Avi Kivity, avi <at> redhat.com
+     Gleb Natapov, gleb <at> redhat.com
+     Marcelo Tosatti, mtosatti <at> redhat.com
+     Kevin Tian, kevin.tian <at> intel.com
+     and others.

From 55399a02e90fdc6cd45165b2df5dd97b7c3f018f Mon Sep 17 00:00:00 2001
From: Sasha Levin <levinsasha928@gmail.com>
Date: Sat, 28 May 2011 14:12:30 +0300
Subject: [PATCH 049/143] KVM: Document KVM_IOEVENTFD

Document KVM_IOEVENTFD, which can be used to receive
notifications of PIO/MMIO events without triggering
an exit.
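
For readers of the new documentation, a minimal userspace sketch of
registering a PIO ioeventfd could look like the following (error handling
trimmed; it assumes a VM fd obtained earlier via KVM_CREATE_VM):

	#include <string.h>
	#include <sys/eventfd.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Sketch only: signal 'efd' whenever the guest writes 'value' to 'port'. */
	int register_pio_ioeventfd(int vm_fd, unsigned short port, unsigned int value)
	{
		int efd = eventfd(0, 0);
		struct kvm_ioeventfd ioev;

		memset(&ioev, 0, sizeof(ioev));
		ioev.addr      = port;       /* legal pio address          */
		ioev.len       = 4;          /* match 4-byte writes        */
		ioev.datamatch = value;      /* only signal on this value  */
		ioev.fd        = efd;
		ioev.flags     = KVM_IOEVENTFD_FLAG_PIO |
				 KVM_IOEVENTFD_FLAG_DATAMATCH;

		if (ioctl(vm_fd, KVM_IOEVENTFD, &ioev) < 0)
			return -1;
		return efd;	/* poll/read this fd to observe guest writes */
	}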

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 67cc0f5b9974..3755a390f886 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1323,6 +1323,36 @@ struct kvm_lapic_state {
 Copies the input argument into the the Local APIC registers.  The data format
 and layout are the same as documented in the architecture manual.
 
+4.56 KVM_IOEVENTFD
+
+Capability: KVM_CAP_IOEVENTFD
+Architectures: all
+Type: vm ioctl
+Parameters: struct kvm_ioeventfd (in)
+Returns: 0 on success, !0 on error
+
+This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
+within the guest.  A guest write to the registered address will signal the
+provided event instead of triggering an exit.
+
+struct kvm_ioeventfd {
+	__u64 datamatch;
+	__u64 addr;        /* legal pio/mmio address */
+	__u32 len;         /* 1, 2, 4, or 8 bytes    */
+	__s32 fd;
+	__u32 flags;
+	__u8  pad[36];
+};
+
+The following flags are defined:
+
+#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
+#define KVM_IOEVENTFD_FLAG_PIO       (1 << kvm_ioeventfd_flag_nr_pio)
+#define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
+
+If the datamatch flag is set, the event will be signaled only if the value
+written to the registered address equals datamatch in struct kvm_ioeventfd.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by

From 9d74191ab1ea857d1cc27e439316eebf8ae46d19 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Sun, 29 May 2011 21:53:48 +0900
Subject: [PATCH 050/143] KVM: x86 emulator: Use the pointers ctxt and c
 consistently

We should use the local variables ctxt and c where emulate_ctxt and
decode appear many times.  At the very least, we need to be consistent about
how we use these in a function.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c |  6 ++---
 arch/x86/kvm/x86.c     | 59 +++++++++++++++++++++---------------------
 2 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index bc916bd22e86..6c054f847319 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3707,7 +3707,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 	int saved_dst_type = c->dst.type;
 	int irq; /* Used for int 3, int, and into */
 
-	ctxt->decode.mem_read.pos = 0;
+	c->mem_read.pos = 0;
 
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
 		rc = emulate_ud(ctxt);
@@ -4092,7 +4092,7 @@ writeback:
 				&c->dst);
 
 	if (c->rep_prefix && (c->d & String)) {
-		struct read_cache *r = &ctxt->decode.io_read;
+		struct read_cache *r = &c->io_read;
 		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
 
 		if (!string_insn_completed(ctxt)) {
@@ -4107,7 +4107,7 @@ writeback:
 				 * decode, but since instruction is restarted
 				 * we have to do it here.
 				 */
-				ctxt->decode.mem_read.end = 0;
+				c->mem_read.end = 0;
 				return EMULATION_RESTART;
 			}
 			goto done; /* skip rip writeback */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index de262a086863..39d8b043580f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4552,24 +4552,24 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 {
-	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+	struct decode_cache *c = &ctxt->decode;
 	int ret;
 
 	init_emulate_ctxt(vcpu);
 
-	vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
-	vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
-	vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip +
-								 inc_eip;
-	ret = emulate_int_real(&vcpu->arch.emulate_ctxt, irq);
+	c->op_bytes = 2;
+	c->ad_bytes = 2;
+	c->eip = ctxt->eip + inc_eip;
+	ret = emulate_int_real(ctxt, irq);
 
 	if (ret != X86EMUL_CONTINUE)
 		return EMULATE_FAIL;
 
-	vcpu->arch.emulate_ctxt.eip = c->eip;
+	ctxt->eip = c->eip;
 	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
-	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
-	kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	kvm_rip_write(vcpu, ctxt->eip);
+	kvm_set_rflags(vcpu, ctxt->eflags);
 
 	if (irq == NMI_VECTOR)
 		vcpu->arch.nmi_pending = false;
@@ -4630,21 +4630,22 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 			    int insn_len)
 {
 	int r;
-	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+	struct decode_cache *c = &ctxt->decode;
 	bool writeback = true;
 
 	kvm_clear_exception_queue(vcpu);
 
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 		init_emulate_ctxt(vcpu);
-		vcpu->arch.emulate_ctxt.interruptibility = 0;
-		vcpu->arch.emulate_ctxt.have_exception = false;
-		vcpu->arch.emulate_ctxt.perm_ok = false;
+		ctxt->interruptibility = 0;
+		ctxt->have_exception = false;
+		ctxt->perm_ok = false;
 
-		vcpu->arch.emulate_ctxt.only_vendor_specific_insn
+		ctxt->only_vendor_specific_insn
 			= emulation_type & EMULTYPE_TRAP_UD;
 
-		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
+		r = x86_decode_insn(ctxt, insn, insn_len);
 
 		trace_kvm_emulate_insn_start(vcpu);
 		++vcpu->stat.insn_emulation;
@@ -4660,7 +4661,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	}
 
 	if (emulation_type & EMULTYPE_SKIP) {
-		kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
+		kvm_rip_write(vcpu, c->eip);
 		return EMULATE_DONE;
 	}
 
@@ -4672,7 +4673,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	}
 
 restart:
-	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
+	r = x86_emulate_insn(ctxt);
 
 	if (r == EMULATION_INTERCEPTED)
 		return EMULATE_DONE;
@@ -4684,7 +4685,7 @@ restart:
 		return handle_emulation_failure(vcpu);
 	}
 
-	if (vcpu->arch.emulate_ctxt.have_exception) {
+	if (ctxt->have_exception) {
 		inject_emulated_exception(vcpu);
 		r = EMULATE_DONE;
 	} else if (vcpu->arch.pio.count) {
@@ -4703,13 +4704,12 @@ restart:
 		r = EMULATE_DONE;
 
 	if (writeback) {
-		toggle_interruptibility(vcpu,
-				vcpu->arch.emulate_ctxt.interruptibility);
-		kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+		toggle_interruptibility(vcpu, ctxt->interruptibility);
+		kvm_set_rflags(vcpu, ctxt->eflags);
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
 		memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
-		kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+		kvm_rip_write(vcpu, ctxt->eip);
 	} else
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
 
@@ -5130,8 +5130,7 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 
 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
-	return emulator_write_emulated(&vcpu->arch.emulate_ctxt,
-				       rip, instruction, 3, NULL);
+	return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
 }
 
 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
@@ -5849,21 +5848,21 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 		    bool has_error_code, u32 error_code)
 {
-	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+	struct decode_cache *c = &ctxt->decode;
 	int ret;
 
 	init_emulate_ctxt(vcpu);
 
-	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
-				   tss_selector, reason, has_error_code,
-				   error_code);
+	ret = emulator_task_switch(ctxt, tss_selector, reason,
+				   has_error_code, error_code);
 
 	if (ret)
 		return EMULATE_FAIL;
 
 	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
-	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
-	kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	kvm_rip_write(vcpu, ctxt->eip);
+	kvm_set_rflags(vcpu, ctxt->eflags);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	return EMULATE_DONE;
 }

From e01991e71a179ddab494c8e02100ad73bc0010c4 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Sun, 29 May 2011 21:55:10 +0900
Subject: [PATCH 051/143] KVM: x86 emulator: Rename emulate_xxx() to em_xxx()

The next patch will change these to be called by opcode::execute.
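
Roughly, the mechanism referred to here works as in the sketch below; the
types and the dispatcher are simplified stand-ins, not the real emulator
structures:

	struct emu_ctxt;                        /* stand-in for x86_emulate_ctxt */
	typedef int (*execute_fn)(struct emu_ctxt *ctxt);

	struct opcode_entry {
		unsigned long flags;
		execute_fn execute;     /* NULL: still handled by the big switch */
	};

	static int em_iret(struct emu_ctxt *ctxt) { /* ... */ return 0; }

	/* An I()/II()-style table entry simply fills in ->execute ...           */
	static const struct opcode_entry entry_0xcf = { .flags = 0, .execute = em_iret };

	/* ... and the emulator prefers that callback over the switch statement. */
	static int dispatch(struct emu_ctxt *ctxt, const struct opcode_entry *op)
	{
		if (op->execute)
			return op->execute(ctxt);
		/* fall back to the per-opcode switch for unconverted opcodes */
		return 0;
	}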

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6c054f847319..4af94424fe87 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1656,7 +1656,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 	return rc;
 }
 
-static int emulate_iret(struct x86_emulate_ctxt *ctxt)
+static int em_iret(struct x86_emulate_ctxt *ctxt)
 {
 	switch(ctxt->mode) {
 	case X86EMUL_MODE_REAL:
@@ -1816,7 +1816,7 @@ static int em_grp9(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
-static int emulate_ret_far(struct x86_emulate_ctxt *ctxt)
+static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
@@ -1880,7 +1880,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 	ss->p = 1;
 }
 
-static int emulate_syscall(struct x86_emulate_ctxt *ctxt)
+static int em_syscall(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -1933,7 +1933,7 @@ static int emulate_syscall(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
-static int emulate_sysenter(struct x86_emulate_ctxt *ctxt)
+static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -1989,7 +1989,7 @@ static int emulate_sysenter(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
-static int emulate_sysexit(struct x86_emulate_ctxt *ctxt)
+static int em_sysexit(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -3948,7 +3948,7 @@ special_insn:
 		rc = emulate_load_segment(ctxt, VCPU_SREG_DS);
 		break;
 	case 0xcb:		/* ret far */
-		rc = emulate_ret_far(ctxt);
+		rc = em_ret_far(ctxt);
 		break;
 	case 0xcc:		/* int3 */
 		irq = 3;
@@ -3965,7 +3965,7 @@ special_insn:
 		}
 		break;
 	case 0xcf:		/* iret */
-		rc = emulate_iret(ctxt);
+		rc = em_iret(ctxt);
 		break;
 	case 0xd0 ... 0xd1:	/* Grp2 */
 		rc = em_grp2(ctxt);
@@ -4127,7 +4127,7 @@ done:
 twobyte_insn:
 	switch (c->b) {
 	case 0x05: 		/* syscall */
-		rc = emulate_syscall(ctxt);
+		rc = em_syscall(ctxt);
 		break;
 	case 0x06:
 		rc = em_clts(ctxt);
@@ -4189,10 +4189,10 @@ twobyte_insn:
 		rc = X86EMUL_CONTINUE;
 		break;
 	case 0x34:		/* sysenter */
-		rc = emulate_sysenter(ctxt);
+		rc = em_sysenter(ctxt);
 		break;
 	case 0x35:		/* sysexit */
-		rc = emulate_sysexit(ctxt);
+		rc = em_sysexit(ctxt);
 		break;
 	case 0x40 ... 0x4f:	/* cmov */
 		c->dst.val = c->dst.orig_val = c->src.val;

From db5b0762f3cab58398f16379ab37ef66ef9ba497 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Sun, 29 May 2011 21:56:26 +0900
Subject: [PATCH 052/143] KVM: x86 emulator: Use opcode::execute for some
 instructions

Move the following functions to the opcode tables:

  RET (Far return) : CB
  IRET             : CF
  JMP (Jump far)   : EA

  SYSCALL          : 0F 05
  CLTS             : 0F 06
  SYSENTER         : 0F 34
  SYSEXIT          : 0F 35

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 37 ++++++++-----------------------------
 1 file changed, 8 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4af94424fe87..136bc6cbd5fa 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3169,9 +3169,9 @@ static struct opcode opcode_table[256] = {
 	D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
 	G(ByteOp, group11), G(0, group11),
 	/* 0xC8 - 0xCF */
-	N, N, N, D(ImplicitOps | Stack),
+	N, N, N, I(ImplicitOps | Stack, em_ret_far),
 	D(ImplicitOps), DI(SrcImmByte, intn),
-	D(ImplicitOps | No64), DI(ImplicitOps, iret),
+	D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
 	/* 0xD0 - 0xD7 */
 	D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
 	N, N, N, N,
@@ -3183,7 +3183,7 @@ static struct opcode opcode_table[256] = {
 	D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
 	/* 0xE8 - 0xEF */
 	D(SrcImm | Stack), D(SrcImm | ImplicitOps),
-	D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
+	I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
 	D2bvIP(SrcDX | DstAcc, in,  check_perm_in),
 	D2bvIP(SrcAcc | DstDX, out, check_perm_out),
 	/* 0xF0 - 0xF7 */
@@ -3198,7 +3198,8 @@ static struct opcode opcode_table[256] = {
 static struct opcode twobyte_table[256] = {
 	/* 0x00 - 0x0F */
 	G(0, group6), GD(0, &group7), N, N,
-	N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N,
+	N, I(ImplicitOps | VendorSpecific, em_syscall),
+	II(ImplicitOps | Priv, em_clts, clts), N,
 	DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
 	N, D(ImplicitOps | ModRM), N, N,
 	/* 0x10 - 0x1F */
@@ -3215,7 +3216,8 @@ static struct opcode twobyte_table[256] = {
 	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
 	DI(ImplicitOps | Priv, rdmsr),
 	DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
-	D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
+	I(ImplicitOps | VendorSpecific, em_sysenter),
+	I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
 	N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x40 - 0x4F */
@@ -3947,9 +3949,6 @@ special_insn:
 	case 0xc5:		/* lds */
 		rc = emulate_load_segment(ctxt, VCPU_SREG_DS);
 		break;
-	case 0xcb:		/* ret far */
-		rc = em_ret_far(ctxt);
-		break;
 	case 0xcc:		/* int3 */
 		irq = 3;
 		goto do_interrupt;
@@ -3964,9 +3963,6 @@ special_insn:
 			goto do_interrupt;
 		}
 		break;
-	case 0xcf:		/* iret */
-		rc = em_iret(ctxt);
-		break;
 	case 0xd0 ... 0xd1:	/* Grp2 */
 		rc = em_grp2(ctxt);
 		break;
@@ -3998,12 +3994,7 @@ special_insn:
 		break;
 	}
 	case 0xe9: /* jmp rel */
-		goto jmp;
-	case 0xea: /* jmp far */
-		rc = em_jmp_far(ctxt);
-		break;
-	case 0xeb:
-	      jmp:		/* jmp rel short */
+	case 0xeb: /* jmp rel short */
 		jmp_rel(c, c->src.val);
 		c->dst.type = OP_NONE; /* Disable writeback. */
 		break;
@@ -4126,12 +4117,6 @@ done:
 
 twobyte_insn:
 	switch (c->b) {
-	case 0x05: 		/* syscall */
-		rc = em_syscall(ctxt);
-		break;
-	case 0x06:
-		rc = em_clts(ctxt);
-		break;
 	case 0x09:		/* wbinvd */
 		(ctxt->ops->wbinvd)(ctxt);
 		break;
@@ -4188,12 +4173,6 @@ twobyte_insn:
 		}
 		rc = X86EMUL_CONTINUE;
 		break;
-	case 0x34:		/* sysenter */
-		rc = em_sysenter(ctxt);
-		break;
-	case 0x35:		/* sysexit */
-		rc = em_sysexit(ctxt);
-		break;
 	case 0x40 ... 0x4f:	/* cmov */
 		c->dst.val = c->dst.orig_val = c->src.val;
 		if (!test_cc(c->b, ctxt->eflags))

From 9f21ca599cd609502de8a56c1d4c4688d40abb2d Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Sun, 29 May 2011 21:57:53 +0900
Subject: [PATCH 053/143] KVM: x86 emulator: Use opcode::execute for
 TEST(84/85, A8/A9)

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 136bc6cbd5fa..d25cfc238b90 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2605,6 +2605,14 @@ static int em_cmp(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_test(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+	return X86EMUL_CONTINUE;
+}
+
 static int em_imul(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
@@ -3135,7 +3143,8 @@ static struct opcode opcode_table[256] = {
 	G(DstMem | SrcImm | ModRM | Group, group1),
 	G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
 	G(DstMem | SrcImmByte | ModRM | Group, group1),
-	D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
+	I2bv(DstMem | SrcReg | ModRM, em_test),
+	D2bv(DstMem | SrcReg | ModRM | Lock),
 	/* 0x88 - 0x8F */
 	I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
 	I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
@@ -3154,7 +3163,7 @@ static struct opcode opcode_table[256] = {
 	I2bv(SrcSI | DstDI | Mov | String, em_mov),
 	I2bv(SrcSI | DstDI | String, em_cmp),
 	/* 0xA8 - 0xAF */
-	D2bv(DstAcc | SrcImm),
+	I2bv(DstAcc | SrcImm, em_test),
 	I2bv(SrcAcc | DstDI | Mov | String, em_mov),
 	I2bv(SrcSI | DstAcc | Mov | String, em_mov),
 	I2bv(SrcAcc | DstDI | String, em_cmp),
@@ -3873,10 +3882,6 @@ special_insn:
 		if (test_cc(c->b, ctxt->eflags))
 			jmp_rel(c, c->src.val);
 		break;
-	case 0x84 ... 0x85:
-	test:
-		emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
-		break;
 	case 0x86 ... 0x87:	/* xchg */
 	xchg:
 		/* Write back the register source. */
@@ -3932,8 +3937,6 @@ special_insn:
 		case 8: c->dst.val = (s32)c->dst.val; break;
 		}
 		break;
-	case 0xa8 ... 0xa9:	/* test ax, imm */
-		goto test;
 	case 0xc0 ... 0xc1:
 		rc = em_grp2(ctxt);
 		break;

From e4f973ae913028bac8c07187e0fd49c1dc08ce58 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Sun, 29 May 2011 21:59:09 +0900
Subject: [PATCH 054/143] KVM: x86 emulator: Use opcode::execute for
 XCHG(86/87)

In addition, replace one "goto xchg" with an em_xchg() call.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d25cfc238b90..c3d071dfe504 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2613,6 +2613,20 @@ static int em_test(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_xchg(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	/* Write back the register source. */
+	c->src.val = c->dst.val;
+	write_register_operand(&c->src);
+
+	/* Write back the memory destination with implicit LOCK prefix. */
+	c->dst.val = c->src.orig_val;
+	c->lock_prefix = 1;
+	return X86EMUL_CONTINUE;
+}
+
 static int em_imul(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
@@ -3144,7 +3158,7 @@ static struct opcode opcode_table[256] = {
 	G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
 	G(DstMem | SrcImmByte | ModRM | Group, group1),
 	I2bv(DstMem | SrcReg | ModRM, em_test),
-	D2bv(DstMem | SrcReg | ModRM | Lock),
+	I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg),
 	/* 0x88 - 0x8F */
 	I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
 	I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
@@ -3882,18 +3896,6 @@ special_insn:
 		if (test_cc(c->b, ctxt->eflags))
 			jmp_rel(c, c->src.val);
 		break;
-	case 0x86 ... 0x87:	/* xchg */
-	xchg:
-		/* Write back the register source. */
-		c->src.val = c->dst.val;
-		write_register_operand(&c->src);
-		/*
-		 * Write back the memory destination with implicit LOCK
-		 * prefix.
-		 */
-		c->dst.val = c->src.orig_val;
-		c->lock_prefix = 1;
-		break;
 	case 0x8c:  /* mov r/m, sreg */
 		if (c->modrm_reg > VCPU_SREG_GS) {
 			rc = emulate_ud(ctxt);
@@ -3929,7 +3931,8 @@ special_insn:
 	case 0x90 ... 0x97: /* nop / xchg reg, rax */
 		if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
 			break;
-		goto xchg;
+		rc = em_xchg(ctxt);
+		break;
 	case 0x98: /* cbw/cwde/cdqe */
 		switch (c->op_bytes) {
 		case 2: c->dst.val = (s8)c->dst.val; break;

From ebda02c2a5a6001c787f311b4d5a0dc827ce2d92 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Sun, 29 May 2011 22:00:22 +0900
Subject: [PATCH 055/143] KVM: x86 emulator: Use opcode::execute for RET(C3)

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c3d071dfe504..2ebec692d44b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1816,6 +1816,16 @@ static int em_grp9(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_ret(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	c->dst.type = OP_REG;
+	c->dst.addr.reg = &c->eip;
+	c->dst.bytes = c->op_bytes;
+	return em_pop(ctxt);
+}
+
 static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
@@ -3188,7 +3198,7 @@ static struct opcode opcode_table[256] = {
 	/* 0xC0 - 0xC7 */
 	D2bv(DstMem | SrcImmByte | ModRM),
 	I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
-	D(ImplicitOps | Stack),
+	I(ImplicitOps | Stack, em_ret),
 	D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
 	G(ByteOp, group11), G(0, group11),
 	/* 0xC8 - 0xCF */
@@ -3943,12 +3953,6 @@ special_insn:
 	case 0xc0 ... 0xc1:
 		rc = em_grp2(ctxt);
 		break;
-	case 0xc3: /* ret */
-		c->dst.type = OP_REG;
-		c->dst.addr.reg = &c->eip;
-		c->dst.bytes = c->op_bytes;
-		rc = em_pop(ctxt);
-		break;
 	case 0xc4:		/* les */
 		rc = emulate_load_segment(ctxt, VCPU_SREG_ES);
 		break;

From 1bd5f469b2d54330ba41d9c4b857dc5051e8dcf7 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Sun, 29 May 2011 22:01:33 +0900
Subject: [PATCH 056/143] KVM: x86 emulator: Use opcode::execute for MOV(8C/8E)

Use separate functions for the MOV variants that take segment register
operands.
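
For reference, the opcode table's I() macro is what attaches the handler as
the entry's execute callback, roughly along these lines (simplified; see the
macro definitions near the top of emulate.c for the exact form):

	#define I(_f, _e)	{ .flags = (_f), .u.execute = (_e) }

so 8C/8E now decode straight to em_mov_rm_sreg()/em_mov_sreg_rm().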

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 59 ++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2ebec692d44b..5561680c1e9c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2683,6 +2683,33 @@ static int em_mov(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	if (c->modrm_reg > VCPU_SREG_GS)
+		return emulate_ud(ctxt);
+
+	c->dst.val = get_segment_selector(ctxt, c->modrm_reg);
+	return X86EMUL_CONTINUE;
+}
+
+static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+	u16 sel = c->src.val;
+
+	if (c->modrm_reg == VCPU_SREG_CS || c->modrm_reg > VCPU_SREG_GS)
+		return emulate_ud(ctxt);
+
+	if (c->modrm_reg == VCPU_SREG_SS)
+		ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+
+	/* Disable writeback. */
+	c->dst.type = OP_NONE;
+	return load_segment_descriptor(ctxt, sel, c->modrm_reg);
+}
+
 static int em_movdqu(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
@@ -3172,8 +3199,10 @@ static struct opcode opcode_table[256] = {
 	/* 0x88 - 0x8F */
 	I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
 	I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
-	D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
-	D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
+	I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg),
+	D(ModRM | SrcMem | NoAccess | DstReg),
+	I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
+	G(0, group1A),
 	/* 0x90 - 0x97 */
 	DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
 	/* 0x98 - 0x9F */
@@ -3906,35 +3935,9 @@ special_insn:
 		if (test_cc(c->b, ctxt->eflags))
 			jmp_rel(c, c->src.val);
 		break;
-	case 0x8c:  /* mov r/m, sreg */
-		if (c->modrm_reg > VCPU_SREG_GS) {
-			rc = emulate_ud(ctxt);
-			goto done;
-		}
-		c->dst.val = get_segment_selector(ctxt, c->modrm_reg);
-		break;
 	case 0x8d: /* lea r16/r32, m */
 		c->dst.val = c->src.addr.mem.ea;
 		break;
-	case 0x8e: { /* mov seg, r/m16 */
-		uint16_t sel;
-
-		sel = c->src.val;
-
-		if (c->modrm_reg == VCPU_SREG_CS ||
-		    c->modrm_reg > VCPU_SREG_GS) {
-			rc = emulate_ud(ctxt);
-			goto done;
-		}
-
-		if (c->modrm_reg == VCPU_SREG_SS)
-			ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
-
-		rc = load_segment_descriptor(ctxt, sel, c->modrm_reg);
-
-		c->dst.type = OP_NONE;  /* Disable writeback. */
-		break;
-	}
 	case 0x8f:		/* pop (sole member of Grp1a) */
 		rc = em_grp1a(ctxt);
 		break;

From 5c5df76b8b32055956ee4cca338d29046016b13e Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Sun, 29 May 2011 22:02:55 +0900
Subject: [PATCH 057/143] KVM: x86 emulator: Clean up INT n/INTO/INT
 3(CC/CD/CE)

Call emulate_int() directly to avoid the spaghetti gotos.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5561680c1e9c..d7df7ba17b89 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3769,7 +3769,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	int saved_dst_type = c->dst.type;
-	int irq; /* Used for int 3, int, and into */
 
 	c->mem_read.pos = 0;
 
@@ -3963,18 +3962,14 @@ special_insn:
 		rc = emulate_load_segment(ctxt, VCPU_SREG_DS);
 		break;
 	case 0xcc:		/* int3 */
-		irq = 3;
-		goto do_interrupt;
+		rc = emulate_int(ctxt, 3);
+		break;
 	case 0xcd:		/* int n */
-		irq = c->src.val;
-	do_interrupt:
-		rc = emulate_int(ctxt, irq);
+		rc = emulate_int(ctxt, c->src.val);
 		break;
 	case 0xce:		/* into */
-		if (ctxt->eflags & EFLG_OF) {
-			irq = 4;
-			goto do_interrupt;
-		}
+		if (ctxt->eflags & EFLG_OF)
+			rc = emulate_int(ctxt, 4);
 		break;
 	case 0xd0 ... 0xd1:	/* Grp2 */
 		rc = em_grp2(ctxt);

From d06e03adcb30f9e9fff4df1d80a3087f54a62d9a Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Sun, 29 May 2011 22:04:08 +0900
Subject: [PATCH 058/143] KVM: x86 emulator: Use opcode::execute for LOOP/JCXZ

  LOOP/LOOPcc      : E0-E2
  JCXZ/JECXZ/JRCXZ : E3
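
A single em_loop() covers all three LOOP variants: LOOP (E2) only tests the
count register, while LOOPE (E1) and LOOPNE (E0) additionally test ZF, which
is what test_cc(c->b ^ 0x5, eflags) selects (E1^5 = E4, condition "ZF set";
E0^5 = E5, condition "ZF clear"), since test_cc() keys off the low Jcc
condition nibble. The relevant check, as in the hunk below:

	if ((address_mask(c, c->regs[VCPU_REGS_RCX]) != 0) &&
	    (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
		jmp_rel(c, c->src.val);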

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d7df7ba17b89..e9dbbc91ce8e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2824,6 +2824,28 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_loop(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+	if ((address_mask(c, c->regs[VCPU_REGS_RCX]) != 0) &&
+	    (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
+		jmp_rel(c, c->src.val);
+
+	return X86EMUL_CONTINUE;
+}
+
+static int em_jcxz(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
+		jmp_rel(c, c->src.val);
+
+	return X86EMUL_CONTINUE;
+}
+
 static bool valid_cr(int nr)
 {
 	switch (nr) {
@@ -3240,7 +3262,8 @@ static struct opcode opcode_table[256] = {
 	/* 0xD8 - 0xDF */
 	N, N, N, N, N, N, N, N,
 	/* 0xE0 - 0xE7 */
-	X4(D(SrcImmByte)),
+	X3(I(SrcImmByte, em_loop)),
+	I(SrcImmByte, em_jcxz),
 	D2bvIP(SrcImmUByte | DstAcc, in,  check_perm_in),
 	D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
 	/* 0xE8 - 0xEF */
@@ -3978,16 +4001,6 @@ special_insn:
 		c->src.val = c->regs[VCPU_REGS_RCX];
 		rc = em_grp2(ctxt);
 		break;
-	case 0xe0 ... 0xe2:	/* loop/loopz/loopnz */
-		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
-		if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
-		    (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
-			jmp_rel(c, c->src.val);
-		break;
-	case 0xe3:	/* jcxz/jecxz/jrcxz */
-		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
-			jmp_rel(c, c->src.val);
-		break;
 	case 0xe4: 	/* inb */
 	case 0xe5: 	/* in */
 		goto do_io_in;

From f411e6cdc275e63ead2ffb427d0497daae6f6069 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Sun, 29 May 2011 22:05:15 +0900
Subject: [PATCH 059/143] KVM: x86 emulator: Use opcode::execute for
 CLI/STI(FA/FB)

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e9dbbc91ce8e..663bdb3637aa 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2846,6 +2846,25 @@ static int em_jcxz(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_cli(struct x86_emulate_ctxt *ctxt)
+{
+	if (emulator_bad_iopl(ctxt))
+		return emulate_gp(ctxt, 0);
+
+	ctxt->eflags &= ~X86_EFLAGS_IF;
+	return X86EMUL_CONTINUE;
+}
+
+static int em_sti(struct x86_emulate_ctxt *ctxt)
+{
+	if (emulator_bad_iopl(ctxt))
+		return emulate_gp(ctxt, 0);
+
+	ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
+	ctxt->eflags |= X86_EFLAGS_IF;
+	return X86EMUL_CONTINUE;
+}
+
 static bool valid_cr(int nr)
 {
 	switch (nr) {
@@ -3276,7 +3295,8 @@ static struct opcode opcode_table[256] = {
 	DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
 	G(ByteOp, group3), G(0, group3),
 	/* 0xF8 - 0xFF */
-	D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
+	D(ImplicitOps), D(ImplicitOps),
+	I(ImplicitOps, em_cli), I(ImplicitOps, em_sti),
 	D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
 };
 
@@ -4049,22 +4069,6 @@ special_insn:
 	case 0xf9: /* stc */
 		ctxt->eflags |= EFLG_CF;
 		break;
-	case 0xfa: /* cli */
-		if (emulator_bad_iopl(ctxt)) {
-			rc = emulate_gp(ctxt, 0);
-			goto done;
-		} else
-			ctxt->eflags &= ~X86_EFLAGS_IF;
-		break;
-	case 0xfb: /* sti */
-		if (emulator_bad_iopl(ctxt)) {
-			rc = emulate_gp(ctxt, 0);
-			goto done;
-		} else {
-			ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
-			ctxt->eflags |= X86_EFLAGS_IF;
-		}
-		break;
 	case 0xfc: /* cld */
 		ctxt->eflags &= ~EFLG_DF;
 		break;

From 2e4ce7f574369f374ad537a180b4870e2098cf0e Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 1 Jun 2011 12:57:30 +0200
Subject: [PATCH 060/143] KVM: VMX: Silence warning on 32-bit hosts

The variable 'a' is now unused on CONFIG_X86_32.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 54e732d74fb1..58badf1657d4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3602,7 +3602,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
  */
 static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 {
+#ifdef CONFIG_X86_64
 	unsigned long a;
+#endif
 	int i;
 
 	/* I/O */

From 36dd9bb5ce32bc39e25a5fcc61415f13e3ed5d17 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 1 Jun 2011 15:34:24 +0300
Subject: [PATCH 061/143] KVM: x86 emulator: rename decode_cache::eip to _eip

The name eip conflicts with a field of the same name in x86_emulate_ctxt,
which we plan to fold decode_cache into.

The name _eip is unfortunate, but what's really needed is a refactoring
here, not a better name.
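
To make the conflict concrete, after the planned fold both fields would land
in the same structure (abridged sketch; field lists trimmed):

	struct x86_emulate_ctxt {
		...
		unsigned long eip;	/* eip before instruction emulation */
		...
		/* decode cache, to be folded in: */
		unsigned long _eip;	/* eip as advanced during decode/emulation */
		...
	};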

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |   2 +-
 arch/x86/kvm/emulate.c             | 120 ++++++++++++++---------------
 arch/x86/kvm/trace.h               |   2 +-
 arch/x86/kvm/x86.c                 |   6 +-
 4 files changed, 65 insertions(+), 65 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index c0f77e09ebce..d0e100f55b76 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -253,7 +253,7 @@ struct decode_cache {
 	u8 modrm_rm;
 	u8 modrm_seg;
 	bool rip_relative;
-	unsigned long eip;
+	unsigned long _eip;
 	/* Fields above regs are cleared together. */
 	unsigned long regs[NR_VCPU_REGS];
 	struct fetch_cache fetch;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 663bdb3637aa..a1b9705e3cc4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -459,7 +459,7 @@ register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
 
 static inline void jmp_rel(struct decode_cache *c, int rel)
 {
-	register_address_increment(c, &c->eip, rel);
+	register_address_increment(c, &c->_eip, rel);
 }
 
 static u32 desc_limit_scaled(struct desc_struct *desc)
@@ -898,7 +898,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 		c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */
 	}
 
-	c->modrm = insn_fetch(u8, 1, c->eip);
+	c->modrm = insn_fetch(u8, 1, c->_eip);
 	c->modrm_mod |= (c->modrm & 0xc0) >> 6;
 	c->modrm_reg |= (c->modrm & 0x38) >> 3;
 	c->modrm_rm |= (c->modrm & 0x07);
@@ -932,13 +932,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 		switch (c->modrm_mod) {
 		case 0:
 			if (c->modrm_rm == 6)
-				modrm_ea += insn_fetch(u16, 2, c->eip);
+				modrm_ea += insn_fetch(u16, 2, c->_eip);
 			break;
 		case 1:
-			modrm_ea += insn_fetch(s8, 1, c->eip);
+			modrm_ea += insn_fetch(s8, 1, c->_eip);
 			break;
 		case 2:
-			modrm_ea += insn_fetch(u16, 2, c->eip);
+			modrm_ea += insn_fetch(u16, 2, c->_eip);
 			break;
 		}
 		switch (c->modrm_rm) {
@@ -975,13 +975,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 	} else {
 		/* 32/64-bit ModR/M decode. */
 		if ((c->modrm_rm & 7) == 4) {
-			sib = insn_fetch(u8, 1, c->eip);
+			sib = insn_fetch(u8, 1, c->_eip);
 			index_reg |= (sib >> 3) & 7;
 			base_reg |= sib & 7;
 			scale = sib >> 6;
 
 			if ((base_reg & 7) == 5 && c->modrm_mod == 0)
-				modrm_ea += insn_fetch(s32, 4, c->eip);
+				modrm_ea += insn_fetch(s32, 4, c->_eip);
 			else
 				modrm_ea += c->regs[base_reg];
 			if (index_reg != 4)
@@ -994,13 +994,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 		switch (c->modrm_mod) {
 		case 0:
 			if (c->modrm_rm == 5)
-				modrm_ea += insn_fetch(s32, 4, c->eip);
+				modrm_ea += insn_fetch(s32, 4, c->_eip);
 			break;
 		case 1:
-			modrm_ea += insn_fetch(s8, 1, c->eip);
+			modrm_ea += insn_fetch(s8, 1, c->_eip);
 			break;
 		case 2:
-			modrm_ea += insn_fetch(s32, 4, c->eip);
+			modrm_ea += insn_fetch(s32, 4, c->_eip);
 			break;
 		}
 	}
@@ -1018,13 +1018,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
 	op->type = OP_MEM;
 	switch (c->ad_bytes) {
 	case 2:
-		op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
+		op->addr.mem.ea = insn_fetch(u16, 2, c->_eip);
 		break;
 	case 4:
-		op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
+		op->addr.mem.ea = insn_fetch(u32, 4, c->_eip);
 		break;
 	case 8:
-		op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
+		op->addr.mem.ea = insn_fetch(u64, 8, c->_eip);
 		break;
 	}
 done:
@@ -1561,7 +1561,7 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->src.val = c->eip;
+	c->src.val = c->_eip;
 	rc = em_push(ctxt);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -1583,7 +1583,7 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->eip = eip;
+	c->_eip = eip;
 
 	return rc;
 }
@@ -1640,7 +1640,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->eip = temp_eip;
+	c->_eip = temp_eip;
 
 
 	if (c->op_bytes == 4)
@@ -1683,8 +1683,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->eip = 0;
-	memcpy(&c->eip, c->src.valptr, c->op_bytes);
+	c->_eip = 0;
+	memcpy(&c->_eip, c->src.valptr, c->op_bytes);
 	return X86EMUL_CONTINUE;
 }
 
@@ -1778,14 +1778,14 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
 		break;
 	case 2: /* call near abs */ {
 		long int old_eip;
-		old_eip = c->eip;
-		c->eip = c->src.val;
+		old_eip = c->_eip;
+		c->_eip = c->src.val;
 		c->src.val = old_eip;
 		rc = em_push(ctxt);
 		break;
 	}
 	case 4: /* jmp abs */
-		c->eip = c->src.val;
+		c->_eip = c->src.val;
 		break;
 	case 5: /* jmp far */
 		rc = em_jmp_far(ctxt);
@@ -1821,7 +1821,7 @@ static int em_ret(struct x86_emulate_ctxt *ctxt)
 	struct decode_cache *c = &ctxt->decode;
 
 	c->dst.type = OP_REG;
-	c->dst.addr.reg = &c->eip;
+	c->dst.addr.reg = &c->_eip;
 	c->dst.bytes = c->op_bytes;
 	return em_pop(ctxt);
 }
@@ -1832,11 +1832,11 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 	int rc;
 	unsigned long cs;
 
-	rc = emulate_pop(ctxt, &c->eip, c->op_bytes);
+	rc = emulate_pop(ctxt, &c->_eip, c->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	if (c->op_bytes == 4)
-		c->eip = (u32)c->eip;
+		c->_eip = (u32)c->_eip;
 	rc = emulate_pop(ctxt, &cs, c->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -1919,7 +1919,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
 	ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
 	ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
-	c->regs[VCPU_REGS_RCX] = c->eip;
+	c->regs[VCPU_REGS_RCX] = c->_eip;
 	if (efer & EFER_LMA) {
 #ifdef CONFIG_X86_64
 		c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
@@ -1927,7 +1927,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
 		ops->get_msr(ctxt,
 			     ctxt->mode == X86EMUL_MODE_PROT64 ?
 			     MSR_LSTAR : MSR_CSTAR, &msr_data);
-		c->eip = msr_data;
+		c->_eip = msr_data;
 
 		ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
 		ctxt->eflags &= ~(msr_data | EFLG_RF);
@@ -1935,7 +1935,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
 	} else {
 		/* legacy mode */
 		ops->get_msr(ctxt, MSR_STAR, &msr_data);
-		c->eip = (u32)msr_data;
+		c->_eip = (u32)msr_data;
 
 		ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
 	}
@@ -1991,7 +1991,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 	ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
 	ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
-	c->eip = msr_data;
+	c->_eip = msr_data;
 
 	ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
 	c->regs[VCPU_REGS_RSP] = msr_data;
@@ -2045,7 +2045,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
 	ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
 	ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
-	c->eip = c->regs[VCPU_REGS_RDX];
+	c->_eip = c->regs[VCPU_REGS_RDX];
 	c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
 
 	return X86EMUL_CONTINUE;
@@ -2115,7 +2115,7 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
 {
 	struct decode_cache *c = &ctxt->decode;
 
-	tss->ip = c->eip;
+	tss->ip = c->_eip;
 	tss->flag = ctxt->eflags;
 	tss->ax = c->regs[VCPU_REGS_RAX];
 	tss->cx = c->regs[VCPU_REGS_RCX];
@@ -2139,7 +2139,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	int ret;
 
-	c->eip = tss->ip;
+	c->_eip = tss->ip;
 	ctxt->eflags = tss->flag | 2;
 	c->regs[VCPU_REGS_RAX] = tss->ax;
 	c->regs[VCPU_REGS_RCX] = tss->cx;
@@ -2233,7 +2233,7 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 
 	tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
-	tss->eip = c->eip;
+	tss->eip = c->_eip;
 	tss->eflags = ctxt->eflags;
 	tss->eax = c->regs[VCPU_REGS_RAX];
 	tss->ecx = c->regs[VCPU_REGS_RCX];
@@ -2261,7 +2261,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 
 	if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
 		return emulate_gp(ctxt, 0);
-	c->eip = tss->eip;
+	c->_eip = tss->eip;
 	ctxt->eflags = tss->eflags | 2;
 	c->regs[VCPU_REGS_RAX] = tss->eax;
 	c->regs[VCPU_REGS_RCX] = tss->ecx;
@@ -2446,14 +2446,14 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
-	c->eip = ctxt->eip;
+	c->_eip = ctxt->eip;
 	c->dst.type = OP_NONE;
 
 	rc = emulator_do_task_switch(ctxt, tss_selector, reason,
 				     has_error_code, error_code);
 
 	if (rc == X86EMUL_CONTINUE)
-		ctxt->eip = c->eip;
+		ctxt->eip = c->_eip;
 
 	return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 }
@@ -2516,14 +2516,14 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
 	int rc;
 
 	old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
-	old_eip = c->eip;
+	old_eip = c->_eip;
 
 	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
 	if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS))
 		return X86EMUL_CONTINUE;
 
-	c->eip = 0;
-	memcpy(&c->eip, c->src.valptr, c->op_bytes);
+	c->_eip = 0;
+	memcpy(&c->_eip, c->src.valptr, c->op_bytes);
 
 	c->src.val = old_cs;
 	rc = em_push(ctxt);
@@ -2540,7 +2540,7 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
 	int rc;
 
 	c->dst.type = OP_REG;
-	c->dst.addr.reg = &c->eip;
+	c->dst.addr.reg = &c->_eip;
 	c->dst.bytes = c->op_bytes;
 	rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
@@ -2754,7 +2754,7 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)
 		return rc;
 
 	/* Let the processor re-execute the fixed hypercall */
-	c->eip = ctxt->eip;
+	c->_eip = ctxt->eip;
 	/* Disable writeback. */
 	c->dst.type = OP_NONE;
 	return X86EMUL_CONTINUE;
@@ -3408,17 +3408,17 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
 
 	op->type = OP_IMM;
 	op->bytes = size;
-	op->addr.mem.ea = c->eip;
+	op->addr.mem.ea = c->_eip;
 	/* NB. Immediates are sign-extended as necessary. */
 	switch (op->bytes) {
 	case 1:
-		op->val = insn_fetch(s8, 1, c->eip);
+		op->val = insn_fetch(s8, 1, c->_eip);
 		break;
 	case 2:
-		op->val = insn_fetch(s16, 2, c->eip);
+		op->val = insn_fetch(s16, 2, c->_eip);
 		break;
 	case 4:
-		op->val = insn_fetch(s32, 4, c->eip);
+		op->val = insn_fetch(s32, 4, c->_eip);
 		break;
 	}
 	if (!sign_extension) {
@@ -3448,8 +3448,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 	struct opcode opcode;
 	struct operand memop = { .type = OP_NONE }, *memopp = NULL;
 
-	c->eip = ctxt->eip;
-	c->fetch.start = c->eip;
+	c->_eip = ctxt->eip;
+	c->fetch.start = c->_eip;
 	c->fetch.end = c->fetch.start + insn_len;
 	if (insn_len > 0)
 		memcpy(c->fetch.data, insn, insn_len);
@@ -3478,7 +3478,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 
 	/* Legacy prefixes. */
 	for (;;) {
-		switch (c->b = insn_fetch(u8, 1, c->eip)) {
+		switch (c->b = insn_fetch(u8, 1, c->_eip)) {
 		case 0x66:	/* operand-size override */
 			op_prefix = true;
 			/* switch between 2/4 bytes */
@@ -3534,7 +3534,7 @@ done_prefixes:
 	/* Two-byte opcode? */
 	if (c->b == 0x0f) {
 		c->twobyte = 1;
-		c->b = insn_fetch(u8, 1, c->eip);
+		c->b = insn_fetch(u8, 1, c->_eip);
 		opcode = twobyte_table[c->b];
 	}
 	c->d = opcode.flags;
@@ -3542,14 +3542,14 @@ done_prefixes:
 	while (c->d & GroupMask) {
 		switch (c->d & GroupMask) {
 		case Group:
-			c->modrm = insn_fetch(u8, 1, c->eip);
-			--c->eip;
+			c->modrm = insn_fetch(u8, 1, c->_eip);
+			--c->_eip;
 			goffset = (c->modrm >> 3) & 7;
 			opcode = opcode.u.group[goffset];
 			break;
 		case GroupDual:
-			c->modrm = insn_fetch(u8, 1, c->eip);
-			--c->eip;
+			c->modrm = insn_fetch(u8, 1, c->_eip);
+			--c->_eip;
 			goffset = (c->modrm >> 3) & 7;
 			if ((c->modrm >> 6) == 3)
 				opcode = opcode.u.gdual->mod3[goffset];
@@ -3679,9 +3679,9 @@ done_prefixes:
 		break;
 	case SrcImmFAddr:
 		c->src.type = OP_IMM;
-		c->src.addr.mem.ea = c->eip;
+		c->src.addr.mem.ea = c->_eip;
 		c->src.bytes = c->op_bytes + 2;
-		insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
+		insn_fetch_arr(c->src.valptr, c->src.bytes, c->_eip);
 		break;
 	case SrcMemFAddr:
 		memop.bytes = c->op_bytes + 2;
@@ -3732,9 +3732,9 @@ done_prefixes:
 		break;
 	case DstImmUByte:
 		c->dst.type = OP_IMM;
-		c->dst.addr.mem.ea = c->eip;
+		c->dst.addr.mem.ea = c->_eip;
 		c->dst.bytes = 1;
-		c->dst.val = insn_fetch(u8, 1, c->eip);
+		c->dst.val = insn_fetch(u8, 1, c->_eip);
 		break;
 	case DstMem:
 	case DstMem64:
@@ -3778,7 +3778,7 @@ done_prefixes:
 
 done:
 	if (memopp && memopp->type == OP_MEM && c->rip_relative)
-		memopp->addr.mem.ea += c->eip;
+		memopp->addr.mem.ea += c->_eip;
 
 	return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 }
@@ -3879,7 +3879,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 	if (c->rep_prefix && (c->d & String)) {
 		/* All REP prefixes have the same first termination condition */
 		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
-			ctxt->eip = c->eip;
+			ctxt->eip = c->_eip;
 			goto done;
 		}
 	}
@@ -4029,7 +4029,7 @@ special_insn:
 		goto do_io_out;
 	case 0xe8: /* call (near) */ {
 		long int rel = c->src.val;
-		c->src.val = (unsigned long) c->eip;
+		c->src.val = (unsigned long) c->_eip;
 		jmp_rel(c, rel);
 		rc = em_push(ctxt);
 		break;
@@ -4130,7 +4130,7 @@ writeback:
 		}
 	}
 
-	ctxt->eip = c->eip;
+	ctxt->eip = c->_eip;
 
 done:
 	if (rc == X86EMUL_PROPAGATE_FAULT)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index db932760ea82..d69e758d00ac 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -677,7 +677,7 @@ TRACE_EVENT(kvm_emulate_insn,
 	TP_fast_assign(
 		__entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
 		__entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
-		__entry->len = vcpu->arch.emulate_ctxt.decode.eip
+		__entry->len = vcpu->arch.emulate_ctxt.decode._eip
 			       - vcpu->arch.emulate_ctxt.decode.fetch.start;
 		memcpy(__entry->insn,
 		       vcpu->arch.emulate_ctxt.decode.fetch.data,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 39d8b043580f..7e452fe31e40 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4560,13 +4560,13 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 
 	c->op_bytes = 2;
 	c->ad_bytes = 2;
-	c->eip = ctxt->eip + inc_eip;
+	c->_eip = ctxt->eip + inc_eip;
 	ret = emulate_int_real(ctxt, irq);
 
 	if (ret != X86EMUL_CONTINUE)
 		return EMULATE_FAIL;
 
-	ctxt->eip = c->eip;
+	ctxt->eip = c->_eip;
 	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(vcpu, ctxt->eip);
 	kvm_set_rflags(vcpu, ctxt->eflags);
@@ -4661,7 +4661,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	}
 
 	if (emulation_type & EMULTYPE_SKIP) {
-		kvm_rip_write(vcpu, c->eip);
+		kvm_rip_write(vcpu, c->_eip);
 		return EMULATE_DONE;
 	}
 

From 9dac77fa4011bdb4b541a8db087eac96a602faec Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 1 Jun 2011 15:34:25 +0300
Subject: [PATCH 062/143] KVM: x86 emulator: fold decode_cache into
 x86_emulate_ctxt

This saves a lot of pointless casts between x86_emulate_ctxt and decode_cache.
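
The typical pattern this removes, in before/after form (abridged from the
hunks below):

	/* before */
	struct decode_cache *c = &ctxt->decode;
	c->dst.val = c->src.val;

	/* after */
	ctxt->dst.val = ctxt->src.val;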

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |   44 +-
 arch/x86/kvm/emulate.c             | 1279 +++++++++++++---------------
 arch/x86/kvm/trace.h               |    8 +-
 arch/x86/kvm/x86.c                 |   47 +-
 4 files changed, 632 insertions(+), 746 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index d0e100f55b76..6040d115ef51 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -229,7 +229,26 @@ struct read_cache {
 	unsigned long end;
 };
 
-struct decode_cache {
+struct x86_emulate_ctxt {
+	struct x86_emulate_ops *ops;
+
+	/* Register state before/after emulation. */
+	unsigned long eflags;
+	unsigned long eip; /* eip before instruction emulation */
+	/* Emulated execution mode, represented by an X86EMUL_MODE value. */
+	int mode;
+
+	/* interruptibility state, as a result of execution of STI or MOV SS */
+	int interruptibility;
+
+	bool guest_mode; /* guest running a nested guest */
+	bool perm_ok; /* do not check permissions if true */
+	bool only_vendor_specific_insn;
+
+	bool have_exception;
+	struct x86_exception exception;
+
+	/* decode cache */
 	u8 twobyte;
 	u8 b;
 	u8 intercept;
@@ -261,29 +280,6 @@ struct decode_cache {
 	struct read_cache mem_read;
 };
 
-struct x86_emulate_ctxt {
-	struct x86_emulate_ops *ops;
-
-	/* Register state before/after emulation. */
-	unsigned long eflags;
-	unsigned long eip; /* eip before instruction emulation */
-	/* Emulated execution mode, represented by an X86EMUL_MODE value. */
-	int mode;
-
-	/* interruptibility state, as a result of execution of STI or MOV SS */
-	int interruptibility;
-
-	bool guest_mode; /* guest running a nested guest */
-	bool perm_ok; /* do not check permissions if true */
-	bool only_vendor_specific_insn;
-
-	bool have_exception;
-	struct x86_exception exception;
-
-	/* decode cache */
-	struct decode_cache decode;
-};
-
 /* Repeat String Operation Prefix */
 #define REPE_PREFIX	0xf3
 #define REPNE_PREFIX	0xf2
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a1b9705e3cc4..6f08bc940fa8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -413,53 +413,53 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
 {
 	struct x86_instruction_info info = {
 		.intercept  = intercept,
-		.rep_prefix = ctxt->decode.rep_prefix,
-		.modrm_mod  = ctxt->decode.modrm_mod,
-		.modrm_reg  = ctxt->decode.modrm_reg,
-		.modrm_rm   = ctxt->decode.modrm_rm,
-		.src_val    = ctxt->decode.src.val64,
-		.src_bytes  = ctxt->decode.src.bytes,
-		.dst_bytes  = ctxt->decode.dst.bytes,
-		.ad_bytes   = ctxt->decode.ad_bytes,
+		.rep_prefix = ctxt->rep_prefix,
+		.modrm_mod  = ctxt->modrm_mod,
+		.modrm_reg  = ctxt->modrm_reg,
+		.modrm_rm   = ctxt->modrm_rm,
+		.src_val    = ctxt->src.val64,
+		.src_bytes  = ctxt->src.bytes,
+		.dst_bytes  = ctxt->dst.bytes,
+		.ad_bytes   = ctxt->ad_bytes,
 		.next_rip   = ctxt->eip,
 	};
 
 	return ctxt->ops->intercept(ctxt, &info, stage);
 }
 
-static inline unsigned long ad_mask(struct decode_cache *c)
+static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
 {
-	return (1UL << (c->ad_bytes << 3)) - 1;
+	return (1UL << (ctxt->ad_bytes << 3)) - 1;
 }
 
 /* Access/update address held in a register, based on addressing mode. */
 static inline unsigned long
-address_mask(struct decode_cache *c, unsigned long reg)
+address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
 {
-	if (c->ad_bytes == sizeof(unsigned long))
+	if (ctxt->ad_bytes == sizeof(unsigned long))
 		return reg;
 	else
-		return reg & ad_mask(c);
+		return reg & ad_mask(ctxt);
 }
 
 static inline unsigned long
-register_address(struct decode_cache *c, unsigned long reg)
+register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
 {
-	return address_mask(c, reg);
+	return address_mask(ctxt, reg);
 }
 
 static inline void
-register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
+register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
 {
-	if (c->ad_bytes == sizeof(unsigned long))
+	if (ctxt->ad_bytes == sizeof(unsigned long))
 		*reg += inc;
 	else
-		*reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
+		*reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt));
 }
 
-static inline void jmp_rel(struct decode_cache *c, int rel)
+static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
 {
-	register_address_increment(c, &c->_eip, rel);
+	register_address_increment(ctxt, &ctxt->_eip, rel);
 }
 
 static u32 desc_limit_scaled(struct desc_struct *desc)
@@ -469,10 +469,10 @@ static u32 desc_limit_scaled(struct desc_struct *desc)
 	return desc->g ? (limit << 12) | 0xfff : limit;
 }
 
-static void set_seg_override(struct decode_cache *c, int seg)
+static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg)
 {
-	c->has_seg_override = true;
-	c->seg_override = seg;
+	ctxt->has_seg_override = true;
+	ctxt->seg_override = seg;
 }
 
 static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
@@ -483,13 +483,12 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
 	return ctxt->ops->get_cached_segment_base(ctxt, seg);
 }
 
-static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
-			     struct decode_cache *c)
+static unsigned seg_override(struct x86_emulate_ctxt *ctxt)
 {
-	if (!c->has_seg_override)
+	if (!ctxt->has_seg_override)
 		return 0;
 
-	return c->seg_override;
+	return ctxt->seg_override;
 }
 
 static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
@@ -561,7 +560,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 		     unsigned size, bool write, bool fetch,
 		     ulong *linear)
 {
-	struct decode_cache *c = &ctxt->decode;
 	struct desc_struct desc;
 	bool usable;
 	ulong la;
@@ -619,7 +617,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 		}
 		break;
 	}
-	if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8)
+	if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
 		la &= (u32)-1;
 	*linear = la;
 	return X86EMUL_CONTINUE;
@@ -656,7 +654,7 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
 static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
 			      unsigned long eip, u8 *dest)
 {
-	struct fetch_cache *fc = &ctxt->decode.fetch;
+	struct fetch_cache *fc = &ctxt->fetch;
 	int rc;
 	int size, cur_size;
 
@@ -854,16 +852,15 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 				    struct operand *op,
-				    struct decode_cache *c,
 				    int inhibit_bytereg)
 {
-	unsigned reg = c->modrm_reg;
-	int highbyte_regs = c->rex_prefix == 0;
+	unsigned reg = ctxt->modrm_reg;
+	int highbyte_regs = ctxt->rex_prefix == 0;
 
-	if (!(c->d & ModRM))
-		reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+	if (!(ctxt->d & ModRM))
+		reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
 
-	if (c->d & Sse) {
+	if (ctxt->d & Sse) {
 		op->type = OP_XMM;
 		op->bytes = 16;
 		op->addr.xmm = reg;
@@ -872,12 +869,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 	}
 
 	op->type = OP_REG;
-	if ((c->d & ByteOp) && !inhibit_bytereg) {
-		op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
+	if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
+		op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
 		op->bytes = 1;
 	} else {
-		op->addr.reg = decode_register(reg, c->regs, 0);
-		op->bytes = c->op_bytes;
+		op->addr.reg = decode_register(reg, ctxt->regs, 0);
+		op->bytes = ctxt->op_bytes;
 	}
 	fetch_register_operand(op);
 	op->orig_val = op->val;
@@ -886,34 +883,33 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 			struct operand *op)
 {
-	struct decode_cache *c = &ctxt->decode;
 	u8 sib;
 	int index_reg = 0, base_reg = 0, scale;
 	int rc = X86EMUL_CONTINUE;
 	ulong modrm_ea = 0;
 
-	if (c->rex_prefix) {
-		c->modrm_reg = (c->rex_prefix & 4) << 1;	/* REX.R */
-		index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
-		c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */
+	if (ctxt->rex_prefix) {
+		ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1;	/* REX.R */
+		index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */
+		ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
 	}
 
-	c->modrm = insn_fetch(u8, 1, c->_eip);
-	c->modrm_mod |= (c->modrm & 0xc0) >> 6;
-	c->modrm_reg |= (c->modrm & 0x38) >> 3;
-	c->modrm_rm |= (c->modrm & 0x07);
-	c->modrm_seg = VCPU_SREG_DS;
+	ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+	ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
+	ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
+	ctxt->modrm_rm |= (ctxt->modrm & 0x07);
+	ctxt->modrm_seg = VCPU_SREG_DS;
 
-	if (c->modrm_mod == 3) {
+	if (ctxt->modrm_mod == 3) {
 		op->type = OP_REG;
-		op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		op->addr.reg = decode_register(c->modrm_rm,
-					       c->regs, c->d & ByteOp);
-		if (c->d & Sse) {
+		op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+		op->addr.reg = decode_register(ctxt->modrm_rm,
+					       ctxt->regs, ctxt->d & ByteOp);
+		if (ctxt->d & Sse) {
 			op->type = OP_XMM;
 			op->bytes = 16;
-			op->addr.xmm = c->modrm_rm;
-			read_sse_reg(ctxt, &op->vec_val, c->modrm_rm);
+			op->addr.xmm = ctxt->modrm_rm;
+			read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
 			return rc;
 		}
 		fetch_register_operand(op);
@@ -922,26 +918,26 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 
 	op->type = OP_MEM;
 
-	if (c->ad_bytes == 2) {
-		unsigned bx = c->regs[VCPU_REGS_RBX];
-		unsigned bp = c->regs[VCPU_REGS_RBP];
-		unsigned si = c->regs[VCPU_REGS_RSI];
-		unsigned di = c->regs[VCPU_REGS_RDI];
+	if (ctxt->ad_bytes == 2) {
+		unsigned bx = ctxt->regs[VCPU_REGS_RBX];
+		unsigned bp = ctxt->regs[VCPU_REGS_RBP];
+		unsigned si = ctxt->regs[VCPU_REGS_RSI];
+		unsigned di = ctxt->regs[VCPU_REGS_RDI];
 
 		/* 16-bit ModR/M decode. */
-		switch (c->modrm_mod) {
+		switch (ctxt->modrm_mod) {
 		case 0:
-			if (c->modrm_rm == 6)
-				modrm_ea += insn_fetch(u16, 2, c->_eip);
+			if (ctxt->modrm_rm == 6)
+				modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
 			break;
 		case 1:
-			modrm_ea += insn_fetch(s8, 1, c->_eip);
+			modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
 			break;
 		case 2:
-			modrm_ea += insn_fetch(u16, 2, c->_eip);
+			modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
 			break;
 		}
-		switch (c->modrm_rm) {
+		switch (ctxt->modrm_rm) {
 		case 0:
 			modrm_ea += bx + si;
 			break;
@@ -961,46 +957,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 			modrm_ea += di;
 			break;
 		case 6:
-			if (c->modrm_mod != 0)
+			if (ctxt->modrm_mod != 0)
 				modrm_ea += bp;
 			break;
 		case 7:
 			modrm_ea += bx;
 			break;
 		}
-		if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
-		    (c->modrm_rm == 6 && c->modrm_mod != 0))
-			c->modrm_seg = VCPU_SREG_SS;
+		if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 ||
+		    (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0))
+			ctxt->modrm_seg = VCPU_SREG_SS;
 		modrm_ea = (u16)modrm_ea;
 	} else {
 		/* 32/64-bit ModR/M decode. */
-		if ((c->modrm_rm & 7) == 4) {
-			sib = insn_fetch(u8, 1, c->_eip);
+		if ((ctxt->modrm_rm & 7) == 4) {
+			sib = insn_fetch(u8, 1, ctxt->_eip);
 			index_reg |= (sib >> 3) & 7;
 			base_reg |= sib & 7;
 			scale = sib >> 6;
 
-			if ((base_reg & 7) == 5 && c->modrm_mod == 0)
-				modrm_ea += insn_fetch(s32, 4, c->_eip);
+			if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
+				modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
 			else
-				modrm_ea += c->regs[base_reg];
+				modrm_ea += ctxt->regs[base_reg];
 			if (index_reg != 4)
-				modrm_ea += c->regs[index_reg] << scale;
-		} else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
+				modrm_ea += ctxt->regs[index_reg] << scale;
+		} else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
 			if (ctxt->mode == X86EMUL_MODE_PROT64)
-				c->rip_relative = 1;
+				ctxt->rip_relative = 1;
 		} else
-			modrm_ea += c->regs[c->modrm_rm];
-		switch (c->modrm_mod) {
+			modrm_ea += ctxt->regs[ctxt->modrm_rm];
+		switch (ctxt->modrm_mod) {
 		case 0:
-			if (c->modrm_rm == 5)
-				modrm_ea += insn_fetch(s32, 4, c->_eip);
+			if (ctxt->modrm_rm == 5)
+				modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
 			break;
 		case 1:
-			modrm_ea += insn_fetch(s8, 1, c->_eip);
+			modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
 			break;
 		case 2:
-			modrm_ea += insn_fetch(s32, 4, c->_eip);
+			modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
 			break;
 		}
 	}
@@ -1012,49 +1008,48 @@ done:
 static int decode_abs(struct x86_emulate_ctxt *ctxt,
 		      struct operand *op)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 
 	op->type = OP_MEM;
-	switch (c->ad_bytes) {
+	switch (ctxt->ad_bytes) {
 	case 2:
-		op->addr.mem.ea = insn_fetch(u16, 2, c->_eip);
+		op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip);
 		break;
 	case 4:
-		op->addr.mem.ea = insn_fetch(u32, 4, c->_eip);
+		op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip);
 		break;
 	case 8:
-		op->addr.mem.ea = insn_fetch(u64, 8, c->_eip);
+		op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip);
 		break;
 	}
 done:
 	return rc;
 }
 
-static void fetch_bit_operand(struct decode_cache *c)
+static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
 {
 	long sv = 0, mask;
 
-	if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
-		mask = ~(c->dst.bytes * 8 - 1);
+	if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
+		mask = ~(ctxt->dst.bytes * 8 - 1);
 
-		if (c->src.bytes == 2)
-			sv = (s16)c->src.val & (s16)mask;
-		else if (c->src.bytes == 4)
-			sv = (s32)c->src.val & (s32)mask;
+		if (ctxt->src.bytes == 2)
+			sv = (s16)ctxt->src.val & (s16)mask;
+		else if (ctxt->src.bytes == 4)
+			sv = (s32)ctxt->src.val & (s32)mask;
 
-		c->dst.addr.mem.ea += (sv >> 3);
+		ctxt->dst.addr.mem.ea += (sv >> 3);
 	}
 
 	/* only subword offset */
-	c->src.val &= (c->dst.bytes << 3) - 1;
+	ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
 }
 
 static int read_emulated(struct x86_emulate_ctxt *ctxt,
 			 unsigned long addr, void *dest, unsigned size)
 {
 	int rc;
-	struct read_cache *mc = &ctxt->decode.mem_read;
+	struct read_cache *mc = &ctxt->mem_read;
 
 	while (size) {
 		int n = min(size, 8u);
@@ -1125,16 +1120,15 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 			   unsigned int size, unsigned short port,
 			   void *dest)
 {
-	struct read_cache *rc = &ctxt->decode.io_read;
+	struct read_cache *rc = &ctxt->io_read;
 
 	if (rc->pos == rc->end) { /* refill pio read ahead */
-		struct decode_cache *c = &ctxt->decode;
 		unsigned int in_page, n;
-		unsigned int count = c->rep_prefix ?
-			address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
+		unsigned int count = ctxt->rep_prefix ?
+			address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1;
 		in_page = (ctxt->eflags & EFLG_DF) ?
-			offset_in_page(c->regs[VCPU_REGS_RDI]) :
-			PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
+			offset_in_page(ctxt->regs[VCPU_REGS_RDI]) :
+			PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]);
 		n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
 			count);
 		if (n == 0)
@@ -1344,29 +1338,28 @@ static void write_register_operand(struct operand *op)
 static int writeback(struct x86_emulate_ctxt *ctxt)
 {
 	int rc;
-	struct decode_cache *c = &ctxt->decode;
 
-	switch (c->dst.type) {
+	switch (ctxt->dst.type) {
 	case OP_REG:
-		write_register_operand(&c->dst);
+		write_register_operand(&ctxt->dst);
 		break;
 	case OP_MEM:
-		if (c->lock_prefix)
+		if (ctxt->lock_prefix)
 			rc = segmented_cmpxchg(ctxt,
-					       c->dst.addr.mem,
-					       &c->dst.orig_val,
-					       &c->dst.val,
-					       c->dst.bytes);
+					       ctxt->dst.addr.mem,
+					       &ctxt->dst.orig_val,
+					       &ctxt->dst.val,
+					       ctxt->dst.bytes);
 		else
 			rc = segmented_write(ctxt,
-					     c->dst.addr.mem,
-					     &c->dst.val,
-					     c->dst.bytes);
+					     ctxt->dst.addr.mem,
+					     &ctxt->dst.val,
+					     ctxt->dst.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
 	case OP_XMM:
-		write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm);
+		write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
 		break;
 	case OP_NONE:
 		/* no writeback */
@@ -1379,40 +1372,36 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
 
 static int em_push(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	struct segmented_address addr;
 
-	register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
-	addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes);
+	addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
 	addr.seg = VCPU_SREG_SS;
 
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
-	return segmented_write(ctxt, addr, &c->src.val, c->op_bytes);
+	ctxt->dst.type = OP_NONE;
+	return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes);
 }
 
 static int emulate_pop(struct x86_emulate_ctxt *ctxt,
 		       void *dest, int len)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 	struct segmented_address addr;
 
-	addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+	addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
 	addr.seg = VCPU_SREG_SS;
 	rc = segmented_read(ctxt, addr, dest, len);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
+	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len);
 	return rc;
 }
 
 static int em_pop(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	return emulate_pop(ctxt, &c->dst.val, c->op_bytes);
+	return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
 }
 
 static int emulate_popf(struct x86_emulate_ctxt *ctxt,
@@ -1457,30 +1446,25 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 
 static int em_popf(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->dst.type = OP_REG;
-	c->dst.addr.reg = &ctxt->eflags;
-	c->dst.bytes = c->op_bytes;
-	return emulate_popf(ctxt, &c->dst.val, c->op_bytes);
+	ctxt->dst.type = OP_REG;
+	ctxt->dst.addr.reg = &ctxt->eflags;
+	ctxt->dst.bytes = ctxt->op_bytes;
+	return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
 }
 
 static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->src.val = get_segment_selector(ctxt, seg);
+	ctxt->src.val = get_segment_selector(ctxt, seg);
 
 	return em_push(ctxt);
 }
 
 static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg)
 {
-	struct decode_cache *c = &ctxt->decode;
 	unsigned long selector;
 	int rc;
 
-	rc = emulate_pop(ctxt, &selector, c->op_bytes);
+	rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
@@ -1490,14 +1474,13 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg)
 
 static int em_pusha(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+	unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP];
 	int rc = X86EMUL_CONTINUE;
 	int reg = VCPU_REGS_RAX;
 
 	while (reg <= VCPU_REGS_RDI) {
 		(reg == VCPU_REGS_RSP) ?
-		(c->src.val = old_esp) : (c->src.val = c->regs[reg]);
+		(ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]);
 
 		rc = em_push(ctxt);
 		if (rc != X86EMUL_CONTINUE)
@@ -1511,26 +1494,23 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
 
 static int em_pushf(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->src.val =  (unsigned long)ctxt->eflags;
+	ctxt->src.val =  (unsigned long)ctxt->eflags;
 	return em_push(ctxt);
 }
 
 static int em_popa(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	int reg = VCPU_REGS_RDI;
 
 	while (reg >= VCPU_REGS_RAX) {
 		if (reg == VCPU_REGS_RSP) {
-			register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-							c->op_bytes);
+			register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP],
+							ctxt->op_bytes);
 			--reg;
 		}
 
-		rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes);
+		rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes);
 		if (rc != X86EMUL_CONTINUE)
 			break;
 		--reg;
@@ -1540,7 +1520,6 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
 
 int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
 {
-	struct decode_cache *c = &ctxt->decode;
 	struct x86_emulate_ops *ops = ctxt->ops;
 	int rc;
 	struct desc_ptr dt;
@@ -1549,19 +1528,19 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
 	u16 cs, eip;
 
 	/* TODO: Add limit checks */
-	c->src.val = ctxt->eflags;
+	ctxt->src.val = ctxt->eflags;
 	rc = em_push(ctxt);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
 	ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
 
-	c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
+	ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
 	rc = em_push(ctxt);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->src.val = c->_eip;
+	ctxt->src.val = ctxt->_eip;
 	rc = em_push(ctxt);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -1583,7 +1562,7 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->_eip = eip;
+	ctxt->_eip = eip;
 
 	return rc;
 }
@@ -1605,7 +1584,6 @@ static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
 
 static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	unsigned long temp_eip = 0;
 	unsigned long temp_eflags = 0;
@@ -1617,7 +1595,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 
 	/* TODO: Add stack limit check */
 
-	rc = emulate_pop(ctxt, &temp_eip, c->op_bytes);
+	rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes);
 
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -1625,12 +1603,12 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 	if (temp_eip & ~0xffff)
 		return emulate_gp(ctxt, 0);
 
-	rc = emulate_pop(ctxt, &cs, c->op_bytes);
+	rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
 
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes);
+	rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes);
 
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -1640,12 +1618,12 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->_eip = temp_eip;
+	ctxt->_eip = temp_eip;
 
 
-	if (c->op_bytes == 4)
+	if (ctxt->op_bytes == 4)
 		ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
-	else if (c->op_bytes == 2) {
+	else if (ctxt->op_bytes == 2) {
 		ctxt->eflags &= ~0xffff;
 		ctxt->eflags |= temp_eflags;
 	}
@@ -1673,53 +1651,49 @@ static int em_iret(struct x86_emulate_ctxt *ctxt)
 
 static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 	unsigned short sel;
 
-	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+	memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
 
 	rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->_eip = 0;
-	memcpy(&c->_eip, c->src.valptr, c->op_bytes);
+	ctxt->_eip = 0;
+	memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_grp1a(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	return emulate_pop(ctxt, &c->dst.val, c->dst.bytes);
+	return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes);
 }
 
 static int em_grp2(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	switch (c->modrm_reg) {
+	switch (ctxt->modrm_reg) {
 	case 0:	/* rol */
-		emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 1:	/* ror */
-		emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 2:	/* rcl */
-		emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 3:	/* rcr */
-		emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 4:	/* sal/shl */
 	case 6:	/* sal/shl */
-		emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 5:	/* shr */
-		emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 7:	/* sar */
-		emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	}
 	return X86EMUL_CONTINUE;
@@ -1727,33 +1701,32 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt)
 
 static int em_grp3(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	unsigned long *rax = &c->regs[VCPU_REGS_RAX];
-	unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
+	unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX];
+	unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX];
 	u8 de = 0;
 
-	switch (c->modrm_reg) {
+	switch (ctxt->modrm_reg) {
 	case 0 ... 1:	/* test */
-		emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 2:	/* not */
-		c->dst.val = ~c->dst.val;
+		ctxt->dst.val = ~ctxt->dst.val;
 		break;
 	case 3:	/* neg */
-		emulate_1op("neg", c->dst, ctxt->eflags);
+		emulate_1op("neg", ctxt->dst, ctxt->eflags);
 		break;
 	case 4: /* mul */
-		emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
+		emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags);
 		break;
 	case 5: /* imul */
-		emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
+		emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags);
 		break;
 	case 6: /* div */
-		emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
+		emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx,
 				       ctxt->eflags, de);
 		break;
 	case 7: /* idiv */
-		emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
+		emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx,
 				       ctxt->eflags, de);
 		break;
 	default:
@@ -1766,26 +1739,25 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt)
 
 static int em_grp45(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 
-	switch (c->modrm_reg) {
+	switch (ctxt->modrm_reg) {
 	case 0:	/* inc */
-		emulate_1op("inc", c->dst, ctxt->eflags);
+		emulate_1op("inc", ctxt->dst, ctxt->eflags);
 		break;
 	case 1:	/* dec */
-		emulate_1op("dec", c->dst, ctxt->eflags);
+		emulate_1op("dec", ctxt->dst, ctxt->eflags);
 		break;
 	case 2: /* call near abs */ {
 		long int old_eip;
-		old_eip = c->_eip;
-		c->_eip = c->src.val;
-		c->src.val = old_eip;
+		old_eip = ctxt->_eip;
+		ctxt->_eip = ctxt->src.val;
+		ctxt->src.val = old_eip;
 		rc = em_push(ctxt);
 		break;
 	}
 	case 4: /* jmp abs */
-		c->_eip = c->src.val;
+		ctxt->_eip = ctxt->src.val;
 		break;
 	case 5: /* jmp far */
 		rc = em_jmp_far(ctxt);
@@ -1799,17 +1771,16 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
 
 static int em_grp9(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	u64 old = c->dst.orig_val64;
+	u64 old = ctxt->dst.orig_val64;
 
-	if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
-	    ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
-		c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
-		c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+	if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) ||
+	    ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) {
+		ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
+		ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
 		ctxt->eflags &= ~EFLG_ZF;
 	} else {
-		c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
-			(u32) c->regs[VCPU_REGS_RBX];
+		ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) |
+			(u32) ctxt->regs[VCPU_REGS_RBX];
 
 		ctxt->eflags |= EFLG_ZF;
 	}
@@ -1818,26 +1789,23 @@ static int em_grp9(struct x86_emulate_ctxt *ctxt)
 
 static int em_ret(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->dst.type = OP_REG;
-	c->dst.addr.reg = &c->_eip;
-	c->dst.bytes = c->op_bytes;
+	ctxt->dst.type = OP_REG;
+	ctxt->dst.addr.reg = &ctxt->_eip;
+	ctxt->dst.bytes = ctxt->op_bytes;
 	return em_pop(ctxt);
 }
 
 static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 	unsigned long cs;
 
-	rc = emulate_pop(ctxt, &c->_eip, c->op_bytes);
+	rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	if (c->op_bytes == 4)
-		c->_eip = (u32)c->_eip;
-	rc = emulate_pop(ctxt, &cs, c->op_bytes);
+	if (ctxt->op_bytes == 4)
+		ctxt->_eip = (u32)ctxt->_eip;
+	rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
@@ -1846,17 +1814,16 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 
 static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg)
 {
-	struct decode_cache *c = &ctxt->decode;
 	unsigned short sel;
 	int rc;
 
-	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+	memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
 
 	rc = load_segment_descriptor(ctxt, sel, seg);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->dst.val = c->src.val;
+	ctxt->dst.val = ctxt->src.val;
 	return rc;
 }
 
@@ -1892,7 +1859,6 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 
 static int em_syscall(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct cs, ss;
 	u64 msr_data;
@@ -1919,15 +1885,15 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
 	ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
 	ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
-	c->regs[VCPU_REGS_RCX] = c->_eip;
+	ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip;
 	if (efer & EFER_LMA) {
 #ifdef CONFIG_X86_64
-		c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+		ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
 
 		ops->get_msr(ctxt,
 			     ctxt->mode == X86EMUL_MODE_PROT64 ?
 			     MSR_LSTAR : MSR_CSTAR, &msr_data);
-		c->_eip = msr_data;
+		ctxt->_eip = msr_data;
 
 		ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
 		ctxt->eflags &= ~(msr_data | EFLG_RF);
@@ -1935,7 +1901,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
 	} else {
 		/* legacy mode */
 		ops->get_msr(ctxt, MSR_STAR, &msr_data);
-		c->_eip = (u32)msr_data;
+		ctxt->_eip = (u32)msr_data;
 
 		ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
 	}
@@ -1945,7 +1911,6 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
 
 static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct cs, ss;
 	u64 msr_data;
@@ -1991,17 +1956,16 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 	ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
 	ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
-	c->_eip = msr_data;
+	ctxt->_eip = msr_data;
 
 	ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
-	c->regs[VCPU_REGS_RSP] = msr_data;
+	ctxt->regs[VCPU_REGS_RSP] = msr_data;
 
 	return X86EMUL_CONTINUE;
 }
 
 static int em_sysexit(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct cs, ss;
 	u64 msr_data;
@@ -2015,7 +1979,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
 
 	setup_syscalls_segments(ctxt, &cs, &ss);
 
-	if ((c->rex_prefix & 0x8) != 0x0)
+	if ((ctxt->rex_prefix & 0x8) != 0x0)
 		usermode = X86EMUL_MODE_PROT64;
 	else
 		usermode = X86EMUL_MODE_PROT32;
@@ -2045,8 +2009,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
 	ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
 	ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
-	c->_eip = c->regs[VCPU_REGS_RDX];
-	c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
+	ctxt->_eip = ctxt->regs[VCPU_REGS_RDX];
+	ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX];
 
 	return X86EMUL_CONTINUE;
 }
@@ -2113,18 +2077,16 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
 static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
 				struct tss_segment_16 *tss)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	tss->ip = c->_eip;
+	tss->ip = ctxt->_eip;
 	tss->flag = ctxt->eflags;
-	tss->ax = c->regs[VCPU_REGS_RAX];
-	tss->cx = c->regs[VCPU_REGS_RCX];
-	tss->dx = c->regs[VCPU_REGS_RDX];
-	tss->bx = c->regs[VCPU_REGS_RBX];
-	tss->sp = c->regs[VCPU_REGS_RSP];
-	tss->bp = c->regs[VCPU_REGS_RBP];
-	tss->si = c->regs[VCPU_REGS_RSI];
-	tss->di = c->regs[VCPU_REGS_RDI];
+	tss->ax = ctxt->regs[VCPU_REGS_RAX];
+	tss->cx = ctxt->regs[VCPU_REGS_RCX];
+	tss->dx = ctxt->regs[VCPU_REGS_RDX];
+	tss->bx = ctxt->regs[VCPU_REGS_RBX];
+	tss->sp = ctxt->regs[VCPU_REGS_RSP];
+	tss->bp = ctxt->regs[VCPU_REGS_RBP];
+	tss->si = ctxt->regs[VCPU_REGS_RSI];
+	tss->di = ctxt->regs[VCPU_REGS_RDI];
 
 	tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
 	tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2136,19 +2098,18 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
 static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
 				 struct tss_segment_16 *tss)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int ret;
 
-	c->_eip = tss->ip;
+	ctxt->_eip = tss->ip;
 	ctxt->eflags = tss->flag | 2;
-	c->regs[VCPU_REGS_RAX] = tss->ax;
-	c->regs[VCPU_REGS_RCX] = tss->cx;
-	c->regs[VCPU_REGS_RDX] = tss->dx;
-	c->regs[VCPU_REGS_RBX] = tss->bx;
-	c->regs[VCPU_REGS_RSP] = tss->sp;
-	c->regs[VCPU_REGS_RBP] = tss->bp;
-	c->regs[VCPU_REGS_RSI] = tss->si;
-	c->regs[VCPU_REGS_RDI] = tss->di;
+	ctxt->regs[VCPU_REGS_RAX] = tss->ax;
+	ctxt->regs[VCPU_REGS_RCX] = tss->cx;
+	ctxt->regs[VCPU_REGS_RDX] = tss->dx;
+	ctxt->regs[VCPU_REGS_RBX] = tss->bx;
+	ctxt->regs[VCPU_REGS_RSP] = tss->sp;
+	ctxt->regs[VCPU_REGS_RBP] = tss->bp;
+	ctxt->regs[VCPU_REGS_RSI] = tss->si;
+	ctxt->regs[VCPU_REGS_RDI] = tss->di;
 
 	/*
 	 * SDM says that segment selectors are loaded before segment
@@ -2230,19 +2191,17 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
 				struct tss_segment_32 *tss)
 {
-	struct decode_cache *c = &ctxt->decode;
-
 	tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
-	tss->eip = c->_eip;
+	tss->eip = ctxt->_eip;
 	tss->eflags = ctxt->eflags;
-	tss->eax = c->regs[VCPU_REGS_RAX];
-	tss->ecx = c->regs[VCPU_REGS_RCX];
-	tss->edx = c->regs[VCPU_REGS_RDX];
-	tss->ebx = c->regs[VCPU_REGS_RBX];
-	tss->esp = c->regs[VCPU_REGS_RSP];
-	tss->ebp = c->regs[VCPU_REGS_RBP];
-	tss->esi = c->regs[VCPU_REGS_RSI];
-	tss->edi = c->regs[VCPU_REGS_RDI];
+	tss->eax = ctxt->regs[VCPU_REGS_RAX];
+	tss->ecx = ctxt->regs[VCPU_REGS_RCX];
+	tss->edx = ctxt->regs[VCPU_REGS_RDX];
+	tss->ebx = ctxt->regs[VCPU_REGS_RBX];
+	tss->esp = ctxt->regs[VCPU_REGS_RSP];
+	tss->ebp = ctxt->regs[VCPU_REGS_RBP];
+	tss->esi = ctxt->regs[VCPU_REGS_RSI];
+	tss->edi = ctxt->regs[VCPU_REGS_RDI];
 
 	tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
 	tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2256,21 +2215,20 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
 static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 				 struct tss_segment_32 *tss)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int ret;
 
 	if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
 		return emulate_gp(ctxt, 0);
-	c->_eip = tss->eip;
+	ctxt->_eip = tss->eip;
 	ctxt->eflags = tss->eflags | 2;
-	c->regs[VCPU_REGS_RAX] = tss->eax;
-	c->regs[VCPU_REGS_RCX] = tss->ecx;
-	c->regs[VCPU_REGS_RDX] = tss->edx;
-	c->regs[VCPU_REGS_RBX] = tss->ebx;
-	c->regs[VCPU_REGS_RSP] = tss->esp;
-	c->regs[VCPU_REGS_RBP] = tss->ebp;
-	c->regs[VCPU_REGS_RSI] = tss->esi;
-	c->regs[VCPU_REGS_RDI] = tss->edi;
+	ctxt->regs[VCPU_REGS_RAX] = tss->eax;
+	ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
+	ctxt->regs[VCPU_REGS_RDX] = tss->edx;
+	ctxt->regs[VCPU_REGS_RBX] = tss->ebx;
+	ctxt->regs[VCPU_REGS_RSP] = tss->esp;
+	ctxt->regs[VCPU_REGS_RBP] = tss->ebp;
+	ctxt->regs[VCPU_REGS_RSI] = tss->esi;
+	ctxt->regs[VCPU_REGS_RDI] = tss->edi;
 
 	/*
 	 * SDM says that segment selectors are loaded before segment
@@ -2428,11 +2386,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
 
 	if (has_error_code) {
-		struct decode_cache *c = &ctxt->decode;
-
-		c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
-		c->lock_prefix = 0;
-		c->src.val = (unsigned long) error_code;
+		ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+		ctxt->lock_prefix = 0;
+		ctxt->src.val = (unsigned long) error_code;
 		ret = em_push(ctxt);
 	}
 
@@ -2443,17 +2399,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 			 u16 tss_selector, int reason,
 			 bool has_error_code, u32 error_code)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
-	c->_eip = ctxt->eip;
-	c->dst.type = OP_NONE;
+	ctxt->_eip = ctxt->eip;
+	ctxt->dst.type = OP_NONE;
 
 	rc = emulator_do_task_switch(ctxt, tss_selector, reason,
 				     has_error_code, error_code);
 
 	if (rc == X86EMUL_CONTINUE)
-		ctxt->eip = c->_eip;
+		ctxt->eip = ctxt->_eip;
 
 	return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 }
@@ -2461,22 +2416,20 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
 			    int reg, struct operand *op)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
 
-	register_address_increment(c, &c->regs[reg], df * op->bytes);
-	op->addr.mem.ea = register_address(c, c->regs[reg]);
+	register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
+	op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
 	op->addr.mem.seg = seg;
 }
 
 static int em_das(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	u8 al, old_al;
 	bool af, cf, old_cf;
 
 	cf = ctxt->eflags & X86_EFLAGS_CF;
-	al = c->dst.val;
+	al = ctxt->dst.val;
 
 	old_al = al;
 	old_cf = cf;
@@ -2494,12 +2447,12 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
 		cf = true;
 	}
 
-	c->dst.val = al;
+	ctxt->dst.val = al;
 	/* Set PF, ZF, SF */
-	c->src.type = OP_IMM;
-	c->src.val = 0;
-	c->src.bytes = 1;
-	emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+	ctxt->src.type = OP_IMM;
+	ctxt->src.val = 0;
+	ctxt->src.bytes = 1;
+	emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
 	ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
 	if (cf)
 		ctxt->eflags |= X86_EFLAGS_CF;
@@ -2510,224 +2463,189 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
 
 static int em_call_far(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	u16 sel, old_cs;
 	ulong old_eip;
 	int rc;
 
 	old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
-	old_eip = c->_eip;
+	old_eip = ctxt->_eip;
 
-	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+	memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
 	if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS))
 		return X86EMUL_CONTINUE;
 
-	c->_eip = 0;
-	memcpy(&c->_eip, c->src.valptr, c->op_bytes);
+	ctxt->_eip = 0;
+	memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
 
-	c->src.val = old_cs;
+	ctxt->src.val = old_cs;
 	rc = em_push(ctxt);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->src.val = old_eip;
+	ctxt->src.val = old_eip;
 	return em_push(ctxt);
 }
 
 static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
-	c->dst.type = OP_REG;
-	c->dst.addr.reg = &c->_eip;
-	c->dst.bytes = c->op_bytes;
-	rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes);
+	ctxt->dst.type = OP_REG;
+	ctxt->dst.addr.reg = &ctxt->_eip;
+	ctxt->dst.bytes = ctxt->op_bytes;
+	rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
+	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_add(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_or(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_adc(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_sbb(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_and(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_sub(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_xor(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_cmp(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
+	ctxt->dst.type = OP_NONE;
 	return X86EMUL_CONTINUE;
 }
 
 static int em_test(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_xchg(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
 	/* Write back the register source. */
-	c->src.val = c->dst.val;
-	write_register_operand(&c->src);
+	ctxt->src.val = ctxt->dst.val;
+	write_register_operand(&ctxt->src);
 
 	/* Write back the memory destination with implicit LOCK prefix. */
-	c->dst.val = c->src.orig_val;
-	c->lock_prefix = 1;
+	ctxt->dst.val = ctxt->src.orig_val;
+	ctxt->lock_prefix = 1;
 	return X86EMUL_CONTINUE;
 }
 
 static int em_imul(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->dst.val = c->src2.val;
+	ctxt->dst.val = ctxt->src2.val;
 	return em_imul(ctxt);
 }
 
 static int em_cwd(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->dst.type = OP_REG;
-	c->dst.bytes = c->src.bytes;
-	c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
-	c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
+	ctxt->dst.type = OP_REG;
+	ctxt->dst.bytes = ctxt->src.bytes;
+	ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+	ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
 
 	return X86EMUL_CONTINUE;
 }
 
 static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	u64 tsc = 0;
 
 	ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
-	c->regs[VCPU_REGS_RAX] = (u32)tsc;
-	c->regs[VCPU_REGS_RDX] = tsc >> 32;
+	ctxt->regs[VCPU_REGS_RAX] = (u32)tsc;
+	ctxt->regs[VCPU_REGS_RDX] = tsc >> 32;
 	return X86EMUL_CONTINUE;
 }
 
 static int em_mov(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	c->dst.val = c->src.val;
+	ctxt->dst.val = ctxt->src.val;
 	return X86EMUL_CONTINUE;
 }
 
 static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	if (c->modrm_reg > VCPU_SREG_GS)
+	if (ctxt->modrm_reg > VCPU_SREG_GS)
 		return emulate_ud(ctxt);
 
-	c->dst.val = get_segment_selector(ctxt, c->modrm_reg);
+	ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	u16 sel = c->src.val;
+	u16 sel = ctxt->src.val;
 
-	if (c->modrm_reg == VCPU_SREG_CS || c->modrm_reg > VCPU_SREG_GS)
+	if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS)
 		return emulate_ud(ctxt);
 
-	if (c->modrm_reg == VCPU_SREG_SS)
+	if (ctxt->modrm_reg == VCPU_SREG_SS)
 		ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
 
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
-	return load_segment_descriptor(ctxt, sel, c->modrm_reg);
+	ctxt->dst.type = OP_NONE;
+	return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
 }
 
 static int em_movdqu(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes);
+	memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_invlpg(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 	ulong linear;
 
-	rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear);
+	rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear);
 	if (rc == X86EMUL_CONTINUE)
 		ctxt->ops->invlpg(ctxt, linear);
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
+	ctxt->dst.type = OP_NONE;
 	return X86EMUL_CONTINUE;
 }
 
@@ -2743,10 +2661,9 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
 
 static int em_vmcall(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
-	if (c->modrm_mod != 3 || c->modrm_rm != 1)
+	if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1)
 		return X86EMUL_UNHANDLEABLE;
 
 	rc = ctxt->ops->fix_hypercall(ctxt);
@@ -2754,94 +2671,84 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)
 		return rc;
 
 	/* Let the processor re-execute the fixed hypercall */
-	c->_eip = ctxt->eip;
+	ctxt->_eip = ctxt->eip;
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
+	ctxt->dst.type = OP_NONE;
 	return X86EMUL_CONTINUE;
 }
 
 static int em_lgdt(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	struct desc_ptr desc_ptr;
 	int rc;
 
-	rc = read_descriptor(ctxt, c->src.addr.mem,
+	rc = read_descriptor(ctxt, ctxt->src.addr.mem,
 			     &desc_ptr.size, &desc_ptr.address,
-			     c->op_bytes);
+			     ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	ctxt->ops->set_gdt(ctxt, &desc_ptr);
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
+	ctxt->dst.type = OP_NONE;
 	return X86EMUL_CONTINUE;
 }
 
 static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
 	rc = ctxt->ops->fix_hypercall(ctxt);
 
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
+	ctxt->dst.type = OP_NONE;
 	return rc;
 }
 
 static int em_lidt(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	struct desc_ptr desc_ptr;
 	int rc;
 
-	rc = read_descriptor(ctxt, c->src.addr.mem,
+	rc = read_descriptor(ctxt, ctxt->src.addr.mem,
 			     &desc_ptr.size, &desc_ptr.address,
-			     c->op_bytes);
+			     ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	ctxt->ops->set_idt(ctxt, &desc_ptr);
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
+	ctxt->dst.type = OP_NONE;
 	return X86EMUL_CONTINUE;
 }
 
 static int em_smsw(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->dst.bytes = 2;
-	c->dst.val = ctxt->ops->get_cr(ctxt, 0);
+	ctxt->dst.bytes = 2;
+	ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_lmsw(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
-			  | (c->src.val & 0x0f));
-	c->dst.type = OP_NONE;
+			  | (ctxt->src.val & 0x0f));
+	ctxt->dst.type = OP_NONE;
 	return X86EMUL_CONTINUE;
 }
 
 static int em_loop(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
-	if ((address_mask(c, c->regs[VCPU_REGS_RCX]) != 0) &&
-	    (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
-		jmp_rel(c, c->src.val);
+	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
+	if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) &&
+	    (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
+		jmp_rel(ctxt, ctxt->src.val);
 
 	return X86EMUL_CONTINUE;
 }
 
 static int em_jcxz(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
-		jmp_rel(c, c->src.val);
+	if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0)
+		jmp_rel(ctxt, ctxt->src.val);
 
 	return X86EMUL_CONTINUE;
 }
@@ -2879,9 +2786,7 @@ static bool valid_cr(int nr)
 
 static int check_cr_read(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	if (!valid_cr(c->modrm_reg))
+	if (!valid_cr(ctxt->modrm_reg))
 		return emulate_ud(ctxt);
 
 	return X86EMUL_CONTINUE;
@@ -2889,9 +2794,8 @@ static int check_cr_read(struct x86_emulate_ctxt *ctxt)
 
 static int check_cr_write(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	u64 new_val = c->src.val64;
-	int cr = c->modrm_reg;
+	u64 new_val = ctxt->src.val64;
+	int cr = ctxt->modrm_reg;
 	u64 efer = 0;
 
 	static u64 cr_reserved_bits[] = {
@@ -2968,8 +2872,7 @@ static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
 
 static int check_dr_read(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	int dr = c->modrm_reg;
+	int dr = ctxt->modrm_reg;
 	u64 cr4;
 
 	if (dr > 7)
@@ -2987,9 +2890,8 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
 
 static int check_dr_write(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	u64 new_val = c->src.val64;
-	int dr = c->modrm_reg;
+	u64 new_val = ctxt->src.val64;
+	int dr = ctxt->modrm_reg;
 
 	if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
 		return emulate_gp(ctxt, 0);
@@ -3011,7 +2913,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt)
 
 static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
 {
-	u64 rax = ctxt->decode.regs[VCPU_REGS_RAX];
+	u64 rax = ctxt->regs[VCPU_REGS_RAX];
 
 	/* Valid physical address? */
 	if (rax & 0xffff000000000000ULL)
@@ -3033,7 +2935,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
 static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
 {
 	u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
-	u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX];
+	u64 rcx = ctxt->regs[VCPU_REGS_RCX];
 
 	if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
 	    (rcx > 3))
@@ -3044,10 +2946,8 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
 
 static int check_perm_in(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->dst.bytes = min(c->dst.bytes, 4u);
-	if (!emulator_io_permited(ctxt, c->src.val, c->dst.bytes))
+	ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
+	if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes))
 		return emulate_gp(ctxt, 0);
 
 	return X86EMUL_CONTINUE;
@@ -3055,10 +2955,8 @@ static int check_perm_in(struct x86_emulate_ctxt *ctxt)
 
 static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->src.bytes = min(c->src.bytes, 4u);
-	if (!emulator_io_permited(ctxt, c->dst.val, c->src.bytes))
+	ctxt->src.bytes = min(ctxt->src.bytes, 4u);
+	if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes))
 		return emulate_gp(ctxt, 0);
 
 	return X86EMUL_CONTINUE;
@@ -3390,11 +3288,11 @@ static struct opcode twobyte_table[256] = {
 #undef I2bv
 #undef I6ALU
 
-static unsigned imm_size(struct decode_cache *c)
+static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
 {
 	unsigned size;
 
-	size = (c->d & ByteOp) ? 1 : c->op_bytes;
+	size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
 	if (size == 8)
 		size = 4;
 	return size;
@@ -3403,22 +3301,21 @@ static unsigned imm_size(struct decode_cache *c)
 static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
 		      unsigned size, bool sign_extension)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 
 	op->type = OP_IMM;
 	op->bytes = size;
-	op->addr.mem.ea = c->_eip;
+	op->addr.mem.ea = ctxt->_eip;
 	/* NB. Immediates are sign-extended as necessary. */
 	switch (op->bytes) {
 	case 1:
-		op->val = insn_fetch(s8, 1, c->_eip);
+		op->val = insn_fetch(s8, 1, ctxt->_eip);
 		break;
 	case 2:
-		op->val = insn_fetch(s16, 2, c->_eip);
+		op->val = insn_fetch(s16, 2, ctxt->_eip);
 		break;
 	case 4:
-		op->val = insn_fetch(s32, 4, c->_eip);
+		op->val = insn_fetch(s32, 4, ctxt->_eip);
 		break;
 	}
 	if (!sign_extension) {
@@ -3440,7 +3337,6 @@ done:
 
 int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	int mode = ctxt->mode;
 	int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
@@ -3448,11 +3344,11 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 	struct opcode opcode;
 	struct operand memop = { .type = OP_NONE }, *memopp = NULL;
 
-	c->_eip = ctxt->eip;
-	c->fetch.start = c->_eip;
-	c->fetch.end = c->fetch.start + insn_len;
+	ctxt->_eip = ctxt->eip;
+	ctxt->fetch.start = ctxt->_eip;
+	ctxt->fetch.end = ctxt->fetch.start + insn_len;
 	if (insn_len > 0)
-		memcpy(c->fetch.data, insn, insn_len);
+		memcpy(ctxt->fetch.data, insn, insn_len);
 
 	switch (mode) {
 	case X86EMUL_MODE_REAL:
@@ -3473,46 +3369,46 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 		return -1;
 	}
 
-	c->op_bytes = def_op_bytes;
-	c->ad_bytes = def_ad_bytes;
+	ctxt->op_bytes = def_op_bytes;
+	ctxt->ad_bytes = def_ad_bytes;
 
 	/* Legacy prefixes. */
 	for (;;) {
-		switch (c->b = insn_fetch(u8, 1, c->_eip)) {
+		switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) {
 		case 0x66:	/* operand-size override */
 			op_prefix = true;
 			/* switch between 2/4 bytes */
-			c->op_bytes = def_op_bytes ^ 6;
+			ctxt->op_bytes = def_op_bytes ^ 6;
 			break;
 		case 0x67:	/* address-size override */
 			if (mode == X86EMUL_MODE_PROT64)
 				/* switch between 4/8 bytes */
-				c->ad_bytes = def_ad_bytes ^ 12;
+				ctxt->ad_bytes = def_ad_bytes ^ 12;
 			else
 				/* switch between 2/4 bytes */
-				c->ad_bytes = def_ad_bytes ^ 6;
+				ctxt->ad_bytes = def_ad_bytes ^ 6;
 			break;
 		case 0x26:	/* ES override */
 		case 0x2e:	/* CS override */
 		case 0x36:	/* SS override */
 		case 0x3e:	/* DS override */
-			set_seg_override(c, (c->b >> 3) & 3);
+			set_seg_override(ctxt, (ctxt->b >> 3) & 3);
 			break;
 		case 0x64:	/* FS override */
 		case 0x65:	/* GS override */
-			set_seg_override(c, c->b & 7);
+			set_seg_override(ctxt, ctxt->b & 7);
 			break;
 		case 0x40 ... 0x4f: /* REX */
 			if (mode != X86EMUL_MODE_PROT64)
 				goto done_prefixes;
-			c->rex_prefix = c->b;
+			ctxt->rex_prefix = ctxt->b;
 			continue;
 		case 0xf0:	/* LOCK */
-			c->lock_prefix = 1;
+			ctxt->lock_prefix = 1;
 			break;
 		case 0xf2:	/* REPNE/REPNZ */
 		case 0xf3:	/* REP/REPE/REPZ */
-			c->rep_prefix = c->b;
+			ctxt->rep_prefix = ctxt->b;
 			break;
 		default:
 			goto done_prefixes;
@@ -3520,50 +3416,50 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 
 		/* Any legacy prefix after a REX prefix nullifies its effect. */
 
-		c->rex_prefix = 0;
+		ctxt->rex_prefix = 0;
 	}
 
 done_prefixes:
 
 	/* REX prefix. */
-	if (c->rex_prefix & 8)
-		c->op_bytes = 8;	/* REX.W */
+	if (ctxt->rex_prefix & 8)
+		ctxt->op_bytes = 8;	/* REX.W */
 
 	/* Opcode byte(s). */
-	opcode = opcode_table[c->b];
+	opcode = opcode_table[ctxt->b];
 	/* Two-byte opcode? */
-	if (c->b == 0x0f) {
-		c->twobyte = 1;
-		c->b = insn_fetch(u8, 1, c->_eip);
-		opcode = twobyte_table[c->b];
+	if (ctxt->b == 0x0f) {
+		ctxt->twobyte = 1;
+		ctxt->b = insn_fetch(u8, 1, ctxt->_eip);
+		opcode = twobyte_table[ctxt->b];
 	}
-	c->d = opcode.flags;
+	ctxt->d = opcode.flags;
 
-	while (c->d & GroupMask) {
-		switch (c->d & GroupMask) {
+	while (ctxt->d & GroupMask) {
+		switch (ctxt->d & GroupMask) {
 		case Group:
-			c->modrm = insn_fetch(u8, 1, c->_eip);
-			--c->_eip;
-			goffset = (c->modrm >> 3) & 7;
+			ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+			--ctxt->_eip;
+			goffset = (ctxt->modrm >> 3) & 7;
 			opcode = opcode.u.group[goffset];
 			break;
 		case GroupDual:
-			c->modrm = insn_fetch(u8, 1, c->_eip);
-			--c->_eip;
-			goffset = (c->modrm >> 3) & 7;
-			if ((c->modrm >> 6) == 3)
+			ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+			--ctxt->_eip;
+			goffset = (ctxt->modrm >> 3) & 7;
+			if ((ctxt->modrm >> 6) == 3)
 				opcode = opcode.u.gdual->mod3[goffset];
 			else
 				opcode = opcode.u.gdual->mod012[goffset];
 			break;
 		case RMExt:
-			goffset = c->modrm & 7;
+			goffset = ctxt->modrm & 7;
 			opcode = opcode.u.group[goffset];
 			break;
 		case Prefix:
-			if (c->rep_prefix && op_prefix)
+			if (ctxt->rep_prefix && op_prefix)
 				return X86EMUL_UNHANDLEABLE;
-			simd_prefix = op_prefix ? 0x66 : c->rep_prefix;
+			simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
 			switch (simd_prefix) {
 			case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
 			case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
@@ -3575,61 +3471,61 @@ done_prefixes:
 			return X86EMUL_UNHANDLEABLE;
 		}
 
-		c->d &= ~GroupMask;
-		c->d |= opcode.flags;
+		ctxt->d &= ~GroupMask;
+		ctxt->d |= opcode.flags;
 	}
 
-	c->execute = opcode.u.execute;
-	c->check_perm = opcode.check_perm;
-	c->intercept = opcode.intercept;
+	ctxt->execute = opcode.u.execute;
+	ctxt->check_perm = opcode.check_perm;
+	ctxt->intercept = opcode.intercept;
 
 	/* Unrecognised? */
-	if (c->d == 0 || (c->d & Undefined))
+	if (ctxt->d == 0 || (ctxt->d & Undefined))
 		return -1;
 
-	if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
+	if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
 		return -1;
 
-	if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
-		c->op_bytes = 8;
+	if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
+		ctxt->op_bytes = 8;
 
-	if (c->d & Op3264) {
+	if (ctxt->d & Op3264) {
 		if (mode == X86EMUL_MODE_PROT64)
-			c->op_bytes = 8;
+			ctxt->op_bytes = 8;
 		else
-			c->op_bytes = 4;
+			ctxt->op_bytes = 4;
 	}
 
-	if (c->d & Sse)
-		c->op_bytes = 16;
+	if (ctxt->d & Sse)
+		ctxt->op_bytes = 16;
 
 	/* ModRM and SIB bytes. */
-	if (c->d & ModRM) {
+	if (ctxt->d & ModRM) {
 		rc = decode_modrm(ctxt, &memop);
-		if (!c->has_seg_override)
-			set_seg_override(c, c->modrm_seg);
-	} else if (c->d & MemAbs)
+		if (!ctxt->has_seg_override)
+			set_seg_override(ctxt, ctxt->modrm_seg);
+	} else if (ctxt->d & MemAbs)
 		rc = decode_abs(ctxt, &memop);
 	if (rc != X86EMUL_CONTINUE)
 		goto done;
 
-	if (!c->has_seg_override)
-		set_seg_override(c, VCPU_SREG_DS);
+	if (!ctxt->has_seg_override)
+		set_seg_override(ctxt, VCPU_SREG_DS);
 
-	memop.addr.mem.seg = seg_override(ctxt, c);
+	memop.addr.mem.seg = seg_override(ctxt);
 
-	if (memop.type == OP_MEM && c->ad_bytes != 8)
+	if (memop.type == OP_MEM && ctxt->ad_bytes != 8)
 		memop.addr.mem.ea = (u32)memop.addr.mem.ea;
 
 	/*
 	 * Decode and fetch the source operand: register, memory
 	 * or immediate.
 	 */
-	switch (c->d & SrcMask) {
+	switch (ctxt->d & SrcMask) {
 	case SrcNone:
 		break;
 	case SrcReg:
-		decode_register_operand(ctxt, &c->src, c, 0);
+		decode_register_operand(ctxt, &ctxt->src, 0);
 		break;
 	case SrcMem16:
 		memop.bytes = 2;
@@ -3638,60 +3534,60 @@ done_prefixes:
 		memop.bytes = 4;
 		goto srcmem_common;
 	case SrcMem:
-		memop.bytes = (c->d & ByteOp) ? 1 :
-							   c->op_bytes;
+		memop.bytes = (ctxt->d & ByteOp) ? 1 :
+							   ctxt->op_bytes;
 	srcmem_common:
-		c->src = memop;
-		memopp = &c->src;
+		ctxt->src = memop;
+		memopp = &ctxt->src;
 		break;
 	case SrcImmU16:
-		rc = decode_imm(ctxt, &c->src, 2, false);
+		rc = decode_imm(ctxt, &ctxt->src, 2, false);
 		break;
 	case SrcImm:
-		rc = decode_imm(ctxt, &c->src, imm_size(c), true);
+		rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true);
 		break;
 	case SrcImmU:
-		rc = decode_imm(ctxt, &c->src, imm_size(c), false);
+		rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false);
 		break;
 	case SrcImmByte:
-		rc = decode_imm(ctxt, &c->src, 1, true);
+		rc = decode_imm(ctxt, &ctxt->src, 1, true);
 		break;
 	case SrcImmUByte:
-		rc = decode_imm(ctxt, &c->src, 1, false);
+		rc = decode_imm(ctxt, &ctxt->src, 1, false);
 		break;
 	case SrcAcc:
-		c->src.type = OP_REG;
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
-		fetch_register_operand(&c->src);
+		ctxt->src.type = OP_REG;
+		ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+		ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
+		fetch_register_operand(&ctxt->src);
 		break;
 	case SrcOne:
-		c->src.bytes = 1;
-		c->src.val = 1;
+		ctxt->src.bytes = 1;
+		ctxt->src.val = 1;
 		break;
 	case SrcSI:
-		c->src.type = OP_MEM;
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.addr.mem.ea =
-			register_address(c, c->regs[VCPU_REGS_RSI]);
-		c->src.addr.mem.seg = seg_override(ctxt, c);
-		c->src.val = 0;
+		ctxt->src.type = OP_MEM;
+		ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+		ctxt->src.addr.mem.ea =
+			register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
+		ctxt->src.addr.mem.seg = seg_override(ctxt);
+		ctxt->src.val = 0;
 		break;
 	case SrcImmFAddr:
-		c->src.type = OP_IMM;
-		c->src.addr.mem.ea = c->_eip;
-		c->src.bytes = c->op_bytes + 2;
-		insn_fetch_arr(c->src.valptr, c->src.bytes, c->_eip);
+		ctxt->src.type = OP_IMM;
+		ctxt->src.addr.mem.ea = ctxt->_eip;
+		ctxt->src.bytes = ctxt->op_bytes + 2;
+		insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip);
 		break;
 	case SrcMemFAddr:
-		memop.bytes = c->op_bytes + 2;
+		memop.bytes = ctxt->op_bytes + 2;
 		goto srcmem_common;
 		break;
 	case SrcDX:
-		c->src.type = OP_REG;
-		c->src.bytes = 2;
-		c->src.addr.reg = &c->regs[VCPU_REGS_RDX];
-		fetch_register_operand(&c->src);
+		ctxt->src.type = OP_REG;
+		ctxt->src.bytes = 2;
+		ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+		fetch_register_operand(&ctxt->src);
 		break;
 	}
 
@@ -3702,22 +3598,22 @@ done_prefixes:
 	 * Decode and fetch the second source operand: register, memory
 	 * or immediate.
 	 */
-	switch (c->d & Src2Mask) {
+	switch (ctxt->d & Src2Mask) {
 	case Src2None:
 		break;
 	case Src2CL:
-		c->src2.bytes = 1;
-		c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
+		ctxt->src2.bytes = 1;
+		ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0x8;
 		break;
 	case Src2ImmByte:
-		rc = decode_imm(ctxt, &c->src2, 1, true);
+		rc = decode_imm(ctxt, &ctxt->src2, 1, true);
 		break;
 	case Src2One:
-		c->src2.bytes = 1;
-		c->src2.val = 1;
+		ctxt->src2.bytes = 1;
+		ctxt->src2.val = 1;
 		break;
 	case Src2Imm:
-		rc = decode_imm(ctxt, &c->src2, imm_size(c), true);
+		rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true);
 		break;
 	}
 
@@ -3725,68 +3621,66 @@ done_prefixes:
 		goto done;
 
 	/* Decode and fetch the destination operand: register or memory. */
-	switch (c->d & DstMask) {
+	switch (ctxt->d & DstMask) {
 	case DstReg:
-		decode_register_operand(ctxt, &c->dst, c,
-			 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
+		decode_register_operand(ctxt, &ctxt->dst,
+			 ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
 		break;
 	case DstImmUByte:
-		c->dst.type = OP_IMM;
-		c->dst.addr.mem.ea = c->_eip;
-		c->dst.bytes = 1;
-		c->dst.val = insn_fetch(u8, 1, c->_eip);
+		ctxt->dst.type = OP_IMM;
+		ctxt->dst.addr.mem.ea = ctxt->_eip;
+		ctxt->dst.bytes = 1;
+		ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip);
 		break;
 	case DstMem:
 	case DstMem64:
-		c->dst = memop;
-		memopp = &c->dst;
-		if ((c->d & DstMask) == DstMem64)
-			c->dst.bytes = 8;
+		ctxt->dst = memop;
+		memopp = &ctxt->dst;
+		if ((ctxt->d & DstMask) == DstMem64)
+			ctxt->dst.bytes = 8;
 		else
-			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		if (c->d & BitOp)
-			fetch_bit_operand(c);
-		c->dst.orig_val = c->dst.val;
+			ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+		if (ctxt->d & BitOp)
+			fetch_bit_operand(ctxt);
+		ctxt->dst.orig_val = ctxt->dst.val;
 		break;
 	case DstAcc:
-		c->dst.type = OP_REG;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
-		fetch_register_operand(&c->dst);
-		c->dst.orig_val = c->dst.val;
+		ctxt->dst.type = OP_REG;
+		ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+		ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
+		fetch_register_operand(&ctxt->dst);
+		ctxt->dst.orig_val = ctxt->dst.val;
 		break;
 	case DstDI:
-		c->dst.type = OP_MEM;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.addr.mem.ea =
-			register_address(c, c->regs[VCPU_REGS_RDI]);
-		c->dst.addr.mem.seg = VCPU_SREG_ES;
-		c->dst.val = 0;
+		ctxt->dst.type = OP_MEM;
+		ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+		ctxt->dst.addr.mem.ea =
+			register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
+		ctxt->dst.addr.mem.seg = VCPU_SREG_ES;
+		ctxt->dst.val = 0;
 		break;
 	case DstDX:
-		c->dst.type = OP_REG;
-		c->dst.bytes = 2;
-		c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
-		fetch_register_operand(&c->dst);
+		ctxt->dst.type = OP_REG;
+		ctxt->dst.bytes = 2;
+		ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+		fetch_register_operand(&ctxt->dst);
 		break;
 	case ImplicitOps:
 		/* Special instructions do their own operand decoding. */
 	default:
-		c->dst.type = OP_NONE; /* Disable writeback. */
+		ctxt->dst.type = OP_NONE; /* Disable writeback. */
 		break;
 	}
 
 done:
-	if (memopp && memopp->type == OP_MEM && c->rip_relative)
-		memopp->addr.mem.ea += c->_eip;
+	if (memopp && memopp->type == OP_MEM && ctxt->rip_relative)
+		memopp->addr.mem.ea += ctxt->_eip;
 
 	return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 }
 
 static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
 	/* The second termination condition only applies for REPE
 	 * and REPNE. Test if the repeat string operation prefix is
 	 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
@@ -3794,11 +3688,11 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 	 * 	- if REPE/REPZ and ZF = 0 then done
 	 * 	- if REPNE/REPNZ and ZF = 1 then done
 	 */
-	if (((c->b == 0xa6) || (c->b == 0xa7) ||
-	     (c->b == 0xae) || (c->b == 0xaf))
-	    && (((c->rep_prefix == REPE_PREFIX) &&
+	if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
+	     (ctxt->b == 0xae) || (ctxt->b == 0xaf))
+	    && (((ctxt->rep_prefix == REPE_PREFIX) &&
 		 ((ctxt->eflags & EFLG_ZF) == 0))
-		|| ((c->rep_prefix == REPNE_PREFIX) &&
+		|| ((ctxt->rep_prefix == REPNE_PREFIX) &&
 		    ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
 		return true;
 
@@ -3809,129 +3703,128 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
 	u64 msr_data;
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
-	int saved_dst_type = c->dst.type;
+	int saved_dst_type = ctxt->dst.type;
 
-	c->mem_read.pos = 0;
+	ctxt->mem_read.pos = 0;
 
-	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
+	if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) {
 		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
 	/* LOCK prefix is allowed only with some instructions */
-	if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
+	if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
 		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
-	if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
+	if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) {
 		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
-	if ((c->d & Sse)
+	if ((ctxt->d & Sse)
 	    && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
 		|| !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
 		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
-	if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+	if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
 		rc = emulate_nm(ctxt);
 		goto done;
 	}
 
-	if (unlikely(ctxt->guest_mode) && c->intercept) {
-		rc = emulator_check_intercept(ctxt, c->intercept,
+	if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+		rc = emulator_check_intercept(ctxt, ctxt->intercept,
 					      X86_ICPT_PRE_EXCEPT);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 	}
 
 	/* Privileged instruction can be executed only in CPL=0 */
-	if ((c->d & Priv) && ops->cpl(ctxt)) {
+	if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
 		rc = emulate_gp(ctxt, 0);
 		goto done;
 	}
 
 	/* Instruction can only be executed in protected mode */
-	if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
+	if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
 		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
 	/* Do instruction specific permission checks */
-	if (c->check_perm) {
-		rc = c->check_perm(ctxt);
+	if (ctxt->check_perm) {
+		rc = ctxt->check_perm(ctxt);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 	}
 
-	if (unlikely(ctxt->guest_mode) && c->intercept) {
-		rc = emulator_check_intercept(ctxt, c->intercept,
+	if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+		rc = emulator_check_intercept(ctxt, ctxt->intercept,
 					      X86_ICPT_POST_EXCEPT);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 	}
 
-	if (c->rep_prefix && (c->d & String)) {
+	if (ctxt->rep_prefix && (ctxt->d & String)) {
 		/* All REP prefixes have the same first termination condition */
-		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
-			ctxt->eip = c->_eip;
+		if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) {
+			ctxt->eip = ctxt->_eip;
 			goto done;
 		}
 	}
 
-	if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
-		rc = segmented_read(ctxt, c->src.addr.mem,
-				    c->src.valptr, c->src.bytes);
+	if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) {
+		rc = segmented_read(ctxt, ctxt->src.addr.mem,
+				    ctxt->src.valptr, ctxt->src.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
-		c->src.orig_val64 = c->src.val64;
+		ctxt->src.orig_val64 = ctxt->src.val64;
 	}
 
-	if (c->src2.type == OP_MEM) {
-		rc = segmented_read(ctxt, c->src2.addr.mem,
-				    &c->src2.val, c->src2.bytes);
+	if (ctxt->src2.type == OP_MEM) {
+		rc = segmented_read(ctxt, ctxt->src2.addr.mem,
+				    &ctxt->src2.val, ctxt->src2.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 	}
 
-	if ((c->d & DstMask) == ImplicitOps)
+	if ((ctxt->d & DstMask) == ImplicitOps)
 		goto special_insn;
 
 
-	if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
+	if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) {
 		/* optimisation - avoid slow emulated read if Mov */
-		rc = segmented_read(ctxt, c->dst.addr.mem,
-				   &c->dst.val, c->dst.bytes);
+		rc = segmented_read(ctxt, ctxt->dst.addr.mem,
+				   &ctxt->dst.val, ctxt->dst.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 	}
-	c->dst.orig_val = c->dst.val;
+	ctxt->dst.orig_val = ctxt->dst.val;
 
 special_insn:
 
-	if (unlikely(ctxt->guest_mode) && c->intercept) {
-		rc = emulator_check_intercept(ctxt, c->intercept,
+	if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+		rc = emulator_check_intercept(ctxt, ctxt->intercept,
 					      X86_ICPT_POST_MEMACCESS);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 	}
 
-	if (c->execute) {
-		rc = c->execute(ctxt);
+	if (ctxt->execute) {
+		rc = ctxt->execute(ctxt);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		goto writeback;
 	}
 
-	if (c->twobyte)
+	if (ctxt->twobyte)
 		goto twobyte_insn;
 
-	switch (c->b) {
+	switch (ctxt->b) {
 	case 0x06:		/* push es */
 		rc = emulate_push_sreg(ctxt, VCPU_SREG_ES);
 		break;
@@ -3954,45 +3847,45 @@ special_insn:
 		rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS);
 		break;
 	case 0x40 ... 0x47: /* inc r16/r32 */
-		emulate_1op("inc", c->dst, ctxt->eflags);
+		emulate_1op("inc", ctxt->dst, ctxt->eflags);
 		break;
 	case 0x48 ... 0x4f: /* dec r16/r32 */
-		emulate_1op("dec", c->dst, ctxt->eflags);
+		emulate_1op("dec", ctxt->dst, ctxt->eflags);
 		break;
 	case 0x63:		/* movsxd */
 		if (ctxt->mode != X86EMUL_MODE_PROT64)
 			goto cannot_emulate;
-		c->dst.val = (s32) c->src.val;
+		ctxt->dst.val = (s32) ctxt->src.val;
 		break;
 	case 0x6c:		/* insb */
 	case 0x6d:		/* insw/insd */
-		c->src.val = c->regs[VCPU_REGS_RDX];
+		ctxt->src.val = ctxt->regs[VCPU_REGS_RDX];
 		goto do_io_in;
 	case 0x6e:		/* outsb */
 	case 0x6f:		/* outsw/outsd */
-		c->dst.val = c->regs[VCPU_REGS_RDX];
+		ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX];
 		goto do_io_out;
 		break;
 	case 0x70 ... 0x7f: /* jcc (short) */
-		if (test_cc(c->b, ctxt->eflags))
-			jmp_rel(c, c->src.val);
+		if (test_cc(ctxt->b, ctxt->eflags))
+			jmp_rel(ctxt, ctxt->src.val);
 		break;
 	case 0x8d: /* lea r16/r32, m */
-		c->dst.val = c->src.addr.mem.ea;
+		ctxt->dst.val = ctxt->src.addr.mem.ea;
 		break;
 	case 0x8f:		/* pop (sole member of Grp1a) */
 		rc = em_grp1a(ctxt);
 		break;
 	case 0x90 ... 0x97: /* nop / xchg reg, rax */
-		if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
+		if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
 			break;
 		rc = em_xchg(ctxt);
 		break;
 	case 0x98: /* cbw/cwde/cdqe */
-		switch (c->op_bytes) {
-		case 2: c->dst.val = (s8)c->dst.val; break;
-		case 4: c->dst.val = (s16)c->dst.val; break;
-		case 8: c->dst.val = (s32)c->dst.val; break;
+		switch (ctxt->op_bytes) {
+		case 2: ctxt->dst.val = (s8)ctxt->dst.val; break;
+		case 4: ctxt->dst.val = (s16)ctxt->dst.val; break;
+		case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
 		}
 		break;
 	case 0xc0 ... 0xc1:
@@ -4008,7 +3901,7 @@ special_insn:
 		rc = emulate_int(ctxt, 3);
 		break;
 	case 0xcd:		/* int n */
-		rc = emulate_int(ctxt, c->src.val);
+		rc = emulate_int(ctxt, ctxt->src.val);
 		break;
 	case 0xce:		/* into */
 		if (ctxt->eflags & EFLG_OF)
@@ -4018,7 +3911,7 @@ special_insn:
 		rc = em_grp2(ctxt);
 		break;
 	case 0xd2 ... 0xd3:	/* Grp2 */
-		c->src.val = c->regs[VCPU_REGS_RCX];
+		ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
 		rc = em_grp2(ctxt);
 		break;
 	case 0xe4: 	/* inb */
@@ -4028,30 +3921,30 @@ special_insn:
 	case 0xe7: /* out */
 		goto do_io_out;
 	case 0xe8: /* call (near) */ {
-		long int rel = c->src.val;
-		c->src.val = (unsigned long) c->_eip;
-		jmp_rel(c, rel);
+		long int rel = ctxt->src.val;
+		ctxt->src.val = (unsigned long) ctxt->_eip;
+		jmp_rel(ctxt, rel);
 		rc = em_push(ctxt);
 		break;
 	}
 	case 0xe9: /* jmp rel */
 	case 0xeb: /* jmp rel short */
-		jmp_rel(c, c->src.val);
-		c->dst.type = OP_NONE; /* Disable writeback. */
+		jmp_rel(ctxt, ctxt->src.val);
+		ctxt->dst.type = OP_NONE; /* Disable writeback. */
 		break;
 	case 0xec: /* in al,dx */
 	case 0xed: /* in (e/r)ax,dx */
 	do_io_in:
-		if (!pio_in_emulated(ctxt, c->dst.bytes, c->src.val,
-				     &c->dst.val))
+		if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
+				     &ctxt->dst.val))
 			goto done; /* IO is needed */
 		break;
 	case 0xee: /* out dx,al */
 	case 0xef: /* out dx,(e/r)ax */
 	do_io_out:
-		ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val,
-				      &c->src.val, 1);
-		c->dst.type = OP_NONE;	/* Disable writeback. */
+		ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
+				      &ctxt->src.val, 1);
+		ctxt->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xf4:              /* hlt */
 		ctxt->ops->halt(ctxt);
@@ -4097,40 +3990,40 @@ writeback:
 	 * restore dst type in case the decoding will be reused
 	 * (happens for string instruction )
 	 */
-	c->dst.type = saved_dst_type;
+	ctxt->dst.type = saved_dst_type;
 
-	if ((c->d & SrcMask) == SrcSI)
-		string_addr_inc(ctxt, seg_override(ctxt, c),
-				VCPU_REGS_RSI, &c->src);
+	if ((ctxt->d & SrcMask) == SrcSI)
+		string_addr_inc(ctxt, seg_override(ctxt),
+				VCPU_REGS_RSI, &ctxt->src);
 
-	if ((c->d & DstMask) == DstDI)
+	if ((ctxt->d & DstMask) == DstDI)
 		string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
-				&c->dst);
+				&ctxt->dst);
 
-	if (c->rep_prefix && (c->d & String)) {
-		struct read_cache *r = &c->io_read;
-		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+	if (ctxt->rep_prefix && (ctxt->d & String)) {
+		struct read_cache *r = &ctxt->io_read;
+		register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
 
 		if (!string_insn_completed(ctxt)) {
 			/*
 			 * Re-enter guest when pio read ahead buffer is empty
 			 * or, if it is not used, after each 1024 iteration.
 			 */
-			if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) &&
+			if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) &&
 			    (r->end == 0 || r->end != r->pos)) {
 				/*
 				 * Reset read cache. Usually happens before
 				 * decode, but since instruction is restarted
 				 * we have to do it here.
 				 */
-				c->mem_read.end = 0;
+				ctxt->mem_read.end = 0;
 				return EMULATION_RESTART;
 			}
 			goto done; /* skip rip writeback */
 		}
 	}
 
-	ctxt->eip = c->_eip;
+	ctxt->eip = ctxt->_eip;
 
 done:
 	if (rc == X86EMUL_PROPAGATE_FAULT)
@@ -4141,7 +4034,7 @@ done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 
 twobyte_insn:
-	switch (c->b) {
+	switch (ctxt->b) {
 	case 0x09:		/* wbinvd */
 		(ctxt->ops->wbinvd)(ctxt);
 		break;
@@ -4150,21 +4043,21 @@ twobyte_insn:
 	case 0x18:		/* Grp16 (prefetch/nop) */
 		break;
 	case 0x20: /* mov cr, reg */
-		c->dst.val = ops->get_cr(ctxt, c->modrm_reg);
+		ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
 		break;
 	case 0x21: /* mov from dr to reg */
-		ops->get_dr(ctxt, c->modrm_reg, &c->dst.val);
+		ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
 		break;
 	case 0x22: /* mov reg, cr */
-		if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) {
+		if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) {
 			emulate_gp(ctxt, 0);
 			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		}
-		c->dst.type = OP_NONE;
+		ctxt->dst.type = OP_NONE;
 		break;
 	case 0x23: /* mov from reg to dr */
-		if (ops->set_dr(ctxt, c->modrm_reg, c->src.val &
+		if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val &
 				((ctxt->mode == X86EMUL_MODE_PROT64) ?
 				 ~0ULL : ~0U)) < 0) {
 			/* #UD condition is already handled by the code above */
@@ -4173,13 +4066,13 @@ twobyte_insn:
 			goto done;
 		}
 
-		c->dst.type = OP_NONE;	/* no writeback */
+		ctxt->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x30:
 		/* wrmsr */
-		msr_data = (u32)c->regs[VCPU_REGS_RAX]
-			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
-		if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) {
+		msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
+			| ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
+		if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) {
 			emulate_gp(ctxt, 0);
 			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
@@ -4188,27 +4081,27 @@ twobyte_insn:
 		break;
 	case 0x32:
 		/* rdmsr */
-		if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) {
+		if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) {
 			emulate_gp(ctxt, 0);
 			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		} else {
-			c->regs[VCPU_REGS_RAX] = (u32)msr_data;
-			c->regs[VCPU_REGS_RDX] = msr_data >> 32;
+			ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
+			ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
 		}
 		rc = X86EMUL_CONTINUE;
 		break;
 	case 0x40 ... 0x4f:	/* cmov */
-		c->dst.val = c->dst.orig_val = c->src.val;
-		if (!test_cc(c->b, ctxt->eflags))
-			c->dst.type = OP_NONE; /* no writeback */
+		ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
+		if (!test_cc(ctxt->b, ctxt->eflags))
+			ctxt->dst.type = OP_NONE; /* no writeback */
 		break;
 	case 0x80 ... 0x8f: /* jnz rel, etc*/
-		if (test_cc(c->b, ctxt->eflags))
-			jmp_rel(c, c->src.val);
+		if (test_cc(ctxt->b, ctxt->eflags))
+			jmp_rel(ctxt, ctxt->src.val);
 		break;
 	case 0x90 ... 0x9f:     /* setcc r/m8 */
-		c->dst.val = test_cc(c->b, ctxt->eflags);
+		ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
 		break;
 	case 0xa0:	  /* push fs */
 		rc = emulate_push_sreg(ctxt, VCPU_SREG_FS);
@@ -4218,14 +4111,14 @@ twobyte_insn:
 		break;
 	case 0xa3:
 	      bt:		/* bt */
-		c->dst.type = OP_NONE;
+		ctxt->dst.type = OP_NONE;
 		/* only subword offset */
-		c->src.val &= (c->dst.bytes << 3) - 1;
-		emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
+		ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
+		emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 0xa4: /* shld imm8, r, r/m */
 	case 0xa5: /* shld cl, r, r/m */
-		emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
+		emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 0xa8:	/* push gs */
 		rc = emulate_push_sreg(ctxt, VCPU_SREG_GS);
@@ -4235,11 +4128,11 @@ twobyte_insn:
 		break;
 	case 0xab:
 	      bts:		/* bts */
-		emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 0xac: /* shrd imm8, r, r/m */
 	case 0xad: /* shrd cl, r, r/m */
-		emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
+		emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 0xae:              /* clflush */
 		break;
@@ -4248,16 +4141,16 @@ twobyte_insn:
 		 * Save real source value, then compare EAX against
 		 * destination.
 		 */
-		c->src.orig_val = c->src.val;
-		c->src.val = c->regs[VCPU_REGS_RAX];
-		emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+		ctxt->src.orig_val = ctxt->src.val;
+		ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
+		emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
 		if (ctxt->eflags & EFLG_ZF) {
 			/* Success: write back to memory. */
-			c->dst.val = c->src.orig_val;
+			ctxt->dst.val = ctxt->src.orig_val;
 		} else {
 			/* Failure: write the value we saw to EAX. */
-			c->dst.type = OP_REG;
-			c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+			ctxt->dst.type = OP_REG;
+			ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
 		}
 		break;
 	case 0xb2:		/* lss */
@@ -4265,7 +4158,7 @@ twobyte_insn:
 		break;
 	case 0xb3:
 	      btr:		/* btr */
-		emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 0xb4:		/* lfs */
 		rc = emulate_load_segment(ctxt, VCPU_SREG_FS);
@@ -4274,12 +4167,12 @@ twobyte_insn:
 		rc = emulate_load_segment(ctxt, VCPU_SREG_GS);
 		break;
 	case 0xb6 ... 0xb7:	/* movzx */
-		c->dst.bytes = c->op_bytes;
-		c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
-						       : (u16) c->src.val;
+		ctxt->dst.bytes = ctxt->op_bytes;
+		ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
+						       : (u16) ctxt->src.val;
 		break;
 	case 0xba:		/* Grp8 */
-		switch (c->modrm_reg & 3) {
+		switch (ctxt->modrm_reg & 3) {
 		case 0:
 			goto bt;
 		case 1:
@@ -4292,47 +4185,47 @@ twobyte_insn:
 		break;
 	case 0xbb:
 	      btc:		/* btc */
-		emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 0xbc: {		/* bsf */
 		u8 zf;
 		__asm__ ("bsf %2, %0; setz %1"
-			 : "=r"(c->dst.val), "=q"(zf)
-			 : "r"(c->src.val));
+			 : "=r"(ctxt->dst.val), "=q"(zf)
+			 : "r"(ctxt->src.val));
 		ctxt->eflags &= ~X86_EFLAGS_ZF;
 		if (zf) {
 			ctxt->eflags |= X86_EFLAGS_ZF;
-			c->dst.type = OP_NONE;	/* Disable writeback. */
+			ctxt->dst.type = OP_NONE;	/* Disable writeback. */
 		}
 		break;
 	}
 	case 0xbd: {		/* bsr */
 		u8 zf;
 		__asm__ ("bsr %2, %0; setz %1"
-			 : "=r"(c->dst.val), "=q"(zf)
-			 : "r"(c->src.val));
+			 : "=r"(ctxt->dst.val), "=q"(zf)
+			 : "r"(ctxt->src.val));
 		ctxt->eflags &= ~X86_EFLAGS_ZF;
 		if (zf) {
 			ctxt->eflags |= X86_EFLAGS_ZF;
-			c->dst.type = OP_NONE;	/* Disable writeback. */
+			ctxt->dst.type = OP_NONE;	/* Disable writeback. */
 		}
 		break;
 	}
 	case 0xbe ... 0xbf:	/* movsx */
-		c->dst.bytes = c->op_bytes;
-		c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
-							(s16) c->src.val;
+		ctxt->dst.bytes = ctxt->op_bytes;
+		ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
+							(s16) ctxt->src.val;
 		break;
 	case 0xc0 ... 0xc1:	/* xadd */
-		emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
 		/* Write back the register source. */
-		c->src.val = c->dst.orig_val;
-		write_register_operand(&c->src);
+		ctxt->src.val = ctxt->dst.orig_val;
+		write_register_operand(&ctxt->src);
 		break;
 	case 0xc3:		/* movnti */
-		c->dst.bytes = c->op_bytes;
-		c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
-							(u64) c->src.val;
+		ctxt->dst.bytes = ctxt->op_bytes;
+		ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
+							(u64) ctxt->src.val;
 		break;
 	case 0xc7:		/* Grp9 (cmpxchg8b) */
 		rc = em_grp9(ctxt);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index d69e758d00ac..624f8cb46a6b 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -675,12 +675,12 @@ TRACE_EVENT(kvm_emulate_insn,
 		),
 
 	TP_fast_assign(
-		__entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
+		__entry->rip = vcpu->arch.emulate_ctxt.fetch.start;
 		__entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
-		__entry->len = vcpu->arch.emulate_ctxt.decode._eip
-			       - vcpu->arch.emulate_ctxt.decode.fetch.start;
+		__entry->len = vcpu->arch.emulate_ctxt._eip
+			       - vcpu->arch.emulate_ctxt.fetch.start;
 		memcpy(__entry->insn,
-		       vcpu->arch.emulate_ctxt.decode.fetch.data,
+		       vcpu->arch.emulate_ctxt.fetch.data,
 		       15);
 		__entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
 		__entry->failed = failed;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7e452fe31e40..694538a043e7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4507,24 +4507,24 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 		kvm_queue_exception(vcpu, ctxt->exception.vector);
 }
 
-static void init_decode_cache(struct decode_cache *c,
+static void init_decode_cache(struct x86_emulate_ctxt *ctxt,
 			      const unsigned long *regs)
 {
-	memset(c, 0, offsetof(struct decode_cache, regs));
-	memcpy(c->regs, regs, sizeof(c->regs));
+	memset(&ctxt->twobyte, 0,
+	       (void *)&ctxt->regs - (void *)&ctxt->twobyte);
+	memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
 
-	c->fetch.start = 0;
-	c->fetch.end = 0;
-	c->io_read.pos = 0;
-	c->io_read.end = 0;
-	c->mem_read.pos = 0;
-	c->mem_read.end = 0;
+	ctxt->fetch.start = 0;
+	ctxt->fetch.end = 0;
+	ctxt->io_read.pos = 0;
+	ctxt->io_read.end = 0;
+	ctxt->mem_read.pos = 0;
+	ctxt->mem_read.end = 0;
 }
 
 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
-	struct decode_cache *c = &ctxt->decode;
 	int cs_db, cs_l;
 
 	/*
@@ -4546,28 +4546,27 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 							  X86EMUL_MODE_PROT16;
 	ctxt->guest_mode = is_guest_mode(vcpu);
 
-	init_decode_cache(c, vcpu->arch.regs);
+	init_decode_cache(ctxt, vcpu->arch.regs);
 	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
 }
 
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
-	struct decode_cache *c = &ctxt->decode;
 	int ret;
 
 	init_emulate_ctxt(vcpu);
 
-	c->op_bytes = 2;
-	c->ad_bytes = 2;
-	c->_eip = ctxt->eip + inc_eip;
+	ctxt->op_bytes = 2;
+	ctxt->ad_bytes = 2;
+	ctxt->_eip = ctxt->eip + inc_eip;
 	ret = emulate_int_real(ctxt, irq);
 
 	if (ret != X86EMUL_CONTINUE)
 		return EMULATE_FAIL;
 
-	ctxt->eip = c->_eip;
-	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+	ctxt->eip = ctxt->_eip;
+	memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
 	kvm_rip_write(vcpu, ctxt->eip);
 	kvm_set_rflags(vcpu, ctxt->eflags);
 
@@ -4631,7 +4630,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 {
 	int r;
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
-	struct decode_cache *c = &ctxt->decode;
 	bool writeback = true;
 
 	kvm_clear_exception_queue(vcpu);
@@ -4661,7 +4659,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	}
 
 	if (emulation_type & EMULTYPE_SKIP) {
-		kvm_rip_write(vcpu, c->_eip);
+		kvm_rip_write(vcpu, ctxt->_eip);
 		return EMULATE_DONE;
 	}
 
@@ -4669,7 +4667,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	   changes registers values  during IO operation */
 	if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
 		vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
-		memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+		memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs);
 	}
 
 restart:
@@ -4707,7 +4705,7 @@ restart:
 		toggle_interruptibility(vcpu, ctxt->interruptibility);
 		kvm_set_rflags(vcpu, ctxt->eflags);
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
-		memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+		memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
 		kvm_rip_write(vcpu, ctxt->eip);
 	} else
@@ -5718,8 +5716,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 		 * that usually, but some bad designed PV devices (vmware
 		 * backdoor interface) need this to work
 		 */
-		struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
-		memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+		struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+		memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
 	}
 	regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -5849,7 +5847,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 		    bool has_error_code, u32 error_code)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
-	struct decode_cache *c = &ctxt->decode;
 	int ret;
 
 	init_emulate_ctxt(vcpu);
@@ -5860,7 +5857,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 	if (ret)
 		return EMULATE_FAIL;
 
-	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+	memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
 	kvm_rip_write(vcpu, ctxt->eip);
 	kvm_set_rflags(vcpu, ctxt->eflags);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);

From 7f4382e8fd8c87bcb7122a2f63c03e8713f594a0 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Thu, 2 Jun 2011 16:16:20 +0200
Subject: [PATCH 063/143] KVM: Fixup documentation section numbering

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 3755a390f886..2bd06b018b32 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1323,7 +1323,7 @@ struct kvm_lapic_state {
 Copies the input argument into the the Local APIC registers.  The data format
 and layout are the same as documented in the architecture manual.
 
-4.56 KVM_IOEVENTFD
+4.58 KVM_IOEVENTFD
 
 Capability: KVM_CAP_IOEVENTFD
 Architectures: all

From 509c75ea198fe524adaf90ca1021487b733447ce Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Thu, 2 Jun 2011 11:54:52 +0300
Subject: [PATCH 064/143] KVM: nVMX: Fix bug preventing more than two levels of
 nesting

The nested VMX feature is supposed to fully emulate VMX for the guest. This
(theoretically) not only allows it to run its own guests, but also to further
emulate VMX for its own guests, allowing arbitrarily deep nesting.

This patch fixes a bug (discovered by Kevin Tian) in handling a VMLAUNCH
by L2, which prevented deeper nesting.

Deeper nesting now works (I only actually tested L3), but is currently
*absurdly* slow, to the point of being unusable.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 58badf1657d4..f5b49c7fc89d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5694,8 +5694,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	if (vmx->nested.nested_run_pending)
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
 
-	if (exit_reason == EXIT_REASON_VMLAUNCH ||
-	    exit_reason == EXIT_REASON_VMRESUME)
+	if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH ||
+	    exit_reason == EXIT_REASON_VMRESUME))
 		vmx->nested.nested_run_pending = 1;
 	else
 		vmx->nested.nested_run_pending = 0;

From 8d9c975fc5b825cb76953a1b45a84195ffc6f4ab Mon Sep 17 00:00:00 2001
From: "Yang, Wei Y" <wei.y.yang@intel.com>
Date: Fri, 3 Jun 2011 11:13:35 +0800
Subject: [PATCH 065/143] KVM: Remove SMEP bit from CR4_RESERVED_BITS

This patch removes SMEP bit from CR4_RESERVED_BITS.

Signed-off-by: Yang, Wei <wei.y.yang@intel.com>
Signed-off-by: Shan, Haitao <haitao.shan@intel.com>
Signed-off-by: Li, Xin <xin.li@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d167039ecdf4..fc38eca116c0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,7 +48,7 @@
 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
 			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
-			  | X86_CR4_OSXSAVE \
+			  | X86_CR4_OSXSAVE | X86_CR4_SMEP              \
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)

From c68b734fba402b9bfdd49e23b776c42dbeaf1f5b Mon Sep 17 00:00:00 2001
From: "Yang, Wei Y" <wei.y.yang@intel.com>
Date: Fri, 3 Jun 2011 11:13:42 +0800
Subject: [PATCH 066/143] KVM: Add SMEP support when setting CR4

This patch adds SMEP handling when setting CR4.

Signed-off-by: Yang, Wei <wei.y.yang@intel.com>
Signed-off-by: Shan, Haitao <haitao.shan@intel.com>
Signed-off-by: Li, Xin <xin.li@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 694538a043e7..ba5cd27b429a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -580,6 +580,14 @@ static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
 	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
 }
 
+static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_SMEP));
+}
+
 static void update_cpuid(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
@@ -599,14 +607,17 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long old_cr4 = kvm_read_cr4(vcpu);
-	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
-
+	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
+				   X86_CR4_PAE | X86_CR4_SMEP;
 	if (cr4 & CR4_RESERVED_BITS)
 		return 1;
 
 	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
 		return 1;
 
+	if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
+		return 1;
+
 	if (is_long_mode(vcpu)) {
 		if (!(cr4 & X86_CR4_PAE))
 			return 1;
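
A guest-side illustration (not part of the patch; the function name is
invented for this sketch): the guest_cpuid_has_smep() check added above
mirrors the architectural feature flag CPUID.(EAX=7,ECX=0):EBX bit 7, which a
guest would probe roughly like this before attempting to set CR4.SMEP.

#include <cpuid.h>
#include <stdbool.h>

/* Probe CPUID.(EAX=7,ECX=0):EBX bit 7, the SMEP feature flag. */
static bool cpu_has_smep(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (__get_cpuid(0, &eax, &ebx, &ecx, &edx) == 0 || eax < 7)
		return false;		/* structured extended leaf absent */
	__cpuid_count(7, 0, eax, ebx, ecx, edx);
	return ebx & (1u << 7);		/* X86_FEATURE_SMEP */
}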

From 611c120f7486a19e7df2225f875a52ef0b599ae8 Mon Sep 17 00:00:00 2001
From: "Yang, Wei Y" <wei.y.yang@intel.com>
Date: Fri, 3 Jun 2011 11:14:03 +0800
Subject: [PATCH 067/143] KVM: Mask function7 ebx against host capability word9

This patch masks CPUID leaf 7 ebx against host capability word9.

Signed-off-by: Yang, Wei <wei.y.yang@intel.com>
Signed-off-by: Shan, Haitao <haitao.shan@intel.com>
Signed-off-by: Li, Xin <xin.li@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ba5cd27b429a..ff4623b1b102 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2359,6 +2359,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
 		F(PMM) | F(PMM_EN);
 
+	/* cpuid 7.0.ebx */
+	const u32 kvm_supported_word9_x86_features =
+		F(SMEP);
+
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
 	do_cpuid_1_ent(entry, function, index);
@@ -2393,7 +2397,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		}
 		break;
 	}
-	/* function 4 and 0xb have additional index. */
+	/* function 4 has additional index. */
 	case 4: {
 		int i, cache_type;
 
@@ -2410,8 +2414,22 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		}
 		break;
 	}
+	case 7: {
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		/* Mask ebx against host capability word 9 */
+		if (index == 0) {
+			entry->ebx &= kvm_supported_word9_x86_features;
+			cpuid_mask(&entry->ebx, 9);
+		} else
+			entry->ebx = 0;
+		entry->eax = 0;
+		entry->ecx = 0;
+		entry->edx = 0;
+		break;
+	}
 	case 9:
 		break;
+	/* function 0xb has additional index. */
 	case 0xb: {
 		int i, level_type;
 

From e57d4a356ad3ac46881399c424cc6cf6dd16359d Mon Sep 17 00:00:00 2001
From: "Yang, Wei Y" <wei.y.yang@intel.com>
Date: Fri, 3 Jun 2011 11:14:16 +0800
Subject: [PATCH 068/143] KVM: Add instruction fetch checking when walking
 guest page table

This patch adds instruction fetch checking when walking guest page table,
to implement SMEP when emulating instead of executing natively.

Signed-off-by: Yang, Wei <wei.y.yang@intel.com>
Signed-off-by: Shan, Haitao <haitao.shan@intel.com>
Signed-off-by: Li, Xin <xin.li@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9d03ad4dd5ec..1caeb4d22e01 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -246,6 +246,12 @@ walk:
 			gfn_t gfn;
 			u32 ac;
 
+			/* check if the kernel is fetching from user page */
+			if (unlikely(pte_access & PT_USER_MASK) &&
+			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+				if (fetch_fault && !user_fault)
+					eperm = true;
+
 			gfn = gpte_to_gfn_lvl(pte, lvl);
 			gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
 
@@ -305,7 +311,8 @@ error:
 
 	walker->fault.error_code |= write_fault | user_fault;
 
-	if (fetch_fault && mmu->nx)
+	if (fetch_fault && (mmu->nx ||
+			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
 		walker->fault.error_code |= PFERR_FETCH_MASK;
 	if (rsvd_fault)
 		walker->fault.error_code |= PFERR_RSVD_MASK;

From 91e3d71db29d9b3b67b64e2a08e724a7ff538b4c Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 3 Jun 2011 08:51:05 +0200
Subject: [PATCH 069/143] KVM: Clarify KVM_ASSIGN_PCI_DEVICE documentation

Neither host_irq nor the guest_msi struct is used anymore today.
Tag the former, drop the latter to avoid confusion.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 7 +------
 include/linux/kvm.h               | 8 +-------
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 2bd06b018b32..0ebe9225b171 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1143,15 +1143,10 @@ Assigns an IRQ to a passed-through device.
 
 struct kvm_assigned_irq {
 	__u32 assigned_dev_id;
-	__u32 host_irq;
+	__u32 host_irq; /* ignored (legacy field) */
 	__u32 guest_irq;
 	__u32 flags;
 	union {
-		struct {
-			__u32 addr_lo;
-			__u32 addr_hi;
-			__u32 data;
-		} guest_msi;
 		__u32 reserved[12];
 	};
 };
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 55ef181521ff..9c9ca7c7b491 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -773,20 +773,14 @@ struct kvm_assigned_pci_dev {
 
 struct kvm_assigned_irq {
 	__u32 assigned_dev_id;
-	__u32 host_irq;
+	__u32 host_irq; /* ignored (legacy field) */
 	__u32 guest_irq;
 	__u32 flags;
 	union {
-		struct {
-			__u32 addr_lo;
-			__u32 addr_hi;
-			__u32 data;
-		} guest_msi;
 		__u32 reserved[12];
 	};
 };
 
-
 struct kvm_assigned_msix_nr {
 	__u32 assigned_dev_id;
 	__u16 entry_nr;

From 1dda606c5f94b14a8f36c220d1d8844bab68a720 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 8 Jun 2011 02:45:37 +0200
Subject: [PATCH 070/143] KVM: Add compat ioctl for KVM_SET_SIGNAL_MASK

KVM has an ioctl to define which signal mask should be used while running
inside VCPU_RUN. At least for big endian systems, this mask is different
on 32-bit and 64-bit systems (though the size is identical).

Add a compat wrapper that converts the mask to whatever the kernel accepts,
allowing 32-bit kvm user space to set signal masks.

This patch fixes qemu with --enable-io-thread on ppc64 hosts when running
32-bit user land.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 kernel/compat.c     |  1 +
 virt/kvm/kvm_main.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/kernel/compat.c b/kernel/compat.c
index fc9eb093acd5..18197ae2d465 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -890,6 +890,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
 	case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
 	}
 }
+EXPORT_SYMBOL_GPL(sigset_from_compat);
 
 asmlinkage long
 compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fa2321ac77cc..11d2783eb9df 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -84,6 +84,10 @@ struct dentry *kvm_debugfs_dir;
 
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 			   unsigned long arg);
+#ifdef CONFIG_COMPAT
+static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
+				  unsigned long arg);
+#endif
 static int hardware_enable_all(void);
 static void hardware_disable_all(void);
 
@@ -1586,7 +1590,9 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
 static struct file_operations kvm_vcpu_fops = {
 	.release        = kvm_vcpu_release,
 	.unlocked_ioctl = kvm_vcpu_ioctl,
-	.compat_ioctl   = kvm_vcpu_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = kvm_vcpu_compat_ioctl,
+#endif
 	.mmap           = kvm_vcpu_mmap,
 	.llseek		= noop_llseek,
 };
@@ -1875,6 +1881,50 @@ out:
 	return r;
 }
 
+#ifdef CONFIG_COMPAT
+static long kvm_vcpu_compat_ioctl(struct file *filp,
+				  unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = compat_ptr(arg);
+	int r;
+
+	if (vcpu->kvm->mm != current->mm)
+		return -EIO;
+
+	switch (ioctl) {
+	case KVM_SET_SIGNAL_MASK: {
+		struct kvm_signal_mask __user *sigmask_arg = argp;
+		struct kvm_signal_mask kvm_sigmask;
+		compat_sigset_t csigset;
+		sigset_t sigset;
+
+		if (argp) {
+			r = -EFAULT;
+			if (copy_from_user(&kvm_sigmask, argp,
+					   sizeof kvm_sigmask))
+				goto out;
+			r = -EINVAL;
+			if (kvm_sigmask.len != sizeof csigset)
+				goto out;
+			r = -EFAULT;
+			if (copy_from_user(&csigset, sigmask_arg->sigset,
+					   sizeof csigset))
+				goto out;
+		}
+		sigset_from_compat(&sigset, &csigset);
+		r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+		break;
+	}
+	default:
+		r = kvm_vcpu_ioctl(filp, ioctl, arg);
+	}
+
+out:
+	return r;
+}
+#endif
+
 static long kvm_vm_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
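
For reference, a userspace-side sketch (not part of the patch; the helper name
is invented) of how a signal mask reaches a vcpu fd. Note that the length must
be the kernel's 8-byte sigset (64 signals), not glibc's 128-byte sigset_t; on
a big-endian 64-bit host the new compat path additionally fixes up the 32-bit
word layout.

#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_vcpu_sigmask(int vcpu_fd, const sigset_t *mask)
{
	uint64_t buf[2];	/* 4-byte header + 8 mask bytes, aligned */
	struct kvm_signal_mask *arg = (struct kvm_signal_mask *)buf;

	memset(buf, 0, sizeof(buf));
	arg->len = 8;			/* kernel sigset size, not sizeof(sigset_t) */
	memcpy(arg->sigset, mask, 8);	/* low 64 signal bits only */

	return ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, arg);
}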

From 9f3191aec595ef9f3c80bc96664fd7aef57ef5be Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sat, 11 Jun 2011 12:23:55 +0200
Subject: [PATCH 071/143] KVM: Fix off-by-one in overflow check of
 KVM_ASSIGN_SET_MSIX_NR

KVM_MAX_MSIX_PER_DEV implies that up to that many MSI-X entries can be
requested. But so far the kernel rejected the upper limit itself as well.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/assigned-dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 6cc4b97ec458..4e9eaeb518c7 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -617,7 +617,7 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
 	if (adev->entries_nr == 0) {
 		adev->entries_nr = entry_nr->entry_nr;
 		if (adev->entries_nr == 0 ||
-		    adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
+		    adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
 			r = -EINVAL;
 			goto msix_nr_out;
 		}
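
Userspace-side illustration (not part of the patch; the helper name is
invented, and it assumes the device has already been assigned via
KVM_ASSIGN_PCI_DEVICE): with the off-by-one removed, requesting the full
per-device maximum now succeeds. KVM_MAX_MSIX_PER_DEV is kernel-internal
(256 in this tree), so the value is spelled out here.

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int request_max_msix(int vm_fd, unsigned int dev_id)
{
	struct kvm_assigned_msix_nr nr = {
		.assigned_dev_id = dev_id,
		.entry_nr = 256,	/* the maximum; rejected before this fix */
	};

	return ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_NR, &nr);
}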

From 58f0964ee445d6703bf2bfd5170e75fb0920ad8f Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sat, 11 Jun 2011 12:24:24 +0200
Subject: [PATCH 072/143] KVM: Fix KVM_ASSIGN_SET_MSIX_ENTRY documentation

The documented behavior did not match the implemented one (which also
never changed).

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 0ebe9225b171..b2511360b8f5 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1234,8 +1234,10 @@ Type: vm ioctl
 Parameters: struct kvm_assigned_msix_nr (in)
 Returns: 0 on success, -1 on error
 
-Set the number of MSI-X interrupts for an assigned device. This service can
-only be called once in the lifetime of an assigned device.
+Set the number of MSI-X interrupts for an assigned device. The number is
+reset again by terminating the MSI-X assignment of the device via
+KVM_DEASSIGN_DEV_IRQ. Calling this service more than once at any earlier
+point will fail.
 
 struct kvm_assigned_msix_nr {
 	__u32 assigned_dev_id;

From 02668b061db1b9f7f18872e594ac68e237db0bed Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Fri, 10 Jun 2011 11:35:30 +0200
Subject: [PATCH 073/143] KVM: fix XSAVE bit scanning (now properly)

commit 123108f1c1aafd51d6a5c79cc04d7999dd88a930 tried to fix KVM's
XSAVE valid feature scanning, but it was wrong. It did not consider
the sparse nature of this bitfield, instead reading values from
uninitialized members of the entries array.
This patch now separates subleaf indices from KVM's array indices
and fills each entry before querying its value.
This fixes AVX support in KVM guests.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ff4623b1b102..84f46074ca74 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2447,16 +2447,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		break;
 	}
 	case 0xd: {
-		int i;
+		int idx, i;
 
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		for (i = 1; *nent < maxnent && i < 64; ++i) {
-			if (entry[i].eax == 0 || !supported_xcr0_bit(i))
+		for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
+			do_cpuid_1_ent(&entry[i], function, idx);
+			if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
 				continue;
-			do_cpuid_1_ent(&entry[i], function, i);
 			entry[i].flags |=
 			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 			++*nent;
+			++i;
 		}
 		break;
 	}
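
To make the "sparse bitfield" point concrete, a small host-side sketch (not
part of the patch): leaf 0xD subleaves can only be judged valid after actually
being queried, which is the ordering the fix restores. Subleaf 1 describes
XSAVE feature bits rather than a state component, so the loop starts at 2.

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx, idx;

	if (__get_cpuid(0, &eax, &ebx, &ecx, &edx) == 0 || eax < 0xd)
		return 1;		/* no XSAVE enumeration leaf */

	for (idx = 2; idx < 64; idx++) {
		__cpuid_count(0xd, idx, eax, ebx, ecx, edx);
		if (eax == 0)
			continue;	/* hole in the sparse state map */
		printf("XSAVE state %u: %u bytes at offset %u\n",
		       idx, eax, ebx);
	}
	return 0;
}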

From 4a00efdf0c7c93dc6f6b3ce7e2d6bd4cd1ac1651 Mon Sep 17 00:00:00 2001
From: "Yang, Wei Y" <wei.y.yang@intel.com>
Date: Mon, 13 Jun 2011 21:52:33 +0800
Subject: [PATCH 074/143] KVM: Enable DRNG feature support for KVM

This patch exposes DRNG feature to KVM guests.

The RDRAND instruction can provide software with sequences of
random numbers generated from white noise.

Signed-off-by: Yang, Wei <wei.y.yang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 84f46074ca74..6f1e54d0828d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2345,7 +2345,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
 		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
-		F(F16C);
+		F(F16C) | F(RDRAND);
 	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |

From d9c3476d8a99455cd3af1bd773acd77aa947a934 Mon Sep 17 00:00:00 2001
From: "Yang, Wei" <wei.y.yang@intel.com>
Date: Tue, 14 Jun 2011 20:10:17 +0800
Subject: [PATCH 075/143] KVM: Remove RDWRGSFS bit from CR4_RESERVED_BITS

This patch removes RDWRGSFS bit from CR4_RESERVED_BITS.

Signed-off-by: Yang, Wei <wei.y.yang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h        | 2 +-
 arch/x86/include/asm/processor-flags.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fc38eca116c0..554be456f11e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,7 +48,7 @@
 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
 			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
-			  | X86_CR4_OSXSAVE | X86_CR4_SMEP              \
+			  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 59ab4dffa377..2dddb317bb39 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -59,6 +59,7 @@
 #define X86_CR4_OSFXSR	0x00000200 /* enable fast FPU save and restore */
 #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
 #define X86_CR4_VMXE	0x00002000 /* enable VMX virtualization */
+#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */
 #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
 #define X86_CR4_SMEP	0x00100000 /* enable SMEP support */
 

From 74dc2b4ffe3af1eac367be0178f88487cb9b240a Mon Sep 17 00:00:00 2001
From: "Yang, Wei" <wei.y.yang@intel.com>
Date: Tue, 14 Jun 2011 20:10:18 +0800
Subject: [PATCH 076/143] KVM: Add RDWRGSFS support when setting CR4

This patch adds RDWRGSFS support when setting CR4.

Signed-off-by: Yang, Wei <wei.y.yang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6f1e54d0828d..423e0cda8e91 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -588,6 +588,14 @@ static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
 	return best && (best->ebx & bit(X86_FEATURE_SMEP));
 }
 
+static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
+}
+
 static void update_cpuid(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
@@ -618,6 +626,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
 		return 1;
 
+	if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
+		return 1;
+
 	if (is_long_mode(vcpu)) {
 		if (!(cr4 & X86_CR4_PAE))
 			return 1;

From 176f61da82435eae09cc96f70b530d1ba0746b8b Mon Sep 17 00:00:00 2001
From: "Yang, Wei" <wei.y.yang@intel.com>
Date: Tue, 14 Jun 2011 20:10:19 +0800
Subject: [PATCH 077/143] KVM: Expose RDWRGSFS bit to KVM guests

This patch exposes RDWRGSFS bit to KVM guests.

Signed-off-by: Yang, Wei <wei.y.yang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 423e0cda8e91..76c16a5c6c56 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2372,7 +2372,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 7.0.ebx */
 	const u32 kvm_supported_word9_x86_features =
-		F(SMEP);
+		F(SMEP) | F(FSGSBASE);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();

From a01c8f9b4e266df1d7166d23216f2060648f862d Mon Sep 17 00:00:00 2001
From: "Yang, Wei" <wei.y.yang@intel.com>
Date: Tue, 14 Jun 2011 15:19:06 +0800
Subject: [PATCH 078/143] KVM: Enable ERMS feature support for KVM

This patch exposes ERMS feature to KVM guests.

Enhanced REP MOVSB/STOSB (fast strings) attempts to move as much of the
data as possible using larger-size loads/stores.

Signed-off-by: Yang, Wei <wei.y.yang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 76c16a5c6c56..0b803f04bde7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2372,7 +2372,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 7.0.ebx */
 	const u32 kvm_supported_word9_x86_features =
-		F(SMEP) | F(FSGSBASE);
+		F(SMEP) | F(FSGSBASE) | F(ERMS);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();

From 411c588dfb863feee78b721d5e7c86ac38921c49 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 6 Jun 2011 16:11:54 +0300
Subject: [PATCH 079/143] KVM: MMU: Adjust shadow paging to work when SMEP=1
 and CR0.WP=0

When CR0.WP=0, we sometimes map user pages as kernel pages (to allow
the kernel to write to them).  Unfortunately this also allows the kernel
to fetch from these pages, even if CR4.SMEP is set.

Adjust for this by also setting NX on the spte in these circumstances.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/mmu.txt | 18 ++++++++++++++++++
 arch/x86/include/asm/kvm_host.h   |  1 +
 arch/x86/kvm/mmu.c                | 14 +++++++++++++-
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index f46aa58389ca..5dc972c09b55 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -165,6 +165,10 @@ Shadow pages contain the following information:
     Contains the value of efer.nxe for which the page is valid.
   role.cr0_wp:
     Contains the value of cr0.wp for which the page is valid.
+  role.smep_andnot_wp:
+    Contains the value of cr4.smep && !cr0.wp for which the page is valid
+    (pages for which this is true are different from other pages; see the
+    treatment of cr0.wp=0 below).
   gfn:
     Either the guest page table containing the translations shadowed by this
     page, or the base page frame for linear translations.  See role.direct.
@@ -317,6 +321,20 @@ on fault type:
 
 (user write faults generate a #PF)
 
+In the first case there is an additional complication if CR4.SMEP is
+enabled: since we've turned the page into a kernel page, the kernel may now
+execute it.  We handle this by also setting spte.nx.  If we get a user
+fetch or read fault, we'll change spte.u=1 and spte.nx=gpte.nx back.
+
+To prevent an spte that was converted into a kernel page with cr0.wp=0
+from being written by the kernel after cr0.wp has changed to 1, we make
+the value of cr0.wp part of the page role.  This means that an spte created
+with one value of cr0.wp cannot be used when cr0.wp has a different value -
+it will simply be missed by the shadow page lookup code.  A similar issue
+exists when an spte created with cr0.wp=0 and cr4.smep=0 is used after
+changing cr4.smep to 1.  To avoid this, the value of !cr0.wp && cr4.smep
+is also made a part of the page role.
+
 Large pages
 ===========
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 554be456f11e..da6bbee878ca 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -205,6 +205,7 @@ union kvm_mmu_page_role {
 		unsigned invalid:1;
 		unsigned nxe:1;
 		unsigned cr0_wp:1;
+		unsigned smep_andnot_wp:1;
 	};
 };
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 15afa1e1eaf9..da0f3b081076 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1985,8 +1985,17 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= PT_WRITABLE_MASK;
 
 		if (!vcpu->arch.mmu.direct_map
-		    && !(pte_access & ACC_WRITE_MASK))
+		    && !(pte_access & ACC_WRITE_MASK)) {
 			spte &= ~PT_USER_MASK;
+			/*
+			 * If we converted a user page to a kernel page,
+			 * so that the kernel can write to it when cr0.wp=0,
+			 * then we should prevent the kernel from executing it
+			 * if SMEP is enabled.
+			 */
+			if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+				spte |= PT64_NX_MASK;
+		}
 
 		/*
 		 * Optimization: for pte sync, if spte was writable the hash
@@ -2955,6 +2964,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 {
 	int r;
+	bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
 	ASSERT(vcpu);
 	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
@@ -2969,6 +2979,8 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
 	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+	vcpu->arch.mmu.base_role.smep_andnot_wp
+		= smep && !is_write_protection(vcpu);
 
 	return r;
 }

From 45bd07b9d5202c910b31c92bd15572b560198c26 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 12 Jun 2011 18:14:08 +0300
Subject: [PATCH 080/143] KVM: MMU: make kvm_mmu_reset_context() flush the
 guest TLB

kvm_set_cr0() and kvm_set_cr4(), and possibly other functions,
assume that kvm_mmu_reset_context() flushes the guest TLB.  However,
it does not.

Fix by flushing the tlb (and syncing the new root as well).

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index da0f3b081076..9c629b54d362 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3054,8 +3054,18 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
 {
+	int r;
+
 	destroy_kvm_mmu(vcpu);
-	return init_kvm_mmu(vcpu);
+	r = init_kvm_mmu(vcpu);
+
+	if (r)
+		goto err;
+
+	kvm_mmu_sync_roots(vcpu);
+	kvm_mmu_flush_tlb(vcpu);
+err:
+	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
 

From 24294b9a3fbe00289c039fb3e80087be66b8c415 Mon Sep 17 00:00:00 2001
From: Stuart Yoder <stuart.yoder@freescale.com>
Date: Tue, 17 May 2011 18:26:00 -0500
Subject: [PATCH 081/143] KVM: PPC: fix partial application of "exit timing in
 ticks"

When http://www.spinics.net/lists/kvm-ppc/msg02664.html
was applied to produce commit b51e7aa7ed6d8d134d02df78300ab0f91cfff4d2,
the removal of the conversion in add_exit_timing was left out.

Signed-off-by: Stuart Yoder <stuart.yoder@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/timing.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
index 319177df9587..07b6110a4bb7 100644
--- a/arch/powerpc/kvm/timing.c
+++ b/arch/powerpc/kvm/timing.c
@@ -56,15 +56,6 @@ static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type)
 {
 	u64 old;
 
-	do_div(duration, tb_ticks_per_usec);
-	if (unlikely(duration > 0xFFFFFFFF)) {
-		printk(KERN_ERR"%s - duration too big -> overflow"
-			" duration %lld type %d exit #%d\n",
-			__func__, duration, type,
-			vcpu->arch.timing_count_type[type]);
-		return;
-	}
-
 	mutex_lock(&vcpu->arch.exit_timing_lock);
 
 	vcpu->arch.timing_count_type[type]++;

From a22a2daccfa3ade5cdd9ef1e8a05cf1e6ffca42b Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 7 Jun 2011 20:45:34 +0200
Subject: [PATCH 082/143] KVM: PPC: Resolve real-mode handlers through function
 exports

Up until now, Book3S KVM had variables stored in the kernel that a kernel module
or the kvm code in the kernel could read from to figure out where some real mode
helper functions are located.

This is all unnecessary. The high bits of the EA are ignored in real mode, so we
can just use the pointer as is. Also, it's a lot easier on relocations when we
use the normal way of resolving the address to a function, instead of jumping
through hoops.

This patch fixes compilation with CONFIG_RELOCATABLE=y.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_book3s.h | 4 ++--
 arch/powerpc/kvm/book3s.c             | 4 ++--
 arch/powerpc/kvm/book3s_exports.c     | 4 ++--
 arch/powerpc/kvm/book3s_rmhandlers.S  | 8 --------
 4 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index d62e703f1214..70c409b1d931 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -133,8 +133,8 @@ extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
-extern ulong kvmppc_trampoline_lowmem;
-extern ulong kvmppc_trampoline_enter;
+extern void kvmppc_handler_lowmem_trampoline(void);
+extern void kvmppc_handler_trampoline_enter(void);
 extern void kvmppc_rmcall(ulong srr0, ulong srr1);
 extern void kvmppc_load_up_fpu(void);
 extern void kvmppc_load_up_altivec(void);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 0f95b5cce033..73fdab8a567f 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1342,8 +1342,8 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	vcpu_book3s->slb_nr = 64;
 
 	/* remember where some real-mode handlers are */
-	vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem;
-	vcpu->arch.trampoline_enter = kvmppc_trampoline_enter;
+	vcpu->arch.trampoline_lowmem = (ulong)kvmppc_handler_lowmem_trampoline;
+	vcpu->arch.trampoline_enter = (ulong)kvmppc_handler_trampoline_enter;
 	vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
 #ifdef CONFIG_PPC_BOOK3S_64
 	vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
index 1dd5a1ddfd0d..f94fd9a4c69a 100644
--- a/arch/powerpc/kvm/book3s_exports.c
+++ b/arch/powerpc/kvm/book3s_exports.c
@@ -20,8 +20,8 @@
 #include <linux/module.h>
 #include <asm/kvm_book3s.h>
 
-EXPORT_SYMBOL_GPL(kvmppc_trampoline_enter);
-EXPORT_SYMBOL_GPL(kvmppc_trampoline_lowmem);
+EXPORT_SYMBOL_GPL(kvmppc_handler_trampoline_enter);
+EXPORT_SYMBOL_GPL(kvmppc_handler_lowmem_trampoline);
 EXPORT_SYMBOL_GPL(kvmppc_rmcall);
 EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
 #ifdef CONFIG_ALTIVEC
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 1a1b34487e71..5f5d07475c91 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -251,12 +251,4 @@ define_load_up(altivec)
 define_load_up(vsx)
 #endif
 
-.global kvmppc_trampoline_lowmem
-kvmppc_trampoline_lowmem:
-	PPC_LONG kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START
-
-.global kvmppc_trampoline_enter
-kvmppc_trampoline_enter:
-	PPC_LONG kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START
-
 #include "book3s_segment.S"

From 685659ee70db0bac47ffd619c726cf600e504fd7 Mon Sep 17 00:00:00 2001
From: yu liu <yu.liu@freescale.com>
Date: Tue, 14 Jun 2011 18:34:25 -0500
Subject: [PATCH 083/143] powerpc/e500: Save SPEFSCR in flush_spe_to_thread()

giveup_spe() saves the SPE state which is protected by MSR[SPE].
However, modifying SPEFSCR does not trap when MSR[SPE]=0.
And since SPEFSCR is already saved/restored in _switch(),
not all the callers want to save SPEFSCR again.
Thus, saving SPEFSCR should not belong to giveup_spe().

This patch moves SPEFSCR saving to flush_spe_to_thread(),
and cleans up the caller that needs to save SPEFSCR accordingly.

Signed-off-by: Liu Yu <yu.liu@freescale.com>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kernel/head_fsl_booke.S | 2 --
 arch/powerpc/kernel/process.c        | 1 +
 arch/powerpc/kernel/traps.c          | 5 +----
 3 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 5ecf54cfa7d4..aede4f87b682 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -792,8 +792,6 @@ _GLOBAL(giveup_spe)
 	evmwumiaa evr6, evr6, evr6	/* evr6 <- ACC = 0 * 0 + ACC */
 	li	r4,THREAD_ACC
 	evstddx	evr6, r4, r3		/* save off accumulator */
-	mfspr	r6,SPRN_SPEFSCR
-	stw	r6,THREAD_SPEFSCR(r3)	/* save spefscr register value */
 	beq	1f
 	lwz	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
 	lis	r3,MSR_SPE@h
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 91e52df3d81d..60ac2a9251db 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -213,6 +213,7 @@ void flush_spe_to_thread(struct task_struct *tsk)
 #ifdef CONFIG_SMP
 			BUG_ON(tsk != current);
 #endif
+			tsk->thread.spefscr = mfspr(SPRN_SPEFSCR);
 			giveup_spe(tsk);
 		}
 		preempt_enable();
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 1a0141426cda..f19d9777d3c1 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1387,10 +1387,7 @@ void SPEFloatingPointException(struct pt_regs *regs)
 	int code = 0;
 	int err;
 
-	preempt_disable();
-	if (regs->msr & MSR_SPE)
-		giveup_spe(current);
-	preempt_enable();
+	flush_spe_to_thread(current);
 
 	spefscr = current->thread.spefscr;
 	fpexc_mode = current->thread.fpexc_mode;

From c51584d52e3878aa9b2bb98cdfb87173e7acf560 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 14 Jun 2011 18:34:27 -0500
Subject: [PATCH 084/143] powerpc/e500: SPE register saving: take arbitrary
 struct offset

Previously, these macros hardcoded THREAD_EVR0 as the base of the save
area, relative to the base register passed.  This base offset is now
passed as a separate macro parameter, allowing reuse with other SPE
save areas, such as the one used by KVM.

Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/ppc_asm.h   | 28 ++++++++++++++++------------
 arch/powerpc/kernel/head_fsl_booke.S |  6 +++---
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 1b422381fc16..368f72f79808 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -150,18 +150,22 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 #define REST_16VSRSU(n,b,base)	REST_8VSRSU(n,b,base); REST_8VSRSU(n+8,b,base)
 #define REST_32VSRSU(n,b,base)	REST_16VSRSU(n,b,base); REST_16VSRSU(n+16,b,base)
 
-#define SAVE_EVR(n,s,base)	evmergehi s,s,n; stw s,THREAD_EVR0+4*(n)(base)
-#define SAVE_2EVRS(n,s,base)	SAVE_EVR(n,s,base); SAVE_EVR(n+1,s,base)
-#define SAVE_4EVRS(n,s,base)	SAVE_2EVRS(n,s,base); SAVE_2EVRS(n+2,s,base)
-#define SAVE_8EVRS(n,s,base)	SAVE_4EVRS(n,s,base); SAVE_4EVRS(n+4,s,base)
-#define SAVE_16EVRS(n,s,base)	SAVE_8EVRS(n,s,base); SAVE_8EVRS(n+8,s,base)
-#define SAVE_32EVRS(n,s,base)	SAVE_16EVRS(n,s,base); SAVE_16EVRS(n+16,s,base)
-#define REST_EVR(n,s,base)	lwz s,THREAD_EVR0+4*(n)(base); evmergelo n,s,n
-#define REST_2EVRS(n,s,base)	REST_EVR(n,s,base); REST_EVR(n+1,s,base)
-#define REST_4EVRS(n,s,base)	REST_2EVRS(n,s,base); REST_2EVRS(n+2,s,base)
-#define REST_8EVRS(n,s,base)	REST_4EVRS(n,s,base); REST_4EVRS(n+4,s,base)
-#define REST_16EVRS(n,s,base)	REST_8EVRS(n,s,base); REST_8EVRS(n+8,s,base)
-#define REST_32EVRS(n,s,base)	REST_16EVRS(n,s,base); REST_16EVRS(n+16,s,base)
+/*
+ * b = base register for addressing, o = base offset from register of 1st EVR
+ * n = first EVR, s = scratch
+ */
+#define SAVE_EVR(n,s,b,o)	evmergehi s,s,n; stw s,o+4*(n)(b)
+#define SAVE_2EVRS(n,s,b,o)	SAVE_EVR(n,s,b,o); SAVE_EVR(n+1,s,b,o)
+#define SAVE_4EVRS(n,s,b,o)	SAVE_2EVRS(n,s,b,o); SAVE_2EVRS(n+2,s,b,o)
+#define SAVE_8EVRS(n,s,b,o)	SAVE_4EVRS(n,s,b,o); SAVE_4EVRS(n+4,s,b,o)
+#define SAVE_16EVRS(n,s,b,o)	SAVE_8EVRS(n,s,b,o); SAVE_8EVRS(n+8,s,b,o)
+#define SAVE_32EVRS(n,s,b,o)	SAVE_16EVRS(n,s,b,o); SAVE_16EVRS(n+16,s,b,o)
+#define REST_EVR(n,s,b,o)	lwz s,o+4*(n)(b); evmergelo n,s,n
+#define REST_2EVRS(n,s,b,o)	REST_EVR(n,s,b,o); REST_EVR(n+1,s,b,o)
+#define REST_4EVRS(n,s,b,o)	REST_2EVRS(n,s,b,o); REST_2EVRS(n+2,s,b,o)
+#define REST_8EVRS(n,s,b,o)	REST_4EVRS(n,s,b,o); REST_4EVRS(n+4,s,b,o)
+#define REST_16EVRS(n,s,b,o)	REST_8EVRS(n,s,b,o); REST_8EVRS(n+8,s,b,o)
+#define REST_32EVRS(n,s,b,o)	REST_16EVRS(n,s,b,o); REST_16EVRS(n+16,s,b,o)
 
 /* Macros to adjust thread priority for hardware multithreading */
 #define HMT_VERY_LOW	or	31,31,31	# very low priority
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index aede4f87b682..fe37dd0dfd17 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -656,7 +656,7 @@ load_up_spe:
 	cmpi	0,r4,0
 	beq	1f
 	addi	r4,r4,THREAD	/* want THREAD of last_task_used_spe */
-	SAVE_32EVRS(0,r10,r4)
+	SAVE_32EVRS(0,r10,r4,THREAD_EVR0)
 	evxor	evr10, evr10, evr10	/* clear out evr10 */
 	evmwumiaa evr10, evr10, evr10	/* evr10 <- ACC = 0 * 0 + ACC */
 	li	r5,THREAD_ACC
@@ -676,7 +676,7 @@ load_up_spe:
 	stw	r4,THREAD_USED_SPE(r5)
 	evlddx	evr4,r10,r5
 	evmra	evr4,evr4
-	REST_32EVRS(0,r10,r5)
+	REST_32EVRS(0,r10,r5,THREAD_EVR0)
 #ifndef CONFIG_SMP
 	subi	r4,r5,THREAD
 	stw	r4,last_task_used_spe@l(r3)
@@ -787,7 +787,7 @@ _GLOBAL(giveup_spe)
 	addi	r3,r3,THREAD		/* want THREAD of task */
 	lwz	r5,PT_REGS(r3)
 	cmpi	0,r5,0
-	SAVE_32EVRS(0, r4, r3)
+	SAVE_32EVRS(0, r4, r3, THREAD_EVR0)
 	evxor	evr6, evr6, evr6	/* clear out evr6 */
 	evmwumiaa evr6, evr6, evr6	/* evr6 <- ACC = 0 * 0 + ACC */
 	li	r4,THREAD_ACC

From ecee273fc48f7f48f0c2f074335c43aaa790c308 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 14 Jun 2011 18:34:29 -0500
Subject: [PATCH 085/143] KVM: PPC: booke: use shadow_msr

Keep the guest MSR and the guest-mode true MSR separate, rather than
modifying the guest MSR on each guest entry to produce a true MSR.

Any bits which should be modified based on guest MSR must be explicitly
propagated from vcpu->arch.shared->msr to vcpu->arch.shadow_msr in
kvmppc_set_msr().

While we're modifying the guest entry code, reorder a few instructions
to bury some load latencies.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_host.h |  2 +-
 arch/powerpc/kernel/asm-offsets.c   |  2 +-
 arch/powerpc/kvm/booke.c            |  1 +
 arch/powerpc/kvm/booke_interrupts.S | 17 ++++++-----------
 4 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 186f150b9b89..12cb1807e8d7 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -219,12 +219,12 @@ struct kvm_vcpu_arch {
 #endif
 
 #ifdef CONFIG_PPC_BOOK3S
-	ulong shadow_msr;
 	ulong hflags;
 	ulong guest_owned_ext;
 #endif
 	u32 vrsave; /* also USPRG0 */
 	u32 mmucr;
+	ulong shadow_msr;
 	ulong sprg4;
 	ulong sprg5;
 	ulong sprg6;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 36e1c8a29be8..25de8e4808a4 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -404,12 +404,12 @@ int main(void)
 	DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
 	DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
 	DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
+	DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
 
 	/* book3s */
 #ifdef CONFIG_PPC_BOOK3S
 	DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
 	DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
-	DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
 	DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
 	DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
 	DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 8462b3a1c1c7..05cedb5f8210 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -514,6 +514,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.pc = 0;
 	vcpu->arch.shared->msr = 0;
+	vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS;
 	kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
 
 	vcpu->arch.shadow_pid = 1;
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index b58ccae95904..55410cc45ad7 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -24,8 +24,6 @@
 #include <asm/page.h>
 #include <asm/asm-offsets.h>
 
-#define KVMPPC_MSR_MASK (MSR_CE|MSR_EE|MSR_PR|MSR_DE|MSR_ME|MSR_IS|MSR_DS)
-
 #define VCPU_GPR(n)     (VCPU_GPRS + (n * 4))
 
 /* The host stack layout: */
@@ -405,20 +403,17 @@ lightweight_exit:
 
 	/* Finish loading guest volatiles and jump to guest. */
 	lwz	r3, VCPU_CTR(r4)
+	lwz	r5, VCPU_CR(r4)
+	lwz	r6, VCPU_PC(r4)
+	lwz	r7, VCPU_SHADOW_MSR(r4)
 	mtctr	r3
-	lwz	r3, VCPU_CR(r4)
-	mtcr	r3
+	mtcr	r5
+	mtsrr0	r6
+	mtsrr1	r7
 	lwz	r5, VCPU_GPR(r5)(r4)
 	lwz	r6, VCPU_GPR(r6)(r4)
 	lwz	r7, VCPU_GPR(r7)(r4)
 	lwz	r8, VCPU_GPR(r8)(r4)
-	lwz	r3, VCPU_PC(r4)
-	mtsrr0	r3
-	lwz	r3, VCPU_SHARED(r4)
-	lwz	r3, (VCPU_SHARED_MSR + 4)(r3)
-	oris	r3, r3, KVMPPC_MSR_MASK@h
-	ori	r3, r3, KVMPPC_MSR_MASK@l
-	mtsrr1	r3
 
 	/* Clear any debug events which occurred since we disabled MSR[DE].
 	 * XXX This gives us a 3-instruction window in which a breakpoint

From 4cd35f675ba41a99a477e28a6add4a66833325f2 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 14 Jun 2011 18:34:31 -0500
Subject: [PATCH 086/143] KVM: PPC: e500: Save/restore SPE state

This is done lazily.  The SPE save will be done only if the guest has
used SPE since the last preemption or heavyweight exit.  Restore will be
done only on demand, when enabling MSR_SPE in the shadow MSR, in response
to an SPE fault or mtmsr emulation.

For SPEFSCR, Linux already switches it on context switch (non-lazily), so
the only remaining bit is to save it between qemu and the guest.

Signed-off-by: Liu Yu <yu.liu@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_host.h  |  6 ++
 arch/powerpc/include/asm/reg_booke.h |  1 +
 arch/powerpc/kernel/asm-offsets.c    |  7 +++
 arch/powerpc/kvm/booke.c             | 84 +++++++++++++++++++++++++++-
 arch/powerpc/kvm/booke.h             | 22 +++-----
 arch/powerpc/kvm/booke_interrupts.S  | 38 +++++++++++++
 arch/powerpc/kvm/e500.c              |  7 ++-
 7 files changed, 148 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 12cb1807e8d7..c4ce1054b866 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -195,6 +195,12 @@ struct kvm_vcpu_arch {
 	u64 fpr[32];
 	u64 fpscr;
 
+#ifdef CONFIG_SPE
+	ulong evr[32];
+	ulong spefscr;
+	ulong host_spefscr;
+	u64 acc;
+#endif
 #ifdef CONFIG_ALTIVEC
 	vector128 vr[32];
 	vector128 vscr;
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 0f0ad9fa01c1..9ec0b39f9ddc 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -318,6 +318,7 @@
 #define ESR_ILK		0x00100000	/* Instr. Cache Locking */
 #define ESR_PUO		0x00040000	/* Unimplemented Operation exception */
 #define ESR_BO		0x00020000	/* Byte Ordering */
+#define ESR_SPV		0x00000080	/* Signal Processing operation */
 
 /* Bit definitions related to the DBCR0. */
 #if defined(CONFIG_40x)
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 25de8e4808a4..ecd2b3ad7ff6 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -497,6 +497,13 @@ int main(void)
 	DEFINE(TLBCAM_MAS7, offsetof(struct tlbcam, MAS7));
 #endif
 
+#if defined(CONFIG_KVM) && defined(CONFIG_SPE)
+	DEFINE(VCPU_EVR, offsetof(struct kvm_vcpu, arch.evr[0]));
+	DEFINE(VCPU_ACC, offsetof(struct kvm_vcpu, arch.acc));
+	DEFINE(VCPU_SPEFSCR, offsetof(struct kvm_vcpu, arch.spefscr));
+	DEFINE(VCPU_HOST_SPEFSCR, offsetof(struct kvm_vcpu, arch.host_spefscr));
+#endif
+
 #ifdef CONFIG_KVM_EXIT_TIMING
 	DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu,
 						arch.timing_exit.tv32.tbu));
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 05cedb5f8210..0ecbecb2f7cc 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -13,6 +13,7 @@
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  *
  * Copyright IBM Corp. 2007
+ * Copyright 2010-2011 Freescale Semiconductor, Inc.
  *
  * Authors: Hollis Blanchard <hollisb@us.ibm.com>
  *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
@@ -78,6 +79,57 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
 	}
 }
 
+#ifdef CONFIG_SPE
+void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu)
+{
+	preempt_disable();
+	enable_kernel_spe();
+	kvmppc_save_guest_spe(vcpu);
+	vcpu->arch.shadow_msr &= ~MSR_SPE;
+	preempt_enable();
+}
+
+static void kvmppc_vcpu_enable_spe(struct kvm_vcpu *vcpu)
+{
+	preempt_disable();
+	enable_kernel_spe();
+	kvmppc_load_guest_spe(vcpu);
+	vcpu->arch.shadow_msr |= MSR_SPE;
+	preempt_enable();
+}
+
+static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.shared->msr & MSR_SPE) {
+		if (!(vcpu->arch.shadow_msr & MSR_SPE))
+			kvmppc_vcpu_enable_spe(vcpu);
+	} else if (vcpu->arch.shadow_msr & MSR_SPE) {
+		kvmppc_vcpu_disable_spe(vcpu);
+	}
+}
+#else
+static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
+{
+}
+#endif
+
+/* Helper function for "full" MSR writes. No need to call this if only EE is
+ * changing. */
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
+{
+	if ((new_msr & MSR_PR) != (vcpu->arch.shared->msr & MSR_PR))
+		kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
+
+	vcpu->arch.shared->msr = new_msr;
+
+	if (vcpu->arch.shared->msr & MSR_WE) {
+		kvm_vcpu_block(vcpu);
+		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
+	};
+
+	kvmppc_vcpu_sync_spe(vcpu);
+}
+
 static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
                                        unsigned int priority)
 {
@@ -344,10 +396,16 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		r = RESUME_GUEST;
 		break;
 
-	case BOOKE_INTERRUPT_SPE_UNAVAIL:
-		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_UNAVAIL);
+#ifdef CONFIG_SPE
+	case BOOKE_INTERRUPT_SPE_UNAVAIL: {
+		if (vcpu->arch.shared->msr & MSR_SPE)
+			kvmppc_vcpu_enable_spe(vcpu);
+		else
+			kvmppc_booke_queue_irqprio(vcpu,
+						   BOOKE_IRQPRIO_SPE_UNAVAIL);
 		r = RESUME_GUEST;
 		break;
+	}
 
 	case BOOKE_INTERRUPT_SPE_FP_DATA:
 		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_DATA);
@@ -358,6 +416,28 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND);
 		r = RESUME_GUEST;
 		break;
+#else
+	case BOOKE_INTERRUPT_SPE_UNAVAIL:
+		/*
+		 * Guest wants SPE, but host kernel doesn't support it.  Send
+		 * an "unimplemented operation" program check to the guest.
+		 */
+		kvmppc_core_queue_program(vcpu, ESR_PUO | ESR_SPV);
+		r = RESUME_GUEST;
+		break;
+
+	/*
+	 * These really should never happen without CONFIG_SPE,
+	 * as we should never enable the real MSR[SPE] in the guest.
+	 */
+	case BOOKE_INTERRUPT_SPE_FP_DATA:
+	case BOOKE_INTERRUPT_SPE_FP_ROUND:
+		printk(KERN_CRIT "%s: unexpected SPE interrupt %u at %08lx\n",
+		       __func__, exit_nr, vcpu->arch.pc);
+		run->hw.hardware_exit_reason = exit_nr;
+		r = RESUME_HOST;
+		break;
+#endif
 
 	case BOOKE_INTERRUPT_DATA_STORAGE:
 		kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear,
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index 492bb7030358..0fa1732ddcb7 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -52,24 +52,18 @@
 
 extern unsigned long kvmppc_booke_handlers;
 
-/* Helper function for "full" MSR writes. No need to call this if only EE is
- * changing. */
-static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
-{
-	if ((new_msr & MSR_PR) != (vcpu->arch.shared->msr & MSR_PR))
-		kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
-
-	vcpu->arch.shared->msr = new_msr;
-
-	if (vcpu->arch.shared->msr & MSR_WE) {
-		kvm_vcpu_block(vcpu);
-		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
-	};
-}
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
 
 int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                             unsigned int inst, int *advance);
 int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt);
 int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs);
 
+/* low-level asm code to transfer guest state */
+void kvmppc_load_guest_spe(struct kvm_vcpu *vcpu);
+void kvmppc_save_guest_spe(struct kvm_vcpu *vcpu);
+
+/* high-level function, manages flags, host state */
+void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu);
+
 #endif /* __KVM_BOOKE_H__ */
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 55410cc45ad7..8cb3dfe29f75 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -13,6 +13,7 @@
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  *
  * Copyright IBM Corp. 2007
+ * Copyright 2011 Freescale Semiconductor, Inc.
  *
  * Authors: Hollis Blanchard <hollisb@us.ibm.com>
  */
@@ -239,6 +240,14 @@ _GLOBAL(kvmppc_resume_host)
 heavyweight_exit:
 	/* Not returning to guest. */
 
+#ifdef CONFIG_SPE
+	/* save guest SPEFSCR and load host SPEFSCR */
+	mfspr	r9, SPRN_SPEFSCR
+	stw	r9, VCPU_SPEFSCR(r4)
+	lwz	r9, VCPU_HOST_SPEFSCR(r4)
+	mtspr	SPRN_SPEFSCR, r9
+#endif
+
 	/* We already saved guest volatile register state; now save the
 	 * non-volatiles. */
 	stw	r15, VCPU_GPR(r15)(r4)
@@ -340,6 +349,14 @@ _GLOBAL(__kvmppc_vcpu_run)
 	lwz	r30, VCPU_GPR(r30)(r4)
 	lwz	r31, VCPU_GPR(r31)(r4)
 
+#ifdef CONFIG_SPE
+	/* save host SPEFSCR and load guest SPEFSCR */
+	mfspr	r3, SPRN_SPEFSCR
+	stw	r3, VCPU_HOST_SPEFSCR(r4)
+	lwz	r3, VCPU_SPEFSCR(r4)
+	mtspr	SPRN_SPEFSCR, r3
+#endif
+
 lightweight_exit:
 	stw	r2, HOST_R2(r1)
 
@@ -425,3 +442,24 @@ lightweight_exit:
 	lwz	r3, VCPU_GPR(r3)(r4)
 	lwz	r4, VCPU_GPR(r4)(r4)
 	rfi
+
+#ifdef CONFIG_SPE
+_GLOBAL(kvmppc_save_guest_spe)
+	cmpi	0,r3,0
+	beqlr-
+	SAVE_32EVRS(0, r4, r3, VCPU_EVR)
+	evxor   evr6, evr6, evr6
+	evmwumiaa evr6, evr6, evr6
+	li	r4,VCPU_ACC
+	evstddx evr6, r4, r3		/* save acc */
+	blr
+
+_GLOBAL(kvmppc_load_guest_spe)
+	cmpi	0,r3,0
+	beqlr-
+	li      r4,VCPU_ACC
+	evlddx  evr6,r4,r3
+	evmra   evr6,evr6		/* load acc */
+	REST_32EVRS(0, r4, r3, VCPU_EVR)
+	blr
+#endif
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 318dbc61ba44..797a7447c268 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu, <yu.liu@freescale.com>
  *
@@ -41,6 +41,11 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvmppc_e500_tlb_put(vcpu);
+
+#ifdef CONFIG_SPE
+	if (vcpu->arch.shadow_msr & MSR_SPE)
+		kvmppc_vcpu_disable_spe(vcpu);
+#endif
 }
 
 int kvmppc_core_check_processor_compat(void)

From 6fc4d1eb911cced6bc103f2b36497397e99e8c43 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 14 Jun 2011 18:34:34 -0500
Subject: [PATCH 087/143] KVM: PPC: e500: Disable preloading TLB1 in
 tlb_load().

Since TLB1 loading doesn't check the shadow TLB before allocating another
entry, you can get duplicates.

Once shadow PIDs are enabled in a later patch, we won't need to
invalidate the TLB on every switch, so this optimization won't be
needed anyway.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/e500_tlb.c | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index b18fe353397d..e0ab2160932a 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -144,24 +144,6 @@ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
 
 void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-	int i;
-	unsigned register mas0;
-
-	/* Load all valid TLB1 entries to reduce guest tlb miss fault */
-	local_irq_disable();
-	mas0 = mfspr(SPRN_MAS0);
-	for (i = 0; i < tlb1_max_shadow_size(); i++) {
-		struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i];
-
-		if (get_tlb_v(stlbe)) {
-			mtspr(SPRN_MAS0, MAS0_TLBSEL(1)
-					| MAS0_ESEL(to_htlb1_esel(i)));
-			__write_host_tlbe(stlbe);
-		}
-	}
-	mtspr(SPRN_MAS0, mas0);
-	local_irq_enable();
 }
 
 void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu)

From 0ef309956cecbaf6d96c31371bf393c296886fa6 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 14 Jun 2011 18:34:35 -0500
Subject: [PATCH 088/143] KVM: PPC: e500: don't use MAS0 as intermediate
 storage.

This avoids races.  It also means that, for TLB0, we use the way implied
by the shadow TLB index rather than the hardware's replacement hint -- if
this is a problem, we could do a tlbsx before inserting a TLB0 entry.
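
For illustration, a minimal standalone sketch of how the caller now
composes MAS0 itself instead of round-tripping through the register (a
sketch only: the TLBSEL/ESEL shifts, the two-way TLB0 and the 16-entry
TLB1 are assumptions, not taken from this patch):

#include <stdint.h>
#include <stdio.h>

#define MAS0_TLBSEL(x)		((uint32_t)(x) << 28)	/* assumed shift */
#define MAS0_ESEL(x)		((uint32_t)(x) << 16)	/* assumed shift */
#define KVM_E500_TLB0_WAY_NUM	2			/* assumed */

static unsigned int tlb1_entry_num = 16;		/* assumed */

static uint32_t host_mas0(int tlbsel, int esel)
{
	if (tlbsel == 0)
		/* TLB0: the way comes from the shadow index, not from
		 * the hardware replacement hint left in MAS0 */
		return MAS0_TLBSEL(0) |
		       MAS0_ESEL(esel & (KVM_E500_TLB0_WAY_NUM - 1));

	/* TLB1: KVM allocates host entries from the top down */
	return MAS0_TLBSEL(1) | MAS0_ESEL(tlb1_entry_num - esel - 1);
}

int main(void)
{
	printf("TLB0 esel 5 -> MAS0 %#x\n", host_mas0(0, 5));
	printf("TLB1 esel 0 -> MAS0 %#x\n", host_mas0(1, 0));
	return 0;
}

The computed value is passed straight into __write_host_tlbe(), which
performs the whole MAS0-MAS7 update and tlbwe under local_irq_save().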

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/e500_tlb.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index e0ab2160932a..f1b37e0de86c 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -112,13 +112,18 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
 /*
  * writing shadow tlb entry to host TLB
  */
-static inline void __write_host_tlbe(struct tlbe *stlbe)
+static inline void __write_host_tlbe(struct tlbe *stlbe, uint32_t mas0)
 {
+	unsigned long flags;
+
+	local_irq_save(flags);
+	mtspr(SPRN_MAS0, mas0);
 	mtspr(SPRN_MAS1, stlbe->mas1);
 	mtspr(SPRN_MAS2, stlbe->mas2);
 	mtspr(SPRN_MAS3, stlbe->mas3);
 	mtspr(SPRN_MAS7, stlbe->mas7);
-	__asm__ __volatile__ ("tlbwe\n" : : );
+	asm volatile("isync; tlbwe" : : : "memory");
+	local_irq_restore(flags);
 }
 
 static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
@@ -126,20 +131,15 @@ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
 {
 	struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
 
-	local_irq_disable();
 	if (tlbsel == 0) {
-		__write_host_tlbe(stlbe);
+		__write_host_tlbe(stlbe,
+				  MAS0_TLBSEL(0) |
+				  MAS0_ESEL(esel & (KVM_E500_TLB0_WAY_NUM - 1)));
 	} else {
-		unsigned register mas0;
-
-		mas0 = mfspr(SPRN_MAS0);
-
-		mtspr(SPRN_MAS0, MAS0_TLBSEL(1) | MAS0_ESEL(to_htlb1_esel(esel)));
-		__write_host_tlbe(stlbe);
-
-		mtspr(SPRN_MAS0, mas0);
+		__write_host_tlbe(stlbe,
+				  MAS0_TLBSEL(1) |
+				  MAS0_ESEL(to_htlb1_esel(esel)));
 	}
-	local_irq_enable();
 }
 
 void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu)

From 59c1f4e35c3db6c7ea5a04503a43bcbeb98977df Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 14 Jun 2011 18:34:37 -0500
Subject: [PATCH 089/143] KVM: PPC: e500: Eliminate shadow_pages[], and use
 pfns instead.

This is in line with what other architectures do, and will allow us to
map things other than ordinary, unreserved kernel pages -- such as
dedicated devices, or large contiguous reserved regions.
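
For illustration, the pfn is folded into the MAS3/MAS7 pair exactly as
the hunks below do it; a standalone sketch of that arithmetic (the
PAGE_SHIFT value and RPN masks are assumptions for a 4K-page,
36-bit-physical e500, not taken from the kernel headers):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define MAS3_RPN	0xfffff000u	/* address bits 31..12 */
#define MAS7_RPN	0x0000000fu	/* address bits 35..32 */

int main(void)
{
	uint64_t pfn = 0x123456;	/* some host page frame number */
	uint32_t mas3, mas7;
	uint64_t back;

	/* shadow_map: pack the pfn into the shadow entry */
	mas3 = (uint32_t)(pfn << PAGE_SHIFT) & MAS3_RPN;
	mas7 = (uint32_t)(pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN;

	/* shadow_release: recover the pfn so it can be released */
	back = (mas3 >> PAGE_SHIFT) | ((uint64_t)mas7 << (32 - PAGE_SHIFT));

	printf("pfn %#llx -> mas3 %#x mas7 %#x -> pfn %#llx\n",
	       (unsigned long long)pfn, mas3, mas7,
	       (unsigned long long)back);
	return 0;
}

Being able to reconstruct the pfn from the shadow entry itself is what
lets the shadow_pages[] array go away.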

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_e500.h |  2 --
 arch/powerpc/kvm/e500_tlb.c         | 56 ++++++++++-------------------
 2 files changed, 19 insertions(+), 39 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h
index 7a2a565f88c4..f181ad1d6045 100644
--- a/arch/powerpc/include/asm/kvm_e500.h
+++ b/arch/powerpc/include/asm/kvm_e500.h
@@ -34,8 +34,6 @@ struct kvmppc_vcpu_e500 {
 	struct tlbe *guest_tlb[E500_TLB_NUM];
 	/* TLB that's actually used when the guest is running. */
 	struct tlbe *shadow_tlb[E500_TLB_NUM];
-	/* Pages which are referenced in the shadow TLB. */
-	struct page **shadow_pages[E500_TLB_NUM];
 
 	unsigned int guest_tlb_size[E500_TLB_NUM];
 	unsigned int shadow_tlb_size[E500_TLB_NUM];
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index f1b37e0de86c..0291c3cf5055 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -188,17 +188,16 @@ static void kvmppc_e500_shadow_release(struct kvmppc_vcpu_e500 *vcpu_e500,
 		int tlbsel, int esel)
 {
 	struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
-	struct page *page = vcpu_e500->shadow_pages[tlbsel][esel];
+	unsigned long pfn;
 
-	if (page) {
-		vcpu_e500->shadow_pages[tlbsel][esel] = NULL;
+	pfn = stlbe->mas3 >> PAGE_SHIFT;
+	pfn |= stlbe->mas7 << (32 - PAGE_SHIFT);
 
-		if (get_tlb_v(stlbe)) {
-			if (tlbe_is_writable(stlbe))
-				kvm_release_page_dirty(page);
-			else
-				kvm_release_page_clean(page);
-		}
+	if (get_tlb_v(stlbe)) {
+		if (tlbe_is_writable(stlbe))
+			kvm_release_pfn_dirty(pfn);
+		else
+			kvm_release_pfn_clean(pfn);
 	}
 }
 
@@ -271,37 +270,36 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
 static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel)
 {
-	struct page *new_page;
 	struct tlbe *stlbe;
-	hpa_t hpaddr;
+	unsigned long pfn;
 
 	stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
 
-	/* Get reference to new page. */
-	new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn);
-	if (is_error_page(new_page)) {
-		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n",
+	/*
+	 * Translate guest physical to true physical, acquiring
+	 * a page reference if it is normal, non-reserved memory.
+	 */
+	pfn = gfn_to_pfn(vcpu_e500->vcpu.kvm, gfn);
+	if (is_error_pfn(pfn)) {
+		printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
 				(long)gfn);
-		kvm_release_page_clean(new_page);
+		kvm_release_pfn_clean(pfn);
 		return;
 	}
-	hpaddr = page_to_phys(new_page);
 
 	/* Drop reference to old page. */
 	kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel);
 
-	vcpu_e500->shadow_pages[tlbsel][esel] = new_page;
-
 	/* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */
 	stlbe->mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K)
 		| MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
 	stlbe->mas2 = (gvaddr & MAS2_EPN)
 		| e500_shadow_mas2_attrib(gtlbe->mas2,
 				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
-	stlbe->mas3 = (hpaddr & MAS3_RPN)
+	stlbe->mas3 = ((pfn << PAGE_SHIFT) & MAS3_RPN)
 		| e500_shadow_mas3_attrib(gtlbe->mas3,
 				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
-	stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN;
+	stlbe->mas7 = (pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN;
 
 	trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
 			     stlbe->mas3, stlbe->mas7);
@@ -712,16 +710,6 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 	if (vcpu_e500->shadow_tlb[1] == NULL)
 		goto err_out_guest1;
 
-	vcpu_e500->shadow_pages[0] = (struct page **)
-		kzalloc(sizeof(struct page *) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
-	if (vcpu_e500->shadow_pages[0] == NULL)
-		goto err_out_shadow1;
-
-	vcpu_e500->shadow_pages[1] = (struct page **)
-		kzalloc(sizeof(struct page *) * tlb1_entry_num, GFP_KERNEL);
-	if (vcpu_e500->shadow_pages[1] == NULL)
-		goto err_out_page0;
-
 	/* Init TLB configuration register */
 	vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL;
 	vcpu_e500->tlb0cfg |= vcpu_e500->guest_tlb_size[0];
@@ -730,10 +718,6 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 
 	return 0;
 
-err_out_page0:
-	kfree(vcpu_e500->shadow_pages[0]);
-err_out_shadow1:
-	kfree(vcpu_e500->shadow_tlb[1]);
 err_out_guest1:
 	kfree(vcpu_e500->guest_tlb[1]);
 err_out_shadow0:
@@ -746,8 +730,6 @@ err_out:
 
 void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
-	kfree(vcpu_e500->shadow_pages[1]);
-	kfree(vcpu_e500->shadow_pages[0]);
 	kfree(vcpu_e500->shadow_tlb[1]);
 	kfree(vcpu_e500->guest_tlb[1]);
 	kfree(vcpu_e500->shadow_tlb[0]);

From 9973d54eeafcd1c3a2e89f0f59280c4c1e03e73b Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 14 Jun 2011 18:34:39 -0500
Subject: [PATCH 090/143] KVM: PPC: e500: Support large page mappings of PFNMAP
 vmas.

This allows large pages to be used on guest mappings backed by things like
/dev/mem, resulting in a significant speedup when guest memory
is mapped this way (it's useful for directly-assigned MMIO, too).

This is not a substitute for hugetlbfs integration, but is useful for
configurations where devices are directly assigned on chips without an
IOMMU -- in these cases, we need guest physical and true physical to
match, and be contiguous, so static reservation and mapping via /dev/mem
is the most straightforward way to set things up.
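
For illustration, the size-selection loop from the hunk below as a
standalone function (BOOK3E_PAGESZ_4K = 2 and the "page size is
1KB << tsize" encoding are assumptions about the Book3E definitions; the
harness values are made up):

#include <stdio.h>

#define BOOK3E_PAGESZ_4K	2

/* [start, end) is the pfn range covered by both the vma and the memslot */
static int pick_tsize(unsigned long gfn, unsigned long pfn,
		      unsigned long start, unsigned long end, int tsize)
{
	/* e500 doesn't implement the lowest tsize bit, or 1K pages */
	tsize &= ~1;
	if (tsize < BOOK3E_PAGESZ_4K)
		tsize = BOOK3E_PAGESZ_4K;

	for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
		unsigned long tsize_pages = 1UL << (tsize - 2);
		unsigned long gfn_start = gfn & ~(tsize_pages - 1);
		unsigned long gfn_end = gfn_start + tsize_pages;

		/* the mapping must stay inside [start, end) ... */
		if (gfn_start + pfn - gfn < start)
			continue;
		if (gfn_end + pfn - gfn > end)
			continue;
		/* ... and gfn and pfn must be mutually aligned */
		if ((gfn & (tsize_pages - 1)) != (pfn & (tsize_pages - 1)))
			continue;
		break;
	}
	return tsize;	/* falls back to a 4K mapping if nothing larger fits */
}

int main(void)
{
	/* guest asks for a 4MB page (tsize 12); the backing range only
	 * has room for 1MB here, so tsize 10 is chosen */
	printf("tsize = %d\n",
	       pick_tsize(0x4400, 0x80400, 0x80000, 0x80700, 12));
	return 0;
}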

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/e500_tlb.c | 103 ++++++++++++++++++++++++++++++++----
 1 file changed, 94 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 0291c3cf5055..7f808c52e64a 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -270,28 +270,113 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
 static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel)
 {
+	struct kvm_memory_slot *slot;
 	struct tlbe *stlbe;
-	unsigned long pfn;
+	unsigned long pfn, hva;
+	int pfnmap = 0;
+	int tsize = BOOK3E_PAGESZ_4K;
 
 	stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
 
 	/*
 	 * Translate guest physical to true physical, acquiring
 	 * a page reference if it is normal, non-reserved memory.
+	 *
+	 * gfn_to_memslot() must succeed because otherwise we wouldn't
+	 * have gotten this far.  Eventually we should just pass the slot
+	 * pointer through from the first lookup.
 	 */
-	pfn = gfn_to_pfn(vcpu_e500->vcpu.kvm, gfn);
-	if (is_error_pfn(pfn)) {
-		printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
-				(long)gfn);
-		kvm_release_pfn_clean(pfn);
-		return;
+	slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
+	hva = gfn_to_hva_memslot(slot, gfn);
+
+	if (tlbsel == 1) {
+		struct vm_area_struct *vma;
+		down_read(&current->mm->mmap_sem);
+
+		vma = find_vma(current->mm, hva);
+		if (vma && hva >= vma->vm_start &&
+		    (vma->vm_flags & VM_PFNMAP)) {
+			/*
+			 * This VMA is a physically contiguous region (e.g.
+			 * /dev/mem) that bypasses normal Linux page
+			 * management.  Find the overlap between the
+			 * vma and the memslot.
+			 */
+
+			unsigned long start, end;
+			unsigned long slot_start, slot_end;
+
+			pfnmap = 1;
+
+			start = vma->vm_pgoff;
+			end = start +
+			      ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
+
+			pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
+
+			slot_start = pfn - (gfn - slot->base_gfn);
+			slot_end = slot_start + slot->npages;
+
+			if (start < slot_start)
+				start = slot_start;
+			if (end > slot_end)
+				end = slot_end;
+
+			tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
+				MAS1_TSIZE_SHIFT;
+
+			/*
+			 * e500 doesn't implement the lowest tsize bit,
+			 * or 1K pages.
+			 */
+			tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
+
+			/*
+			 * Now find the largest tsize (up to what the guest
+			 * requested) that will cover gfn, stay within the
+			 * range, and for which gfn and pfn are mutually
+			 * aligned.
+			 */
+
+			for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
+				unsigned long gfn_start, gfn_end, tsize_pages;
+				tsize_pages = 1 << (tsize - 2);
+
+				gfn_start = gfn & ~(tsize_pages - 1);
+				gfn_end = gfn_start + tsize_pages;
+
+				if (gfn_start + pfn - gfn < start)
+					continue;
+				if (gfn_end + pfn - gfn > end)
+					continue;
+				if ((gfn & (tsize_pages - 1)) !=
+				    (pfn & (tsize_pages - 1)))
+					continue;
+
+				gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
+				pfn &= ~(tsize_pages - 1);
+				break;
+			}
+		}
+
+		up_read(&current->mm->mmap_sem);
+	}
+
+	if (likely(!pfnmap)) {
+		pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn);
+		if (is_error_pfn(pfn)) {
+			printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
+					(long)gfn);
+			kvm_release_pfn_clean(pfn);
+			return;
+		}
 	}
 
 	/* Drop reference to old page. */
 	kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel);
 
-	/* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */
-	stlbe->mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K)
+	/* Force TS=1 IPROT=0 for all guest mappings. */
+	stlbe->mas1 = MAS1_TSIZE(tsize)
 		| MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
 	stlbe->mas2 = (gvaddr & MAS2_EPN)
 		| e500_shadow_mas2_attrib(gtlbe->mas2,

From a4cd8b23ac5786943202c0174c717956947db43c Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 14 Jun 2011 18:34:41 -0500
Subject: [PATCH 091/143] KVM: PPC: e500: enable magic page

This is a shared page used for paravirtualization.  It is always present
in the guest kernel's effective address space at the address indicated
by the hypercall that enables it.

The physical address specified by the hypercall is not used, as
e500 does not have real mode.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/virtual/kvm/ppc-pv.txt |  8 +++++---
 arch/powerpc/include/asm/kvm_ppc.h   |  1 +
 arch/powerpc/kvm/booke.c             | 11 +++++++++++
 arch/powerpc/kvm/e500_tlb.c          | 22 +++++++++++++++++++++-
 arch/powerpc/kvm/powerpc.c           |  3 ++-
 5 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt
index 3ab969c59046..2b7ce190cde4 100644
--- a/Documentation/virtual/kvm/ppc-pv.txt
+++ b/Documentation/virtual/kvm/ppc-pv.txt
@@ -68,9 +68,11 @@ page that contains parts of supervisor visible register state. The guest can
 map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE.
 
 With this hypercall issued the guest always gets the magic page mapped at the
-desired location in effective and physical address space. For now, we always
-map the page to -4096. This way we can access it using absolute load and store
-functions. The following instruction reads the first field of the magic page:
+desired location. The first parameter indicates the effective address when the
+MMU is enabled. The second parameter indicates the address in real mode, if
+applicable to the target. For now, we always map the page to -4096. This way we
+can access it using absolute load and store functions. The following
+instruction reads the first field of the magic page:
 
 	ld	rX, -4096(0)
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 9345238edecf..c662f140283a 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -109,6 +109,7 @@ extern void kvmppc_booke_exit(void);
 
 extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
+extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
 
 /*
  * Cuts out inst bits with ordering according to spec.
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 0ecbecb2f7cc..4538956daecf 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -472,6 +472,17 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		gpa_t gpaddr;
 		gfn_t gfn;
 
+#ifdef CONFIG_KVM_E500
+		if (!(vcpu->arch.shared->msr & MSR_PR) &&
+		    (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) {
+			kvmppc_map_magic(vcpu);
+			kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
+			r = RESUME_GUEST;
+
+			break;
+		}
+#endif
+
 		/* Check the guest TLB. */
 		gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr);
 		if (gtlb_index < 0) {
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 7f808c52e64a..c09e642ee537 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -76,7 +76,8 @@ static inline unsigned int tlb0_get_next_victim(
 
 static inline unsigned int tlb1_max_shadow_size(void)
 {
-	return tlb1_entry_num - tlbcam_index;
+	/* reserve one entry for magic page */
+	return tlb1_entry_num - tlbcam_index - 1;
 }
 
 static inline int tlbe_is_writable(struct tlbe *tlbe)
@@ -142,6 +143,25 @@ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
 	}
 }
 
+void kvmppc_map_magic(struct kvm_vcpu *vcpu)
+{
+	struct tlbe magic;
+	ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
+	pfn_t pfn;
+
+	pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
+	get_page(pfn_to_page(pfn));
+
+	magic.mas1 = MAS1_VALID | MAS1_TS |
+		     MAS1_TSIZE(BOOK3E_PAGESZ_4K);
+	magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
+	magic.mas3 = (pfn << PAGE_SHIFT) |
+		     MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
+	magic.mas7 = pfn >> (32 - PAGE_SHIFT);
+
+	__write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
+}
+
 void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu)
 {
 }
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 616dd516ca1f..24e2b64b6a48 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -73,7 +73,8 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 	}
 	case HC_VENDOR_KVM | KVM_HC_FEATURES:
 		r = HC_EV_SUCCESS;
-#if defined(CONFIG_PPC_BOOK3S) /* XXX Missing magic page on BookE */
+#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500)
+		/* XXX Missing magic page on 44x */
 		r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);
 #endif
 

From 08b7fa92b9250eab0f493f7721977e781a887b3d Mon Sep 17 00:00:00 2001
From: Liu Yu <yu.liu@freescale.com>
Date: Tue, 14 Jun 2011 18:34:59 -0500
Subject: [PATCH 092/143] KVM: PPC: e500: Stop keeping shadow TLB

Instead of a fully separate set of TLB entries, keep just the
pfn and dirty status.
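
For illustration, the per-entry state this reduces to, as a standalone
sketch with the KVM pfn-release calls stubbed out (it mirrors the
tlbe_priv handling added below; the stubs and main() are made up):

#include <stdio.h>

typedef unsigned long pfn_t;

#define E500_TLB_VALID	1
#define E500_TLB_DIRTY	2

struct tlbe_priv {
	pfn_t pfn;
	unsigned int flags;	/* E500_TLB_* */
};

static void kvm_release_pfn_dirty(pfn_t pfn) { printf("dirty %#lx\n", pfn); }
static void kvm_release_pfn_clean(pfn_t pfn) { printf("clean %#lx\n", pfn); }

static void priv_setup(struct tlbe_priv *priv, pfn_t pfn, int writable)
{
	priv->pfn = pfn;
	priv->flags = E500_TLB_VALID;
	if (writable)
		priv->flags |= E500_TLB_DIRTY;
}

static void priv_release(struct tlbe_priv *priv)
{
	if (!(priv->flags & E500_TLB_VALID))
		return;
	if (priv->flags & E500_TLB_DIRTY)
		kvm_release_pfn_dirty(priv->pfn);
	else
		kvm_release_pfn_clean(priv->pfn);
	priv->flags = 0;
}

int main(void)
{
	struct tlbe_priv p;

	priv_setup(&p, 0x1234, 1);
	priv_release(&p);	/* prints "dirty 0x1234" */
	return 0;
}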

Signed-off-by: Liu Yu <yu.liu@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_e500.h |  20 +-
 arch/powerpc/kvm/e500_tlb.c         | 317 ++++++++++++----------------
 2 files changed, 154 insertions(+), 183 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h
index f181ad1d6045..4a6d77a8b8a0 100644
--- a/arch/powerpc/include/asm/kvm_e500.h
+++ b/arch/powerpc/include/asm/kvm_e500.h
@@ -29,15 +29,23 @@ struct tlbe{
 	u32 mas7;
 };
 
+#define E500_TLB_VALID 1
+#define E500_TLB_DIRTY 2
+
+struct tlbe_priv {
+	pfn_t pfn;
+	unsigned int flags; /* E500_TLB_* */
+};
+
 struct kvmppc_vcpu_e500 {
 	/* Unmodified copy of the guest's TLB. */
-	struct tlbe *guest_tlb[E500_TLB_NUM];
-	/* TLB that's actually used when the guest is running. */
-	struct tlbe *shadow_tlb[E500_TLB_NUM];
+	struct tlbe *gtlb_arch[E500_TLB_NUM];
 
-	unsigned int guest_tlb_size[E500_TLB_NUM];
-	unsigned int shadow_tlb_size[E500_TLB_NUM];
-	unsigned int guest_tlb_nv[E500_TLB_NUM];
+	/* KVM internal information associated with each guest TLB entry */
+	struct tlbe_priv *gtlb_priv[E500_TLB_NUM];
+
+	unsigned int gtlb_size[E500_TLB_NUM];
+	unsigned int gtlb_nv[E500_TLB_NUM];
 
 	u32 host_pid[E500_PID_NUM];
 	u32 pid[E500_PID_NUM];
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index c09e642ee537..9d1e28d443c4 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -41,25 +41,14 @@ void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
 
 	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
 		printk("Guest TLB%d:\n", tlbsel);
-		for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) {
-			tlbe = &vcpu_e500->guest_tlb[tlbsel][i];
+		for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) {
+			tlbe = &vcpu_e500->gtlb_arch[tlbsel][i];
 			if (tlbe->mas1 & MAS1_VALID)
 				printk(" G[%d][%3d] |  %08X | %08X | %08X | %08X |\n",
 					tlbsel, i, tlbe->mas1, tlbe->mas2,
 					tlbe->mas3, tlbe->mas7);
 		}
 	}
-
-	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
-		printk("Shadow TLB%d:\n", tlbsel);
-		for (i = 0; i < vcpu_e500->shadow_tlb_size[tlbsel]; i++) {
-			tlbe = &vcpu_e500->shadow_tlb[tlbsel][i];
-			if (tlbe->mas1 & MAS1_VALID)
-				printk(" S[%d][%3d] |  %08X | %08X | %08X | %08X |\n",
-					tlbsel, i, tlbe->mas1, tlbe->mas2,
-					tlbe->mas3, tlbe->mas7);
-		}
-	}
 }
 
 static inline unsigned int tlb0_get_next_victim(
@@ -67,9 +56,9 @@ static inline unsigned int tlb0_get_next_victim(
 {
 	unsigned int victim;
 
-	victim = vcpu_e500->guest_tlb_nv[0]++;
-	if (unlikely(vcpu_e500->guest_tlb_nv[0] >= KVM_E500_TLB0_WAY_NUM))
-		vcpu_e500->guest_tlb_nv[0] = 0;
+	victim = vcpu_e500->gtlb_nv[0]++;
+	if (unlikely(vcpu_e500->gtlb_nv[0] >= KVM_E500_TLB0_WAY_NUM))
+		vcpu_e500->gtlb_nv[0] = 0;
 
 	return victim;
 }
@@ -128,10 +117,8 @@ static inline void __write_host_tlbe(struct tlbe *stlbe, uint32_t mas0)
 }
 
 static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
-		int tlbsel, int esel)
+		int tlbsel, int esel, struct tlbe *stlbe)
 {
-	struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
-
 	if (tlbsel == 0) {
 		__write_host_tlbe(stlbe,
 				  MAS0_TLBSEL(0) |
@@ -141,6 +128,8 @@ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
 				  MAS0_TLBSEL(1) |
 				  MAS0_ESEL(to_htlb1_esel(esel)));
 	}
+	trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
+			     stlbe->mas3, stlbe->mas7);
 }
 
 void kvmppc_map_magic(struct kvm_vcpu *vcpu)
@@ -178,8 +167,8 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
 	int i;
 
 	/* XXX Replace loop with fancy data structures. */
-	for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) {
-		struct tlbe *tlbe = &vcpu_e500->guest_tlb[tlbsel][i];
+	for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) {
+		struct tlbe *tlbe = &vcpu_e500->gtlb_arch[tlbsel][i];
 		unsigned int tid;
 
 		if (eaddr < get_tlb_eaddr(tlbe))
@@ -204,60 +193,33 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
 	return -1;
 }
 
-static void kvmppc_e500_shadow_release(struct kvmppc_vcpu_e500 *vcpu_e500,
-		int tlbsel, int esel)
+static inline void kvmppc_e500_priv_setup(struct tlbe_priv *priv,
+					  struct tlbe *gtlbe,
+					  pfn_t pfn)
 {
-	struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
-	unsigned long pfn;
+	priv->pfn = pfn;
+	priv->flags = E500_TLB_VALID;
 
-	pfn = stlbe->mas3 >> PAGE_SHIFT;
-	pfn |= stlbe->mas7 << (32 - PAGE_SHIFT);
-
-	if (get_tlb_v(stlbe)) {
-		if (tlbe_is_writable(stlbe))
-			kvm_release_pfn_dirty(pfn);
-		else
-			kvm_release_pfn_clean(pfn);
-	}
+	if (tlbe_is_writable(gtlbe))
+		priv->flags |= E500_TLB_DIRTY;
 }
 
-static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
-		int tlbsel, int esel)
+static inline void kvmppc_e500_priv_release(struct tlbe_priv *priv)
 {
-	struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
+	if (priv->flags & E500_TLB_VALID) {
+		if (priv->flags & E500_TLB_DIRTY)
+			kvm_release_pfn_dirty(priv->pfn);
+		else
+			kvm_release_pfn_clean(priv->pfn);
 
-	kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel);
-	stlbe->mas1 = 0;
-	trace_kvm_stlb_inval(index_of(tlbsel, esel));
+		priv->flags = 0;
+	}
 }
 
 static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
-		gva_t eaddr, gva_t eend, u32 tid)
+					int esel)
 {
-	unsigned int pid = tid & 0xff;
-	unsigned int i;
-
-	/* XXX Replace loop with fancy data structures. */
-	for (i = 0; i < vcpu_e500->guest_tlb_size[1]; i++) {
-		struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i];
-		unsigned int tid;
-
-		if (!get_tlb_v(stlbe))
-			continue;
-
-		if (eend < get_tlb_eaddr(stlbe))
-			continue;
-
-		if (eaddr > get_tlb_end(stlbe))
-			continue;
-
-		tid = get_tlb_tid(stlbe);
-		if (tid && (tid != pid))
-			continue;
-
-		kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i);
-		write_host_tlbe(vcpu_e500, 1, i);
-	}
+	mtspr(SPRN_MMUCSR0, MMUCSR0_TLB1FI);
 }
 
 static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
@@ -274,7 +236,7 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
 	tsized = (vcpu_e500->mas4 >> 7) & 0x1f;
 
 	vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
-		| MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
+		| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
 	vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0)
 		| MAS1_TID(vcpu_e500->pid[pidsel])
 		| MAS1_TSIZE(tsized);
@@ -287,16 +249,35 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
 	vcpu_e500->mas7 = 0;
 }
 
+static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
+					   struct tlbe *gtlbe, int tsize,
+					   struct tlbe_priv *priv,
+					   u64 gvaddr, struct tlbe *stlbe)
+{
+	pfn_t pfn = priv->pfn;
+
+	/* Force TS=1 IPROT=0 for all guest mappings. */
+	stlbe->mas1 = MAS1_TSIZE(tsize)
+		| MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
+	stlbe->mas2 = (gvaddr & MAS2_EPN)
+		| e500_shadow_mas2_attrib(gtlbe->mas2,
+				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
+	stlbe->mas3 = ((pfn << PAGE_SHIFT) & MAS3_RPN)
+		| e500_shadow_mas3_attrib(gtlbe->mas3,
+				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
+	stlbe->mas7 = (pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN;
+}
+
+
 static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
-	u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel)
+	u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel,
+	struct tlbe *stlbe)
 {
 	struct kvm_memory_slot *slot;
-	struct tlbe *stlbe;
 	unsigned long pfn, hva;
 	int pfnmap = 0;
 	int tsize = BOOK3E_PAGESZ_4K;
-
-	stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
+	struct tlbe_priv *priv;
 
 	/*
 	 * Translate guest physical to true physical, acquiring
@@ -392,35 +373,25 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 		}
 	}
 
-	/* Drop reference to old page. */
-	kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel);
+	/* Drop old priv and setup new one. */
+	priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
+	kvmppc_e500_priv_release(priv);
+	kvmppc_e500_priv_setup(priv, gtlbe, pfn);
 
-	/* Force TS=1 IPROT=0 for all guest mappings. */
-	stlbe->mas1 = MAS1_TSIZE(tsize)
-		| MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
-	stlbe->mas2 = (gvaddr & MAS2_EPN)
-		| e500_shadow_mas2_attrib(gtlbe->mas2,
-				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
-	stlbe->mas3 = ((pfn << PAGE_SHIFT) & MAS3_RPN)
-		| e500_shadow_mas3_attrib(gtlbe->mas3,
-				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
-	stlbe->mas7 = (pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN;
-
-	trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
-			     stlbe->mas3, stlbe->mas7);
+	kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, priv, gvaddr, stlbe);
 }
 
 /* XXX only map the one-one case, for now use TLB0 */
-static int kvmppc_e500_stlbe_map(struct kvmppc_vcpu_e500 *vcpu_e500,
-		int tlbsel, int esel)
+static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500,
+				int esel, struct tlbe *stlbe)
 {
 	struct tlbe *gtlbe;
 
-	gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+	gtlbe = &vcpu_e500->gtlb_arch[0][esel];
 
 	kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
 			get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
-			gtlbe, tlbsel, esel);
+			gtlbe, 0, esel, stlbe);
 
 	return esel;
 }
@@ -429,16 +400,16 @@ static int kvmppc_e500_stlbe_map(struct kvmppc_vcpu_e500 *vcpu_e500,
  * the shadow TLB. */
 /* XXX for both one-one and one-to-many , for now use TLB1 */
 static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
-		u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe)
+		u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, struct tlbe *stlbe)
 {
 	unsigned int victim;
 
-	victim = vcpu_e500->guest_tlb_nv[1]++;
+	victim = vcpu_e500->gtlb_nv[1]++;
 
-	if (unlikely(vcpu_e500->guest_tlb_nv[1] >= tlb1_max_shadow_size()))
-		vcpu_e500->guest_tlb_nv[1] = 0;
+	if (unlikely(vcpu_e500->gtlb_nv[1] >= tlb1_max_shadow_size()))
+		vcpu_e500->gtlb_nv[1] = 0;
 
-	kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim);
+	kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim, stlbe);
 
 	return victim;
 }
@@ -449,33 +420,19 @@ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 {
 	if (usermode) {
-		struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-		int i;
-
-		/* XXX Replace loop with fancy data structures. */
-		for (i = 0; i < tlb1_max_shadow_size(); i++)
-			kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i);
-
 		_tlbil_all();
 	}
 }
 
-static int kvmppc_e500_gtlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
-		int tlbsel, int esel)
+static inline int kvmppc_e500_gtlbe_invalidate(
+				struct kvmppc_vcpu_e500 *vcpu_e500,
+				int tlbsel, int esel)
 {
-	struct tlbe *gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+	struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
 
 	if (unlikely(get_tlb_iprot(gtlbe)))
 		return -1;
 
-	if (tlbsel == 1) {
-		kvmppc_e500_tlb1_invalidate(vcpu_e500, get_tlb_eaddr(gtlbe),
-				get_tlb_end(gtlbe),
-				get_tlb_tid(gtlbe));
-	} else {
-		kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel);
-	}
-
 	gtlbe->mas1 = 0;
 
 	return 0;
@@ -486,10 +443,10 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
 	int esel;
 
 	if (value & MMUCSR0_TLB0FI)
-		for (esel = 0; esel < vcpu_e500->guest_tlb_size[0]; esel++)
+		for (esel = 0; esel < vcpu_e500->gtlb_size[0]; esel++)
 			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel);
 	if (value & MMUCSR0_TLB1FI)
-		for (esel = 0; esel < vcpu_e500->guest_tlb_size[1]; esel++)
+		for (esel = 0; esel < vcpu_e500->gtlb_size[1]; esel++)
 			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
 
 	_tlbil_all();
@@ -513,7 +470,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
 
 	if (ia) {
 		/* invalidate all entries */
-		for (esel = 0; esel < vcpu_e500->guest_tlb_size[tlbsel]; esel++)
+		for (esel = 0; esel < vcpu_e500->gtlb_size[tlbsel]; esel++)
 			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
 	} else {
 		ea &= 0xfffff000;
@@ -537,9 +494,9 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
 	tlbsel = get_tlb_tlbsel(vcpu_e500);
 	esel = get_tlb_esel(vcpu_e500, tlbsel);
 
-	gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+	gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
 	vcpu_e500->mas0 &= ~MAS0_NV(~0);
-	vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
+	vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
 	vcpu_e500->mas1 = gtlbe->mas1;
 	vcpu_e500->mas2 = gtlbe->mas2;
 	vcpu_e500->mas3 = gtlbe->mas3;
@@ -562,14 +519,14 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
 	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
 		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
 		if (esel >= 0) {
-			gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+			gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
 			break;
 		}
 	}
 
 	if (gtlbe) {
 		vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel)
-			| MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
+			| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
 		vcpu_e500->mas1 = gtlbe->mas1;
 		vcpu_e500->mas2 = gtlbe->mas2;
 		vcpu_e500->mas3 = gtlbe->mas3;
@@ -582,7 +539,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
 		victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0;
 
 		vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
-			| MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
+			| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
 		vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0)
 			| (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0))
 			| (vcpu_e500->mas4 & MAS4_TSIZED(~0));
@@ -599,23 +556,16 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
 int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-	u64 eaddr;
-	u64 raddr;
-	u32 tid;
 	struct tlbe *gtlbe;
-	int tlbsel, esel, stlbsel, sesel;
+	int tlbsel, esel;
 
 	tlbsel = get_tlb_tlbsel(vcpu_e500);
 	esel = get_tlb_esel(vcpu_e500, tlbsel);
 
-	gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+	gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
 
-	if (get_tlb_v(gtlbe) && tlbsel == 1) {
-		eaddr = get_tlb_eaddr(gtlbe);
-		tid = get_tlb_tid(gtlbe);
-		kvmppc_e500_tlb1_invalidate(vcpu_e500, eaddr,
-				get_tlb_end(gtlbe), tid);
-	}
+	if (get_tlb_v(gtlbe) && tlbsel == 1)
+		kvmppc_e500_tlb1_invalidate(vcpu_e500, esel);
 
 	gtlbe->mas1 = vcpu_e500->mas1;
 	gtlbe->mas2 = vcpu_e500->mas2;
@@ -627,6 +577,11 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 
 	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
 	if (tlbe_is_host_safe(vcpu, gtlbe)) {
+		struct tlbe stlbe;
+		int stlbsel, sesel;
+		u64 eaddr;
+		u64 raddr;
+
 		switch (tlbsel) {
 		case 0:
 			/* TLB0 */
@@ -634,7 +589,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 			gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K);
 
 			stlbsel = 0;
-			sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel);
+			sesel = kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
 
 			break;
 
@@ -649,13 +604,13 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 			 * are mapped on the fly. */
 			stlbsel = 1;
 			sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr,
-					raddr >> PAGE_SHIFT, gtlbe);
+					raddr >> PAGE_SHIFT, gtlbe, &stlbe);
 			break;
 
 		default:
 			BUG();
 		}
-		write_host_tlbe(vcpu_e500, stlbsel, sesel);
+		write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe);
 	}
 
 	kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
@@ -695,7 +650,7 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index,
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	struct tlbe *gtlbe =
-		&vcpu_e500->guest_tlb[tlbsel_of(index)][esel_of(index)];
+		&vcpu_e500->gtlb_arch[tlbsel_of(index)][esel_of(index)];
 	u64 pgmask = get_tlb_bytes(gtlbe) - 1;
 
 	return get_tlb_raddr(gtlbe) | (eaddr & pgmask);
@@ -703,38 +658,36 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index,
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
-	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-	int tlbsel, i;
-
-	for (tlbsel = 0; tlbsel < 2; tlbsel++)
-		for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++)
-			kvmppc_e500_shadow_release(vcpu_e500, tlbsel, i);
-
-	/* discard all guest mapping */
-	_tlbil_all();
 }
 
 void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 			unsigned int index)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	struct tlbe_priv *priv;
+	struct tlbe *gtlbe, stlbe;
 	int tlbsel = tlbsel_of(index);
 	int esel = esel_of(index);
 	int stlbsel, sesel;
 
+	gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
+
 	switch (tlbsel) {
 	case 0:
 		stlbsel = 0;
 		sesel = esel;
+		priv = &vcpu_e500->gtlb_priv[stlbsel][sesel];
+
+		kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K,
+					priv, eaddr, &stlbe);
 		break;
 
 	case 1: {
 		gfn_t gfn = gpaddr >> PAGE_SHIFT;
-		struct tlbe *gtlbe
-			= &vcpu_e500->guest_tlb[tlbsel][esel];
 
 		stlbsel = 1;
-		sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe);
+		sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn,
+					     gtlbe, &stlbe);
 		break;
 	}
 
@@ -742,7 +695,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 		BUG();
 		break;
 	}
-	write_host_tlbe(vcpu_e500, stlbsel, sesel);
+
+	write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe);
 }
 
 int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu,
@@ -773,14 +727,14 @@ void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
 	struct tlbe *tlbe;
 
 	/* Insert large initial mapping for guest. */
-	tlbe = &vcpu_e500->guest_tlb[1][0];
+	tlbe = &vcpu_e500->gtlb_arch[1][0];
 	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M);
 	tlbe->mas2 = 0;
 	tlbe->mas3 = E500_TLB_SUPER_PERM_MASK;
 	tlbe->mas7 = 0;
 
 	/* 4K map for serial output. Used by kernel wrapper. */
-	tlbe = &vcpu_e500->guest_tlb[1][1];
+	tlbe = &vcpu_e500->gtlb_arch[1][1];
 	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
 	tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
 	tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
@@ -791,52 +745,61 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF;
 
-	vcpu_e500->guest_tlb_size[0] = KVM_E500_TLB0_SIZE;
-	vcpu_e500->guest_tlb[0] =
+	vcpu_e500->gtlb_size[0] = KVM_E500_TLB0_SIZE;
+	vcpu_e500->gtlb_arch[0] =
 		kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
-	if (vcpu_e500->guest_tlb[0] == NULL)
+	if (vcpu_e500->gtlb_arch[0] == NULL)
 		goto err_out;
 
-	vcpu_e500->shadow_tlb_size[0] = KVM_E500_TLB0_SIZE;
-	vcpu_e500->shadow_tlb[0] =
-		kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
-	if (vcpu_e500->shadow_tlb[0] == NULL)
+	vcpu_e500->gtlb_size[1] = KVM_E500_TLB1_SIZE;
+	vcpu_e500->gtlb_arch[1] =
+		kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL);
+	if (vcpu_e500->gtlb_arch[1] == NULL)
 		goto err_out_guest0;
 
-	vcpu_e500->guest_tlb_size[1] = KVM_E500_TLB1_SIZE;
-	vcpu_e500->guest_tlb[1] =
-		kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL);
-	if (vcpu_e500->guest_tlb[1] == NULL)
-		goto err_out_shadow0;
-
-	vcpu_e500->shadow_tlb_size[1] = tlb1_entry_num;
-	vcpu_e500->shadow_tlb[1] =
-		kzalloc(sizeof(struct tlbe) * tlb1_entry_num, GFP_KERNEL);
-	if (vcpu_e500->shadow_tlb[1] == NULL)
+	vcpu_e500->gtlb_priv[0] = (struct tlbe_priv *)
+		kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
+	if (vcpu_e500->gtlb_priv[0] == NULL)
 		goto err_out_guest1;
+	vcpu_e500->gtlb_priv[1] = (struct tlbe_priv *)
+		kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB1_SIZE, GFP_KERNEL);
+
+	if (vcpu_e500->gtlb_priv[1] == NULL)
+		goto err_out_priv0;
 
 	/* Init TLB configuration register */
 	vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL;
-	vcpu_e500->tlb0cfg |= vcpu_e500->guest_tlb_size[0];
+	vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_size[0];
 	vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL;
-	vcpu_e500->tlb1cfg |= vcpu_e500->guest_tlb_size[1];
+	vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_size[1];
 
 	return 0;
 
+err_out_priv0:
+	kfree(vcpu_e500->gtlb_priv[0]);
 err_out_guest1:
-	kfree(vcpu_e500->guest_tlb[1]);
-err_out_shadow0:
-	kfree(vcpu_e500->shadow_tlb[0]);
+	kfree(vcpu_e500->gtlb_arch[1]);
 err_out_guest0:
-	kfree(vcpu_e500->guest_tlb[0]);
+	kfree(vcpu_e500->gtlb_arch[0]);
 err_out:
 	return -1;
 }
 
 void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
-	kfree(vcpu_e500->shadow_tlb[1]);
-	kfree(vcpu_e500->guest_tlb[1]);
-	kfree(vcpu_e500->shadow_tlb[0]);
-	kfree(vcpu_e500->guest_tlb[0]);
+	int stlbsel, i;
+
+	/* release all privs */
+	for (stlbsel = 0; stlbsel < 2; stlbsel++)
+		for (i = 0; i < vcpu_e500->gtlb_size[stlbsel]; i++) {
+			struct tlbe_priv *priv =
+				&vcpu_e500->gtlb_priv[stlbsel][i];
+			kvmppc_e500_priv_release(priv);
+		}
+
+	/* discard all guest mapping */
+	_tlbil_all();
+
+	kfree(vcpu_e500->gtlb_arch[1]);
+	kfree(vcpu_e500->gtlb_arch[0]);
 }

From dd9ebf1f94354b010f2ac7a98bf69168636cb08e Mon Sep 17 00:00:00 2001
From: Liu Yu <yu.liu@freescale.com>
Date: Tue, 14 Jun 2011 18:35:14 -0500
Subject: [PATCH 093/143] KVM: PPC: e500: Add shadow PID support

Dynamically assign host PIDs to guest PIDs, splitting each guest PID into
multiple host (shadow) PIDs based on kernel/user and MSR[IS/DS].  Use
both PID0 and PID1 so that the shadow PIDs for the right mode can be
selected: the one corresponding to guest TID = zero and the one
corresponding to guest TID = guest PID.

This allows us to significantly reduce the frequency of needing to
invalidate the entire TLB.  When the guest mode or PID changes, we just
update the host PID0/PID1.  And since the allocation of shadow PIDs is
global, multiple guests can share the TLB without conflict.

Note that KVM does not yet support the guest setting PID1 or PID2 to
a value other than zero.  This will need to be fixed for nested KVM
to work.  Until then, we enforce the requirement for guest PID1/PID2
to stay zero by failing the emulation if the guest tries to set them
to something else.
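
For illustration, a single-CPU toy version of the shadow-ID allocation
this patch introduces (the real code keeps one table per vcpu plus a
cross-checked per-physical-CPU table and must run with preemption
disabled; the names and main() here are made up):

#include <stdio.h>
#include <string.h>

#define NUM_TIDS 256

static unsigned int sid_map[2][NUM_TIDS][2];	/* [AS][TID][PR] -> sid */
static unsigned int last_used_sid;		/* valid sids are 1..255 */

static void flush_all_shadow_ids(void)
{
	memset(sid_map, 0, sizeof(sid_map));
	last_used_sid = 0;
	printf("out of shadow ids: flush the TLB and start over\n");
}

static unsigned int get_sid(int as, int tid, int pr)
{
	unsigned int sid = sid_map[as][tid][pr];

	while (sid == 0) {
		if (last_used_sid == NUM_TIDS - 1)
			flush_all_shadow_ids();
		else
			sid = sid_map[as][tid][pr] = ++last_used_sid;
	}
	return sid;
}

int main(void)
{
	/* the same guest PID gets distinct shadow ids for kernel and
	 * user mode, so no TLB flush is needed when the guest switches
	 * privilege level or PID */
	printf("sid(kernel) = %u\n", get_sid(0, 5, 0));
	printf("sid(user)   = %u\n", get_sid(0, 5, 1));
	printf("sid(kernel) = %u\n", get_sid(0, 5, 0));	/* reused */
	return 0;
}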

Signed-off-by: Liu Yu <yu.liu@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_e500.h |   8 +-
 arch/powerpc/include/asm/kvm_host.h |   1 +
 arch/powerpc/kernel/asm-offsets.c   |   1 +
 arch/powerpc/kvm/44x_tlb.c          |   4 +-
 arch/powerpc/kvm/booke.c            |  11 +-
 arch/powerpc/kvm/booke.h            |   1 +
 arch/powerpc/kvm/booke_interrupts.S |  11 +
 arch/powerpc/kvm/e500_emulate.c     |   4 +
 arch/powerpc/kvm/e500_tlb.c         | 312 +++++++++++++++++++++++++---
 arch/powerpc/kvm/e500_tlb.h         |  13 +-
 10 files changed, 334 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h
index 4a6d77a8b8a0..adbfca9dd100 100644
--- a/arch/powerpc/include/asm/kvm_e500.h
+++ b/arch/powerpc/include/asm/kvm_e500.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu, <yu.liu@freescale.com>
  *
@@ -37,6 +37,8 @@ struct tlbe_priv {
 	unsigned int flags; /* E500_TLB_* */
 };
 
+struct vcpu_id_table;
+
 struct kvmppc_vcpu_e500 {
 	/* Unmodified copy of the guest's TLB. */
 	struct tlbe *gtlb_arch[E500_TLB_NUM];
@@ -59,6 +61,10 @@ struct kvmppc_vcpu_e500 {
 	u32 mas5;
 	u32 mas6;
 	u32 mas7;
+
+	/* vcpu id table */
+	struct vcpu_id_table *idt;
+
 	u32 l1csr0;
 	u32 l1csr1;
 	u32 hid0;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index c4ce1054b866..6e05b2d13683 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -255,6 +255,7 @@ struct kvm_vcpu_arch {
 	u32 pvr;
 
 	u32 shadow_pid;
+	u32 shadow_pid1;
 	u32 pid;
 	u32 swap_pid;
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index ecd2b3ad7ff6..faf846131f45 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -402,6 +402,7 @@ int main(void)
 	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
 	DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
 	DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
+	DEFINE(VCPU_SHADOW_PID1, offsetof(struct kvm_vcpu, arch.shadow_pid1));
 	DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
 	DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
 	DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 5f3cff83e089..33aa715dab28 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -387,8 +387,10 @@ static void kvmppc_44x_invalidate(struct kvm_vcpu *vcpu,
 	}
 }
 
-void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
 {
+	int usermode = vcpu->arch.shared->msr & MSR_PR;
+
 	vcpu->arch.shadow_pid = !usermode;
 }
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 4538956daecf..9f2e4a5e1c4d 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -113,15 +113,18 @@ static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
 }
 #endif
 
-/* Helper function for "full" MSR writes. No need to call this if only EE is
- * changing. */
+/*
+ * Helper function for "full" MSR writes.  No need to call this if only
+ * EE/CE/ME/DE/RI are changing.
+ */
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
 {
-	if ((new_msr & MSR_PR) != (vcpu->arch.shared->msr & MSR_PR))
-		kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
+	u32 old_msr = vcpu->arch.shared->msr;
 
 	vcpu->arch.shared->msr = new_msr;
 
+	kvmppc_mmu_msr_notify(vcpu, old_msr);
+
 	if (vcpu->arch.shared->msr & MSR_WE) {
 		kvm_vcpu_block(vcpu);
 		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index 0fa1732ddcb7..8e1fe33d64e5 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -53,6 +53,7 @@
 extern unsigned long kvmppc_booke_handlers;
 
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr);
 
 int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                             unsigned int inst, int *advance);
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 8cb3dfe29f75..42f2fb1f66e9 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -191,6 +191,12 @@ _GLOBAL(kvmppc_resume_host)
 	lwz	r3, VCPU_HOST_PID(r4)
 	mtspr	SPRN_PID, r3
 
+#ifdef CONFIG_FSL_BOOKE
+	/* we cheat and know that Linux doesn't use PID1 which is always 0 */
+	lis	r3, 0
+	mtspr	SPRN_PID1, r3
+#endif
+
 	/* Restore host IVPR before re-enabling interrupts. We cheat and know
 	 * that Linux IVPR is always 0xc0000000. */
 	lis	r3, 0xc000
@@ -365,6 +371,11 @@ lightweight_exit:
 	lwz	r3, VCPU_SHADOW_PID(r4)
 	mtspr	SPRN_PID, r3
 
+#ifdef CONFIG_FSL_BOOKE
+	lwz	r3, VCPU_SHADOW_PID1(r4)
+	mtspr	SPRN_PID1, r3
+#endif
+
 #ifdef CONFIG_44x
 	iccci	0, 0 /* XXX hack */
 #endif
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 69cd665a0caf..d48ae396f41e 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -81,8 +81,12 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		kvmppc_set_pid(vcpu, spr_val);
 		break;
 	case SPRN_PID1:
+		if (spr_val != 0)
+			return EMULATE_FAIL;
 		vcpu_e500->pid[1] = spr_val; break;
 	case SPRN_PID2:
+		if (spr_val != 0)
+			return EMULATE_FAIL;
 		vcpu_e500->pid[2] = spr_val; break;
 	case SPRN_MAS0:
 		vcpu_e500->mas0 = spr_val; break;
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 9d1e28d443c4..ea394571bbb6 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -28,8 +28,196 @@
 
 #define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1)
 
+struct id {
+	unsigned long val;
+	struct id **pentry;
+};
+
+#define NUM_TIDS 256
+
+/*
+ * This table provides mappings from:
+ * (guestAS,guestTID,guestPR) --> ID of physical cpu
+ * guestAS	[0..1]
+ * guestTID	[0..255]
+ * guestPR	[0..1]
+ * ID		[1..255]
+ * Each vcpu keeps one vcpu_id_table.
+ */
+struct vcpu_id_table {
+	struct id id[2][NUM_TIDS][2];
+};
+
+/*
+ * This table provides the reverse mapping of vcpu_id_table:
+ * ID --> address of vcpu_id_table item.
+ * Each physical core has one pcpu_id_table.
+ */
+struct pcpu_id_table {
+	struct id *entry[NUM_TIDS];
+};
+
+static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids);
+
+/* This variable keeps the last used shadow ID on the local core.
+ * The valid range of a shadow ID is [1..255]. */
+static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid);
+
 static unsigned int tlb1_entry_num;
 
+/*
+ * Allocate a free shadow id and set up a valid sid mapping in the given entry.
+ * A mapping is only valid when vcpu_id_table and pcpu_id_table match.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+static inline int local_sid_setup_one(struct id *entry)
+{
+	unsigned long sid;
+	int ret = -1;
+
+	sid = ++(__get_cpu_var(pcpu_last_used_sid));
+	if (sid < NUM_TIDS) {
+		__get_cpu_var(pcpu_sids).entry[sid] = entry;
+		entry->val = sid;
+		entry->pentry = &__get_cpu_var(pcpu_sids).entry[sid];
+		ret = sid;
+	}
+
+	/*
+	 * If sid == NUM_TIDS, we've run out of sids.  We return -1, and
+	 * the caller will invalidate everything and start over.
+	 *
+	 * sid > NUM_TIDS indicates a race, which we avoid by disabling
+	 * preemption.
+	 */
+	WARN_ON(sid > NUM_TIDS);
+
+	return ret;
+}
+
+/*
+ * Check if the given entry contains a valid shadow id mapping.
+ * An ID mapping is considered valid only if
+ * both vcpu and pcpu know this mapping.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+static inline int local_sid_lookup(struct id *entry)
+{
+	if (entry && entry->val != 0 &&
+	    __get_cpu_var(pcpu_sids).entry[entry->val] == entry &&
+	    entry->pentry == &__get_cpu_var(pcpu_sids).entry[entry->val])
+		return entry->val;
+	return -1;
+}
+
+/* Invalidate all id mappings on local core */
+static inline void local_sid_destroy_all(void)
+{
+	preempt_disable();
+	__get_cpu_var(pcpu_last_used_sid) = 0;
+	memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids)));
+	preempt_enable();
+}
+
+static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL);
+	return vcpu_e500->idt;
+}
+
+static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	kfree(vcpu_e500->idt);
+}
+
+/* Invalidate all mappings on vcpu */
+static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table));
+
+	/* Update shadow pid when mappings are changed */
+	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+}
+
+/* Invalidate one ID mapping on vcpu */
+static inline void kvmppc_e500_id_table_reset_one(
+			       struct kvmppc_vcpu_e500 *vcpu_e500,
+			       int as, int pid, int pr)
+{
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+
+	BUG_ON(as >= 2);
+	BUG_ON(pid >= NUM_TIDS);
+	BUG_ON(pr >= 2);
+
+	idt->id[as][pid][pr].val = 0;
+	idt->id[as][pid][pr].pentry = NULL;
+
+	/* Update shadow pid when mappings are changed */
+	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+}
+
+/*
+ * Map guest (vcpu,AS,ID,PR) to physical core shadow id.
+ * This function first looks up whether a valid mapping exists;
+ * if not, it creates a new one.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+static unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
+					unsigned int as, unsigned int gid,
+					unsigned int pr, int avoid_recursion)
+{
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+	int sid;
+
+	BUG_ON(as >= 2);
+	BUG_ON(gid >= NUM_TIDS);
+	BUG_ON(pr >= 2);
+
+	sid = local_sid_lookup(&idt->id[as][gid][pr]);
+
+	while (sid <= 0) {
+		/* No mapping yet */
+		sid = local_sid_setup_one(&idt->id[as][gid][pr]);
+		if (sid <= 0) {
+			_tlbil_all();
+			local_sid_destroy_all();
+		}
+
+		/* Update shadow pid when mappings are changed */
+		if (!avoid_recursion)
+			kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+	}
+
+	return sid;
+}
+
+/* Map guest pid to shadow.
+ * We use PID to hold the shadow of the current guest non-zero PID,
+ * and PID1 to hold the shadow of guest PID zero,
+ * so that a guest tlbe with TID=0 can be accessed at any time. */
+void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	preempt_disable();
+	vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500,
+			get_cur_as(&vcpu_e500->vcpu),
+			get_cur_pid(&vcpu_e500->vcpu),
+			get_cur_pr(&vcpu_e500->vcpu), 1);
+	vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500,
+			get_cur_as(&vcpu_e500->vcpu), 0,
+			get_cur_pr(&vcpu_e500->vcpu), 1);
+	preempt_enable();
+}
+
 void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
@@ -134,14 +322,19 @@ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
 
 void kvmppc_map_magic(struct kvm_vcpu *vcpu)
 {
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	struct tlbe magic;
 	ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
+	unsigned int stid;
 	pfn_t pfn;
 
 	pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
 	get_page(pfn_to_page(pfn));
 
-	magic.mas1 = MAS1_VALID | MAS1_TS |
+	preempt_disable();
+	stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0);
+
+	magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) |
 		     MAS1_TSIZE(BOOK3E_PAGESZ_4K);
 	magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
 	magic.mas3 = (pfn << PAGE_SHIFT) |
@@ -149,15 +342,76 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu)
 	magic.mas7 = pfn >> (32 - PAGE_SHIFT);
 
 	__write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
+	preempt_enable();
 }
 
 void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	/* The shadow PID may have expired on the local core */
+	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
 }
 
 void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu)
 {
-	_tlbil_all();
+}
+
+static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
+					 int tlbsel, int esel)
+{
+	struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+	unsigned int pr, tid, ts, pid;
+	u32 val, eaddr;
+	unsigned long flags;
+
+	ts = get_tlb_ts(gtlbe);
+	tid = get_tlb_tid(gtlbe);
+
+	preempt_disable();
+
+	/* One guest ID may be mapped to two shadow IDs */
+	for (pr = 0; pr < 2; pr++) {
+		/*
+		 * The shadow PID can have a valid mapping on at most one
+		 * host CPU.  In the common case, it will be valid on this
+		 * CPU, in which case (for TLB0) we do a local invalidation
+		 * of the specific address.
+		 *
+		 * If the shadow PID is not valid on the current host CPU, or
+		 * if we're invalidating a TLB1 entry, we invalidate the
+		 * entire shadow PID.
+		 */
+		if (tlbsel == 1 ||
+		    (pid = local_sid_lookup(&idt->id[ts][tid][pr])) <= 0) {
+			kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr);
+			continue;
+		}
+
+		/*
+		 * The guest is invalidating a TLB0 entry which is in a PID
+		 * that has a valid shadow mapping on this host CPU.  We
+		 * search host TLB0 to invalidate its shadow TLB entry,
+		 * similar to __tlbil_va except that we need to look in AS1.
+		 */
+		val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS;
+		eaddr = get_tlb_eaddr(gtlbe);
+
+		local_irq_save(flags);
+
+		mtspr(SPRN_MAS6, val);
+		asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr));
+		val = mfspr(SPRN_MAS1);
+		if (val & MAS1_VALID) {
+			mtspr(SPRN_MAS1, val & ~MAS1_VALID);
+			asm volatile("tlbwe");
+		}
+
+		local_irq_restore(flags);
+	}
+
+	preempt_enable();
 }
 
 /* Search the guest TLB for a matching entry. */
@@ -216,12 +470,6 @@ static inline void kvmppc_e500_priv_release(struct tlbe_priv *priv)
 	}
 }
 
-static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
-					int esel)
-{
-	mtspr(SPRN_MMUCSR0, MMUCSR0_TLB1FI);
-}
-
 static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
 		unsigned int eaddr, int as)
 {
@@ -255,10 +503,15 @@ static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
 					   u64 gvaddr, struct tlbe *stlbe)
 {
 	pfn_t pfn = priv->pfn;
+	unsigned int stid;
+
+	stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe),
+				   get_tlb_tid(gtlbe),
+				   get_cur_pr(&vcpu_e500->vcpu), 0);
 
 	/* Force TS=1 IPROT=0 for all guest mappings. */
 	stlbe->mas1 = MAS1_TSIZE(tsize)
-		| MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
+		| MAS1_TID(stid) | MAS1_TS | MAS1_VALID;
 	stlbe->mas2 = (gvaddr & MAS2_EPN)
 		| e500_shadow_mas2_attrib(gtlbe->mas2,
 				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
@@ -414,14 +667,12 @@ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	return victim;
 }
 
-/* Invalidate all guest kernel mappings when enter usermode,
- * so that when they fault back in they will get the
- * proper permission bits. */
-void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
 {
-	if (usermode) {
-		_tlbil_all();
-	}
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	/* Recalc shadow pid since MSR changes */
+	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
 }
 
 static inline int kvmppc_e500_gtlbe_invalidate(
@@ -449,7 +700,8 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
 		for (esel = 0; esel < vcpu_e500->gtlb_size[1]; esel++)
 			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
 
-	_tlbil_all();
+	/* Invalidate all vcpu id mappings */
+	kvmppc_e500_id_table_reset_all(vcpu_e500);
 
 	return EMULATE_DONE;
 }
@@ -480,7 +732,8 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
 			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
 	}
 
-	_tlbil_all();
+	/* Invalidate all vcpu id mappings */
+	kvmppc_e500_id_table_reset_all(vcpu_e500);
 
 	return EMULATE_DONE;
 }
@@ -564,8 +817,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 
 	gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
 
-	if (get_tlb_v(gtlbe) && tlbsel == 1)
-		kvmppc_e500_tlb1_invalidate(vcpu_e500, esel);
+	if (get_tlb_v(gtlbe))
+		kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel);
 
 	gtlbe->mas1 = vcpu_e500->mas1;
 	gtlbe->mas2 = vcpu_e500->mas2;
@@ -582,6 +835,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 		u64 eaddr;
 		u64 raddr;
 
+		preempt_disable();
 		switch (tlbsel) {
 		case 0:
 			/* TLB0 */
@@ -611,6 +865,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 			BUG();
 		}
 		write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe);
+		preempt_enable();
 	}
 
 	kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
@@ -672,6 +927,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 
 	gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
 
+	preempt_disable();
 	switch (tlbsel) {
 	case 0:
 		stlbsel = 0;
@@ -697,6 +953,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 	}
 
 	write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe);
+	preempt_enable();
 }
 
 int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu,
@@ -718,8 +975,10 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 
-	vcpu_e500->pid[0] = vcpu->arch.shadow_pid =
-		vcpu->arch.pid = pid;
+	if (vcpu->arch.pid != pid) {
+		vcpu_e500->pid[0] = vcpu->arch.pid = pid;
+		kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+	}
 }
 
 void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
@@ -767,6 +1026,9 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 	if (vcpu_e500->gtlb_priv[1] == NULL)
 		goto err_out_priv0;
 
+	if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL)
+		goto err_out_priv1;
+
 	/* Init TLB configuration register */
 	vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL;
 	vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_size[0];
@@ -775,6 +1037,8 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 
 	return 0;
 
+err_out_priv1:
+	kfree(vcpu_e500->gtlb_priv[1]);
 err_out_priv0:
 	kfree(vcpu_e500->gtlb_priv[0]);
 err_out_guest1:
@@ -797,9 +1061,7 @@ void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
 			kvmppc_e500_priv_release(priv);
 		}
 
-	/* discard all guest mapping */
-	_tlbil_all();
-
+	kvmppc_e500_id_table_free(vcpu_e500);
 	kfree(vcpu_e500->gtlb_arch[1]);
 	kfree(vcpu_e500->gtlb_arch[0]);
 }
diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h
index 458946b4775d..59b88e99a235 100644
--- a/arch/powerpc/kvm/e500_tlb.h
+++ b/arch/powerpc/kvm/e500_tlb.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu, yu.liu@freescale.com
  *
@@ -55,6 +55,7 @@ extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int);
 extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *);
 extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *);
 extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *);
+extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *);
 
 /* TLB helper functions */
 static inline unsigned int get_tlb_size(const struct tlbe *tlbe)
@@ -110,6 +111,16 @@ static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu)
 	return vcpu->arch.pid & 0xff;
 }
 
+static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu)
+{
+	return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS));
+}
+
+static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu)
+{
+	return !!(vcpu->arch.shared->msr & MSR_PR);
+}
+
 static inline unsigned int get_cur_spid(
 		const struct kvmppc_vcpu_e500 *vcpu_e500)
 {

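A note on the scheme above: a vcpu-side struct id and a per-CPU table slot
validate each other through a pair of back-pointers, so a core can drop every
outstanding shadow-ID mapping simply by wiping its per-CPU table, with no
per-entry walk. Below is a minimal, single-CPU userspace sketch of that
mutual-pointer check; the names and table size are made up for illustration,
and the setup helper only approximates local_sid_setup_one(), whose body is
not shown here.

#include <stdio.h>
#include <string.h>

#define NUM_SIDS 256

struct id {
	unsigned long val;	/* shadow id; 0 means "no mapping" */
	struct id **pentry;	/* per-CPU table slot we were installed in */
};

/* stand-in for the per-CPU table; this sketch has only one "CPU" */
static struct id *sid_table[NUM_SIDS];
static unsigned long last_used_sid;

/* valid only while the slot and the entry still point at each other */
static int sid_lookup(struct id *entry)
{
	if (entry && entry->val &&
	    sid_table[entry->val] == entry &&
	    entry->pentry == &sid_table[entry->val])
		return (int)entry->val;
	return -1;
}

static int sid_setup_one(struct id *entry)
{
	unsigned long sid = ++last_used_sid;

	if (sid >= NUM_SIDS)
		return -1;	/* table exhausted: caller flushes and retries */

	entry->val = sid;
	entry->pentry = &sid_table[sid];
	sid_table[sid] = entry;
	return (int)sid;
}

/* "invalidate everything" is just a wipe of the per-CPU side */
static void sid_destroy_all(void)
{
	last_used_sid = 0;
	memset(sid_table, 0, sizeof(sid_table));
}

int main(void)
{
	struct id guest_pid = { 0, NULL };

	printf("before map:  %d\n", sid_lookup(&guest_pid));	/* -1 */
	sid_setup_one(&guest_pid);
	printf("after map:   %d\n", sid_lookup(&guest_pid));	/* 1  */
	sid_destroy_all();
	printf("after flush: %d\n", sid_lookup(&guest_pid));	/* -1 */
	return 0;
}
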
From 1aee47a0276f75a371e13a936a48f64eb5d3ec1b Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 14 Jun 2011 18:35:20 -0500
Subject: [PATCH 094/143] KVM: PPC: e500: Don't search over the entire TLB0.

Only look in the 4 entries that could possibly contain the
entry we're looking for.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/e500_tlb.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

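For context, TLB0 on these cores is set-associative, so a given effective
address can only land in one set; the patch below limits the guest-TLB scan
to the ways of that set. A tiny sketch of the index arithmetic, using a
hypothetical 4-way, 512-entry TLB0 (the way count matches the "4 entries"
above; the total size is an assumption):

#include <stdio.h>

#define PAGE_SHIFT	12
#define TLB0_WAYS	4	/* "the 4 entries" mentioned above */
#define TLB0_ENTRIES	512	/* hypothetical total size */

/* first array index of the set that may hold eaddr; scan TLB0_WAYS entries */
static int tlb0_set_base(unsigned long eaddr)
{
	int nsets = TLB0_ENTRIES / TLB0_WAYS;
	int set = (eaddr >> PAGE_SHIFT) & (nsets - 1);

	return set * TLB0_WAYS;
}

int main(void)
{
	unsigned long ea = 0x10007000UL;
	int base = tlb0_set_base(ea);

	printf("eaddr 0x%lx maps to entries [%d..%d]\n",
	       ea, base, base + TLB0_WAYS - 1);
	return 0;
}
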
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index ea394571bbb6..13c432ea2fa8 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -418,11 +418,21 @@ static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
 static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
 		gva_t eaddr, int tlbsel, unsigned int pid, int as)
 {
+	int size = vcpu_e500->gtlb_size[tlbsel];
+	int set_base;
 	int i;
 
-	/* XXX Replace loop with fancy data structures. */
-	for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) {
-		struct tlbe *tlbe = &vcpu_e500->gtlb_arch[tlbsel][i];
+	if (tlbsel == 0) {
+		int mask = size / KVM_E500_TLB0_WAY_NUM - 1;
+		set_base = (eaddr >> PAGE_SHIFT) & mask;
+		set_base *= KVM_E500_TLB0_WAY_NUM;
+		size = KVM_E500_TLB0_WAY_NUM;
+	} else {
+		set_base = 0;
+	}
+
+	for (i = 0; i < size; i++) {
+		struct tlbe *tlbe = &vcpu_e500->gtlb_arch[tlbsel][set_base + i];
 		unsigned int tid;
 
 		if (eaddr < get_tlb_eaddr(tlbe))
@@ -441,7 +451,7 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
 		if (get_tlb_ts(tlbe) != as && as != -1)
 			continue;
 
-		return i;
+		return set_base + i;
 	}
 
 	return -1;

From f8f7e5ee1037e347eafff8f526913b92cec54873 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 21 Jun 2011 14:00:10 -0300
Subject: [PATCH 095/143] Revert "KVM: MMU: make kvm_mmu_reset_context() flush
 the guest TLB"

This reverts commit bee931d31e588b8eb86b7edee32fac2d16930cd7.

TLB flush should be done lazily during guest entry, in
kvm_mmu_load().

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9c629b54d362..da0f3b081076 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3054,18 +3054,8 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
 {
-	int r;
-
 	destroy_kvm_mmu(vcpu);
-	r = init_kvm_mmu(vcpu);
-
-	if (r)
-		goto err;
-
-	kvm_mmu_sync_roots(vcpu);
-	kvm_mmu_flush_tlb(vcpu);
-err:
-	return r;
+	return init_kvm_mmu(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
 

From 134291bf3cb434a9039298ba6b15ef33e65ba542 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Fri, 1 Jul 2011 01:34:56 +0900
Subject: [PATCH 096/143] KVM: MMU: Clean up the error handling of
 walk_addr_generic()

Avoid a two-step jump to the error handling part.  This eliminates the use
of the variables present and rsvd_fault.

We also use the const type qualifier to show that write/user/fetch_fault
do not change in the function.

Both of these were suggested by Ingo Molnar.

Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 82 +++++++++++++++-----------------------
 1 file changed, 32 insertions(+), 50 deletions(-)

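The refactoring below follows a common pattern: rather than tracking failure
in boolean flags (present, rsvd_fault) and breaking out of the loop, the
walker accumulates bits into a single error code and jumps straight to the
error label. A contrived sketch of that pattern, with made-up flag values
rather than the real PFERR_* masks:

#include <stdio.h>

#define ERR_PRESENT	0x01	/* hypothetical stand-ins for PFERR_* bits */
#define ERR_RSVD	0x08

static int walk(int present, int rsvd)
{
	unsigned int errcode = 0;

	if (!present)
		goto error;		/* "not present": errcode stays 0 */

	if (rsvd) {
		errcode |= ERR_RSVD | ERR_PRESENT;
		goto error;
	}

	return 1;			/* success */

error:
	/* one place builds the final fault; no present/rsvd_fault flags */
	printf("fault, error code 0x%x\n", errcode);
	return 0;
}

int main(void)
{
	walk(0, 0);	/* fault, error code 0x0 */
	walk(1, 1);	/* fault, error code 0x9 */
	walk(1, 0);	/* no output */
	return 0;
}
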
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 1caeb4d22e01..f0746d27e33e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -125,18 +125,17 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	gfn_t table_gfn;
 	unsigned index, pt_access, uninitialized_var(pte_access);
 	gpa_t pte_gpa;
-	bool eperm, present, rsvd_fault;
-	int offset, write_fault, user_fault, fetch_fault;
-
-	write_fault = access & PFERR_WRITE_MASK;
-	user_fault = access & PFERR_USER_MASK;
-	fetch_fault = access & PFERR_FETCH_MASK;
+	bool eperm;
+	int offset;
+	const int write_fault = access & PFERR_WRITE_MASK;
+	const int user_fault  = access & PFERR_USER_MASK;
+	const int fetch_fault = access & PFERR_FETCH_MASK;
+	u16 errcode = 0;
 
 	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
 				     fetch_fault);
 walk:
-	present = true;
-	eperm = rsvd_fault = false;
+	eperm = false;
 	walker->level = mmu->root_level;
 	pte           = mmu->get_cr3(vcpu);
 
@@ -144,10 +143,8 @@ walk:
 	if (walker->level == PT32E_ROOT_LEVEL) {
 		pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
-		if (!is_present_gpte(pte)) {
-			present = false;
+		if (!is_present_gpte(pte))
 			goto error;
-		}
 		--walker->level;
 	}
 #endif
@@ -170,35 +167,27 @@ walk:
 
 		real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
 					      PFERR_USER_MASK|PFERR_WRITE_MASK);
-		if (unlikely(real_gfn == UNMAPPED_GVA)) {
-			present = false;
-			break;
-		}
+		if (unlikely(real_gfn == UNMAPPED_GVA))
+			goto error;
 		real_gfn = gpa_to_gfn(real_gfn);
 
 		host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
-		if (unlikely(kvm_is_error_hva(host_addr))) {
-			present = false;
-			break;
-		}
+		if (unlikely(kvm_is_error_hva(host_addr)))
+			goto error;
 
 		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
-		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) {
-			present = false;
-			break;
-		}
+		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
+			goto error;
 
 		trace_kvm_mmu_paging_element(pte, walker->level);
 
-		if (unlikely(!is_present_gpte(pte))) {
-			present = false;
-			break;
-		}
+		if (unlikely(!is_present_gpte(pte)))
+			goto error;
 
 		if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
 					      walker->level))) {
-			rsvd_fault = true;
-			break;
+			errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+			goto error;
 		}
 
 		if (unlikely(write_fault && !is_writable_pte(pte)
@@ -213,17 +202,15 @@ walk:
 			eperm = true;
 #endif
 
-		if (!eperm && !rsvd_fault
-		    && unlikely(!(pte & PT_ACCESSED_MASK))) {
+		if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
 			int ret;
 			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
 						       sizeof(pte));
 			ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
 						  pte, pte|PT_ACCESSED_MASK);
-			if (unlikely(ret < 0)) {
-				present = false;
-				break;
-			} else if (ret)
+			if (unlikely(ret < 0))
+				goto error;
+			else if (ret)
 				goto walk;
 
 			mark_page_dirty(vcpu->kvm, table_gfn);
@@ -276,8 +263,10 @@ walk:
 		--walker->level;
 	}
 
-	if (unlikely(!present || eperm || rsvd_fault))
+	if (unlikely(eperm)) {
+		errcode |= PFERR_PRESENT_MASK;
 		goto error;
+	}
 
 	if (write_fault && unlikely(!is_dirty_gpte(pte))) {
 		int ret;
@@ -285,10 +274,9 @@ walk:
 		trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
 		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
 					  pte, pte|PT_DIRTY_MASK);
-		if (unlikely(ret < 0)) {
-			present = false;
+		if (unlikely(ret < 0))
 			goto error;
-		} else if (ret)
+		else if (ret)
 			goto walk;
 
 		mark_page_dirty(vcpu->kvm, table_gfn);
@@ -303,20 +291,14 @@ walk:
 	return 1;
 
 error:
-	walker->fault.vector = PF_VECTOR;
-	walker->fault.error_code_valid = true;
-	walker->fault.error_code = 0;
-	if (present)
-		walker->fault.error_code |= PFERR_PRESENT_MASK;
-
-	walker->fault.error_code |= write_fault | user_fault;
-
+	errcode |= write_fault | user_fault;
 	if (fetch_fault && (mmu->nx ||
 			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
-		walker->fault.error_code |= PFERR_FETCH_MASK;
-	if (rsvd_fault)
-		walker->fault.error_code |= PFERR_RSVD_MASK;
+		errcode |= PFERR_FETCH_MASK;
 
+	walker->fault.vector = PF_VECTOR;
+	walker->fault.error_code_valid = true;
+	walker->fault.error_code = errcode;
 	walker->fault.address = addr;
 	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
 

From 92c1c1e85bdd72b41f9fd39b9d43d4b3c146d02d Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Fri, 1 Jul 2011 01:36:07 +0900
Subject: [PATCH 097/143] KVM: MMU: Rename the walk label in
 walk_addr_generic()

The current name does not explain the meaning well.  So give it a better
name, "retry_walk", to show that we are retrying the walk.

This was suggested by Ingo Molnar.

Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index f0746d27e33e..9c0afba1ab8e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -134,7 +134,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 
 	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
 				     fetch_fault);
-walk:
+retry_walk:
 	eperm = false;
 	walker->level = mmu->root_level;
 	pte           = mmu->get_cr3(vcpu);
@@ -211,7 +211,7 @@ walk:
 			if (unlikely(ret < 0))
 				goto error;
 			else if (ret)
-				goto walk;
+				goto retry_walk;
 
 			mark_page_dirty(vcpu->kvm, table_gfn);
 			pte |= PT_ACCESSED_MASK;
@@ -277,7 +277,7 @@ walk:
 		if (unlikely(ret < 0))
 			goto error;
 		else if (ret)
-			goto walk;
+			goto retry_walk;
 
 		mark_page_dirty(vcpu->kvm, table_gfn);
 		pte |= PT_DIRTY_MASK;

From 3c8c652ae4c984a950d0672597085db866509914 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Fri, 1 Jul 2011 01:37:24 +0900
Subject: [PATCH 098/143] KVM: MMU: Introduce is_last_gpte() to clean up
 walk_addr_generic()

Suggested by Ingo and Avi.

Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9c0afba1ab8e..1e1c2444cef5 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -113,6 +113,24 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
 	return access;
 }
 
+static bool FNAME(is_last_gpte)(struct guest_walker *walker,
+				struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+				pt_element_t gpte)
+{
+	if (walker->level == PT_PAGE_TABLE_LEVEL)
+		return true;
+
+	if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
+	    (PTTYPE == 64 || is_pse(vcpu)))
+		return true;
+
+	if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
+	    (mmu->root_level == PT64_ROOT_LEVEL))
+		return true;
+
+	return false;
+}
+
 /*
  * Fetch a guest pte for a guest virtual address
  */
@@ -221,13 +239,7 @@ retry_walk:
 
 		walker->ptes[walker->level - 1] = pte;
 
-		if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
-		    ((walker->level == PT_DIRECTORY_LEVEL) &&
-				is_large_pte(pte) &&
-				(PTTYPE == 64 || is_pse(vcpu))) ||
-		    ((walker->level == PT_PDPE_LEVEL) &&
-				is_large_pte(pte) &&
-				mmu->root_level == PT64_ROOT_LEVEL)) {
+		if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) {
 			int lvl = walker->level;
 			gpa_t real_gpa;
 			gfn_t gfn;

From 149dbdb1859be46a063a5b1b0aa99a5f999b7632 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:16:42 +0000
Subject: [PATCH 099/143] KVM: PPC: Fix machine checks on 32-bit Book3S

Commit 69acc0d3ba ("KVM: PPC: Resolve real-mode handlers through
function exports") resulted in vcpu->arch.trampoline_lowmem and
vcpu->arch.trampoline_enter ending up with kernel virtual addresses
rather than physical addresses.  This is OK on 64-bit Book3S machines,
which ignore the top 4 bits of the effective address in real mode,
but on 32-bit Book3S machines, accessing these addresses in real mode
causes machine check interrupts, as the hardware uses the whole
effective address as the physical address in real mode.

This fixes the problem by using __pa() to convert these addresses
to physical addresses.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/book3s.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

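__pa() works here because these addresses live in the kernel's linear
mapping, where virtual and physical addresses differ by a constant offset,
so the conversion is effectively a subtraction. A simplified illustration
with a made-up linear-map base (the real offset depends on the platform
configuration):

#include <stdio.h>

/* hypothetical linear-map base; real kernels use PAGE_OFFSET/KERNELBASE */
#define LINEAR_MAP_BASE	0xc0000000UL

static unsigned long demo_pa(unsigned long va)
{
	return va - LINEAR_MAP_BASE;	/* what __pa() boils down to here */
}

int main(void)
{
	unsigned long trampoline_va = 0xc0012345UL;	/* made-up address */

	/* In real mode the CPU would use this physical address directly. */
	printf("va 0x%lx -> pa 0x%lx\n", trampoline_va, demo_pa(trampoline_va));
	return 0;
}
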
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 73fdab8a567f..83500fb62c83 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -28,6 +28,7 @@
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 #include <asm/mmu_context.h>
+#include <asm/page.h>
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
@@ -1342,8 +1343,8 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	vcpu_book3s->slb_nr = 64;
 
 	/* remember where some real-mode handlers are */
-	vcpu->arch.trampoline_lowmem = (ulong)kvmppc_handler_lowmem_trampoline;
-	vcpu->arch.trampoline_enter = (ulong)kvmppc_handler_trampoline_enter;
+	vcpu->arch.trampoline_lowmem = __pa(kvmppc_handler_lowmem_trampoline);
+	vcpu->arch.trampoline_enter = __pa(kvmppc_handler_trampoline_enter);
 	vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
 #ifdef CONFIG_PPC_BOOK3S_64
 	vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;

From c4befc58a0cc5a8cc5b4a7234d67b6b16dec4e70 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:17:33 +0000
Subject: [PATCH 100/143] KVM: PPC: Move fields between struct kvm_vcpu_arch
 and kvmppc_vcpu_book3s

This moves the slb field, which represents the state of the emulated
SLB, from the kvmppc_vcpu_book3s struct to the kvm_vcpu_arch, and the
hpte_hash_[v]pte[_long] fields from kvm_vcpu_arch to kvmppc_vcpu_book3s.
This is in accord with the principle that the kvm_vcpu_arch struct
represents the state of the emulated CPU, and the kvmppc_vcpu_book3s
struct holds the auxiliary data structures used in the emulation.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_book3s.h | 35 +++++++------
 arch/powerpc/include/asm/kvm_host.h   | 34 ++++++-------
 arch/powerpc/kvm/book3s.c             |  9 ++--
 arch/powerpc/kvm/book3s_64_mmu.c      | 54 ++++++++++----------
 arch/powerpc/kvm/book3s_mmu_hpte.c    | 71 ++++++++++++++++-----------
 arch/powerpc/kvm/trace.h              |  2 +-
 6 files changed, 107 insertions(+), 98 deletions(-)

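This move relies on struct kvm_vcpu being embedded inside kvmppc_vcpu_book3s:
code that only holds the inner vcpu pointer can still reach the Book3S
container via container_of(), which is what the to_book3s() helper does. A
freestanding sketch of that idiom, with made-up field names:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vcpu {			/* stands in for struct kvm_vcpu */
	int id;
};

struct vcpu_book3s {		/* stands in for kvmppc_vcpu_book3s */
	unsigned long hior;	/* auxiliary, emulation-only state */
	struct vcpu vcpu;	/* architectural state, embedded */
};

static struct vcpu_book3s *to_book3s(struct vcpu *v)
{
	return container_of(v, struct vcpu_book3s, vcpu);
}

int main(void)
{
	struct vcpu_book3s b = { .hior = 0xfff00000UL, .vcpu = { .id = 0 } };
	struct vcpu *inner = &b.vcpu;

	printf("hior via inner pointer: 0x%lx\n", to_book3s(inner)->hior);
	return 0;
}
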
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 70c409b1d931..f7b2bafe7047 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -24,20 +24,6 @@
 #include <linux/kvm_host.h>
 #include <asm/kvm_book3s_asm.h>
 
-struct kvmppc_slb {
-	u64 esid;
-	u64 vsid;
-	u64 orige;
-	u64 origv;
-	bool valid	: 1;
-	bool Ks		: 1;
-	bool Kp		: 1;
-	bool nx		: 1;
-	bool large	: 1;	/* PTEs are 16MB */
-	bool tb		: 1;	/* 1TB segment */
-	bool class	: 1;
-};
-
 struct kvmppc_bat {
 	u64 raw;
 	u32 bepi;
@@ -67,11 +53,22 @@ struct kvmppc_sid_map {
 #define VSID_POOL_SIZE	(SID_CONTEXTS * 16)
 #endif
 
+struct hpte_cache {
+	struct hlist_node list_pte;
+	struct hlist_node list_pte_long;
+	struct hlist_node list_vpte;
+	struct hlist_node list_vpte_long;
+	struct rcu_head rcu_head;
+	u64 host_va;
+	u64 pfn;
+	ulong slot;
+	struct kvmppc_pte pte;
+};
+
 struct kvmppc_vcpu_book3s {
 	struct kvm_vcpu vcpu;
 	struct kvmppc_book3s_shadow_vcpu *shadow_vcpu;
 	struct kvmppc_sid_map sid_map[SID_MAP_NUM];
-	struct kvmppc_slb slb[64];
 	struct {
 		u64 esid;
 		u64 vsid;
@@ -81,7 +78,6 @@ struct kvmppc_vcpu_book3s {
 	struct kvmppc_bat dbat[8];
 	u64 hid[6];
 	u64 gqr[8];
-	int slb_nr;
 	u64 sdr1;
 	u64 hior;
 	u64 msr_mask;
@@ -94,6 +90,13 @@ struct kvmppc_vcpu_book3s {
 #endif
 	int context_id[SID_CONTEXTS];
 	ulong prog_flags; /* flags to inject when giving a 700 trap */
+
+	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
+	struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
+	struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
+	struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
+	int hpte_cache_count;
+	spinlock_t mmu_lock;
 };
 
 #define CONTEXT_HOST		0
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 6e05b2d13683..069eb9fc6c41 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -163,16 +163,18 @@ struct kvmppc_mmu {
 	bool (*is_dcbz32)(struct kvm_vcpu *vcpu);
 };
 
-struct hpte_cache {
-	struct hlist_node list_pte;
-	struct hlist_node list_pte_long;
-	struct hlist_node list_vpte;
-	struct hlist_node list_vpte_long;
-	struct rcu_head rcu_head;
-	u64 host_va;
-	u64 pfn;
-	ulong slot;
-	struct kvmppc_pte pte;
+struct kvmppc_slb {
+	u64 esid;
+	u64 vsid;
+	u64 orige;
+	u64 origv;
+	bool valid	: 1;
+	bool Ks		: 1;
+	bool Kp		: 1;
+	bool nx		: 1;
+	bool large	: 1;	/* PTEs are 16MB */
+	bool tb		: 1;	/* 1TB segment */
+	bool class	: 1;
 };
 
 struct kvm_vcpu_arch {
@@ -187,6 +189,9 @@ struct kvm_vcpu_arch {
 	ulong highmem_handler;
 	ulong rmcall;
 	ulong host_paca_phys;
+	struct kvmppc_slb slb[64];
+	int slb_max;		/* # valid entries in slb[] */
+	int slb_nr;		/* total number of entries in SLB */
 	struct kvmppc_mmu mmu;
 #endif
 
@@ -305,15 +310,6 @@ struct kvm_vcpu_arch {
 	struct kvm_vcpu_arch_shared *shared;
 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
 	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
-
-#ifdef CONFIG_PPC_BOOK3S
-	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
-	struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
-	struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
-	struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
-	int hpte_cache_count;
-	spinlock_t mmu_lock;
-#endif
 };
 
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 83500fb62c83..5d0babefe913 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -17,7 +17,6 @@
 #include <linux/kvm_host.h>
 #include <linux/err.h>
 #include <linux/slab.h>
-#include "trace.h"
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -34,6 +33,8 @@
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 
+#include "trace.h"
+
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 /* #define EXIT_DEBUG */
@@ -1191,8 +1192,8 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
 	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
 		for (i = 0; i < 64; i++) {
-			sregs->u.s.ppc64.slb[i].slbe = vcpu3s->slb[i].orige | i;
-			sregs->u.s.ppc64.slb[i].slbv = vcpu3s->slb[i].origv;
+			sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige | i;
+			sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
 		}
 	} else {
 		for (i = 0; i < 16; i++)
@@ -1340,7 +1341,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	vcpu->arch.pvr = 0x84202;
 #endif
 	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
-	vcpu_book3s->slb_nr = 64;
+	vcpu->arch.slb_nr = 64;
 
 	/* remember where some real-mode handlers are */
 	vcpu->arch.trampoline_lowmem = __pa(kvmppc_handler_lowmem_trampoline);
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index d7889ef3211e..c6d3e194b6b4 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -41,36 +41,36 @@ static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
 }
 
 static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
-				struct kvmppc_vcpu_book3s *vcpu_book3s,
+				struct kvm_vcpu *vcpu,
 				gva_t eaddr)
 {
 	int i;
 	u64 esid = GET_ESID(eaddr);
 	u64 esid_1t = GET_ESID_1T(eaddr);
 
-	for (i = 0; i < vcpu_book3s->slb_nr; i++) {
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
 		u64 cmp_esid = esid;
 
-		if (!vcpu_book3s->slb[i].valid)
+		if (!vcpu->arch.slb[i].valid)
 			continue;
 
-		if (vcpu_book3s->slb[i].tb)
+		if (vcpu->arch.slb[i].tb)
 			cmp_esid = esid_1t;
 
-		if (vcpu_book3s->slb[i].esid == cmp_esid)
-			return &vcpu_book3s->slb[i];
+		if (vcpu->arch.slb[i].esid == cmp_esid)
+			return &vcpu->arch.slb[i];
 	}
 
 	dprintk("KVM: No SLB entry found for 0x%lx [%llx | %llx]\n",
 		eaddr, esid, esid_1t);
-	for (i = 0; i < vcpu_book3s->slb_nr; i++) {
-	    if (vcpu_book3s->slb[i].vsid)
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+	    if (vcpu->arch.slb[i].vsid)
 		dprintk("  %d: %c%c%c %llx %llx\n", i,
-			vcpu_book3s->slb[i].valid ? 'v' : ' ',
-			vcpu_book3s->slb[i].large ? 'l' : ' ',
-			vcpu_book3s->slb[i].tb    ? 't' : ' ',
-			vcpu_book3s->slb[i].esid,
-			vcpu_book3s->slb[i].vsid);
+			vcpu->arch.slb[i].valid ? 'v' : ' ',
+			vcpu->arch.slb[i].large ? 'l' : ' ',
+			vcpu->arch.slb[i].tb    ? 't' : ' ',
+			vcpu->arch.slb[i].esid,
+			vcpu->arch.slb[i].vsid);
 	}
 
 	return NULL;
@@ -81,7 +81,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
 {
 	struct kvmppc_slb *slb;
 
-	slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), eaddr);
+	slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
 	if (!slb)
 		return 0;
 
@@ -180,7 +180,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 		return 0;
 	}
 
-	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr);
+	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
 	if (!slbe)
 		goto no_seg_found;
 
@@ -320,10 +320,10 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
 	esid_1t = GET_ESID_1T(rb);
 	slb_nr = rb & 0xfff;
 
-	if (slb_nr > vcpu_book3s->slb_nr)
+	if (slb_nr > vcpu->arch.slb_nr)
 		return;
 
-	slbe = &vcpu_book3s->slb[slb_nr];
+	slbe = &vcpu->arch.slb[slb_nr];
 
 	slbe->large = (rs & SLB_VSID_L) ? 1 : 0;
 	slbe->tb    = (rs & SLB_VSID_B_1T) ? 1 : 0;
@@ -344,38 +344,35 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
 
 static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr)
 {
-	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 	struct kvmppc_slb *slbe;
 
-	if (slb_nr > vcpu_book3s->slb_nr)
+	if (slb_nr > vcpu->arch.slb_nr)
 		return 0;
 
-	slbe = &vcpu_book3s->slb[slb_nr];
+	slbe = &vcpu->arch.slb[slb_nr];
 
 	return slbe->orige;
 }
 
 static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr)
 {
-	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 	struct kvmppc_slb *slbe;
 
-	if (slb_nr > vcpu_book3s->slb_nr)
+	if (slb_nr > vcpu->arch.slb_nr)
 		return 0;
 
-	slbe = &vcpu_book3s->slb[slb_nr];
+	slbe = &vcpu->arch.slb[slb_nr];
 
 	return slbe->origv;
 }
 
 static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
 {
-	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 	struct kvmppc_slb *slbe;
 
 	dprintk("KVM MMU: slbie(0x%llx)\n", ea);
 
-	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, ea);
+	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
 
 	if (!slbe)
 		return;
@@ -389,13 +386,12 @@ static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
 
 static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
 {
-	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 	int i;
 
 	dprintk("KVM MMU: slbia()\n");
 
-	for (i = 1; i < vcpu_book3s->slb_nr; i++)
-		vcpu_book3s->slb[i].valid = false;
+	for (i = 1; i < vcpu->arch.slb_nr; i++)
+		vcpu->arch.slb[i].valid = false;
 
 	if (vcpu->arch.shared->msr & MSR_IR) {
 		kvmppc_mmu_flush_segments(vcpu);
@@ -464,7 +460,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 	ulong mp_ea = vcpu->arch.magic_page_ea;
 
 	if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
-		slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea);
+		slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
 		if (slb)
 			gvsid = slb->vsid;
 	}
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 79751d8dd131..41cb0017e757 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -21,7 +21,6 @@
 #include <linux/kvm_host.h>
 #include <linux/hash.h>
 #include <linux/slab.h>
-#include "trace.h"
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -29,6 +28,8 @@
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
 
+#include "trace.h"
+
 #define PTE_SIZE	12
 
 static struct kmem_cache *hpte_cache;
@@ -58,30 +59,31 @@ static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage)
 void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
 	u64 index;
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 
 	trace_kvm_book3s_mmu_map(pte);
 
-	spin_lock(&vcpu->arch.mmu_lock);
+	spin_lock(&vcpu3s->mmu_lock);
 
 	/* Add to ePTE list */
 	index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
-	hlist_add_head_rcu(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
+	hlist_add_head_rcu(&pte->list_pte, &vcpu3s->hpte_hash_pte[index]);
 
 	/* Add to ePTE_long list */
 	index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr);
 	hlist_add_head_rcu(&pte->list_pte_long,
-			   &vcpu->arch.hpte_hash_pte_long[index]);
+			   &vcpu3s->hpte_hash_pte_long[index]);
 
 	/* Add to vPTE list */
 	index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
-	hlist_add_head_rcu(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
+	hlist_add_head_rcu(&pte->list_vpte, &vcpu3s->hpte_hash_vpte[index]);
 
 	/* Add to vPTE_long list */
 	index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
 	hlist_add_head_rcu(&pte->list_vpte_long,
-			   &vcpu->arch.hpte_hash_vpte_long[index]);
+			   &vcpu3s->hpte_hash_vpte_long[index]);
 
-	spin_unlock(&vcpu->arch.mmu_lock);
+	spin_unlock(&vcpu3s->mmu_lock);
 }
 
 static void free_pte_rcu(struct rcu_head *head)
@@ -92,16 +94,18 @@ static void free_pte_rcu(struct rcu_head *head)
 
 static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+
 	trace_kvm_book3s_mmu_invalidate(pte);
 
 	/* Different for 32 and 64 bit */
 	kvmppc_mmu_invalidate_pte(vcpu, pte);
 
-	spin_lock(&vcpu->arch.mmu_lock);
+	spin_lock(&vcpu3s->mmu_lock);
 
 	/* pte already invalidated in between? */
 	if (hlist_unhashed(&pte->list_pte)) {
-		spin_unlock(&vcpu->arch.mmu_lock);
+		spin_unlock(&vcpu3s->mmu_lock);
 		return;
 	}
 
@@ -115,14 +119,15 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 	else
 		kvm_release_pfn_clean(pte->pfn);
 
-	spin_unlock(&vcpu->arch.mmu_lock);
+	spin_unlock(&vcpu3s->mmu_lock);
 
-	vcpu->arch.hpte_cache_count--;
+	vcpu3s->hpte_cache_count--;
 	call_rcu(&pte->rcu_head, free_pte_rcu);
 }
 
 static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct hpte_cache *pte;
 	struct hlist_node *node;
 	int i;
@@ -130,7 +135,7 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
 	rcu_read_lock();
 
 	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
-		struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
+		struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i];
 
 		hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
 			invalidate_pte(vcpu, pte);
@@ -141,12 +146,13 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
 
 static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct hlist_head *list;
 	struct hlist_node *node;
 	struct hpte_cache *pte;
 
 	/* Find the list of entries in the map */
-	list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
+	list = &vcpu3s->hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
 
 	rcu_read_lock();
 
@@ -160,12 +166,13 @@ static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
 
 static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct hlist_head *list;
 	struct hlist_node *node;
 	struct hpte_cache *pte;
 
 	/* Find the list of entries in the map */
-	list = &vcpu->arch.hpte_hash_pte_long[
+	list = &vcpu3s->hpte_hash_pte_long[
 			kvmppc_mmu_hash_pte_long(guest_ea)];
 
 	rcu_read_lock();
@@ -203,12 +210,13 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
 /* Flush with mask 0xfffffffff */
 static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct hlist_head *list;
 	struct hlist_node *node;
 	struct hpte_cache *pte;
 	u64 vp_mask = 0xfffffffffULL;
 
-	list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
+	list = &vcpu3s->hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
 
 	rcu_read_lock();
 
@@ -223,12 +231,13 @@ static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
 /* Flush with mask 0xffffff000 */
 static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct hlist_head *list;
 	struct hlist_node *node;
 	struct hpte_cache *pte;
 	u64 vp_mask = 0xffffff000ULL;
 
-	list = &vcpu->arch.hpte_hash_vpte_long[
+	list = &vcpu3s->hpte_hash_vpte_long[
 		kvmppc_mmu_hash_vpte_long(guest_vp)];
 
 	rcu_read_lock();
@@ -261,6 +270,7 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
 
 void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct hlist_node *node;
 	struct hpte_cache *pte;
 	int i;
@@ -270,7 +280,7 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
 	rcu_read_lock();
 
 	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
-		struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
+		struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i];
 
 		hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
 			if ((pte->pte.raddr >= pa_start) &&
@@ -283,12 +293,13 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
 
 struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct hpte_cache *pte;
 
 	pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
-	vcpu->arch.hpte_cache_count++;
+	vcpu3s->hpte_cache_count++;
 
-	if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM)
+	if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM)
 		kvmppc_mmu_pte_flush_all(vcpu);
 
 	return pte;
@@ -309,17 +320,19 @@ static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len)
 
 int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
 {
-	/* init hpte lookup hashes */
-	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte,
-				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte));
-	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte_long,
-				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte_long));
-	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte,
-				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte));
-	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
-				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long));
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 
-	spin_lock_init(&vcpu->arch.mmu_lock);
+	/* init hpte lookup hashes */
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_pte));
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte_long,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_pte_long));
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_vpte));
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long));
+
+	spin_lock_init(&vcpu3s->mmu_lock);
 
 	return 0;
 }
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index 3aca1b042b8c..d62a14b2cd0f 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -252,7 +252,7 @@ TRACE_EVENT(kvm_book3s_mmu_flush,
 	),
 
 	TP_fast_assign(
-		__entry->count		= vcpu->arch.hpte_cache_count;
+		__entry->count		= to_book3s(vcpu)->hpte_cache_count;
 		__entry->p1		= p1;
 		__entry->p2		= p2;
 		__entry->type		= type;

From f05ed4d56e9cff1c46d2b3049ba0c72e7e29392f Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:17:58 +0000
Subject: [PATCH 101/143] KVM: PPC: Split out code from book3s.c into
 book3s_pr.c

In preparation for adding code to enable KVM to use hypervisor mode
on 64-bit Book 3S processors, this splits book3s.c into two files,
book3s.c and book3s_pr.c, where book3s_pr.c contains the code that is
specific to running the guest in problem state (user mode) and book3s.c
contains code which should apply to all Book 3S processors.

In doing this, we abstract some details, namely the interrupt offset,
the updating of the interrupt pending flag, and the detection of whether
the guest is in a critical section.  These are all things that will be
different when we use hypervisor mode.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_book3s.h |   35 +
 arch/powerpc/kvm/Makefile             |    2 +
 arch/powerpc/kvm/book3s.c             |  995 +-----------------------
 arch/powerpc/kvm/book3s_pr.c          | 1009 +++++++++++++++++++++++++
 4 files changed, 1053 insertions(+), 988 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_pr.c

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index f7b2bafe7047..480fff6090db 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -113,6 +113,7 @@ extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong ea, ulong ea_mask)
 extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask);
 extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end);
 extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
+extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr);
 extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
@@ -150,6 +151,20 @@ static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
 }
 
+static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
+{
+	return to_book3s(vcpu)->hior;
+}
+
+static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
+			unsigned long pending_now, unsigned long old_pending)
+{
+	if (pending_now)
+		vcpu->arch.shared->int_pending = 1;
+	else if (old_pending)
+		vcpu->arch.shared->int_pending = 0;
+}
+
 static inline ulong dsisr(void)
 {
 	ulong r;
@@ -247,6 +262,26 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 	return to_svcpu(vcpu)->fault_dar;
 }
 
+static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
+{
+	ulong crit_raw = vcpu->arch.shared->critical;
+	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
+	bool crit;
+
+	/* Truncate crit indicators in 32 bit mode */
+	if (!(vcpu->arch.shared->msr & MSR_SF)) {
+		crit_raw &= 0xffffffff;
+		crit_r1 &= 0xffffffff;
+	}
+
+	/* Critical section when crit == r1 */
+	crit = (crit_raw == crit_r1);
+	/* ... and we're in supervisor mode */
+	crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
+
+	return crit;
+}
+
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3			0x113724FA
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 4d6863823f69..bf9854fa7f63 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -43,6 +43,7 @@ kvm-book3s_64-objs := \
 	fpu.o \
 	book3s_paired_singles.o \
 	book3s.o \
+	book3s_pr.o \
 	book3s_emulate.o \
 	book3s_interrupts.o \
 	book3s_mmu_hpte.o \
@@ -56,6 +57,7 @@ kvm-book3s_32-objs := \
 	fpu.o \
 	book3s_paired_singles.o \
 	book3s.o \
+	book3s_pr.o \
 	book3s_emulate.o \
 	book3s_interrupts.o \
 	book3s_mmu_hpte.o \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 5d0babefe913..163e3e13c87c 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -38,17 +38,6 @@
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 /* #define EXIT_DEBUG */
-/* #define DEBUG_EXT */
-
-static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
-			     ulong msr);
-
-/* Some compatibility defines */
-#ifdef CONFIG_PPC_BOOK3S_32
-#define MSR_USER32 MSR_USER
-#define MSR_USER64 MSR_USER
-#define HW_PAGE_SIZE PAGE_SIZE
-#endif
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "exits",       VCPU_STAT(sum_exits) },
@@ -79,100 +68,11 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 {
 }
 
-void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-#ifdef CONFIG_PPC_BOOK3S_64
-	memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb));
-	memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
-	       sizeof(get_paca()->shadow_vcpu));
-	to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max;
-#endif
-
-#ifdef CONFIG_PPC_BOOK3S_32
-	current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
-#endif
-}
-
-void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_PPC_BOOK3S_64
-	memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb));
-	memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
-	       sizeof(get_paca()->shadow_vcpu));
-	to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max;
-#endif
-
-	kvmppc_giveup_ext(vcpu, MSR_FP);
-	kvmppc_giveup_ext(vcpu, MSR_VEC);
-	kvmppc_giveup_ext(vcpu, MSR_VSX);
-}
-
-static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
-{
-	ulong smsr = vcpu->arch.shared->msr;
-
-	/* Guest MSR values */
-	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE;
-	/* Process MSR values */
-	smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
-	/* External providers the guest reserved */
-	smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext);
-	/* 64-bit Process MSR values */
-#ifdef CONFIG_PPC_BOOK3S_64
-	smsr |= MSR_ISF | MSR_HV;
-#endif
-	vcpu->arch.shadow_msr = smsr;
-}
-
-void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
-{
-	ulong old_msr = vcpu->arch.shared->msr;
-
-#ifdef EXIT_DEBUG
-	printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
-#endif
-
-	msr &= to_book3s(vcpu)->msr_mask;
-	vcpu->arch.shared->msr = msr;
-	kvmppc_recalc_shadow_msr(vcpu);
-
-	if (msr & MSR_POW) {
-		if (!vcpu->arch.pending_exceptions) {
-			kvm_vcpu_block(vcpu);
-			vcpu->stat.halt_wakeup++;
-
-			/* Unset POW bit after we woke up */
-			msr &= ~MSR_POW;
-			vcpu->arch.shared->msr = msr;
-		}
-	}
-
-	if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) !=
-		   (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
-		kvmppc_mmu_flush_segments(vcpu);
-		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
-
-		/* Preload magic page segment when in kernel mode */
-		if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) {
-			struct kvm_vcpu_arch *a = &vcpu->arch;
-
-			if (msr & MSR_DR)
-				kvmppc_mmu_map_segment(vcpu, a->magic_page_ea);
-			else
-				kvmppc_mmu_map_segment(vcpu, a->magic_page_pa);
-		}
-	}
-
-	/* Preload FPU if it's enabled */
-	if (vcpu->arch.shared->msr & MSR_FP)
-		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
-}
-
 void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
 {
 	vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu);
 	vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags;
-	kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec);
+	kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec);
 	vcpu->arch.mmu.reset_msr(vcpu);
 }
 
@@ -206,11 +106,13 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
 static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
 					  unsigned int vec)
 {
+	unsigned long old_pending = vcpu->arch.pending_exceptions;
+
 	clear_bit(kvmppc_book3s_vec2irqprio(vec),
 		  &vcpu->arch.pending_exceptions);
 
-	if (!vcpu->arch.pending_exceptions)
-		vcpu->arch.shared->int_pending = 0;
+	kvmppc_update_int_pending(vcpu, vcpu->arch.pending_exceptions,
+				  old_pending);
 }
 
 void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
@@ -269,20 +171,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 	int deliver = 1;
 	int vec = 0;
 	ulong flags = 0ULL;
-	ulong crit_raw = vcpu->arch.shared->critical;
-	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
-	bool crit;
-
-	/* Truncate crit indicators in 32 bit mode */
-	if (!(vcpu->arch.shared->msr & MSR_SF)) {
-		crit_raw &= 0xffffffff;
-		crit_r1 &= 0xffffffff;
-	}
-
-	/* Critical section when crit == r1 */
-	crit = (crit_raw == crit_r1);
-	/* ... and we're in supervisor mode */
-	crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
+	bool crit = kvmppc_critical_section(vcpu);
 
 	switch (priority) {
 	case BOOK3S_IRQPRIO_DECREMENTER:
@@ -394,64 +283,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 	}
 
 	/* Tell the guest about our interrupt status */
-	if (*pending)
-		vcpu->arch.shared->int_pending = 1;
-	else if (old_pending)
-		vcpu->arch.shared->int_pending = 0;
-}
-
-void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
-{
-	u32 host_pvr;
-
-	vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB;
-	vcpu->arch.pvr = pvr;
-#ifdef CONFIG_PPC_BOOK3S_64
-	if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
-		kvmppc_mmu_book3s_64_init(vcpu);
-		to_book3s(vcpu)->hior = 0xfff00000;
-		to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
-	} else
-#endif
-	{
-		kvmppc_mmu_book3s_32_init(vcpu);
-		to_book3s(vcpu)->hior = 0;
-		to_book3s(vcpu)->msr_mask = 0xffffffffULL;
-	}
-
-	/* If we are in hypervisor level on 970, we can tell the CPU to
-	 * treat DCBZ as 32 bytes store */
-	vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
-	if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) &&
-	    !strcmp(cur_cpu_spec->platform, "ppc970"))
-		vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
-
-	/* Cell performs badly if MSR_FEx are set. So let's hope nobody
-	   really needs them in a VM on Cell and force disable them. */
-	if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be"))
-		to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1);
-
-#ifdef CONFIG_PPC_BOOK3S_32
-	/* 32 bit Book3S always has 32 byte dcbz */
-	vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
-#endif
-
-	/* On some CPUs we can execute paired single operations natively */
-	asm ( "mfpvr %0" : "=r"(host_pvr));
-	switch (host_pvr) {
-	case 0x00080200:	/* lonestar 2.0 */
-	case 0x00088202:	/* lonestar 2.2 */
-	case 0x70000100:	/* gekko 1.0 */
-	case 0x00080100:	/* gekko 2.0 */
-	case 0x00083203:	/* gekko 2.3a */
-	case 0x00083213:	/* gekko 2.3b */
-	case 0x00083204:	/* gekko 2.4 */
-	case 0x00083214:	/* gekko 2.4e (8SE) - retail HW2 */
-	case 0x00087200:	/* broadway */
-		vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS;
-		/* Enable HID2.PSE - in case we need it later */
-		mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29));
-	}
+	kvmppc_update_int_pending(vcpu, *pending, old_pending);
 }
 
 pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -473,44 +305,6 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 	return gfn_to_pfn(vcpu->kvm, gfn);
 }
 
-/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To
- * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to
- * emulate 32 bytes dcbz length.
- *
- * The Book3s_64 inventors also realized this case and implemented a special bit
- * in the HID5 register, which is a hypervisor ressource. Thus we can't use it.
- *
- * My approach here is to patch the dcbz instruction on executing pages.
- */
-static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
-{
-	struct page *hpage;
-	u64 hpage_offset;
-	u32 *page;
-	int i;
-
-	hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
-	if (is_error_page(hpage)) {
-		kvm_release_page_clean(hpage);
-		return;
-	}
-
-	hpage_offset = pte->raddr & ~PAGE_MASK;
-	hpage_offset &= ~0xFFFULL;
-	hpage_offset /= 4;
-
-	get_page(hpage);
-	page = kmap_atomic(hpage, KM_USER0);
-
-	/* patch dcbz into reserved instruction, so we trap */
-	for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++)
-		if ((page[i] & 0xff0007ff) == INS_DCBZ)
-			page[i] &= 0xfffffff7;
-
-	kunmap_atomic(page, KM_USER0);
-	put_page(hpage);
-}
-
 static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data,
 			 struct kvmppc_pte *pte)
 {
@@ -608,519 +402,6 @@ mmio:
 	return EMULATE_DO_MMIO;
 }
 
-static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-	ulong mp_pa = vcpu->arch.magic_page_pa;
-
-	if (unlikely(mp_pa) &&
-	    unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) {
-		return 1;
-	}
-
-	return kvm_is_visible_gfn(vcpu->kvm, gfn);
-}
-
-int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
-			    ulong eaddr, int vec)
-{
-	bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE);
-	int r = RESUME_GUEST;
-	int relocated;
-	int page_found = 0;
-	struct kvmppc_pte pte;
-	bool is_mmio = false;
-	bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false;
-	bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false;
-	u64 vsid;
-
-	relocated = data ? dr : ir;
-
-	/* Resolve real address if translation turned on */
-	if (relocated) {
-		page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data);
-	} else {
-		pte.may_execute = true;
-		pte.may_read = true;
-		pte.may_write = true;
-		pte.raddr = eaddr & KVM_PAM;
-		pte.eaddr = eaddr;
-		pte.vpage = eaddr >> 12;
-	}
-
-	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
-	case 0:
-		pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
-		break;
-	case MSR_DR:
-	case MSR_IR:
-		vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
-
-		if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR)
-			pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
-		else
-			pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
-		pte.vpage |= vsid;
-
-		if (vsid == -1)
-			page_found = -EINVAL;
-		break;
-	}
-
-	if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
-	   (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
-		/*
-		 * If we do the dcbz hack, we have to NX on every execution,
-		 * so we can patch the executing code. This renders our guest
-		 * NX-less.
-		 */
-		pte.may_execute = !data;
-	}
-
-	if (page_found == -ENOENT) {
-		/* Page not found in guest PTE entries */
-		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-		vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
-		vcpu->arch.shared->msr |=
-			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
-		kvmppc_book3s_queue_irqprio(vcpu, vec);
-	} else if (page_found == -EPERM) {
-		/* Storage protection */
-		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-		vcpu->arch.shared->dsisr =
-			to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
-		vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
-		vcpu->arch.shared->msr |=
-			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
-		kvmppc_book3s_queue_irqprio(vcpu, vec);
-	} else if (page_found == -EINVAL) {
-		/* Page not found in guest SLB */
-		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-		kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
-	} else if (!is_mmio &&
-		   kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
-		/* The guest's PTE is not mapped yet. Map on the host */
-		kvmppc_mmu_map_page(vcpu, &pte);
-		if (data)
-			vcpu->stat.sp_storage++;
-		else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
-			(!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32)))
-			kvmppc_patch_dcbz(vcpu, &pte);
-	} else {
-		/* MMIO */
-		vcpu->stat.mmio_exits++;
-		vcpu->arch.paddr_accessed = pte.raddr;
-		r = kvmppc_emulate_mmio(run, vcpu);
-		if ( r == RESUME_HOST_NV )
-			r = RESUME_HOST;
-	}
-
-	return r;
-}
-
-static inline int get_fpr_index(int i)
-{
-#ifdef CONFIG_VSX
-	i *= 2;
-#endif
-	return i;
-}
-
-/* Give up external provider (FPU, Altivec, VSX) */
-void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
-{
-	struct thread_struct *t = &current->thread;
-	u64 *vcpu_fpr = vcpu->arch.fpr;
-#ifdef CONFIG_VSX
-	u64 *vcpu_vsx = vcpu->arch.vsr;
-#endif
-	u64 *thread_fpr = (u64*)t->fpr;
-	int i;
-
-	if (!(vcpu->arch.guest_owned_ext & msr))
-		return;
-
-#ifdef DEBUG_EXT
-	printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
-#endif
-
-	switch (msr) {
-	case MSR_FP:
-		giveup_fpu(current);
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
-			vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
-
-		vcpu->arch.fpscr = t->fpscr.val;
-		break;
-	case MSR_VEC:
-#ifdef CONFIG_ALTIVEC
-		giveup_altivec(current);
-		memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
-		vcpu->arch.vscr = t->vscr;
-#endif
-		break;
-	case MSR_VSX:
-#ifdef CONFIG_VSX
-		__giveup_vsx(current);
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
-			vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
-#endif
-		break;
-	default:
-		BUG();
-	}
-
-	vcpu->arch.guest_owned_ext &= ~msr;
-	current->thread.regs->msr &= ~msr;
-	kvmppc_recalc_shadow_msr(vcpu);
-}
-
-static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
-{
-	ulong srr0 = kvmppc_get_pc(vcpu);
-	u32 last_inst = kvmppc_get_last_inst(vcpu);
-	int ret;
-
-	ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
-	if (ret == -ENOENT) {
-		ulong msr = vcpu->arch.shared->msr;
-
-		msr = kvmppc_set_field(msr, 33, 33, 1);
-		msr = kvmppc_set_field(msr, 34, 36, 0);
-		vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0);
-		kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
-		return EMULATE_AGAIN;
-	}
-
-	return EMULATE_DONE;
-}
-
-static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr)
-{
-
-	/* Need to do paired single emulation? */
-	if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
-		return EMULATE_DONE;
-
-	/* Read out the instruction */
-	if (kvmppc_read_inst(vcpu) == EMULATE_DONE)
-		/* Need to emulate */
-		return EMULATE_FAIL;
-
-	return EMULATE_AGAIN;
-}
-
-/* Handle external providers (FPU, Altivec, VSX) */
-static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
-			     ulong msr)
-{
-	struct thread_struct *t = &current->thread;
-	u64 *vcpu_fpr = vcpu->arch.fpr;
-#ifdef CONFIG_VSX
-	u64 *vcpu_vsx = vcpu->arch.vsr;
-#endif
-	u64 *thread_fpr = (u64*)t->fpr;
-	int i;
-
-	/* When we have paired singles, we emulate in software */
-	if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
-		return RESUME_GUEST;
-
-	if (!(vcpu->arch.shared->msr & msr)) {
-		kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-		return RESUME_GUEST;
-	}
-
-	/* We already own the ext */
-	if (vcpu->arch.guest_owned_ext & msr) {
-		return RESUME_GUEST;
-	}
-
-#ifdef DEBUG_EXT
-	printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
-#endif
-
-	current->thread.regs->msr |= msr;
-
-	switch (msr) {
-	case MSR_FP:
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
-			thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
-
-		t->fpscr.val = vcpu->arch.fpscr;
-		t->fpexc_mode = 0;
-		kvmppc_load_up_fpu();
-		break;
-	case MSR_VEC:
-#ifdef CONFIG_ALTIVEC
-		memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
-		t->vscr = vcpu->arch.vscr;
-		t->vrsave = -1;
-		kvmppc_load_up_altivec();
-#endif
-		break;
-	case MSR_VSX:
-#ifdef CONFIG_VSX
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
-			thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
-		kvmppc_load_up_vsx();
-#endif
-		break;
-	default:
-		BUG();
-	}
-
-	vcpu->arch.guest_owned_ext |= msr;
-
-	kvmppc_recalc_shadow_msr(vcpu);
-
-	return RESUME_GUEST;
-}
-
-int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                       unsigned int exit_nr)
-{
-	int r = RESUME_HOST;
-
-	vcpu->stat.sum_exits++;
-
-	run->exit_reason = KVM_EXIT_UNKNOWN;
-	run->ready_for_interrupt_injection = 1;
-
-	trace_kvm_book3s_exit(exit_nr, vcpu);
-	kvm_resched(vcpu);
-	switch (exit_nr) {
-	case BOOK3S_INTERRUPT_INST_STORAGE:
-		vcpu->stat.pf_instruc++;
-
-#ifdef CONFIG_PPC_BOOK3S_32
-		/* We set segments as unused segments when invalidating them. So
-		 * treat the respective fault as segment fault. */
-		if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]
-		    == SR_INVALID) {
-			kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
-			r = RESUME_GUEST;
-			break;
-		}
-#endif
-
-		/* only care about PTEG not found errors, but leave NX alone */
-		if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) {
-			r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
-			vcpu->stat.sp_instruc++;
-		} else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
-			  (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
-			/*
-			 * XXX If we do the dcbz hack we use the NX bit to flush&patch the page,
-			 *     so we can't use the NX bit inside the guest. Let's cross our fingers,
-			 *     that no guest that needs the dcbz hack does NX.
-			 */
-			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
-			r = RESUME_GUEST;
-		} else {
-			vcpu->arch.shared->msr |=
-				to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
-			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-			r = RESUME_GUEST;
-		}
-		break;
-	case BOOK3S_INTERRUPT_DATA_STORAGE:
-	{
-		ulong dar = kvmppc_get_fault_dar(vcpu);
-		vcpu->stat.pf_storage++;
-
-#ifdef CONFIG_PPC_BOOK3S_32
-		/* We set segments as unused segments when invalidating them. So
-		 * treat the respective fault as segment fault. */
-		if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) {
-			kvmppc_mmu_map_segment(vcpu, dar);
-			r = RESUME_GUEST;
-			break;
-		}
-#endif
-
-		/* The only case we need to handle is missing shadow PTEs */
-		if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
-			r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
-		} else {
-			vcpu->arch.shared->dar = dar;
-			vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
-			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-			r = RESUME_GUEST;
-		}
-		break;
-	}
-	case BOOK3S_INTERRUPT_DATA_SEGMENT:
-		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
-			vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-			kvmppc_book3s_queue_irqprio(vcpu,
-				BOOK3S_INTERRUPT_DATA_SEGMENT);
-		}
-		r = RESUME_GUEST;
-		break;
-	case BOOK3S_INTERRUPT_INST_SEGMENT:
-		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) {
-			kvmppc_book3s_queue_irqprio(vcpu,
-				BOOK3S_INTERRUPT_INST_SEGMENT);
-		}
-		r = RESUME_GUEST;
-		break;
-	/* We're good on these - the host merely wanted to get our attention */
-	case BOOK3S_INTERRUPT_DECREMENTER:
-		vcpu->stat.dec_exits++;
-		r = RESUME_GUEST;
-		break;
-	case BOOK3S_INTERRUPT_EXTERNAL:
-		vcpu->stat.ext_intr_exits++;
-		r = RESUME_GUEST;
-		break;
-	case BOOK3S_INTERRUPT_PERFMON:
-		r = RESUME_GUEST;
-		break;
-	case BOOK3S_INTERRUPT_PROGRAM:
-	{
-		enum emulation_result er;
-		ulong flags;
-
-program_interrupt:
-		flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
-
-		if (vcpu->arch.shared->msr & MSR_PR) {
-#ifdef EXIT_DEBUG
-			printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
-#endif
-			if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) !=
-			    (INS_DCBZ & 0xfffffff7)) {
-				kvmppc_core_queue_program(vcpu, flags);
-				r = RESUME_GUEST;
-				break;
-			}
-		}
-
-		vcpu->stat.emulated_inst_exits++;
-		er = kvmppc_emulate_instruction(run, vcpu);
-		switch (er) {
-		case EMULATE_DONE:
-			r = RESUME_GUEST_NV;
-			break;
-		case EMULATE_AGAIN:
-			r = RESUME_GUEST;
-			break;
-		case EMULATE_FAIL:
-			printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
-			       __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
-			kvmppc_core_queue_program(vcpu, flags);
-			r = RESUME_GUEST;
-			break;
-		case EMULATE_DO_MMIO:
-			run->exit_reason = KVM_EXIT_MMIO;
-			r = RESUME_HOST_NV;
-			break;
-		default:
-			BUG();
-		}
-		break;
-	}
-	case BOOK3S_INTERRUPT_SYSCALL:
-		if (vcpu->arch.osi_enabled &&
-		    (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
-		    (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
-			/* MOL hypercalls */
-			u64 *gprs = run->osi.gprs;
-			int i;
-
-			run->exit_reason = KVM_EXIT_OSI;
-			for (i = 0; i < 32; i++)
-				gprs[i] = kvmppc_get_gpr(vcpu, i);
-			vcpu->arch.osi_needed = 1;
-			r = RESUME_HOST_NV;
-		} else if (!(vcpu->arch.shared->msr & MSR_PR) &&
-		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
-			/* KVM PV hypercalls */
-			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
-			r = RESUME_GUEST;
-		} else {
-			/* Guest syscalls */
-			vcpu->stat.syscall_exits++;
-			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-			r = RESUME_GUEST;
-		}
-		break;
-	case BOOK3S_INTERRUPT_FP_UNAVAIL:
-	case BOOK3S_INTERRUPT_ALTIVEC:
-	case BOOK3S_INTERRUPT_VSX:
-	{
-		int ext_msr = 0;
-
-		switch (exit_nr) {
-		case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP;  break;
-		case BOOK3S_INTERRUPT_ALTIVEC:    ext_msr = MSR_VEC; break;
-		case BOOK3S_INTERRUPT_VSX:        ext_msr = MSR_VSX; break;
-		}
-
-		switch (kvmppc_check_ext(vcpu, exit_nr)) {
-		case EMULATE_DONE:
-			/* everything ok - let's enable the ext */
-			r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr);
-			break;
-		case EMULATE_FAIL:
-			/* we need to emulate this instruction */
-			goto program_interrupt;
-			break;
-		default:
-			/* nothing to worry about - go again */
-			break;
-		}
-		break;
-	}
-	case BOOK3S_INTERRUPT_ALIGNMENT:
-		if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
-			vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu,
-				kvmppc_get_last_inst(vcpu));
-			vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu,
-				kvmppc_get_last_inst(vcpu));
-			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-		}
-		r = RESUME_GUEST;
-		break;
-	case BOOK3S_INTERRUPT_MACHINE_CHECK:
-	case BOOK3S_INTERRUPT_TRACE:
-		kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-		r = RESUME_GUEST;
-		break;
-	default:
-		/* Ugh - bork here! What did we get? */
-		printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
-			exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1);
-		r = RESUME_HOST;
-		BUG();
-		break;
-	}
-
-
-	if (!(r & RESUME_HOST)) {
-		/* To avoid clobbering exit_reason, only check for signals if
-		 * we aren't already exiting to userspace for some other
-		 * reason. */
-		if (signal_pending(current)) {
-#ifdef EXIT_DEBUG
-			printk(KERN_EMERG "KVM: Going back to host\n");
-#endif
-			vcpu->stat.signal_exits++;
-			run->exit_reason = KVM_EXIT_INTR;
-			r = -EINTR;
-		} else {
-			/* In case an interrupt came in that was triggered
-			 * from userspace (like DEC), we need to check what
-			 * to inject now! */
-			kvmppc_core_deliver_interrupts(vcpu);
-		}
-	}
-
-	trace_kvm_book3s_reenter(r, vcpu);
-
-	return r;
-}
-
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	return 0;
@@ -1181,69 +462,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	return 0;
 }
 
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
-                                  struct kvm_sregs *sregs)
-{
-	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
-	int i;
-
-	sregs->pvr = vcpu->arch.pvr;
-
-	sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
-	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
-		for (i = 0; i < 64; i++) {
-			sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige | i;
-			sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
-		}
-	} else {
-		for (i = 0; i < 16; i++)
-			sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i];
-
-		for (i = 0; i < 8; i++) {
-			sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
-			sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
-		}
-	}
-
-	return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-                                  struct kvm_sregs *sregs)
-{
-	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
-	int i;
-
-	kvmppc_set_pvr(vcpu, sregs->pvr);
-
-	vcpu3s->sdr1 = sregs->u.s.sdr1;
-	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
-		for (i = 0; i < 64; i++) {
-			vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
-						    sregs->u.s.ppc64.slb[i].slbe);
-		}
-	} else {
-		for (i = 0; i < 16; i++) {
-			vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
-		}
-		for (i = 0; i < 8; i++) {
-			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false,
-				       (u32)sregs->u.s.ppc32.ibat[i]);
-			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true,
-				       (u32)(sregs->u.s.ppc32.ibat[i] >> 32));
-			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false,
-				       (u32)sregs->u.s.ppc32.dbat[i]);
-			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true,
-				       (u32)(sregs->u.s.ppc32.dbat[i] >> 32));
-		}
-	}
-
-	/* Flush the MMU after messing with the segments */
-	kvmppc_mmu_pte_flush(vcpu, 0, 0);
-
-	return 0;
-}
-
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
 	return -ENOTSUPP;
@@ -1298,202 +516,3 @@ out:
 	mutex_unlock(&kvm->slots_lock);
 	return r;
 }
-
-int kvmppc_core_check_processor_compat(void)
-{
-	return 0;
-}
-
-struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
-{
-	struct kvmppc_vcpu_book3s *vcpu_book3s;
-	struct kvm_vcpu *vcpu;
-	int err = -ENOMEM;
-	unsigned long p;
-
-	vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
-	if (!vcpu_book3s)
-		goto out;
-
-	vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
-		kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
-	if (!vcpu_book3s->shadow_vcpu)
-		goto free_vcpu;
-
-	vcpu = &vcpu_book3s->vcpu;
-	err = kvm_vcpu_init(vcpu, kvm, id);
-	if (err)
-		goto free_shadow_vcpu;
-
-	p = __get_free_page(GFP_KERNEL|__GFP_ZERO);
-	/* the real shared page fills the last 4k of our page */
-	vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096);
-	if (!p)
-		goto uninit_vcpu;
-
-	vcpu->arch.host_retip = kvm_return_point;
-	vcpu->arch.host_msr = mfmsr();
-#ifdef CONFIG_PPC_BOOK3S_64
-	/* default to book3s_64 (970fx) */
-	vcpu->arch.pvr = 0x3C0301;
-#else
-	/* default to book3s_32 (750) */
-	vcpu->arch.pvr = 0x84202;
-#endif
-	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
-	vcpu->arch.slb_nr = 64;
-
-	/* remember where some real-mode handlers are */
-	vcpu->arch.trampoline_lowmem = __pa(kvmppc_handler_lowmem_trampoline);
-	vcpu->arch.trampoline_enter = __pa(kvmppc_handler_trampoline_enter);
-	vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
-#ifdef CONFIG_PPC_BOOK3S_64
-	vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
-#else
-	vcpu->arch.rmcall = (ulong)kvmppc_rmcall;
-#endif
-
-	vcpu->arch.shadow_msr = MSR_USER64;
-
-	err = kvmppc_mmu_init(vcpu);
-	if (err < 0)
-		goto uninit_vcpu;
-
-	return vcpu;
-
-uninit_vcpu:
-	kvm_vcpu_uninit(vcpu);
-free_shadow_vcpu:
-	kfree(vcpu_book3s->shadow_vcpu);
-free_vcpu:
-	vfree(vcpu_book3s);
-out:
-	return ERR_PTR(err);
-}
-
-void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
-{
-	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
-
-	free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
-	kvm_vcpu_uninit(vcpu);
-	kfree(vcpu_book3s->shadow_vcpu);
-	vfree(vcpu_book3s);
-}
-
-extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
-int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
-{
-	int ret;
-	double fpr[32][TS_FPRWIDTH];
-	unsigned int fpscr;
-	int fpexc_mode;
-#ifdef CONFIG_ALTIVEC
-	vector128 vr[32];
-	vector128 vscr;
-	unsigned long uninitialized_var(vrsave);
-	int used_vr;
-#endif
-#ifdef CONFIG_VSX
-	int used_vsr;
-#endif
-	ulong ext_msr;
-
-	/* No need to go into the guest when all we do is going out */
-	if (signal_pending(current)) {
-		kvm_run->exit_reason = KVM_EXIT_INTR;
-		return -EINTR;
-	}
-
-	/* Save FPU state in stack */
-	if (current->thread.regs->msr & MSR_FP)
-		giveup_fpu(current);
-	memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
-	fpscr = current->thread.fpscr.val;
-	fpexc_mode = current->thread.fpexc_mode;
-
-#ifdef CONFIG_ALTIVEC
-	/* Save Altivec state in stack */
-	used_vr = current->thread.used_vr;
-	if (used_vr) {
-		if (current->thread.regs->msr & MSR_VEC)
-			giveup_altivec(current);
-		memcpy(vr, current->thread.vr, sizeof(current->thread.vr));
-		vscr = current->thread.vscr;
-		vrsave = current->thread.vrsave;
-	}
-#endif
-
-#ifdef CONFIG_VSX
-	/* Save VSX state in stack */
-	used_vsr = current->thread.used_vsr;
-	if (used_vsr && (current->thread.regs->msr & MSR_VSX))
-			__giveup_vsx(current);
-#endif
-
-	/* Remember the MSR with disabled extensions */
-	ext_msr = current->thread.regs->msr;
-
-	/* XXX we get called with irq disabled - change that! */
-	local_irq_enable();
-
-	/* Preload FPU if it's enabled */
-	if (vcpu->arch.shared->msr & MSR_FP)
-		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
-
-	ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
-
-	local_irq_disable();
-
-	current->thread.regs->msr = ext_msr;
-
-	/* Make sure we save the guest FPU/Altivec/VSX state */
-	kvmppc_giveup_ext(vcpu, MSR_FP);
-	kvmppc_giveup_ext(vcpu, MSR_VEC);
-	kvmppc_giveup_ext(vcpu, MSR_VSX);
-
-	/* Restore FPU state from stack */
-	memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
-	current->thread.fpscr.val = fpscr;
-	current->thread.fpexc_mode = fpexc_mode;
-
-#ifdef CONFIG_ALTIVEC
-	/* Restore Altivec state from stack */
-	if (used_vr && current->thread.used_vr) {
-		memcpy(current->thread.vr, vr, sizeof(current->thread.vr));
-		current->thread.vscr = vscr;
-		current->thread.vrsave = vrsave;
-	}
-	current->thread.used_vr = used_vr;
-#endif
-
-#ifdef CONFIG_VSX
-	current->thread.used_vsr = used_vsr;
-#endif
-
-	return ret;
-}
-
-static int kvmppc_book3s_init(void)
-{
-	int r;
-
-	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
-		     THIS_MODULE);
-
-	if (r)
-		return r;
-
-	r = kvmppc_mmu_hpte_sysinit();
-
-	return r;
-}
-
-static void kvmppc_book3s_exit(void)
-{
-	kvmppc_mmu_hpte_sysexit();
-	kvm_exit();
-}
-
-module_init(kvmppc_book3s_init);
-module_exit(kvmppc_book3s_exit);
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
new file mode 100644
index 000000000000..fcdc97eb9411
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -0,0 +1,1009 @@
+/*
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *    Paul Mackerras <paulus@samba.org>
+ *
+ * Description:
+ * Functions relating to running KVM on Book 3S processors where
+ * we don't have access to hypervisor mode, and we run the guest
+ * in problem state (user mode).
+ *
+ * This file is derived from arch/powerpc/kvm/44x.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+#include "trace.h"
+
+/* #define EXIT_DEBUG */
+/* #define DEBUG_EXT */
+
+static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
+			     ulong msr);
+
+/* Some compatibility defines */
+#ifdef CONFIG_PPC_BOOK3S_32
+#define MSR_USER32 MSR_USER
+#define MSR_USER64 MSR_USER
+#define HW_PAGE_SIZE PAGE_SIZE
+#endif
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb));
+	memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
+	       sizeof(get_paca()->shadow_vcpu));
+	to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max;
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_32
+	current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
+#endif
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb));
+	memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
+	       sizeof(get_paca()->shadow_vcpu));
+	to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max;
+#endif
+
+	kvmppc_giveup_ext(vcpu, MSR_FP);
+	kvmppc_giveup_ext(vcpu, MSR_VEC);
+	kvmppc_giveup_ext(vcpu, MSR_VSX);
+}
+
+static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
+{
+	ulong smsr = vcpu->arch.shared->msr;
+
+	/* Guest MSR values */
+	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE;
+	/* Process MSR values */
+	smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
+	/* External providers the guest reserved */
+	smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext);
+	/* 64-bit Process MSR values */
+#ifdef CONFIG_PPC_BOOK3S_64
+	smsr |= MSR_ISF | MSR_HV;
+#endif
+	vcpu->arch.shadow_msr = smsr;
+}
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+{
+	ulong old_msr = vcpu->arch.shared->msr;
+
+#ifdef EXIT_DEBUG
+	printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
+#endif
+
+	msr &= to_book3s(vcpu)->msr_mask;
+	vcpu->arch.shared->msr = msr;
+	kvmppc_recalc_shadow_msr(vcpu);
+
+	if (msr & MSR_POW) {
+		if (!vcpu->arch.pending_exceptions) {
+			kvm_vcpu_block(vcpu);
+			vcpu->stat.halt_wakeup++;
+
+			/* Unset POW bit after we woke up */
+			msr &= ~MSR_POW;
+			vcpu->arch.shared->msr = msr;
+		}
+	}
+
+	if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) !=
+		   (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
+		kvmppc_mmu_flush_segments(vcpu);
+		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+
+		/* Preload magic page segment when in kernel mode */
+		if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) {
+			struct kvm_vcpu_arch *a = &vcpu->arch;
+
+			if (msr & MSR_DR)
+				kvmppc_mmu_map_segment(vcpu, a->magic_page_ea);
+			else
+				kvmppc_mmu_map_segment(vcpu, a->magic_page_pa);
+		}
+	}
+
+	/* Preload FPU if it's enabled */
+	if (vcpu->arch.shared->msr & MSR_FP)
+		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
+}
+
+void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
+{
+	u32 host_pvr;
+
+	vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB;
+	vcpu->arch.pvr = pvr;
+#ifdef CONFIG_PPC_BOOK3S_64
+	if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
+		kvmppc_mmu_book3s_64_init(vcpu);
+		to_book3s(vcpu)->hior = 0xfff00000;
+		to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
+	} else
+#endif
+	{
+		kvmppc_mmu_book3s_32_init(vcpu);
+		to_book3s(vcpu)->hior = 0;
+		to_book3s(vcpu)->msr_mask = 0xffffffffULL;
+	}
+
+	/* If we are in hypervisor level on 970, we can tell the CPU to
+	 * treat DCBZ as 32 bytes store */
+	vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
+	if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) &&
+	    !strcmp(cur_cpu_spec->platform, "ppc970"))
+		vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+
+	/* Cell performs badly if MSR_FEx are set. So let's hope nobody
+	   really needs them in a VM on Cell and force disable them. */
+	if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be"))
+		to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1);
+
+#ifdef CONFIG_PPC_BOOK3S_32
+	/* 32 bit Book3S always has 32 byte dcbz */
+	vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+#endif
+
+	/* On some CPUs we can execute paired single operations natively */
+	asm ( "mfpvr %0" : "=r"(host_pvr));
+	switch (host_pvr) {
+	case 0x00080200:	/* lonestar 2.0 */
+	case 0x00088202:	/* lonestar 2.2 */
+	case 0x70000100:	/* gekko 1.0 */
+	case 0x00080100:	/* gekko 2.0 */
+	case 0x00083203:	/* gekko 2.3a */
+	case 0x00083213:	/* gekko 2.3b */
+	case 0x00083204:	/* gekko 2.4 */
+	case 0x00083214:	/* gekko 2.4e (8SE) - retail HW2 */
+	case 0x00087200:	/* broadway */
+		vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS;
+		/* Enable HID2.PSE - in case we need it later */
+		mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29));
+	}
+}
+
+/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To
+ * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to
+ * emulate 32 bytes dcbz length.
+ *
+ * The Book3s_64 inventors also realized this case and implemented a special bit
+ * in the HID5 register, which is a hypervisor ressource. Thus we can't use it.
+ *
+ * My approach here is to patch the dcbz instruction on executing pages.
+ */
+static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
+{
+	struct page *hpage;
+	u64 hpage_offset;
+	u32 *page;
+	int i;
+
+	hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
+	if (is_error_page(hpage)) {
+		kvm_release_page_clean(hpage);
+		return;
+	}
+
+	hpage_offset = pte->raddr & ~PAGE_MASK;
+	hpage_offset &= ~0xFFFULL;
+	hpage_offset /= 4;
+
+	get_page(hpage);
+	page = kmap_atomic(hpage, KM_USER0);
+
+	/* patch dcbz into reserved instruction, so we trap */
+	for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++)
+		if ((page[i] & 0xff0007ff) == INS_DCBZ)
+			page[i] &= 0xfffffff7;
+
+	kunmap_atomic(page, KM_USER0);
+	put_page(hpage);
+}
+
+static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	ulong mp_pa = vcpu->arch.magic_page_pa;
+
+	if (unlikely(mp_pa) &&
+	    unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) {
+		return 1;
+	}
+
+	return kvm_is_visible_gfn(vcpu->kvm, gfn);
+}
+
+int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			    ulong eaddr, int vec)
+{
+	bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE);
+	int r = RESUME_GUEST;
+	int relocated;
+	int page_found = 0;
+	struct kvmppc_pte pte;
+	bool is_mmio = false;
+	bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false;
+	bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false;
+	u64 vsid;
+
+	relocated = data ? dr : ir;
+
+	/* Resolve real address if translation turned on */
+	if (relocated) {
+		page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data);
+	} else {
+		pte.may_execute = true;
+		pte.may_read = true;
+		pte.may_write = true;
+		pte.raddr = eaddr & KVM_PAM;
+		pte.eaddr = eaddr;
+		pte.vpage = eaddr >> 12;
+	}
+
+	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
+	case 0:
+		pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
+		break;
+	case MSR_DR:
+	case MSR_IR:
+		vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
+
+		if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR)
+			pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
+		else
+			pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
+		pte.vpage |= vsid;
+
+		if (vsid == -1)
+			page_found = -EINVAL;
+		break;
+	}
+
+	if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+	   (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
+		/*
+		 * If we do the dcbz hack, we have to NX on every execution,
+		 * so we can patch the executing code. This renders our guest
+		 * NX-less.
+		 */
+		pte.may_execute = !data;
+	}
+
+	if (page_found == -ENOENT) {
+		/* Page not found in guest PTE entries */
+		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+		vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
+		vcpu->arch.shared->msr |=
+			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
+		kvmppc_book3s_queue_irqprio(vcpu, vec);
+	} else if (page_found == -EPERM) {
+		/* Storage protection */
+		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+		vcpu->arch.shared->dsisr =
+			to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
+		vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
+		vcpu->arch.shared->msr |=
+			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
+		kvmppc_book3s_queue_irqprio(vcpu, vec);
+	} else if (page_found == -EINVAL) {
+		/* Page not found in guest SLB */
+		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+		kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
+	} else if (!is_mmio &&
+		   kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
+		/* The guest's PTE is not mapped yet. Map on the host */
+		kvmppc_mmu_map_page(vcpu, &pte);
+		if (data)
+			vcpu->stat.sp_storage++;
+		else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+			(!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32)))
+			kvmppc_patch_dcbz(vcpu, &pte);
+	} else {
+		/* MMIO */
+		vcpu->stat.mmio_exits++;
+		vcpu->arch.paddr_accessed = pte.raddr;
+		r = kvmppc_emulate_mmio(run, vcpu);
+		if ( r == RESUME_HOST_NV )
+			r = RESUME_HOST;
+	}
+
+	return r;
+}
+
+static inline int get_fpr_index(int i)
+{
+#ifdef CONFIG_VSX
+	i *= 2;
+#endif
+	return i;
+}
+
+/* Give up external provider (FPU, Altivec, VSX) */
+void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
+{
+	struct thread_struct *t = &current->thread;
+	u64 *vcpu_fpr = vcpu->arch.fpr;
+#ifdef CONFIG_VSX
+	u64 *vcpu_vsx = vcpu->arch.vsr;
+#endif
+	u64 *thread_fpr = (u64*)t->fpr;
+	int i;
+
+	if (!(vcpu->arch.guest_owned_ext & msr))
+		return;
+
+#ifdef DEBUG_EXT
+	printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
+#endif
+
+	switch (msr) {
+	case MSR_FP:
+		giveup_fpu(current);
+		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
+			vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
+
+		vcpu->arch.fpscr = t->fpscr.val;
+		break;
+	case MSR_VEC:
+#ifdef CONFIG_ALTIVEC
+		giveup_altivec(current);
+		memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
+		vcpu->arch.vscr = t->vscr;
+#endif
+		break;
+	case MSR_VSX:
+#ifdef CONFIG_VSX
+		__giveup_vsx(current);
+		for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
+			vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
+#endif
+		break;
+	default:
+		BUG();
+	}
+
+	vcpu->arch.guest_owned_ext &= ~msr;
+	current->thread.regs->msr &= ~msr;
+	kvmppc_recalc_shadow_msr(vcpu);
+}
+
+static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
+{
+	ulong srr0 = kvmppc_get_pc(vcpu);
+	u32 last_inst = kvmppc_get_last_inst(vcpu);
+	int ret;
+
+	ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
+	if (ret == -ENOENT) {
+		ulong msr = vcpu->arch.shared->msr;
+
+		msr = kvmppc_set_field(msr, 33, 33, 1);
+		msr = kvmppc_set_field(msr, 34, 36, 0);
+		vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0);
+		kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
+		return EMULATE_AGAIN;
+	}
+
+	return EMULATE_DONE;
+}
+
+static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr)
+{
+
+	/* Need to do paired single emulation? */
+	if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
+		return EMULATE_DONE;
+
+	/* Read out the instruction */
+	if (kvmppc_read_inst(vcpu) == EMULATE_DONE)
+		/* Need to emulate */
+		return EMULATE_FAIL;
+
+	return EMULATE_AGAIN;
+}
+
+/* Handle external providers (FPU, Altivec, VSX) */
+static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
+			     ulong msr)
+{
+	struct thread_struct *t = &current->thread;
+	u64 *vcpu_fpr = vcpu->arch.fpr;
+#ifdef CONFIG_VSX
+	u64 *vcpu_vsx = vcpu->arch.vsr;
+#endif
+	u64 *thread_fpr = (u64*)t->fpr;
+	int i;
+
+	/* When we have paired singles, we emulate in software */
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
+		return RESUME_GUEST;
+
+	if (!(vcpu->arch.shared->msr & msr)) {
+		kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+		return RESUME_GUEST;
+	}
+
+	/* We already own the ext */
+	if (vcpu->arch.guest_owned_ext & msr) {
+		return RESUME_GUEST;
+	}
+
+#ifdef DEBUG_EXT
+	printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
+#endif
+
+	current->thread.regs->msr |= msr;
+
+	switch (msr) {
+	case MSR_FP:
+		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
+			thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
+
+		t->fpscr.val = vcpu->arch.fpscr;
+		t->fpexc_mode = 0;
+		kvmppc_load_up_fpu();
+		break;
+	case MSR_VEC:
+#ifdef CONFIG_ALTIVEC
+		memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
+		t->vscr = vcpu->arch.vscr;
+		t->vrsave = -1;
+		kvmppc_load_up_altivec();
+#endif
+		break;
+	case MSR_VSX:
+#ifdef CONFIG_VSX
+		for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
+			thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
+		kvmppc_load_up_vsx();
+#endif
+		break;
+	default:
+		BUG();
+	}
+
+	vcpu->arch.guest_owned_ext |= msr;
+
+	kvmppc_recalc_shadow_msr(vcpu);
+
+	return RESUME_GUEST;
+}
+
+int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                       unsigned int exit_nr)
+{
+	int r = RESUME_HOST;
+
+	vcpu->stat.sum_exits++;
+
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	run->ready_for_interrupt_injection = 1;
+
+	trace_kvm_book3s_exit(exit_nr, vcpu);
+	kvm_resched(vcpu);
+	switch (exit_nr) {
+	case BOOK3S_INTERRUPT_INST_STORAGE:
+		vcpu->stat.pf_instruc++;
+
+#ifdef CONFIG_PPC_BOOK3S_32
+		/* We set segments as unused segments when invalidating them. So
+		 * treat the respective fault as segment fault. */
+		if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]
+		    == SR_INVALID) {
+			kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+			r = RESUME_GUEST;
+			break;
+		}
+#endif
+
+		/* only care about PTEG not found errors, but leave NX alone */
+		if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) {
+			r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
+			vcpu->stat.sp_instruc++;
+		} else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+			  (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
+			/*
+			 * XXX If we do the dcbz hack we use the NX bit to flush&patch the page,
+			 *     so we can't use the NX bit inside the guest. Let's cross our fingers,
+			 *     that no guest that needs the dcbz hack does NX.
+			 */
+			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
+			r = RESUME_GUEST;
+		} else {
+			vcpu->arch.shared->msr |=
+				to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+			r = RESUME_GUEST;
+		}
+		break;
+	case BOOK3S_INTERRUPT_DATA_STORAGE:
+	{
+		ulong dar = kvmppc_get_fault_dar(vcpu);
+		vcpu->stat.pf_storage++;
+
+#ifdef CONFIG_PPC_BOOK3S_32
+		/* We set segments as unused segments when invalidating them. So
+		 * treat the respective fault as segment fault. */
+		if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) {
+			kvmppc_mmu_map_segment(vcpu, dar);
+			r = RESUME_GUEST;
+			break;
+		}
+#endif
+
+		/* The only case we need to handle is missing shadow PTEs */
+		if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
+			r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
+		} else {
+			vcpu->arch.shared->dar = dar;
+			vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+			r = RESUME_GUEST;
+		}
+		break;
+	}
+	case BOOK3S_INTERRUPT_DATA_SEGMENT:
+		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
+			vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+			kvmppc_book3s_queue_irqprio(vcpu,
+				BOOK3S_INTERRUPT_DATA_SEGMENT);
+		}
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_INST_SEGMENT:
+		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) {
+			kvmppc_book3s_queue_irqprio(vcpu,
+				BOOK3S_INTERRUPT_INST_SEGMENT);
+		}
+		r = RESUME_GUEST;
+		break;
+	/* We're good on these - the host merely wanted to get our attention */
+	case BOOK3S_INTERRUPT_DECREMENTER:
+		vcpu->stat.dec_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_EXTERNAL:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PERFMON:
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PROGRAM:
+	{
+		enum emulation_result er;
+		ulong flags;
+
+program_interrupt:
+		flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
+
+		if (vcpu->arch.shared->msr & MSR_PR) {
+#ifdef EXIT_DEBUG
+			printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
+#endif
+			if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) !=
+			    (INS_DCBZ & 0xfffffff7)) {
+				kvmppc_core_queue_program(vcpu, flags);
+				r = RESUME_GUEST;
+				break;
+			}
+		}
+
+		vcpu->stat.emulated_inst_exits++;
+		er = kvmppc_emulate_instruction(run, vcpu);
+		switch (er) {
+		case EMULATE_DONE:
+			r = RESUME_GUEST_NV;
+			break;
+		case EMULATE_AGAIN:
+			r = RESUME_GUEST;
+			break;
+		case EMULATE_FAIL:
+			printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
+			       __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
+			kvmppc_core_queue_program(vcpu, flags);
+			r = RESUME_GUEST;
+			break;
+		case EMULATE_DO_MMIO:
+			run->exit_reason = KVM_EXIT_MMIO;
+			r = RESUME_HOST_NV;
+			break;
+		default:
+			BUG();
+		}
+		break;
+	}
+	case BOOK3S_INTERRUPT_SYSCALL:
+		if (vcpu->arch.osi_enabled &&
+		    (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
+		    (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
+			/* MOL hypercalls */
+			u64 *gprs = run->osi.gprs;
+			int i;
+
+			run->exit_reason = KVM_EXIT_OSI;
+			for (i = 0; i < 32; i++)
+				gprs[i] = kvmppc_get_gpr(vcpu, i);
+			vcpu->arch.osi_needed = 1;
+			r = RESUME_HOST_NV;
+		} else if (!(vcpu->arch.shared->msr & MSR_PR) &&
+		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
+			/* KVM PV hypercalls */
+			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+			r = RESUME_GUEST;
+		} else {
+			/* Guest syscalls */
+			vcpu->stat.syscall_exits++;
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+			r = RESUME_GUEST;
+		}
+		break;
+	case BOOK3S_INTERRUPT_FP_UNAVAIL:
+	case BOOK3S_INTERRUPT_ALTIVEC:
+	case BOOK3S_INTERRUPT_VSX:
+	{
+		int ext_msr = 0;
+
+		switch (exit_nr) {
+		case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP;  break;
+		case BOOK3S_INTERRUPT_ALTIVEC:    ext_msr = MSR_VEC; break;
+		case BOOK3S_INTERRUPT_VSX:        ext_msr = MSR_VSX; break;
+		}
+
+		switch (kvmppc_check_ext(vcpu, exit_nr)) {
+		case EMULATE_DONE:
+			/* everything ok - let's enable the ext */
+			r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr);
+			break;
+		case EMULATE_FAIL:
+			/* we need to emulate this instruction */
+			goto program_interrupt;
+			break;
+		default:
+			/* nothing to worry about - go again */
+			break;
+		}
+		break;
+	}
+	case BOOK3S_INTERRUPT_ALIGNMENT:
+		if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
+			vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu,
+				kvmppc_get_last_inst(vcpu));
+			vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu,
+				kvmppc_get_last_inst(vcpu));
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+		}
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_MACHINE_CHECK:
+	case BOOK3S_INTERRUPT_TRACE:
+		kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+		r = RESUME_GUEST;
+		break;
+	default:
+		/* Ugh - bork here! What did we get? */
+		printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
+			exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1);
+		r = RESUME_HOST;
+		BUG();
+		break;
+	}
+
+
+	if (!(r & RESUME_HOST)) {
+		/* To avoid clobbering exit_reason, only check for signals if
+		 * we aren't already exiting to userspace for some other
+		 * reason. */
+		if (signal_pending(current)) {
+#ifdef EXIT_DEBUG
+			printk(KERN_EMERG "KVM: Going back to host\n");
+#endif
+			vcpu->stat.signal_exits++;
+			run->exit_reason = KVM_EXIT_INTR;
+			r = -EINTR;
+		} else {
+			/* In case an interrupt came in that was triggered
+			 * from userspace (like DEC), we need to check what
+			 * to inject now! */
+			kvmppc_core_deliver_interrupts(vcpu);
+		}
+	}
+
+	trace_kvm_book3s_reenter(r, vcpu);
+
+	return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	int i;
+
+	sregs->pvr = vcpu->arch.pvr;
+
+	sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+		for (i = 0; i < 64; i++) {
+			sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige | i;
+			sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
+		}
+	} else {
+		for (i = 0; i < 16; i++)
+			sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i];
+
+		for (i = 0; i < 8; i++) {
+			sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
+			sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
+		}
+	}
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	int i;
+
+	kvmppc_set_pvr(vcpu, sregs->pvr);
+
+	vcpu3s->sdr1 = sregs->u.s.sdr1;
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+		for (i = 0; i < 64; i++) {
+			vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
+						    sregs->u.s.ppc64.slb[i].slbe);
+		}
+	} else {
+		for (i = 0; i < 16; i++) {
+			vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
+		}
+		for (i = 0; i < 8; i++) {
+			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false,
+				       (u32)sregs->u.s.ppc32.ibat[i]);
+			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true,
+				       (u32)(sregs->u.s.ppc32.ibat[i] >> 32));
+			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false,
+				       (u32)sregs->u.s.ppc32.dbat[i]);
+			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true,
+				       (u32)(sregs->u.s.ppc32.dbat[i] >> 32));
+		}
+	}
+
+	/* Flush the MMU after messing with the segments */
+	kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+	return 0;
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+	return 0;
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s;
+	struct kvm_vcpu *vcpu;
+	int err = -ENOMEM;
+	unsigned long p;
+
+	vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
+	if (!vcpu_book3s)
+		goto out;
+
+	vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
+		kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
+	if (!vcpu_book3s->shadow_vcpu)
+		goto free_vcpu;
+
+	vcpu = &vcpu_book3s->vcpu;
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_shadow_vcpu;
+
+	p = __get_free_page(GFP_KERNEL|__GFP_ZERO);
+	/* the real shared page fills the last 4k of our page */
+	vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096);
+	if (!p)
+		goto uninit_vcpu;
+
+	vcpu->arch.host_retip = kvm_return_point;
+	vcpu->arch.host_msr = mfmsr();
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* default to book3s_64 (970fx) */
+	vcpu->arch.pvr = 0x3C0301;
+#else
+	/* default to book3s_32 (750) */
+	vcpu->arch.pvr = 0x84202;
+#endif
+	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+	vcpu->arch.slb_nr = 64;
+
+	/* remember where some real-mode handlers are */
+	vcpu->arch.trampoline_lowmem = __pa(kvmppc_handler_lowmem_trampoline);
+	vcpu->arch.trampoline_enter = __pa(kvmppc_handler_trampoline_enter);
+	vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
+#ifdef CONFIG_PPC_BOOK3S_64
+	vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
+#else
+	vcpu->arch.rmcall = (ulong)kvmppc_rmcall;
+#endif
+
+	vcpu->arch.shadow_msr = MSR_USER64;
+
+	err = kvmppc_mmu_init(vcpu);
+	if (err < 0)
+		goto uninit_vcpu;
+
+	return vcpu;
+
+uninit_vcpu:
+	kvm_vcpu_uninit(vcpu);
+free_shadow_vcpu:
+	kfree(vcpu_book3s->shadow_vcpu);
+free_vcpu:
+	vfree(vcpu_book3s);
+out:
+	return ERR_PTR(err);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+
+	free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
+	kvm_vcpu_uninit(vcpu);
+	kfree(vcpu_book3s->shadow_vcpu);
+	vfree(vcpu_book3s);
+}
+
+extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	int ret;
+	double fpr[32][TS_FPRWIDTH];
+	unsigned int fpscr;
+	int fpexc_mode;
+#ifdef CONFIG_ALTIVEC
+	vector128 vr[32];
+	vector128 vscr;
+	unsigned long uninitialized_var(vrsave);
+	int used_vr;
+#endif
+#ifdef CONFIG_VSX
+	int used_vsr;
+#endif
+	ulong ext_msr;
+
+	/* No need to go into the guest when all we do is going out */
+	if (signal_pending(current)) {
+		kvm_run->exit_reason = KVM_EXIT_INTR;
+		return -EINTR;
+	}
+
+	/* Save FPU state in stack */
+	if (current->thread.regs->msr & MSR_FP)
+		giveup_fpu(current);
+	memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
+	fpscr = current->thread.fpscr.val;
+	fpexc_mode = current->thread.fpexc_mode;
+
+#ifdef CONFIG_ALTIVEC
+	/* Save Altivec state in stack */
+	used_vr = current->thread.used_vr;
+	if (used_vr) {
+		if (current->thread.regs->msr & MSR_VEC)
+			giveup_altivec(current);
+		memcpy(vr, current->thread.vr, sizeof(current->thread.vr));
+		vscr = current->thread.vscr;
+		vrsave = current->thread.vrsave;
+	}
+#endif
+
+#ifdef CONFIG_VSX
+	/* Save VSX state in stack */
+	used_vsr = current->thread.used_vsr;
+	if (used_vsr && (current->thread.regs->msr & MSR_VSX))
+			__giveup_vsx(current);
+#endif
+
+	/* Remember the MSR with disabled extensions */
+	ext_msr = current->thread.regs->msr;
+
+	/* XXX we get called with irq disabled - change that! */
+	local_irq_enable();
+
+	/* Preload FPU if it's enabled */
+	if (vcpu->arch.shared->msr & MSR_FP)
+		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
+
+	ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
+
+	local_irq_disable();
+
+	current->thread.regs->msr = ext_msr;
+
+	/* Make sure we save the guest FPU/Altivec/VSX state */
+	kvmppc_giveup_ext(vcpu, MSR_FP);
+	kvmppc_giveup_ext(vcpu, MSR_VEC);
+	kvmppc_giveup_ext(vcpu, MSR_VSX);
+
+	/* Restore FPU state from stack */
+	memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
+	current->thread.fpscr.val = fpscr;
+	current->thread.fpexc_mode = fpexc_mode;
+
+#ifdef CONFIG_ALTIVEC
+	/* Restore Altivec state from stack */
+	if (used_vr && current->thread.used_vr) {
+		memcpy(current->thread.vr, vr, sizeof(current->thread.vr));
+		current->thread.vscr = vscr;
+		current->thread.vrsave = vrsave;
+	}
+	current->thread.used_vr = used_vr;
+#endif
+
+#ifdef CONFIG_VSX
+	current->thread.used_vsr = used_vsr;
+#endif
+
+	return ret;
+}
+
+static int kvmppc_book3s_init(void)
+{
+	int r;
+
+	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
+		     THIS_MODULE);
+
+	if (r)
+		return r;
+
+	r = kvmppc_mmu_hpte_sysinit();
+
+	return r;
+}
+
+static void kvmppc_book3s_exit(void)
+{
+	kvmppc_mmu_hpte_sysexit();
+	kvm_exit();
+}
+
+module_init(kvmppc_book3s_init);
+module_exit(kvmppc_book3s_exit);

From b01c8b54a1a271c0fc4243845927fe1d250767a3 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:18:26 +0000
Subject: [PATCH 102/143] powerpc, KVM: Rework KVM checks in first-level
 interrupt handlers

Instead of branching out-of-line with the DO_KVM macro to check if we
are in a KVM guest at the time of an interrupt, this moves the KVM
check inline in the first-level interrupt handlers.  This speeds up
the non-KVM case and makes sure that none of the interrupt handlers
are missing the check.

Because the first-level interrupt handlers are now larger, some things
had to be moved out of line in exceptions-64s.S.

This all necessitated some minor changes to the interrupt entry code
in KVM.  This also streamlines the book3s_32 KVM test.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/exception-64s.h   | 121 ++++++++-----
 arch/powerpc/kernel/exceptions-64s.S       | 189 +++++++++++++--------
 arch/powerpc/kvm/book3s_rmhandlers.S       |  78 +++++----
 arch/powerpc/kvm/book3s_segment.S          |   7 +
 arch/powerpc/platforms/iseries/exception.S |   2 +-
 arch/powerpc/platforms/iseries/exception.h |   4 +-
 6 files changed, 247 insertions(+), 154 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index f5dfe3411f64..b6a3a443fbde 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -61,19 +61,22 @@
 #define EXC_HV	H
 #define EXC_STD
 
-#define EXCEPTION_PROLOG_1(area)					\
+#define __EXCEPTION_PROLOG_1(area, extra, vec)				\
 	GET_PACA(r13);							\
 	std	r9,area+EX_R9(r13);	/* save r9 - r12 */		\
 	std	r10,area+EX_R10(r13);					\
-	std	r11,area+EX_R11(r13);					\
-	std	r12,area+EX_R12(r13);					\
 	BEGIN_FTR_SECTION_NESTED(66);					\
 	mfspr	r10,SPRN_CFAR;						\
 	std	r10,area+EX_CFAR(r13);					\
 	END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);		\
-	GET_SCRATCH0(r9);						\
-	std	r9,area+EX_R13(r13);					\
-	mfcr	r9
+	mfcr	r9;							\
+	extra(vec);							\
+	std	r11,area+EX_R11(r13);					\
+	std	r12,area+EX_R12(r13);					\
+	GET_SCRATCH0(r10);						\
+	std	r10,area+EX_R13(r13)
+#define EXCEPTION_PROLOG_1(area, extra, vec)				\
+	__EXCEPTION_PROLOG_1(area, extra, vec)
 
 #define __EXCEPTION_PROLOG_PSERIES_1(label, h)				\
 	ld	r12,PACAKBASE(r13);	/* get high part of &label */	\
@@ -85,13 +88,54 @@
 	mtspr	SPRN_##h##SRR1,r10;					\
 	h##rfid;							\
 	b	.	/* prevent speculative execution */
-#define EXCEPTION_PROLOG_PSERIES_1(label, h) \
+#define EXCEPTION_PROLOG_PSERIES_1(label, h)				\
 	__EXCEPTION_PROLOG_PSERIES_1(label, h)
 
-#define EXCEPTION_PROLOG_PSERIES(area, label, h)			\
-	EXCEPTION_PROLOG_1(area);					\
+#define EXCEPTION_PROLOG_PSERIES(area, label, h, extra, vec)		\
+	EXCEPTION_PROLOG_1(area, extra, vec);				\
 	EXCEPTION_PROLOG_PSERIES_1(label, h);
 
+#define __KVMTEST(n)							\
+	lbz	r10,PACA_KVM_SVCPU+SVCPU_IN_GUEST(r13);			\
+	cmpwi	r10,0;							\
+	bne	do_kvm_##n
+
+#define __KVM_HANDLER(area, h, n)					\
+do_kvm_##n:								\
+	ld	r10,area+EX_R10(r13);					\
+	stw	r9,PACA_KVM_SVCPU+SVCPU_SCRATCH1(r13);			\
+	ld	r9,area+EX_R9(r13);					\
+	std	r12,PACA_KVM_SVCPU+SVCPU_SCRATCH0(r13);			\
+	li	r12,n;							\
+	b	kvmppc_interrupt
+
+#define __KVM_HANDLER_SKIP(area, h, n)					\
+do_kvm_##n:								\
+	cmpwi	r10,KVM_GUEST_MODE_SKIP;				\
+	ld	r10,area+EX_R10(r13);					\
+	beq	89f;							\
+	stw	r9,PACA_KVM_SVCPU+SVCPU_SCRATCH1(r13);			\
+	ld	r9,area+EX_R9(r13);					\
+	std	r12,PACA_KVM_SVCPU+SVCPU_SCRATCH0(r13);			\
+	li	r12,n;							\
+	b	kvmppc_interrupt;					\
+89:	mtocrf	0x80,r9;						\
+	ld	r9,area+EX_R9(r13);					\
+	b	kvmppc_skip_##h##interrupt
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#define KVMTEST(n)			__KVMTEST(n)
+#define KVM_HANDLER(area, h, n)		__KVM_HANDLER(area, h, n)
+#define KVM_HANDLER_SKIP(area, h, n)	__KVM_HANDLER_SKIP(area, h, n)
+
+#else
+#define KVMTEST(n)
+#define KVM_HANDLER(area, h, n)
+#define KVM_HANDLER_SKIP(area, h, n)
+#endif
+
+#define NOTEST(n)
+
 /*
  * The common exception prolog is used for all except a few exceptions
  * such as a segment miss on a kernel address.  We have to be prepared
@@ -164,57 +208,54 @@
 	.globl label##_pSeries;				\
 label##_pSeries:					\
 	HMT_MEDIUM;					\
-	DO_KVM	vec;					\
 	SET_SCRATCH0(r13);		/* save r13 */		\
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, EXC_STD)
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
+				 EXC_STD, KVMTEST, vec)
 
 #define STD_EXCEPTION_HV(loc, vec, label)		\
 	. = loc;					\
 	.globl label##_hv;				\
 label##_hv:						\
 	HMT_MEDIUM;					\
-	DO_KVM	vec;					\
-	SET_SCRATCH0(r13);	/* save r13 */		\
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, EXC_HV)
+	SET_SCRATCH0(r13);	/* save r13 */			\
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
+				 EXC_HV, KVMTEST, vec)
 
-#define __MASKABLE_EXCEPTION_PSERIES(vec, label, h)			\
-	HMT_MEDIUM;							\
-	DO_KVM	vec;							\
-	SET_SCRATCH0(r13);    /* save r13 */				\
-	GET_PACA(r13);							\
-	std	r9,PACA_EXGEN+EX_R9(r13);	/* save r9, r10 */	\
-	std	r10,PACA_EXGEN+EX_R10(r13);				\
+#define __SOFTEN_TEST(h)						\
 	lbz	r10,PACASOFTIRQEN(r13);					\
-	mfcr	r9;							\
 	cmpwi	r10,0;							\
-	beq	masked_##h##interrupt;					\
-	GET_SCRATCH0(r10);						\
-	std	r10,PACA_EXGEN+EX_R13(r13);				\
-	std	r11,PACA_EXGEN+EX_R11(r13);				\
-	std	r12,PACA_EXGEN+EX_R12(r13);				\
-	ld	r12,PACAKBASE(r13);	/* get high part of &label */	\
-	ld	r10,PACAKMSR(r13);	/* get MSR value for kernel */	\
-	mfspr	r11,SPRN_##h##SRR0;	/* save SRR0 */			\
-	LOAD_HANDLER(r12,label##_common)				\
-	mtspr	SPRN_##h##SRR0,r12;					\
-	mfspr	r12,SPRN_##h##SRR1;	/* and SRR1 */			\
-	mtspr	SPRN_##h##SRR1,r10;					\
-	h##rfid;							\
-	b	.	/* prevent speculative execution */
-#define _MASKABLE_EXCEPTION_PSERIES(vec, label, h)			\
-	__MASKABLE_EXCEPTION_PSERIES(vec, label, h)
+	beq	masked_##h##interrupt
+#define _SOFTEN_TEST(h)	__SOFTEN_TEST(h)
+
+#define SOFTEN_TEST(vec)						\
+	KVMTEST(vec);							\
+	_SOFTEN_TEST(EXC_STD)
+
+#define SOFTEN_TEST_HV(vec)						\
+	KVMTEST(vec);							\
+	_SOFTEN_TEST(EXC_HV)
+
+#define __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)		\
+	HMT_MEDIUM;							\
+	SET_SCRATCH0(r13);    /* save r13 */				\
+	__EXCEPTION_PROLOG_1(PACA_EXGEN, extra, vec);		\
+	EXCEPTION_PROLOG_PSERIES_1(label##_common, h);
+#define _MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)		\
+	__MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)
 
 #define MASKABLE_EXCEPTION_PSERIES(loc, vec, label)			\
 	. = loc;							\
 	.globl label##_pSeries;						\
 label##_pSeries:							\
-	_MASKABLE_EXCEPTION_PSERIES(vec, label, EXC_STD)
+	_MASKABLE_EXCEPTION_PSERIES(vec, label,				\
+				    EXC_STD, SOFTEN_TEST)
 
 #define MASKABLE_EXCEPTION_HV(loc, vec, label)				\
 	. = loc;							\
 	.globl label##_hv;						\
 label##_hv:								\
-	_MASKABLE_EXCEPTION_PSERIES(vec, label, EXC_HV)
+	_MASKABLE_EXCEPTION_PSERIES(vec, label,				\
+				    EXC_HV, SOFTEN_TEST_HV)
 
 #ifdef CONFIG_PPC_ISERIES
 #define DISABLE_INTS				\
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index a85f4874cba7..e76472cbf3b5 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -40,7 +40,6 @@ __start_interrupts:
 	.globl system_reset_pSeries;
 system_reset_pSeries:
 	HMT_MEDIUM;
-	DO_KVM	0x100;
 	SET_SCRATCH0(r13)
 #ifdef CONFIG_PPC_P7_NAP
 BEGIN_FTR_SECTION
@@ -65,67 +64,45 @@ BEGIN_FTR_SECTION
 	beq	.
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
 #endif /* CONFIG_PPC_P7_NAP */
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD)
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
+				 NOTEST, 0x100)
 
 	. = 0x200
-_machine_check_pSeries:
-	HMT_MEDIUM
-	DO_KVM	0x200
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common, EXC_STD)
+machine_check_pSeries_1:
+	/* This is moved out of line as it can be patched by FW, but
+	 * some code path might still want to branch into the original
+	 * vector
+	 */
+	b	machine_check_pSeries
 
 	. = 0x300
 	.globl data_access_pSeries
 data_access_pSeries:
 	HMT_MEDIUM
-	DO_KVM	0x300
 	SET_SCRATCH0(r13)
+#ifndef CONFIG_POWER4_ONLY
 BEGIN_FTR_SECTION
-	GET_PACA(r13)
-	std	r9,PACA_EXSLB+EX_R9(r13)
-	std	r10,PACA_EXSLB+EX_R10(r13)
-	mfspr	r10,SPRN_DAR
-	mfspr	r9,SPRN_DSISR
-	srdi	r10,r10,60
-	rlwimi	r10,r9,16,0x20
-	mfcr	r9
-	cmpwi	r10,0x2c
-	beq	do_stab_bolted_pSeries
-	ld	r10,PACA_EXSLB+EX_R10(r13)
-	std	r11,PACA_EXGEN+EX_R11(r13)
-	ld	r11,PACA_EXSLB+EX_R9(r13)
-	std	r12,PACA_EXGEN+EX_R12(r13)
-	GET_SCRATCH0(r12)
-	std	r10,PACA_EXGEN+EX_R10(r13)
-	std	r11,PACA_EXGEN+EX_R9(r13)
-	std	r12,PACA_EXGEN+EX_R13(r13)
-	EXCEPTION_PROLOG_PSERIES_1(data_access_common, EXC_STD)
-FTR_SECTION_ELSE
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD)
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_SLB)
+	b	data_access_check_stab
+data_access_not_stab:
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
+#endif
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
+				 KVMTEST, 0x300)
 
 	. = 0x380
 	.globl data_access_slb_pSeries
 data_access_slb_pSeries:
 	HMT_MEDIUM
-	DO_KVM	0x380
 	SET_SCRATCH0(r13)
-	GET_PACA(r13)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_DAR
-	std	r9,PACA_EXSLB+EX_R9(r13)	/* save r9 - r12 */
-	mfcr	r9
 #ifdef __DISABLED__
 	/* Keep that around for when we re-implement dynamic VSIDs */
 	cmpdi	r3,0
 	bge	slb_miss_user_pseries
 #endif /* __DISABLED__ */
-	std	r10,PACA_EXSLB+EX_R10(r13)
-	std	r11,PACA_EXSLB+EX_R11(r13)
-	std	r12,PACA_EXSLB+EX_R12(r13)
-	GET_SCRATCH0(r10)
-	std	r10,PACA_EXSLB+EX_R13(r13)
-	mfspr	r12,SPRN_SRR1		/* and SRR1 */
+	mfspr	r12,SPRN_SRR1
 #ifndef CONFIG_RELOCATABLE
 	b	.slb_miss_realmode
 #else
@@ -147,24 +124,16 @@ data_access_slb_pSeries:
 	.globl instruction_access_slb_pSeries
 instruction_access_slb_pSeries:
 	HMT_MEDIUM
-	DO_KVM	0x480
 	SET_SCRATCH0(r13)
-	GET_PACA(r13)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
-	std	r9,PACA_EXSLB+EX_R9(r13)	/* save r9 - r12 */
-	mfcr	r9
 #ifdef __DISABLED__
 	/* Keep that around for when we re-implement dynamic VSIDs */
 	cmpdi	r3,0
 	bge	slb_miss_user_pseries
 #endif /* __DISABLED__ */
-	std	r10,PACA_EXSLB+EX_R10(r13)
-	std	r11,PACA_EXSLB+EX_R11(r13)
-	std	r12,PACA_EXSLB+EX_R12(r13)
-	GET_SCRATCH0(r10)
-	std	r10,PACA_EXSLB+EX_R13(r13)
-	mfspr	r12,SPRN_SRR1		/* and SRR1 */
+	mfspr	r12,SPRN_SRR1
 #ifndef CONFIG_RELOCATABLE
 	b	.slb_miss_realmode
 #else
@@ -184,26 +153,46 @@ instruction_access_slb_pSeries:
 hardware_interrupt_pSeries:
 hardware_interrupt_hv:
 	BEGIN_FTR_SECTION
-		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt, EXC_STD)
+		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
+					    EXC_STD, SOFTEN_TEST)
+		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
 	FTR_SECTION_ELSE
-		_MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt, EXC_HV)
+		_MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
+					    EXC_HV, SOFTEN_TEST_HV)
+		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
 	ALT_FTR_SECTION_END_IFCLR(CPU_FTR_HVMODE_206)
 
 	STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x600)
+
 	STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x700)
+
 	STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x800)
 
 	MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer)
-	MASKABLE_EXCEPTION_HV(0x980, 0x980, decrementer)
+	MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer)
 
 	STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xa00)
+
 	STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xb00)
 
 	. = 0xc00
 	.globl	system_call_pSeries
 system_call_pSeries:
 	HMT_MEDIUM
-	DO_KVM	0xc00
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+	SET_SCRATCH0(r13)
+	GET_PACA(r13)
+	std	r9,PACA_EXGEN+EX_R9(r13)
+	std	r10,PACA_EXGEN+EX_R10(r13)
+	mfcr	r9
+	KVMTEST(0xc00)
+	GET_SCRATCH0(r13)
+#endif
 BEGIN_FTR_SECTION
 	cmpdi	r0,0x1ebe
 	beq-	1f
@@ -220,6 +209,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
 	rfid
 	b	.	/* prevent speculative execution */
 
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00)
+
 /* Fast LE/BE switch system call */
 1:	mfspr	r12,SPRN_SRR1
 	xori	r12,r12,MSR_LE
@@ -228,6 +219,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
 	b	.
 
 	STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xd00)
 
 	/* At 0xe??? we have a bunch of hypervisor exceptions, we branch
 	 * out of line to handle them
@@ -262,30 +254,93 @@ vsx_unavailable_pSeries_1:
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
 #endif /* CONFIG_CBE_RAS */
+
 	STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
+
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
 #endif /* CONFIG_CBE_RAS */
+
 	STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x1700)
+
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
 #endif /* CONFIG_CBE_RAS */
 
 	. = 0x3000
 
 /*** Out of line interrupts support ***/
 
+	/* moved from 0x200 */
+machine_check_pSeries:
+	.globl machine_check_fwnmi
+machine_check_fwnmi:
+	HMT_MEDIUM
+	SET_SCRATCH0(r13)		/* save r13 */
+	EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common,
+				 EXC_STD, KVMTEST, 0x200)
+	KVM_HANDLER_SKIP(PACA_EXMC, EXC_STD, 0x200)
+
+#ifndef CONFIG_POWER4_ONLY
+	/* moved from 0x300 */
+data_access_check_stab:
+	GET_PACA(r13)
+	std	r9,PACA_EXSLB+EX_R9(r13)
+	std	r10,PACA_EXSLB+EX_R10(r13)
+	mfspr	r10,SPRN_DAR
+	mfspr	r9,SPRN_DSISR
+	srdi	r10,r10,60
+	rlwimi	r10,r9,16,0x20
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+	lbz	r9,PACA_KVM_SVCPU+SVCPU_IN_GUEST(r13)
+	rlwimi	r10,r9,8,0x300
+#endif
+	mfcr	r9
+	cmpwi	r10,0x2c
+	beq	do_stab_bolted_pSeries
+	mtcrf	0x80,r9
+	ld	r9,PACA_EXSLB+EX_R9(r13)
+	ld	r10,PACA_EXSLB+EX_R10(r13)
+	b	data_access_not_stab
+do_stab_bolted_pSeries:
+	std	r11,PACA_EXSLB+EX_R11(r13)
+	std	r12,PACA_EXSLB+EX_R12(r13)
+	GET_SCRATCH0(r10)
+	std	r10,PACA_EXSLB+EX_R13(r13)
+	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
+#endif /* CONFIG_POWER4_ONLY */
+
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
+	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x400)
+	KVM_HANDLER(PACA_EXSLB, EXC_STD, 0x480)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x900)
+	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
+
+	.align	7
 	/* moved from 0xe00 */
-	STD_EXCEPTION_HV(., 0xe00, h_data_storage)
-	STD_EXCEPTION_HV(., 0xe20, h_instr_storage)
-	STD_EXCEPTION_HV(., 0xe40, emulation_assist)
-	STD_EXCEPTION_HV(., 0xe60, hmi_exception) /* need to flush cache ? */
+	STD_EXCEPTION_HV(., 0xe02, h_data_storage)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0xe02)
+	STD_EXCEPTION_HV(., 0xe22, h_instr_storage)
+	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe22)
+	STD_EXCEPTION_HV(., 0xe42, emulation_assist)
+	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe42)
+	STD_EXCEPTION_HV(., 0xe62, hmi_exception) /* need to flush cache ? */
+	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62)
 
 	/* moved from 0xf00 */
 	STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf00)
 	STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf20)
 	STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf40)
 
 /*
  * An interrupt came in while soft-disabled; clear EE in SRR1,
@@ -317,14 +372,6 @@ masked_Hinterrupt:
 	hrfid
 	b	.
 
-	.align	7
-do_stab_bolted_pSeries:
-	std	r11,PACA_EXSLB+EX_R11(r13)
-	std	r12,PACA_EXSLB+EX_R12(r13)
-	GET_SCRATCH0(r10)
-	std	r10,PACA_EXSLB+EX_R13(r13)
-	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
-
 #ifdef CONFIG_PPC_PSERIES
 /*
  * Vectors for the FWNMI option.  Share common code.
@@ -334,14 +381,8 @@ do_stab_bolted_pSeries:
 system_reset_fwnmi:
 	HMT_MEDIUM
 	SET_SCRATCH0(r13)		/* save r13 */
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD)
-
-	.globl machine_check_fwnmi
-      .align 7
-machine_check_fwnmi:
-	HMT_MEDIUM
-	SET_SCRATCH0(r13)		/* save r13 */
-	EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common, EXC_STD)
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
+				 NOTEST, 0x100)
 
 #endif /* CONFIG_PPC_PSERIES */
 
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 5f5d07475c91..dd03689fc609 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -40,37 +40,42 @@
 #define MSR_NOIRQ		MSR_KERNEL & ~(MSR_IR | MSR_DR)
 #define FUNC(name) 		GLUE(.,name)
 
-#elif defined(CONFIG_PPC_BOOK3S_32)
+kvmppc_skip_interrupt:
+	/*
+	 * Here all GPRs are unchanged from when the interrupt happened
+	 * except for r13, which is saved in SPRG_SCRATCH0.
+	 */
+	mfspr	r13, SPRN_SRR0
+	addi	r13, r13, 4
+	mtspr	SPRN_SRR0, r13
+	GET_SCRATCH0(r13)
+	rfid
+	b	.
 
-#define LOAD_SHADOW_VCPU(reg)						\
-	mfspr	reg, SPRN_SPRG_THREAD;					\
-	lwz	reg, THREAD_KVM_SVCPU(reg);				\
-	/* PPC32 can have a NULL pointer - let's check for that */	\
-	mtspr   SPRN_SPRG_SCRATCH1, r12;	/* Save r12 */		\
-	mfcr	r12;							\
-	cmpwi	reg, 0;							\
-	bne	1f;							\
-	mfspr	reg, SPRN_SPRG_SCRATCH0;				\
-	mtcr	r12;							\
-	mfspr	r12, SPRN_SPRG_SCRATCH1;				\
-	b	kvmppc_resume_\intno;					\
-1:;									\
-	mtcr	r12;							\
-	mfspr	r12, SPRN_SPRG_SCRATCH1;				\
-	tophys(reg, reg)
+kvmppc_skip_Hinterrupt:
+	/*
+	 * Here all GPRs are unchanged from when the interrupt happened
+	 * except for r13, which is saved in SPRG_SCRATCH0.
+	 */
+	mfspr	r13, SPRN_HSRR0
+	addi	r13, r13, 4
+	mtspr	SPRN_HSRR0, r13
+	GET_SCRATCH0(r13)
+	hrfid
+	b	.
+
+#elif defined(CONFIG_PPC_BOOK3S_32)
 
 #define SHADOW_VCPU_OFF		0
 #define MSR_NOIRQ		MSR_KERNEL
 #define FUNC(name)		name
 
-#endif
-
 .macro INTERRUPT_TRAMPOLINE intno
 
 .global kvmppc_trampoline_\intno
 kvmppc_trampoline_\intno:
 
-	SET_SCRATCH0(r13)		/* Save r13 */
+	mtspr	SPRN_SPRG_SCRATCH0, r13		/* Save r13 */
 
 	/*
 	 * First thing to do is to find out if we're coming
@@ -78,19 +83,28 @@ kvmppc_trampoline_\intno:
 	 *
 	 * To distinguish, we check a magic byte in the PACA/current
 	 */
-	LOAD_SHADOW_VCPU(r13)
-	PPC_STL	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
+	mfspr	r13, SPRN_SPRG_THREAD
+	lwz	r13, THREAD_KVM_SVCPU(r13)
+	/* PPC32 can have a NULL pointer - let's check for that */
+	mtspr   SPRN_SPRG_SCRATCH1, r12		/* Save r12 */
 	mfcr	r12
+	cmpwi	r13, 0
+	bne	1f
+2:	mtcr	r12
+	mfspr	r12, SPRN_SPRG_SCRATCH1
+	mfspr	r13, SPRN_SPRG_SCRATCH0		/* r13 = original r13 */
+	b	kvmppc_resume_\intno		/* Get back original handler */
+
+1:	tophys(r13, r13)
 	stw	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+	mfspr	r12, SPRN_SPRG_SCRATCH1
+	stw	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
 	lbz	r12, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
 	cmpwi	r12, KVM_GUEST_MODE_NONE
 	bne	..kvmppc_handler_hasmagic_\intno
 	/* No KVM guest? Then jump back to the Linux handler! */
 	lwz	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
-	mtcr	r12
-	PPC_LL	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
-	GET_SCRATCH0(r13)			/* r13 = original r13 */
-	b	kvmppc_resume_\intno		/* Get back original handler */
+	b	2b
 
 	/* Now we know we're handling a KVM guest */
 ..kvmppc_handler_hasmagic_\intno:
@@ -112,9 +126,6 @@ INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_MACHINE_CHECK
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_DATA_STORAGE
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_INST_STORAGE
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_EXTERNAL
-#ifdef CONFIG_PPC_BOOK3S_64
-INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_EXTERNAL_HV
-#endif
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_ALIGNMENT
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_PROGRAM
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_FP_UNAVAIL
@@ -124,14 +135,6 @@ INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_TRACE
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_PERFMON
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_ALTIVEC
 
-/* Those are only available on 64 bit machines */
-
-#ifdef CONFIG_PPC_BOOK3S_64
-INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_DATA_SEGMENT
-INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_INST_SEGMENT
-INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_VSX
-#endif
-
 /*
  * Bring us back to the faulting code, but skip the
  * faulting instruction.
@@ -163,6 +166,7 @@ kvmppc_handler_skip_ins:
 
 	/* And get back into the code */
 	RFI
+#endif
 
 /*
  * This trampoline brings us back to a real mode handler
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index 451264274b8c..4a623eb28a53 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -125,6 +125,9 @@ kvmppc_handler_trampoline_enter_end:
 .global kvmppc_handler_trampoline_exit
 kvmppc_handler_trampoline_exit:
 
+.global kvmppc_interrupt
+kvmppc_interrupt:
+
 	/* Register usage at this point:
 	 *
 	 * SPRG_SCRATCH0  = guest R13
@@ -155,12 +158,16 @@ kvmppc_handler_trampoline_exit:
 	PPC_LL	r2, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13)
 
 	/* Save guest PC and MSR */
+#ifdef CONFIG_PPC64
+BEGIN_FTR_SECTION
 	andi.	r0,r12,0x2
 	beq	1f
 	mfspr	r3,SPRN_HSRR0
 	mfspr	r4,SPRN_HSRR1
 	andi.	r12,r12,0x3ffd
 	b	2f
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
+#endif
 1:	mfsrr0	r3
 	mfsrr1	r4
 2:
diff --git a/arch/powerpc/platforms/iseries/exception.S b/arch/powerpc/platforms/iseries/exception.S
index 29c02f36b32f..f519ee17ff7d 100644
--- a/arch/powerpc/platforms/iseries/exception.S
+++ b/arch/powerpc/platforms/iseries/exception.S
@@ -167,7 +167,7 @@ BEGIN_FTR_SECTION
 	std	r12,PACA_EXGEN+EX_R13(r13)
 	EXCEPTION_PROLOG_ISERIES_1
 FTR_SECTION_ELSE
-	EXCEPTION_PROLOG_1(PACA_EXGEN)
+	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0)
 	EXCEPTION_PROLOG_ISERIES_1
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_SLB)
 	b	data_access_common
diff --git a/arch/powerpc/platforms/iseries/exception.h b/arch/powerpc/platforms/iseries/exception.h
index bae3fba5ad8e..50271b550a99 100644
--- a/arch/powerpc/platforms/iseries/exception.h
+++ b/arch/powerpc/platforms/iseries/exception.h
@@ -39,7 +39,7 @@
 label##_iSeries:							\
 	HMT_MEDIUM;							\
 	mtspr	SPRN_SPRG_SCRATCH0,r13;	/* save r13 */			\
-	EXCEPTION_PROLOG_1(area);					\
+	EXCEPTION_PROLOG_1(area, NOTEST, 0);				\
 	EXCEPTION_PROLOG_ISERIES_1;					\
 	b	label##_common
 
@@ -48,7 +48,7 @@ label##_iSeries:							\
 label##_iSeries:							\
 	HMT_MEDIUM;							\
 	mtspr	SPRN_SPRG_SCRATCH0,r13;	/* save r13 */			\
-	EXCEPTION_PROLOG_1(PACA_EXGEN);					\
+	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0);			\
 	lbz	r10,PACASOFTIRQEN(r13);					\
 	cmpwi	0,r10,0;						\
 	beq-	label##_iSeries_masked;					\

From 3cf658b605393d793ea52416c2306b86fbde9d9a Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:18:52 +0000
Subject: [PATCH 103/143] KVM: PPC: Deliver program interrupts right away
 instead of queueing them

Doing so means that we don't have to save the flags anywhere and gets
rid of the last reference to to_book3s(vcpu) in arch/powerpc/kvm/book3s.c.

This is safe because a program interrupt won't be generated at the
same time as any other synchronous interrupt.  If a program interrupt
and an asynchronous interrupt (external or decrementer) are generated
at the same time, the program interrupt will be delivered, which is
correct because it has a higher priority, and then the asynchronous
interrupt will be masked.

We don't ever generate system reset or machine check interrupts to the
guest, but if we did, then we would need to make sure they got delivered
rather than the program interrupt.  The current code would be wrong in
this situation anyway since it would deliver the program interrupt as
well as the reset/machine check interrupt.
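
As a rough illustration only (simplified, not code from this patch;
deliver_program_check is a hypothetical helper and the MSR update the real
injection code performs is omitted), "delivering right away" amounts to
loading SRR0/SRR1 from the guest PC/MSR, OR-ing the flags into SRR1, and
redirecting the PC to the program-check vector:

	/* hedged sketch of immediate delivery; not the actual book3s.c code */
	static void deliver_program_check(struct kvm_vcpu *vcpu, ulong flags)
	{
		vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu);
		vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags;
		kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) +
				    BOOK3S_INTERRUPT_PROGRAM);
	}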

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/book3s.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 163e3e13c87c..f68a34d16035 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -129,8 +129,8 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
 
 void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
 {
-	to_book3s(vcpu)->prog_flags = flags;
-	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM);
+	/* might as well deliver this straight away */
+	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags);
 }
 
 void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
@@ -170,7 +170,6 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 {
 	int deliver = 1;
 	int vec = 0;
-	ulong flags = 0ULL;
 	bool crit = kvmppc_critical_section(vcpu);
 
 	switch (priority) {
@@ -206,7 +205,6 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 		break;
 	case BOOK3S_IRQPRIO_PROGRAM:
 		vec = BOOK3S_INTERRUPT_PROGRAM;
-		flags = to_book3s(vcpu)->prog_flags;
 		break;
 	case BOOK3S_IRQPRIO_VSX:
 		vec = BOOK3S_INTERRUPT_VSX;
@@ -237,7 +235,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 #endif
 
 	if (deliver)
-		kvmppc_inject_interrupt(vcpu, vec, flags);
+		kvmppc_inject_interrupt(vcpu, vec, 0);
 
 	return deliver;
 }

From f9e0554deca54a42fb2cf7f68c05a4a37461c205 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:19:22 +0000
Subject: [PATCH 104/143] KVM: PPC: Pass init/destroy vm and prepare/commit
 memory region ops down

This arranges for the top-level arch/powerpc/kvm/powerpc.c file to
pass down some of the calls it gets to the lower-level
subarchitecture-specific code.  The lower-level implementations (in
booke.c and book3s_pr.c) are no-ops.  The coming book3s_hv.c will need
this.
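
Purely as a hedged illustration of what a non-trivial prepare hook might
eventually look like (the 16MB alignment check below is an assumption about
the coming book3s_hv code, not something this patch adds):

	/* hypothetical: reject slots the subarch cannot back with 16MB pages */
	int kvmppc_core_prepare_memory_region(struct kvm *kvm,
					      struct kvm_userspace_memory_region *mem)
	{
		if ((mem->guest_phys_addr | mem->memory_size) & 0xffffffULL)
			return -EINVAL;
		return 0;
	}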

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_ppc.h |  7 +++++++
 arch/powerpc/kvm/book3s_pr.c       | 20 ++++++++++++++++++++
 arch/powerpc/kvm/booke.c           | 20 ++++++++++++++++++++
 arch/powerpc/kvm/powerpc.c         |  9 ++++++---
 4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index c662f140283a..9b6f3f99c5eb 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -111,6 +111,13 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
 extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
 
+extern int kvmppc_core_init_vm(struct kvm *kvm);
+extern void kvmppc_core_destroy_vm(struct kvm *kvm);
+extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem);
+extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index fcdc97eb9411..72b20b8749fa 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -984,6 +984,26 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	return ret;
 }
 
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      struct kvm_userspace_memory_region *mem)
+{
+	return 0;
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	return 0;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+}
+
 static int kvmppc_book3s_init(void)
 {
 	int r;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 9f2e4a5e1c4d..9066325b23b2 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -865,6 +865,26 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	return -ENOTSUPP;
 }
 
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      struct kvm_userspace_memory_region *mem)
+{
+	return 0;
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	return 0;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+}
+
 int __init kvmppc_booke_init(void)
 {
 	unsigned long ivor[16];
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 24e2b64b6a48..0c80e159c138 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -148,7 +148,7 @@ void kvm_arch_check_processor_compat(void *rtn)
 
 int kvm_arch_init_vm(struct kvm *kvm)
 {
-	return 0;
+	return kvmppc_core_init_vm(kvm);
 }
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
@@ -164,6 +164,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 		kvm->vcpus[i] = NULL;
 
 	atomic_set(&kvm->online_vcpus, 0);
+
+	kvmppc_core_destroy_vm(kvm);
+
 	mutex_unlock(&kvm->lock);
 }
 
@@ -212,7 +215,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_userspace_memory_region *mem,
                                    int user_alloc)
 {
-	return 0;
+	return kvmppc_core_prepare_memory_region(kvm, mem);
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
@@ -220,7 +223,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                struct kvm_memory_slot old,
                int user_alloc)
 {
-       return;
+	kvmppc_core_commit_memory_region(kvm, mem);
 }
 
 

From df6909e5d52f67be01862c5cb453e509aee661f1 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:19:50 +0000
Subject: [PATCH 105/143] KVM: PPC: Move guest enter/exit down into
 subarch-specific code

Instead of doing the kvm_guest_enter/exit() and local_irq_dis/enable()
calls in powerpc.c, this moves them down into the subarch-specific
book3s_pr.c and booke.c.  This eliminates an extra local_irq_enable()
call in book3s_pr.c, and will be needed when we add SMT4 guest
support in the book3s hypervisor-mode code.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_ppc.h   |  1 +
 arch/powerpc/kvm/book3s_interrupts.S |  2 +-
 arch/powerpc/kvm/book3s_pr.c         | 12 ++++++------
 arch/powerpc/kvm/booke.c             | 13 +++++++++++++
 arch/powerpc/kvm/powerpc.c           |  6 +-----
 5 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 9b6f3f99c5eb..48b7ab76de2d 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -42,6 +42,7 @@ enum emulation_result {
 	EMULATE_AGAIN,        /* something went wrong. go again */
 };
 
+extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 extern char kvmppc_handlers_start[];
 extern unsigned long kvmppc_handler_len;
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
index 2f0bc928b08a..8c5e0e160107 100644
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -85,7 +85,7 @@
  *  r3: kvm_run pointer
  *  r4: vcpu pointer
  */
-_GLOBAL(__kvmppc_vcpu_entry)
+_GLOBAL(__kvmppc_vcpu_run)
 
 kvm_start_entry:
 	/* Write correct stack frame */
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 72b20b8749fa..0c0d3f274437 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -891,8 +891,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	vfree(vcpu_book3s);
 }
 
-extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
-int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
 	int ret;
 	double fpr[32][TS_FPRWIDTH];
@@ -944,14 +943,15 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	/* Remember the MSR with disabled extensions */
 	ext_msr = current->thread.regs->msr;
 
-	/* XXX we get called with irq disabled - change that! */
-	local_irq_enable();
-
 	/* Preload FPU if it's enabled */
 	if (vcpu->arch.shared->msr & MSR_FP)
 		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
 
-	ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
+	kvm_guest_enter();
+
+	ret = __kvmppc_vcpu_run(kvm_run, vcpu);
+
+	kvm_guest_exit();
 
 	local_irq_disable();
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 9066325b23b2..ee45fa01220e 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -312,6 +312,19 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 		vcpu->arch.shared->int_pending = 0;
 }
 
+int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	local_irq_disable();
+	kvm_guest_enter();
+	ret = __kvmppc_vcpu_run(kvm_run, vcpu);
+	kvm_guest_exit();
+	local_irq_enable();
+
+	return ret;
+}
+
 /**
  * kvmppc_handle_exit
  *
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 0c80e159c138..026036efcde0 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -500,11 +500,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 	kvmppc_core_deliver_interrupts(vcpu);
 
-	local_irq_disable();
-	kvm_guest_enter();
-	r = __kvmppc_vcpu_run(run, vcpu);
-	kvm_guest_exit();
-	local_irq_enable();
+	r = kvmppc_vcpu_run(run, vcpu);
 
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

From 923c53caea446d246949c94703be83e68f251af7 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:20:24 +0000
Subject: [PATCH 106/143] powerpc: Set up LPCR for running guest partitions

In hypervisor mode, the LPCR controls several aspects of guest
partitions, including the virtual partition memory mode, and also
controls whether hypervisor decrementer interrupts are enabled.  This
sets up the LPCR at boot time so that guest partitions will use a
virtual real memory area (VRMA) composed of 16MB large pages, and so
that hypervisor decrementer interrupts are disabled.
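
For readers who find the rldimi sequence below hard to follow, here is the
same recipe written as plain C bit manipulation (illustration only, using
the LPCR_* constants this patch defines; the real code is the assembly in
cpu_setup_power7.S):

	unsigned long v = lpcr;	/* current SPRN_LPCR value */
	v = (v & ~(3ul << LPCR_LPES_SH)) | (1ul << LPCR_LPES_SH);	/* LPES = 0b01 */
	v |= LPCR_PECE0 | LPCR_PECE1 | LPCR_PECE2;			/* PECE = 0b111 */
	v = (v & ~(7ul << LPCR_DPFD_SH)) | (4ul << LPCR_DPFD_SH);	/* DPFD = 4 */
	v &= ~LPCR_HDICE;						/* HDICE = 0 */
	v = (v & ~(7ul << LPCR_VC_SH)) | (4ul << LPCR_VC_SH);		/* VPM0=1, VPM1=ISL=0 */
	v = (v & ~(0x1ful << LPCR_VRMASD_SH)) | (0x10ul << LPCR_VRMASD_SH); /* VRMA: L=1, LP=00 */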

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/reg.h         |  4 ++++
 arch/powerpc/kernel/cpu_setup_power7.S | 18 +++++++++++-------
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index c5cae0dd176c..d879a6b91635 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -232,10 +232,12 @@
 #define   LPCR_VPM0	(1ul << (63-0))
 #define   LPCR_VPM1	(1ul << (63-1))
 #define   LPCR_ISL	(1ul << (63-2))
+#define   LPCR_VC_SH	(63-2)
 #define   LPCR_DPFD_SH	(63-11)
 #define   LPCR_VRMA_L	(1ul << (63-12))
 #define   LPCR_VRMA_LP0	(1ul << (63-15))
 #define   LPCR_VRMA_LP1	(1ul << (63-16))
+#define   LPCR_VRMASD_SH (63-16)
 #define   LPCR_RMLS    0x1C000000      /* impl dependent rmo limit sel */
 #define   LPCR_ILE     0x02000000      /* !HV irqs set MSR:LE */
 #define   LPCR_PECE	0x00007000	/* powersave exit cause enable */
@@ -243,8 +245,10 @@
 #define     LPCR_PECE1	0x00002000	/* decrementer can cause exit */
 #define     LPCR_PECE2	0x00001000	/* machine check etc can cause exit */
 #define   LPCR_MER	0x00000800	/* Mediated External Exception */
+#define   LPCR_LPES    0x0000000c
 #define   LPCR_LPES0   0x00000008      /* LPAR Env selector 0 */
 #define   LPCR_LPES1   0x00000004      /* LPAR Env selector 1 */
+#define   LPCR_LPES_SH	2
 #define   LPCR_RMI     0x00000002      /* real mode is cache inhibit */
 #define   LPCR_HDICE   0x00000001      /* Hyp Decr enable (HV,PR,EE) */
 #define SPRN_LPID	0x13F	/* Logical Partition Identifier */
diff --git a/arch/powerpc/kernel/cpu_setup_power7.S b/arch/powerpc/kernel/cpu_setup_power7.S
index 4f9a93fcfe07..2ef6749688e9 100644
--- a/arch/powerpc/kernel/cpu_setup_power7.S
+++ b/arch/powerpc/kernel/cpu_setup_power7.S
@@ -61,19 +61,23 @@ __init_LPCR:
 	 *   LPES = 0b01 (HSRR0/1 used for 0x500)
 	 *   PECE = 0b111
 	 *   DPFD = 4
+	 *   HDICE = 0
+	 *   VC = 0b100 (VPM0=1, VPM1=0, ISL=0)
+	 *   VRMASD = 0b10000 (L=1, LP=00)
 	 *
 	 * Other bits untouched for now
 	 */
 	mfspr	r3,SPRN_LPCR
-	ori	r3,r3,(LPCR_LPES0|LPCR_LPES1)
-	xori	r3,r3, LPCR_LPES0
+	li	r5,1
+	rldimi	r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
 	ori	r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2)
-	li	r5,7
-	sldi	r5,r5,LPCR_DPFD_SH
-	andc	r3,r3,r5
 	li	r5,4
-	sldi	r5,r5,LPCR_DPFD_SH
-	or	r3,r3,r5
+	rldimi	r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3
+	clrrdi	r3,r3,1		/* clear HDICE */
+	li	r5,4
+	rldimi	r3,r5, LPCR_VC_SH, 0
+	li	r5,0x10
+	rldimi	r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
 	mtspr	SPRN_LPCR,r3
 	isync
 	blr

From 3c42bf8a717cb636e0ed2ed77194669e2ac3ed56 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:20:58 +0000
Subject: [PATCH 107/143] KVM: PPC: Split host-state fields out of
 kvmppc_book3s_shadow_vcpu

There are several fields in struct kvmppc_book3s_shadow_vcpu that
temporarily store bits of host state while a guest is running,
rather than anything relating to the particular guest or vcpu.
This splits them out into a new kvmppc_host_state structure and
modifies the definitions in asm-offsets.c to suit.

On 32-bit, we have a kvmppc_host_state structure inside the
kvmppc_book3s_shadow_vcpu since the assembly code needs to be able
to get to them both with one pointer.  On 64-bit they are separate
fields in the PACA.  This means that on 64-bit we don't need to
copy the kvmppc_host_state in and out on vcpu load/unload, and
in future will mean that the book3s_hv code doesn't need a
shadow_vcpu struct in the PACA at all.  That does mean that we
have to be careful not to rely on any values persisting in the
hstate field of the paca across any point where we could block
or get preempted.
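
Illustration only (these exact C accessors do not appear in the patch): the
same host-state field is reached through the PACA on 64-bit and through the
shadow_vcpu on 32-bit, which is why asm-offsets.c now needs two variants of
the field macros.

	#ifdef CONFIG_PPC_BOOK3S_64
		u8 in_guest = get_paca()->kvm_hstate.in_guest;
	#else
		/* svcpu is a struct kvmppc_book3s_shadow_vcpu * on 32-bit */
		u8 in_guest = svcpu->hstate.in_guest;
	#endif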

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/exception-64s.h  | 10 +--
 arch/powerpc/include/asm/kvm_book3s_asm.h | 27 +++++--
 arch/powerpc/include/asm/paca.h           |  1 +
 arch/powerpc/kernel/asm-offsets.c         | 96 +++++++++++------------
 arch/powerpc/kernel/exceptions-64s.S      |  2 +-
 arch/powerpc/kvm/book3s_interrupts.S      | 21 ++---
 arch/powerpc/kvm/book3s_rmhandlers.S      | 18 ++---
 arch/powerpc/kvm/book3s_segment.S         | 76 +++++++++---------
 8 files changed, 129 insertions(+), 122 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index b6a3a443fbde..296c9b66c04a 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -96,16 +96,16 @@
 	EXCEPTION_PROLOG_PSERIES_1(label, h);
 
 #define __KVMTEST(n)							\
-	lbz	r10,PACA_KVM_SVCPU+SVCPU_IN_GUEST(r13);			\
+	lbz	r10,HSTATE_IN_GUEST(r13);			\
 	cmpwi	r10,0;							\
 	bne	do_kvm_##n
 
 #define __KVM_HANDLER(area, h, n)					\
 do_kvm_##n:								\
 	ld	r10,area+EX_R10(r13);					\
-	stw	r9,PACA_KVM_SVCPU+SVCPU_SCRATCH1(r13);			\
+	stw	r9,HSTATE_SCRATCH1(r13);			\
 	ld	r9,area+EX_R9(r13);					\
-	std	r12,PACA_KVM_SVCPU+SVCPU_SCRATCH0(r13);			\
+	std	r12,HSTATE_SCRATCH0(r13);			\
 	li	r12,n;							\
 	b	kvmppc_interrupt
 
@@ -114,9 +114,9 @@ do_kvm_##n:								\
 	cmpwi	r10,KVM_GUEST_MODE_SKIP;				\
 	ld	r10,area+EX_R10(r13);					\
 	beq	89f;							\
-	stw	r9,PACA_KVM_SVCPU+SVCPU_SCRATCH1(r13);			\
+	stw	r9,HSTATE_SCRATCH1(r13);			\
 	ld	r9,area+EX_R9(r13);					\
-	std	r12,PACA_KVM_SVCPU+SVCPU_SCRATCH0(r13);			\
+	std	r12,HSTATE_SCRATCH0(r13);			\
 	li	r12,n;							\
 	b	kvmppc_interrupt;					\
 89:	mtocrf	0x80,r9;						\
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index d5a8a3861635..312617529864 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -60,6 +60,22 @@ kvmppc_resume_\intno:
 
 #else  /*__ASSEMBLY__ */
 
+/*
+ * This struct goes in the PACA on 64-bit processors.  It is used
+ * to store host state that needs to be saved when we enter a guest
+ * and restored when we exit, but isn't specific to any particular
+ * guest or vcpu.  It also has some scratch fields used by the guest
+ * exit code.
+ */
+struct kvmppc_host_state {
+	ulong host_r1;
+	ulong host_r2;
+	ulong vmhandler;
+	ulong scratch0;
+	ulong scratch1;
+	u8 in_guest;
+};
+
 struct kvmppc_book3s_shadow_vcpu {
 	ulong gpr[14];
 	u32 cr;
@@ -73,17 +89,12 @@ struct kvmppc_book3s_shadow_vcpu {
 	ulong shadow_srr1;
 	ulong fault_dar;
 
-	ulong host_r1;
-	ulong host_r2;
-	ulong handler;
-	ulong scratch0;
-	ulong scratch1;
-	ulong vmhandler;
-	u8 in_guest;
-
 #ifdef CONFIG_PPC_BOOK3S_32
 	u32     sr[16];			/* Guest SRs */
+
+	struct kvmppc_host_state hstate;
 #endif
+
 #ifdef CONFIG_PPC_BOOK3S_64
 	u8 slb_max;			/* highest used guest slb entry */
 	struct  {
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 74126765106a..58f4a18ef60c 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -149,6 +149,7 @@ struct paca_struct {
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
 	/* We use this to store guest state in */
 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
+	struct kvmppc_host_state kvm_hstate;
 #endif
 };
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index faf846131f45..dabfb7346f36 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -198,11 +198,6 @@ int main(void)
 	DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time));
 	DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time));
 	DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-	DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
-	DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, slb));
-	DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, slb_max));
-#endif
 #endif /* CONFIG_PPC64 */
 
 	/* RTAS */
@@ -416,49 +411,54 @@ int main(void)
 	DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
 	DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
-	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
-			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
-	DEFINE(SVCPU_CR, offsetof(struct kvmppc_book3s_shadow_vcpu, cr));
-	DEFINE(SVCPU_XER, offsetof(struct kvmppc_book3s_shadow_vcpu, xer));
-	DEFINE(SVCPU_CTR, offsetof(struct kvmppc_book3s_shadow_vcpu, ctr));
-	DEFINE(SVCPU_LR, offsetof(struct kvmppc_book3s_shadow_vcpu, lr));
-	DEFINE(SVCPU_PC, offsetof(struct kvmppc_book3s_shadow_vcpu, pc));
-	DEFINE(SVCPU_R0, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[0]));
-	DEFINE(SVCPU_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[1]));
-	DEFINE(SVCPU_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[2]));
-	DEFINE(SVCPU_R3, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[3]));
-	DEFINE(SVCPU_R4, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[4]));
-	DEFINE(SVCPU_R5, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[5]));
-	DEFINE(SVCPU_R6, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[6]));
-	DEFINE(SVCPU_R7, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[7]));
-	DEFINE(SVCPU_R8, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[8]));
-	DEFINE(SVCPU_R9, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[9]));
-	DEFINE(SVCPU_R10, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[10]));
-	DEFINE(SVCPU_R11, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[11]));
-	DEFINE(SVCPU_R12, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[12]));
-	DEFINE(SVCPU_R13, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[13]));
-	DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1));
-	DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r2));
-	DEFINE(SVCPU_VMHANDLER, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					 vmhandler));
-	DEFINE(SVCPU_SCRATCH0, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					scratch0));
-	DEFINE(SVCPU_SCRATCH1, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					scratch1));
-	DEFINE(SVCPU_IN_GUEST, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					in_guest));
-	DEFINE(SVCPU_FAULT_DSISR, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					   fault_dsisr));
-	DEFINE(SVCPU_FAULT_DAR, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					 fault_dar));
-	DEFINE(SVCPU_LAST_INST, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					 last_inst));
-	DEFINE(SVCPU_SHADOW_SRR1, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					   shadow_srr1));
-#ifdef CONFIG_PPC_BOOK3S_32
-	DEFINE(SVCPU_SR, offsetof(struct kvmppc_book3s_shadow_vcpu, sr));
+
+#ifdef CONFIG_PPC_BOOK3S_64
+# define SVCPU_FIELD(x, f)	DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f))
+# define HSTATE_FIELD(x, f)	DEFINE(x, offsetof(struct paca_struct, kvm_hstate.f))
+#else	/* 32-bit */
+# define SVCPU_FIELD(x, f)	DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, f))
+# define HSTATE_FIELD(x, f)	DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, hstate.f))
 #endif
-#else
+
+	SVCPU_FIELD(SVCPU_CR, cr);
+	SVCPU_FIELD(SVCPU_XER, xer);
+	SVCPU_FIELD(SVCPU_CTR, ctr);
+	SVCPU_FIELD(SVCPU_LR, lr);
+	SVCPU_FIELD(SVCPU_PC, pc);
+	SVCPU_FIELD(SVCPU_R0, gpr[0]);
+	SVCPU_FIELD(SVCPU_R1, gpr[1]);
+	SVCPU_FIELD(SVCPU_R2, gpr[2]);
+	SVCPU_FIELD(SVCPU_R3, gpr[3]);
+	SVCPU_FIELD(SVCPU_R4, gpr[4]);
+	SVCPU_FIELD(SVCPU_R5, gpr[5]);
+	SVCPU_FIELD(SVCPU_R6, gpr[6]);
+	SVCPU_FIELD(SVCPU_R7, gpr[7]);
+	SVCPU_FIELD(SVCPU_R8, gpr[8]);
+	SVCPU_FIELD(SVCPU_R9, gpr[9]);
+	SVCPU_FIELD(SVCPU_R10, gpr[10]);
+	SVCPU_FIELD(SVCPU_R11, gpr[11]);
+	SVCPU_FIELD(SVCPU_R12, gpr[12]);
+	SVCPU_FIELD(SVCPU_R13, gpr[13]);
+	SVCPU_FIELD(SVCPU_FAULT_DSISR, fault_dsisr);
+	SVCPU_FIELD(SVCPU_FAULT_DAR, fault_dar);
+	SVCPU_FIELD(SVCPU_LAST_INST, last_inst);
+	SVCPU_FIELD(SVCPU_SHADOW_SRR1, shadow_srr1);
+#ifdef CONFIG_PPC_BOOK3S_32
+	SVCPU_FIELD(SVCPU_SR, sr);
+#endif
+#ifdef CONFIG_PPC64
+	SVCPU_FIELD(SVCPU_SLB, slb);
+	SVCPU_FIELD(SVCPU_SLB_MAX, slb_max);
+#endif
+
+	HSTATE_FIELD(HSTATE_HOST_R1, host_r1);
+	HSTATE_FIELD(HSTATE_HOST_R2, host_r2);
+	HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler);
+	HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
+	HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
+	HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
+
+#else /* CONFIG_PPC_BOOK3S */
 	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
 	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
 	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
@@ -468,7 +468,7 @@ int main(void)
 	DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
 	DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
 #endif /* CONFIG_PPC_BOOK3S */
-#endif
+#endif /* CONFIG_KVM */
 
 #ifdef CONFIG_KVM_GUEST
 	DEFINE(KVM_MAGIC_SCRATCH1, offsetof(struct kvm_vcpu_arch_shared,
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index e76472cbf3b5..6da00550afea 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -298,7 +298,7 @@ data_access_check_stab:
 	srdi	r10,r10,60
 	rlwimi	r10,r9,16,0x20
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-	lbz	r9,PACA_KVM_SVCPU+SVCPU_IN_GUEST(r13)
+	lbz	r9,HSTATE_IN_GUEST(r13)
 	rlwimi	r10,r9,8,0x300
 #endif
 	mfcr	r9
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
index 8c5e0e160107..c54b0e30cf3f 100644
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -29,8 +29,7 @@
 #define ULONG_SIZE 		8
 #define FUNC(name) 		GLUE(.,name)
 
-#define GET_SHADOW_VCPU(reg)    \
-        addi    reg, r13, PACA_KVM_SVCPU
+#define GET_SHADOW_VCPU_R13
 
 #define DISABLE_INTERRUPTS	\
 	mfmsr   r0;		\
@@ -43,8 +42,8 @@
 #define ULONG_SIZE              4
 #define FUNC(name)		name
 
-#define GET_SHADOW_VCPU(reg)    \
-        lwz     reg, (THREAD + THREAD_KVM_SVCPU)(r2)
+#define GET_SHADOW_VCPU_R13	\
+	lwz	r13, (THREAD + THREAD_KVM_SVCPU)(r2)
 
 #define DISABLE_INTERRUPTS	\
 	mfmsr   r0;		\
@@ -107,18 +106,12 @@ kvm_start_entry:
 	/* Load non-volatile guest state from the vcpu */
 	VCPU_LOAD_NVGPRS(r4)
 
-	GET_SHADOW_VCPU(r5)
-
-	/* Save R1/R2 in the PACA */
-	PPC_STL	r1, SVCPU_HOST_R1(r5)
-	PPC_STL	r2, SVCPU_HOST_R2(r5)
-
-	/* XXX swap in/out on load? */
-	PPC_LL	r3, VCPU_HIGHMEM_HANDLER(r4)
-	PPC_STL	r3, SVCPU_VMHANDLER(r5)
-
 kvm_start_lightweight:
 
+	GET_SHADOW_VCPU_R13
+	PPC_LL	r3, VCPU_HIGHMEM_HANDLER(r4)
+	PPC_STL	r3, HSTATE_VMHANDLER(r13)
+
 	PPC_LL	r10, VCPU_SHADOW_MSR(r4)	/* r10 = vcpu->arch.shadow_msr */
 
 	DISABLE_INTERRUPTS
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index dd03689fc609..c1f877c4a884 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -36,7 +36,6 @@
 #if defined(CONFIG_PPC_BOOK3S_64)
 
 #define LOAD_SHADOW_VCPU(reg)	GET_PACA(reg)					
-#define SHADOW_VCPU_OFF		PACA_KVM_SVCPU
 #define MSR_NOIRQ		MSR_KERNEL & ~(MSR_IR | MSR_DR)
 #define FUNC(name) 		GLUE(.,name)
 
@@ -66,7 +65,6 @@ kvmppc_skip_Hinterrupt:
 
 #elif defined(CONFIG_PPC_BOOK3S_32)
 
-#define SHADOW_VCPU_OFF		0
 #define MSR_NOIRQ		MSR_KERNEL
 #define FUNC(name)		name
 
@@ -96,14 +94,14 @@ kvmppc_trampoline_\intno:
 	b	kvmppc_resume_\intno		/* Get back original handler */
 
 1:	tophys(r13, r13)
-	stw	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+	stw	r12, HSTATE_SCRATCH1(r13)
 	mfspr	r12, SPRN_SPRG_SCRATCH1
-	stw	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
-	lbz	r12, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
+	stw	r12, HSTATE_SCRATCH0(r13)
+	lbz	r12, HSTATE_IN_GUEST(r13)
 	cmpwi	r12, KVM_GUEST_MODE_NONE
 	bne	..kvmppc_handler_hasmagic_\intno
 	/* No KVM guest? Then jump back to the Linux handler! */
-	lwz	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+	lwz	r12, HSTATE_SCRATCH1(r13)
 	b	2b
 
 	/* Now we know we're handling a KVM guest */
@@ -146,8 +144,8 @@ INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_ALTIVEC
  *
  * R12            = free
  * R13            = Shadow VCPU (PACA)
- * SVCPU.SCRATCH0 = guest R12
- * SVCPU.SCRATCH1 = guest CR
+ * HSTATE.SCRATCH0 = guest R12
+ * HSTATE.SCRATCH1 = guest CR
  * SPRG_SCRATCH0  = guest R13
  *
  */
@@ -159,9 +157,9 @@ kvmppc_handler_skip_ins:
 	mtsrr0	r12
 
 	/* Clean up all state */
-	lwz	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+	lwz	r12, HSTATE_SCRATCH1(r13)
 	mtcr	r12
-	PPC_LL	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
+	PPC_LL	r12, HSTATE_SCRATCH0(r13)
 	GET_SCRATCH0(r13)
 
 	/* And get back into the code */
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index 4a623eb28a53..1cc25e8c0cf1 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -22,7 +22,7 @@
 #if defined(CONFIG_PPC_BOOK3S_64)
 
 #define GET_SHADOW_VCPU(reg)    \
-	addi    reg, r13, PACA_KVM_SVCPU
+	mr	reg, r13
 
 #elif defined(CONFIG_PPC_BOOK3S_32)
 
@@ -71,6 +71,10 @@ kvmppc_handler_trampoline_enter:
 	/* r3 = shadow vcpu */
 	GET_SHADOW_VCPU(r3)
 
+	/* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */
+	PPC_STL	r1, HSTATE_HOST_R1(r3)
+	PPC_STL	r2, HSTATE_HOST_R2(r3)
+
 	/* Move SRR0 and SRR1 into the respective regs */
 	PPC_LL  r9, SVCPU_PC(r3)
 	mtsrr0	r9
@@ -78,7 +82,7 @@ kvmppc_handler_trampoline_enter:
 
 	/* Activate guest mode, so faults get handled by KVM */
 	li	r11, KVM_GUEST_MODE_GUEST
-	stb	r11, SVCPU_IN_GUEST(r3)
+	stb	r11, HSTATE_IN_GUEST(r3)
 
 	/* Switch to guest segment. This is subarch specific. */
 	LOAD_GUEST_SEGMENTS
@@ -132,30 +136,30 @@ kvmppc_interrupt:
 	 *
 	 * SPRG_SCRATCH0  = guest R13
 	 * R12            = exit handler id
-	 * R13            = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64]
-	 * SVCPU.SCRATCH0 = guest R12
-	 * SVCPU.SCRATCH1 = guest CR
+	 * R13            = shadow vcpu (32-bit) or PACA (64-bit)
+	 * HSTATE.SCRATCH0 = guest R12
+	 * HSTATE.SCRATCH1 = guest CR
 	 *
 	 */
 
 	/* Save registers */
 
-	PPC_STL	r0, (SHADOW_VCPU_OFF + SVCPU_R0)(r13)
-	PPC_STL	r1, (SHADOW_VCPU_OFF + SVCPU_R1)(r13)
-	PPC_STL	r2, (SHADOW_VCPU_OFF + SVCPU_R2)(r13)
-	PPC_STL	r3, (SHADOW_VCPU_OFF + SVCPU_R3)(r13)
-	PPC_STL	r4, (SHADOW_VCPU_OFF + SVCPU_R4)(r13)
-	PPC_STL	r5, (SHADOW_VCPU_OFF + SVCPU_R5)(r13)
-	PPC_STL	r6, (SHADOW_VCPU_OFF + SVCPU_R6)(r13)
-	PPC_STL	r7, (SHADOW_VCPU_OFF + SVCPU_R7)(r13)
-	PPC_STL	r8, (SHADOW_VCPU_OFF + SVCPU_R8)(r13)
-	PPC_STL	r9, (SHADOW_VCPU_OFF + SVCPU_R9)(r13)
-	PPC_STL	r10, (SHADOW_VCPU_OFF + SVCPU_R10)(r13)
-	PPC_STL	r11, (SHADOW_VCPU_OFF + SVCPU_R11)(r13)
+	PPC_STL	r0, SVCPU_R0(r13)
+	PPC_STL	r1, SVCPU_R1(r13)
+	PPC_STL	r2, SVCPU_R2(r13)
+	PPC_STL	r3, SVCPU_R3(r13)
+	PPC_STL	r4, SVCPU_R4(r13)
+	PPC_STL	r5, SVCPU_R5(r13)
+	PPC_STL	r6, SVCPU_R6(r13)
+	PPC_STL	r7, SVCPU_R7(r13)
+	PPC_STL	r8, SVCPU_R8(r13)
+	PPC_STL	r9, SVCPU_R9(r13)
+	PPC_STL	r10, SVCPU_R10(r13)
+	PPC_STL	r11, SVCPU_R11(r13)
 
 	/* Restore R1/R2 so we can handle faults */
-	PPC_LL	r1, (SHADOW_VCPU_OFF + SVCPU_HOST_R1)(r13)
-	PPC_LL	r2, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13)
+	PPC_LL	r1, HSTATE_HOST_R1(r13)
+	PPC_LL	r2, HSTATE_HOST_R2(r13)
 
 	/* Save guest PC and MSR */
 #ifdef CONFIG_PPC64
@@ -171,17 +175,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
 1:	mfsrr0	r3
 	mfsrr1	r4
 2:
-	PPC_STL	r3, (SHADOW_VCPU_OFF + SVCPU_PC)(r13)
-	PPC_STL	r4, (SHADOW_VCPU_OFF + SVCPU_SHADOW_SRR1)(r13)
+	PPC_STL	r3, SVCPU_PC(r13)
+	PPC_STL	r4, SVCPU_SHADOW_SRR1(r13)
 
 	/* Get scratch'ed off registers */
 	GET_SCRATCH0(r9)
-	PPC_LL	r8, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
-	lwz	r7, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+	PPC_LL	r8, HSTATE_SCRATCH0(r13)
+	lwz	r7, HSTATE_SCRATCH1(r13)
 
-	PPC_STL	r9, (SHADOW_VCPU_OFF + SVCPU_R13)(r13)
-	PPC_STL	r8, (SHADOW_VCPU_OFF + SVCPU_R12)(r13)
-	stw	r7, (SHADOW_VCPU_OFF + SVCPU_CR)(r13)
+	PPC_STL	r9, SVCPU_R13(r13)
+	PPC_STL	r8, SVCPU_R12(r13)
+	stw	r7, SVCPU_CR(r13)
 
 	/* Save more register state  */
 
@@ -191,11 +195,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
 	mfctr	r8
 	mflr	r9
 
-	stw	r5, (SHADOW_VCPU_OFF + SVCPU_XER)(r13)
-	PPC_STL	r6, (SHADOW_VCPU_OFF + SVCPU_FAULT_DAR)(r13)
-	stw	r7, (SHADOW_VCPU_OFF + SVCPU_FAULT_DSISR)(r13)
-	PPC_STL	r8, (SHADOW_VCPU_OFF + SVCPU_CTR)(r13)
-	PPC_STL	r9, (SHADOW_VCPU_OFF + SVCPU_LR)(r13)
+	stw	r5, SVCPU_XER(r13)
+	PPC_STL	r6, SVCPU_FAULT_DAR(r13)
+	stw	r7, SVCPU_FAULT_DSISR(r13)
+	PPC_STL	r8, SVCPU_CTR(r13)
+	PPC_STL	r9, SVCPU_LR(r13)
 
 	/*
 	 * In order for us to easily get the last instruction,
@@ -225,7 +229,7 @@ ld_last_inst:
 	/* Set guest mode to 'jump over instruction' so if lwz faults
 	 * we'll just continue at the next IP. */
 	li	r9, KVM_GUEST_MODE_SKIP
-	stb	r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
+	stb	r9, HSTATE_IN_GUEST(r13)
 
 	/*    1) enable paging for data */
 	mfmsr	r9
@@ -239,13 +243,13 @@ ld_last_inst:
 	sync
 
 #endif
-	stw	r0, (SHADOW_VCPU_OFF + SVCPU_LAST_INST)(r13)
+	stw	r0, SVCPU_LAST_INST(r13)
 
 no_ld_last_inst:
 
 	/* Unset guest mode */
 	li	r9, KVM_GUEST_MODE_NONE
-	stb	r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
+	stb	r9, HSTATE_IN_GUEST(r13)
 
 	/* Switch back to host MMU */
 	LOAD_HOST_SEGMENTS
@@ -255,7 +259,7 @@ no_ld_last_inst:
 	 * R1       = host R1
 	 * R2       = host R2
 	 * R12      = exit handler id
-	 * R13      = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64]
+	 * R13      = shadow vcpu (32-bit) or PACA (64-bit)
 	 * SVCPU.*  = guest *
 	 *
 	 */
@@ -265,7 +269,7 @@ no_ld_last_inst:
 	ori	r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME	/* Enable paging */
 	mtsrr1	r7
 	/* Load highmem handler address */
-	PPC_LL	r8, (SHADOW_VCPU_OFF + SVCPU_VMHANDLER)(r13)
+	PPC_LL	r8, HSTATE_VMHANDLER(r13)
 	mtsrr0	r8
 
 	RFI

From de56a948b9182fbcf92cb8212f114de096c2d574 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:21:34 +0000
Subject: [PATCH 108/143] KVM: PPC: Add support for Book3S processors in
 hypervisor mode

This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode.  Running in hypervisor mode
lets the guest use the processor's supervisor mode, so the guest can
execute privileged instructions and access privileged registers itself
without trapping to the host.  This gives excellent performance, but it
does mean that KVM cannot emulate a processor architecture other than
the one the hardware implements.

This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses.  That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification.  In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
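
As a hedged sketch of the userspace side (not part of this series; the
handle_papr_hcall() dispatcher and the vcpu_fd/vcpu_run_mmap variables are
invented for illustration), the VMM services the hypercall when KVM_RUN
returns with the new exit reason and writes the result back before
re-entering the guest:

	struct kvm_run *run = vcpu_run_mmap;	/* mmap()ed kvm_run for this vcpu */

	if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
		/* dispatch to a hypothetical VMM-side hypercall handler */
		run->papr_hcall.ret = handle_papr_hcall(run->papr_hcall.nr,
							run->papr_hcall.args);
		ioctl(vcpu_fd, KVM_RUN, 0);	/* re-enter the guest */
	}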

Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.

This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.

With the guest running in supervisor mode, most exceptions go straight
to the guest.  We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest.  Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.

We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.

In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount.  Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.

The POWER7 processor has a restriction that all threads in a core have
to be in the same partition.  MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest.  At present we require the host and guest to run
in single-thread mode because of this hardware restriction.

This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA).  We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management.  This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
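
For illustration of the userspace consequence (an assumption about how a VMM
would satisfy the requirement, not code from the series; vm_fd and ram_size
are assumed to exist): the guest RAM backing can come from hugetlbfs or
MAP_HUGETLB, whose default huge page size is typically 16MB on these
machines, and is then registered with KVM as usual.

	void *ram = mmap(NULL, ram_size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	struct kvm_userspace_memory_region region = {
		.slot		 = 0,
		.guest_phys_addr = 0,
		.memory_size	 = ram_size,
		.userspace_addr	 = (unsigned long)ram,
	};
	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);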

This also adds a few new exports needed by the book3s_hv code.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/virtual/kvm/api.txt         |  17 +
 arch/powerpc/include/asm/exception-64s.h  |  19 +-
 arch/powerpc/include/asm/kvm_asm.h        |   4 +
 arch/powerpc/include/asm/kvm_book3s.h     | 137 +++-
 arch/powerpc/include/asm/kvm_book3s_64.h  |   2 +
 arch/powerpc/include/asm/kvm_book3s_asm.h |  12 +
 arch/powerpc/include/asm/kvm_booke.h      |   4 +
 arch/powerpc/include/asm/kvm_host.h       |  53 +-
 arch/powerpc/include/asm/kvm_ppc.h        |   6 +
 arch/powerpc/include/asm/mmu-hash64.h     |  10 +-
 arch/powerpc/include/asm/paca.h           |   2 +
 arch/powerpc/include/asm/reg.h            |   4 +
 arch/powerpc/kernel/asm-offsets.c         |  79 +++
 arch/powerpc/kernel/exceptions-64s.S      |  60 +-
 arch/powerpc/kernel/process.c             |   3 +
 arch/powerpc/kernel/setup-common.c        |   3 +
 arch/powerpc/kernel/smp.c                 |   1 +
 arch/powerpc/kvm/Kconfig                  |  37 +-
 arch/powerpc/kvm/Makefile                 |  17 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c       | 258 +++++++
 arch/powerpc/kvm/book3s_exports.c         |   5 +
 arch/powerpc/kvm/book3s_hv.c              | 445 ++++++++++++
 arch/powerpc/kvm/book3s_hv_interrupts.S   | 136 ++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 806 ++++++++++++++++++++++
 arch/powerpc/kvm/book3s_segment.S         |  34 +-
 arch/powerpc/kvm/powerpc.c                |  22 +-
 arch/powerpc/kvm/trace.h                  |   2 +-
 include/linux/kvm.h                       |   6 +
 28 files changed, 2103 insertions(+), 81 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_64_mmu_hv.c
 create mode 100644 arch/powerpc/kvm/book3s_hv.c
 create mode 100644 arch/powerpc/kvm/book3s_hv_interrupts.S
 create mode 100644 arch/powerpc/kvm/book3s_hv_rmhandlers.S

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index b2511360b8f5..e8875fef3eb8 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1532,6 +1532,23 @@ Userspace can now handle the hypercall and when it's done modify the gprs as
 necessary. Upon guest entry all guest GPRs will then be replaced by the values
 in this struct.
 
+		/* KVM_EXIT_PAPR_HCALL */
+		struct {
+			__u64 nr;
+			__u64 ret;
+			__u64 args[9];
+		} papr_hcall;
+
+This is used on 64-bit PowerPC when emulating a pSeries partition,
+e.g. with the 'pseries' machine type in qemu.  It occurs when the
+guest does a hypercall using the 'sc 1' instruction.  The 'nr' field
+contains the hypercall number (from the guest R3), and 'args' contains
+the arguments (from the guest R4 - R12).  Userspace should put the
+return code in 'ret' and any extra returned values in args[].
+The possible hypercalls are defined in the Power Architecture Platform
+Requirements (PAPR) document available from www.power.org (free
+developer registration required to access it).
+
 		/* Fix the size of the union. */
 		char padding[256];
 	};
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 296c9b66c04a..69435da8f2ba 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -134,6 +134,17 @@ do_kvm_##n:								\
 #define KVM_HANDLER_SKIP(area, h, n)
 #endif
 
+#ifdef CONFIG_KVM_BOOK3S_PR
+#define KVMTEST_PR(n)			__KVMTEST(n)
+#define KVM_HANDLER_PR(area, h, n)	__KVM_HANDLER(area, h, n)
+#define KVM_HANDLER_PR_SKIP(area, h, n)	__KVM_HANDLER_SKIP(area, h, n)
+
+#else
+#define KVMTEST_PR(n)
+#define KVM_HANDLER_PR(area, h, n)
+#define KVM_HANDLER_PR_SKIP(area, h, n)
+#endif
+
 #define NOTEST(n)
 
 /*
@@ -210,7 +221,7 @@ label##_pSeries:					\
 	HMT_MEDIUM;					\
 	SET_SCRATCH0(r13);		/* save r13 */		\
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
-				 EXC_STD, KVMTEST, vec)
+				 EXC_STD, KVMTEST_PR, vec)
 
 #define STD_EXCEPTION_HV(loc, vec, label)		\
 	. = loc;					\
@@ -227,8 +238,8 @@ label##_hv:						\
 	beq	masked_##h##interrupt
 #define _SOFTEN_TEST(h)	__SOFTEN_TEST(h)
 
-#define SOFTEN_TEST(vec)						\
-	KVMTEST(vec);							\
+#define SOFTEN_TEST_PR(vec)						\
+	KVMTEST_PR(vec);						\
 	_SOFTEN_TEST(EXC_STD)
 
 #define SOFTEN_TEST_HV(vec)						\
@@ -248,7 +259,7 @@ label##_hv:						\
 	.globl label##_pSeries;						\
 label##_pSeries:							\
 	_MASKABLE_EXCEPTION_PSERIES(vec, label,				\
-				    EXC_STD, SOFTEN_TEST)
+				    EXC_STD, SOFTEN_TEST_PR)
 
 #define MASKABLE_EXCEPTION_HV(loc, vec, label)				\
 	. = loc;							\
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 0951b17f4eb5..7b1f0e0fc653 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -64,8 +64,12 @@
 #define BOOK3S_INTERRUPT_PROGRAM	0x700
 #define BOOK3S_INTERRUPT_FP_UNAVAIL	0x800
 #define BOOK3S_INTERRUPT_DECREMENTER	0x900
+#define BOOK3S_INTERRUPT_HV_DECREMENTER	0x980
 #define BOOK3S_INTERRUPT_SYSCALL	0xc00
 #define BOOK3S_INTERRUPT_TRACE		0xd00
+#define BOOK3S_INTERRUPT_H_DATA_STORAGE	0xe00
+#define BOOK3S_INTERRUPT_H_INST_STORAGE	0xe20
+#define BOOK3S_INTERRUPT_H_EMUL_ASSIST	0xe40
 #define BOOK3S_INTERRUPT_PERFMON	0xf00
 #define BOOK3S_INTERRUPT_ALTIVEC	0xf20
 #define BOOK3S_INTERRUPT_VSX		0xf40
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 480fff6090db..5537c45d626c 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -116,6 +116,7 @@ extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
 extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr);
 extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
@@ -127,10 +128,12 @@ extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern int kvmppc_mmu_hpte_sysinit(void);
 extern void kvmppc_mmu_hpte_sysexit(void);
+extern int kvmppc_mmu_hv_init(void);
 
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 			   bool upper, u32 val);
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
@@ -140,6 +143,7 @@ extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 extern void kvmppc_handler_lowmem_trampoline(void);
 extern void kvmppc_handler_trampoline_enter(void);
 extern void kvmppc_rmcall(ulong srr0, ulong srr1);
+extern void kvmppc_hv_entry_trampoline(void);
 extern void kvmppc_load_up_fpu(void);
 extern void kvmppc_load_up_altivec(void);
 extern void kvmppc_load_up_vsx(void);
@@ -151,6 +155,19 @@ static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
 }
 
+extern void kvm_return_point(void);
+
+/* Also add subarch specific defines */
+
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+#include <asm/kvm_book3s_32.h>
+#endif
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include <asm/kvm_book3s_64.h>
+#endif
+
+#ifdef CONFIG_KVM_BOOK3S_PR
+
 static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
 {
 	return to_book3s(vcpu)->hior;
@@ -165,16 +182,6 @@ static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
 		vcpu->arch.shared->int_pending = 0;
 }
 
-static inline ulong dsisr(void)
-{
-	ulong r;
-	asm ( "mfdsisr %0 " : "=r" (r) );
-	return r;
-}
-
-extern void kvm_return_point(void);
-static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu);
-
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 {
 	if ( num < 14 ) {
@@ -281,6 +288,108 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 
 	return crit;
 }
+#else /* CONFIG_KVM_BOOK3S_PR */
+
+static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
+			unsigned long pending_now, unsigned long old_pending)
+{
+	/* Recalculate LPCR:MER based on the presence of
+	 * a pending external interrupt
+	 */
+	if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &pending_now) ||
+	    test_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &pending_now))
+		vcpu->arch.lpcr |= LPCR_MER;
+	else
+		vcpu->arch.lpcr &= ~((u64)LPCR_MER);
+}
+
+static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
+{
+	vcpu->arch.gpr[num] = val;
+}
+
+static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
+{
+	return vcpu->arch.gpr[num];
+}
+
+static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+{
+	vcpu->arch.cr = val;
+}
+
+static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cr;
+}
+
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+{
+	vcpu->arch.xer = val;
+}
+
+static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.xer;
+}
+
+static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.ctr = val;
+}
+
+static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.ctr;
+}
+
+static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.lr = val;
+}
+
+static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.lr;
+}
+
+static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.pc = val;
+}
+
+static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.pc;
+}
+
+static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
+{
+	ulong pc = kvmppc_get_pc(vcpu);
+
+	/* Load the instruction manually if it failed to do so in the
+	 * exit path */
+	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
+		kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
+
+	return vcpu->arch.last_inst;
+}
+
+static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.fault_dar;
+}
+
+static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+#endif
 
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
@@ -289,12 +398,4 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 
 #define INS_DCBZ			0x7c0007ec
 
-/* Also add subarch specific defines */
-
-#ifdef CONFIG_PPC_BOOK3S_32
-#include <asm/kvm_book3s_32.h>
-#else
-#include <asm/kvm_book3s_64.h>
-#endif
-
 #endif /* __ASM_KVM_BOOK3S_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 4cadd612d575..5f73388ea0af 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -20,9 +20,11 @@
 #ifndef __ASM_KVM_BOOK3S_64_H__
 #define __ASM_KVM_BOOK3S_64_H__
 
+#ifdef CONFIG_KVM_BOOK3S_PR
 static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
 {
 	return &get_paca()->shadow_vcpu;
 }
+#endif
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 312617529864..b7b039532fbc 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -70,10 +70,22 @@ kvmppc_resume_\intno:
 struct kvmppc_host_state {
 	ulong host_r1;
 	ulong host_r2;
+	ulong host_msr;
 	ulong vmhandler;
 	ulong scratch0;
 	ulong scratch1;
 	u8 in_guest;
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	struct kvm_vcpu *kvm_vcpu;
+	u64 dabr;
+	u64 host_mmcr[3];
+	u32 host_pmc[6];
+	u64 host_purr;
+	u64 host_spurr;
+	u64 host_dscr;
+	u64 dec_expires;
+#endif
 };
 
 struct kvmppc_book3s_shadow_vcpu {
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
index 9c9ba3d59b1b..a90e09188777 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -93,4 +93,8 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 	return vcpu->arch.fault_dear;
 }
 
+static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.shared->msr;
+}
 #endif /* __ASM_KVM_BOOKE_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 069eb9fc6c41..4a3f790d5fc4 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -33,7 +33,9 @@
 /* memory slots that are not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
 
+#ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+#endif
 
 /* We don't currently support large pages. */
 #define KVM_HPAGE_GFN_SHIFT(x)	0
@@ -133,7 +135,26 @@ struct kvmppc_exit_timing {
 	};
 };
 
+struct kvmppc_pginfo {
+	unsigned long pfn;
+	atomic_t refcnt;
+};
+
 struct kvm_arch {
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	unsigned long hpt_virt;
+	unsigned long ram_npages;
+	unsigned long ram_psize;
+	unsigned long ram_porder;
+	struct kvmppc_pginfo *ram_pginfo;
+	unsigned int lpid;
+	unsigned int host_lpid;
+	unsigned long host_lpcr;
+	unsigned long sdr1;
+	unsigned long host_sdr1;
+	int tlbie_lock;
+	unsigned short last_vcpu[NR_CPUS];
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
 
 struct kvmppc_pte {
@@ -190,7 +211,7 @@ struct kvm_vcpu_arch {
 	ulong rmcall;
 	ulong host_paca_phys;
 	struct kvmppc_slb slb[64];
-	int slb_max;		/* # valid entries in slb[] */
+	int slb_max;		/* 1 + index of last valid entry in slb[] */
 	int slb_nr;		/* total number of entries in SLB */
 	struct kvmppc_mmu mmu;
 #endif
@@ -212,7 +233,7 @@ struct kvm_vcpu_arch {
 #endif
 
 #ifdef CONFIG_VSX
-	u64 vsr[32];
+	u64 vsr[64];
 #endif
 
 #ifdef CONFIG_PPC_BOOK3S
@@ -220,18 +241,24 @@ struct kvm_vcpu_arch {
 	u32 qpr[32];
 #endif
 
-#ifdef CONFIG_BOOKE
 	ulong pc;
 	ulong ctr;
 	ulong lr;
 
 	ulong xer;
 	u32 cr;
-#endif
 
 #ifdef CONFIG_PPC_BOOK3S
 	ulong hflags;
 	ulong guest_owned_ext;
+	ulong purr;
+	ulong spurr;
+	ulong lpcr;
+	ulong dscr;
+	ulong amr;
+	ulong uamor;
+	u32 ctrl;
+	ulong dabr;
 #endif
 	u32 vrsave; /* also USPRG0 */
 	u32 mmucr;
@@ -270,6 +297,9 @@ struct kvm_vcpu_arch {
 	u32 dbcr1;
 	u32 dbsr;
 
+	u64 mmcr[3];
+	u32 pmc[6];
+
 #ifdef CONFIG_KVM_EXIT_TIMING
 	struct mutex exit_timing_lock;
 	struct kvmppc_exit_timing timing_exit;
@@ -284,8 +314,12 @@ struct kvm_vcpu_arch {
 	struct dentry *debugfs_exit_timing;
 #endif
 
+#ifdef CONFIG_PPC_BOOK3S
+	ulong fault_dar;
+	u32 fault_dsisr;
+#endif
+
 #ifdef CONFIG_BOOKE
-	u32 last_inst;
 	ulong fault_dear;
 	ulong fault_esr;
 	ulong queued_dear;
@@ -300,16 +334,25 @@ struct kvm_vcpu_arch {
 	u8 dcr_is_write;
 	u8 osi_needed;
 	u8 osi_enabled;
+	u8 hcall_needed;
 
 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 
 	struct hrtimer dec_timer;
 	struct tasklet_struct tasklet;
 	u64 dec_jiffies;
+	u64 dec_expires;
 	unsigned long pending_exceptions;
+	u16 last_cpu;
+	u32 last_inst;
+	int trap;
 	struct kvm_vcpu_arch_shared *shared;
 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
 	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	struct kvm_vcpu_arch_shared shregs;
+#endif
 };
 
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 48b7ab76de2d..0dafd53c30ed 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -112,6 +112,12 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
 extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
 
+extern long kvmppc_alloc_hpt(struct kvm *kvm);
+extern void kvmppc_free_hpt(struct kvm *kvm);
+extern long kvmppc_prepare_vrma(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem);
+extern void kvmppc_map_vrma(struct kvm *kvm,
+			    struct kvm_userspace_memory_region *mem);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index d865bd909c7d..b445e0af4c2b 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -90,13 +90,19 @@ extern char initial_stab[];
 
 #define HPTE_R_PP0		ASM_CONST(0x8000000000000000)
 #define HPTE_R_TS		ASM_CONST(0x4000000000000000)
+#define HPTE_R_KEY_HI		ASM_CONST(0x3000000000000000)
 #define HPTE_R_RPN_SHIFT	12
-#define HPTE_R_RPN		ASM_CONST(0x3ffffffffffff000)
-#define HPTE_R_FLAGS		ASM_CONST(0x00000000000003ff)
+#define HPTE_R_RPN		ASM_CONST(0x0ffffffffffff000)
 #define HPTE_R_PP		ASM_CONST(0x0000000000000003)
 #define HPTE_R_N		ASM_CONST(0x0000000000000004)
+#define HPTE_R_G		ASM_CONST(0x0000000000000008)
+#define HPTE_R_M		ASM_CONST(0x0000000000000010)
+#define HPTE_R_I		ASM_CONST(0x0000000000000020)
+#define HPTE_R_W		ASM_CONST(0x0000000000000040)
+#define HPTE_R_WIMG		ASM_CONST(0x0000000000000078)
 #define HPTE_R_C		ASM_CONST(0x0000000000000080)
 #define HPTE_R_R		ASM_CONST(0x0000000000000100)
+#define HPTE_R_KEY_LO		ASM_CONST(0x0000000000000e00)
 
 #define HPTE_V_1TB_SEG		ASM_CONST(0x4000000000000000)
 #define HPTE_V_VRMA_MASK	ASM_CONST(0x4001ffffff000000)
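
The HPTE_R_* masks added above are easier to read with a worked composition. A minimal sketch, not part of the patch (rpn_to_hpte_r() is an illustrative name), building the second HPTE doubleword for a coherent, referenced and changed page, much as kvmppc_map_vrma() does later in this patch; protection (PP) bits would be OR'd in separately:

/* Illustrative only: compose an HPTE second doubleword from a page
 * frame number using the masks defined above (WIMG = M only). */
static inline unsigned long rpn_to_hpte_r(unsigned long pfn)
{
	unsigned long r = (pfn << HPTE_R_RPN_SHIFT) & HPTE_R_RPN;

	return r | HPTE_R_M | HPTE_R_R | HPTE_R_C;
}
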
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 58f4a18ef60c..a6da12859959 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -147,8 +147,10 @@ struct paca_struct {
 	struct dtl_entry *dtl_curr;	/* pointer corresponding to dtl_ridx */
 
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_PR
 	/* We use this to store guest state in */
 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
+#endif
 	struct kvmppc_host_state kvm_hstate;
 #endif
 };
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index d879a6b91635..36a611b398c5 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -189,6 +189,9 @@
 #define SPRN_CTR	0x009	/* Count Register */
 #define SPRN_DSCR	0x11
 #define SPRN_CFAR	0x1c	/* Come From Address Register */
+#define SPRN_AMR	0x1d	/* Authority Mask Register */
+#define SPRN_UAMOR	0x9d	/* User Authority Mask Override Register */
+#define SPRN_AMOR	0x15d	/* Authority Mask Override Register */
 #define SPRN_ACOP	0x1F	/* Available Coprocessor Register */
 #define SPRN_CTRLF	0x088
 #define SPRN_CTRLT	0x098
@@ -252,6 +255,7 @@
 #define   LPCR_RMI     0x00000002      /* real mode is cache inhibit */
 #define   LPCR_HDICE   0x00000001      /* Hyp Decr enable (HV,PR,EE) */
 #define SPRN_LPID	0x13F	/* Logical Partition Identifier */
+#define   LPID_RSVD	0x3ff		/* Reserved LPID for partn switching */
 #define	SPRN_HMER	0x150	/* Hardware m? error recovery */
 #define	SPRN_HMEER	0x151	/* Hardware m? enable error recovery */
 #define	SPRN_HEIR	0x153	/* Hypervisor Emulated Instruction Register */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index dabfb7346f36..936267462cae 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -187,6 +187,7 @@ int main(void)
 	DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1));
 	DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int));
 	DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
+	DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
 	DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
 	DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
 #endif /* CONFIG_PPC_STD_MMU_64 */
@@ -392,6 +393,29 @@ int main(void)
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
 	DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
+	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
+	DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr));
+#ifdef CONFIG_ALTIVEC
+	DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr));
+	DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr));
+#endif
+#ifdef CONFIG_VSX
+	DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr));
+#endif
+	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
+	DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
+	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
+	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
+	DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.shregs.msr));
+	DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.shregs.srr0));
+	DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.shregs.srr1));
+	DEFINE(VCPU_SPRG0, offsetof(struct kvm_vcpu, arch.shregs.sprg0));
+	DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.shregs.sprg1));
+	DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
+	DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
+#endif
 	DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
 	DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
 	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
@@ -403,17 +427,60 @@ int main(void)
 	DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
 
 	/* book3s */
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
+	DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
+	DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
+	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
+	DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
+	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
+	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
+	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
+	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
+#endif
 #ifdef CONFIG_PPC_BOOK3S
+	DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
+	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
 	DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
 	DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
+	DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
+	DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
+	DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
+	DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
+	DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
+	DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
+	DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
 	DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
 	DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
 	DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
 	DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
+	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
+	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
+	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
+	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
+	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
+	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
+	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
+	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
+	DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
+	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
+	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
+	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
+	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
+			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
+	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
+	DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
+	DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
 
 #ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_KVM_BOOK3S_PR
 # define SVCPU_FIELD(x, f)	DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f))
+#else
+# define SVCPU_FIELD(x, f)
+#endif
 # define HSTATE_FIELD(x, f)	DEFINE(x, offsetof(struct paca_struct, kvm_hstate.f))
 #else	/* 32-bit */
 # define SVCPU_FIELD(x, f)	DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, f))
@@ -453,11 +520,23 @@ int main(void)
 
 	HSTATE_FIELD(HSTATE_HOST_R1, host_r1);
 	HSTATE_FIELD(HSTATE_HOST_R2, host_r2);
+	HSTATE_FIELD(HSTATE_HOST_MSR, host_msr);
 	HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler);
 	HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
 	HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
 	HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
+	HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
+	HSTATE_FIELD(HSTATE_PMC, host_pmc);
+	HSTATE_FIELD(HSTATE_PURR, host_purr);
+	HSTATE_FIELD(HSTATE_SPURR, host_spurr);
+	HSTATE_FIELD(HSTATE_DSCR, host_dscr);
+	HSTATE_FIELD(HSTATE_DABR, dabr);
+	HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
 #else /* CONFIG_PPC_BOOK3S */
 	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
 	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6da00550afea..163c041cec24 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -87,14 +87,14 @@ data_access_not_stab:
 END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
 #endif
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
-				 KVMTEST, 0x300)
+				 KVMTEST_PR, 0x300)
 
 	. = 0x380
 	.globl data_access_slb_pSeries
 data_access_slb_pSeries:
 	HMT_MEDIUM
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_DAR
 #ifdef __DISABLED__
@@ -125,7 +125,7 @@ data_access_slb_pSeries:
 instruction_access_slb_pSeries:
 	HMT_MEDIUM
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
 #ifdef __DISABLED__
@@ -153,32 +153,32 @@ instruction_access_slb_pSeries:
 hardware_interrupt_pSeries:
 hardware_interrupt_hv:
 	BEGIN_FTR_SECTION
-		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
-					    EXC_STD, SOFTEN_TEST)
-		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
-	FTR_SECTION_ELSE
 		_MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
 					    EXC_HV, SOFTEN_TEST_HV)
 		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
-	ALT_FTR_SECTION_END_IFCLR(CPU_FTR_HVMODE_206)
+	FTR_SECTION_ELSE
+		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
+					    EXC_STD, SOFTEN_TEST_PR)
+		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE_206)
 
 	STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x600)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x600)
 
 	STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x700)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x700)
 
 	STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x800)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800)
 
 	MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer)
 	MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer)
 
 	STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xa00)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xa00)
 
 	STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xb00)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xb00)
 
 	. = 0xc00
 	.globl	system_call_pSeries
@@ -219,7 +219,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
 	b	.
 
 	STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xd00)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xd00)
 
 	/* At 0xe??? we have a bunch of hypervisor exceptions, we branch
 	 * out of line to handle them
@@ -254,23 +254,23 @@ vsx_unavailable_pSeries_1:
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
 #endif /* CONFIG_CBE_RAS */
 
 	STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
 #endif /* CONFIG_CBE_RAS */
 
 	STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x1700)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x1700)
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
 #endif /* CONFIG_CBE_RAS */
 
 	. = 0x3000
@@ -297,7 +297,7 @@ data_access_check_stab:
 	mfspr	r9,SPRN_DSISR
 	srdi	r10,r10,60
 	rlwimi	r10,r9,16,0x20
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_PR
 	lbz	r9,HSTATE_IN_GUEST(r13)
 	rlwimi	r10,r9,8,0x300
 #endif
@@ -316,11 +316,11 @@ do_stab_bolted_pSeries:
 	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
 #endif /* CONFIG_POWER4_ONLY */
 
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
-	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x400)
-	KVM_HANDLER(PACA_EXSLB, EXC_STD, 0x480)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x900)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300)
+	KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400)
+	KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900)
 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
 
 	.align	7
@@ -336,11 +336,11 @@ do_stab_bolted_pSeries:
 
 	/* moved from 0xf00 */
 	STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf00)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf00)
 	STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf20)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20)
 	STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf40)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
 
 /*
  * An interrupt came in while soft-disabled; clear EE in SRR1,
@@ -417,7 +417,11 @@ slb_miss_user_pseries:
 /* KVM's trampoline code needs to be close to the interrupt handlers */
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_PR
 #include "../kvm/book3s_rmhandlers.S"
+#else
+#include "../kvm/book3s_hv_rmhandlers.S"
+#endif
 #endif
 
 	.align	7
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 60ac2a9251db..ec2d0edeb134 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -96,6 +96,7 @@ void flush_fp_to_thread(struct task_struct *tsk)
 		preempt_enable();
 	}
 }
+EXPORT_SYMBOL_GPL(flush_fp_to_thread);
 
 void enable_kernel_fp(void)
 {
@@ -145,6 +146,7 @@ void flush_altivec_to_thread(struct task_struct *tsk)
 		preempt_enable();
 	}
 }
+EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
 #endif /* CONFIG_ALTIVEC */
 
 #ifdef CONFIG_VSX
@@ -186,6 +188,7 @@ void flush_vsx_to_thread(struct task_struct *tsk)
 		preempt_enable();
 	}
 }
+EXPORT_SYMBOL_GPL(flush_vsx_to_thread);
 #endif /* CONFIG_VSX */
 
 #ifdef CONFIG_SPE
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 79fca2651b65..22051ef04bd9 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -375,6 +375,9 @@ void __init check_for_initrd(void)
 
 int threads_per_core, threads_shift;
 cpumask_t threads_core_mask;
+EXPORT_SYMBOL_GPL(threads_per_core);
+EXPORT_SYMBOL_GPL(threads_shift);
+EXPORT_SYMBOL_GPL(threads_core_mask);
 
 static void __init cpu_init_thread_core_maps(int tpc)
 {
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8ebc6700b98d..09a85a9045d6 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -243,6 +243,7 @@ void smp_send_reschedule(int cpu)
 	if (likely(smp_ops))
 		smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE);
 }
+EXPORT_SYMBOL_GPL(smp_send_reschedule);
 
 void arch_send_call_function_single_ipi(int cpu)
 {
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index b7baff78f90c..5d9b78ebbaa6 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,7 +20,6 @@ config KVM
 	bool
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
-	select KVM_MMIO
 
 config KVM_BOOK3S_HANDLER
 	bool
@@ -28,16 +27,22 @@ config KVM_BOOK3S_HANDLER
 config KVM_BOOK3S_32_HANDLER
 	bool
 	select KVM_BOOK3S_HANDLER
+	select KVM_MMIO
 
 config KVM_BOOK3S_64_HANDLER
 	bool
 	select KVM_BOOK3S_HANDLER
 
+config KVM_BOOK3S_PR
+	bool
+	select KVM_MMIO
+
 config KVM_BOOK3S_32
 	tristate "KVM support for PowerPC book3s_32 processors"
 	depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT
 	select KVM
 	select KVM_BOOK3S_32_HANDLER
+	select KVM_BOOK3S_PR
 	---help---
 	  Support running unmodified book3s_32 guest kernels
 	  in virtual machines on book3s_32 host processors.
@@ -50,8 +55,8 @@ config KVM_BOOK3S_32
 config KVM_BOOK3S_64
 	tristate "KVM support for PowerPC book3s_64 processors"
 	depends on EXPERIMENTAL && PPC_BOOK3S_64
-	select KVM
 	select KVM_BOOK3S_64_HANDLER
+	select KVM
 	---help---
 	  Support running unmodified book3s_64 and book3s_32 guest kernels
 	  in virtual machines on book3s_64 host processors.
@@ -61,10 +66,37 @@ config KVM_BOOK3S_64
 
 	  If unsure, say N.
 
+config KVM_BOOK3S_64_HV
+	bool "KVM support for POWER7 using hypervisor mode in host"
+	depends on KVM_BOOK3S_64
+	---help---
+	  Support running unmodified book3s_64 guest kernels in
+	  virtual machines on POWER7 processors that have hypervisor
+	  mode available to the host.
+
+	  If you say Y here, KVM will use the hardware virtualization
+	  facilities of POWER7 (and later) processors, meaning that
+	  guest operating systems will run at full hardware speed
+	  using supervisor and user modes.  However, this also means
+	  that KVM is not usable under PowerVM (pHyp), is only usable
+	  on POWER7 (or later) processors, and can only emulate
+	  POWER5+, POWER6 and POWER7 processors.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
+config KVM_BOOK3S_64_PR
+	def_bool y
+	depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV
+	select KVM_BOOK3S_PR
+
 config KVM_440
 	bool "KVM support for PowerPC 440 processors"
 	depends on EXPERIMENTAL && 44x
 	select KVM
+	select KVM_MMIO
 	---help---
 	  Support running unmodified 440 guest kernels in virtual machines on
 	  440 host processors.
@@ -89,6 +121,7 @@ config KVM_E500
 	bool "KVM support for PowerPC E500 processors"
 	depends on EXPERIMENTAL && E500
 	select KVM
+	select KVM_MMIO
 	---help---
 	  Support running unmodified E500 guest kernels in virtual machines on
 	  E500 host processors.
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index bf9854fa7f63..8a435a6da665 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -38,11 +38,10 @@ kvm-e500-objs := \
 	e500_emulate.o
 kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs)
 
-kvm-book3s_64-objs := \
-	$(common-objs-y) \
+kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
+	../../../virt/kvm/coalesced_mmio.o \
 	fpu.o \
 	book3s_paired_singles.o \
-	book3s.o \
 	book3s_pr.o \
 	book3s_emulate.o \
 	book3s_interrupts.o \
@@ -50,6 +49,18 @@ kvm-book3s_64-objs := \
 	book3s_64_mmu_host.o \
 	book3s_64_mmu.o \
 	book3s_32_mmu.o
+
+kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
+	book3s_hv.o \
+	book3s_hv_interrupts.o \
+	book3s_64_mmu_hv.o
+
+kvm-book3s_64-objs := \
+	../../../virt/kvm/kvm_main.o \
+	powerpc.o \
+	emulate.o \
+	book3s.o \
+	$(kvm-book3s_64-objs-y)
 kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs)
 
 kvm-book3s_32-objs := \
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
new file mode 100644
index 000000000000..4a4fbec61a17
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -0,0 +1,258 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cputable.h>
+
+/* For now use fixed-size 16MB page table */
+#define HPT_ORDER	24
+#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
+#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+
+/* Pages in the VRMA are 16MB pages */
+#define VRMA_PAGE_ORDER	24
+#define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
+
+#define NR_LPIDS	(LPID_RSVD + 1)
+unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
+
+long kvmppc_alloc_hpt(struct kvm *kvm)
+{
+	unsigned long hpt;
+	unsigned long lpid;
+
+	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
+			       HPT_ORDER - PAGE_SHIFT);
+	if (!hpt) {
+		pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
+		return -ENOMEM;
+	}
+	kvm->arch.hpt_virt = hpt;
+
+	do {
+		lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
+		if (lpid >= NR_LPIDS) {
+			pr_err("kvm_alloc_hpt: No LPIDs free\n");
+			free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
+			return -ENOMEM;
+		}
+	} while (test_and_set_bit(lpid, lpid_inuse));
+
+	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
+	kvm->arch.lpid = lpid;
+	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
+	kvm->arch.host_lpid = mfspr(SPRN_LPID);
+	kvm->arch.host_lpcr = mfspr(SPRN_LPCR);
+
+	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
+	return 0;
+}
+
+void kvmppc_free_hpt(struct kvm *kvm)
+{
+	unsigned long i;
+	struct kvmppc_pginfo *pginfo;
+
+	clear_bit(kvm->arch.lpid, lpid_inuse);
+	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
+
+	if (kvm->arch.ram_pginfo) {
+		pginfo = kvm->arch.ram_pginfo;
+		kvm->arch.ram_pginfo = NULL;
+		for (i = 0; i < kvm->arch.ram_npages; ++i)
+			put_page(pfn_to_page(pginfo[i].pfn));
+		kfree(pginfo);
+	}
+}
+
+static unsigned long user_page_size(unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	unsigned long size = PAGE_SIZE;
+
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, addr);
+	if (vma)
+		size = vma_kernel_pagesize(vma);
+	up_read(&current->mm->mmap_sem);
+	return size;
+}
+
+static pfn_t hva_to_pfn(unsigned long addr)
+{
+	struct page *page[1];
+	int npages;
+
+	might_sleep();
+
+	npages = get_user_pages_fast(addr, 1, 1, page);
+
+	if (unlikely(npages != 1))
+		return 0;
+
+	return page_to_pfn(page[0]);
+}
+
+long kvmppc_prepare_vrma(struct kvm *kvm,
+			 struct kvm_userspace_memory_region *mem)
+{
+	unsigned long psize, porder;
+	unsigned long i, npages;
+	struct kvmppc_pginfo *pginfo;
+	pfn_t pfn;
+	unsigned long hva;
+
+	/* First see what page size we have */
+	psize = user_page_size(mem->userspace_addr);
+	/* For now, only allow 16MB pages */
+	if (psize != 1ul << VRMA_PAGE_ORDER || (mem->memory_size & (psize - 1))) {
+		pr_err("bad psize=%lx memory_size=%llx @ %llx\n",
+		       psize, mem->memory_size, mem->userspace_addr);
+		return -EINVAL;
+	}
+	porder = __ilog2(psize);
+
+	npages = mem->memory_size >> porder;
+	pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), GFP_KERNEL);
+	if (!pginfo) {
+		pr_err("kvmppc_prepare_vrma: couldn't alloc %lu bytes\n",
+		       npages * sizeof(struct kvmppc_pginfo));
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < npages; ++i) {
+		hva = mem->userspace_addr + (i << porder);
+		if (user_page_size(hva) != psize)
+			goto err;
+		pfn = hva_to_pfn(hva);
+		if (pfn == 0) {
+			pr_err("oops, no pfn for hva %lx\n", hva);
+			goto err;
+		}
+		if (pfn & ((1ul << (porder - PAGE_SHIFT)) - 1)) {
+			pr_err("oops, unaligned pfn %llx\n", pfn);
+			put_page(pfn_to_page(pfn));
+			goto err;
+		}
+		pginfo[i].pfn = pfn;
+	}
+
+	kvm->arch.ram_npages = npages;
+	kvm->arch.ram_psize = psize;
+	kvm->arch.ram_porder = porder;
+	kvm->arch.ram_pginfo = pginfo;
+
+	return 0;
+
+ err:
+	kfree(pginfo);
+	return -EINVAL;
+}
+
+void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
+{
+	unsigned long i;
+	unsigned long npages = kvm->arch.ram_npages;
+	unsigned long pfn;
+	unsigned long *hpte;
+	unsigned long hash;
+	struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo;
+
+	if (!pginfo)
+		return;
+
+	/* VRMA can't be > 1TB */
+	if (npages > 1ul << (40 - kvm->arch.ram_porder))
+		npages = 1ul << (40 - kvm->arch.ram_porder);
+	/* Can't use more than 1 HPTE per HPTEG */
+	if (npages > HPT_NPTEG)
+		npages = HPT_NPTEG;
+
+	for (i = 0; i < npages; ++i) {
+		pfn = pginfo[i].pfn;
+		/* can't use hpt_hash since va > 64 bits */
+		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
+		/*
+		 * We assume that the hash table is empty and no
+		 * vcpus are using it at this stage.  Since we create
+		 * at most one HPTE per HPTEG, we just assume entry 7
+		 * is available and use it.
+		 */
+		hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7));
+		hpte += 7 * 2;
+		/* HPTE low word - RPN, protection, etc. */
+		hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
+			HPTE_R_M | PP_RWXX;
+		wmb();
+		hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
+			(i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
+			HPTE_V_LARGE | HPTE_V_VALID;
+	}
+}
+
+int kvmppc_mmu_hv_init(void)
+{
+	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
+		return -EINVAL;
+	memset(lpid_inuse, 0, sizeof(lpid_inuse));
+	set_bit(mfspr(SPRN_LPID), lpid_inuse);
+	set_bit(LPID_RSVD, lpid_inuse);
+
+	return 0;
+}
+
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+}
+
+static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
+{
+	kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
+}
+
+static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+				struct kvmppc_pte *gpte, bool data)
+{
+	return -ENOENT;
+}
+
+void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
+
+	vcpu->arch.slb_nr = 32;		/* Assume POWER7 for now */
+
+	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
+
+	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
+}
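
The bolted VRMA entries created in kvmppc_map_vrma() above rely on fixed hash-table geometry: HPT_ORDER = 24 gives a 16MB table of 128-byte PTEGs, and exactly one slot (entry 7) is used per PTEG. A minimal sketch of the addressing arithmetic, using only the constants defined in this file (hpte_for_vrma_page() is an illustrative helper, not part of the patch):

/* Illustrative only: return the address of the bolted HPTE that
 * kvmppc_map_vrma() uses for 16MB-page index i.  Each PTEG is 128
 * bytes (hash << 7); slot 7 of 8 is 7 * 16 bytes into the PTEG,
 * i.e. 7 * 2 unsigned longs. */
static unsigned long *hpte_for_vrma_page(struct kvm *kvm, unsigned long i)
{
	unsigned long hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;

	return (unsigned long *)(kvm->arch.hpt_virt + (hash << 7)) + 7 * 2;
}
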
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
index f94fd9a4c69a..88c8f26add02 100644
--- a/arch/powerpc/kvm/book3s_exports.c
+++ b/arch/powerpc/kvm/book3s_exports.c
@@ -20,6 +20,9 @@
 #include <linux/module.h>
 #include <asm/kvm_book3s.h>
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline);
+#else
 EXPORT_SYMBOL_GPL(kvmppc_handler_trampoline_enter);
 EXPORT_SYMBOL_GPL(kvmppc_handler_lowmem_trampoline);
 EXPORT_SYMBOL_GPL(kvmppc_rmcall);
@@ -30,3 +33,5 @@ EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
 #ifdef CONFIG_VSX
 EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx);
 #endif
+#endif
+
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
new file mode 100644
index 000000000000..60b7300568c8
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Paul Mackerras <paulus@au1.ibm.com>
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * Description: KVM functions specific to running on Book 3S
+ * processors in hypervisor mode (specifically POWER7 and later).
+ *
+ * This file is derived from arch/powerpc/kvm/book3s.c,
+ * by Alexander Graf <agraf@suse.de>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/cpumask.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <asm/lppaca.h>
+#include <asm/processor.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+/* #define EXIT_DEBUG */
+/* #define EXIT_DEBUG_SIMPLE */
+/* #define EXIT_DEBUG_INT */
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	local_paca->kvm_hstate.kvm_vcpu = vcpu;
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
+{
+	u64 now;
+	unsigned long dec_nsec;
+
+	now = get_tb();
+	if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
+		kvmppc_core_queue_dec(vcpu);
+	if (vcpu->arch.pending_exceptions)
+		return;
+	if (vcpu->arch.dec_expires != ~(u64)0) {
+		dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
+			tb_ticks_per_sec;
+		hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
+			      HRTIMER_MODE_REL);
+	}
+
+	kvm_vcpu_block(vcpu);
+	vcpu->stat.halt_wakeup++;
+
+	if (vcpu->arch.dec_expires != ~(u64)0)
+		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+}
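
The hrtimer arming in kvmppc_vcpu_block() above converts a timebase delta into nanoseconds. A self-contained restatement (tb_to_ns_delta() is an illustrative name; tb_ticks_per_sec is the host timebase frequency):

/* Illustrative only: timebase ticks -> nanoseconds, as used to arm
 * dec_timer above.  Multiplying before dividing keeps precision for
 * small deltas, at the cost of overflow for very large ones. */
static inline u64 tb_to_ns_delta(u64 dec_expires, u64 now, u64 tb_ticks_per_sec)
{
	return (dec_expires - now) * NSEC_PER_SEC / tb_ticks_per_sec;
}
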
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+{
+	vcpu->arch.shregs.msr = msr;
+}
+
+void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
+{
+	vcpu->arch.pvr = pvr;
+}
+
+void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
+	pr_err("pc  = %.16lx  msr = %.16llx  trap = %x\n",
+	       vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
+	for (r = 0; r < 16; ++r)
+		pr_err("r%2d = %.16lx  r%d = %.16lx\n",
+		       r, kvmppc_get_gpr(vcpu, r),
+		       r+16, kvmppc_get_gpr(vcpu, r+16));
+	pr_err("ctr = %.16lx  lr  = %.16lx\n",
+	       vcpu->arch.ctr, vcpu->arch.lr);
+	pr_err("srr0 = %.16llx srr1 = %.16llx\n",
+	       vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
+	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
+	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
+	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
+	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
+	pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
+	       vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
+	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
+	pr_err("fault dar = %.16lx dsisr = %.8x\n",
+	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
+	for (r = 0; r < vcpu->arch.slb_max; ++r)
+		pr_err("  ESID = %.16llx VSID = %.16llx\n",
+		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
+	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
+	       vcpu->arch.lpcr, vcpu->kvm->arch.sdr1,
+	       vcpu->arch.last_inst);
+}
+
+static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			      struct task_struct *tsk)
+{
+	int r = RESUME_HOST;
+
+	vcpu->stat.sum_exits++;
+
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	run->ready_for_interrupt_injection = 1;
+	switch (vcpu->arch.trap) {
+	/* We're good on these - the host merely wanted to get our attention */
+	case BOOK3S_INTERRUPT_HV_DECREMENTER:
+		vcpu->stat.dec_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_EXTERNAL:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PERFMON:
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PROGRAM:
+	{
+		ulong flags;
+		/*
+		 * Normally program interrupts are delivered directly
+		 * to the guest by the hardware, but we can get here
+		 * as a result of a hypervisor emulation interrupt
+		 * (e40) getting turned into a 700 by BML RTAS.
+		 */
+		flags = vcpu->arch.shregs.msr & 0x1f0000ull;
+		kvmppc_core_queue_program(vcpu, flags);
+		r = RESUME_GUEST;
+		break;
+	}
+	case BOOK3S_INTERRUPT_SYSCALL:
+	{
+		/* hcall - punt to userspace */
+		int i;
+
+		if (vcpu->arch.shregs.msr & MSR_PR) {
+			/* sc 1 from userspace - reflect to guest syscall */
+			kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
+			r = RESUME_GUEST;
+			break;
+		}
+		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
+		for (i = 0; i < 9; ++i)
+			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
+		run->exit_reason = KVM_EXIT_PAPR_HCALL;
+		vcpu->arch.hcall_needed = 1;
+		r = RESUME_HOST;
+		break;
+	}
+	/*
+	 * We get these next two if the guest does a bad real-mode access,
+	 * as we have enabled VRMA (virtualized real mode area) mode in the
+	 * LPCR.  We just generate an appropriate DSI/ISI to the guest.
+	 */
+	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+		vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
+		vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_H_INST_STORAGE:
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
+					0x08000000);
+		r = RESUME_GUEST;
+		break;
+	/*
+	 * This occurs if the guest executes an illegal instruction.
+	 * We just generate a program interrupt to the guest, since
+	 * we don't emulate any guest instructions at this stage.
+	 */
+	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+		kvmppc_core_queue_program(vcpu, 0x80000);
+		r = RESUME_GUEST;
+		break;
+	default:
+		kvmppc_dump_regs(vcpu);
+		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+			vcpu->arch.trap, kvmppc_get_pc(vcpu),
+			vcpu->arch.shregs.msr);
+		r = RESUME_HOST;
+		BUG();
+		break;
+	}
+
+
+	if (!(r & RESUME_HOST)) {
+		/* To avoid clobbering exit_reason, only check for signals if
+		 * we aren't already exiting to userspace for some other
+		 * reason. */
+		if (signal_pending(tsk)) {
+			vcpu->stat.signal_exits++;
+			run->exit_reason = KVM_EXIT_INTR;
+			r = -EINTR;
+		} else {
+			kvmppc_core_deliver_interrupts(vcpu);
+		}
+	}
+
+	return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	int i;
+
+	memset(sregs, 0, sizeof(struct kvm_sregs));
+	sregs->pvr = vcpu->arch.pvr;
+
+	for (i = 0; i < vcpu->arch.slb_max; i++) {
+		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
+		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
+	}
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	int i, j;
+
+	kvmppc_set_pvr(vcpu, sregs->pvr);
+
+	j = 0;
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
+			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
+			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
+			++j;
+		}
+	}
+	vcpu->arch.slb_max = j;
+
+	return 0;
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+	if (cpu_has_feature(CPU_FTR_HVMODE_206))
+		return 0;
+	return -EIO;
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	struct kvm_vcpu *vcpu;
+	int err = -ENOMEM;
+	unsigned long lpcr;
+
+	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+	if (!vcpu)
+		goto out;
+
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_vcpu;
+
+	vcpu->arch.shared = &vcpu->arch.shregs;
+	vcpu->arch.last_cpu = -1;
+	vcpu->arch.mmcr[0] = MMCR0_FC;
+	vcpu->arch.ctrl = CTRL_RUNLATCH;
+	/* default to host PVR, since we can't spoof it */
+	vcpu->arch.pvr = mfspr(SPRN_PVR);
+	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+
+	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
+	lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE;
+	vcpu->arch.lpcr = lpcr;
+
+	kvmppc_mmu_book3s_hv_init(vcpu);
+
+	return vcpu;
+
+free_vcpu:
+	kfree(vcpu);
+out:
+	return ERR_PTR(err);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	kvm_vcpu_uninit(vcpu);
+	kfree(vcpu);
+}
+
+extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+
+int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	u64 now;
+
+	if (signal_pending(current)) {
+		run->exit_reason = KVM_EXIT_INTR;
+		return -EINTR;
+	}
+
+	flush_fp_to_thread(current);
+	flush_altivec_to_thread(current);
+	flush_vsx_to_thread(current);
+	preempt_disable();
+
+	/*
+	 * Make sure we are running on thread 0, and that
+	 * secondary threads are offline.
+	 * XXX we should also block attempts to bring any
+	 * secondary threads online.
+	 */
+	if (threads_per_core > 1) {
+		int cpu = smp_processor_id();
+		int thr = cpu_thread_in_core(cpu);
+
+		if (thr)
+			goto out;
+		while (++thr < threads_per_core)
+			if (cpu_online(cpu + thr))
+				goto out;
+	}
+
+	kvm_guest_enter();
+
+	__kvmppc_vcore_entry(NULL, vcpu);
+
+	kvm_guest_exit();
+
+	preempt_enable();
+	kvm_resched(vcpu);
+
+	now = get_tb();
+	/* cancel pending dec exception if dec is positive */
+	if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
+		kvmppc_core_dequeue_dec(vcpu);
+
+	return kvmppc_handle_exit(run, vcpu, current);
+
+ out:
+	preempt_enable();
+	return -EBUSY;
+}
+
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
+		return kvmppc_prepare_vrma(kvm, mem);
+	return 0;
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
+		kvmppc_map_vrma(kvm, mem);
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	long r;
+
+	/* Allocate hashed page table */
+	r = kvmppc_alloc_hpt(kvm);
+
+	return r;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+	kvmppc_free_hpt(kvm);
+}
+
+/* These are stubs for now */
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+}
+
+/* We don't need to emulate any privileged instructions or dcbz */
+int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                           unsigned int inst, int *advance)
+{
+	return EMULATE_FAIL;
+}
+
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+	return EMULATE_FAIL;
+}
+
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+	return EMULATE_FAIL;
+}
+
+static int kvmppc_book3s_hv_init(void)
+{
+	int r;
+
+	r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+
+	if (r)
+		return r;
+
+	r = kvmppc_mmu_hv_init();
+
+	return r;
+}
+
+static void kvmppc_book3s_hv_exit(void)
+{
+	kvm_exit();
+}
+
+module_init(kvmppc_book3s_hv_init);
+module_exit(kvmppc_book3s_hv_exit);
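
kvmppc_handle_exit() above punts hypercalls to userspace by filling run->papr_hcall and returning with KVM_EXIT_PAPR_HCALL. A rough sketch of the consuming side, assuming headers from a kernel with this series applied and a conventional KVM_RUN loop, using only the fields this patch fills in (nr and args); actual hcall handling and result delivery are left out:

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: one iteration of a userspace vcpu loop that picks
 * up the PAPR hcall exit generated in kvmppc_handle_exit() above. */
static void run_vcpu_once(int vcpu_fd, struct kvm_run *run)
{
	if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
		perror("KVM_RUN");
		return;
	}
	if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
		/* r3 -> nr, r4..r12 -> args[0..8], as copied above */
		printf("hcall %#llx arg0=%#llx\n",
		       (unsigned long long)run->papr_hcall.nr,
		       (unsigned long long)run->papr_hcall.args[0]);
		/* hcall handling and return-value delivery omitted */
	}
}
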
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
new file mode 100644
index 000000000000..532afaf19841
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -0,0 +1,136 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_interrupts.S, which is:
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+#include <asm/ppc-opcode.h>
+
+/*****************************************************************************
+ *                                                                           *
+ *     Guest entry / exit code that is in kernel module memory (vmalloc)     *
+ *                                                                           *
+ ****************************************************************************/
+
+/* Registers:
+ *  r4: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcore_entry)
+
+	/* Write correct stack frame */
+	mflr	r0
+	std	r0,PPC_LR_STKOFF(r1)
+
+	/* Save host state to the stack */
+	stdu	r1, -SWITCH_FRAME_SIZE(r1)
+
+	/* Save non-volatile registers (r14 - r31) */
+	SAVE_NVGPRS(r1)
+
+	/* Save host DSCR */
+	mfspr	r3, SPRN_DSCR
+	std	r3, HSTATE_DSCR(r13)
+
+	/* Save host DABR */
+	mfspr	r3, SPRN_DABR
+	std	r3, HSTATE_DABR(r13)
+
+	/* Hard-disable interrupts */
+	mfmsr   r10
+	std	r10, HSTATE_HOST_MSR(r13)
+	rldicl  r10,r10,48,1
+	rotldi  r10,r10,16
+	mtmsrd  r10,1
+
+	/* Save host PMU registers and load guest PMU registers */
+	/* R4 is live here (vcpu pointer) but not r3 or r5 */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */
+	isync
+	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
+	lbz	r5, LPPACA_PMCINUSE(r3)
+	cmpwi	r5, 0
+	beq	31f			/* skip if not */
+	mfspr	r5, SPRN_MMCR1
+	mfspr	r6, SPRN_MMCRA
+	std	r7, HSTATE_MMCR(r13)
+	std	r5, HSTATE_MMCR + 8(r13)
+	std	r6, HSTATE_MMCR + 16(r13)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r5, SPRN_PMC2
+	mfspr	r6, SPRN_PMC3
+	mfspr	r7, SPRN_PMC4
+	mfspr	r8, SPRN_PMC5
+	mfspr	r9, SPRN_PMC6
+	stw	r3, HSTATE_PMC(r13)
+	stw	r5, HSTATE_PMC + 4(r13)
+	stw	r6, HSTATE_PMC + 8(r13)
+	stw	r7, HSTATE_PMC + 12(r13)
+	stw	r8, HSTATE_PMC + 16(r13)
+	stw	r9, HSTATE_PMC + 20(r13)
+31:
+
+	/*
+	 * Put whatever is in the decrementer into the
+	 * hypervisor decrementer.
+	 */
+	mfspr	r8,SPRN_DEC
+	mftb	r7
+	mtspr	SPRN_HDEC,r8
+	extsw	r8,r8
+	add	r8,r8,r7
+	std	r8,HSTATE_DECEXP(r13)
+
+	/* Jump to partition switch code */
+	bl	.kvmppc_hv_entry_trampoline
+	nop
+
+/*
+ * We return here in virtual mode after the guest exits
+ * with something that we can't handle in real mode.
+ * Interrupts are enabled again at this point.
+ */
+
+.global kvmppc_handler_highmem
+kvmppc_handler_highmem:
+
+	/*
+	 * Register usage at this point:
+	 *
+	 * R1       = host R1
+	 * R2       = host R2
+	 * R12      = exit handler id
+	 * R13      = PACA
+	 */
+
+	/* Restore non-volatile host registers (r14 - r31) */
+	REST_NVGPRS(r1)
+
+	addi    r1, r1, SWITCH_FRAME_SIZE
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
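
__kvmppc_vcore_entry above parks the host decrementer value in HDEC and records in HSTATE_DECEXP the timebase at which the host decrementer would have expired: the current timebase plus the sign-extended DEC. A self-contained restatement (host_dec_expiry() is an illustrative name, not part of the patch):

/* Illustrative only: the value stored at HSTATE_DECEXP(r13) above.
 * The extsw in the assembly sign-extends DEC, which may be negative. */
static inline unsigned long long host_dec_expiry(unsigned int dec_raw,
						 unsigned long long tb)
{
	long long dec = (int)dec_raw;

	return tb + dec;
}
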
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
new file mode 100644
index 000000000000..9af264840b98
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -0,0 +1,806 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_rmhandlers.S and other files, which are:
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+
+/*****************************************************************************
+ *                                                                           *
+ *        Real Mode handlers that need to be in the linear mapping           *
+ *                                                                           *
+ ****************************************************************************/
+
+#define SHADOW_VCPU_OFF		PACA_KVM_SVCPU
+
+	.globl	kvmppc_skip_interrupt
+kvmppc_skip_interrupt:
+	mfspr	r13,SPRN_SRR0
+	addi	r13,r13,4
+	mtspr	SPRN_SRR0,r13
+	GET_SCRATCH0(r13)
+	rfid
+	b	.
+
+	.globl	kvmppc_skip_Hinterrupt
+kvmppc_skip_Hinterrupt:
+	mfspr	r13,SPRN_HSRR0
+	addi	r13,r13,4
+	mtspr	SPRN_HSRR0,r13
+	GET_SCRATCH0(r13)
+	hrfid
+	b	.
+
+/*
+ * Call kvmppc_handler_trampoline_enter in real mode.
+ * Must be called with interrupts hard-disabled.
+ *
+ * Input Registers:
+ *
+ * LR = return address to continue at after eventually re-enabling MMU
+ */
+_GLOBAL(kvmppc_hv_entry_trampoline)
+	mfmsr	r10
+	LOAD_REG_ADDR(r5, kvmppc_hv_entry)
+	li	r0,MSR_RI
+	andc	r0,r10,r0
+	li	r6,MSR_IR | MSR_DR
+	andc	r6,r10,r6
+	mtmsrd	r0,1		/* clear RI in MSR */
+	mtsrr0	r5
+	mtsrr1	r6
+	RFI
+
+#define ULONG_SIZE 		8
+#define VCPU_GPR(n)		(VCPU_GPRS + (n * ULONG_SIZE))
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+.global kvmppc_hv_entry
+kvmppc_hv_entry:
+
+	/* Required state:
+	 *
+	 * R4 = vcpu pointer
+	 * MSR = ~IR|DR
+	 * R13 = PACA
+	 * R1 = host R1
+	 * all other volatile GPRS = free
+	 */
+	mflr	r0
+	std	r0, HSTATE_VMHANDLER(r13)
+
+	ld	r14, VCPU_GPR(r14)(r4)
+	ld	r15, VCPU_GPR(r15)(r4)
+	ld	r16, VCPU_GPR(r16)(r4)
+	ld	r17, VCPU_GPR(r17)(r4)
+	ld	r18, VCPU_GPR(r18)(r4)
+	ld	r19, VCPU_GPR(r19)(r4)
+	ld	r20, VCPU_GPR(r20)(r4)
+	ld	r21, VCPU_GPR(r21)(r4)
+	ld	r22, VCPU_GPR(r22)(r4)
+	ld	r23, VCPU_GPR(r23)(r4)
+	ld	r24, VCPU_GPR(r24)(r4)
+	ld	r25, VCPU_GPR(r25)(r4)
+	ld	r26, VCPU_GPR(r26)(r4)
+	ld	r27, VCPU_GPR(r27)(r4)
+	ld	r28, VCPU_GPR(r28)(r4)
+	ld	r29, VCPU_GPR(r29)(r4)
+	ld	r30, VCPU_GPR(r30)(r4)
+	ld	r31, VCPU_GPR(r31)(r4)
+
+	/* Load guest PMU registers */
+	/* R4 is live here (vcpu pointer) */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	isync
+	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
+	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
+	lwz	r6, VCPU_PMC + 8(r4)
+	lwz	r7, VCPU_PMC + 12(r4)
+	lwz	r8, VCPU_PMC + 16(r4)
+	lwz	r9, VCPU_PMC + 20(r4)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r5
+	mtspr	SPRN_PMC3, r6
+	mtspr	SPRN_PMC4, r7
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+	ld	r3, VCPU_MMCR(r4)
+	ld	r5, VCPU_MMCR + 8(r4)
+	ld	r6, VCPU_MMCR + 16(r4)
+	mtspr	SPRN_MMCR1, r5
+	mtspr	SPRN_MMCRA, r6
+	mtspr	SPRN_MMCR0, r3
+	isync
+
+	/* Load up FP, VMX and VSX registers */
+	bl	kvmppc_load_fp
+
+	/* Switch DSCR to guest value */
+	ld	r5, VCPU_DSCR(r4)
+	mtspr	SPRN_DSCR, r5
+
+	/*
+	 * Set the decrementer to the guest decrementer.
+	 */
+	ld	r8,VCPU_DEC_EXPIRES(r4)
+	mftb	r7
+	subf	r3,r7,r8
+	mtspr	SPRN_DEC,r3
+	stw	r3,VCPU_DEC(r4)
+
+	ld	r5, VCPU_SPRG0(r4)
+	ld	r6, VCPU_SPRG1(r4)
+	ld	r7, VCPU_SPRG2(r4)
+	ld	r8, VCPU_SPRG3(r4)
+	mtspr	SPRN_SPRG0, r5
+	mtspr	SPRN_SPRG1, r6
+	mtspr	SPRN_SPRG2, r7
+	mtspr	SPRN_SPRG3, r8
+
+	/* Save R1 in the PACA */
+	std	r1, HSTATE_HOST_R1(r13)
+
+	/* Load up DAR and DSISR */
+	ld	r5, VCPU_DAR(r4)
+	lwz	r6, VCPU_DSISR(r4)
+	mtspr	SPRN_DAR, r5
+	mtspr	SPRN_DSISR, r6
+
+	/* Set partition DABR */
+	li	r5,3
+	ld	r6,VCPU_DABR(r4)
+	mtspr	SPRN_DABRX,r5
+	mtspr	SPRN_DABR,r6
+
+	/* Restore AMR and UAMOR, set AMOR to all 1s */
+	ld	r5,VCPU_AMR(r4)
+	ld	r6,VCPU_UAMOR(r4)
+	li	r7,-1
+	mtspr	SPRN_AMR,r5
+	mtspr	SPRN_UAMOR,r6
+	mtspr	SPRN_AMOR,r7
+
+	/* Clear out SLB */
+	li	r6,0
+	slbmte	r6,r6
+	slbia
+	ptesync
+
+	/* Switch to guest partition. */
+	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
+	ld	r6,KVM_SDR1(r9)
+	lwz	r7,KVM_LPID(r9)
+	li	r0,LPID_RSVD		/* switch to reserved LPID */
+	mtspr	SPRN_LPID,r0
+	ptesync
+	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+	mtspr	SPRN_LPID,r7
+	isync
+	ld	r8,VCPU_LPCR(r4)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+	/* Check if HDEC expires soon */
+	mfspr	r3,SPRN_HDEC
+	cmpwi	r3,10
+	li	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	mr	r9,r4
+	blt	hdec_soon
+
+	/*
+	 * Invalidate the TLB if we could possibly have stale TLB
+	 * entries for this partition on this core due to the use
+	 * of tlbiel.
+	 */
+	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
+	lwz	r5,VCPU_VCPUID(r4)
+	lhz	r6,PACAPACAINDEX(r13)
+	lhz	r8,VCPU_LAST_CPU(r4)
+	sldi	r7,r6,1			/* see if this is the same vcpu */
+	add	r7,r7,r9		/* as last ran on this pcpu */
+	lhz	r0,KVM_LAST_VCPU(r7)
+	cmpw	r6,r8			/* on the same cpu core as last time? */
+	bne	3f
+	cmpw	r0,r5			/* same vcpu as this core last ran? */
+	beq	1f
+3:	sth	r6,VCPU_LAST_CPU(r4)	/* if not, invalidate partition TLB */
+	sth	r5,KVM_LAST_VCPU(r7)
+	li	r6,128
+	mtctr	r6
+	li	r7,0x800		/* IS field = 0b10 */
+	ptesync
+2:	tlbiel	r7
+	addi	r7,r7,0x1000
+	bdnz	2b
+	ptesync
+1:
+
+	/* Save purr/spurr */
+	mfspr	r5,SPRN_PURR
+	mfspr	r6,SPRN_SPURR
+	std	r5,HSTATE_PURR(r13)
+	std	r6,HSTATE_SPURR(r13)
+	ld	r7,VCPU_PURR(r4)
+	ld	r8,VCPU_SPURR(r4)
+	mtspr	SPRN_PURR,r7
+	mtspr	SPRN_SPURR,r8
+
+	/* Load up guest SLB entries */
+	lwz	r5,VCPU_SLB_MAX(r4)
+	cmpwi	r5,0
+	beq	9f
+	mtctr	r5
+	addi	r6,r4,VCPU_SLB
+1:	ld	r8,VCPU_SLB_E(r6)
+	ld	r9,VCPU_SLB_V(r6)
+	slbmte	r9,r8
+	addi	r6,r6,VCPU_SLB_SIZE
+	bdnz	1b
+9:
+
+	/* Restore state of CTRL run bit; assume 1 on entry */
+	lwz	r5,VCPU_CTRL(r4)
+	andi.	r5,r5,1
+	bne	4f
+	mfspr	r6,SPRN_CTRLF
+	clrrdi	r6,r6,1
+	mtspr	SPRN_CTRLT,r6
+4:
+	ld	r6, VCPU_CTR(r4)
+	lwz	r7, VCPU_XER(r4)
+
+	mtctr	r6
+	mtxer	r7
+
+	/* Move SRR0 and SRR1 into the respective regs */
+	ld	r6, VCPU_SRR0(r4)
+	ld	r7, VCPU_SRR1(r4)
+	mtspr	SPRN_SRR0, r6
+	mtspr	SPRN_SRR1, r7
+
+	ld	r10, VCPU_PC(r4)
+
+	ld	r11, VCPU_MSR(r4)	/* r11 = vcpu->arch.msr & ~MSR_HV */
+	rldicl	r11, r11, 63 - MSR_HV_LG, 1
+	rotldi	r11, r11, 1 + MSR_HV_LG
+	ori	r11, r11, MSR_ME
+
+fast_guest_return:
+	mtspr	SPRN_HSRR0,r10
+	mtspr	SPRN_HSRR1,r11
+
+	/* Activate guest mode, so faults get handled by KVM */
+	li	r9, KVM_GUEST_MODE_GUEST
+	stb	r9, HSTATE_IN_GUEST(r13)
+
+	/* Enter guest */
+
+	ld	r5, VCPU_LR(r4)
+	lwz	r6, VCPU_CR(r4)
+	mtlr	r5
+	mtcr	r6
+
+	ld	r0, VCPU_GPR(r0)(r4)
+	ld	r1, VCPU_GPR(r1)(r4)
+	ld	r2, VCPU_GPR(r2)(r4)
+	ld	r3, VCPU_GPR(r3)(r4)
+	ld	r5, VCPU_GPR(r5)(r4)
+	ld	r6, VCPU_GPR(r6)(r4)
+	ld	r7, VCPU_GPR(r7)(r4)
+	ld	r8, VCPU_GPR(r8)(r4)
+	ld	r9, VCPU_GPR(r9)(r4)
+	ld	r10, VCPU_GPR(r10)(r4)
+	ld	r11, VCPU_GPR(r11)(r4)
+	ld	r12, VCPU_GPR(r12)(r4)
+	ld	r13, VCPU_GPR(r13)(r4)
+
+	ld	r4, VCPU_GPR(r4)(r4)
+
+	hrfid
+	b	.
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+/*
+ * We come here from the first-level interrupt handlers.
+ */
+	.globl	kvmppc_interrupt
+kvmppc_interrupt:
+	/*
+	 * Register contents:
+	 * R12		= interrupt vector
+	 * R13		= PACA
+	 * guest CR, R12 saved in shadow VCPU SCRATCH1/0
+	 * guest R13 saved in SPRN_SCRATCH0
+	 */
+	/* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
+	std	r9, HSTATE_HOST_R2(r13)
+	ld	r9, HSTATE_KVM_VCPU(r13)
+
+	/* Save registers */
+
+	std	r0, VCPU_GPR(r0)(r9)
+	std	r1, VCPU_GPR(r1)(r9)
+	std	r2, VCPU_GPR(r2)(r9)
+	std	r3, VCPU_GPR(r3)(r9)
+	std	r4, VCPU_GPR(r4)(r9)
+	std	r5, VCPU_GPR(r5)(r9)
+	std	r6, VCPU_GPR(r6)(r9)
+	std	r7, VCPU_GPR(r7)(r9)
+	std	r8, VCPU_GPR(r8)(r9)
+	ld	r0, HSTATE_HOST_R2(r13)
+	std	r0, VCPU_GPR(r9)(r9)
+	std	r10, VCPU_GPR(r10)(r9)
+	std	r11, VCPU_GPR(r11)(r9)
+	ld	r3, HSTATE_SCRATCH0(r13)
+	lwz	r4, HSTATE_SCRATCH1(r13)
+	std	r3, VCPU_GPR(r12)(r9)
+	stw	r4, VCPU_CR(r9)
+
+	/* Restore R1/R2 so we can handle faults */
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r2, PACATOC(r13)
+
+	mfspr	r10, SPRN_SRR0
+	mfspr	r11, SPRN_SRR1
+	std	r10, VCPU_SRR0(r9)
+	std	r11, VCPU_SRR1(r9)
+	andi.	r0, r12, 2		/* need to read HSRR0/1? */
+	beq	1f
+	mfspr	r10, SPRN_HSRR0
+	mfspr	r11, SPRN_HSRR1
+	clrrdi	r12, r12, 2
+1:	std	r10, VCPU_PC(r9)
+	std	r11, VCPU_MSR(r9)
+
+	GET_SCRATCH0(r3)
+	mflr	r4
+	std	r3, VCPU_GPR(r13)(r9)
+	std	r4, VCPU_LR(r9)
+
+	/* Unset guest mode */
+	li	r0, KVM_GUEST_MODE_NONE
+	stb	r0, HSTATE_IN_GUEST(r13)
+
+	stw	r12,VCPU_TRAP(r9)
+
+	/* See if this is a leftover HDEC interrupt */
+	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	bne	2f
+	mfspr	r3,SPRN_HDEC
+	cmpwi	r3,0
+	bge	ignore_hdec
+2:
+
+	/* Check for mediated interrupts (could be done earlier really ...) */
+	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
+	bne+	1f
+	ld	r5,VCPU_LPCR(r9)
+	andi.	r0,r11,MSR_EE
+	beq	1f
+	andi.	r0,r5,LPCR_MER
+	bne	bounce_ext_interrupt
+1:
+
+	/* Save DEC */
+	mfspr	r5,SPRN_DEC
+	mftb	r6
+	extsw	r5,r5
+	add	r5,r5,r6
+	std	r5,VCPU_DEC_EXPIRES(r9)
+
+	/* Save HEIR (HV emulation assist reg) in last_inst
+	   if this is an HEI (HV emulation interrupt, e40) */
+	li	r3,-1
+	cmpwi	r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
+	bne	11f
+	mfspr	r3,SPRN_HEIR
+11:	stw	r3,VCPU_LAST_INST(r9)
+
+	/* Save more register state  */
+	mfxer	r5
+	mfdar	r6
+	mfdsisr	r7
+	mfctr	r8
+
+	stw	r5, VCPU_XER(r9)
+	std	r6, VCPU_DAR(r9)
+	stw	r7, VCPU_DSISR(r9)
+	std	r8, VCPU_CTR(r9)
+	/* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */
+	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
+	beq	6f
+7:	std	r6, VCPU_FAULT_DAR(r9)
+	stw	r7, VCPU_FAULT_DSISR(r9)
+
+	/* Save guest CTRL register, set runlatch to 1 */
+	mfspr	r6,SPRN_CTRLF
+	stw	r6,VCPU_CTRL(r9)
+	andi.	r0,r6,1
+	bne	4f
+	ori	r6,r6,1
+	mtspr	SPRN_CTRLT,r6
+4:
+	/* Read the guest SLB and save it away */
+	lwz	r0,VCPU_SLB_NR(r9)	/* number of entries in SLB */
+	mtctr	r0
+	li	r6,0
+	addi	r7,r9,VCPU_SLB
+	li	r5,0
+1:	slbmfee	r8,r6
+	andis.	r0,r8,SLB_ESID_V@h
+	beq	2f
+	add	r8,r8,r6		/* put index in */
+	slbmfev	r3,r6
+	std	r8,VCPU_SLB_E(r7)
+	std	r3,VCPU_SLB_V(r7)
+	addi	r7,r7,VCPU_SLB_SIZE
+	addi	r5,r5,1
+2:	addi	r6,r6,1
+	bdnz	1b
+	stw	r5,VCPU_SLB_MAX(r9)
+
+	/*
+	 * Save the guest PURR/SPURR
+	 */
+	mfspr	r5,SPRN_PURR
+	mfspr	r6,SPRN_SPURR
+	ld	r7,VCPU_PURR(r9)
+	ld	r8,VCPU_SPURR(r9)
+	std	r5,VCPU_PURR(r9)
+	std	r6,VCPU_SPURR(r9)
+	subf	r5,r7,r5
+	subf	r6,r8,r6
+
+	/*
+	 * Restore host PURR/SPURR and add guest times
+	 * so that the time in the guest gets accounted.
+	 */
+	ld	r3,HSTATE_PURR(r13)
+	ld	r4,HSTATE_SPURR(r13)
+	add	r3,r3,r5
+	add	r4,r4,r6
+	mtspr	SPRN_PURR,r3
+	mtspr	SPRN_SPURR,r4
+
+	/* Clear out SLB */
+	li	r5,0
+	slbmte	r5,r5
+	slbia
+	ptesync
+
+hdec_soon:
+	/* Switch back to host partition */
+	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
+	ld	r6,KVM_HOST_SDR1(r4)
+	lwz	r7,KVM_HOST_LPID(r4)
+	li	r8,LPID_RSVD		/* switch to reserved LPID */
+	mtspr	SPRN_LPID,r8
+	ptesync
+	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+	mtspr	SPRN_LPID,r7
+	isync
+	lis	r8,0x7fff		/* MAX_INT@h */
+	mtspr	SPRN_HDEC,r8
+
+	ld	r8,KVM_HOST_LPCR(r4)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+	/* load host SLB entries */
+	ld	r8,PACA_SLBSHADOWPTR(r13)
+
+	.rept	SLB_NUM_BOLTED
+	ld	r5,SLBSHADOW_SAVEAREA(r8)
+	ld	r6,SLBSHADOW_SAVEAREA+8(r8)
+	andis.	r7,r5,SLB_ESID_V@h
+	beq	1f
+	slbmte	r6,r5
+1:	addi	r8,r8,16
+	.endr
+
+	/* Save and reset AMR and UAMOR before turning on the MMU */
+	mfspr	r5,SPRN_AMR
+	mfspr	r6,SPRN_UAMOR
+	std	r5,VCPU_AMR(r9)
+	std	r6,VCPU_UAMOR(r9)
+	li	r6,0
+	mtspr	SPRN_AMR,r6
+
+	/* Restore host DABR and DABRX */
+	ld	r5,HSTATE_DABR(r13)
+	li	r6,7
+	mtspr	SPRN_DABR,r5
+	mtspr	SPRN_DABRX,r6
+
+	/* Switch DSCR back to host value */
+	mfspr	r8, SPRN_DSCR
+	ld	r7, HSTATE_DSCR(r13)
+	std	r8, VCPU_DSCR(r7)
+	mtspr	SPRN_DSCR, r7
+
+	/* Save non-volatile GPRs */
+	std	r14, VCPU_GPR(r14)(r9)
+	std	r15, VCPU_GPR(r15)(r9)
+	std	r16, VCPU_GPR(r16)(r9)
+	std	r17, VCPU_GPR(r17)(r9)
+	std	r18, VCPU_GPR(r18)(r9)
+	std	r19, VCPU_GPR(r19)(r9)
+	std	r20, VCPU_GPR(r20)(r9)
+	std	r21, VCPU_GPR(r21)(r9)
+	std	r22, VCPU_GPR(r22)(r9)
+	std	r23, VCPU_GPR(r23)(r9)
+	std	r24, VCPU_GPR(r24)(r9)
+	std	r25, VCPU_GPR(r25)(r9)
+	std	r26, VCPU_GPR(r26)(r9)
+	std	r27, VCPU_GPR(r27)(r9)
+	std	r28, VCPU_GPR(r28)(r9)
+	std	r29, VCPU_GPR(r29)(r9)
+	std	r30, VCPU_GPR(r30)(r9)
+	std	r31, VCPU_GPR(r31)(r9)
+
+	/* Save SPRGs */
+	mfspr	r3, SPRN_SPRG0
+	mfspr	r4, SPRN_SPRG1
+	mfspr	r5, SPRN_SPRG2
+	mfspr	r6, SPRN_SPRG3
+	std	r3, VCPU_SPRG0(r9)
+	std	r4, VCPU_SPRG1(r9)
+	std	r5, VCPU_SPRG2(r9)
+	std	r6, VCPU_SPRG3(r9)
+
+	/* Save PMU registers */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	isync
+	mfspr	r5, SPRN_MMCR1
+	mfspr	r6, SPRN_MMCRA
+	std	r4, VCPU_MMCR(r9)
+	std	r5, VCPU_MMCR + 8(r9)
+	std	r6, VCPU_MMCR + 16(r9)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r4, SPRN_PMC2
+	mfspr	r5, SPRN_PMC3
+	mfspr	r6, SPRN_PMC4
+	mfspr	r7, SPRN_PMC5
+	mfspr	r8, SPRN_PMC6
+	stw	r3, VCPU_PMC(r9)
+	stw	r4, VCPU_PMC + 4(r9)
+	stw	r5, VCPU_PMC + 8(r9)
+	stw	r6, VCPU_PMC + 12(r9)
+	stw	r7, VCPU_PMC + 16(r9)
+	stw	r8, VCPU_PMC + 20(r9)
+22:
+	/* save FP state */
+	mr	r3, r9
+	bl	.kvmppc_save_fp
+
+	/*
+	 * Reload DEC.  HDEC interrupts were disabled when
+	 * we reloaded the host's LPCR value.
+	 */
+	ld	r3, HSTATE_DECEXP(r13)
+	mftb	r4
+	subf	r4, r4, r3
+	mtspr	SPRN_DEC, r4
+
+	/* Reload the host's PMU registers */
+	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
+	lbz	r4, LPPACA_PMCINUSE(r3)
+	cmpwi	r4, 0
+	beq	23f			/* skip if not */
+	lwz	r3, HSTATE_PMC(r13)
+	lwz	r4, HSTATE_PMC + 4(r13)
+	lwz	r5, HSTATE_PMC + 8(r13)
+	lwz	r6, HSTATE_PMC + 12(r13)
+	lwz	r8, HSTATE_PMC + 16(r13)
+	lwz	r9, HSTATE_PMC + 20(r13)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r4
+	mtspr	SPRN_PMC3, r5
+	mtspr	SPRN_PMC4, r6
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+	ld	r3, HSTATE_MMCR(r13)
+	ld	r4, HSTATE_MMCR + 8(r13)
+	ld	r5, HSTATE_MMCR + 16(r13)
+	mtspr	SPRN_MMCR1, r4
+	mtspr	SPRN_MMCRA, r5
+	mtspr	SPRN_MMCR0, r3
+	isync
+23:
+	/*
+	 * For external and machine check interrupts, we need
+	 * to call the Linux handler to process the interrupt.
+	 * We do that by jumping to the interrupt vector address
+	 * which we have in r12.  The [h]rfid at the end of the
+	 * handler will return to the book3s_hv_interrupts.S code.
+	 * For other interrupts we do the rfid to get back
+	 * to the book3s_interrupts.S code here.
+	 */
+	ld	r8, HSTATE_VMHANDLER(r13)
+	ld	r7, HSTATE_HOST_MSR(r13)
+
+	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+	beq	11f
+	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+
+	/* RFI into the highmem handler, or branch to interrupt handler */
+	mfmsr	r6
+	mtctr	r12
+	li	r0, MSR_RI
+	andc	r6, r6, r0
+	mtmsrd	r6, 1			/* Clear RI in MSR */
+	mtsrr0	r8
+	mtsrr1	r7
+	beqctr
+	RFI
+
+11:	mtspr	SPRN_HSRR0, r8
+	mtspr	SPRN_HSRR1, r7
+	ba	0x500
+
+6:	mfspr	r6,SPRN_HDAR
+	mfspr	r7,SPRN_HDSISR
+	b	7b
+
+ignore_hdec:
+	mr	r4,r9
+	b	fast_guest_return
+
+bounce_ext_interrupt:
+	mr	r4,r9
+	mtspr	SPRN_SRR0,r10
+	mtspr	SPRN_SRR1,r11
+	li	r10,BOOK3S_INTERRUPT_EXTERNAL
+	LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);
+	b	fast_guest_return
+
+/*
+ * Save away FP, VMX and VSX registers.
+ * r3 = vcpu pointer
+*/
+_GLOBAL(kvmppc_save_fp)
+	mfmsr	r9
+	ori	r8,r9,MSR_FP
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VEC@h
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	mtmsrd	r8
+	isync
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	reg = 0
+	.rept	32
+	li	r6,reg*16+VCPU_VSRS
+	stxvd2x	reg,r6,r3
+	reg = reg + 1
+	.endr
+FTR_SECTION_ELSE
+#endif
+	reg = 0
+	.rept	32
+	stfd	reg,reg*8+VCPU_FPRS(r3)
+	reg = reg + 1
+	.endr
+#ifdef CONFIG_VSX
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
+#endif
+	mffs	fr0
+	stfd	fr0,VCPU_FPSCR(r3)
+
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	reg = 0
+	.rept	32
+	li	r6,reg*16+VCPU_VRS
+	stvx	reg,r6,r3
+	reg = reg + 1
+	.endr
+	mfvscr	vr0
+	li	r6,VCPU_VSCR
+	stvx	vr0,r6,r3
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	mfspr	r6,SPRN_VRSAVE
+	stw	r6,VCPU_VRSAVE(r3)
+	mtmsrd	r9
+	isync
+	blr
+
+/*
+ * Load up FP, VMX and VSX registers
+ * r4 = vcpu pointer
+ */
+	.globl	kvmppc_load_fp
+kvmppc_load_fp:
+	mfmsr	r9
+	ori	r8,r9,MSR_FP
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VEC@h
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	mtmsrd	r8
+	isync
+	lfd	fr0,VCPU_FPSCR(r4)
+	MTFSF_L(fr0)
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	reg = 0
+	.rept	32
+	li	r7,reg*16+VCPU_VSRS
+	lxvd2x	reg,r7,r4
+	reg = reg + 1
+	.endr
+FTR_SECTION_ELSE
+#endif
+	reg = 0
+	.rept	32
+	lfd	reg,reg*8+VCPU_FPRS(r4)
+	reg = reg + 1
+	.endr
+#ifdef CONFIG_VSX
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
+#endif
+
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	li	r7,VCPU_VSCR
+	lvx	vr0,r7,r4
+	mtvscr	vr0
+	reg = 0
+	.rept	32
+	li	r7,reg*16+VCPU_VRS
+	lvx	reg,r7,r4
+	reg = reg + 1
+	.endr
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	lwz	r7,VCPU_VRSAVE(r4)
+	mtspr	SPRN_VRSAVE,r7
+	blr
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index 1cc25e8c0cf1..134501691ad0 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -89,29 +89,29 @@ kvmppc_handler_trampoline_enter:
 
 	/* Enter guest */
 
-	PPC_LL	r4, (SVCPU_CTR)(r3)
-	PPC_LL	r5, (SVCPU_LR)(r3)
-	lwz	r6, (SVCPU_CR)(r3)
-	lwz	r7, (SVCPU_XER)(r3)
+	PPC_LL	r4, SVCPU_CTR(r3)
+	PPC_LL	r5, SVCPU_LR(r3)
+	lwz	r6, SVCPU_CR(r3)
+	lwz	r7, SVCPU_XER(r3)
 
 	mtctr	r4
 	mtlr	r5
 	mtcr	r6
 	mtxer	r7
 
-	PPC_LL	r0, (SVCPU_R0)(r3)
-	PPC_LL	r1, (SVCPU_R1)(r3)
-	PPC_LL	r2, (SVCPU_R2)(r3)
-	PPC_LL	r4, (SVCPU_R4)(r3)
-	PPC_LL	r5, (SVCPU_R5)(r3)
-	PPC_LL	r6, (SVCPU_R6)(r3)
-	PPC_LL	r7, (SVCPU_R7)(r3)
-	PPC_LL	r8, (SVCPU_R8)(r3)
-	PPC_LL	r9, (SVCPU_R9)(r3)
-	PPC_LL	r10, (SVCPU_R10)(r3)
-	PPC_LL	r11, (SVCPU_R11)(r3)
-	PPC_LL	r12, (SVCPU_R12)(r3)
-	PPC_LL	r13, (SVCPU_R13)(r3)
+	PPC_LL	r0, SVCPU_R0(r3)
+	PPC_LL	r1, SVCPU_R1(r3)
+	PPC_LL	r2, SVCPU_R2(r3)
+	PPC_LL	r4, SVCPU_R4(r3)
+	PPC_LL	r5, SVCPU_R5(r3)
+	PPC_LL	r6, SVCPU_R6(r3)
+	PPC_LL	r7, SVCPU_R7(r3)
+	PPC_LL	r8, SVCPU_R8(r3)
+	PPC_LL	r9, SVCPU_R9(r3)
+	PPC_LL	r10, SVCPU_R10(r3)
+	PPC_LL	r11, SVCPU_R11(r3)
+	PPC_LL	r12, SVCPU_R12(r3)
+	PPC_LL	r13, SVCPU_R13(r3)
 
 	PPC_LL	r3, (SVCPU_R3)(r3)
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 026036efcde0..3a4f379ee70f 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -38,8 +38,12 @@
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
+#ifndef CONFIG_KVM_BOOK3S_64_HV
 	return !(v->arch.shared->msr & MSR_WE) ||
 	       !!(v->arch.pending_exceptions);
+#else
+	return 1;
+#endif
 }
 
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
@@ -184,10 +188,13 @@ int kvm_dev_ioctl_check_extension(long ext)
 #else
 	case KVM_CAP_PPC_SEGSTATE:
 #endif
-	case KVM_CAP_PPC_PAIRED_SINGLES:
 	case KVM_CAP_PPC_UNSET_IRQ:
 	case KVM_CAP_PPC_IRQ_LEVEL:
 	case KVM_CAP_ENABLE_CAP:
+		r = 1;
+		break;
+#ifndef CONFIG_KVM_BOOK3S_64_HV
+	case KVM_CAP_PPC_PAIRED_SINGLES:
 	case KVM_CAP_PPC_OSI:
 	case KVM_CAP_PPC_GET_PVINFO:
 		r = 1;
@@ -195,6 +202,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
+#endif
 	default:
 		r = 0;
 		break;
@@ -291,6 +299,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
 	tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
 	vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
+	vcpu->arch.dec_expires = ~(u64)0;
 
 #ifdef CONFIG_KVM_EXIT_TIMING
 	mutex_init(&vcpu->arch.exit_timing_lock);
@@ -317,6 +326,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
 #endif
 	kvmppc_core_vcpu_load(vcpu, cpu);
+	vcpu->cpu = smp_processor_id();
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -325,6 +335,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_BOOKE
 	vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
 #endif
+	vcpu->cpu = -1;
 }
 
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
@@ -496,6 +507,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		for (i = 0; i < 32; i++)
 			kvmppc_set_gpr(vcpu, i, gprs[i]);
 		vcpu->arch.osi_needed = 0;
+	} else if (vcpu->arch.hcall_needed) {
+		int i;
+
+		kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret);
+		for (i = 0; i < 9; ++i)
+			kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]);
+		vcpu->arch.hcall_needed = 0;
 	}
 
 	kvmppc_core_deliver_interrupts(vcpu);
@@ -518,6 +536,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 	if (waitqueue_active(&vcpu->wq)) {
 		wake_up_interruptible(&vcpu->wq);
 		vcpu->stat.halt_wakeup++;
+	} else if (vcpu->cpu != -1) {
+		smp_send_reschedule(vcpu->cpu);
 	}
 
 	return 0;
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index d62a14b2cd0f..b135d3d397db 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -103,7 +103,7 @@ TRACE_EVENT(kvm_gtlb_write,
  *                         Book3S trace points                           *
  *************************************************************************/
 
-#ifdef CONFIG_PPC_BOOK3S
+#ifdef CONFIG_KVM_BOOK3S_PR
 
 TRACE_EVENT(kvm_book3s_exit,
 	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 9c9ca7c7b491..a156294fc22a 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI              16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI              18
+#define KVM_EXIT_PAPR_HCALL	  19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -264,6 +265,11 @@ struct kvm_run {
 		struct {
 			__u64 gprs[32];
 		} osi;
+		struct {
+			__u64 nr;
+			__u64 ret;
+			__u64 args[9];
+		} papr_hcall;
 		/* Fix the size of the union. */
 		char padding[256];
 	};

From a8606e20e41a8149456bafdf76ad29d47672027c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:22:05 +0000
Subject: [PATCH 109/143] KVM: PPC: Handle some PAPR hcalls in the kernel

This adds the infrastructure for handling PAPR hcalls in the kernel,
either early in the guest exit path while we are still in real mode,
or later once the MMU has been turned back on and we are in the full
kernel context.  The advantage of handling hcalls in real mode if
possible is that we avoid two partition switches -- and this will
become more important when we support SMT4 guests, since a partition
switch means we have to pull all of the threads in the core out of
the guest.  The disadvantage is that we can only access the kernel
linear mapping, not anything vmalloced or ioremapped, since the MMU
is off.

This also adds code to handle the following hcalls in real mode:

H_ENTER       Add an HPTE to the hashed page table
H_REMOVE      Remove an HPTE from the hashed page table
H_READ        Read HPTEs from the hashed page table
H_PROTECT     Change the protection bits in an HPTE
H_BULK_REMOVE Remove up to 4 HPTEs from the hashed page table
H_SET_DABR    Set the data address breakpoint register

Plus code to handle the following hcalls in the kernel:

H_CEDE        Idle the vcpu until an interrupt or H_PROD hcall arrives
H_PROD        Wake up a ceded vcpu
H_REGISTER_VPA Register a virtual processor area (VPA)

The code that runs in real mode has to be in the base kernel, not in
the module, if KVM is compiled as a module.  The real-mode code can
only access the kernel linear mapping, not vmalloc or ioremap space.
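
To make the two-stage dispatch easier to follow, here is a minimal,
self-contained C model of the flow described above.  This is not the
kernel code: the hcall identifiers and RESUME_* values are placeholder
constants for illustration (only the H_TOO_HARD convention is taken
from this patch), and the real handlers naturally do far more work.

#include <stdio.h>

#define H_SUCCESS	0
#define H_TOO_HARD	9999	/* internal value: punt to the next stage */
#define RESUME_GUEST	1	/* placeholder values, not the kernel's */
#define RESUME_HOST	2

/* Placeholder hcall ids for the model; NOT the PAPR opcode numbers. */
enum hcall { HC_ENTER, HC_REMOVE, HC_CEDE, HC_PROD, HC_OTHER };

/* Stage 1: real-mode handler -- simple work on the kernel linear map only. */
static long realmode_hcall(enum hcall nr)
{
	switch (nr) {
	case HC_ENTER:
	case HC_REMOVE:
		return H_SUCCESS;	/* handled without a partition switch */
	default:
		return H_TOO_HARD;	/* too hard for real mode, go up a level */
	}
}

/* Stage 2: full-kernel handler, reached after switching back to the host. */
static int kernel_hcall(enum hcall nr)
{
	switch (nr) {
	case HC_CEDE:
	case HC_PROD:
		return RESUME_GUEST;	/* handled in the kernel, re-enter guest */
	default:
		return RESUME_HOST;	/* exit to userspace (KVM_EXIT_PAPR_HCALL) */
	}
}

int main(void)
{
	enum hcall nr = HC_OTHER;

	if (realmode_hcall(nr) != H_TOO_HARD)
		printf("handled in real mode\n");
	else if (kernel_hcall(nr) == RESUME_GUEST)
		printf("handled in the kernel\n");
	else
		printf("punted to userspace\n");
	return 0;
}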

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/hvcall.h       |   5 +
 arch/powerpc/include/asm/kvm_host.h     |  11 +
 arch/powerpc/include/asm/kvm_ppc.h      |   1 +
 arch/powerpc/kernel/asm-offsets.c       |   2 +
 arch/powerpc/kvm/Makefile               |   8 +-
 arch/powerpc/kvm/book3s_hv.c            | 170 ++++++++++-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c     | 368 ++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 158 +++++++++-
 arch/powerpc/kvm/powerpc.c              |   2 +-
 9 files changed, 718 insertions(+), 7 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_hv_rm_mmu.c

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index fd8201dddd4b..1c324ff55ea8 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -29,6 +29,10 @@
 #define H_LONG_BUSY_ORDER_100_SEC	9905  /* Long busy, hint that 100sec \
 						 is a good time to retry */
 #define H_LONG_BUSY_END_RANGE		9905  /* End of long busy range */
+
+/* Internal value used in book3s_hv kvm support; not returned to guests */
+#define H_TOO_HARD	9999
+
 #define H_HARDWARE	-1	/* Hardware error */
 #define H_FUNCTION	-2	/* Function not supported */
 #define H_PRIVILEGE	-3	/* Caller not privileged */
@@ -100,6 +104,7 @@
 #define H_PAGE_SET_ACTIVE	H_PAGE_STATE_CHANGE
 #define H_AVPN			(1UL<<(63-32))	/* An avpn is provided as a sanity test */
 #define H_ANDCOND		(1UL<<(63-33))
+#define H_LOCAL			(1UL<<(63-35))
 #define H_ICACHE_INVALIDATE	(1UL<<(63-40))	/* icbi, etc.  (ignored for IO pages) */
 #define H_ICACHE_SYNCHRONIZE	(1UL<<(63-41))	/* dcbst, icbi, etc (ignored for IO pages */
 #define H_COALESCE_CAND	(1UL<<(63-42))	/* page is a good candidate for coalescing */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 4a3f790d5fc4..6ebf1721680a 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -59,6 +59,10 @@ struct kvm;
 struct kvm_run;
 struct kvm_vcpu;
 
+struct lppaca;
+struct slb_shadow;
+struct dtl;
+
 struct kvm_vm_stat {
 	u32 remote_tlb_flush;
 };
@@ -344,7 +348,14 @@ struct kvm_vcpu_arch {
 	u64 dec_expires;
 	unsigned long pending_exceptions;
 	u16 last_cpu;
+	u8 ceded;
+	u8 prodded;
 	u32 last_inst;
+
+	struct lppaca *vpa;
+	struct slb_shadow *slb_shadow;
+	struct dtl *dtl;
+	struct dtl *dtl_end;
 	int trap;
 	struct kvm_vcpu_arch_shared *shared;
 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0dafd53c30ed..2afe92e6f625 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -118,6 +118,7 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
 extern void kvmppc_map_vrma(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem);
+extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 936267462cae..c70d106bf1a4 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -189,6 +189,7 @@ int main(void)
 	DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
 	DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
 	DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
+	DEFINE(LPPACA_YIELDCOUNT, offsetof(struct lppaca, yield_count));
 	DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
 #endif /* CONFIG_PPC_STD_MMU_64 */
 	DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp));
@@ -459,6 +460,7 @@ int main(void)
 	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
 	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
 	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
+	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
 	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
 	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 8a435a6da665..2ecffc0dc1bb 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -54,14 +54,17 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv.o \
 	book3s_hv_interrupts.o \
 	book3s_64_mmu_hv.o
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
+	book3s_hv_rm_mmu.o
 
-kvm-book3s_64-objs := \
+kvm-book3s_64-module-objs := \
 	../../../virt/kvm/kvm_main.o \
 	powerpc.o \
 	emulate.o \
 	book3s.o \
 	$(kvm-book3s_64-objs-y)
-kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs)
+
+kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
 
 kvm-book3s_32-objs := \
 	$(common-objs-y) \
@@ -83,3 +86,4 @@ obj-$(CONFIG_KVM_E500) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o
 
+obj-y += $(kvm-book3s_64-builtin-objs-y)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 60b7300568c8..af862c30b70e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -124,6 +124,158 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 	       vcpu->arch.last_inst);
 }
 
+struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
+{
+	int r;
+	struct kvm_vcpu *v, *ret = NULL;
+
+	mutex_lock(&kvm->lock);
+	kvm_for_each_vcpu(r, v, kvm) {
+		if (v->vcpu_id == id) {
+			ret = v;
+			break;
+		}
+	}
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
+static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
+{
+	vpa->shared_proc = 1;
+	vpa->yield_count = 1;
+}
+
+static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
+				       unsigned long flags,
+				       unsigned long vcpuid, unsigned long vpa)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long pg_index, ra, len;
+	unsigned long pg_offset;
+	void *va;
+	struct kvm_vcpu *tvcpu;
+
+	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
+	if (!tvcpu)
+		return H_PARAMETER;
+
+	flags >>= 63 - 18;
+	flags &= 7;
+	if (flags == 0 || flags == 4)
+		return H_PARAMETER;
+	if (flags < 4) {
+		if (vpa & 0x7f)
+			return H_PARAMETER;
+		/* registering new area; convert logical addr to real */
+		pg_index = vpa >> kvm->arch.ram_porder;
+		pg_offset = vpa & (kvm->arch.ram_psize - 1);
+		if (pg_index >= kvm->arch.ram_npages)
+			return H_PARAMETER;
+		if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
+			return H_PARAMETER;
+		ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
+		ra |= pg_offset;
+		va = __va(ra);
+		if (flags <= 1)
+			len = *(unsigned short *)(va + 4);
+		else
+			len = *(unsigned int *)(va + 4);
+		if (pg_offset + len > kvm->arch.ram_psize)
+			return H_PARAMETER;
+		switch (flags) {
+		case 1:		/* register VPA */
+			if (len < 640)
+				return H_PARAMETER;
+			tvcpu->arch.vpa = va;
+			init_vpa(vcpu, va);
+			break;
+		case 2:		/* register DTL */
+			if (len < 48)
+				return H_PARAMETER;
+			if (!tvcpu->arch.vpa)
+				return H_RESOURCE;
+			len -= len % 48;
+			tvcpu->arch.dtl = va;
+			tvcpu->arch.dtl_end = va + len;
+			break;
+		case 3:		/* register SLB shadow buffer */
+			if (len < 8)
+				return H_PARAMETER;
+			if (!tvcpu->arch.vpa)
+				return H_RESOURCE;
+			tvcpu->arch.slb_shadow = va;
+			len = (len - 16) / 16;
+			tvcpu->arch.slb_shadow = va;
+			break;
+		}
+	} else {
+		switch (flags) {
+		case 5:		/* unregister VPA */
+			if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
+				return H_RESOURCE;
+			tvcpu->arch.vpa = NULL;
+			break;
+		case 6:		/* unregister DTL */
+			tvcpu->arch.dtl = NULL;
+			break;
+		case 7:		/* unregister SLB shadow buffer */
+			tvcpu->arch.slb_shadow = NULL;
+			break;
+		}
+	}
+	return H_SUCCESS;
+}
+
+int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
+{
+	unsigned long req = kvmppc_get_gpr(vcpu, 3);
+	unsigned long target, ret = H_SUCCESS;
+	struct kvm_vcpu *tvcpu;
+
+	switch (req) {
+	case H_CEDE:
+		vcpu->arch.shregs.msr |= MSR_EE;
+		vcpu->arch.ceded = 1;
+		smp_mb();
+		if (!vcpu->arch.prodded)
+			kvmppc_vcpu_block(vcpu);
+		else
+			vcpu->arch.prodded = 0;
+		smp_mb();
+		vcpu->arch.ceded = 0;
+		break;
+	case H_PROD:
+		target = kvmppc_get_gpr(vcpu, 4);
+		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+		if (!tvcpu) {
+			ret = H_PARAMETER;
+			break;
+		}
+		tvcpu->arch.prodded = 1;
+		smp_mb();
+		if (vcpu->arch.ceded) {
+			if (waitqueue_active(&vcpu->wq)) {
+				wake_up_interruptible(&vcpu->wq);
+				vcpu->stat.halt_wakeup++;
+			}
+		}
+		break;
+	case H_CONFER:
+		break;
+	case H_REGISTER_VPA:
+		ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
+					kvmppc_get_gpr(vcpu, 5),
+					kvmppc_get_gpr(vcpu, 6));
+		break;
+	default:
+		return RESUME_HOST;
+	}
+	kvmppc_set_gpr(vcpu, 3, ret);
+	vcpu->arch.hcall_needed = 0;
+	return RESUME_GUEST;
+}
+
 static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			      struct task_struct *tsk)
 {
@@ -318,7 +470,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 
 extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 
-int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
 	u64 now;
 
@@ -370,6 +522,22 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	return -EBUSY;
 }
 
+int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	do {
+		r = kvmppc_run_vcpu(run, vcpu);
+
+		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
+		    !(vcpu->arch.shregs.msr & MSR_PR)) {
+			r = kvmppc_pseries_do_hcall(vcpu);
+			kvmppc_core_deliver_interrupts(vcpu);
+		}
+	} while (r == RESUME_GUEST);
+	return r;
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
new file mode 100644
index 000000000000..edb0aae901a3
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -0,0 +1,368 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/hugetlb.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+
+/* For now use fixed-size 16MB page table */
+#define HPT_ORDER	24
+#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
+#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+
+#define HPTE_V_HVLOCK	0x40UL
+
+static inline long lock_hpte(unsigned long *hpte, unsigned long bits)
+{
+	unsigned long tmp, old;
+
+	asm volatile("	ldarx	%0,0,%2\n"
+		     "	and.	%1,%0,%3\n"
+		     "	bne	2f\n"
+		     "	ori	%0,%0,%4\n"
+		     "  stdcx.	%0,0,%2\n"
+		     "	beq+	2f\n"
+		     "	li	%1,%3\n"
+		     "2:	isync"
+		     : "=&r" (tmp), "=&r" (old)
+		     : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
+		     : "cc", "memory");
+	return old == 0;
+}
+
+long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+		    long pte_index, unsigned long pteh, unsigned long ptel)
+{
+	unsigned long porder;
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long i, lpn, pa;
+	unsigned long *hpte;
+
+	/* only handle 4k, 64k and 16M pages for now */
+	porder = 12;
+	if (pteh & HPTE_V_LARGE) {
+		if ((ptel & 0xf000) == 0x1000) {
+			/* 64k page */
+			porder = 16;
+		} else if ((ptel & 0xff000) == 0) {
+			/* 16M page */
+			porder = 24;
+			/* lowest AVA bit must be 0 for 16M pages */
+			if (pteh & 0x80)
+				return H_PARAMETER;
+		} else
+			return H_PARAMETER;
+	}
+	lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
+	if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder)
+		return H_PARAMETER;
+	pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
+	if (!pa)
+		return H_PARAMETER;
+	/* Check WIMG */
+	if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
+	    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+		return H_PARAMETER;
+	pteh &= ~0x60UL;
+	ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
+	ptel |= pa;
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	if (likely((flags & H_EXACT) == 0)) {
+		pte_index &= ~7UL;
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		for (i = 0; ; ++i) {
+			if (i == 8)
+				return H_PTEG_FULL;
+			if ((*hpte & HPTE_V_VALID) == 0 &&
+			    lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+				break;
+			hpte += 2;
+		}
+	} else {
+		i = 0;
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+			return H_PTEG_FULL;
+	}
+	hpte[1] = ptel;
+	eieio();
+	hpte[0] = pteh;
+	asm volatile("ptesync" : : : "memory");
+	atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
+	vcpu->arch.gpr[4] = pte_index + i;
+	return H_SUCCESS;
+}
+
+static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
+				      unsigned long pte_index)
+{
+	unsigned long rb, va_low;
+
+	rb = (v & ~0x7fUL) << 16;		/* AVA field */
+	va_low = pte_index >> 3;
+	if (v & HPTE_V_SECONDARY)
+		va_low = ~va_low;
+	/* xor vsid from AVA */
+	if (!(v & HPTE_V_1TB_SEG))
+		va_low ^= v >> 12;
+	else
+		va_low ^= v >> 24;
+	va_low &= 0x7ff;
+	if (v & HPTE_V_LARGE) {
+		rb |= 1;			/* L field */
+		if (r & 0xff000) {
+			/* non-16MB large page, must be 64k */
+			/* (masks depend on page size) */
+			rb |= 0x1000;		/* page encoding in LP field */
+			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
+			rb |= (va_low & 0xfe);	/* AVAL field (P7 doesn't seem to care) */
+		}
+	} else {
+		/* 4kB page */
+		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
+	}
+	rb |= (v >> 54) & 0x300;		/* B field */
+	return rb;
+}
+
+#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
+
+static inline int try_lock_tlbie(unsigned int *lock)
+{
+	unsigned int tmp, old;
+	unsigned int token = LOCK_TOKEN;
+
+	asm volatile("1:lwarx	%1,0,%2\n"
+		     "	cmpwi	cr0,%1,0\n"
+		     "	bne	2f\n"
+		     "  stwcx.	%3,0,%2\n"
+		     "	bne-	1b\n"
+		     "  isync\n"
+		     "2:"
+		     : "=&r" (tmp), "=&r" (old)
+		     : "r" (lock), "r" (token)
+		     : "cc", "memory");
+	return old == 0;
+}
+
+long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+		     unsigned long pte_index, unsigned long avpn,
+		     unsigned long va)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *hpte;
+	unsigned long v, r, rb;
+
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	if ((hpte[0] & HPTE_V_VALID) == 0 ||
+	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
+	    ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
+		hpte[0] &= ~HPTE_V_HVLOCK;
+		return H_NOT_FOUND;
+	}
+	if (atomic_read(&kvm->online_vcpus) == 1)
+		flags |= H_LOCAL;
+	vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK;
+	vcpu->arch.gpr[5] = r = hpte[1];
+	rb = compute_tlbie_rb(v, r, pte_index);
+	hpte[0] = 0;
+	if (!(flags & H_LOCAL)) {
+		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		asm volatile("ptesync" : : : "memory");
+		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+			     : : "r" (rb), "r" (kvm->arch.lpid));
+		asm volatile("ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		asm volatile("ptesync" : : : "memory");
+		asm volatile("tlbiel %0" : : "r" (rb));
+		asm volatile("ptesync" : : : "memory");
+	}
+	return H_SUCCESS;
+}
+
+long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *args = &vcpu->arch.gpr[4];
+	unsigned long *hp, tlbrb[4];
+	long int i, found;
+	long int n_inval = 0;
+	unsigned long flags, req, pte_index;
+	long int local = 0;
+	long int ret = H_SUCCESS;
+
+	if (atomic_read(&kvm->online_vcpus) == 1)
+		local = 1;
+	for (i = 0; i < 4; ++i) {
+		pte_index = args[i * 2];
+		flags = pte_index >> 56;
+		pte_index &= ((1ul << 56) - 1);
+		req = flags >> 6;
+		flags &= 3;
+		if (req == 3)
+			break;
+		if (req != 1 || flags == 3 ||
+		    pte_index >= (HPT_NPTEG << 3)) {
+			/* parameter error */
+			args[i * 2] = ((0xa0 | flags) << 56) + pte_index;
+			ret = H_PARAMETER;
+			break;
+		}
+		hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		while (!lock_hpte(hp, HPTE_V_HVLOCK))
+			cpu_relax();
+		found = 0;
+		if (hp[0] & HPTE_V_VALID) {
+			switch (flags & 3) {
+			case 0:		/* absolute */
+				found = 1;
+				break;
+			case 1:		/* andcond */
+				if (!(hp[0] & args[i * 2 + 1]))
+					found = 1;
+				break;
+			case 2:		/* AVPN */
+				if ((hp[0] & ~0x7fUL) == args[i * 2 + 1])
+					found = 1;
+				break;
+			}
+		}
+		if (!found) {
+			hp[0] &= ~HPTE_V_HVLOCK;
+			args[i * 2] = ((0x90 | flags) << 56) + pte_index;
+			continue;
+		}
+		/* insert R and C bits from PTE */
+		flags |= (hp[1] >> 5) & 0x0c;
+		args[i * 2] = ((0x80 | flags) << 56) + pte_index;
+		tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index);
+		hp[0] = 0;
+	}
+	if (n_inval == 0)
+		return ret;
+
+	if (!local) {
+		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < n_inval; ++i)
+			asm volatile(PPC_TLBIE(%1,%0)
+				     : : "r" (tlbrb[i]), "r" (kvm->arch.lpid));
+		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < n_inval; ++i)
+			asm volatile("tlbiel %0" : : "r" (tlbrb[i]));
+		asm volatile("ptesync" : : : "memory");
+	}
+	return ret;
+}
+
+long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
+		      unsigned long pte_index, unsigned long avpn,
+		      unsigned long va)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *hpte;
+	unsigned long v, r, rb;
+
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	if ((hpte[0] & HPTE_V_VALID) == 0 ||
+	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
+		hpte[0] &= ~HPTE_V_HVLOCK;
+		return H_NOT_FOUND;
+	}
+	if (atomic_read(&kvm->online_vcpus) == 1)
+		flags |= H_LOCAL;
+	v = hpte[0];
+	r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
+			HPTE_R_KEY_HI | HPTE_R_KEY_LO);
+	r |= (flags << 55) & HPTE_R_PP0;
+	r |= (flags << 48) & HPTE_R_KEY_HI;
+	r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+	rb = compute_tlbie_rb(v, r, pte_index);
+	hpte[0] = v & ~HPTE_V_VALID;
+	if (!(flags & H_LOCAL)) {
+		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		asm volatile("ptesync" : : : "memory");
+		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+			     : : "r" (rb), "r" (kvm->arch.lpid));
+		asm volatile("ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		asm volatile("ptesync" : : : "memory");
+		asm volatile("tlbiel %0" : : "r" (rb));
+		asm volatile("ptesync" : : : "memory");
+	}
+	hpte[1] = r;
+	eieio();
+	hpte[0] = v & ~HPTE_V_HVLOCK;
+	asm volatile("ptesync" : : : "memory");
+	return H_SUCCESS;
+}
+
+static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr)
+{
+	long int i;
+	unsigned long offset, rpn;
+
+	offset = realaddr & (kvm->arch.ram_psize - 1);
+	rpn = (realaddr - offset) >> PAGE_SHIFT;
+	for (i = 0; i < kvm->arch.ram_npages; ++i)
+		if (rpn == kvm->arch.ram_pginfo[i].pfn)
+			return (i << PAGE_SHIFT) + offset;
+	return HPTE_R_RPN;	/* all 1s in the RPN field */
+}
+
+long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
+		   unsigned long pte_index)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *hpte, r;
+	int i, n = 1;
+
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	if (flags & H_READ_4) {
+		pte_index &= ~3;
+		n = 4;
+	}
+	for (i = 0; i < n; ++i, ++pte_index) {
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		r = hpte[1];
+		if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID))
+			r = reverse_xlate(kvm, r & HPTE_R_RPN) |
+				(r & ~HPTE_R_RPN);
+		vcpu->arch.gpr[4 + i * 2] = hpte[0];
+		vcpu->arch.gpr[5 + i * 2] = r;
+	}
+	return H_SUCCESS;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9af264840b98..319ff63b1f31 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -166,6 +166,14 @@ kvmppc_hv_entry:
 	/* Save R1 in the PACA */
 	std	r1, HSTATE_HOST_R1(r13)
 
+	/* Increment yield count if they have a VPA */
+	ld	r3, VCPU_VPA(r4)
+	cmpdi	r3, 0
+	beq	25f
+	lwz	r5, LPPACA_YIELDCOUNT(r3)
+	addi	r5, r5, 1
+	stw	r5, LPPACA_YIELDCOUNT(r3)
+25:
 	/* Load up DAR and DSISR */
 	ld	r5, VCPU_DAR(r4)
 	lwz	r6, VCPU_DSISR(r4)
@@ -401,6 +409,10 @@ kvmppc_interrupt:
 	cmpwi	r3,0
 	bge	ignore_hdec
 2:
+	/* See if this is something we can handle in real mode */
+	cmpwi	r12,BOOK3S_INTERRUPT_SYSCALL
+	beq	hcall_try_real_mode
+hcall_real_cont:
 
 	/* Check for mediated interrupts (could be done earlier really ...) */
 	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
@@ -579,13 +591,28 @@ hdec_soon:
 	std	r5, VCPU_SPRG2(r9)
 	std	r6, VCPU_SPRG3(r9)
 
-	/* Save PMU registers */
+	/* Increment yield count if they have a VPA */
+	ld	r8, VCPU_VPA(r9)	/* do they have a VPA? */
+	cmpdi	r8, 0
+	beq	25f
+	lwz	r3, LPPACA_YIELDCOUNT(r8)
+	addi	r3, r3, 1
+	stw	r3, LPPACA_YIELDCOUNT(r8)
+25:
+	/* Save PMU registers if requested */
+	/* r8 and cr0.eq are live here */
 	li	r3, 1
 	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
 	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
 	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
 	isync
-	mfspr	r5, SPRN_MMCR1
+	beq	21f			/* if no VPA, save PMU stuff anyway */
+	lbz	r7, LPPACA_PMCINUSE(r8)
+	cmpwi	r7, 0			/* did they ask for PMU stuff to be saved? */
+	bne	21f
+	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */
+	b	22f
+21:	mfspr	r5, SPRN_MMCR1
 	mfspr	r6, SPRN_MMCRA
 	std	r4, VCPU_MMCR(r9)
 	std	r5, VCPU_MMCR + 8(r9)
@@ -676,6 +703,125 @@ hdec_soon:
 	mfspr	r7,SPRN_HDSISR
 	b	7b
 
+/*
+ * Try to handle an hcall in real mode.
+ * Returns to the guest if we handle it, or continues on up to
+ * the kernel if we can't (i.e. if we don't have a handler for
+ * it, or if the handler returns H_TOO_HARD).
+ */
+	.globl	hcall_try_real_mode
+hcall_try_real_mode:
+	ld	r3,VCPU_GPR(r3)(r9)
+	andi.	r0,r11,MSR_PR
+	bne	hcall_real_cont
+	clrrdi	r3,r3,2
+	cmpldi	r3,hcall_real_table_end - hcall_real_table
+	bge	hcall_real_cont
+	LOAD_REG_ADDR(r4, hcall_real_table)
+	lwzx	r3,r3,r4
+	cmpwi	r3,0
+	beq	hcall_real_cont
+	add	r3,r3,r4
+	mtctr	r3
+	mr	r3,r9		/* get vcpu pointer */
+	ld	r4,VCPU_GPR(r4)(r9)
+	bctrl
+	cmpdi	r3,H_TOO_HARD
+	beq	hcall_real_fallback
+	ld	r4,HSTATE_KVM_VCPU(r13)
+	std	r3,VCPU_GPR(r3)(r4)
+	ld	r10,VCPU_PC(r4)
+	ld	r11,VCPU_MSR(r4)
+	b	fast_guest_return
+
+	/* We've attempted a real mode hcall, but it's punted it back
+	 * to userspace.  We need to restore some clobbered volatiles
+	 * before resuming the pass-it-to-qemu path */
+hcall_real_fallback:
+	li	r12,BOOK3S_INTERRUPT_SYSCALL
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	ld	r11, VCPU_MSR(r9)
+
+	b	hcall_real_cont
+
+	.globl	hcall_real_table
+hcall_real_table:
+	.long	0		/* 0 - unused */
+	.long	.kvmppc_h_remove - hcall_real_table
+	.long	.kvmppc_h_enter - hcall_real_table
+	.long	.kvmppc_h_read - hcall_real_table
+	.long	0		/* 0x10 - H_CLEAR_MOD */
+	.long	0		/* 0x14 - H_CLEAR_REF */
+	.long	.kvmppc_h_protect - hcall_real_table
+	.long	0		/* 0x1c - H_GET_TCE */
+	.long	0		/* 0x20 - H_SET_TCE */
+	.long	0		/* 0x24 - H_SET_SPRG0 */
+	.long	.kvmppc_h_set_dabr - hcall_real_table
+	.long	0		/* 0x2c */
+	.long	0		/* 0x30 */
+	.long	0		/* 0x34 */
+	.long	0		/* 0x38 */
+	.long	0		/* 0x3c */
+	.long	0		/* 0x40 */
+	.long	0		/* 0x44 */
+	.long	0		/* 0x48 */
+	.long	0		/* 0x4c */
+	.long	0		/* 0x50 */
+	.long	0		/* 0x54 */
+	.long	0		/* 0x58 */
+	.long	0		/* 0x5c */
+	.long	0		/* 0x60 */
+	.long	0		/* 0x64 */
+	.long	0		/* 0x68 */
+	.long	0		/* 0x6c */
+	.long	0		/* 0x70 */
+	.long	0		/* 0x74 */
+	.long	0		/* 0x78 */
+	.long	0		/* 0x7c */
+	.long	0		/* 0x80 */
+	.long	0		/* 0x84 */
+	.long	0		/* 0x88 */
+	.long	0		/* 0x8c */
+	.long	0		/* 0x90 */
+	.long	0		/* 0x94 */
+	.long	0		/* 0x98 */
+	.long	0		/* 0x9c */
+	.long	0		/* 0xa0 */
+	.long	0		/* 0xa4 */
+	.long	0		/* 0xa8 */
+	.long	0		/* 0xac */
+	.long	0		/* 0xb0 */
+	.long	0		/* 0xb4 */
+	.long	0		/* 0xb8 */
+	.long	0		/* 0xbc */
+	.long	0		/* 0xc0 */
+	.long	0		/* 0xc4 */
+	.long	0		/* 0xc8 */
+	.long	0		/* 0xcc */
+	.long	0		/* 0xd0 */
+	.long	0		/* 0xd4 */
+	.long	0		/* 0xd8 */
+	.long	0		/* 0xdc */
+	.long	0		/* 0xe0 */
+	.long	0		/* 0xe4 */
+	.long	0		/* 0xe8 */
+	.long	0		/* 0xec */
+	.long	0		/* 0xf0 */
+	.long	0		/* 0xf4 */
+	.long	0		/* 0xf8 */
+	.long	0		/* 0xfc */
+	.long	0		/* 0x100 */
+	.long	0		/* 0x104 */
+	.long	0		/* 0x108 */
+	.long	0		/* 0x10c */
+	.long	0		/* 0x110 */
+	.long	0		/* 0x114 */
+	.long	0		/* 0x118 */
+	.long	0		/* 0x11c */
+	.long	0		/* 0x120 */
+	.long	.kvmppc_h_bulk_remove - hcall_real_table
+hcall_real_table_end:
+
 ignore_hdec:
 	mr	r4,r9
 	b	fast_guest_return
@@ -688,10 +834,16 @@ bounce_ext_interrupt:
 	LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);
 	b	fast_guest_return
 
+_GLOBAL(kvmppc_h_set_dabr)
+	std	r4,VCPU_DABR(r3)
+	mtspr	SPRN_DABR,r4
+	li	r3,0
+	blr
+
 /*
  * Save away FP, VMX and VSX registers.
  * r3 = vcpu pointer
-*/
+ */
 _GLOBAL(kvmppc_save_fp)
 	mfmsr	r9
 	ori	r8,r9,MSR_FP
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 3a4f379ee70f..6fc9ee499b61 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -42,7 +42,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 	return !(v->arch.shared->msr & MSR_WE) ||
 	       !!(v->arch.pending_exceptions);
 #else
-	return 1;
+	return !(v->arch.ceded) || !!(v->arch.pending_exceptions);
 #endif
 }
 

From 54738c097163c3f01e67ccc85462b78d4d4f495f Mon Sep 17 00:00:00 2001
From: David Gibson <dwg@au1.ibm.com>
Date: Wed, 29 Jun 2011 00:22:41 +0000
Subject: [PATCH 110/143] KVM: PPC: Accelerate H_PUT_TCE by implementing it in
 real mode

This improves I/O performance for guests using the PAPR
paravirtualization interface by implementing the H_PUT_TCE hcall in
real mode, which makes it faster.  H_PUT_TCE is used for updating
virtual IOMMU tables, and is used both for virtual I/O and for real
I/O in the PAPR interface.

Since this moves the IOMMU tables into the kernel, we define a new
KVM_CREATE_SPAPR_TCE ioctl to allow qemu to create the tables.  The
ioctl returns a file descriptor which can be used to mmap the newly
created table.  The qemu driver models use them in the same way as
userspace-managed tables, but they can be updated directly by the
guest with a real-mode H_PUT_TCE implementation, reducing the number
of host/guest context switches during guest I/O.

There are certain circumstances where it is useful for userland qemu
to write to the TCE table even if the kernel H_PUT_TCE path is used
most of the time.  Specifically, allowing this will avoid awkwardness
when we need to reset the table.  More importantly, we will in the
future need to write the table in order to restore its state after a
checkpoint resume or migration.
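
As a rough sketch of the intended userspace flow (illustrative only,
not qemu's actual code): assume vm_fd is an already-created KVM VM
file descriptor, and the liobn and window_size values are made up for
the example.  The table size follows the "one 64-bit TCE per 4kiB of
window" rule described in the documentation hunk below.

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/kvm.h>

int map_tce_table(int vm_fd)
{
	struct kvm_create_spapr_tce args = {
		.liobn = 0x10000000,		/* example logical IO bus number */
		.window_size = 1 << 24,		/* example 16MiB DMA window */
	};
	uint64_t *tbl;
	size_t len, page;
	int tce_fd;

	tce_fd = ioctl(vm_fd, KVM_CREATE_SPAPR_TCE, &args);
	if (tce_fd < 0) {
		perror("KVM_CREATE_SPAPR_TCE");
		return -1;
	}

	/* One 64-bit TCE per 4kiB of DMA window, rounded up to a page. */
	len = (args.window_size >> 12) * sizeof(uint64_t);
	page = sysconf(_SC_PAGESIZE);
	len = (len + page - 1) & ~(page - 1);

	tbl = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, tce_fd, 0);
	if (tbl == MAP_FAILED) {
		perror("mmap");
		close(tce_fd);
		return -1;
	}

	printf("first TCE entry: 0x%llx\n", (unsigned long long)tbl[0]);
	munmap(tbl, len);
	return tce_fd;
}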

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/virtual/kvm/api.txt        |  35 +++++++
 arch/powerpc/include/asm/kvm.h           |   9 ++
 arch/powerpc/include/asm/kvm_book3s_64.h |   2 +
 arch/powerpc/include/asm/kvm_host.h      |   9 ++
 arch/powerpc/include/asm/kvm_ppc.h       |   2 +
 arch/powerpc/kvm/Makefile                |   3 +-
 arch/powerpc/kvm/book3s_64_vio_hv.c      |  73 ++++++++++++++
 arch/powerpc/kvm/book3s_hv.c             | 116 ++++++++++++++++++++++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |   2 +-
 arch/powerpc/kvm/powerpc.c               |  18 ++++
 include/linux/kvm.h                      |   2 +
 11 files changed, 268 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_64_vio_hv.c

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e8875fef3eb8..a1d344d5ff4c 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1350,6 +1350,41 @@ The following flags are defined:
 If datamatch flag is set, the event will be signaled only if the written value
 to the registered address is equal to datamatch in struct kvm_ioeventfd.
 
+4.62 KVM_CREATE_SPAPR_TCE
+
+Capability: KVM_CAP_SPAPR_TCE
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce (in)
+Returns: file descriptor for manipulating the created TCE table
+
+This creates a virtual TCE (translation control entry) table, which
+is an IOMMU for PAPR-style virtual I/O.  It is used to translate
+logical addresses used in virtual I/O into guest physical addresses,
+and provides a scatter/gather capability for PAPR virtual I/O.
+
+/* for KVM_CAP_SPAPR_TCE */
+struct kvm_create_spapr_tce {
+	__u64 liobn;
+	__u32 window_size;
+};
+
+The liobn field gives the logical IO bus number for which to create a
+TCE table.  The window_size field specifies the size of the DMA window
+which this TCE table will translate - the table will contain one
+64-bit TCE entry for every 4kiB of the DMA window.
+
+When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE
+table has been created using this ioctl(), the kernel will handle it
+in real mode, updating the TCE table.  H_PUT_TCE calls for other
+liobns will cause a vm exit and must be handled by userspace.
+
+The return value is a file descriptor which can be passed to mmap(2)
+to map the created TCE table into userspace.  This lets userspace read
+the entries written by kernel-handled H_PUT_TCE calls, and also lets
+userspace update the TCE table directly, which is useful in some
+circumstances.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index d2ca5ed3877b..c3ec990daf45 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -22,6 +22,9 @@
 
 #include <linux/types.h>
 
+/* Select powerpc specific features in <linux/kvm.h> */
+#define __KVM_HAVE_SPAPR_TCE
+
 struct kvm_regs {
 	__u64 pc;
 	__u64 cr;
@@ -272,4 +275,10 @@ struct kvm_guest_debug_arch {
 #define KVM_INTERRUPT_UNSET	-2U
 #define KVM_INTERRUPT_SET_LEVEL	-3U
 
+/* for KVM_CAP_SPAPR_TCE */
+struct kvm_create_spapr_tce {
+	__u64 liobn;
+	__u32 window_size;
+};
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 5f73388ea0af..e43fe42b9875 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -27,4 +27,6 @@ static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
 }
 #endif
 
+#define SPAPR_TCE_SHIFT		12
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 6ebf1721680a..5616e39a7fa4 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -144,6 +144,14 @@ struct kvmppc_pginfo {
 	atomic_t refcnt;
 };
 
+struct kvmppc_spapr_tce_table {
+	struct list_head list;
+	struct kvm *kvm;
+	u64 liobn;
+	u32 window_size;
+	struct page *pages[0];
+};
+
 struct kvm_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
@@ -157,6 +165,7 @@ struct kvm_arch {
 	unsigned long sdr1;
 	unsigned long host_sdr1;
 	int tlbie_lock;
+	struct list_head spapr_tce_tables;
 	unsigned short last_vcpu[NR_CPUS];
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2afe92e6f625..99f6fcf4cf88 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -119,6 +119,8 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern void kvmppc_map_vrma(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+				struct kvm_create_spapr_tce *args);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 2ecffc0dc1bb..1de3d54901d4 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -55,7 +55,8 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv_interrupts.o \
 	book3s_64_mmu_hv.o
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
-	book3s_hv_rm_mmu.o
+	book3s_hv_rm_mmu.o \
+	book3s_64_vio_hv.o
 
 kvm-book3s_64-module-objs := \
 	../../../virt/kvm/kvm_main.o \
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
new file mode 100644
index 000000000000..ea0f8c537c28
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -0,0 +1,73 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/list.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_host.h>
+#include <asm/udbg.h>
+
+#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
+
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+		      unsigned long ioba, unsigned long tce)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_spapr_tce_table *stt;
+
+	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
+	/* 	    liobn, ioba, tce); */
+
+	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn == liobn) {
+			unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
+			struct page *page;
+			u64 *tbl;
+
+			/* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */
+			/* 	    liobn, stt, stt->window_size); */
+			if (ioba >= stt->window_size)
+				return H_PARAMETER;
+
+			page = stt->pages[idx / TCES_PER_PAGE];
+			tbl = (u64 *)page_address(page);
+
+			/* FIXME: Need to validate the TCE itself */
+			/* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
+			tbl[idx % TCES_PER_PAGE] = tce;
+			return H_SUCCESS;
+		}
+	}
+
+	/* Didn't find the liobn, punt it to userspace */
+	return H_TOO_HARD;
+}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index af862c30b70e..6fe469eabce8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -538,6 +538,116 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	return r;
 }
 
+static long kvmppc_stt_npages(unsigned long window_size)
+{
+	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
+		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+}
+
+static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+{
+	struct kvm *kvm = stt->kvm;
+	int i;
+
+	mutex_lock(&kvm->lock);
+	list_del(&stt->list);
+	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+		__free_page(stt->pages[i]);
+	kfree(stt);
+	mutex_unlock(&kvm->lock);
+
+	kvm_put_kvm(kvm);
+}
+
+static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+		return VM_FAULT_SIGBUS;
+
+	page = stt->pages[vmf->pgoff];
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
+	.fault = kvm_spapr_tce_fault,
+};
+
+static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_spapr_tce_vm_ops;
+	return 0;
+}
+
+static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
+{
+	struct kvmppc_spapr_tce_table *stt = filp->private_data;
+
+	release_spapr_tce_table(stt);
+	return 0;
+}
+
+static struct file_operations kvm_spapr_tce_fops = {
+	.mmap           = kvm_spapr_tce_mmap,
+	.release	= kvm_spapr_tce_release,
+};
+
+long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+				   struct kvm_create_spapr_tce *args)
+{
+	struct kvmppc_spapr_tce_table *stt = NULL;
+	long npages;
+	int ret = -ENOMEM;
+	int i;
+
+	/* Check this LIOBN hasn't been previously allocated */
+	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn == args->liobn)
+			return -EBUSY;
+	}
+
+	npages = kvmppc_stt_npages(args->window_size);
+
+	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
+		      GFP_KERNEL);
+	if (!stt)
+		goto fail;
+
+	stt->liobn = args->liobn;
+	stt->window_size = args->window_size;
+	stt->kvm = kvm;
+
+	for (i = 0; i < npages; i++) {
+		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!stt->pages[i])
+			goto fail;
+	}
+
+	kvm_get_kvm(kvm);
+
+	mutex_lock(&kvm->lock);
+	list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+
+	mutex_unlock(&kvm->lock);
+
+	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+				stt, O_RDWR);
+
+fail:
+	if (stt) {
+		for (i = 0; i < npages; i++)
+			if (stt->pages[i])
+				__free_page(stt->pages[i]);
+
+		kfree(stt);
+	}
+	return ret;
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
@@ -559,13 +669,17 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 
 	/* Allocate hashed page table */
 	r = kvmppc_alloc_hpt(kvm);
+	if (r)
+		return r;
 
-	return r;
+	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+	return 0;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
 	kvmppc_free_hpt(kvm);
+	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
 
 /* These are stubs for now */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 319ff63b1f31..e6adaadcdff2 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -754,7 +754,7 @@ hcall_real_table:
 	.long	0		/* 0x14 - H_CLEAR_REF */
 	.long	.kvmppc_h_protect - hcall_real_table
 	.long	0		/* 0x1c - H_GET_TCE */
-	.long	0		/* 0x20 - H_SET_TCE */
+	.long	.kvmppc_h_put_tce - hcall_real_table
 	.long	0		/* 0x24 - H_SET_SPRG0 */
 	.long	.kvmppc_h_set_dabr - hcall_real_table
 	.long	0		/* 0x2c */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6fc9ee499b61..c78ceb9d5605 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -202,6 +202,11 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
+#endif
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	case KVM_CAP_SPAPR_TCE:
+		r = 1;
+		break;
 #endif
 	default:
 		r = 0;
@@ -653,6 +658,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 		break;
 	}
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	case KVM_CREATE_SPAPR_TCE: {
+		struct kvm_create_spapr_tce create_tce;
+		struct kvm *kvm = filp->private_data;
+
+		r = -EFAULT;
+		if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
+			goto out;
+		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
+		goto out;
+	}
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index a156294fc22a..61f56502732e 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -550,6 +550,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_TSC_CONTROL 60
 #define KVM_CAP_GET_TSC_KHZ 61
 #define KVM_CAP_PPC_BOOKE_SREGS 62
+#define KVM_CAP_SPAPR_TCE 63
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -752,6 +753,7 @@ struct kvm_clock_data {
 /* Available with KVM_CAP_XCRS */
 #define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
 #define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
+#define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 

From 371fefd6f2dc46668e00871930dde613b88d4bde Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:23:08 +0000
Subject: [PATCH 111/143] KVM: PPC: Allow book3s_hv guests to use SMT processor
 modes

This lifts the restriction that book3s_hv guests can only run one
hardware thread per core, and allows them to use up to 4 threads
per core on POWER7.  The host still has to run single-threaded.

This capability is advertised to qemu through a new KVM_CAP_PPC_SMT
capability.  The return value of the ioctl querying this capability
is the number of vcpus per virtual CPU core (vcore), currently 4.

To use this, the host kernel should be booted with all threads
active, and then all the secondary threads should be offlined.
This will put the secondary threads into nap mode.  KVM will then
wake them from nap mode and use them for running guest code (while
they are still offline).  To wake the secondary threads, we send
them an IPI using a new xics_wake_cpu() function, implemented in
arch/powerpc/sysdev/xics/icp-native.c.  In other words, at this stage
we assume that the platform has a XICS interrupt controller and
we are using icp-native.c to drive it.  Since the woken thread will
need to acknowledge and clear the IPI, we also export the base
physical address of the XICS registers using kvmppc_set_xics_phys()
for use in the low-level KVM book3s code.

When a vcpu is created, it is assigned to a virtual CPU core.
The vcore number is obtained by dividing the vcpu number by the
number of threads per core in the host.  This number is exported
to userspace via the KVM_CAP_PPC_SMT capability.  If qemu wishes
to run the guest in single-threaded mode, it should make all vcpu
numbers be multiples of the number of threads per core.
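
As a concrete illustration (not part of this patch; the helper name and
the lack of error handling are invented for this sketch), userspace could
create single-threaded guest vcpus by spacing their ids one vcore apart:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: create nr_vcpus single-threaded vcpus by making
 * every vcpu id a multiple of the vcore size reported by KVM_CAP_PPC_SMT.
 * kvm_fd is the /dev/kvm fd, vm_fd the VM fd; error handling is omitted. */
static void create_single_threaded_vcpus(int kvm_fd, int vm_fd, int nr_vcpus)
{
	int vcore_size = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_SMT);
	int i;

	if (vcore_size <= 0)
		vcore_size = 1;		/* capability absent: no spacing needed */

	for (i = 0; i < nr_vcpus; i++)
		ioctl(vm_fd, KVM_CREATE_VCPU, i * vcore_size);
}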

We distinguish three states of a vcpu: runnable (i.e., ready to execute
the guest), blocked (that is, idle), and busy in host.  We currently
implement a policy that the vcore can run only when all its threads
are runnable or blocked.  This way, if a vcpu needs to execute elsewhere
in the kernel or in qemu, it can do so without being starved of CPU
by the other vcpus.

When a vcore starts to run, it executes in the context of one of the
vcpu threads.  The other vcpu threads all go to sleep and stay asleep
until something happens requiring the vcpu thread to return to qemu,
or to wake up to run the vcore (this can happen when another vcpu
thread goes from busy in host state to blocked).

It can happen that a vcpu goes from blocked to runnable state (e.g.
because of an interrupt), and the vcore it belongs to is already
running.  In that case it can start to run immediately as long as
none of the vcpus in the vcore have started to exit the guest.
We send the next free thread in the vcore an IPI to get it to start
to execute the guest.  It synchronizes with the other threads via
the vcore->entry_exit_count field to make sure that it doesn't go
into the guest if the other vcpus are exiting by the time that it
is ready to actually enter the guest.
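
For illustration only (the patch implements this in real-mode assembly
with lwarx/stwcx., not in C; the field layout is the one described in the
kvmppc_vcore comment added below), the entry side of that synchronization
amounts to:

/* Atomically bump the entry count in the low 8 bits of entry_exit_count,
 * but only while the exit count held in bits 8-15 is still zero; returns
 * 0 if some thread has already begun exiting the guest. */
static int try_enter_guest(int *entry_exit_count)
{
	int old, new;

	do {
		old = *entry_exit_count;
		if (old >= 0x100)	/* a thread has started to exit */
			return 0;
		new = old + 1;		/* one more thread entering */
	} while (__sync_val_compare_and_swap(entry_exit_count, old, new) != old);
	return 1;
}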

Note that there is no fixed relationship between the hardware thread
number and the vcpu number.  Hardware threads are assigned to vcpus
as they become runnable, so we will always use the lower-numbered
hardware threads in preference to higher-numbered threads if not all
the vcpus in the vcore are runnable, regardless of which vcpus are
runnable.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/virtual/kvm/api.txt         |  13 +
 arch/powerpc/include/asm/kvm.h            |   1 +
 arch/powerpc/include/asm/kvm_book3s_asm.h |   2 +
 arch/powerpc/include/asm/kvm_host.h       |  46 +++-
 arch/powerpc/include/asm/kvm_ppc.h        |  13 +
 arch/powerpc/kernel/asm-offsets.c         |   6 +
 arch/powerpc/kernel/exceptions-64s.S      |  31 ++-
 arch/powerpc/kernel/idle_power7.S         |   2 -
 arch/powerpc/kvm/book3s_hv.c              | 322 ++++++++++++++++++++--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 168 ++++++++++-
 arch/powerpc/kvm/powerpc.c                |   4 +
 arch/powerpc/sysdev/xics/icp-native.c     |   9 +
 include/linux/kvm.h                       |   1 +
 13 files changed, 570 insertions(+), 48 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index a1d344d5ff4c..681871311d3e 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -180,6 +180,19 @@ KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time.
 If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4
 cpus max.
 
+On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
+threads in one or more virtual CPU cores.  (This is because the
+hardware requires all the hardware threads in a CPU core to be in the
+same partition.)  The KVM_CAP_PPC_SMT capability indicates the number
+of vcpus per virtual core (vcore).  The vcore id is obtained by
+dividing the vcpu id by the number of vcpus per vcore.  The vcpus in a
+given vcore will always be in the same physical core as each other
+(though that might be a different physical core from time to time).
+Userspace can control the threading (SMT) mode of the guest by its
+allocation of vcpu ids.  For example, if userspace wants
+single-threaded guest vcpus, it should make all vcpu ids be a multiple
+of the number of vcpus per vcore.
+
 4.8 KVM_GET_DIRTY_LOG (vm ioctl)
 
 Capability: basic
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index c3ec990daf45..471bb3d85e0b 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -24,6 +24,7 @@
 
 /* Select powerpc specific features in <linux/kvm.h> */
 #define __KVM_HAVE_SPAPR_TCE
+#define __KVM_HAVE_PPC_SMT
 
 struct kvm_regs {
 	__u64 pc;
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index b7b039532fbc..9cfd5436782d 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -78,6 +78,8 @@ struct kvmppc_host_state {
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	struct kvm_vcpu *kvm_vcpu;
+	struct kvmppc_vcore *kvm_vcore;
+	unsigned long xics_phys;
 	u64 dabr;
 	u64 host_mmcr[3];
 	u32 host_pmc[6];
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 5616e39a7fa4..0d6d569e19c7 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -25,10 +25,14 @@
 #include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/kvm_types.h>
+#include <linux/threads.h>
+#include <linux/spinlock.h>
 #include <linux/kvm_para.h>
 #include <asm/kvm_asm.h>
+#include <asm/processor.h>
 
-#define KVM_MAX_VCPUS 1
+#define KVM_MAX_VCPUS		NR_CPUS
+#define KVM_MAX_VCORES		NR_CPUS
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
@@ -167,9 +171,34 @@ struct kvm_arch {
 	int tlbie_lock;
 	struct list_head spapr_tce_tables;
 	unsigned short last_vcpu[NR_CPUS];
+	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
 
+/*
+ * Struct for a virtual core.
+ * Note: entry_exit_count combines an entry count in the bottom 8 bits
+ * and an exit count in the next 8 bits.  This is so that we can
+ * atomically increment the entry count iff the exit count is 0
+ * without taking the lock.
+ */
+struct kvmppc_vcore {
+	int n_runnable;
+	int n_blocked;
+	int num_threads;
+	int entry_exit_count;
+	int n_woken;
+	int nap_count;
+	u16 pcpu;
+	u8 vcore_running;
+	u8 in_guest;
+	struct list_head runnable_threads;
+	spinlock_t lock;
+};
+
+#define VCORE_ENTRY_COUNT(vc)	((vc)->entry_exit_count & 0xff)
+#define VCORE_EXIT_COUNT(vc)	((vc)->entry_exit_count >> 8)
+
 struct kvmppc_pte {
 	ulong eaddr;
 	u64 vpage;
@@ -365,14 +394,29 @@ struct kvm_vcpu_arch {
 	struct slb_shadow *slb_shadow;
 	struct dtl *dtl;
 	struct dtl *dtl_end;
+
+	struct kvmppc_vcore *vcore;
+	int ret;
 	int trap;
+	int state;
+	int ptid;
+	wait_queue_head_t cpu_run;
+
 	struct kvm_vcpu_arch_shared *shared;
 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
 	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	struct kvm_vcpu_arch_shared shregs;
+
+	struct list_head run_list;
+	struct task_struct *run_task;
+	struct kvm_run *kvm_run;
 #endif
 };
 
+#define KVMPPC_VCPU_BUSY_IN_HOST	0
+#define KVMPPC_VCPU_BLOCKED		1
+#define KVMPPC_VCPU_RUNNABLE		2
+
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 99f6fcf4cf88..6ef734428634 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -33,6 +33,9 @@
 #else
 #include <asm/kvm_booke.h>
 #endif
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include <asm/paca.h>
+#endif
 
 enum emulation_result {
 	EMULATE_DONE,         /* no further processing */
@@ -169,4 +172,14 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
 void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+{
+	paca[cpu].kvm_hstate.xics_phys = addr;
+}
+#else
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+{}
+#endif
+
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index c70d106bf1a4..d0f2387fd792 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -471,6 +471,10 @@ int main(void)
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
+	DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
+	DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
+	DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
+	DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
 	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
 			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
 	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
@@ -530,6 +534,8 @@ int main(void)
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
+	HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
+	HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
 	HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
 	HSTATE_FIELD(HSTATE_PMC, host_pmc);
 	HSTATE_FIELD(HSTATE_PURR, host_purr);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 163c041cec24..5bc06fdfa6c0 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -49,19 +49,32 @@ BEGIN_FTR_SECTION
 	 * state loss at this time.
 	 */
 	mfspr	r13,SPRN_SRR1
-	rlwinm	r13,r13,47-31,30,31
-	cmpwi	cr0,r13,1
-	bne	1f
-	b	.power7_wakeup_noloss
-1:	cmpwi	cr0,r13,2
-	bne	1f
-	b	.power7_wakeup_loss
+	rlwinm.	r13,r13,47-31,30,31
+	beq	9f
+
+	/* waking up from powersave (nap) state */
+	cmpwi	cr1,r13,2
 	/* Total loss of HV state is fatal, we could try to use the
 	 * PIR to locate a PACA, then use an emergency stack etc...
 	 * but for now, let's just stay stuck here
 	 */
-1:	cmpwi	cr0,r13,3
-	beq	.
+	bgt	cr1,.
+	GET_PACA(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	lbz	r0,PACAPROCSTART(r13)
+	cmpwi	r0,0x80
+	bne	1f
+	li	r0,0
+	stb	r0,PACAPROCSTART(r13)
+	b	kvm_start_guest
+1:
+#endif
+
+	beq	cr1,2f
+	b	.power7_wakeup_noloss
+2:	b	.power7_wakeup_loss
+9:
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
 #endif /* CONFIG_PPC_P7_NAP */
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index f8f0bc7f1d4f..3a70845a51c7 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -73,7 +73,6 @@ _GLOBAL(power7_idle)
 	b	.
 
 _GLOBAL(power7_wakeup_loss)
-	GET_PACA(r13)
 	ld	r1,PACAR1(r13)
 	REST_NVGPRS(r1)
 	REST_GPR(2, r1)
@@ -87,7 +86,6 @@ _GLOBAL(power7_wakeup_loss)
 	rfid
 
 _GLOBAL(power7_wakeup_noloss)
-	GET_PACA(r13)
 	ld	r1,PACAR1(r13)
 	ld	r4,_MSR(r1)
 	ld	r5,_NIP(r1)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6fe469eabce8..36b6d98f1197 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -39,6 +39,7 @@
 #include <asm/mmu_context.h>
 #include <asm/lppaca.h>
 #include <asm/processor.h>
+#include <asm/cputhreads.h>
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
@@ -51,12 +52,16 @@
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	local_paca->kvm_hstate.kvm_vcpu = vcpu;
+	local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore;
 }
 
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
 }
 
+static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
+static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
+
 void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
 {
 	u64 now;
@@ -74,11 +79,15 @@ void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
 			      HRTIMER_MODE_REL);
 	}
 
+	kvmppc_vcpu_blocked(vcpu);
+
 	kvm_vcpu_block(vcpu);
 	vcpu->stat.halt_wakeup++;
 
 	if (vcpu->arch.dec_expires != ~(u64)0)
 		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+
+	kvmppc_vcpu_unblocked(vcpu);
 }
 
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
@@ -429,9 +438,16 @@ int kvmppc_core_check_processor_compat(void)
 struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 {
 	struct kvm_vcpu *vcpu;
-	int err = -ENOMEM;
+	int err = -EINVAL;
+	int core;
+	struct kvmppc_vcore *vcore;
 	unsigned long lpcr;
 
+	core = id / threads_per_core;
+	if (core >= KVM_MAX_VCORES)
+		goto out;
+
+	err = -ENOMEM;
 	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
 	if (!vcpu)
 		goto out;
@@ -454,6 +470,38 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 
 	kvmppc_mmu_book3s_hv_init(vcpu);
 
+	/*
+	 * Some vcpus may start out in stopped state.  If we initialize
+	 * them to busy-in-host state they will stop other vcpus in the
+	 * vcore from running.  Instead we initialize them to blocked
+	 * state, effectively considering them to be stopped until we
+	 * see the first run ioctl for them.
+	 */
+	vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+
+	init_waitqueue_head(&vcpu->arch.cpu_run);
+
+	mutex_lock(&kvm->lock);
+	vcore = kvm->arch.vcores[core];
+	if (!vcore) {
+		vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
+		if (vcore) {
+			INIT_LIST_HEAD(&vcore->runnable_threads);
+			spin_lock_init(&vcore->lock);
+		}
+		kvm->arch.vcores[core] = vcore;
+	}
+	mutex_unlock(&kvm->lock);
+
+	if (!vcore)
+		goto free_vcpu;
+
+	spin_lock(&vcore->lock);
+	++vcore->num_threads;
+	++vcore->n_blocked;
+	spin_unlock(&vcore->lock);
+	vcpu->arch.vcore = vcore;
+
 	return vcpu;
 
 free_vcpu:
@@ -468,21 +516,121 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	kfree(vcpu);
 }
 
-extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
-
-static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu)
+static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu)
 {
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	spin_lock(&vc->lock);
+	vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+	++vc->n_blocked;
+	if (vc->n_runnable > 0 &&
+	    vc->n_runnable + vc->n_blocked == vc->num_threads) {
+		vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
+					arch.run_list);
+		wake_up(&vcpu->arch.cpu_run);
+	}
+	spin_unlock(&vc->lock);
+}
+
+static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	spin_lock(&vc->lock);
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+	--vc->n_blocked;
+	spin_unlock(&vc->lock);
+}
+
+extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+extern void xics_wake_cpu(int cpu);
+
+static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
+				   struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu *v;
+
+	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
+		return;
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+	--vc->n_runnable;
+	/* decrement the physical thread id of each following vcpu */
+	v = vcpu;
+	list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
+		--v->arch.ptid;
+	list_del(&vcpu->arch.run_list);
+}
+
+static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
+{
+	int cpu;
+	struct paca_struct *tpaca;
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	cpu = vc->pcpu + vcpu->arch.ptid;
+	tpaca = &paca[cpu];
+	tpaca->kvm_hstate.kvm_vcpu = vcpu;
+	tpaca->kvm_hstate.kvm_vcore = vc;
+	smp_wmb();
+#ifdef CONFIG_PPC_ICP_NATIVE
+	if (vcpu->arch.ptid) {
+		tpaca->cpu_start = 0x80;
+		tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST;
+		wmb();
+		xics_wake_cpu(cpu);
+		++vc->n_woken;
+	}
+#endif
+}
+
+static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
+{
+	int i;
+
+	HMT_low();
+	i = 0;
+	while (vc->nap_count < vc->n_woken) {
+		if (++i >= 1000000) {
+			pr_err("kvmppc_wait_for_nap timeout %d %d\n",
+			       vc->nap_count, vc->n_woken);
+			break;
+		}
+		cpu_relax();
+	}
+	HMT_medium();
+}
+
+/*
+ * Check that we are on thread 0 and that any other threads in
+ * this core are off-line.
+ */
+static int on_primary_thread(void)
+{
+	int cpu = smp_processor_id();
+	int thr = cpu_thread_in_core(cpu);
+
+	if (thr)
+		return 0;
+	while (++thr < threads_per_core)
+		if (cpu_online(cpu + thr))
+			return 0;
+	return 1;
+}
+
+/*
+ * Run a set of guest threads on a physical core.
+ * Called with vc->lock held.
+ */
+static int kvmppc_run_core(struct kvmppc_vcore *vc)
+{
+	struct kvm_vcpu *vcpu, *vnext;
+	long ret;
 	u64 now;
 
-	if (signal_pending(current)) {
-		run->exit_reason = KVM_EXIT_INTR;
-		return -EINTR;
-	}
-
-	flush_fp_to_thread(current);
-	flush_altivec_to_thread(current);
-	flush_vsx_to_thread(current);
-	preempt_disable();
+	/* don't start if any threads have a signal pending */
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+		if (signal_pending(vcpu->arch.run_task))
+			return 0;
 
 	/*
 	 * Make sure we are running on thread 0, and that
@@ -490,36 +638,150 @@ static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	 * XXX we should also block attempts to bring any
 	 * secondary threads online.
 	 */
-	if (threads_per_core > 1) {
-		int cpu = smp_processor_id();
-		int thr = cpu_thread_in_core(cpu);
-
-		if (thr)
-			goto out;
-		while (++thr < threads_per_core)
-			if (cpu_online(cpu + thr))
-				goto out;
+	if (threads_per_core > 1 && !on_primary_thread()) {
+		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+			vcpu->arch.ret = -EBUSY;
+		goto out;
 	}
 
-	kvm_guest_enter();
+	vc->n_woken = 0;
+	vc->nap_count = 0;
+	vc->entry_exit_count = 0;
+	vc->vcore_running = 1;
+	vc->in_guest = 0;
+	vc->pcpu = smp_processor_id();
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+		kvmppc_start_thread(vcpu);
+	vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
+				arch.run_list);
 
+	spin_unlock(&vc->lock);
+
+	preempt_disable();
+	kvm_guest_enter();
 	__kvmppc_vcore_entry(NULL, vcpu);
 
+	/* wait for secondary threads to finish writing their state to memory */
+	spin_lock(&vc->lock);
+	if (vc->nap_count < vc->n_woken)
+		kvmppc_wait_for_nap(vc);
+	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
+	vc->vcore_running = 2;
+	spin_unlock(&vc->lock);
+
+	/* make sure updates to secondary vcpu structs are visible now */
+	smp_mb();
 	kvm_guest_exit();
 
 	preempt_enable();
 	kvm_resched(vcpu);
 
 	now = get_tb();
-	/* cancel pending dec exception if dec is positive */
-	if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
-		kvmppc_core_dequeue_dec(vcpu);
-
-	return kvmppc_handle_exit(run, vcpu, current);
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
+		/* cancel pending dec exception if dec is positive */
+		if (now < vcpu->arch.dec_expires &&
+		    kvmppc_core_pending_dec(vcpu))
+			kvmppc_core_dequeue_dec(vcpu);
+		if (!vcpu->arch.trap) {
+			if (signal_pending(vcpu->arch.run_task)) {
+				vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
+				vcpu->arch.ret = -EINTR;
+			}
+			continue;		/* didn't get to run */
+		}
+		ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
+					 vcpu->arch.run_task);
+		vcpu->arch.ret = ret;
+		vcpu->arch.trap = 0;
+	}
 
+	spin_lock(&vc->lock);
  out:
-	preempt_enable();
-	return -EBUSY;
+	vc->vcore_running = 0;
+	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+				 arch.run_list) {
+		if (vcpu->arch.ret != RESUME_GUEST) {
+			kvmppc_remove_runnable(vc, vcpu);
+			wake_up(&vcpu->arch.cpu_run);
+		}
+	}
+
+	return 1;
+}
+
+static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	int ptid;
+	int wait_state;
+	struct kvmppc_vcore *vc;
+	DEFINE_WAIT(wait);
+
+	/* No need to go into the guest when all we do is going out */
+	if (signal_pending(current)) {
+		kvm_run->exit_reason = KVM_EXIT_INTR;
+		return -EINTR;
+	}
+
+	kvm_run->exit_reason = 0;
+	vcpu->arch.ret = RESUME_GUEST;
+	vcpu->arch.trap = 0;
+
+	flush_fp_to_thread(current);
+	flush_altivec_to_thread(current);
+	flush_vsx_to_thread(current);
+
+	/*
+	 * Synchronize with other threads in this virtual core
+	 */
+	vc = vcpu->arch.vcore;
+	spin_lock(&vc->lock);
+	/* This happens the first time this is called for a vcpu */
+	if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
+		--vc->n_blocked;
+	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+	ptid = vc->n_runnable;
+	vcpu->arch.run_task = current;
+	vcpu->arch.kvm_run = kvm_run;
+	vcpu->arch.ptid = ptid;
+	list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
+	++vc->n_runnable;
+
+	wait_state = TASK_INTERRUPTIBLE;
+	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+		if (signal_pending(current)) {
+			if (!vc->vcore_running) {
+				kvm_run->exit_reason = KVM_EXIT_INTR;
+				vcpu->arch.ret = -EINTR;
+				break;
+			}
+			/* have to wait for vcore to stop executing guest */
+			wait_state = TASK_UNINTERRUPTIBLE;
+			smp_send_reschedule(vc->pcpu);
+		}
+
+		if (!vc->vcore_running &&
+		    vc->n_runnable + vc->n_blocked == vc->num_threads) {
+			/* we can run now */
+			if (kvmppc_run_core(vc))
+				continue;
+		}
+
+		if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0)
+			kvmppc_start_thread(vcpu);
+
+		/* wait for other threads to come in, or wait for vcore */
+		prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
+		spin_unlock(&vc->lock);
+		schedule();
+		finish_wait(&vcpu->arch.cpu_run, &wait);
+		spin_lock(&vc->lock);
+	}
+
+	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+		kvmppc_remove_runnable(vc, vcpu);
+	spin_unlock(&vc->lock);
+
+	return vcpu->arch.ret;
 }
 
 int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index e6adaadcdff2..c9bf177b7cf2 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -30,8 +30,6 @@
  *                                                                           *
  ****************************************************************************/
 
-#define SHADOW_VCPU_OFF		PACA_KVM_SVCPU
-
 	.globl	kvmppc_skip_interrupt
 kvmppc_skip_interrupt:
 	mfspr	r13,SPRN_SRR0
@@ -79,6 +77,32 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
  *                                                                            *
  *****************************************************************************/
 
+#define XICS_XIRR		4
+#define XICS_QIRR		0xc
+
+/*
+ * We come in here when wakened from nap mode on a secondary hw thread.
+ * Relocation is off and most register values are lost.
+ * r13 points to the PACA.
+ */
+	.globl	kvm_start_guest
+kvm_start_guest:
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,STACK_FRAME_OVERHEAD
+
+	/* get vcpu pointer */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+
+	/* We got here with an IPI; clear it */
+	ld	r5, HSTATE_XICS_PHYS(r13)
+	li	r0, 0xff
+	li	r6, XICS_QIRR
+	li	r7, XICS_XIRR
+	lwzcix	r8, r5, r7		/* ack the interrupt */
+	sync
+	stbcix	r0, r5, r6		/* clear it */
+	stwcix	r8, r5, r7		/* EOI it */
+
 .global kvmppc_hv_entry
 kvmppc_hv_entry:
 
@@ -200,7 +224,20 @@ kvmppc_hv_entry:
 	slbia
 	ptesync
 
-	/* Switch to guest partition. */
+	/* Increment entry count iff exit count is zero. */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	addi	r9,r5,VCORE_ENTRY_EXIT
+21:	lwarx	r3,0,r9
+	cmpwi	r3,0x100		/* any threads starting to exit? */
+	bge	secondary_too_late	/* if so we're too late to the party */
+	addi	r3,r3,1
+	stwcx.	r3,0,r9
+	bne	21b
+
+	/* Primary thread switches to guest partition. */
+	lwz	r6,VCPU_PTID(r4)
+	cmpwi	r6,0
+	bne	20f
 	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
 	ld	r6,KVM_SDR1(r9)
 	lwz	r7,KVM_LPID(r9)
@@ -210,7 +247,15 @@ kvmppc_hv_entry:
 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
 	mtspr	SPRN_LPID,r7
 	isync
-	ld	r8,VCPU_LPCR(r4)
+	li	r0,1
+	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
+	b	10f
+
+	/* Secondary threads wait for primary to have done partition switch */
+20:	lbz	r0,VCORE_IN_GUEST(r5)
+	cmpwi	r0,0
+	beq	20b
+10:	ld	r8,VCPU_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
 
@@ -225,10 +270,12 @@ kvmppc_hv_entry:
 	 * Invalidate the TLB if we could possibly have stale TLB
 	 * entries for this partition on this core due to the use
 	 * of tlbiel.
+	 * XXX maybe only need this on primary thread?
 	 */
 	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
 	lwz	r5,VCPU_VCPUID(r4)
 	lhz	r6,PACAPACAINDEX(r13)
+	rldimi	r6,r5,0,62		/* XXX map as if threads 1:1 p:v */
 	lhz	r8,VCPU_LAST_CPU(r4)
 	sldi	r7,r6,1			/* see if this is the same vcpu */
 	add	r7,r7,r9		/* as last ran on this pcpu */
@@ -512,8 +559,60 @@ hcall_real_cont:
 	ptesync
 
 hdec_soon:
-	/* Switch back to host partition */
+	/* Increment the threads-exiting-guest count in the 0xff00
+	   bits of vcore->entry_exit_count */
+	lwsync
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	addi	r6,r5,VCORE_ENTRY_EXIT
+41:	lwarx	r3,0,r6
+	addi	r0,r3,0x100
+	stwcx.	r0,0,r6
+	bne	41b
+
+	/*
+	 * At this point we have an interrupt that we have to pass
+	 * up to the kernel or qemu; we can't handle it in real mode.
+	 * Thus we have to do a partition switch, so we have to
+	 * collect the other threads, if we are the first thread
+	 * to take an interrupt.  To do this, we set the HDEC to 0,
+	 * which causes an HDEC interrupt in all threads within 2ns
+	 * because the HDEC register is shared between all 4 threads.
+	 * However, we don't need to bother if this is an HDEC
+	 * interrupt, since the other threads will already be on their
+	 * way here in that case.
+	 */
+	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	beq	40f
+	cmpwi	r3,0x100	/* Are we the first here? */
+	bge	40f
+	cmpwi	r3,1
+	ble	40f
+	li	r0,0
+	mtspr	SPRN_HDEC,r0
+40:
+
+	/* Secondary threads wait for primary to do partition switch */
 	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	lwz	r3,VCPU_PTID(r9)
+	cmpwi	r3,0
+	beq	15f
+	HMT_LOW
+13:	lbz	r3,VCORE_IN_GUEST(r5)
+	cmpwi	r3,0
+	bne	13b
+	HMT_MEDIUM
+	b	16f
+
+	/* Primary thread waits for all the secondaries to exit guest */
+15:	lwz	r3,VCORE_ENTRY_EXIT(r5)
+	srwi	r0,r3,8
+	clrldi	r3,r3,56
+	cmpw	r3,r0
+	bne	15b
+	isync
+
+	/* Primary thread switches back to host partition */
 	ld	r6,KVM_HOST_SDR1(r4)
 	lwz	r7,KVM_HOST_LPID(r4)
 	li	r8,LPID_RSVD		/* switch to reserved LPID */
@@ -522,10 +621,12 @@ hdec_soon:
 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
 	mtspr	SPRN_LPID,r7
 	isync
+	li	r0,0
+	stb	r0,VCORE_IN_GUEST(r5)
 	lis	r8,0x7fff		/* MAX_INT@h */
 	mtspr	SPRN_HDEC,r8
 
-	ld	r8,KVM_HOST_LPCR(r4)
+16:	ld	r8,KVM_HOST_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
 
@@ -634,6 +735,11 @@ hdec_soon:
 	mr	r3, r9
 	bl	.kvmppc_save_fp
 
+	/* Secondary threads go off to take a nap */
+	lwz	r0,VCPU_PTID(r3)
+	cmpwi	r0,0
+	bne	secondary_nap
+
 	/*
 	 * Reload DEC.  HDEC interrupts were disabled when
 	 * we reloaded the host's LPCR value.
@@ -840,6 +946,56 @@ _GLOBAL(kvmppc_h_set_dabr)
 	li	r3,0
 	blr
 
+secondary_too_late:
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	HMT_LOW
+13:	lbz	r3,VCORE_IN_GUEST(r5)
+	cmpwi	r3,0
+	bne	13b
+	HMT_MEDIUM
+	ld	r11,PACA_SLBSHADOWPTR(r13)
+
+	.rept	SLB_NUM_BOLTED
+	ld	r5,SLBSHADOW_SAVEAREA(r11)
+	ld	r6,SLBSHADOW_SAVEAREA+8(r11)
+	andis.	r7,r5,SLB_ESID_V@h
+	beq	1f
+	slbmte	r6,r5
+1:	addi	r11,r11,16
+	.endr
+	b	50f
+
+secondary_nap:
+	/* Clear any pending IPI */
+50:	ld	r5, HSTATE_XICS_PHYS(r13)
+	li	r0, 0xff
+	li	r6, XICS_QIRR
+	stbcix	r0, r5, r6
+
+	/* increment the nap count and then go to nap mode */
+	ld	r4, HSTATE_KVM_VCORE(r13)
+	addi	r4, r4, VCORE_NAP_COUNT
+	lwsync				/* make previous updates visible */
+51:	lwarx	r3, 0, r4
+	addi	r3, r3, 1
+	stwcx.	r3, 0, r4
+	bne	51b
+	isync
+
+	mfspr	r4, SPRN_LPCR
+	li	r0, LPCR_PECE
+	andc	r4, r4, r0
+	ori	r4, r4, LPCR_PECE0	/* exit nap on interrupt */
+	mtspr	SPRN_LPCR, r4
+	li	r0, 0
+	std	r0, HSTATE_SCRATCH0(r13)
+	ptesync
+	ld	r0, HSTATE_SCRATCH0(r13)
+1:	cmpd	r0, r0
+	bne	1b
+	nap
+	b	.
+
 /*
  * Save away FP, VMX and VSX registers.
  * r3 = vcpu pointer
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index c78ceb9d5605..4c549664c987 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -30,6 +30,7 @@
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
 #include <asm/tlbflush.h>
+#include <asm/cputhreads.h>
 #include "timing.h"
 #include "../mm/mmu_decl.h"
 
@@ -207,6 +208,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_SPAPR_TCE:
 		r = 1;
 		break;
+	case KVM_CAP_PPC_SMT:
+		r = threads_per_core;
+		break;
 #endif
 	default:
 		r = 0;
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 1f15ad436140..ba382b59b926 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -17,6 +17,7 @@
 #include <linux/cpu.h>
 #include <linux/of.h>
 #include <linux/spinlock.h>
+#include <linux/module.h>
 
 #include <asm/prom.h>
 #include <asm/io.h>
@@ -24,6 +25,7 @@
 #include <asm/irq.h>
 #include <asm/errno.h>
 #include <asm/xics.h>
+#include <asm/kvm_ppc.h>
 
 struct icp_ipl {
 	union {
@@ -139,6 +141,12 @@ static void icp_native_cause_ipi(int cpu, unsigned long data)
 	icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
+void xics_wake_cpu(int cpu)
+{
+	icp_native_set_qirr(cpu, IPI_PRIORITY);
+}
+EXPORT_SYMBOL_GPL(xics_wake_cpu);
+
 static irqreturn_t icp_native_ipi_action(int irq, void *dev_id)
 {
 	int cpu = smp_processor_id();
@@ -185,6 +193,7 @@ static int __init icp_native_map_one_cpu(int hw_id, unsigned long addr,
 	}
 
 	icp_native_regs[cpu] = ioremap(addr, size);
+	kvmppc_set_xics_phys(cpu, addr);
 	if (!icp_native_regs[cpu]) {
 		pr_warning("icp_native: Failed ioremap for CPU %d, "
 			   "interrupt server #0x%x, addr %#lx\n",
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 61f56502732e..e2a378d97160 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -551,6 +551,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_GET_TSC_KHZ 61
 #define KVM_CAP_PPC_BOOKE_SREGS 62
 #define KVM_CAP_SPAPR_TCE 63
+#define KVM_CAP_PPC_SMT 64
 
 #ifdef KVM_CAP_IRQ_ROUTING
 

From aa04b4cc5be64b4fb9ef4e0fdf2418e2f4737fb2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:25:44 +0000
Subject: [PATCH 112/143] KVM: PPC: Allocate RMAs (Real Mode Areas) at boot for
 use by guests

This adds infrastructure which will be needed to allow book3s_hv KVM to
run on older POWER processors, including PPC970, which don't support
the Virtual Real Mode Area (VRMA) facility, but only the Real Mode
Offset (RMO) facility.  These processors require a physically
contiguous, aligned area of memory for each guest.  When the guest does
an access in real mode (MMU off), the address is compared against a
limit value, and if it is lower, the address is ORed with an offset
value (from the Real Mode Offset Register (RMOR)) and the result becomes
the real address for the access.  The size of the RMA has to be one of
a set of supported values, which usually includes 64MB, 128MB, 256MB
and some larger powers of 2.
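
As a rough model (illustration only, simplified from the architecture
behaviour described above; the function is hypothetical), the real-mode
address handling is:

/* Simplified model of a real-mode (MMU off) guest access under the RMO
 * facility: addresses below the limit are offset by RMOR, anything else
 * is outside the RMA and is not translated this way. */
static int rmo_translate(unsigned long addr, unsigned long rma_limit,
			 unsigned long rmor, unsigned long *real_addr)
{
	if (addr >= rma_limit)
		return -1;		/* outside the RMA */
	*real_addr = addr | rmor;	/* OR in the Real Mode Offset */
	return 0;
}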

Since we are unlikely to be able to allocate 64MB or more of physically
contiguous memory after the kernel has been running for a while, we
allocate a pool of RMAs at boot time using the bootmem allocator.  The
size and number of the RMAs can be set using the kvm_rma_size=xx and
kvm_rma_count=xx kernel command line options.

KVM exports a new capability, KVM_CAP_PPC_RMA, to signal the availability
of the pool of preallocated RMAs.  The capability value is 1 if the
processor can use an RMA but doesn't require one (because it supports
the VRMA facility), or 2 if the processor requires an RMA for each guest.

This adds a new ioctl, KVM_ALLOCATE_RMA, which allocates an RMA from the
pool and returns a file descriptor which can be used to map the RMA.  It
also returns the size of the RMA in the argument structure.
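
A hypothetical userspace sequence (not part of this patch; the helper
name is invented and error handling is omitted) would be to check
KVM_CAP_PPC_RMA, allocate an RMA, mmap it, and register it at guest
physical address 0:

#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

static void *setup_rma(int kvm_fd, int vm_fd)
{
	struct kvm_allocate_rma rma;
	struct kvm_userspace_memory_region mem;
	void *addr;
	int rma_fd;

	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_RMA) == 0)
		return NULL;			/* no RMA pool on this host */

	rma_fd = ioctl(vm_fd, KVM_ALLOCATE_RMA, &rma);	/* fills rma.rma_size */
	addr = mmap(NULL, rma.rma_size, PROT_READ | PROT_WRITE,
		    MAP_SHARED, rma_fd, 0);

	mem.slot = 0;
	mem.flags = 0;
	mem.guest_phys_addr = 0;	/* the RMA backs guest real address 0 */
	mem.memory_size = rma.rma_size;
	mem.userspace_addr = (unsigned long)addr;
	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);

	return addr;
}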

Having an RMA means we will get multiple KVM_SET_USER_MEMORY_REGION
ioctl calls from userspace.  To cope with this, we now preallocate the
kvm->arch.ram_pginfo array when the VM is created with a size sufficient
for up to 64GB of guest memory.  Subsequently we will get rid of this
array and use memory associated with each memslot instead.

This moves most of the code that translates the user addresses into
host pfns (page frame numbers) out of kvmppc_prepare_vrma up one level
to kvmppc_core_prepare_memory_region.  Also, instead of having to look
up the VMA for each page in order to check the page size, we now check
that the pages we get are compound pages of 16MB.  However, if we are
adding memory that is mapped to an RMA, we don't bother with calling
get_user_pages_fast and instead just offset from the base pfn for the
RMA.

Typically the RMA gets added after vcpus are created, which makes it
inconvenient to have the LPCR (logical partition control register) value
in the vcpu->arch struct, since the LPCR controls whether the processor
uses RMA or VRMA for the guest.  This moves the LPCR value into the
kvm->arch struct and arranges for the MER (mediated external request)
bit, which is the only bit that varies between vcpus, to be set in
assembly code when going into the guest if there is a pending external
interrupt request.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/virtual/kvm/api.txt       |  32 +++
 arch/powerpc/include/asm/kvm.h          |   5 +
 arch/powerpc/include/asm/kvm_book3s.h   |   8 -
 arch/powerpc/include/asm/kvm_host.h     |  15 +-
 arch/powerpc/include/asm/kvm_ppc.h      |  10 +
 arch/powerpc/include/asm/reg.h          |   1 +
 arch/powerpc/kernel/asm-offsets.c       |   4 +-
 arch/powerpc/kernel/setup_64.c          |   3 +
 arch/powerpc/kvm/Makefile               |   3 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c     |  97 +--------
 arch/powerpc/kvm/book3s_hv.c            | 259 +++++++++++++++++++++++-
 arch/powerpc/kvm/book3s_hv_builtin.c    | 152 ++++++++++++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |  19 +-
 arch/powerpc/kvm/powerpc.c              |  13 ++
 include/linux/kvm.h                     |   3 +
 15 files changed, 505 insertions(+), 119 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_hv_builtin.c

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 681871311d3e..b0e4b9cd6a66 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1398,6 +1398,38 @@ the entries written by kernel-handled H_PUT_TCE calls, and also lets
 userspace update the TCE table directly which is useful in some
 circumstances.
 
+4.63 KVM_ALLOCATE_RMA
+
+Capability: KVM_CAP_PPC_RMA
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_allocate_rma (out)
+Returns: file descriptor for mapping the allocated RMA
+
+This allocates a Real Mode Area (RMA) from the pool allocated at boot
+time by the kernel.  An RMA is a physically-contiguous, aligned region
+of memory used on older POWER processors to provide the memory which
+will be accessed by real-mode (MMU off) accesses in a KVM guest.
+POWER processors support a set of sizes for the RMA that usually
+includes 64MB, 128MB, 256MB and some larger powers of two.
+
+/* for KVM_ALLOCATE_RMA */
+struct kvm_allocate_rma {
+	__u64 rma_size;
+};
+
+The return value is a file descriptor which can be passed to mmap(2)
+to map the allocated RMA into userspace.  The mapped area can then be
+passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the
+RMA for a virtual machine.  The size of the RMA in bytes (which is
+fixed at host kernel boot time) is returned in the rma_size field of
+the argument structure.
+
+The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl
+is supported; 2 if the processor requires all virtual machines to have
+an RMA, or 1 if the processor can use an RMA but doesn't require it,
+because it supports the Virtual RMA (VRMA) facility.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index 471bb3d85e0b..a4f6c85431f8 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -282,4 +282,9 @@ struct kvm_create_spapr_tce {
 	__u32 window_size;
 };
 
+/* for KVM_ALLOCATE_RMA */
+struct kvm_allocate_rma {
+	__u64 rma_size;
+};
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 5537c45d626c..3f91ebd4ae43 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -298,14 +298,6 @@ static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
 static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
 			unsigned long pending_now, unsigned long old_pending)
 {
-	/* Recalculate LPCR:MER based on the presence of
-	 * a pending external interrupt
-	 */
-	if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &pending_now) ||
-	    test_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &pending_now))
-		vcpu->arch.lpcr |= LPCR_MER;
-	else
-		vcpu->arch.lpcr &= ~((u64)LPCR_MER);
 }
 
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 0d6d569e19c7..f572d9cc31bd 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -28,6 +28,8 @@
 #include <linux/threads.h>
 #include <linux/spinlock.h>
 #include <linux/kvm_para.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
 #include <asm/kvm_asm.h>
 #include <asm/processor.h>
 
@@ -156,6 +158,14 @@ struct kvmppc_spapr_tce_table {
 	struct page *pages[0];
 };
 
+struct kvmppc_rma_info {
+	void		*base_virt;
+	unsigned long	 base_pfn;
+	unsigned long	 npages;
+	struct list_head list;
+	atomic_t 	 use_count;
+};
+
 struct kvm_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
@@ -169,6 +179,10 @@ struct kvm_arch {
 	unsigned long sdr1;
 	unsigned long host_sdr1;
 	int tlbie_lock;
+	int n_rma_pages;
+	unsigned long lpcr;
+	unsigned long rmor;
+	struct kvmppc_rma_info *rma;
 	struct list_head spapr_tce_tables;
 	unsigned short last_vcpu[NR_CPUS];
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
@@ -295,7 +309,6 @@ struct kvm_vcpu_arch {
 	ulong guest_owned_ext;
 	ulong purr;
 	ulong spurr;
-	ulong lpcr;
 	ulong dscr;
 	ulong amr;
 	ulong uamor;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 6ef734428634..d121f49d62b8 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -124,6 +124,10 @@ extern void kvmppc_map_vrma(struct kvm *kvm,
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				struct kvm_create_spapr_tce *args);
+extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
+				struct kvm_allocate_rma *rma);
+extern struct kvmppc_rma_info *kvm_alloc_rma(void);
+extern void kvm_release_rma(struct kvmppc_rma_info *ri);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
@@ -177,9 +181,15 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
 	paca[cpu].kvm_hstate.xics_phys = addr;
 }
+
+extern void kvm_rma_init(void);
+
 #else
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {}
+
+static inline void kvm_rma_init(void)
+{}
 #endif
 
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 36a611b398c5..20a053c14270 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -242,6 +242,7 @@
 #define   LPCR_VRMA_LP1	(1ul << (63-16))
 #define   LPCR_VRMASD_SH (63-16)
 #define   LPCR_RMLS    0x1C000000      /* impl dependent rmo limit sel */
+#define	  LPCR_RMLS_SH	(63-37)
 #define   LPCR_ILE     0x02000000      /* !HV irqs set MSR:LE */
 #define   LPCR_PECE	0x00007000	/* powersave exit cause enable */
 #define     LPCR_PECE0	0x00004000	/* ext. exceptions can cause exit */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index d0f2387fd792..f4aba938166b 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -437,6 +437,8 @@ int main(void)
 	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
 	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
 	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+	DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
+	DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
 	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
 #endif
@@ -459,7 +461,7 @@ int main(void)
 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
 	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
 	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
-	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
+	DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
 	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
 	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
 	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index a88bf2713d41..532054f24ecb 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -63,6 +63,7 @@
 #include <asm/kexec.h>
 #include <asm/mmu_context.h>
 #include <asm/code-patching.h>
+#include <asm/kvm_ppc.h>
 
 #include "setup.h"
 
@@ -580,6 +581,8 @@ void __init setup_arch(char **cmdline_p)
 	/* Initialize the MMU context management stuff */
 	mmu_context_init();
 
+	kvm_rma_init();
+
 	ppc64_boot_msg(0x15, "Setup Done");
 }
 
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 1de3d54901d4..08428e2c188d 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -56,7 +56,8 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_64_mmu_hv.o
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv_rm_mmu.o \
-	book3s_64_vio_hv.o
+	book3s_64_vio_hv.o \
+	book3s_hv_builtin.o
 
 kvm-book3s_64-module-objs := \
 	../../../virt/kvm/kvm_main.o \
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 4a4fbec61a17..96ba96a16abf 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -79,103 +79,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 
 void kvmppc_free_hpt(struct kvm *kvm)
 {
-	unsigned long i;
-	struct kvmppc_pginfo *pginfo;
-
 	clear_bit(kvm->arch.lpid, lpid_inuse);
 	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
-
-	if (kvm->arch.ram_pginfo) {
-		pginfo = kvm->arch.ram_pginfo;
-		kvm->arch.ram_pginfo = NULL;
-		for (i = 0; i < kvm->arch.ram_npages; ++i)
-			put_page(pfn_to_page(pginfo[i].pfn));
-		kfree(pginfo);
-	}
-}
-
-static unsigned long user_page_size(unsigned long addr)
-{
-	struct vm_area_struct *vma;
-	unsigned long size = PAGE_SIZE;
-
-	down_read(&current->mm->mmap_sem);
-	vma = find_vma(current->mm, addr);
-	if (vma)
-		size = vma_kernel_pagesize(vma);
-	up_read(&current->mm->mmap_sem);
-	return size;
-}
-
-static pfn_t hva_to_pfn(unsigned long addr)
-{
-	struct page *page[1];
-	int npages;
-
-	might_sleep();
-
-	npages = get_user_pages_fast(addr, 1, 1, page);
-
-	if (unlikely(npages != 1))
-		return 0;
-
-	return page_to_pfn(page[0]);
-}
-
-long kvmppc_prepare_vrma(struct kvm *kvm,
-			 struct kvm_userspace_memory_region *mem)
-{
-	unsigned long psize, porder;
-	unsigned long i, npages;
-	struct kvmppc_pginfo *pginfo;
-	pfn_t pfn;
-	unsigned long hva;
-
-	/* First see what page size we have */
-	psize = user_page_size(mem->userspace_addr);
-	/* For now, only allow 16MB pages */
-	if (psize != 1ul << VRMA_PAGE_ORDER || (mem->memory_size & (psize - 1))) {
-		pr_err("bad psize=%lx memory_size=%llx @ %llx\n",
-		       psize, mem->memory_size, mem->userspace_addr);
-		return -EINVAL;
-	}
-	porder = __ilog2(psize);
-
-	npages = mem->memory_size >> porder;
-	pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), GFP_KERNEL);
-	if (!pginfo) {
-		pr_err("kvmppc_prepare_vrma: couldn't alloc %lu bytes\n",
-		       npages * sizeof(struct kvmppc_pginfo));
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < npages; ++i) {
-		hva = mem->userspace_addr + (i << porder);
-		if (user_page_size(hva) != psize)
-			goto err;
-		pfn = hva_to_pfn(hva);
-		if (pfn == 0) {
-			pr_err("oops, no pfn for hva %lx\n", hva);
-			goto err;
-		}
-		if (pfn & ((1ul << (porder - PAGE_SHIFT)) - 1)) {
-			pr_err("oops, unaligned pfn %llx\n", pfn);
-			put_page(pfn_to_page(pfn));
-			goto err;
-		}
-		pginfo[i].pfn = pfn;
-	}
-
-	kvm->arch.ram_npages = npages;
-	kvm->arch.ram_psize = psize;
-	kvm->arch.ram_porder = porder;
-	kvm->arch.ram_pginfo = pginfo;
-
-	return 0;
-
- err:
-	kfree(pginfo);
-	return -EINVAL;
 }
 
 void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
@@ -199,6 +104,8 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 
 	for (i = 0; i < npages; ++i) {
 		pfn = pginfo[i].pfn;
+		if (!pfn)
+			break;
 		/* can't use hpt_hash since va > 64 bits */
 		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
 		/*
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 36b6d98f1197..04da135cae61 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -27,6 +27,8 @@
 #include <linux/fs.h>
 #include <linux/anon_inodes.h>
 #include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -40,11 +42,22 @@
 #include <asm/lppaca.h>
 #include <asm/processor.h>
 #include <asm/cputhreads.h>
+#include <asm/page.h>
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 
+/*
+ * For now, limit memory to 64GB and require it to be large pages.
+ * This value is chosen because it makes the ram_pginfo array be
+ * 64kB in size, which is about as large as we want to be trying
+ * to allocate with kmalloc.
+ */
+#define MAX_MEM_ORDER		36
+
+#define LARGE_PAGE_ORDER	24	/* 16MB pages */
+
 /* #define EXIT_DEBUG */
 /* #define EXIT_DEBUG_SIMPLE */
 /* #define EXIT_DEBUG_INT */
@@ -129,7 +142,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 		pr_err("  ESID = %.16llx VSID = %.16llx\n",
 		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
 	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
-	       vcpu->arch.lpcr, vcpu->kvm->arch.sdr1,
+	       vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1,
 	       vcpu->arch.last_inst);
 }
 
@@ -441,7 +454,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	int err = -EINVAL;
 	int core;
 	struct kvmppc_vcore *vcore;
-	unsigned long lpcr;
 
 	core = id / threads_per_core;
 	if (core >= KVM_MAX_VCORES)
@@ -464,10 +476,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	vcpu->arch.pvr = mfspr(SPRN_PVR);
 	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
 
-	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
-	lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE;
-	vcpu->arch.lpcr = lpcr;
-
 	kvmppc_mmu_book3s_hv_init(vcpu);
 
 	/*
@@ -910,24 +918,216 @@ fail:
 	return ret;
 }
 
+/* Work out RMLS (real mode limit selector) field value for a given RMA size.
+   Assumes POWER7. */
+static inline int lpcr_rmls(unsigned long rma_size)
+{
+	switch (rma_size) {
+	case 32ul << 20:	/* 32 MB */
+		return 8;
+	case 64ul << 20:	/* 64 MB */
+		return 3;
+	case 128ul << 20:	/* 128 MB */
+		return 7;
+	case 256ul << 20:	/* 256 MB */
+		return 4;
+	case 1ul << 30:		/* 1 GB */
+		return 2;
+	case 16ul << 30:	/* 16 GB */
+		return 1;
+	case 256ul << 30:	/* 256 GB */
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvmppc_rma_info *ri = vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= ri->npages)
+		return VM_FAULT_SIGBUS;
+
+	page = pfn_to_page(ri->base_pfn + vmf->pgoff);
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_rma_vm_ops = {
+	.fault = kvm_rma_fault,
+};
+
+static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_ops = &kvm_rma_vm_ops;
+	return 0;
+}
+
+static int kvm_rma_release(struct inode *inode, struct file *filp)
+{
+	struct kvmppc_rma_info *ri = filp->private_data;
+
+	kvm_release_rma(ri);
+	return 0;
+}
+
+static struct file_operations kvm_rma_fops = {
+	.mmap           = kvm_rma_mmap,
+	.release	= kvm_rma_release,
+};
+
+long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
+{
+	struct kvmppc_rma_info *ri;
+	long fd;
+
+	ri = kvm_alloc_rma();
+	if (!ri)
+		return -ENOMEM;
+
+	fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR);
+	if (fd < 0)
+		kvm_release_rma(ri);
+
+	ret->rma_size = ri->npages << PAGE_SHIFT;
+	return fd;
+}
+
+static struct page *hva_to_page(unsigned long addr)
+{
+	struct page *page[1];
+	int npages;
+
+	might_sleep();
+
+	npages = get_user_pages_fast(addr, 1, 1, page);
+
+	if (unlikely(npages != 1))
+		return 0;
+
+	return page[0];
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
-	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
-		return kvmppc_prepare_vrma(kvm, mem);
+	unsigned long psize, porder;
+	unsigned long i, npages, totalpages;
+	unsigned long pg_ix;
+	struct kvmppc_pginfo *pginfo;
+	unsigned long hva;
+	struct kvmppc_rma_info *ri = NULL;
+	struct page *page;
+
+	/* For now, only allow 16MB pages */
+	porder = LARGE_PAGE_ORDER;
+	psize = 1ul << porder;
+	if ((mem->memory_size & (psize - 1)) ||
+	    (mem->guest_phys_addr & (psize - 1))) {
+		pr_err("bad memory_size=%llx @ %llx\n",
+		       mem->memory_size, mem->guest_phys_addr);
+		return -EINVAL;
+	}
+
+	npages = mem->memory_size >> porder;
+	totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;
+
+	/* More memory than we have space to track? */
+	if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))
+		return -EINVAL;
+
+	/* Do we already have an RMA registered? */
+	if (mem->guest_phys_addr == 0 && kvm->arch.rma)
+		return -EINVAL;
+
+	if (totalpages > kvm->arch.ram_npages)
+		kvm->arch.ram_npages = totalpages;
+
+	/* Is this one of our preallocated RMAs? */
+	if (mem->guest_phys_addr == 0) {
+		struct vm_area_struct *vma;
+
+		down_read(&current->mm->mmap_sem);
+		vma = find_vma(current->mm, mem->userspace_addr);
+		if (vma && vma->vm_file &&
+		    vma->vm_file->f_op == &kvm_rma_fops &&
+		    mem->userspace_addr == vma->vm_start)
+			ri = vma->vm_file->private_data;
+		up_read(&current->mm->mmap_sem);
+	}
+
+	if (ri) {
+		unsigned long rma_size;
+		unsigned long lpcr;
+		long rmls;
+
+		rma_size = ri->npages << PAGE_SHIFT;
+		if (rma_size > mem->memory_size)
+			rma_size = mem->memory_size;
+		rmls = lpcr_rmls(rma_size);
+		if (rmls < 0) {
+			pr_err("Can't use RMA of 0x%lx bytes\n", rma_size);
+			return -EINVAL;
+		}
+		atomic_inc(&ri->use_count);
+		kvm->arch.rma = ri;
+		kvm->arch.n_rma_pages = rma_size >> porder;
+		lpcr = kvm->arch.lpcr & ~(LPCR_VPM0 | LPCR_VRMA_L);
+		lpcr |= rmls << LPCR_RMLS_SH;
+		kvm->arch.lpcr = lpcr;
+		kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
+		pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
+			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
+	}
+
+	pg_ix = mem->guest_phys_addr >> porder;
+	pginfo = kvm->arch.ram_pginfo + pg_ix;
+	for (i = 0; i < npages; ++i, ++pg_ix) {
+		if (ri && pg_ix < kvm->arch.n_rma_pages) {
+			pginfo[i].pfn = ri->base_pfn +
+				(pg_ix << (porder - PAGE_SHIFT));
+			continue;
+		}
+		hva = mem->userspace_addr + (i << porder);
+		page = hva_to_page(hva);
+		if (!page) {
+			pr_err("oops, no pfn for hva %lx\n", hva);
+			goto err;
+		}
+		/* Check it's a 16MB page */
+		if (!PageHead(page) ||
+		    compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
+			pr_err("page at %lx isn't 16MB (o=%d)\n",
+			       hva, compound_order(page));
+			goto err;
+		}
+		pginfo[i].pfn = page_to_pfn(page);
+	}
+
 	return 0;
+
+ err:
+	return -EINVAL;
 }
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
-	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
+	if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
+	    !kvm->arch.rma)
 		kvmppc_map_vrma(kvm, mem);
 }
 
 int kvmppc_core_init_vm(struct kvm *kvm)
 {
 	long r;
+	unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
+	long err = -ENOMEM;
+	unsigned long lpcr;
 
 	/* Allocate hashed page table */
 	r = kvmppc_alloc_hpt(kvm);
@@ -935,11 +1135,52 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 		return r;
 
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+
+	kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
+				       GFP_KERNEL);
+	if (!kvm->arch.ram_pginfo) {
+		pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
+		       npages * sizeof(struct kvmppc_pginfo));
+		goto out_free;
+	}
+
+	kvm->arch.ram_npages = 0;
+	kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
+	kvm->arch.ram_porder = LARGE_PAGE_ORDER;
+	kvm->arch.rma = NULL;
+	kvm->arch.n_rma_pages = 0;
+
+	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
+	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
+		LPCR_VPM0 | LPCR_VRMA_L;
+	kvm->arch.lpcr = lpcr;
+
+
 	return 0;
+
+ out_free:
+	kvmppc_free_hpt(kvm);
+	return err;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
+	struct kvmppc_pginfo *pginfo;
+	unsigned long i;
+
+	if (kvm->arch.ram_pginfo) {
+		pginfo = kvm->arch.ram_pginfo;
+		kvm->arch.ram_pginfo = NULL;
+		for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
+			if (pginfo[i].pfn)
+				put_page(pfn_to_page(pginfo[i].pfn));
+		kfree(pginfo);
+	}
+	if (kvm->arch.rma) {
+		kvm_release_rma(kvm->arch.rma);
+		kvm->arch.rma = NULL;
+	}
+
 	kvmppc_free_hpt(kvm);
 	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
new file mode 100644
index 000000000000..736df3cbbc55
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+
+#include <asm/cputable.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+/*
+ * This maintains a list of RMAs (real mode areas) for KVM guests to use.
+ * Each RMA has to be physically contiguous and of a size that the
+ * hardware supports.  PPC970 and POWER7 support 64MB, 128MB and 256MB,
+ * and other larger sizes.  Since we are unlikely to be able to allocate that
+ * much physically contiguous memory after the system is up and running,
+ * we preallocate a set of RMAs in early boot for KVM to use.
+ */
+static unsigned long kvm_rma_size = 64 << 20;	/* 64MB */
+static unsigned long kvm_rma_count;
+
+static int __init early_parse_rma_size(char *p)
+{
+	if (!p)
+		return 1;
+
+	kvm_rma_size = memparse(p, &p);
+
+	return 0;
+}
+early_param("kvm_rma_size", early_parse_rma_size);
+
+static int __init early_parse_rma_count(char *p)
+{
+	if (!p)
+		return 1;
+
+	kvm_rma_count = simple_strtoul(p, NULL, 0);
+
+	return 0;
+}
+early_param("kvm_rma_count", early_parse_rma_count);
+
+static struct kvmppc_rma_info *rma_info;
+static LIST_HEAD(free_rmas);
+static DEFINE_SPINLOCK(rma_lock);
+
+/* Work out RMLS (real mode limit selector) field value for a given RMA size.
+   Assumes POWER7. */
+static inline int lpcr_rmls(unsigned long rma_size)
+{
+	switch (rma_size) {
+	case 32ul << 20:	/* 32 MB */
+		return 8;
+	case 64ul << 20:	/* 64 MB */
+		return 3;
+	case 128ul << 20:	/* 128 MB */
+		return 7;
+	case 256ul << 20:	/* 256 MB */
+		return 4;
+	case 1ul << 30:		/* 1 GB */
+		return 2;
+	case 16ul << 30:	/* 16 GB */
+		return 1;
+	case 256ul << 30:	/* 256 GB */
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+/*
+ * Called at boot time while the bootmem allocator is active,
+ * to allocate contiguous physical memory for the real memory
+ * areas for guests.
+ */
+void kvm_rma_init(void)
+{
+	unsigned long i;
+	unsigned long j, npages;
+	void *rma;
+	struct page *pg;
+
+	/* Only do this on POWER7 in HV mode */
+	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
+		return;
+
+	if (!kvm_rma_size || !kvm_rma_count)
+		return;
+
+	/* Check that the requested size is one supported in hardware */
+	if (lpcr_rmls(kvm_rma_size) < 0) {
+		pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
+		return;
+	}
+
+	npages = kvm_rma_size >> PAGE_SHIFT;
+	rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info));
+	for (i = 0; i < kvm_rma_count; ++i) {
+		rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size);
+		pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma,
+			kvm_rma_size >> 20);
+		rma_info[i].base_virt = rma;
+		rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT;
+		rma_info[i].npages = npages;
+		list_add_tail(&rma_info[i].list, &free_rmas);
+		atomic_set(&rma_info[i].use_count, 0);
+
+		pg = pfn_to_page(rma_info[i].base_pfn);
+		for (j = 0; j < npages; ++j) {
+			atomic_inc(&pg->_count);
+			++pg;
+		}
+	}
+}
+
+struct kvmppc_rma_info *kvm_alloc_rma(void)
+{
+	struct kvmppc_rma_info *ri;
+
+	ri = NULL;
+	spin_lock(&rma_lock);
+	if (!list_empty(&free_rmas)) {
+		ri = list_first_entry(&free_rmas, struct kvmppc_rma_info, list);
+		list_del(&ri->list);
+		atomic_inc(&ri->use_count);
+	}
+	spin_unlock(&rma_lock);
+	return ri;
+}
+EXPORT_SYMBOL_GPL(kvm_alloc_rma);
+
+void kvm_release_rma(struct kvmppc_rma_info *ri)
+{
+	if (atomic_dec_and_test(&ri->use_count)) {
+		spin_lock(&rma_lock);
+		list_add_tail(&ri->list, &free_rmas);
+		spin_unlock(&rma_lock);
+
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_release_rma);
+
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index c9bf177b7cf2..9ee223c35285 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -235,10 +235,10 @@ kvmppc_hv_entry:
 	bne	21b
 
 	/* Primary thread switches to guest partition. */
+	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
 	lwz	r6,VCPU_PTID(r4)
 	cmpwi	r6,0
 	bne	20f
-	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
 	ld	r6,KVM_SDR1(r9)
 	lwz	r7,KVM_LPID(r9)
 	li	r0,LPID_RSVD		/* switch to reserved LPID */
@@ -255,8 +255,18 @@ kvmppc_hv_entry:
 20:	lbz	r0,VCORE_IN_GUEST(r5)
 	cmpwi	r0,0
 	beq	20b
-10:	ld	r8,VCPU_LPCR(r4)
-	mtspr	SPRN_LPCR,r8
+
+	/* Set LPCR.  Set the MER bit if there is a pending external irq. */
+10:	ld	r8,KVM_LPCR(r9)
+	ld	r0,VCPU_PENDING_EXC(r4)
+	li	r7,(1 << BOOK3S_IRQPRIO_EXTERNAL)
+	oris	r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+	and.	r0,r0,r7
+	beq	11f
+	ori	r8,r8,LPCR_MER
+11:	mtspr	SPRN_LPCR,r8
+	ld	r8,KVM_RMOR(r9)
+	mtspr	SPRN_RMOR,r8
 	isync
 
 	/* Check if HDEC expires soon */
@@ -464,7 +474,8 @@ hcall_real_cont:
 	/* Check for mediated interrupts (could be done earlier really ...) */
 	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
 	bne+	1f
-	ld	r5,VCPU_LPCR(r9)
+	ld	r5,VCPU_KVM(r9)
+	ld	r5,KVM_LPCR(r5)
 	andi.	r0,r11,MSR_EE
 	beq	1f
 	andi.	r0,r5,LPCR_MER
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 4c549664c987..72c506505fa4 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -211,6 +211,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_SMT:
 		r = threads_per_core;
 		break;
+	case KVM_CAP_PPC_RMA:
+		r = 1;
+		break;
 #endif
 	default:
 		r = 0;
@@ -673,6 +676,16 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
 		goto out;
 	}
+
+	case KVM_ALLOCATE_RMA: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_allocate_rma rma;
+
+		r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
+		if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma)))
+			r = -EFAULT;
+		break;
+	}
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 
 	default:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index e2a378d97160..2c366b52f505 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -552,6 +552,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_BOOKE_SREGS 62
 #define KVM_CAP_SPAPR_TCE 63
 #define KVM_CAP_PPC_SMT 64
+#define KVM_CAP_PPC_RMA	65
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -755,6 +756,8 @@ struct kvm_clock_data {
 #define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
 #define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
 #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
+/* Available with KVM_CAP_RMA */
+#define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 

From 969391c58a4efb8411d6881179945f425ad9cbb5 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:26:11 +0000
Subject: [PATCH 113/143] powerpc, KVM: Split HVMODE_206 cpu feature bit into
 separate HV and architecture bits

This replaces the single CPU_FTR_HVMODE_206 bit with two bits, one to
indicate that we have a usable hypervisor mode, and another to indicate
that the processor conforms to PowerISA version 2.06.  We also add
another bit to indicate that the processor conforms to ISA version 2.01
and set that for PPC970 and derivatives.

Some PPC970 chips (specifically those in Apple machines) have a
hypervisor mode in that MSR[HV] is always 1, but the hypervisor mode
is not useful in the sense that there is no way to run any code in
supervisor mode (HV=0 PR=0).  On these processors, the LPES0 and LPES1
bits in HID4 are always 0, and we use that as a way of detecting that
hypervisor mode is not useful.

Where we have a feature section in assembly code around code that
only applies on POWER7 in hypervisor mode, we use a construct like

END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)

The definition of END_FTR_SECTION_IFSET is such that the code will
be enabled (not overwritten with nops) only if all bits in the
provided mask are set.
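
The run-time C equivalent of such a feature section is simply a check on both
bits, roughly:

	if (cpu_has_feature(CPU_FTR_HVMODE) &&
	    cpu_has_feature(CPU_FTR_ARCH_206)) {
		/* POWER7-with-usable-hypervisor-mode-only path */
	}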

Note that the CPU feature check in __tlbie() only needs to check the
ARCH_206 bit, not the HVMODE bit, because __tlbie() can only get called
if we are running bare-metal, i.e. in hypervisor mode.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/cputable.h    | 14 ++++++++------
 arch/powerpc/include/asm/reg.h         | 16 ++++++++++++----
 arch/powerpc/kernel/cpu_setup_power7.S |  4 ++--
 arch/powerpc/kernel/cpu_setup_ppc970.S | 26 ++++++++++++++++++++++----
 arch/powerpc/kernel/exceptions-64s.S   |  4 ++--
 arch/powerpc/kernel/paca.c             |  2 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c    |  3 ++-
 arch/powerpc/kvm/book3s_hv.c           |  3 ++-
 arch/powerpc/kvm/book3s_hv_builtin.c   |  4 ++--
 arch/powerpc/kvm/book3s_segment.S      |  2 +-
 arch/powerpc/mm/hash_native_64.c       |  4 ++--
 11 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index c0d842cfd012..e30442c539ce 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -179,8 +179,9 @@ extern const char *powerpc_base_platform;
 #define LONG_ASM_CONST(x)		0
 #endif
 
-
-#define CPU_FTR_HVMODE_206		LONG_ASM_CONST(0x0000000800000000)
+#define CPU_FTR_HVMODE			LONG_ASM_CONST(0x0000000200000000)
+#define CPU_FTR_ARCH_201		LONG_ASM_CONST(0x0000000400000000)
+#define CPU_FTR_ARCH_206		LONG_ASM_CONST(0x0000000800000000)
 #define CPU_FTR_CFAR			LONG_ASM_CONST(0x0000001000000000)
 #define CPU_FTR_IABR			LONG_ASM_CONST(0x0000002000000000)
 #define CPU_FTR_MMCRA			LONG_ASM_CONST(0x0000004000000000)
@@ -401,9 +402,10 @@ extern const char *powerpc_base_platform;
 	    CPU_FTR_MMCRA | CPU_FTR_CP_USE_DCBTZ | \
 	    CPU_FTR_STCX_CHECKS_ADDRESS)
 #define CPU_FTRS_PPC970	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
+	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_201 | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA | \
-	    CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS)
+	    CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS | \
+	    CPU_FTR_HVMODE)
 #define CPU_FTRS_POWER5	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -417,13 +419,13 @@ extern const char *powerpc_base_platform;
 	    CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \
 	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_CFAR)
 #define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_HVMODE_206 |\
+	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\
 	    CPU_FTR_MMCRA | CPU_FTR_SMT | \
 	    CPU_FTR_COHERENT_ICACHE | \
 	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
 	    CPU_FTR_DSCR | CPU_FTR_SAO  | CPU_FTR_ASYM_SMT | \
 	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-	    CPU_FTR_ICSWX | CPU_FTR_CFAR)
+	    CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE)
 #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 20a053c14270..ddbe57ae8584 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -307,6 +307,7 @@
 #define SPRN_HASH1	0x3D2		/* Primary Hash Address Register */
 #define SPRN_HASH2	0x3D3		/* Secondary Hash Address Resgister */
 #define SPRN_HID0	0x3F0		/* Hardware Implementation Register 0 */
+#define HID0_HDICE_SH	(63 - 23)	/* 970 HDEC interrupt enable */
 #define HID0_EMCP	(1<<31)		/* Enable Machine Check pin */
 #define HID0_EBA	(1<<29)		/* Enable Bus Address Parity */
 #define HID0_EBD	(1<<28)		/* Enable Bus Data Parity */
@@ -362,6 +363,13 @@
 #define SPRN_IABR2	0x3FA		/* 83xx */
 #define SPRN_IBCR	0x135		/* 83xx Insn Breakpoint Control Reg */
 #define SPRN_HID4	0x3F4		/* 970 HID4 */
+#define  HID4_LPES0	 (1ul << (63-0)) /* LPAR env. sel. bit 0 */
+#define	 HID4_RMLS2_SH	 (63 - 2)	/* Real mode limit bottom 2 bits */
+#define	 HID4_LPID5_SH	 (63 - 6)	/* partition ID bottom 4 bits */
+#define	 HID4_RMOR_SH	 (63 - 22)	/* real mode offset (16 bits) */
+#define  HID4_LPES1	 (1 << (63-57))	/* LPAR env. sel. bit 1 */
+#define  HID4_RMLS0_SH	 (63 - 58)	/* Real mode limit top bit */
+#define	 HID4_LPID1_SH	 0		/* partition ID top 2 bits */
 #define SPRN_HID4_GEKKO	0x3F3		/* Gekko HID4 */
 #define SPRN_HID5	0x3F6		/* 970 HID5 */
 #define SPRN_HID6	0x3F9	/* BE HID 6 */
@@ -811,28 +819,28 @@
 	mfspr	rX,SPRN_SPRG_PACA;			\
 	FTR_SECTION_ELSE_NESTED(66);			\
 	mfspr	rX,SPRN_SPRG_HPACA;			\
-	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #define SET_PACA(rX)					\
 	BEGIN_FTR_SECTION_NESTED(66);			\
 	mtspr	SPRN_SPRG_PACA,rX;			\
 	FTR_SECTION_ELSE_NESTED(66);			\
 	mtspr	SPRN_SPRG_HPACA,rX;			\
-	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #define GET_SCRATCH0(rX)				\
 	BEGIN_FTR_SECTION_NESTED(66);			\
 	mfspr	rX,SPRN_SPRG_SCRATCH0;			\
 	FTR_SECTION_ELSE_NESTED(66);			\
 	mfspr	rX,SPRN_SPRG_HSCRATCH0;			\
-	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #define SET_SCRATCH0(rX)				\
 	BEGIN_FTR_SECTION_NESTED(66);			\
 	mtspr	SPRN_SPRG_SCRATCH0,rX;			\
 	FTR_SECTION_ELSE_NESTED(66);			\
 	mtspr	SPRN_SPRG_HSCRATCH0,rX;			\
-	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #else /* CONFIG_PPC_BOOK3S_64 */
 #define GET_SCRATCH0(rX)	mfspr	rX,SPRN_SPRG_SCRATCH0
diff --git a/arch/powerpc/kernel/cpu_setup_power7.S b/arch/powerpc/kernel/cpu_setup_power7.S
index 2ef6749688e9..76797c5105d6 100644
--- a/arch/powerpc/kernel/cpu_setup_power7.S
+++ b/arch/powerpc/kernel/cpu_setup_power7.S
@@ -45,12 +45,12 @@ _GLOBAL(__restore_cpu_power7)
 	blr
 
 __init_hvmode_206:
-	/* Disable CPU_FTR_HVMODE_206 and exit if MSR:HV is not set */
+	/* Disable CPU_FTR_HVMODE and exit if MSR:HV is not set */
 	mfmsr	r3
 	rldicl.	r0,r3,4,63
 	bnelr
 	ld	r5,CPU_SPEC_FEATURES(r4)
-	LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE_206)
+	LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
 	xor	r5,r5,r6
 	std	r5,CPU_SPEC_FEATURES(r4)
 	blr
diff --git a/arch/powerpc/kernel/cpu_setup_ppc970.S b/arch/powerpc/kernel/cpu_setup_ppc970.S
index 27f2507279d8..12fac8df01c5 100644
--- a/arch/powerpc/kernel/cpu_setup_ppc970.S
+++ b/arch/powerpc/kernel/cpu_setup_ppc970.S
@@ -76,7 +76,7 @@ _GLOBAL(__setup_cpu_ppc970)
 	/* Do nothing if not running in HV mode */
 	mfmsr	r0
 	rldicl.	r0,r0,4,63
-	beqlr
+	beq	no_hv_mode
 
 	mfspr	r0,SPRN_HID0
 	li	r11,5			/* clear DOZE and SLEEP */
@@ -90,7 +90,7 @@ _GLOBAL(__setup_cpu_ppc970MP)
 	/* Do nothing if not running in HV mode */
 	mfmsr	r0
 	rldicl.	r0,r0,4,63
-	beqlr
+	beq	no_hv_mode
 
 	mfspr	r0,SPRN_HID0
 	li	r11,0x15		/* clear DOZE and SLEEP */
@@ -109,6 +109,14 @@ load_hids:
 	sync
 	isync
 
+	/* Try to set LPES = 01 in HID4 */
+	mfspr	r0,SPRN_HID4
+	clrldi	r0,r0,1			/* clear LPES0 */
+	ori	r0,r0,HID4_LPES1	/* set LPES1 */
+	sync
+	mtspr	SPRN_HID4,r0
+	isync
+
 	/* Save away cpu state */
 	LOAD_REG_ADDR(r5,cpu_state_storage)
 
@@ -117,11 +125,21 @@ load_hids:
 	std	r3,CS_HID0(r5)
 	mfspr	r3,SPRN_HID1
 	std	r3,CS_HID1(r5)
-	mfspr	r3,SPRN_HID4
-	std	r3,CS_HID4(r5)
+	mfspr	r4,SPRN_HID4
+	std	r4,CS_HID4(r5)
 	mfspr	r3,SPRN_HID5
 	std	r3,CS_HID5(r5)
 
+	/* See if we successfully set LPES1 to 1; if not we are in Apple mode */
+	andi.	r4,r4,HID4_LPES1
+	bnelr
+
+no_hv_mode:
+	/* Disable CPU_FTR_HVMODE and exit, since we don't have HV mode */
+	ld	r5,CPU_SPEC_FEATURES(r4)
+	LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
+	andc	r5,r5,r6
+	std	r5,CPU_SPEC_FEATURES(r4)
 	blr
 
 /* Called with no MMU context (typically MSR:IR/DR off) to
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 5bc06fdfa6c0..a5345380bef3 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -75,7 +75,7 @@ BEGIN_FTR_SECTION
 	b	.power7_wakeup_noloss
 2:	b	.power7_wakeup_loss
 9:
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif /* CONFIG_PPC_P7_NAP */
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
 				 NOTEST, 0x100)
@@ -173,7 +173,7 @@ hardware_interrupt_hv:
 		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
 					    EXC_STD, SOFTEN_TEST_PR)
 		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
-	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE_206)
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 
 	STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x600)
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index efeb88184182..0a5a899846bb 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -167,7 +167,7 @@ void setup_paca(struct paca_struct *new_paca)
 	 * if we do a GET_PACA() before the feature fixups have been
 	 * applied
 	 */
-	if (cpu_has_feature(CPU_FTR_HVMODE_206))
+	if (cpu_has_feature(CPU_FTR_HVMODE))
 		mtspr(SPRN_SPRG_HPACA, local_paca);
 #endif
 	mtspr(SPRN_SPRG_PACA, local_paca);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 96ba96a16abf..212dcd8fc50b 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -128,7 +128,8 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 
 int kvmppc_mmu_hv_init(void)
 {
-	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
+	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
+	    !cpu_has_feature(CPU_FTR_ARCH_206))
 		return -EINVAL;
 	memset(lpid_inuse, 0, sizeof(lpid_inuse));
 	set_bit(mfspr(SPRN_LPID), lpid_inuse);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 04da135cae61..dc70e7745ab3 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -443,7 +443,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 int kvmppc_core_check_processor_compat(void)
 {
-	if (cpu_has_feature(CPU_FTR_HVMODE_206))
+	if (cpu_has_feature(CPU_FTR_HVMODE) &&
+	    cpu_has_feature(CPU_FTR_ARCH_206))
 		return 0;
 	return -EIO;
 }
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 736df3cbbc55..7315ec6e8177 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -90,8 +90,8 @@ void kvm_rma_init(void)
 	void *rma;
 	struct page *pg;
 
-	/* Only do this on POWER7 in HV mode */
-	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
+	/* Only do this in HV mode */
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
 		return;
 
 	if (!kvm_rma_size || !kvm_rma_count)
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index 134501691ad0..aed32e517212 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -170,7 +170,7 @@ BEGIN_FTR_SECTION
 	mfspr	r4,SPRN_HSRR1
 	andi.	r12,r12,0x3ffd
 	b	2f
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 #endif
 1:	mfsrr0	r3
 	mfsrr1	r4
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index dfd764896db0..b44f5f803052 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -51,7 +51,7 @@ static inline void __tlbie(unsigned long va, int psize, int ssize)
 		va &= ~0xffful;
 		va |= ssize << 8;
 		asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
-			     : : "r" (va), "r"(0), "i" (CPU_FTR_HVMODE_206)
+			     : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
 			     : "memory");
 		break;
 	default:
@@ -61,7 +61,7 @@ static inline void __tlbie(unsigned long va, int psize, int ssize)
 		va |= ssize << 8;
 		va |= 1; /* L */
 		asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
-			     : : "r" (va), "r"(0), "i" (CPU_FTR_HVMODE_206)
+			     : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
 			     : "memory");
 		break;
 	}

From 9e368f2915601cd5bc7f5fd638b58435b018bbd7 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:40:08 +0000
Subject: [PATCH 114/143] KVM: PPC: book3s_hv: Add support for PPC970-family
 processors

This adds support for running KVM guests in supervisor mode on those
PPC970 processors that have a usable hypervisor mode.  Unfortunately,
Apple G5 machines have supervisor mode disabled (MSR[HV] is forced to
1), but the YDL PowerStation does have a usable hypervisor mode.

There are several differences between the PPC970 and POWER7 in how
guests are managed.  These differences are accommodated using the
CPU_FTR_ARCH_201 (PPC970) and CPU_FTR_ARCH_206 (POWER7) CPU feature
bits.  Notably, on PPC970:

* The LPCR, LPID or RMOR registers don't exist, and the functions of
  those registers are provided by bits in HID4 and one bit in HID0.

* External interrupts can be directed to the hypervisor, but unlike
  POWER7 they are masked by MSR[EE] in non-hypervisor modes and use
  SRR0/1 not HSRR0/1.

* There is no virtual RMA (VRMA) mode; the guest must use an RMO
  (real mode offset) area.

* The TLB entries are not tagged with the LPID, so it is necessary to
  flush the whole TLB on partition switch.  Furthermore, when switching
  partitions we have to ensure that no other CPU is executing the tlbie
  or tlbsync instructions in either the old or the new partition,
  otherwise undefined behaviour can occur.

* The PMU has 8 counters (PMC registers) rather than 6.

* The DSCR, PURR, SPURR, AMR, AMOR, UAMOR registers don't exist.

* The SLB has 64 entries rather than 32.

* There is no mediated external interrupt facility, so if we switch to
  a guest that has a virtual external interrupt pending but the guest
  has MSR[EE] = 0, we have to arrange to have an interrupt pending for
  it so that we can get control back once it re-enables interrupts.  We
  do that by sending ourselves an IPI with smp_send_reschedule after
  hard-disabling interrupts.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/exception-64s.h  |   4 +
 arch/powerpc/include/asm/kvm_book3s_asm.h |   2 +-
 arch/powerpc/include/asm/kvm_host.h       |   2 +-
 arch/powerpc/kernel/asm-offsets.c         |   1 +
 arch/powerpc/kernel/exceptions-64s.S      |   2 +-
 arch/powerpc/kvm/Kconfig                  |  13 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c       |  30 ++-
 arch/powerpc/kvm/book3s_hv.c              |  60 ++++--
 arch/powerpc/kvm/book3s_hv_builtin.c      |  11 +-
 arch/powerpc/kvm/book3s_hv_interrupts.S   |  30 +++
 arch/powerpc/kvm/book3s_hv_rm_mmu.c       |   6 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 230 +++++++++++++++++++++-
 arch/powerpc/kvm/powerpc.c                |   3 +
 arch/powerpc/mm/hash_native_64.c          |   2 +-
 14 files changed, 354 insertions(+), 42 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 69435da8f2ba..8057f4f6980f 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -246,6 +246,10 @@ label##_hv:						\
 	KVMTEST(vec);							\
 	_SOFTEN_TEST(EXC_HV)
 
+#define SOFTEN_TEST_HV_201(vec)						\
+	KVMTEST(vec);							\
+	_SOFTEN_TEST(EXC_STD)
+
 #define __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)		\
 	HMT_MEDIUM;							\
 	SET_SCRATCH0(r13);    /* save r13 */				\
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 9cfd5436782d..ef7b3688c3b6 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -82,7 +82,7 @@ struct kvmppc_host_state {
 	unsigned long xics_phys;
 	u64 dabr;
 	u64 host_mmcr[3];
-	u32 host_pmc[6];
+	u32 host_pmc[8];
 	u64 host_purr;
 	u64 host_spurr;
 	u64 host_dscr;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index f572d9cc31bd..cc22b282d755 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -353,7 +353,7 @@ struct kvm_vcpu_arch {
 	u32 dbsr;
 
 	u64 mmcr[3];
-	u32 pmc[6];
+	u32 pmc[8];
 
 #ifdef CONFIG_KVM_EXIT_TIMING
 	struct mutex exit_timing_lock;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index f4aba938166b..54b935f2f5de 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -128,6 +128,7 @@ int main(void)
 	DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page));
 	/* paca */
 	DEFINE(PACA_SIZE, sizeof(struct paca_struct));
+	DEFINE(PACA_LOCK_TOKEN, offsetof(struct paca_struct, lock_token));
 	DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index));
 	DEFINE(PACAPROCSTART, offsetof(struct paca_struct, cpu_start));
 	DEFINE(PACAKSAVE, offsetof(struct paca_struct, kstack));
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index a5345380bef3..41b02c792aa3 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -171,7 +171,7 @@ hardware_interrupt_hv:
 		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
 	FTR_SECTION_ELSE
 		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
-					    EXC_STD, SOFTEN_TEST_PR)
+					    EXC_STD, SOFTEN_TEST_HV_201)
 		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 5d9b78ebbaa6..eeb42e06f2d7 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -67,23 +67,20 @@ config KVM_BOOK3S_64
 	  If unsure, say N.
 
 config KVM_BOOK3S_64_HV
-	bool "KVM support for POWER7 using hypervisor mode in host"
+	bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
 	depends on KVM_BOOK3S_64
 	---help---
 	  Support running unmodified book3s_64 guest kernels in
-	  virtual machines on POWER7 processors that have hypervisor
-	  mode available to the host.
+	  virtual machines on POWER7 and PPC970 processors that have
+	  hypervisor mode available to the host.
 
 	  If you say Y here, KVM will use the hardware virtualization
 	  facilities of POWER7 (and later) processors, meaning that
 	  guest operating systems will run at full hardware speed
 	  using supervisor and user modes.  However, this also means
 	  that KVM is not usable under PowerVM (pHyp), is only usable
-	  on POWER7 (or later) processors, and can only emulate
-	  POWER5+, POWER6 and POWER7 processors.
-
-	  This module provides access to the hardware capabilities through
-	  a character device node named /dev/kvm.
+	  on POWER7 (or later) processors and PPC970-family processors,
+	  and cannot emulate a different processor from the host processor.
 
 	  If unsure, say N.
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 212dcd8fc50b..bc3a2ea94217 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -42,6 +42,8 @@
 #define VRMA_PAGE_ORDER	24
 #define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
 
+/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
+#define MAX_LPID_970	63
 #define NR_LPIDS	(LPID_RSVD + 1)
 unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
 
@@ -69,9 +71,6 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 
 	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
 	kvm->arch.lpid = lpid;
-	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
-	kvm->arch.host_lpid = mfspr(SPRN_LPID);
-	kvm->arch.host_lpcr = mfspr(SPRN_LPCR);
 
 	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
 	return 0;
@@ -128,12 +127,24 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 
 int kvmppc_mmu_hv_init(void)
 {
-	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
-	    !cpu_has_feature(CPU_FTR_ARCH_206))
+	unsigned long host_lpid, rsvd_lpid;
+
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
 		return -EINVAL;
+
 	memset(lpid_inuse, 0, sizeof(lpid_inuse));
-	set_bit(mfspr(SPRN_LPID), lpid_inuse);
-	set_bit(LPID_RSVD, lpid_inuse);
+
+	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+		host_lpid = mfspr(SPRN_LPID);	/* POWER7 */
+		rsvd_lpid = LPID_RSVD;
+	} else {
+		host_lpid = 0;			/* PPC970 */
+		rsvd_lpid = MAX_LPID_970;
+	}
+
+	set_bit(host_lpid, lpid_inuse);
+	/* rsvd_lpid is reserved for use in partition switching */
+	set_bit(rsvd_lpid, lpid_inuse);
 
 	return 0;
 }
@@ -157,7 +168,10 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
 
-	vcpu->arch.slb_nr = 32;		/* Assume POWER7 for now */
+	if (cpu_has_feature(CPU_FTR_ARCH_206))
+		vcpu->arch.slb_nr = 32;		/* POWER7 */
+	else
+		vcpu->arch.slb_nr = 64;
 
 	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
 	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index dc70e7745ab3..cc0d7f1b19ab 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -443,8 +443,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 int kvmppc_core_check_processor_compat(void)
 {
-	if (cpu_has_feature(CPU_FTR_HVMODE) &&
-	    cpu_has_feature(CPU_FTR_ARCH_206))
+	if (cpu_has_feature(CPU_FTR_HVMODE))
 		return 0;
 	return -EIO;
 }
@@ -731,6 +730,10 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		return -EINTR;
 	}
 
+	/* On PPC970, check that we have an RMA region */
+	if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
+		return -EPERM;
+
 	kvm_run->exit_reason = 0;
 	vcpu->arch.ret = RESUME_GUEST;
 	vcpu->arch.trap = 0;
@@ -920,12 +923,14 @@ fail:
 }
 
 /* Work out RMLS (real mode limit selector) field value for a given RMA size.
-   Assumes POWER7. */
+   Assumes POWER7 or PPC970. */
 static inline int lpcr_rmls(unsigned long rma_size)
 {
 	switch (rma_size) {
 	case 32ul << 20:	/* 32 MB */
-		return 8;
+		if (cpu_has_feature(CPU_FTR_ARCH_206))
+			return 8;	/* only supported on POWER7 */
+		return -1;
 	case 64ul << 20:	/* 64 MB */
 		return 3;
 	case 128ul << 20:	/* 128 MB */
@@ -1059,6 +1064,10 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		    mem->userspace_addr == vma->vm_start)
 			ri = vma->vm_file->private_data;
 		up_read(&current->mm->mmap_sem);
+		if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) {
+			pr_err("CPU requires an RMO\n");
+			return -EINVAL;
+		}
 	}
 
 	if (ri) {
@@ -1077,10 +1086,25 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		atomic_inc(&ri->use_count);
 		kvm->arch.rma = ri;
 		kvm->arch.n_rma_pages = rma_size >> porder;
-		lpcr = kvm->arch.lpcr & ~(LPCR_VPM0 | LPCR_VRMA_L);
-		lpcr |= rmls << LPCR_RMLS_SH;
+
+		/* Update LPCR and RMOR */
+		lpcr = kvm->arch.lpcr;
+		if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+			/* PPC970; insert RMLS value (split field) in HID4 */
+			lpcr &= ~((1ul << HID4_RMLS0_SH) |
+				  (3ul << HID4_RMLS2_SH));
+			lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) |
+				((rmls & 3) << HID4_RMLS2_SH);
+			/* RMOR is also in HID4 */
+			lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff)
+				<< HID4_RMOR_SH;
+		} else {
+			/* POWER7 */
+			lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
+			lpcr |= rmls << LPCR_RMLS_SH;
+			kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
+		}
 		kvm->arch.lpcr = lpcr;
-		kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
 		pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
 			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
 	}
@@ -1151,11 +1175,25 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 	kvm->arch.rma = NULL;
 	kvm->arch.n_rma_pages = 0;
 
-	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
-	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
-		LPCR_VPM0 | LPCR_VRMA_L;
-	kvm->arch.lpcr = lpcr;
+	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
 
+	if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+		/* PPC970; HID4 is effectively the LPCR */
+		unsigned long lpid = kvm->arch.lpid;
+		kvm->arch.host_lpid = 0;
+		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
+		lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
+		lpcr |= ((lpid >> 4) << HID4_LPID1_SH) |
+			((lpid & 0xf) << HID4_LPID5_SH);
+	} else {
+		/* POWER7; init LPCR for virtual RMA mode */
+		kvm->arch.host_lpid = mfspr(SPRN_LPID);
+		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
+		lpcr &= LPCR_PECE | LPCR_LPES;
+		lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
+			LPCR_VPM0 | LPCR_VRMA_L;
+	}
+	kvm->arch.lpcr = lpcr;
 
 	return 0;
 
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 7315ec6e8177..d43120355eec 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -55,12 +55,14 @@ static LIST_HEAD(free_rmas);
 static DEFINE_SPINLOCK(rma_lock);
 
 /* Work out RMLS (real mode limit selector) field value for a given RMA size.
-   Assumes POWER7. */
+   Assumes POWER7 or PPC970. */
 static inline int lpcr_rmls(unsigned long rma_size)
 {
 	switch (rma_size) {
 	case 32ul << 20:	/* 32 MB */
-		return 8;
+		if (cpu_has_feature(CPU_FTR_ARCH_206))
+			return 8;	/* only supported on POWER7 */
+		return -1;
 	case 64ul << 20:	/* 64 MB */
 		return 3;
 	case 128ul << 20:	/* 128 MB */
@@ -90,8 +92,9 @@ void kvm_rma_init(void)
 	void *rma;
 	struct page *pg;
 
-	/* Only do this in HV mode */
-	if (!cpu_has_feature(CPU_FTR_HVMODE))
+	/* Only do this on PPC970 in HV mode */
+	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
+	    !cpu_has_feature(CPU_FTR_ARCH_201))
 		return;
 
 	if (!kvm_rma_size || !kvm_rma_count)
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 532afaf19841..3f7b674dd4bf 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -50,8 +50,10 @@ _GLOBAL(__kvmppc_vcore_entry)
 	SAVE_NVGPRS(r1)
 
 	/* Save host DSCR */
+BEGIN_FTR_SECTION
 	mfspr	r3, SPRN_DSCR
 	std	r3, HSTATE_DSCR(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Save host DABR */
 	mfspr	r3, SPRN_DABR
@@ -86,12 +88,20 @@ _GLOBAL(__kvmppc_vcore_entry)
 	mfspr	r7, SPRN_PMC4
 	mfspr	r8, SPRN_PMC5
 	mfspr	r9, SPRN_PMC6
+BEGIN_FTR_SECTION
+	mfspr	r10, SPRN_PMC7
+	mfspr	r11, SPRN_PMC8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	stw	r3, HSTATE_PMC(r13)
 	stw	r5, HSTATE_PMC + 4(r13)
 	stw	r6, HSTATE_PMC + 8(r13)
 	stw	r7, HSTATE_PMC + 12(r13)
 	stw	r8, HSTATE_PMC + 16(r13)
 	stw	r9, HSTATE_PMC + 20(r13)
+BEGIN_FTR_SECTION
+	stw	r10, HSTATE_PMC + 24(r13)
+	stw	r11, HSTATE_PMC + 28(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 31:
 
 	/*
@@ -105,6 +115,26 @@ _GLOBAL(__kvmppc_vcore_entry)
 	add	r8,r8,r7
 	std	r8,HSTATE_DECEXP(r13)
 
+	/*
+	 * On PPC970, if the guest vcpu has an external interrupt pending,
+	 * send ourselves an IPI so as to interrupt the guest once it
+	 * enables interrupts.  (It must have interrupts disabled,
+	 * otherwise we would already have delivered the interrupt.)
+	 */
+BEGIN_FTR_SECTION
+	ld	r0, VCPU_PENDING_EXC(r4)
+	li	r7, (1 << BOOK3S_IRQPRIO_EXTERNAL)
+	oris	r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+	and.	r0, r0, r7
+	beq	32f
+	mr	r31, r4
+	lhz	r3, PACAPACAINDEX(r13)
+	bl	smp_send_reschedule
+	nop
+	mr	r4, r31
+32:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+
 	/* Jump to partition switch code */
 	bl	.kvmppc_hv_entry_trampoline
 	nop
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index edb0aae901a3..fcfe6b055558 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -56,7 +56,8 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	/* only handle 4k, 64k and 16M pages for now */
 	porder = 12;
 	if (pteh & HPTE_V_LARGE) {
-		if ((ptel & 0xf000) == 0x1000) {
+		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+		    (ptel & 0xf000) == 0x1000) {
 			/* 64k page */
 			porder = 16;
 		} else if ((ptel & 0xff000) == 0) {
@@ -126,7 +127,8 @@ static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 	va_low &= 0x7ff;
 	if (v & HPTE_V_LARGE) {
 		rb |= 1;			/* L field */
-		if (r & 0xff000) {
+		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+		    (r & 0xff000)) {
 			/* non-16MB large page, must be 64k */
 			/* (masks depend on page size) */
 			rb |= 0x1000;		/* page encoding in LP field */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9ee223c35285..6dd33581a228 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -148,12 +148,20 @@ kvmppc_hv_entry:
 	lwz	r7, VCPU_PMC + 12(r4)
 	lwz	r8, VCPU_PMC + 16(r4)
 	lwz	r9, VCPU_PMC + 20(r4)
+BEGIN_FTR_SECTION
+	lwz	r10, VCPU_PMC + 24(r4)
+	lwz	r11, VCPU_PMC + 28(r4)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_PMC1, r3
 	mtspr	SPRN_PMC2, r5
 	mtspr	SPRN_PMC3, r6
 	mtspr	SPRN_PMC4, r7
 	mtspr	SPRN_PMC5, r8
 	mtspr	SPRN_PMC6, r9
+BEGIN_FTR_SECTION
+	mtspr	SPRN_PMC7, r10
+	mtspr	SPRN_PMC8, r11
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	ld	r3, VCPU_MMCR(r4)
 	ld	r5, VCPU_MMCR + 8(r4)
 	ld	r6, VCPU_MMCR + 16(r4)
@@ -165,9 +173,11 @@ kvmppc_hv_entry:
 	/* Load up FP, VMX and VSX registers */
 	bl	kvmppc_load_fp
 
+BEGIN_FTR_SECTION
 	/* Switch DSCR to guest value */
 	ld	r5, VCPU_DSCR(r4)
 	mtspr	SPRN_DSCR, r5
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/*
 	 * Set the decrementer to the guest decrementer.
@@ -210,6 +220,7 @@ kvmppc_hv_entry:
 	mtspr	SPRN_DABRX,r5
 	mtspr	SPRN_DABR,r6
 
+BEGIN_FTR_SECTION
 	/* Restore AMR and UAMOR, set AMOR to all 1s */
 	ld	r5,VCPU_AMR(r4)
 	ld	r6,VCPU_UAMOR(r4)
@@ -217,6 +228,7 @@ kvmppc_hv_entry:
 	mtspr	SPRN_AMR,r5
 	mtspr	SPRN_UAMOR,r6
 	mtspr	SPRN_AMOR,r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Clear out SLB */
 	li	r6,0
@@ -224,6 +236,14 @@ kvmppc_hv_entry:
 	slbia
 	ptesync
 
+BEGIN_FTR_SECTION
+	b	30f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+	/*
+	 * POWER7 host -> guest partition switch code.
+	 * We don't have to lock against concurrent tlbies,
+	 * but we do have to coordinate across hardware threads.
+	 */
 	/* Increment entry count iff exit count is zero. */
 	ld	r5,HSTATE_KVM_VCORE(r13)
 	addi	r9,r5,VCORE_ENTRY_EXIT
@@ -315,9 +335,94 @@ kvmppc_hv_entry:
 	ld	r8,VCPU_SPURR(r4)
 	mtspr	SPRN_PURR,r7
 	mtspr	SPRN_SPURR,r8
+	b	31f
+
+	/*
+	 * PPC970 host -> guest partition switch code.
+	 * We have to lock against concurrent tlbies,
+	 * using native_tlbie_lock to lock against host tlbies
+	 * and kvm->arch.tlbie_lock to lock against guest tlbies.
+	 * We also have to invalidate the TLB since its
+	 * entries aren't tagged with the LPID.
+	 */
+30:	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
+
+	/* first take native_tlbie_lock */
+	.section ".toc","aw"
+toc_tlbie_lock:
+	.tc	native_tlbie_lock[TC],native_tlbie_lock
+	.previous
+	ld	r3,toc_tlbie_lock@toc(2)
+	lwz	r8,PACA_LOCK_TOKEN(r13)
+24:	lwarx	r0,0,r3
+	cmpwi	r0,0
+	bne	24b
+	stwcx.	r8,0,r3
+	bne	24b
+	isync
+
+	ld	r7,KVM_LPCR(r9)		/* use kvm->arch.lpcr to store HID4 */
+	li	r0,0x18f
+	rotldi	r0,r0,HID4_LPID5_SH	/* all lpid bits in HID4 = 1 */
+	or	r0,r7,r0
+	ptesync
+	sync
+	mtspr	SPRN_HID4,r0		/* switch to reserved LPID */
+	isync
+	li	r0,0
+	stw	r0,0(r3)		/* drop native_tlbie_lock */
+
+	/* invalidate the whole TLB */
+	li	r0,256
+	mtctr	r0
+	li	r6,0
+25:	tlbiel	r6
+	addi	r6,r6,0x1000
+	bdnz	25b
+	ptesync
+
+	/* Take the guest's tlbie_lock */
+	addi	r3,r9,KVM_TLBIE_LOCK
+24:	lwarx	r0,0,r3
+	cmpwi	r0,0
+	bne	24b
+	stwcx.	r8,0,r3
+	bne	24b
+	isync
+	ld	r6,KVM_SDR1(r9)
+	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+
+	/* Set up HID4 with the guest's LPID etc. */
+	sync
+	mtspr	SPRN_HID4,r7
+	isync
+
+	/* drop the guest's tlbie_lock */
+	li	r0,0
+	stw	r0,0(r3)
+
+	/* Check if HDEC expires soon */
+	mfspr	r3,SPRN_HDEC
+	cmpwi	r3,10
+	li	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	mr	r9,r4
+	blt	hdec_soon
+
+	/* Enable HDEC interrupts */
+	mfspr	r0,SPRN_HID0
+	li	r3,1
+	rldimi	r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1
+	sync
+	mtspr	SPRN_HID0,r0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
 
 	/* Load up guest SLB entries */
-	lwz	r5,VCPU_SLB_MAX(r4)
+31:	lwz	r5,VCPU_SLB_MAX(r4)
 	cmpwi	r5,0
 	beq	9f
 	mtctr	r5
@@ -472,6 +577,7 @@ kvmppc_interrupt:
 hcall_real_cont:
 
 	/* Check for mediated interrupts (could be done earlier really ...) */
+BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
 	bne+	1f
 	ld	r5,VCPU_KVM(r9)
@@ -481,6 +587,7 @@ hcall_real_cont:
 	andi.	r0,r5,LPCR_MER
 	bne	bounce_ext_interrupt
 1:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Save DEC */
 	mfspr	r5,SPRN_DEC
@@ -492,9 +599,11 @@ hcall_real_cont:
 	/* Save HEIR (HV emulation assist reg) in last_inst
 	   if this is an HEI (HV emulation interrupt, e40) */
 	li	r3,-1
+BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
 	bne	11f
 	mfspr	r3,SPRN_HEIR
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 11:	stw	r3,VCPU_LAST_INST(r9)
 
 	/* Save more register state  */
@@ -508,8 +617,10 @@ hcall_real_cont:
 	stw	r7, VCPU_DSISR(r9)
 	std	r8, VCPU_CTR(r9)
 	/* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */
+BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
 	beq	6f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 7:	std	r6, VCPU_FAULT_DAR(r9)
 	stw	r7, VCPU_FAULT_DSISR(r9)
 
@@ -543,6 +654,7 @@ hcall_real_cont:
 	/*
 	 * Save the guest PURR/SPURR
 	 */
+BEGIN_FTR_SECTION
 	mfspr	r5,SPRN_PURR
 	mfspr	r6,SPRN_SPURR
 	ld	r7,VCPU_PURR(r9)
@@ -562,6 +674,7 @@ hcall_real_cont:
 	add	r4,r4,r6
 	mtspr	SPRN_PURR,r3
 	mtspr	SPRN_SPURR,r4
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
 
 	/* Clear out SLB */
 	li	r5,0
@@ -570,6 +683,14 @@ hcall_real_cont:
 	ptesync
 
 hdec_soon:
+BEGIN_FTR_SECTION
+	b	32f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+	/*
+	 * POWER7 guest -> host partition switch code.
+	 * We don't have to lock against tlbies but we do
+	 * have to coordinate the hardware threads.
+	 */
 	/* Increment the threads-exiting-guest count in the 0xff00
 	   bits of vcore->entry_exit_count */
 	lwsync
@@ -640,9 +761,82 @@ hdec_soon:
 16:	ld	r8,KVM_HOST_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
+	b	33f
+
+	/*
+	 * PPC970 guest -> host partition switch code.
+	 * We have to lock against concurrent tlbies, and
+	 * we have to flush the whole TLB.
+	 */
+32:	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
+
+	/* Take the guest's tlbie_lock */
+	lwz	r8,PACA_LOCK_TOKEN(r13)
+	addi	r3,r4,KVM_TLBIE_LOCK
+24:	lwarx	r0,0,r3
+	cmpwi	r0,0
+	bne	24b
+	stwcx.	r8,0,r3
+	bne	24b
+	isync
+
+	ld	r7,KVM_HOST_LPCR(r4)	/* use kvm->arch.host_lpcr for HID4 */
+	li	r0,0x18f
+	rotldi	r0,r0,HID4_LPID5_SH	/* all lpid bits in HID4 = 1 */
+	or	r0,r7,r0
+	ptesync
+	sync
+	mtspr	SPRN_HID4,r0		/* switch to reserved LPID */
+	isync
+	li	r0,0
+	stw	r0,0(r3)		/* drop guest tlbie_lock */
+
+	/* invalidate the whole TLB */
+	li	r0,256
+	mtctr	r0
+	li	r6,0
+25:	tlbiel	r6
+	addi	r6,r6,0x1000
+	bdnz	25b
+	ptesync
+
+	/* take native_tlbie_lock */
+	ld	r3,toc_tlbie_lock@toc(2)
+24:	lwarx	r0,0,r3
+	cmpwi	r0,0
+	bne	24b
+	stwcx.	r8,0,r3
+	bne	24b
+	isync
+
+	ld	r6,KVM_HOST_SDR1(r4)
+	mtspr	SPRN_SDR1,r6		/* switch to host page table */
+
+	/* Set up host HID4 value */
+	sync
+	mtspr	SPRN_HID4,r7
+	isync
+	li	r0,0
+	stw	r0,0(r3)		/* drop native_tlbie_lock */
+
+	lis	r8,0x7fff		/* MAX_INT@h */
+	mtspr	SPRN_HDEC,r8
+
+	/* Disable HDEC interrupts */
+	mfspr	r0,SPRN_HID0
+	li	r3,0
+	rldimi	r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1
+	sync
+	mtspr	SPRN_HID0,r0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
 
 	/* load host SLB entries */
-	ld	r8,PACA_SLBSHADOWPTR(r13)
+33:	ld	r8,PACA_SLBSHADOWPTR(r13)
 
 	.rept	SLB_NUM_BOLTED
 	ld	r5,SLBSHADOW_SAVEAREA(r8)
@@ -654,12 +848,14 @@ hdec_soon:
 	.endr
 
 	/* Save and reset AMR and UAMOR before turning on the MMU */
+BEGIN_FTR_SECTION
 	mfspr	r5,SPRN_AMR
 	mfspr	r6,SPRN_UAMOR
 	std	r5,VCPU_AMR(r9)
 	std	r6,VCPU_UAMOR(r9)
 	li	r6,0
 	mtspr	SPRN_AMR,r6
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Restore host DABR and DABRX */
 	ld	r5,HSTATE_DABR(r13)
@@ -668,10 +864,12 @@ hdec_soon:
 	mtspr	SPRN_DABRX,r6
 
 	/* Switch DSCR back to host value */
+BEGIN_FTR_SECTION
 	mfspr	r8, SPRN_DSCR
 	ld	r7, HSTATE_DSCR(r13)
 	std	r8, VCPU_DSCR(r7)
 	mtspr	SPRN_DSCR, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Save non-volatile GPRs */
 	std	r14, VCPU_GPR(r14)(r9)
@@ -735,21 +933,31 @@ hdec_soon:
 	mfspr	r6, SPRN_PMC4
 	mfspr	r7, SPRN_PMC5
 	mfspr	r8, SPRN_PMC6
+BEGIN_FTR_SECTION
+	mfspr	r10, SPRN_PMC7
+	mfspr	r11, SPRN_PMC8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	stw	r3, VCPU_PMC(r9)
 	stw	r4, VCPU_PMC + 4(r9)
 	stw	r5, VCPU_PMC + 8(r9)
 	stw	r6, VCPU_PMC + 12(r9)
 	stw	r7, VCPU_PMC + 16(r9)
 	stw	r8, VCPU_PMC + 20(r9)
+BEGIN_FTR_SECTION
+	stw	r10, VCPU_PMC + 24(r9)
+	stw	r11, VCPU_PMC + 28(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 22:
 	/* save FP state */
 	mr	r3, r9
 	bl	.kvmppc_save_fp
 
-	/* Secondary threads go off to take a nap */
+	/* Secondary threads go off to take a nap on POWER7 */
+BEGIN_FTR_SECTION
 	lwz	r0,VCPU_PTID(r3)
 	cmpwi	r0,0
 	bne	secondary_nap
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/*
 	 * Reload DEC.  HDEC interrupts were disabled when
@@ -771,12 +979,20 @@ hdec_soon:
 	lwz	r6, HSTATE_PMC + 12(r13)
 	lwz	r8, HSTATE_PMC + 16(r13)
 	lwz	r9, HSTATE_PMC + 20(r13)
+BEGIN_FTR_SECTION
+	lwz	r10, HSTATE_PMC + 24(r13)
+	lwz	r11, HSTATE_PMC + 28(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_PMC1, r3
 	mtspr	SPRN_PMC2, r4
 	mtspr	SPRN_PMC3, r5
 	mtspr	SPRN_PMC4, r6
 	mtspr	SPRN_PMC5, r8
 	mtspr	SPRN_PMC6, r9
+BEGIN_FTR_SECTION
+	mtspr	SPRN_PMC7, r10
+	mtspr	SPRN_PMC8, r11
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	ld	r3, HSTATE_MMCR(r13)
 	ld	r4, HSTATE_MMCR + 8(r13)
 	ld	r5, HSTATE_MMCR + 16(r13)
@@ -802,7 +1018,7 @@ hdec_soon:
 	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
 
 	/* RFI into the highmem handler, or branch to interrupt handler */
-	mfmsr	r6
+12:	mfmsr	r6
 	mtctr	r12
 	li	r0, MSR_RI
 	andc	r6, r6, r0
@@ -812,7 +1028,11 @@ hdec_soon:
 	beqctr
 	RFI
 
-11:	mtspr	SPRN_HSRR0, r8
+11:
+BEGIN_FTR_SECTION
+	b	12b
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+	mtspr	SPRN_HSRR0, r8
 	mtspr	SPRN_HSRR1, r7
 	ba	0x500
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 72c506505fa4..a107c9be0fb1 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -213,6 +213,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 		break;
 	case KVM_CAP_PPC_RMA:
 		r = 1;
+		/* PPC970 requires an RMA */
+		if (cpu_has_feature(CPU_FTR_ARCH_201))
+			r = 2;
 		break;
 #endif
 	default:
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index b44f5f803052..90039bc64119 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -37,7 +37,7 @@
 
 #define HPTE_LOCK_BIT 3
 
-static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
+DEFINE_RAW_SPINLOCK(native_tlbie_lock);
 
 static inline void __tlbie(unsigned long va, int psize, int ssize)
 {

From 29d03158f9d400450c17bb25ee0533b52f651d04 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Fri, 1 Jul 2011 20:26:32 +0200
Subject: [PATCH 115/143] KVM: PPC: Remove prog_flags

Commit c8f729d408 (KVM: PPC: Deliver program interrupts right away instead
of queueing them) did away with all users of prog_flags, so we can just
remove it from the headers.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_book3s.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 3f91ebd4ae43..98da010252a3 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -89,7 +89,6 @@ struct kvmppc_vcpu_book3s {
 	u64 vsid_max;
 #endif
 	int context_id[SID_CONTEXTS];
-	ulong prog_flags; /* flags to inject when giving a 700 trap */
 
 	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
 	struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];

From e03b644fe68b1c6401465b02724d261538dba10f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 Jul 2011 15:28:11 -0400
Subject: [PATCH 116/143] KVM: introduce kvm_read_guest_cached

Introduce the kvm_read_guest_cached() function in addition to the write
one we already have.

[ by glauber: export function signature in kvm header ]
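
A rough sketch of the intended calling pattern (my_shared_area and gpa are
illustrative; the cache is initialized once and then reused on the hot path):

	struct gfn_to_hva_cache ghc;
	struct my_shared_area buf;

	/* one-time setup: cache the gpa -> hva translation for this area */
	kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa);

	/* hot path: read through the cache instead of re-resolving the
	   memslot each time; returns -EFAULT if the hva is bad */
	if (kvm_read_guest_cached(kvm, &ghc, &buf, sizeof(buf)))
		return;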

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Tested-by: Eric Munson <emunson@mgebm.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/kvm_main.c      | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 31ebb59cbd2f..f7df0a3b031d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -381,6 +381,8 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
 			  unsigned long len);
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
+int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+			   void *data, unsigned long len);
 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
 			 int offset, int len);
 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 11d2783eb9df..d5ef9ebcaff7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1418,6 +1418,26 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
 
+int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+			   void *data, unsigned long len)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	int r;
+
+	if (slots->generation != ghc->generation)
+		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
+
+	if (kvm_is_error_hva(ghc->hva))
+		return -EFAULT;
+
+	r = __copy_from_user(data, (void __user *)ghc->hva, len);
+	if (r)
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
+
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
 {
 	return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,

From 4b6b35f55ca81d3bfdec63b0adb61798702ceb2e Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Mon, 11 Jul 2011 15:28:12 -0400
Subject: [PATCH 117/143] KVM: Add constant to represent KVM MSRs enabled bit
 in guest/host interface

This patch is simple and is kept in a separate commit so it can be more
easily shared between guest and hypervisor. It just defines a named
constant to indicate the enable bit for KVM-specific MSRs.
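
As a sketch of the intended guest-side use (MSR_KVM_STEAL_TIME is only defined
by a later patch in this series, and st stands for some per-cpu structure
placed in guest RAM):

	/* hand the hypervisor a guest physical address and switch the
	   feature on by setting the enable bit (bit 0) */
	wrmsrl(MSR_KVM_STEAL_TIME, __pa(&st) | KVM_MSR_ENABLED);

	/* writing a value with bit 0 clear disables it again */
	wrmsrl(MSR_KVM_STEAL_TIME, 0);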

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Tested-by: Eric B Munson <emunson@mgebm.net>
CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_para.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index a427bf77a93d..d6cd79b34aab 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -30,6 +30,7 @@
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+#define KVM_MSR_ENABLED 1
 /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
 #define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01

From 9ddabbe72e41ca6794cb4947c70929c9410e6752 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Mon, 11 Jul 2011 15:28:13 -0400
Subject: [PATCH 118/143] KVM: KVM Steal time guest/host interface

To implement steal time, we need the hypervisor to pass the guest information
about how much time was spent running other processes outside the VM.
This is per-vcpu, and using the kvmclock structure for that is an abuse
we decided not to make.

In this patchset, I am introducing a new MSR, MSR_KVM_STEAL_TIME, that
holds the address of the memory area containing information about steal time.

This patch contains the headers for it. I am keeping it separate to make
it easier for people to backport the kernel part but not the hypervisor,
or the other way around.
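
For illustration, a guest-side read of the structure described below might
follow the version protocol like this (a sketch only; the guest implementation
is not part of this patch):

	static u64 read_steal_time(struct kvm_steal_time *st)
	{
		u64 steal;
		u32 version;

		do {
			version = st->version;	/* odd means an update is in flight */
			rmb();
			steal = st->steal;
			rmb();
		} while ((version & 1) || version != st->version);

		return steal;
	}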

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Tested-by: Eric B Munson <emunson@mgebm.net>
CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/msr.txt | 34 +++++++++++++++++++++++++++++++
 arch/x86/include/asm/kvm_para.h   |  9 ++++++++
 2 files changed, 43 insertions(+)

diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt
index d079aed27e03..50317809113d 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -185,3 +185,37 @@ MSR_KVM_ASYNC_PF_EN: 0x4b564d02
 
 	Currently type 2 APF will be always delivered on the same vcpu as
 	type 1 was, but guest should not rely on that.
+
+MSR_KVM_STEAL_TIME: 0x4b564d03
+
+	data: 64-byte aligned physical address of a memory area which must be
+	in guest RAM, plus an enable bit in bit 0. This memory is expected to
+	hold a copy of the following structure:
+
+	struct kvm_steal_time {
+		__u64 steal;
+		__u32 version;
+		__u32 flags;
+		__u32 pad[12];
+	}
+
+	whose data will be filled in by the hypervisor periodically. Only one
+	write, or registration, is needed for each VCPU. The interval between
+	updates of this structure is arbitrary and implementation-dependent.
+	The hypervisor may update this structure at any time it sees fit until
+	a value with bit 0 == 0 is written to it. The guest is required to make
+	sure this structure is initialized to zero.
+
+	Fields have the following meanings:
+
+		version: a sequence counter. The guest has to check this field
+		before and after grabbing the time information and make sure
+		the two values are equal and even. An odd version indicates an
+		in-progress update.
+
+		flags: At this point, always zero. May be used to indicate
+		changes in this structure in the future.
+
+		steal: the amount of time during which this vCPU did not run,
+		in nanoseconds. Time during which the vCPU is idle will not be
+		reported as steal time.
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index d6cd79b34aab..65f8bb9279e0 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -21,6 +21,7 @@
  */
 #define KVM_FEATURE_CLOCKSOURCE2        3
 #define KVM_FEATURE_ASYNC_PF		4
+#define KVM_FEATURE_STEAL_TIME		5
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
@@ -35,6 +36,14 @@
 #define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02
+#define MSR_KVM_STEAL_TIME  0x4b564d03
+
+struct kvm_steal_time {
+	__u64 steal;
+	__u32 version;
+	__u32 flags;
+	__u32 pad[12];
+};
 
 #define KVM_MAX_MMU_OP_BATCH           32
 

From c9aaa8957f203bd6df83b002fb40b98390bed078 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Mon, 11 Jul 2011 15:28:14 -0400
Subject: [PATCH 119/143] KVM: Steal time implementation

To implement steal time, we need the hypervisor to pass the guest
information about how much time was spent running other processes
outside the VM, while the vcpu had meaningful work to do - halt
time does not count.

This information is acquired through the run_delay field of the
delayacct/schedstats infrastructure, which counts time spent in a
runqueue but not running.

Steal time is per-cpu information, so the traditional MSR-based
infrastructure is used. A new MSR, MSR_KVM_STEAL_TIME, holds the
address of the memory area containing steal time information.

This patch contains the hypervisor part of the steal time infrastructure,
and can be backported independently of the guest portion.
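
Given the constants introduced below, a value written to MSR_KVM_STEAL_TIME
decomposes into a 64-byte aligned address plus an enable bit; a minimal sketch
(decode_steal_msr() is a hypothetical helper, not part of this patch):

	static int decode_steal_msr(u64 data, u64 *gpa, bool *enabled)
	{
		if (data & KVM_STEAL_RESERVED_MASK)	/* bits 1..5 must be 0 */
			return -EINVAL;
		*gpa = data & KVM_STEAL_VALID_BITS;	/* 64-byte aligned area */
		*enabled = data & KVM_MSR_ENABLED;	/* bit 0 */
		return 0;
	}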

[avi, yongjie: export delayacct_on, to avoid build failures in some configs]

Signed-off-by: Glauber Costa <glommer@redhat.com>
Tested-by: Eric B Munson <emunson@mgebm.net>
CC: Rik van Riel <riel@redhat.com>
CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Yongjie Ren <yongjie.ren@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  9 ++++
 arch/x86/include/asm/kvm_para.h |  4 ++
 arch/x86/kvm/Kconfig            |  1 +
 arch/x86/kvm/x86.c              | 74 ++++++++++++++++++++++++++++++++-
 include/linux/kvm_host.h        |  1 +
 kernel/delayacct.c              |  2 +
 6 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index da6bbee878ca..59086a77ff13 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -389,6 +389,15 @@ struct kvm_vcpu_arch {
 	unsigned int hw_tsc_khz;
 	unsigned int time_offset;
 	struct page *time_page;
+
+	struct {
+		u64 msr_val;
+		u64 last_steal;
+		u64 accum_steal;
+		struct gfn_to_hva_cache stime;
+		struct kvm_steal_time steal;
+	} st;
+
 	u64 last_guest_tsc;
 	u64 last_kernel_ns;
 	u64 last_tsc_nsec;
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 65f8bb9279e0..c484ba8e05ea 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -45,6 +45,10 @@ struct kvm_steal_time {
 	__u32 pad[12];
 };
 
+#define KVM_STEAL_ALIGNMENT_BITS 5
+#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
+#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
+
 #define KVM_MAX_MMU_OP_BATCH           32
 
 #define KVM_ASYNC_PF_ENABLED			(1 << 0)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 50f63648ce1b..99c3f0589faa 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -31,6 +31,7 @@ config KVM
 	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
+	select TASK_DELAY_ACCT
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b803f04bde7..c96cdc092484 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -808,12 +808,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN	8
+#define KVM_SAVE_MSRS_BEGIN	9
 static u32 msrs_to_save[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
-	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
+	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 	MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -1488,6 +1488,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void accumulate_steal_time(struct kvm_vcpu *vcpu)
+{
+	u64 delta;
+
+	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+		return;
+
+	delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
+	vcpu->arch.st.last_steal = current->sched_info.run_delay;
+	vcpu->arch.st.accum_steal = delta;
+}
+
+static void record_steal_time(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+		return;
+
+	if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+		return;
+
+	vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
+	vcpu->arch.st.steal.version += 2;
+	vcpu->arch.st.accum_steal = 0;
+
+	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+}
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	switch (msr) {
@@ -1570,6 +1599,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		if (kvm_pv_enable_async_pf(vcpu, data))
 			return 1;
 		break;
+	case MSR_KVM_STEAL_TIME:
+
+		if (unlikely(!sched_info_on()))
+			return 1;
+
+		if (data & KVM_STEAL_RESERVED_MASK)
+			return 1;
+
+		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
+							data & KVM_STEAL_VALID_BITS))
+			return 1;
+
+		vcpu->arch.st.msr_val = data;
+
+		if (!(data & KVM_MSR_ENABLED))
+			break;
+
+		vcpu->arch.st.last_steal = current->sched_info.run_delay;
+
+		preempt_disable();
+		accumulate_steal_time(vcpu);
+		preempt_enable();
+
+		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+
+		break;
+
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1855,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_KVM_ASYNC_PF_EN:
 		data = vcpu->arch.apf.msr_val;
 		break;
+	case MSR_KVM_STEAL_TIME:
+		data = vcpu->arch.st.msr_val;
+		break;
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
 	case MSR_IA32_MCG_CAP:
@@ -2166,6 +2225,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 			kvm_migrate_timers(vcpu);
 		vcpu->cpu = cpu;
 	}
+
+	accumulate_steal_time(vcpu);
+	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2487,6 +2549,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
 			     (1 << KVM_FEATURE_ASYNC_PF) |
 			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+
+		if (sched_info_on())
+			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
+
 		entry->ebx = 0;
 		entry->ecx = 0;
 		entry->edx = 0;
@@ -5470,6 +5536,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			r = 1;
 			goto out;
 		}
+		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
+			record_steal_time(vcpu);
+
 	}
 
 	r = kvm_mmu_reload(vcpu);
@@ -6206,6 +6275,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	vcpu->arch.apf.msr_val = 0;
+	vcpu->arch.st.msr_val = 0;
 
 	kvmclock_reset(vcpu);
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f7df0a3b031d..c8e023902f79 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -47,6 +47,7 @@
 #define KVM_REQ_DEACTIVATE_FPU    10
 #define KVM_REQ_EVENT             11
 #define KVM_REQ_APF_HALT          12
+#define KVM_REQ_STEAL_UPDATE      13
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ead9b610aa71..418b3f7053aa 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,8 +19,10 @@
 #include <linux/time.h>
 #include <linux/sysctl.h>
 #include <linux/delayacct.h>
+#include <linux/module.h>
 
 int delayacct_on __read_mostly = 1;	/* Delay accounting turned on/off */
+EXPORT_SYMBOL_GPL(delayacct_on);
 struct kmem_cache *delayacct_cache;
 
 static int __init delayacct_setup_disable(char *str)

From 3c404b578fab699c4708279938078d9404b255a4 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Mon, 11 Jul 2011 15:28:15 -0400
Subject: [PATCH 120/143] KVM guest: Add a pv_ops stub for steal time

This patch adds a function pointer to one of the many paravirt_ops
structs, to allow guests to register a steal time function. Besides
the steal time function, we also declare two jump labels. They will be
used to allow the steal time code to be easily bypassed when not
in use.
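
The intended consumer pattern, once everything is wired up in later patches of
this series, is a static-branch-guarded fast path (a sketch, not code added
here):

	/* Stays a NOP until the KVM guest code bumps the jump label. */
	if (static_branch(&paravirt_steal_enabled)) {
		u64 steal = paravirt_steal_clock(smp_processor_id());
		/* ... fold 'steal' into the time accounting ... */
	}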

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Tested-by: Eric B Munson <emunson@mgebm.net>
CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/paravirt.h       | 9 +++++++++
 arch/x86/include/asm/paravirt_types.h | 1 +
 arch/x86/kernel/paravirt.c            | 9 +++++++++
 3 files changed, 19 insertions(+)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ebbc4d8ab170..a7d2db9a74fb 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -230,6 +230,15 @@ static inline unsigned long long paravirt_sched_clock(void)
 	return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
 }
 
+struct jump_label_key;
+extern struct jump_label_key paravirt_steal_enabled;
+extern struct jump_label_key paravirt_steal_rq_enabled;
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+	return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu);
+}
+
 static inline unsigned long long paravirt_read_pmc(int counter)
 {
 	return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 82885099c869..2c7652163111 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -89,6 +89,7 @@ struct pv_lazy_ops {
 
 struct pv_time_ops {
 	unsigned long long (*sched_clock)(void);
+	unsigned long long (*steal_clock)(int cpu);
 	unsigned long (*get_tsc_khz)(void);
 };
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 869e1aeeb71b..613a7931ecc1 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -202,6 +202,14 @@ static void native_flush_tlb_single(unsigned long addr)
 	__native_flush_tlb_single(addr);
 }
 
+struct jump_label_key paravirt_steal_enabled;
+struct jump_label_key paravirt_steal_rq_enabled;
+
+static u64 native_steal_clock(int cpu)
+{
+	return 0;
+}
+
 /* These are in entry.S */
 extern void native_iret(void);
 extern void native_irq_enable_sysexit(void);
@@ -307,6 +315,7 @@ struct pv_init_ops pv_init_ops = {
 
 struct pv_time_ops pv_time_ops = {
 	.sched_clock = native_sched_clock,
+	.steal_clock = native_steal_clock,
 };
 
 struct pv_irq_ops pv_irq_ops = {

From 747f2925836b678d2a0de980d70101fd35620f2a Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Mon, 11 Jul 2011 15:28:16 -0400
Subject: [PATCH 121/143] ia64: add jump labels for paravirt

Since in a later patch I intend to use jump labels inside
CONFIG_PARAVIRT code, IA64 would fail to compile if they are not
provided. This patch provides those jump labels for the IA64
architecture.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Isaku Yamahata <yamahata@valinux.co.jp>
Acked-by: Rik van Riel <riel@redhat.com>
CC: Tony Luck <tony.luck@intel.com>
CC: Eddie Dong <eddie.dong@intel.com>
CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Anthony Liguori <aliguori@us.ibm.com>
CC: Eric B Munson <emunson@mgebm.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/paravirt.h | 4 ++++
 arch/ia64/kernel/paravirt.c      | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h
index 2eb0a981a09a..32551d304cd7 100644
--- a/arch/ia64/include/asm/paravirt.h
+++ b/arch/ia64/include/asm/paravirt.h
@@ -281,6 +281,10 @@ paravirt_init_missing_ticks_accounting(int cpu)
 		pv_time_ops.init_missing_ticks_accounting(cpu);
 }
 
+struct jump_label_key;
+extern struct jump_label_key paravirt_steal_enabled;
+extern struct jump_label_key paravirt_steal_rq_enabled;
+
 static inline int
 paravirt_do_steal_accounting(unsigned long *new_itm)
 {
diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c
index a21d7bb9c69c..100868216c55 100644
--- a/arch/ia64/kernel/paravirt.c
+++ b/arch/ia64/kernel/paravirt.c
@@ -634,6 +634,8 @@ struct pv_irq_ops pv_irq_ops = {
  * pv_time_ops
  * time operations
  */
+struct jump_label_key paravirt_steal_enabled;
+struct jump_label_key paravirt_steal_rq_enabled;
 
 static int
 ia64_native_do_steal_accounting(unsigned long *new_itm)

From e6e6685accfa81f509fadfc9624bc7c3862d75c4 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Mon, 11 Jul 2011 15:28:17 -0400
Subject: [PATCH 122/143] KVM guest: Steal time accounting

This patch accounts steal time in account_process_tick.
If one or more ticks are considered stolen in the current
accounting cycle, user/system accounting is skipped. Idle is fine,
since the hypervisor does not report steal time if the guest
is halted.

Accounting steal time from the core scheduler gives us the
advantage of direct access to the runqueue data. At a later
point, it can be used to tweak cpu power and make
the scheduler aware of the time it lost.
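
The conversion from accumulated steal nanoseconds to ticks keeps the remainder
around for the next cycle; for example, with HZ=1000 (TICK_NSEC = 1000000),
3500000 ns of steal accounts 3 ticks now and leaves 500000 ns to be picked up
on the next tick (a sketch of the steal_account_process_tick() logic below):

	u64 st = steal_ticks(steal);			/* whole ticks only */
	this_rq()->prev_steal_time += st * TICK_NSEC;	/* remainder stays pending */
	account_steal_time(st);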

[avi: <asm/paravirt.h> doesn't exist on many archs]

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric B Munson <emunson@mgebm.net>
CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
CC: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 kernel/sched.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/kernel/sched.c b/kernel/sched.c
index 3f2e502d609b..f98a28b19b2a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,9 @@
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 #include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
 
 #include "sched_cpupri.h"
 #include "workqueue_sched.h"
@@ -528,6 +531,9 @@ struct rq {
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	u64 prev_irq_time;
 #endif
+#ifdef CONFIG_PARAVIRT
+	u64 prev_steal_time;
+#endif
 
 	/* calc_load related fields */
 	unsigned long calc_load_update;
@@ -1953,6 +1959,18 @@ void account_system_vtime(struct task_struct *curr)
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
+{
+	if (unlikely(steal > NSEC_PER_SEC))
+		return div_u64(steal, TICK_NSEC);
+
+	return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 	s64 irq_delta;
@@ -3845,6 +3863,25 @@ void account_idle_time(cputime_t cputime)
 		cpustat->idle = cputime64_add(cpustat->idle, cputime64);
 }
 
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+	if (static_branch(&paravirt_steal_enabled)) {
+		u64 steal, st = 0;
+
+		steal = paravirt_steal_clock(smp_processor_id());
+		steal -= this_rq()->prev_steal_time;
+
+		st = steal_ticks(steal);
+		this_rq()->prev_steal_time += st * TICK_NSEC;
+
+		account_steal_time(st);
+		return st;
+	}
+#endif
+	return false;
+}
+
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3876,6 +3913,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 	cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 
+	if (steal_account_process_tick())
+		return;
+
 	if (irqtime_account_hi_update()) {
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
 	} else if (irqtime_account_si_update()) {
@@ -3929,6 +3969,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
 		return;
 	}
 
+	if (steal_account_process_tick())
+		return;
+
 	if (user_tick)
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))

From 095c0aa83e52d6c3dd7168610746703921f570af Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Mon, 11 Jul 2011 15:28:18 -0400
Subject: [PATCH 123/143] sched: adjust scheduler cpu power for stolen time

This patch makes update_rq_clock() aware of steal time.
The mechanism of operation is not different from irq_time,
and follows the same principles. This lives in a CONFIG
option itself, and can be compiled out independently of
the rest of steal time reporting. The effect of disabling it
is that the scheduler will still report steal time (that cannot be
disabled), but won't use this information for cpu power adjustments.

Every time update_rq_clock_task() is invoked, we query information
about how much time was stolen since the last call, and feed it into
sched_rt_avg_update().

Although steal time reporting in account_process_tick() keeps
track of the last time we read the steal clock, in prev_steal_time,
this patch does it independently using another field,
prev_steal_time_rq. This is because otherwise, information about time
accounted in account_process_tick() would never reach us in update_rq_clock().
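
In other words, there are two independent cursors into the same per-cpu steal
clock: rq->prev_steal_time, advanced by the tick accounting path, and
rq->prev_steal_time_rq, advanced here. The essence of the new
update_rq_clock_task() branch is (annotated sketch of the hunk below):

	steal = paravirt_steal_clock(cpu_of(rq)) - rq->prev_steal_time_rq;
	if (steal > delta)
		steal = delta;		/* never discount more than elapsed time */
	steal = steal_ticks(steal) * TICK_NSEC;
	rq->prev_steal_time_rq += steal;
	delta -= steal;			/* clock_task advances only by time we really had */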

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric B Munson <emunson@mgebm.net>
CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
CC: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/Kconfig        | 12 +++++++++++
 kernel/sched.c          | 47 ++++++++++++++++++++++++++++++++---------
 kernel/sched_features.h |  4 ++--
 3 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index da349723d411..1f03e221a01e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -512,6 +512,18 @@ menuconfig PARAVIRT_GUEST
 
 if PARAVIRT_GUEST
 
+config PARAVIRT_TIME_ACCOUNTING
+	bool "Paravirtual steal time accounting"
+	select PARAVIRT
+	default n
+	---help---
+	  Select this option to enable fine granularity task steal time
+	  accounting. Time spent executing other tasks in parallel with
+	  the current vCPU is discounted from the vCPU power. To account for
+	  that, there can be a small performance impact.
+
+	  If in doubt, say N here.
+
 source "arch/x86/xen/Kconfig"
 
 config KVM_CLOCK
diff --git a/kernel/sched.c b/kernel/sched.c
index f98a28b19b2a..b35ac50b26c8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -534,6 +534,9 @@ struct rq {
 #ifdef CONFIG_PARAVIRT
 	u64 prev_steal_time;
 #endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	u64 prev_steal_time_rq;
+#endif
 
 	/* calc_load related fields */
 	unsigned long calc_load_update;
@@ -1973,8 +1976,14 @@ static inline u64 steal_ticks(u64 steal)
 
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-	s64 irq_delta;
-
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+	s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 
 	/*
@@ -1997,12 +2006,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	if (static_branch((&paravirt_steal_rq_enabled))) {
+		u64 st;
+
+		steal = paravirt_steal_clock(cpu_of(rq));
+		steal -= rq->prev_steal_time_rq;
+
+		if (unlikely(steal > delta))
+			steal = delta;
+
+		st = steal_ticks(steal);
+		steal = st * TICK_NSEC;
+
+		rq->prev_steal_time_rq += steal;
+
+		delta -= steal;
+	}
+#endif
+
 	rq->clock_task += delta;
 
-	if (irq_delta && sched_feat(NONIRQ_POWER))
-		sched_rt_avg_update(rq, irq_delta);
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+		sched_rt_avg_update(rq, irq_delta + steal);
+#endif
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 static int irqtime_account_hi_update(void)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -2037,12 +2069,7 @@ static int irqtime_account_si_update(void)
 
 #define sched_clock_irqtime	(0)
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-	rq->clock_task += delta;
-}
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
 
 #include "sched_idletask.c"
 #include "sched_fair.c"
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index be40f7371ee1..ca3b025f8669 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,9 +61,9 @@ SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(OWNER_SPIN, 1)
 
 /*
- * Decrement CPU power based on irq activity
+ * Decrement CPU power based on time not spent running tasks
  */
-SCHED_FEAT(NONIRQ_POWER, 1)
+SCHED_FEAT(NONTASK_POWER, 1)
 
 /*
  * Queue remote wakeups on the target CPU and process them

From d910f5c1064d7ff09c31b0191564f9f99e210f91 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Mon, 11 Jul 2011 15:28:19 -0400
Subject: [PATCH 124/143] KVM guest: KVM Steal time registration

This patch implements the kvm bits of the steal time infrastructure.
The most important part of it is the steal time clock. It is a
continuous clock that shows the accumulated amount of steal time
since vcpu creation. It is supposed to survive cpu offlining/onlining.
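
Per-cpu registration boils down to a single MSR write of the area's physical
address with the enable bit set, mirroring kvm_register_steal_time() below
(sketch):

	struct kvm_steal_time *st = &per_cpu(steal_time, smp_processor_id());

	memset(st, 0, sizeof(*st));
	wrmsrl(MSR_KVM_STEAL_TIME, __pa(st) | KVM_MSR_ENABLED);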

[marcelo: fix build with CONFIG_KVM_GUEST=n]

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Tested-by: Eric B Munson <emunson@mgebm.net>
CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Avi Kivity <avi@redhat.com>
CC: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/kernel-parameters.txt |  4 ++
 arch/x86/include/asm/kvm_para.h     |  6 +++
 arch/x86/kernel/kvm.c               | 72 +++++++++++++++++++++++++++++
 arch/x86/kernel/kvmclock.c          |  2 +
 4 files changed, 84 insertions(+)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index fd248a318211..a7225746ed96 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1737,6 +1737,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	no-kvmapf	[X86,KVM] Disable paravirtualized asynchronous page
 			fault handling.
 
+	no-steal-acc    [X86,KVM] Disable paravirtualized steal time accounting.
+			Steal time is still computed, but it won't influence
+			scheduler behaviour.
+
 	nolapic		[X86-32,APIC] Do not enable or use the local APIC.
 
 	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c484ba8e05ea..734c3767cfac 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -192,6 +192,7 @@ void __init kvm_guest_init(void);
 void kvm_async_pf_task_wait(u32 token);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
+extern void kvm_disable_steal_time(void);
 #else
 #define kvm_guest_init() do { } while (0)
 #define kvm_async_pf_task_wait(T) do {} while(0)
@@ -200,6 +201,11 @@ static inline u32 kvm_read_and_reset_pf_reason(void)
 {
 	return 0;
 }
+
+static inline void kvm_disable_steal_time(void)
+{
+	return;
+}
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33c07b0b122e..a9c2116001d6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -51,6 +51,15 @@ static int parse_no_kvmapf(char *arg)
 
 early_param("no-kvmapf", parse_no_kvmapf);
 
+static int steal_acc = 1;
+static int parse_no_stealacc(char *arg)
+{
+	steal_acc = 0;
+	return 0;
+}
+
+early_param("no-steal-acc", parse_no_stealacc);
+
 struct kvm_para_state {
 	u8 mmu_queue[MMU_QUEUE_SIZE];
 	int mmu_queue_len;
@@ -58,6 +67,8 @@ struct kvm_para_state {
 
 static DEFINE_PER_CPU(struct kvm_para_state, para_state);
 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static int has_steal_clock = 0;
 
 static struct kvm_para_state *kvm_para_state(void)
 {
@@ -441,6 +452,21 @@ static void __init paravirt_ops_setup(void)
 #endif
 }
 
+static void kvm_register_steal_time(void)
+{
+	int cpu = smp_processor_id();
+	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
+
+	if (!has_steal_clock)
+		return;
+
+	memset(st, 0, sizeof(*st));
+
+	wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
+	printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
+		cpu, __pa(st));
+}
+
 void __cpuinit kvm_guest_cpu_init(void)
 {
 	if (!kvm_para_available())
@@ -457,6 +483,9 @@ void __cpuinit kvm_guest_cpu_init(void)
 		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
 		       smp_processor_id());
 	}
+
+	if (has_steal_clock)
+		kvm_register_steal_time();
 }
 
 static void kvm_pv_disable_apf(void *unused)
@@ -483,6 +512,31 @@ static struct notifier_block kvm_pv_reboot_nb = {
 	.notifier_call = kvm_pv_reboot_notify,
 };
 
+static u64 kvm_steal_clock(int cpu)
+{
+	u64 steal;
+	struct kvm_steal_time *src;
+	int version;
+
+	src = &per_cpu(steal_time, cpu);
+	do {
+		version = src->version;
+		rmb();
+		steal = src->steal;
+		rmb();
+	} while ((version & 1) || (version != src->version));
+
+	return steal;
+}
+
+void kvm_disable_steal_time(void)
+{
+	if (!has_steal_clock)
+		return;
+
+	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
+}
+
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
@@ -500,6 +554,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
 
 static void kvm_guest_cpu_offline(void *dummy)
 {
+	kvm_disable_steal_time();
 	kvm_pv_disable_apf(NULL);
 	apf_task_wake_all();
 }
@@ -548,6 +603,11 @@ void __init kvm_guest_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
 		x86_init.irqs.trap_init = kvm_apf_trap_init;
 
+	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+		has_steal_clock = 1;
+		pv_time_ops.steal_clock = kvm_steal_clock;
+	}
+
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 	register_cpu_notifier(&kvm_cpu_notifier);
@@ -555,3 +615,15 @@ void __init kvm_guest_init(void)
 	kvm_guest_cpu_init();
 #endif
 }
+
+static __init int activate_jump_labels(void)
+{
+	if (has_steal_clock) {
+		jump_label_inc(&paravirt_steal_enabled);
+		if (steal_acc)
+			jump_label_inc(&paravirt_steal_rq_enabled);
+	}
+
+	return 0;
+}
+arch_initcall(activate_jump_labels);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 6389a6bca11b..c1a0188e29ae 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -160,6 +160,7 @@ static void __cpuinit kvm_setup_secondary_clock(void)
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
 	native_write_msr(msr_kvm_system_time, 0, 0);
+	kvm_disable_steal_time();
 	native_machine_crash_shutdown(regs);
 }
 #endif
@@ -167,6 +168,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
 static void kvm_shutdown(void)
 {
 	native_write_msr(msr_kvm_system_time, 0, 0);
+	kvm_disable_steal_time();
 	native_machine_shutdown();
 }
 

From 052331bea38dfc176322ec85642eb98d6803a762 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:21:17 +0800
Subject: [PATCH 125/143] KVM: MMU: fix walking shadow page table

Properly check the last mapping, and do not walk to the next level if the last
spte is met.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index da0f3b081076..03323dc705c2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1517,10 +1517,6 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
 	if (iterator->level < PT_PAGE_TABLE_LEVEL)
 		return false;
 
-	if (iterator->level == PT_PAGE_TABLE_LEVEL)
-		if (is_large_pte(*iterator->sptep))
-			return false;
-
 	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
 	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
 	return true;
@@ -1528,6 +1524,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
 
 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
 {
+	if (is_last_spte(*iterator->sptep, iterator->level)) {
+		iterator->level = 0;
+		return;
+	}
+
 	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
 	--iterator->level;
 }

From ffb61bb3bca33ff8e68d11d7cb6b27ac0f74a2c0 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:22:01 +0800
Subject: [PATCH 126/143] KVM: MMU: do not update slot bitmap if spte is
 nonpresent

Set the slot bitmap only if the spte is present.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 03323dc705c2..02c839f40e29 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -743,9 +743,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	struct kvm_mmu_page *sp;
 	unsigned long *rmapp;
 
-	if (!is_rmap_spte(*spte))
-		return 0;
-
 	sp = page_header(__pa(spte));
 	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
@@ -2087,11 +2084,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (!was_rmapped && is_large_pte(*sptep))
 		++vcpu->kvm->stat.lpages;
 
-	page_header_update_slot(vcpu->kvm, sptep, gfn);
-	if (!was_rmapped) {
-		rmap_count = rmap_add(vcpu, sptep, gfn);
-		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
-			rmap_recycle(vcpu, sptep, gfn);
+	if (is_shadow_present_pte(*sptep)) {
+		page_header_update_slot(vcpu->kvm, sptep, gfn);
+		if (!was_rmapped) {
+			rmap_count = rmap_add(vcpu, sptep, gfn);
+			if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+				rmap_recycle(vcpu, sptep, gfn);
+		}
 	}
 	kvm_release_pfn_clean(pfn);
 	if (speculative) {

From af7cc7d1ee422a612f6785e347a893d44cc892ea Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:22:46 +0800
Subject: [PATCH 127/143] KVM: x86: introduce vcpu_mmio_gva_to_gpa to cleanup
 the code

Introduce vcpu_mmio_gva_to_gpa to translate a gva to a gpa; we can use it
to clean up the code shared between read emulation and write emulation.
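
Both emulation paths then share a single calling convention (a sketch of the
intended use, mirroring the hunks below):

	/*
	 *   ret < 0  -> translation failed, propagate the fault
	 *   ret > 0  -> MMIO (including APIC access), take the mmio path
	 *   ret == 0 -> ordinary guest RAM, access it through *gpa
	 */
	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
	if (ret < 0)
		return X86EMUL_PROPAGATE_FAULT;
	if (ret)
		goto mmio;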

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 42 +++++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c96cdc092484..a1dbd0443545 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4010,6 +4010,27 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 
+static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+				gpa_t *gpa, struct x86_exception *exception,
+				bool write)
+{
+	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
+	if (write)
+		access |= PFERR_WRITE_MASK;
+
+	*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+
+	if (*gpa == UNMAPPED_GVA)
+		return -1;
+
+	/* For APIC access vmexit */
+	if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+		return 1;
+
+	return 0;
+}
+
 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
 				  unsigned long addr,
 				  void *val,
@@ -4017,8 +4038,8 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
 				  struct x86_exception *exception)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-	gpa_t                 gpa;
-	int handled;
+	gpa_t gpa;
+	int handled, ret;
 
 	if (vcpu->mmio_read_completed) {
 		memcpy(val, vcpu->mmio_data, bytes);
@@ -4028,13 +4049,12 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
 		return X86EMUL_CONTINUE;
 	}
 
-	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
+	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false);
 
-	if (gpa == UNMAPPED_GVA)
+	if (ret < 0)
 		return X86EMUL_PROPAGATE_FAULT;
 
-	/* For APIC access vmexit */
-	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+	if (ret)
 		goto mmio;
 
 	if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
@@ -4085,16 +4105,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 					   struct x86_exception *exception,
 					   struct kvm_vcpu *vcpu)
 {
-	gpa_t                 gpa;
-	int handled;
+	gpa_t gpa;
+	int handled, ret;
 
-	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
+	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true);
 
-	if (gpa == UNMAPPED_GVA)
+	if (ret < 0)
 		return X86EMUL_PROPAGATE_FAULT;
 
 	/* For APIC access vmexit */
-	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+	if (ret)
 		goto mmio;
 
 	if (emulator_write_phys(vcpu, gpa, val, bytes))

From bebb106a5afa32efdf5332ed4a40bf4d6d06b56e Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:23:20 +0800
Subject: [PATCH 128/143] KVM: MMU: cache mmio info on page fault path

If the page fault is caused by mmio, we can cache the mmio info; later, we do
not need to walk the guest page table and can quickly know it is an mmio fault
while we emulate the mmio instruction.
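
On a repeated fault to the same gva, the cached gfn is used directly and the
guest page table walk is skipped entirely; roughly (sketch, 'user' stands for
a user-mode access flag):

	if (vcpu_match_mmio_gva(vcpu, gva) &&
	    check_write_user_access(vcpu, write, user, vcpu->arch.access)) {
		*gpa = (vcpu->arch.mmio_gfn << PAGE_SHIFT) |
		       (gva & (PAGE_SIZE - 1));
		return 1;	/* handled as MMIO without a walk */
	}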

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  5 +++++
 arch/x86/kvm/mmu.c              | 21 +++++++------------
 arch/x86/kvm/mmu.h              | 23 +++++++++++++++++++++
 arch/x86/kvm/paging_tmpl.h      | 21 ++++++++++++-------
 arch/x86/kvm/x86.c              | 11 ++++++++++
 arch/x86/kvm/x86.h              | 36 +++++++++++++++++++++++++++++++++
 6 files changed, 96 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 59086a77ff13..8da1400ab581 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -424,6 +424,11 @@ struct kvm_vcpu_arch {
 	u64 mcg_ctl;
 	u64 *mce_banks;
 
+	/* Cache MMIO info */
+	u64 mmio_gva;
+	unsigned access;
+	gfn_t mmio_gfn;
+
 	/* used for guest single stepping over the given code position */
 	unsigned long singlestep_rip;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 02c839f40e29..d1986b7dcec7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -217,11 +217,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
-static bool is_write_protection(struct kvm_vcpu *vcpu)
-{
-	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
-}
-
 static int is_cpuid_PSE36(void)
 {
 	return 1;
@@ -243,11 +238,6 @@ static int is_large_pte(u64 pte)
 	return pte & PT_PAGE_SIZE_MASK;
 }
 
-static int is_writable_pte(unsigned long pte)
-{
-	return pte & PT_WRITABLE_MASK;
-}
-
 static int is_dirty_gpte(unsigned long pte)
 {
 	return pte & PT_DIRTY_MASK;
@@ -2247,15 +2237,17 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
 	send_sig_info(SIGBUS, &info, tsk);
 }
 
-static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gva_t gva,
+			       unsigned access, gfn_t gfn, pfn_t pfn)
 {
 	kvm_release_pfn_clean(pfn);
 	if (is_hwpoison_pfn(pfn)) {
-		kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
+		kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
 		return 0;
 	} else if (is_fault_pfn(pfn))
 		return -EFAULT;
 
+	vcpu_cache_mmio_info(vcpu, gva, gfn, access);
 	return 1;
 }
 
@@ -2337,7 +2329,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 
 	/* mmio */
 	if (is_error_pfn(pfn))
-		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+		return kvm_handle_bad_page(vcpu, v, ACC_ALL, gfn, pfn);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2564,6 +2556,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
 
+	vcpu_clear_mmio_info(vcpu, ~0ul);
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -2710,7 +2703,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 
 	/* mmio */
 	if (is_error_pfn(pfn))
-		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+		return kvm_handle_bad_page(vcpu, 0, 0, gfn, pfn);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7086ca85d3e7..05310b105dac 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -76,4 +76,27 @@ static inline int is_present_gpte(unsigned long pte)
 	return pte & PT_PRESENT_MASK;
 }
 
+static inline int is_writable_pte(unsigned long pte)
+{
+	return pte & PT_WRITABLE_MASK;
+}
+
+static inline bool is_write_protection(struct kvm_vcpu *vcpu)
+{
+	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
+}
+
+static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
+					   bool write_fault, bool user_fault,
+					   unsigned long pte)
+{
+	if (unlikely(write_fault && !is_writable_pte(pte)
+	      && (user_fault || is_write_protection(vcpu))))
+		return false;
+
+	if (unlikely(user_fault && !(pte & PT_USER_MASK)))
+		return false;
+
+	return true;
+}
 #endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 1e1c2444cef5..f0fb1a4c522d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -208,11 +208,8 @@ retry_walk:
 			goto error;
 		}
 
-		if (unlikely(write_fault && !is_writable_pte(pte)
-			     && (user_fault || is_write_protection(vcpu))))
-			eperm = true;
-
-		if (unlikely(user_fault && !(pte & PT_USER_MASK)))
+		if (!check_write_user_access(vcpu, write_fault, user_fault,
+					  pte))
 			eperm = true;
 
 #if PTTYPE == 64
@@ -625,8 +622,16 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		return 0;
 
 	/* mmio */
-	if (is_error_pfn(pfn))
-		return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
+	if (is_error_pfn(pfn)) {
+		unsigned access = walker.pte_access;
+		bool dirty = is_dirty_gpte(walker.ptes[walker.level - 1]);
+
+		if (!dirty)
+			access &= ~ACC_WRITE_MASK;
+
+		return kvm_handle_bad_page(vcpu, mmu_is_nested(vcpu) ? 0 :
+					   addr, access, walker.gfn, pfn);
+	}
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -666,6 +671,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 	u64 *sptep;
 	int need_flush = 0;
 
+	vcpu_clear_mmio_info(vcpu, gva);
+
 	spin_lock(&vcpu->kvm->mmu_lock);
 
 	for_each_shadow_entry(vcpu, gva, iterator) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a1dbd0443545..028a0f25e8a0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4016,6 +4016,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 
+	if (vcpu_match_mmio_gva(vcpu, gva) &&
+		  check_write_user_access(vcpu, write, access,
+		  vcpu->arch.access)) {
+		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
+					(gva & (PAGE_SIZE - 1));
+		return 1;
+	}
+
 	if (write)
 		access |= PFERR_WRITE_MASK;
 
@@ -4028,6 +4036,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 	if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 		return 1;
 
+	if (vcpu_match_mmio_gpa(vcpu, *gpa))
+		return 1;
+
 	return 0;
 }
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 256da82856bd..d36fe237c665 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -75,6 +75,42 @@ static inline u32 bit(int bitno)
 	return 1 << (bitno & 31);
 }
 
+static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
+					gva_t gva, gfn_t gfn, unsigned access)
+{
+	vcpu->arch.mmio_gva = gva & PAGE_MASK;
+	vcpu->arch.access = access;
+	vcpu->arch.mmio_gfn = gfn;
+}
+
+/*
+ * Clear the mmio cache info for the given gva,
+ * specially, if gva is ~0ul, we clear all mmio cache info.
+ */
+static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+		return;
+
+	vcpu->arch.mmio_gva = 0;
+}
+
+static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
+{
+	if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+		return true;
+
+	return false;
+}
+
+static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+	if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+		return true;
+
+	return false;
+}
+
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);

From 640d9b0dbe9f744ac8fd517a8f6afe238f8f525b Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:24:39 +0800
Subject: [PATCH 129/143] KVM: MMU: optimize to handle dirty bit

If the dirty bit is not set, we can make the pte access read-only to avoid
handling the dirty bit everywhere.
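
Concretely, write permission is stripped once, when the last-level gpte is
translated into access bits (a sketch of the gpte_access() change below;
'last' marks a last-level pte):

	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
	if (last && !is_dirty_gpte(gpte))
		access &= ~ACC_WRITE_MASK;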

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 13 +++++------
 arch/x86/kvm/paging_tmpl.h | 46 ++++++++++++++++----------------------
 2 files changed, 25 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d1986b7dcec7..98812c25727b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1923,7 +1923,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 
 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		    unsigned pte_access, int user_fault,
-		    int write_fault, int dirty, int level,
+		    int write_fault, int level,
 		    gfn_t gfn, pfn_t pfn, bool speculative,
 		    bool can_unsync, bool host_writable)
 {
@@ -1938,8 +1938,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	spte = PT_PRESENT_MASK;
 	if (!speculative)
 		spte |= shadow_accessed_mask;
-	if (!dirty)
-		pte_access &= ~ACC_WRITE_MASK;
+
 	if (pte_access & ACC_EXEC_MASK)
 		spte |= shadow_x_mask;
 	else
@@ -2023,7 +2022,7 @@ done:
 
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			 unsigned pt_access, unsigned pte_access,
-			 int user_fault, int write_fault, int dirty,
+			 int user_fault, int write_fault,
 			 int *ptwrite, int level, gfn_t gfn,
 			 pfn_t pfn, bool speculative,
 			 bool host_writable)
@@ -2059,7 +2058,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	}
 
 	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
-		      dirty, level, gfn, pfn, speculative, true,
+		      level, gfn, pfn, speculative, true,
 		      host_writable)) {
 		if (write_fault)
 			*ptwrite = 1;
@@ -2129,7 +2128,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 
 	for (i = 0; i < ret; i++, gfn++, start++)
 		mmu_set_spte(vcpu, start, ACC_ALL,
-			     access, 0, 0, 1, NULL,
+			     access, 0, 0, NULL,
 			     sp->role.level, gfn,
 			     page_to_pfn(pages[i]), true, true);
 
@@ -2193,7 +2192,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			unsigned pte_access = ACC_ALL;
 
 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
-				     0, write, 1, &pt_write,
+				     0, write, &pt_write,
 				     level, gfn, pfn, prefault, map_writable);
 			direct_pte_prefetch(vcpu, iterator.sptep);
 			++vcpu->stat.pf_fixed;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index f0fb1a4c522d..c9fe97bd8c82 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -101,11 +101,15 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	return (ret != orig_pte);
 }
 
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
+				   bool last)
 {
 	unsigned access;
 
 	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+	if (last && !is_dirty_gpte(gpte))
+		access &= ~ACC_WRITE_MASK;
+
 #if PTTYPE == 64
 	if (vcpu->arch.mmu.nx)
 		access &= ~(gpte >> PT64_NX_SHIFT);
@@ -232,8 +236,6 @@ retry_walk:
 			pte |= PT_ACCESSED_MASK;
 		}
 
-		pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
-
 		walker->ptes[walker->level - 1] = pte;
 
 		if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) {
@@ -268,7 +270,7 @@ retry_walk:
 			break;
 		}
 
-		pt_access = pte_access;
+		pt_access &= FNAME(gpte_access)(vcpu, pte, false);
 		--walker->level;
 	}
 
@@ -293,6 +295,7 @@ retry_walk:
 		walker->ptes[walker->level - 1] = pte;
 	}
 
+	pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
 	walker->pt_access = pt_access;
 	walker->pte_access = pte_access;
 	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
@@ -367,7 +370,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 		return;
 
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
 	pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
 	if (is_error_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
@@ -379,7 +382,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
 	 */
 	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-		     is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
+		     NULL, PT_PAGE_TABLE_LEVEL,
 		     gpte_to_gfn(gpte), pfn, true, true);
 }
 
@@ -430,7 +433,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 		unsigned pte_access;
 		gfn_t gfn;
 		pfn_t pfn;
-		bool dirty;
 
 		if (spte == sptep)
 			continue;
@@ -443,18 +445,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
 			continue;
 
-		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
+								  true);
 		gfn = gpte_to_gfn(gpte);
-		dirty = is_dirty_gpte(gpte);
 		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
-				      (pte_access & ACC_WRITE_MASK) && dirty);
+				      pte_access & ACC_WRITE_MASK);
 		if (is_error_pfn(pfn)) {
 			kvm_release_pfn_clean(pfn);
 			break;
 		}
 
 		mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-			     dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+			     NULL, PT_PAGE_TABLE_LEVEL, gfn,
 			     pfn, true, true);
 	}
 }
@@ -470,7 +472,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 {
 	unsigned access = gw->pt_access;
 	struct kvm_mmu_page *sp = NULL;
-	bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
 	int top_level;
 	unsigned direct_access;
 	struct kvm_shadow_walk_iterator it;
@@ -479,8 +480,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		return NULL;
 
 	direct_access = gw->pt_access & gw->pte_access;
-	if (!dirty)
-		direct_access &= ~ACC_WRITE_MASK;
 
 	top_level = vcpu->arch.mmu.root_level;
 	if (top_level == PT32E_ROOT_LEVEL)
@@ -539,7 +538,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	}
 
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
-		     user_fault, write_fault, dirty, ptwrite, it.level,
+		     user_fault, write_fault, ptwrite, it.level,
 		     gw->gfn, pfn, prefault, map_writable);
 	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
@@ -622,17 +621,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		return 0;
 
 	/* mmio */
-	if (is_error_pfn(pfn)) {
-		unsigned access = walker.pte_access;
-		bool dirty = is_dirty_gpte(walker.ptes[walker.level - 1]);
-
-		if (!dirty)
-			access &= ~ACC_WRITE_MASK;
-
+	if (is_error_pfn(pfn))
 		return kvm_handle_bad_page(vcpu, mmu_is_nested(vcpu) ? 0 :
-					   addr, access, walker.gfn, pfn);
-	}
-
+				      addr, walker.pte_access, walker.gfn, pfn);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
@@ -849,11 +840,12 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		}
 
 		nr_present++;
-		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
+								  true);
 		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
 
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
-			 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
+			 PT_PAGE_TABLE_LEVEL, gfn,
 			 spte_to_pfn(sp->spt[i]), true, false,
 			 host_writable);
 	}

From b36c7a7c10bf845b623ce187501b561d1d843a18 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:25:19 +0800
Subject: [PATCH 130/143] KVM: MMU: cleanup for FNAME(fetch)

gw->pte_access is the final access permission, since it is unified with
gw->pt_access when we walked the guest page table:

FNAME(walk_addr_generic):
	pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index c9fe97bd8c82..5c2aa406408f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -479,7 +479,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (!is_present_gpte(gw->ptes[gw->level - 1]))
 		return NULL;
 
-	direct_access = gw->pt_access & gw->pte_access;
+	direct_access = gw->pte_access;
 
 	top_level = vcpu->arch.mmu.root_level;
 	if (top_level == PT32E_ROOT_LEVEL)
@@ -537,7 +537,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		link_shadow_page(it.sptep, sp);
 	}
 
-	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
+	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
 		     user_fault, write_fault, ptwrite, it.level,
 		     gw->gfn, pfn, prefault, map_writable);
 	FNAME(pte_prefetch)(vcpu, gw, it.sptep);

From b90a0e6c81d7b1fef0b7dea007015e1a56ab14c7 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:25:56 +0800
Subject: [PATCH 131/143] KVM: MMU: rename 'pt_write' to 'emulate'

If 'pt_write' is true, we need to emulate the fault. In a later patch, we will
need to emulate the fault even though it is not a pt_write event, so rename
it to better fit the meaning.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 10 +++++-----
 arch/x86/kvm/paging_tmpl.h | 16 ++++++++--------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 98812c25727b..a62ba462972e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2023,7 +2023,7 @@ done:
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault,
-			 int *ptwrite, int level, gfn_t gfn,
+			 int *emulate, int level, gfn_t gfn,
 			 pfn_t pfn, bool speculative,
 			 bool host_writable)
 {
@@ -2061,7 +2061,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		      level, gfn, pfn, speculative, true,
 		      host_writable)) {
 		if (write_fault)
-			*ptwrite = 1;
+			*emulate = 1;
 		kvm_mmu_flush_tlb(vcpu);
 	}
 
@@ -2184,7 +2184,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 {
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
-	int pt_write = 0;
+	int emulate = 0;
 	gfn_t pseudo_gfn;
 
 	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -2192,7 +2192,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			unsigned pte_access = ACC_ALL;
 
 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
-				     0, write, &pt_write,
+				     0, write, &emulate,
 				     level, gfn, pfn, prefault, map_writable);
 			direct_pte_prefetch(vcpu, iterator.sptep);
 			++vcpu->stat.pf_fixed;
@@ -2220,7 +2220,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 				   | shadow_accessed_mask);
 		}
 	}
-	return pt_write;
+	return emulate;
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 5c2aa406408f..fa3b54bbce6b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -467,7 +467,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *gw,
 			 int user_fault, int write_fault, int hlevel,
-			 int *ptwrite, pfn_t pfn, bool map_writable,
+			 int *emulate, pfn_t pfn, bool map_writable,
 			 bool prefault)
 {
 	unsigned access = gw->pt_access;
@@ -538,7 +538,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	}
 
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
-		     user_fault, write_fault, ptwrite, it.level,
+		     user_fault, write_fault, emulate, it.level,
 		     gw->gfn, pfn, prefault, map_writable);
 	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
@@ -572,7 +572,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int user_fault = error_code & PFERR_USER_MASK;
 	struct guest_walker walker;
 	u64 *sptep;
-	int write_pt = 0;
+	int emulate = 0;
 	int r;
 	pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
@@ -633,19 +633,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	if (!force_pt_level)
 		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-			     level, &write_pt, pfn, map_writable, prefault);
+			     level, &emulate, pfn, map_writable, prefault);
 	(void)sptep;
-	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
-		 sptep, *sptep, write_pt);
+	pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
+		 sptep, *sptep, emulate);
 
-	if (!write_pt)
+	if (!emulate)
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 
 	++vcpu->stat.pf_fixed;
 	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
-	return write_pt;
+	return emulate;
 
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);

From aa6bd187af013319c3f18be7b0970d9a3d1be696 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:26:40 +0800
Subject: [PATCH 132/143] KVM: MMU: count used shadow pages on preparing path

Move the counting of used shadow pages from the committing path to the
preparing path to reduce TLB flushes on some paths.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)
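
For readers outside the KVM tree, a minimal user-space sketch of the prepare/commit
batching this patch relies on: accounting happens while the invalid list is built
("prepare"), and the single "commit" step, standing in here for the TLB flush, runs
once per batch rather than once per page. All names below are made up for
illustration; this is not KVM code.

/* Illustrative sketch only: batching the "commit" (a stand-in for the
 * TLB flush) outside the zap loop, as kvm_mmu_change_mmu_pages now does.
 * Names are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

struct fake_page {
	int id;
	struct fake_page *next;
};

static int used_pages;          /* accounted on the prepare path */
static int commit_count;        /* how many "TLB flushes" we issued */

static void prepare_zap(struct fake_page *p, struct fake_page **invalid_list)
{
	p->next = *invalid_list;
	*invalid_list = p;
	used_pages--;           /* account while "under the lock" */
}

static void commit_zap(struct fake_page **invalid_list)
{
	if (!*invalid_list)
		return;
	commit_count++;         /* one flush covers the whole batch */
	while (*invalid_list) {
		struct fake_page *p = *invalid_list;
		*invalid_list = p->next;
		free(p);
	}
}

int main(void)
{
	struct fake_page *invalid_list = NULL;
	used_pages = 8;

	for (int i = 0; i < 8; i++) {
		struct fake_page *p = malloc(sizeof(*p));
		p->id = i;
		prepare_zap(p, &invalid_list);   /* inside the loop */
	}
	commit_zap(&invalid_list);               /* once, after the loop */

	printf("used_pages=%d commits=%d\n", used_pages, commit_count);
	return 0;
}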

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a62ba462972e..91d30695677b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1039,7 +1039,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
 	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
 }
 
-static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
 	ASSERT(is_empty_shadow_page(sp->spt));
 	hlist_del(&sp->hash_link);
@@ -1048,7 +1048,6 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	if (!sp->role.direct)
 		free_page((unsigned long)sp->gfns);
 	kmem_cache_free(mmu_page_header_cache, sp);
-	kvm_mod_used_mmu_pages(kvm, -1);
 }
 
 static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -1655,6 +1654,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 		/* Count self */
 		ret++;
 		list_move(&sp->link, invalid_list);
+		kvm_mod_used_mmu_pages(kvm, -1);
 	} else {
 		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		kvm_reload_remote_mmus(kvm);
@@ -1678,7 +1678,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 	do {
 		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
 		WARN_ON(!sp->role.invalid || sp->root_count);
-		kvm_mmu_free_page(kvm, sp);
+		kvm_mmu_free_page(sp);
 	} while (!list_empty(invalid_list));
 
 }
@@ -1704,8 +1704,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 			page = container_of(kvm->arch.active_mmu_pages.prev,
 					    struct kvm_mmu_page, link);
 			kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
-			kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		}
+		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
 	}
 
@@ -3302,9 +3302,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 				  struct kvm_mmu_page, link);
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
-		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
+	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,

From bd4c86eaa6ff10abc4e00d0f45d2a28b10b09df4 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:27:14 +0800
Subject: [PATCH 133/143] KVM: MMU: split kvm_mmu_free_page

Split kvm_mmu_free_page into kvm_mmu_isolate_page and
kvm_mmu_free_page.

The former removes the page from the shadow page cache under the mmu lock;
the latter frees the page table outside the mmu lock.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)
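
A minimal sketch, assuming only pthreads, of the split this patch makes: unlink the
object from shared structures while holding the lock ("isolate"), then free the
memory after the lock is dropped. Names are illustrative, not KVM's.

/* Illustrative sketch only: cheap unlinking under the lock, expensive
 * freeing outside it, mirroring kvm_mmu_isolate_page/kvm_mmu_free_page.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	struct obj *next;
	char *payload;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *cache;

static void isolate_obj(struct obj *o)          /* caller holds lock */
{
	struct obj **pp = &cache;

	while (*pp && *pp != o)
		pp = &(*pp)->next;
	if (*pp)
		*pp = o->next;                  /* no longer findable */
}

static void free_obj(struct obj *o)             /* lock not needed */
{
	free(o->payload);
	free(o);
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	o->payload = malloc(64);
	o->next = cache;
	cache = o;

	pthread_mutex_lock(&lock);
	isolate_obj(o);                 /* cheap work under the lock */
	pthread_mutex_unlock(&lock);

	free_obj(o);                    /* expensive work outside it */
	printf("cache is %s\n", cache ? "non-empty" : "empty");
	return 0;
}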

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 91d30695677b..2f8543c65fa3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1039,14 +1039,28 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
 	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
 }
 
-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+/*
+ * Remove the sp from the shadow page cache; after this call the sp
+ * can no longer be found in the cache, but its shadow page table
+ * is still valid.
+ * Must be called under the protection of the mmu lock.
+ */
+static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
 {
 	ASSERT(is_empty_shadow_page(sp->spt));
 	hlist_del(&sp->hash_link);
-	list_del(&sp->link);
-	free_page((unsigned long)sp->spt);
 	if (!sp->role.direct)
 		free_page((unsigned long)sp->gfns);
+}
+
+/*
+ * Free the shadow page table and the sp; this can be done
+ * outside the protection of the mmu lock.
+ */
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+{
+	list_del(&sp->link);
+	free_page((unsigned long)sp->spt);
 	kmem_cache_free(mmu_page_header_cache, sp);
 }
 
@@ -1678,6 +1692,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 	do {
 		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
 		WARN_ON(!sp->role.invalid || sp->root_count);
+		kvm_mmu_isolate_page(sp);
 		kvm_mmu_free_page(sp);
 	} while (!list_empty(invalid_list));
 

From c37079586f317d7e7f1a70d36f0e5177691c89c2 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:28:04 +0800
Subject: [PATCH 134/143] KVM: MMU: remove bypass_guest_pf

The idea is from Avi:
| Maybe it's time to kill off bypass_guest_pf=1.  It's not as effective as
| it used to be, since unsync pages always use shadow_trap_nonpresent_pte,
| and since we convert between the two nonpresent_ptes during sync and unsync.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/kernel-parameters.txt |  4 --
 arch/x86/include/asm/kvm_host.h     |  3 --
 arch/x86/kvm/mmu.c                  | 83 +++++++++--------------------
 arch/x86/kvm/mmu_audit.c            | 12 -----
 arch/x86/kvm/paging_tmpl.h          | 51 +++---------------
 arch/x86/kvm/vmx.c                  | 11 +---
 arch/x86/kvm/x86.c                  |  1 -
 7 files changed, 33 insertions(+), 132 deletions(-)
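
A small sketch of the resulting simplification: presence is now decided by the
present bit alone instead of comparing against two reserved nonpresent sentinel
values. The mask and sentinel values below are hypothetical, not taken from the
KVM headers.

/* Illustrative sketch only: present-bit check vs. the old two-sentinel
 * scheme. Constants are made up for the demo.
 */
#include <stdint.h>
#include <stdio.h>

#define FAKE_PRESENT_MASK (1ULL << 0)

static int is_present_old(uint64_t pte, uint64_t trap, uint64_t notrap)
{
	return pte != trap && pte != notrap;    /* pre-patch scheme */
}

static int is_present_new(uint64_t pte)
{
	return !!(pte & FAKE_PRESENT_MASK);     /* post-patch scheme */
}

int main(void)
{
	uint64_t zapped = 0;                    /* cleared spte */
	uint64_t mapped = 0x12345000ULL | FAKE_PRESENT_MASK;

	printf("old: zapped=%d mapped=%d\n",
	       is_present_old(zapped, 0, ~0xffeULL),
	       is_present_old(mapped, 0, ~0xffeULL));
	printf("new: zapped=%d mapped=%d\n",
	       is_present_new(zapped), is_present_new(mapped));
	return 0;
}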

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a7225746ed96..1810a6b51bab 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1159,10 +1159,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			for all guests.
 			Default is 1 (enabled) if in 64bit or 32bit-PAE mode
 
-	kvm-intel.bypass_guest_pf=
-			[KVM,Intel] Disables bypassing of guest page faults
-			on Intel chips. Default is 1 (enabled)
-
 	kvm-intel.ept=	[KVM,Intel] Disable extended page tables
 			(virtualized MMU) support on capable Intel chips.
 			Default is 1 (enabled)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8da1400ab581..a198a5b2f04e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -266,8 +266,6 @@ struct kvm_mmu {
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
 			    struct x86_exception *exception);
 	gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
-	void (*prefetch_page)(struct kvm_vcpu *vcpu,
-			      struct kvm_mmu_page *page);
 	int (*sync_page)(struct kvm_vcpu *vcpu,
 			 struct kvm_mmu_page *sp);
 	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
@@ -647,7 +645,6 @@ void kvm_mmu_module_exit(void);
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2f8543c65fa3..5334b4e9ecc7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -186,8 +186,6 @@ static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
 
-static u64 __read_mostly shadow_trap_nonpresent_pte;
-static u64 __read_mostly shadow_notrap_nonpresent_pte;
 static u64 __read_mostly shadow_nx_mask;
 static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
@@ -199,13 +197,6 @@ static inline u64 rsvd_bits(int s, int e)
 	return ((1ULL << (e - s + 1)) - 1) << s;
 }
 
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
-{
-	shadow_trap_nonpresent_pte = trap_pte;
-	shadow_notrap_nonpresent_pte = notrap_pte;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
-
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask)
 {
@@ -229,8 +220,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-	return pte != shadow_trap_nonpresent_pte
-		&& pte != shadow_notrap_nonpresent_pte;
+	return pte & PT_PRESENT_MASK;
 }
 
 static int is_large_pte(u64 pte)
@@ -777,9 +767,9 @@ static int set_spte_track_bits(u64 *sptep, u64 new_spte)
 	return 1;
 }
 
-static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
-	if (set_spte_track_bits(sptep, new_spte))
+	if (set_spte_track_bits(sptep, 0ull))
 		rmap_remove(kvm, sptep);
 }
 
@@ -814,8 +804,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
 			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 			if (is_writable_pte(*spte)) {
-				drop_spte(kvm, spte,
-					  shadow_trap_nonpresent_pte);
+				drop_spte(kvm, spte);
 				--kvm->stat.lpages;
 				spte = NULL;
 				write_protected = 1;
@@ -836,7 +825,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	while ((spte = rmap_next(kvm, rmapp, NULL))) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-		drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+		drop_spte(kvm, spte);
 		need_tlb_flush = 1;
 	}
 	return need_tlb_flush;
@@ -858,7 +847,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 		need_flush = 1;
 		if (pte_write(*ptep)) {
-			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+			drop_spte(kvm, spte);
 			spte = rmap_next(kvm, rmapp, NULL);
 		} else {
 			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -1088,7 +1077,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp,
 			    u64 *parent_pte)
 {
 	mmu_page_remove_parent_pte(sp, parent_pte);
-	__set_spte(parent_pte, shadow_trap_nonpresent_pte);
+	__set_spte(parent_pte, 0ull);
 }
 
 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
@@ -1130,15 +1119,6 @@ static void mark_unsync(u64 *spte)
 	kvm_mmu_mark_parents_unsync(sp);
 }
 
-static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
-				    struct kvm_mmu_page *sp)
-{
-	int i;
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
-		sp->spt[i] = shadow_trap_nonpresent_pte;
-}
-
 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
 			       struct kvm_mmu_page *sp)
 {
@@ -1420,6 +1400,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 	}
 }
 
+static void init_shadow_page_table(struct kvm_mmu_page *sp)
+{
+	int i;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+		sp->spt[i] = 0ull;
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
@@ -1482,10 +1470,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 
 		account_shadowed(vcpu->kvm, gfn);
 	}
-	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
-		vcpu->arch.mmu.prefetch_page(vcpu, sp);
-	else
-		nonpaging_prefetch_page(vcpu, sp);
+	init_shadow_page_table(sp);
 	trace_kvm_mmu_get_page(sp, true);
 	return sp;
 }
@@ -1546,7 +1531,7 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 {
 	if (is_large_pte(*sptep)) {
-		drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+		drop_spte(vcpu->kvm, sptep);
 		kvm_flush_remote_tlbs(vcpu->kvm);
 	}
 }
@@ -1582,13 +1567,13 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 	pte = *spte;
 	if (is_shadow_present_pte(pte)) {
 		if (is_last_spte(pte, sp->role.level))
-			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+			drop_spte(kvm, spte);
 		else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, spte);
 		}
 	}
-	__set_spte(spte, shadow_trap_nonpresent_pte);
+
 	if (is_large_pte(pte))
 		--kvm->stat.lpages;
 }
@@ -1769,20 +1754,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 	__set_bit(slot, sp->slot_bitmap);
 }
 
-static void mmu_convert_notrap(struct kvm_mmu_page *sp)
-{
-	int i;
-	u64 *pt = sp->spt;
-
-	if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
-		return;
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-		if (pt[i] == shadow_notrap_nonpresent_pte)
-			__set_spte(&pt[i], shadow_trap_nonpresent_pte);
-	}
-}
-
 /*
  * The function is based on mtrr_type_lookup() in
  * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1895,7 +1866,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	sp->unsync = 1;
 
 	kvm_mmu_mark_parents_unsync(sp);
-	mmu_convert_notrap(sp);
 }
 
 static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
@@ -1980,7 +1950,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		if (level > PT_PAGE_TABLE_LEVEL &&
 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
 			ret = 1;
-			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+			drop_spte(vcpu->kvm, sptep);
 			goto done;
 		}
 
@@ -2066,7 +2036,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		} else if (pfn != spte_to_pfn(*sptep)) {
 			pgprintk("hfn old %llx new %llx\n",
 				 spte_to_pfn(*sptep), pfn);
-			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+			drop_spte(vcpu->kvm, sptep);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else
 			was_rmapped = 1;
@@ -2162,7 +2132,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
 	spte = sp->spt + i;
 
 	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
-		if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+		if (is_shadow_present_pte(*spte) || spte == sptep) {
 			if (!start)
 				continue;
 			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
@@ -2214,7 +2184,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			break;
 		}
 
-		if (*iterator.sptep == shadow_trap_nonpresent_pte) {
+		if (!is_shadow_present_pte(*iterator.sptep)) {
 			u64 base_addr = iterator.addr;
 
 			base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
@@ -2748,7 +2718,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
 	context->page_fault = nonpaging_page_fault;
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->free = nonpaging_free;
-	context->prefetch_page = nonpaging_prefetch_page;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
 	context->update_pte = nonpaging_update_pte;
@@ -2878,7 +2847,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging64_page_fault;
 	context->gva_to_gpa = paging64_gva_to_gpa;
-	context->prefetch_page = paging64_prefetch_page;
 	context->sync_page = paging64_sync_page;
 	context->invlpg = paging64_invlpg;
 	context->update_pte = paging64_update_pte;
@@ -2907,7 +2875,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 	context->page_fault = paging32_page_fault;
 	context->gva_to_gpa = paging32_gva_to_gpa;
 	context->free = paging_free;
-	context->prefetch_page = paging32_prefetch_page;
 	context->sync_page = paging32_sync_page;
 	context->invlpg = paging32_invlpg;
 	context->update_pte = paging32_update_pte;
@@ -2932,7 +2899,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = tdp_page_fault;
 	context->free = nonpaging_free;
-	context->prefetch_page = nonpaging_prefetch_page;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
 	context->update_pte = nonpaging_update_pte;
@@ -3443,8 +3409,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 				continue;
 
 			if (is_large_pte(pt[i])) {
-				drop_spte(kvm, &pt[i],
-					  shadow_trap_nonpresent_pte);
+				drop_spte(kvm, &pt[i]);
 				--kvm->stat.lpages;
 				continue;
 			}
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 5f6223b8bcf7..2460a265be23 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -99,18 +99,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 				     "level = %d\n", sp, level);
 			return;
 		}
-
-		if (*sptep == shadow_notrap_nonpresent_pte) {
-			audit_printk(vcpu->kvm, "notrap spte in unsync "
-				     "sp: %p\n", sp);
-			return;
-		}
-	}
-
-	if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
-		audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
-			     sp);
-		return;
 	}
 
 	if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index fa3b54bbce6b..a4565df501cd 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -337,16 +337,11 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp, u64 *spte,
 				    pt_element_t gpte)
 {
-	u64 nonpresent = shadow_trap_nonpresent_pte;
-
 	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
 		goto no_present;
 
-	if (!is_present_gpte(gpte)) {
-		if (!sp->unsync)
-			nonpresent = shadow_notrap_nonpresent_pte;
+	if (!is_present_gpte(gpte))
 		goto no_present;
-	}
 
 	if (!(gpte & PT_ACCESSED_MASK))
 		goto no_present;
@@ -354,7 +349,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 	return false;
 
 no_present:
-	drop_spte(vcpu->kvm, spte, nonpresent);
+	drop_spte(vcpu->kvm, spte);
 	return true;
 }
 
@@ -437,7 +432,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 		if (spte == sptep)
 			continue;
 
-		if (*spte != shadow_trap_nonpresent_pte)
+		if (is_shadow_present_pte(*spte))
 			continue;
 
 		gpte = gptep[i];
@@ -687,11 +682,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 			if (is_shadow_present_pte(*sptep)) {
 				if (is_large_pte(*sptep))
 					--vcpu->kvm->stat.lpages;
-				drop_spte(vcpu->kvm, sptep,
-					  shadow_trap_nonpresent_pte);
+				drop_spte(vcpu->kvm, sptep);
 				need_flush = 1;
-			} else
-				__set_spte(sptep, shadow_trap_nonpresent_pte);
+			}
+
 			break;
 		}
 
@@ -751,36 +745,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 	return gpa;
 }
 
-static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
-				 struct kvm_mmu_page *sp)
-{
-	int i, j, offset, r;
-	pt_element_t pt[256 / sizeof(pt_element_t)];
-	gpa_t pte_gpa;
-
-	if (sp->role.direct
-	    || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
-		nonpaging_prefetch_page(vcpu, sp);
-		return;
-	}
-
-	pte_gpa = gfn_to_gpa(sp->gfn);
-	if (PTTYPE == 32) {
-		offset = sp->role.quadrant << PT64_LEVEL_BITS;
-		pte_gpa += offset * sizeof(pt_element_t);
-	}
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
-		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
-		pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
-		for (j = 0; j < ARRAY_SIZE(pt); ++j)
-			if (r || is_present_gpte(pt[j]))
-				sp->spt[i+j] = shadow_trap_nonpresent_pte;
-			else
-				sp->spt[i+j] = shadow_notrap_nonpresent_pte;
-	}
-}
-
 /*
  * Using the cached information from sp->gfns is safe because:
  * - The spte has a reference to the struct page, so the pfn for a given gfn
@@ -833,8 +797,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		}
 
 		if (gfn != sp->gfns[i]) {
-			drop_spte(vcpu->kvm, &sp->spt[i],
-				      shadow_trap_nonpresent_pte);
+			drop_spte(vcpu->kvm, &sp->spt[i]);
 			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f5b49c7fc89d..a644acb6ed80 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -49,9 +49,6 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static int __read_mostly bypass_guest_pf = 1;
-module_param(bypass_guest_pf, bool, S_IRUGO);
-
 static int __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -3632,8 +3629,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		vmcs_write32(PLE_WINDOW, ple_window);
 	}
 
-	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
-	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
 	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
 
 	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
@@ -7103,16 +7100,12 @@ static int __init vmx_init(void)
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 
 	if (enable_ept) {
-		bypass_guest_pf = 0;
 		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
 				VMX_EPT_EXECUTABLE_MASK);
 		kvm_enable_tdp();
 	} else
 		kvm_disable_tdp();
 
-	if (bypass_guest_pf)
-		kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
-
 	return 0;
 
 out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 028a0f25e8a0..64c42d90112b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5091,7 +5091,6 @@ int kvm_arch_init(void *opaque)
 	kvm_init_msr_list();
 
 	kvm_x86_ops = ops;
-	kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
 			PT_DIRTY_MASK, PT64_NX_MASK, 0);
 

From fce92dce79dbf5fff39c7ac2fb149729d79b7a39 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:28:54 +0800
Subject: [PATCH 135/143] KVM: MMU: filter out the mmio pfn from the fault pfn

If the page fault is caused by mmio, the gfn cannot be found in the memslots
and 'bad_pfn' is returned on the gfn_to_hva path, so we can use 'bad_pfn' to
identify an mmio page fault.
To clarify the meaning of an mmio pfn, we also return the fault page instead
of the bad page when the gfn is not allowed to be prefetched.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c       |  4 ++--
 include/linux/kvm_host.h |  5 +++++
 virt/kvm/kvm_main.c      | 16 ++++++++++++++--
 3 files changed, 21 insertions(+), 4 deletions(-)
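
A small sketch of the pfn classification introduced here: a "no slot" pfn (the gfn
has no memslot, hence a likely mmio access) is distinguished from a genuinely
invalid pfn (hwpoison or fault). The sentinel values are invented for the demo and
are not KVM's real bad_pfn/fault_pfn/hwpoison_pfn.

/* Illustrative sketch only: separating "no memslot" (candidate MMIO)
 * pfns from invalid (hwpoison/fault) pfns.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t pfn_t;

static const pfn_t bad_pfn      = (pfn_t)-1;   /* gfn not in any memslot */
static const pfn_t fault_pfn    = (pfn_t)-2;   /* host mapping failed    */
static const pfn_t hwpoison_pfn = (pfn_t)-3;   /* page is hw-poisoned    */

static int is_noslot_pfn(pfn_t pfn)
{
	return pfn == bad_pfn;
}

static int is_invalid_pfn(pfn_t pfn)
{
	return pfn == hwpoison_pfn || pfn == fault_pfn;
}

int main(void)
{
	pfn_t samples[] = { 0x1234, bad_pfn, fault_pfn, hwpoison_pfn };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("pfn %llx: noslot=%d invalid=%d\n",
		       (unsigned long long)samples[i],
		       is_noslot_pfn(samples[i]),
		       is_invalid_pfn(samples[i]));
	return 0;
}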

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5334b4e9ecc7..96a7ed4e6837 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2085,8 +2085,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 
 	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
 	if (!slot) {
-		get_page(bad_page);
-		return page_to_pfn(bad_page);
+		get_page(fault_page);
+		return page_to_pfn(fault_page);
 	}
 
 	hva = gfn_to_hva_memslot(slot, gfn);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c8e023902f79..eabb21a30c34 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -327,12 +327,17 @@ static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
 static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
 
 extern struct page *bad_page;
+extern struct page *fault_page;
+
 extern pfn_t bad_pfn;
+extern pfn_t fault_pfn;
 
 int is_error_page(struct page *page);
 int is_error_pfn(pfn_t pfn);
 int is_hwpoison_pfn(pfn_t pfn);
 int is_fault_pfn(pfn_t pfn);
+int is_noslot_pfn(pfn_t pfn);
+int is_invalid_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
 			  struct kvm_userspace_memory_region *mem,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d5ef9ebcaff7..56f3c704fd74 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -101,8 +101,8 @@ static bool largepages_enabled = true;
 static struct page *hwpoison_page;
 static pfn_t hwpoison_pfn;
 
-static struct page *fault_page;
-static pfn_t fault_pfn;
+struct page *fault_page;
+pfn_t fault_pfn;
 
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
@@ -931,6 +931,18 @@ int is_fault_pfn(pfn_t pfn)
 }
 EXPORT_SYMBOL_GPL(is_fault_pfn);
 
+int is_noslot_pfn(pfn_t pfn)
+{
+	return pfn == bad_pfn;
+}
+EXPORT_SYMBOL_GPL(is_noslot_pfn);
+
+int is_invalid_pfn(pfn_t pfn)
+{
+	return pfn == hwpoison_pfn || pfn == fault_pfn;
+}
+EXPORT_SYMBOL_GPL(is_invalid_pfn);
+
 static inline unsigned long bad_hva(void)
 {
 	return PAGE_OFFSET;

From d7c55201e66e9f702db575c9dfc2d34a7af6cf1f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:29:38 +0800
Subject: [PATCH 136/143] KVM: MMU: abstract some functions to handle fault pfn

Introduce handle_abnormal_pfn to handle a fault pfn on the page fault path,
and mmu_invalid_pfn to handle a fault pfn on the prefetch path.

This is preparatory work for mmio page fault support.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 47 ++++++++++++++++++++++++++++----------
 arch/x86/kvm/paging_tmpl.h | 12 +++++-----
 2 files changed, 41 insertions(+), 18 deletions(-)
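
A compact sketch of the control-flow pattern handle_abnormal_pfn follows: the helper
returns true when it has fully resolved the abnormal case and leaves the caller's
return code in an out parameter; otherwise the caller continues on the normal path.
Names and return codes are illustrative, not KVM's.

/* Illustrative sketch only: a helper that either consumes the abnormal
 * case (returning true and filling *ret_val) or tells the caller to
 * keep going.
 */
#include <stdbool.h>
#include <stdio.h>

enum pfn_kind { PFN_OK, PFN_NOSLOT, PFN_INVALID };

static bool handle_abnormal(enum pfn_kind kind, int *ret_val)
{
	if (kind == PFN_INVALID) {
		*ret_val = -1;          /* report the error */
		return true;
	}
	if (kind == PFN_NOSLOT) {
		*ret_val = 1;           /* let the caller emulate (MMIO) */
		return true;
	}
	return false;                   /* normal pfn: keep going */
}

static int fake_page_fault(enum pfn_kind kind)
{
	int r;

	if (handle_abnormal(kind, &r))
		return r;
	return 0;                       /* fault fixed on the fast path */
}

int main(void)
{
	printf("ok=%d noslot=%d invalid=%d\n",
	       fake_page_fault(PFN_OK),
	       fake_page_fault(PFN_NOSLOT),
	       fake_page_fault(PFN_INVALID));
	return 0;
}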

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 96a7ed4e6837..1d4a2d9cc718 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2221,18 +2221,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
 	send_sig_info(SIGBUS, &info, tsk);
 }
 
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gva_t gva,
-			       unsigned access, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
 {
 	kvm_release_pfn_clean(pfn);
 	if (is_hwpoison_pfn(pfn)) {
 		kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
 		return 0;
-	} else if (is_fault_pfn(pfn))
-		return -EFAULT;
+	}
 
-	vcpu_cache_mmio_info(vcpu, gva, gfn, access);
-	return 1;
+	return -EFAULT;
 }
 
 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
@@ -2277,6 +2274,33 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	}
 }
 
+static bool mmu_invalid_pfn(pfn_t pfn)
+{
+	return unlikely(is_invalid_pfn(pfn) || is_noslot_pfn(pfn));
+}
+
+static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+				pfn_t pfn, unsigned access, int *ret_val)
+{
+	bool ret = true;
+
+	/* The pfn is invalid, report the error! */
+	if (unlikely(is_invalid_pfn(pfn))) {
+		*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
+		goto exit;
+	}
+
+	if (unlikely(is_noslot_pfn(pfn))) {
+		vcpu_cache_mmio_info(vcpu, gva, gfn, access);
+		*ret_val = 1;
+		goto exit;
+	}
+
+	ret = false;
+exit:
+	return ret;
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
@@ -2311,9 +2335,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
 		return 0;
 
-	/* mmio */
-	if (is_error_pfn(pfn))
-		return kvm_handle_bad_page(vcpu, v, ACC_ALL, gfn, pfn);
+	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
+		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2685,9 +2708,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
 		return 0;
 
-	/* mmio */
-	if (is_error_pfn(pfn))
-		return kvm_handle_bad_page(vcpu, 0, 0, gfn, pfn);
+	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
+		return r;
+
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a4565df501cd..67998d3be084 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -367,7 +367,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
 	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
 	pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
-	if (is_error_pfn(pfn)) {
+	if (mmu_invalid_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
 		return;
 	}
@@ -445,7 +445,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 		gfn = gpte_to_gfn(gpte);
 		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
 				      pte_access & ACC_WRITE_MASK);
-		if (is_error_pfn(pfn)) {
+		if (mmu_invalid_pfn(pfn)) {
 			kvm_release_pfn_clean(pfn);
 			break;
 		}
@@ -615,10 +615,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 			 &map_writable))
 		return 0;
 
-	/* mmio */
-	if (is_error_pfn(pfn))
-		return kvm_handle_bad_page(vcpu, mmu_is_nested(vcpu) ? 0 :
-				      addr, walker.pte_access, walker.gfn, pfn);
+	if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
+				walker.gfn, pfn, walker.pte_access, &r))
+		return r;
+
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;

From 1df9f2dc39948c3cb900725b7f0754fb385c8354 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:30:35 +0800
Subject: [PATCH 137/143] KVM: MMU: introduce the rules to modify shadow page
 table

Introduce some interfaces to modify sptes the way the Linux kernel does:
- mmu_spte_clear_track_bits: sets the spte from present to nonpresent, and
  tracks the state bits (accessed/dirty) of the spte
- mmu_spte_clear_no_track: the same as mmu_spte_clear_track_bits except that
  it does not track the state bits
- mmu_spte_set: sets the spte from nonpresent to present
- mmu_spte_update: only updates the state bits

Setting an spte from present to present is no longer allowed; later we can
drop the atomic operation on X86_32 hosts. This is preparatory work for
reading sptes on X86_32 hosts outside the mmu lock.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 103 ++++++++++++++++++++++++++++++---------------
 1 file changed, 69 insertions(+), 34 deletions(-)
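
A user-space sketch of the transition discipline these helpers enforce: set only
goes from nonpresent to present, update keeps the same mapping, clear goes from
present to nonpresent. Masks and helper names are stand-ins, not KVM code.

/* Illustrative sketch only: the spte transition rules introduced here.
 * Masks and helpers are hypothetical.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PRESENT   (1ULL << 0)
#define ACCESSED  (1ULL << 5)

static int is_present(uint64_t spte) { return spte & PRESENT; }

static void spte_set(uint64_t *sptep, uint64_t new_spte)
{
	assert(!is_present(*sptep));    /* only nonpresent -> present */
	*sptep = new_spte;
}

static void spte_update(uint64_t *sptep, uint64_t new_spte)
{
	if (!is_present(*sptep)) {      /* fall back to set */
		spte_set(sptep, new_spte);
		return;
	}
	*sptep = new_spte | (*sptep & ACCESSED); /* keep the state bit */
}

static uint64_t spte_clear_track(uint64_t *sptep)
{
	uint64_t old = *sptep;

	*sptep = 0;                     /* present -> nonpresent */
	if (old & ACCESSED)
		printf("would call kvm_set_pfn_accessed() here\n");
	return old;
}

int main(void)
{
	uint64_t spte = 0;

	spte_set(&spte, 0x1000 | PRESENT);
	spte_update(&spte, 0x1000 | PRESENT | ACCESSED);
	spte_clear_track(&spte);
	printf("final spte = %llx\n", (unsigned long long)spte);
	return 0;
}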

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1d4a2d9cc718..982718fe12a7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -299,12 +299,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
 	return (old_spte & bit_mask) && !(new_spte & bit_mask);
 }
 
-static void update_spte(u64 *sptep, u64 new_spte)
+/* Rules for using mmu_spte_set:
+ * Set the sptep from nonpresent to present.
+ * Note: the sptep being assigned *must* be either not present
+ * or in a state where the hardware will not attempt to update
+ * the spte.
+ */
+static void mmu_spte_set(u64 *sptep, u64 new_spte)
+{
+	WARN_ON(is_shadow_present_pte(*sptep));
+	__set_spte(sptep, new_spte);
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits; this implies the mapped pfn is not changed.
+ */
+static void mmu_spte_update(u64 *sptep, u64 new_spte)
 {
 	u64 mask, old_spte = *sptep;
 
 	WARN_ON(!is_rmap_spte(new_spte));
 
+	if (!is_shadow_present_pte(old_spte))
+		return mmu_spte_set(sptep, new_spte);
+
 	new_spte |= old_spte & shadow_dirty_mask;
 
 	mask = shadow_accessed_mask;
@@ -325,6 +343,42 @@ static void update_spte(u64 *sptep, u64 new_spte)
 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 }
 
+/*
+ * Rules for using mmu_spte_clear_track_bits:
+ * It sets the sptep from present to nonpresent, and tracks the
+ * state bits; it is used to clear a last-level sptep.
+ */
+static int mmu_spte_clear_track_bits(u64 *sptep)
+{
+	pfn_t pfn;
+	u64 old_spte = *sptep;
+
+	if (!spte_has_volatile_bits(old_spte))
+		__set_spte(sptep, 0ull);
+	else
+		old_spte = __xchg_spte(sptep, 0ull);
+
+	if (!is_rmap_spte(old_spte))
+		return 0;
+
+	pfn = spte_to_pfn(old_spte);
+	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+		kvm_set_pfn_accessed(pfn);
+	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
+		kvm_set_pfn_dirty(pfn);
+	return 1;
+}
+
+/*
+ * Rules for using mmu_spte_clear_no_track:
+ * Directly clear the spte without caring about the state bits of the
+ * sptep; it is used to clear an upper-level spte.
+ */
+static void mmu_spte_clear_no_track(u64 *sptep)
+{
+	__set_spte(sptep, 0ull);
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  struct kmem_cache *base_cache, int min)
 {
@@ -746,30 +800,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	pte_list_remove(spte, rmapp);
 }
 
-static int set_spte_track_bits(u64 *sptep, u64 new_spte)
-{
-	pfn_t pfn;
-	u64 old_spte = *sptep;
-
-	if (!spte_has_volatile_bits(old_spte))
-		__set_spte(sptep, new_spte);
-	else
-		old_spte = __xchg_spte(sptep, new_spte);
-
-	if (!is_rmap_spte(old_spte))
-		return 0;
-
-	pfn = spte_to_pfn(old_spte);
-	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
-		kvm_set_pfn_accessed(pfn);
-	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
-		kvm_set_pfn_dirty(pfn);
-	return 1;
-}
-
 static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
-	if (set_spte_track_bits(sptep, 0ull))
+	if (mmu_spte_clear_track_bits(sptep))
 		rmap_remove(kvm, sptep);
 }
 
@@ -787,7 +820,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 		if (is_writable_pte(*spte)) {
-			update_spte(spte, *spte & ~PT_WRITABLE_MASK);
+			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
 			write_protected = 1;
 		}
 		spte = rmap_next(kvm, rmapp, spte);
@@ -856,7 +889,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
 			new_spte &= ~shadow_accessed_mask;
-			set_spte_track_bits(spte, new_spte);
+			mmu_spte_clear_track_bits(spte);
+			mmu_spte_set(spte, new_spte);
 			spte = rmap_next(kvm, rmapp, spte);
 		}
 	}
@@ -1077,7 +1111,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp,
 			    u64 *parent_pte)
 {
 	mmu_page_remove_parent_pte(sp, parent_pte);
-	__set_spte(parent_pte, 0ull);
+	mmu_spte_clear_no_track(parent_pte);
 }
 
 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
@@ -1525,7 +1559,7 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 	spte = __pa(sp->spt)
 		| PT_PRESENT_MASK | PT_ACCESSED_MASK
 		| PT_WRITABLE_MASK | PT_USER_MASK;
-	__set_spte(sptep, spte);
+	mmu_spte_set(sptep, spte);
 }
 
 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
@@ -1992,7 +2026,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-	update_spte(sptep, spte);
+	mmu_spte_update(sptep, spte);
 	/*
 	 * If we overwrite a writable spte with a read-only one we
 	 * should flush remote TLBs. Otherwise rmap_write_protect
@@ -2198,11 +2232,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 				return -ENOMEM;
 			}
 
-			__set_spte(iterator.sptep,
-				   __pa(sp->spt)
-				   | PT_PRESENT_MASK | PT_WRITABLE_MASK
-				   | shadow_user_mask | shadow_x_mask
-				   | shadow_accessed_mask);
+			mmu_spte_set(iterator.sptep,
+				     __pa(sp->spt)
+				     | PT_PRESENT_MASK | PT_WRITABLE_MASK
+				     | shadow_user_mask | shadow_x_mask
+				     | shadow_accessed_mask);
 		}
 	}
 	return emulate;
@@ -3439,7 +3473,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 
 			/* avoid RMW */
 			if (is_writable_pte(pt[i]))
-				update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
+				mmu_spte_update(&pt[i],
+						pt[i] & ~PT_WRITABLE_MASK);
 		}
 	}
 	kvm_flush_remote_tlbs(kvm);

From 603e0651cfc8562b103454d7ded71f3ad1eb3a37 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:31:28 +0800
Subject: [PATCH 138/143] KVM: MMU: do not need atomicly to set/clear spte

Now an spte only transitions from nonpresent to present or from present to
nonpresent, so we can use a trick to set/clear the spte non-atomically, as
the Linux kernel does.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 92 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 74 insertions(+), 18 deletions(-)
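
A minimal sketch of the 32-bit trick: the 64-bit spte is written as two 32-bit
halves with an ordering barrier between them, so a lock-free reader never pairs a
present low half with a stale high half. The demo is single-threaded and uses C11
fences where the real code uses smp_wmb(); it is not KVM code.

/* Illustrative sketch only: ordered split-half writes of a 64-bit
 * "spte" on a 32-bit target.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

union split_spte {
	struct {
		uint32_t low;   /* holds the present bit */
		uint32_t high;  /* holds the upper pfn bits */
	};
	uint64_t spte;
};

static void set_nonpresent_to_present(union split_spte *s, uint64_t v)
{
	union split_spte n = { .spte = v };

	s->high = n.high;                               /* high bits first */
	atomic_thread_fence(memory_order_release);      /* ~ smp_wmb() */
	s->low = n.low;                                 /* then present bit */
}

static void clear_present_to_nonpresent(union split_spte *s)
{
	s->low = 0;                                     /* present bit first */
	atomic_thread_fence(memory_order_release);      /* ~ smp_wmb() */
	s->high = 0;
}

int main(void)
{
	union split_spte s = { .spte = 0 };

	set_nonpresent_to_present(&s, 0x00000001aabbc001ULL);
	printf("set:   %016llx\n", (unsigned long long)s.spte);
	clear_present_to_nonpresent(&s);
	printf("clear: %016llx\n", (unsigned long long)s.spte);
	return 0;
}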

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 982718fe12a7..a22b5fe92167 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -259,26 +259,82 @@ static gfn_t pse36_gfn_delta(u32 gpte)
 	return (gpte & PT32_DIR_PSE36_MASK) << shift;
 }
 
+#ifdef CONFIG_X86_64
 static void __set_spte(u64 *sptep, u64 spte)
 {
-	set_64bit(sptep, spte);
+	*sptep = spte;
 }
 
-static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 {
-#ifdef CONFIG_X86_64
-	return xchg(sptep, new_spte);
-#else
-	u64 old_spte;
-
-	do {
-		old_spte = *sptep;
-	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
-
-	return old_spte;
-#endif
+	*sptep = spte;
 }
 
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+	return xchg(sptep, spte);
+}
+#else
+union split_spte {
+	struct {
+		u32 spte_low;
+		u32 spte_high;
+	};
+	u64 spte;
+};
+
+static void __set_spte(u64 *sptep, u64 spte)
+{
+	union split_spte *ssptep, sspte;
+
+	ssptep = (union split_spte *)sptep;
+	sspte = (union split_spte)spte;
+
+	ssptep->spte_high = sspte.spte_high;
+
+	/*
+	 * If we map the spte from nonpresent to present, we should store
+	 * the high bits first, then set the present bit, so the CPU cannot
+	 * fetch this spte while we are setting it.
+	 */
+	smp_wmb();
+
+	ssptep->spte_low = sspte.spte_low;
+}
+
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+	union split_spte *ssptep, sspte;
+
+	ssptep = (union split_spte *)sptep;
+	sspte = (union split_spte)spte;
+
+	ssptep->spte_low = sspte.spte_low;
+
+	/*
+	 * If we map the spte from present to nonpresent, we should clear
+	 * the present bit first to avoid the vcpu fetching the old high bits.
+	 */
+	smp_wmb();
+
+	ssptep->spte_high = sspte.spte_high;
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+	union split_spte *ssptep, sspte, orig;
+
+	ssptep = (union split_spte *)sptep;
+	sspte = (union split_spte)spte;
+
+	/* xchg acts as a barrier before the setting of the high bits */
+	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
+	orig.spte_high = ssptep->spte_high = sspte.spte_high;
+
+	return orig.spte;
+}
+#endif
+
 static bool spte_has_volatile_bits(u64 spte)
 {
 	if (!shadow_accessed_mask)
@@ -330,9 +386,9 @@ static void mmu_spte_update(u64 *sptep, u64 new_spte)
 		mask |= shadow_dirty_mask;
 
 	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
-		__set_spte(sptep, new_spte);
+		__update_clear_spte_fast(sptep, new_spte);
 	else
-		old_spte = __xchg_spte(sptep, new_spte);
+		old_spte = __update_clear_spte_slow(sptep, new_spte);
 
 	if (!shadow_accessed_mask)
 		return;
@@ -354,9 +410,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	u64 old_spte = *sptep;
 
 	if (!spte_has_volatile_bits(old_spte))
-		__set_spte(sptep, 0ull);
+		__update_clear_spte_fast(sptep, 0ull);
 	else
-		old_spte = __xchg_spte(sptep, 0ull);
+		old_spte = __update_clear_spte_slow(sptep, 0ull);
 
 	if (!is_rmap_spte(old_spte))
 		return 0;
@@ -376,7 +432,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
  */
 static void mmu_spte_clear_no_track(u64 *sptep)
 {
-	__set_spte(sptep, 0ull);
+	__update_clear_spte_fast(sptep, 0ull);
 }
 
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,

From c2a2ac2b563ccc3a69540965b5a994c19e3817d7 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:32:13 +0800
Subject: [PATCH 139/143] KVM: MMU: lockless walking shadow page table

Use RCU to protect shadow page tables that are about to be freed, so we can
safely walk them without the mmu lock; this should be fast and is needed by
mmio page fault handling.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   8 ++
 arch/x86/kvm/mmu.c              | 132 ++++++++++++++++++++++++++++++--
 2 files changed, 132 insertions(+), 8 deletions(-)
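
A minimal sketch of the retry loop behind __get_spte_lockless, modeled as a
sequence-count style read: the writer bumps a counter whenever it clears an entry,
and the reader retries if the counter or the low half changed under it. This is a
single-threaded demo with C11 fences standing in for smp_rmb(); not KVM code.

/* Illustrative sketch only: a torn-read detector for a 64-bit entry
 * read as two 32-bit halves.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct entry {
	uint32_t low;
	uint32_t high;
	int clear_count;        /* bumped on every clear */
};

static uint64_t read_lockless(struct entry *e)
{
	uint32_t low, high;
	int count;

retry:
	count = e->clear_count;
	atomic_thread_fence(memory_order_acquire);      /* ~ smp_rmb() */
	low = e->low;
	atomic_thread_fence(memory_order_acquire);
	high = e->high;
	atomic_thread_fence(memory_order_acquire);
	if (low != e->low || count != e->clear_count)
		goto retry;                             /* torn read */
	return ((uint64_t)high << 32) | low;
}

int main(void)
{
	struct entry e = { .low = 0xc001, .high = 0x1, .clear_count = 0 };

	printf("lockless read: %016llx\n",
	       (unsigned long long)read_lockless(&e));
	return 0;
}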

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a198a5b2f04e..dd51c83aa5de 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -233,6 +233,12 @@ struct kvm_mmu_page {
 	unsigned int unsync_children;
 	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
 	DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+#ifdef CONFIG_X86_32
+	int clear_spte_count;
+#endif
+
+	struct rcu_head rcu;
 };
 
 struct kvm_pv_mmu_op_buffer {
@@ -486,6 +492,8 @@ struct kvm_arch {
 	u64 hv_guest_os_id;
 	u64 hv_hypercall;
 
+	atomic_t reader_counter;
+
 	#ifdef CONFIG_KVM_MMU_AUDIT
 	int audit_point;
 	#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a22b5fe92167..374530a478a4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -182,6 +182,12 @@ struct kvm_shadow_walk_iterator {
 	     shadow_walk_okay(&(_walker));			\
 	     shadow_walk_next(&(_walker)))
 
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
+	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
+	     shadow_walk_okay(&(_walker)) &&				\
+		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
+	     __shadow_walk_next(&(_walker), spte))
+
 static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
@@ -274,6 +280,11 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 {
 	return xchg(sptep, spte);
 }
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+	return ACCESS_ONCE(*sptep);
+}
 #else
 union split_spte {
 	struct {
@@ -283,6 +294,18 @@ union split_spte {
 	u64 spte;
 };
 
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+	struct kvm_mmu_page *sp =  page_header(__pa(sptep));
+
+	if (is_shadow_present_pte(spte))
+		return;
+
+	/* Ensure the spte is completely set before we increase the count */
+	smp_wmb();
+	sp->clear_spte_count++;
+}
+
 static void __set_spte(u64 *sptep, u64 spte)
 {
 	union split_spte *ssptep, sspte;
@@ -318,6 +341,7 @@ static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 	smp_wmb();
 
 	ssptep->spte_high = sspte.spte_high;
+	count_spte_clear(sptep, spte);
 }
 
 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
@@ -330,9 +354,40 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 	/* xchg acts as a barrier before the setting of the high bits */
 	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
 	orig.spte_high = ssptep->spte_high = sspte.spte_high;
+	count_spte_clear(sptep, spte);
 
 	return orig.spte;
 }
+
+/*
+ * The idea of using a lightweight way to read the spte on an x86_32
+ * host comes from gup_get_pte (arch/x86/mm/gup.c).
+ * The difference is that we cannot catch the spte TLB flush if we
+ * leave guest mode, so we emulate it by increasing clear_spte_count
+ * when an spte is cleared.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+	struct kvm_mmu_page *sp =  page_header(__pa(sptep));
+	union split_spte spte, *orig = (union split_spte *)sptep;
+	int count;
+
+retry:
+	count = sp->clear_spte_count;
+	smp_rmb();
+
+	spte.spte_low = orig->spte_low;
+	smp_rmb();
+
+	spte.spte_high = orig->spte_high;
+	smp_rmb();
+
+	if (unlikely(spte.spte_low != orig->spte_low ||
+	      count != sp->clear_spte_count))
+		goto retry;
+
+	return spte.spte;
+}
 #endif
 
 static bool spte_has_volatile_bits(u64 spte)
@@ -435,6 +490,28 @@ static void mmu_spte_clear_no_track(u64 *sptep)
 	__update_clear_spte_fast(sptep, 0ull);
 }
 
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+	return __get_spte_lockless(sptep);
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+	rcu_read_lock();
+	atomic_inc(&vcpu->kvm->arch.reader_counter);
+
+	/* Increase the counter before walking shadow page table */
+	smp_mb__after_atomic_inc();
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+	/* Decrease the counter after walking shadow page table finished */
+	smp_mb__before_atomic_dec();
+	atomic_dec(&vcpu->kvm->arch.reader_counter);
+	rcu_read_unlock();
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  struct kmem_cache *base_cache, int min)
 {
@@ -1597,17 +1674,23 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
 	return true;
 }
 
-static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+			       u64 spte)
 {
-	if (is_last_spte(*iterator->sptep, iterator->level)) {
+	if (is_last_spte(spte, iterator->level)) {
 		iterator->level = 0;
 		return;
 	}
 
-	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
+	iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
 	--iterator->level;
 }
 
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+	return __shadow_walk_next(iterator, *iterator->sptep);
+}
+
 static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 {
 	u64 spte;
@@ -1754,6 +1837,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	return ret;
 }
 
+static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
+{
+	struct kvm_mmu_page *sp;
+
+	list_for_each_entry(sp, invalid_list, link)
+		kvm_mmu_isolate_page(sp);
+}
+
+static void free_pages_rcu(struct rcu_head *head)
+{
+	struct kvm_mmu_page *next, *sp;
+
+	sp = container_of(head, struct kvm_mmu_page, rcu);
+	while (sp) {
+		if (!list_empty(&sp->link))
+			next = list_first_entry(&sp->link,
+				      struct kvm_mmu_page, link);
+		else
+			next = NULL;
+		kvm_mmu_free_page(sp);
+		sp = next;
+	}
+}
+
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list)
 {
@@ -1764,6 +1871,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
 	kvm_flush_remote_tlbs(kvm);
 
+	if (atomic_read(&kvm->arch.reader_counter)) {
+		kvm_mmu_isolate_pages(invalid_list);
+		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+		list_del_init(invalid_list);
+		call_rcu(&sp->rcu, free_pages_rcu);
+		return;
+	}
+
 	do {
 		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
 		WARN_ON(!sp->role.invalid || sp->root_count);
@@ -3784,16 +3899,17 @@ out:
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 {
 	struct kvm_shadow_walk_iterator iterator;
+	u64 spte;
 	int nr_sptes = 0;
 
-	spin_lock(&vcpu->kvm->mmu_lock);
-	for_each_shadow_entry(vcpu, addr, iterator) {
-		sptes[iterator.level-1] = *iterator.sptep;
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+		sptes[iterator.level-1] = spte;
 		nr_sptes++;
-		if (!is_shadow_present_pte(*iterator.sptep))
+		if (!is_shadow_present_pte(spte))
 			break;
 	}
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	walk_shadow_page_lockless_end(vcpu);
 
 	return nr_sptes;
 }

From dd3bfd59dbc69fd970394ab354cfca5f959d5755 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:32:54 +0800
Subject: [PATCH 140/143] KVM: MMU: reorganize struct kvm_shadow_walk_iterator

Reorganize it for better use of the CPU cache.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 374530a478a4..4b1aa6772147 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -172,8 +172,8 @@ struct pte_list_desc {
 struct kvm_shadow_walk_iterator {
 	u64 addr;
 	hpa_t shadow_addr;
-	int level;
 	u64 *sptep;
+	int level;
 	unsigned index;
 };
 

From ce88decffd17bf9f373cc233c961ad2054965667 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:33:44 +0800
Subject: [PATCH 141/143] KVM: MMU: mmio page fault support

The idea is from Avi:

| We could cache the result of a miss in an spte by using a reserved bit, and
| checking the page fault error code (or seeing if we get an ept violation or
| ept misconfiguration), so if we get repeated mmio on a page, we don't need to
| search the slot list/tree.
| (https://lkml.org/lkml/2011/2/22/221)

When the page fault is caused by mmio, we cache the info in the shadow page
table and also set the reserved bits in the shadow page table, so if the mmio
access occurs again we can quickly identify it and emulate it directly.

Searching for an mmio gfn in the memslots is heavy since we need to walk all
memslots; this feature reduces that cost, and also avoids walking the guest
page table for the soft mmu.

[jan: fix operator precedence issue]

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 192 +++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/mmu.h         |   2 +
 arch/x86/kvm/paging_tmpl.h |  21 ++--
 arch/x86/kvm/vmx.c         |  22 ++++-
 arch/x86/kvm/x86.c         |  25 +++++
 virt/kvm/kvm_main.c        |   7 ++
 6 files changed, 255 insertions(+), 14 deletions(-)
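
A minimal sketch of the spte encoding idea: an mmio gfn and its access bits are
cached inside a "not really present" spte tagged with a recognizable reserved-bit
mask. The mask, shift and access bits below are invented for the demo and are not
KVM's real values.

/* Illustrative sketch only: encoding and decoding an mmio spte.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_MASK       (~((1ULL << PAGE_SHIFT) - 1))
#define FAKE_MMIO_MASK  (3ULL << 52)    /* pretend-reserved bits */
#define ACC_WRITE       (1ULL << 1)
#define ACC_USER        (1ULL << 2)

static uint64_t mark_mmio_spte(uint64_t gfn, uint64_t access)
{
	access &= ACC_WRITE | ACC_USER;
	return FAKE_MMIO_MASK | access | (gfn << PAGE_SHIFT);
}

static int is_mmio_spte(uint64_t spte)
{
	return (spte & FAKE_MMIO_MASK) == FAKE_MMIO_MASK;
}

static uint64_t get_mmio_gfn(uint64_t spte)
{
	return (spte & ~FAKE_MMIO_MASK) >> PAGE_SHIFT;
}

static uint64_t get_mmio_access(uint64_t spte)
{
	return (spte & ~FAKE_MMIO_MASK) & ~PAGE_MASK;
}

int main(void)
{
	uint64_t spte = mark_mmio_spte(0xfee00, ACC_WRITE | ACC_USER);

	printf("spte=%016llx mmio=%d gfn=%llx access=%llx\n",
	       (unsigned long long)spte, is_mmio_spte(spte),
	       (unsigned long long)get_mmio_gfn(spte),
	       (unsigned long long)get_mmio_access(spte));
	return 0;
}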

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4b1aa6772147..4e22df6f93ec 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,6 +197,47 @@ static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mmio_mask;
+
+static void mmu_spte_set(u64 *sptep, u64 spte);
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
+{
+	shadow_mmio_mask = mmio_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+
+static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+{
+	access &= ACC_WRITE_MASK | ACC_USER_MASK;
+
+	mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+}
+
+static bool is_mmio_spte(u64 spte)
+{
+	return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+	return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+	return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+}
+
+static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+{
+	if (unlikely(is_noslot_pfn(pfn))) {
+		mark_mmio_spte(sptep, gfn, access);
+		return true;
+	}
+
+	return false;
+}
 
 static inline u64 rsvd_bits(int s, int e)
 {
@@ -226,7 +267,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-	return pte & PT_PRESENT_MASK;
+	return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
 }
 
 static int is_large_pte(u64 pte)
@@ -285,6 +326,12 @@ static u64 __get_spte_lockless(u64 *sptep)
 {
 	return ACCESS_ONCE(*sptep);
 }
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+	/* It is valid if the spte is zapped. */
+	return spte == 0ull;
+}
 #else
 union split_spte {
 	struct {
@@ -388,6 +435,23 @@ retry:
 
 	return spte.spte;
 }
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+	union split_spte sspte = (union split_spte)spte;
+	u32 high_mmio_mask = shadow_mmio_mask >> 32;
+
+	/* It is valid if the spte is zapped. */
+	if (spte == 0ull)
+		return true;
+
+	/* It is valid if the spte is being zapped. */
+	if (sspte.spte_low == 0ull &&
+	    (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
+		return true;
+
+	return false;
+}
 #endif
 
 static bool spte_has_volatile_bits(u64 spte)
@@ -1745,7 +1809,8 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, spte);
 		}
-	}
+	} else if (is_mmio_spte(pte))
+		mmu_spte_clear_no_track(spte);
 
 	if (is_large_pte(pte))
 		--kvm->stat.lpages;
@@ -2120,6 +2185,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	u64 spte, entry = *sptep;
 	int ret = 0;
 
+	if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+		return 0;
+
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
 	 * whether the guest actually used the pte (in order to detect
@@ -2255,6 +2323,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		kvm_mmu_flush_tlb(vcpu);
 	}
 
+	if (unlikely(is_mmio_spte(*sptep) && emulate))
+		*emulate = 1;
+
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
 	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
 		 is_large_pte(*sptep)? "2MB" : "4kB",
@@ -2481,7 +2552,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 
 static bool mmu_invalid_pfn(pfn_t pfn)
 {
-	return unlikely(is_invalid_pfn(pfn) || is_noslot_pfn(pfn));
+	return unlikely(is_invalid_pfn(pfn));
 }
 
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
@@ -2495,11 +2566,8 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 		goto exit;
 	}
 
-	if (unlikely(is_noslot_pfn(pfn))) {
+	if (unlikely(is_noslot_pfn(pfn)))
 		vcpu_cache_mmio_info(vcpu, gva, gfn, access);
-		*ret_val = 1;
-		goto exit;
-	}
 
 	ret = false;
 exit:
@@ -2813,6 +2881,92 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
 	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
 }
 
+static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+	if (direct)
+		return vcpu_match_mmio_gpa(vcpu, addr);
+
+	return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+
+/*
+ * On direct hosts, the last spte only allows two states
+ * for an mmio page fault:
+ *   - It is the mmio spte
+ *   - It is zapped or it is being zapped.
+ *
+ * This function completely checks the spte when the last spte
+ * is not the mmio spte.
+ */
+static bool check_direct_spte_mmio_pf(u64 spte)
+{
+	return __check_direct_spte_mmio_pf(spte);
+}
+
+static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
+{
+	struct kvm_shadow_walk_iterator iterator;
+	u64 spte = 0ull;
+
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
+		if (!is_shadow_present_pte(spte))
+			break;
+	walk_shadow_page_lockless_end(vcpu);
+
+	return spte;
+}
+
+/*
+ * If it is a real mmio page fault, return 1 and emulate the instruction
+ * directly; return 0 to let the CPU fault again on the address; -1 is
+ * returned if a bug is detected.
+ */
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+	u64 spte;
+
+	if (quickly_check_mmio_pf(vcpu, addr, direct))
+		return 1;
+
+	spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
+
+	if (is_mmio_spte(spte)) {
+		gfn_t gfn = get_mmio_spte_gfn(spte);
+		unsigned access = get_mmio_spte_access(spte);
+
+		if (direct)
+			addr = 0;
+		vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+		return 1;
+	}
+
+	/*
+	 * It's ok if the gva is remapped by other cpus on shadow guest,
+	 * it's a BUG if the gfn is not a mmio page.
+	 */
+	if (direct && !check_direct_spte_mmio_pf(spte))
+		return -1;
+
+	/*
+	 * If the page table is zapped by other cpus, let CPU fault again on
+	 * the address.
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
+				  u32 error_code, bool direct)
+{
+	int ret;
+
+	ret = handle_mmio_page_fault_common(vcpu, addr, direct);
+	WARN_ON(ret < 0);
+	return ret;
+}
+
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 				u32 error_code, bool prefault)
 {
@@ -2820,6 +2974,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	int r;
 
 	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
+
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return handle_mmio_page_fault(vcpu, gva, error_code, true);
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
@@ -2896,6 +3054,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
@@ -2993,6 +3154,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
 	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
 }
 
+static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
+			   int *nr_present)
+{
+	if (unlikely(is_mmio_spte(*sptep))) {
+		if (gfn != get_mmio_spte_gfn(*sptep)) {
+			mmu_spte_clear_no_track(sptep);
+			return true;
+		}
+
+		(*nr_present)++;
+		mark_mmio_spte(sptep, gfn, access);
+		return true;
+	}
+
+	return false;
+}
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
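
For illustration only (not part of the patch): the encoding that mark_mmio_spte(),
get_mmio_spte_gfn() and get_mmio_spte_access() rely on is simply
"mask | access | gfn << PAGE_SHIFT".  The stand-alone sketch below reconstructs
that packing with made-up helper names, and uses only the high "magic" byte as
the mask so the low access bits decode cleanly; the real masks additionally set
low hardware bits (0x6 for EPT, reserved/present bits otherwise) so that a guest
access actually faults.

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT	12

/* Only the magic byte of the real masks, to keep the packing obvious. */
static const uint64_t mmio_mask = 0xffull << 49;

/* Pack gfn and access bits into a special "mmio" spte. */
static uint64_t make_mmio_spte(uint64_t gfn, unsigned access)
{
	return mmio_mask | access | gfn << PAGE_SHIFT;
}

static int spte_is_mmio(uint64_t spte)
{
	return (spte & mmio_mask) == mmio_mask;
}

/* Undo the packing above. */
static uint64_t mmio_spte_gfn(uint64_t spte)
{
	return (spte & ~mmio_mask) >> PAGE_SHIFT;
}

static unsigned mmio_spte_access(uint64_t spte)
{
	return (spte & ~mmio_mask) & ((1u << PAGE_SHIFT) - 1);
}

int main(void)
{
	uint64_t spte = make_mmio_spte(0x12345, 0x6);	/* arbitrary example values */

	assert(spte_is_mmio(spte));
	assert(mmio_spte_gfn(spte) == 0x12345);
	assert(mmio_spte_access(spte) == 0x6);
	return 0;
}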
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 05310b105dac..e374db9af021 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,6 +49,8 @@
 #define PFERR_FETCH_MASK (1U << 4)
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 67998d3be084..507e2b844cfa 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -577,6 +577,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return handle_mmio_page_fault(vcpu, addr, error_code,
+					      mmu_is_nested(vcpu));
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
@@ -684,7 +688,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 					--vcpu->kvm->stat.lpages;
 				drop_spte(vcpu->kvm, sptep);
 				need_flush = 1;
-			}
+			} else if (is_mmio_spte(*sptep))
+				mmu_spte_clear_no_track(sptep);
 
 			break;
 		}
@@ -780,7 +785,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		gpa_t pte_gpa;
 		gfn_t gfn;
 
-		if (!is_shadow_present_pte(sp->spt[i]))
+		if (!sp->spt[i])
 			continue;
 
 		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
@@ -789,13 +794,18 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 					  sizeof(pt_element_t)))
 			return -EINVAL;
 
-		gfn = gpte_to_gfn(gpte);
-
 		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
 			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
 
+		gfn = gpte_to_gfn(gpte);
+		pte_access = sp->role.access;
+		pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
+
+		if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
+			continue;
+
 		if (gfn != sp->gfns[i]) {
 			drop_spte(vcpu->kvm, &sp->spt[i]);
 			vcpu->kvm->tlbs_dirty++;
@@ -803,8 +813,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		}
 
 		nr_present++;
-		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
-								  true);
+
 		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
 
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a644acb6ed80..e65a158dee64 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3594,6 +3594,17 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 	return exec_control;
 }
 
+static void ept_set_mmio_spte_mask(void)
+{
+	/*
+	 * EPT Misconfigurations can be generated if the value of bits 2:0
+	 * of an EPT paging-structure entry is 110b (write/execute).
+	 * Also, the magic bits (0xffull << 49) are set to quickly identify
+	 * mmio sptes.
+	 */
+	kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
+}
+
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -4671,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
 	u64 sptes[4];
-	int nr_sptes, i;
+	int nr_sptes, i, ret;
 	gpa_t gpa;
 
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 
+	ret = handle_mmio_page_fault_common(vcpu, gpa, true);
+	if (likely(ret == 1))
+		return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
+					      EMULATE_DONE;
+	if (unlikely(!ret))
+		return 1;
+
+	/* It is a real ept misconfig. */
 	printk(KERN_ERR "EPT: Misconfiguration.\n");
 	printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
 
@@ -7102,6 +7121,7 @@ static int __init vmx_init(void)
 	if (enable_ept) {
 		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
 				VMX_EPT_EXECUTABLE_MASK);
+		ept_set_mmio_spte_mask();
 		kvm_enable_tdp();
 	} else
 		kvm_disable_tdp();
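
For illustration only (not part of the patch): the mask installed by
ept_set_mmio_spte_mask() works out to a single constant, checked below.
Bits 2:0 = 110b make the entry write/execute but not readable, which EPT
reports as a misconfiguration, and the byte at bits 56:49 is the "magic"
pattern KVM uses to recognize its own mmio sptes.

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* Worked value of the mask passed to kvm_mmu_set_mmio_spte_mask() above. */
	uint64_t ept_mmio_mask = 0xffull << 49 | 0x6ull;

	assert(ept_mmio_mask == 0x01fe000000000006ull);
	return 0;
}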
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 64c42d90112b..2c9661f230a9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5062,6 +5062,30 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
 
+static void kvm_set_mmio_spte_mask(void)
+{
+	u64 mask;
+	int maxphyaddr = boot_cpu_data.x86_phys_bits;
+
+	/*
+	 * Set the reserved bits and the present bit of a paging-structure
+	 * entry to generate a page fault with PFEC.RSVD = 1.
+	 */
+	mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
+	mask |= 1ull;
+
+#ifdef CONFIG_X86_64
+	/*
+	 * If no reserved bits are available, clear the present bit to
+	 * disable mmio page faults.
+	 */
+	if (maxphyaddr == 52)
+		mask &= ~1ull;
+#endif
+
+	kvm_mmu_set_mmio_spte_mask(mask);
+}
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
@@ -5088,6 +5112,7 @@ int kvm_arch_init(void *opaque)
 	if (r)
 		goto out;
 
+	kvm_set_mmio_spte_mask();
 	kvm_init_msr_list();
 
 	kvm_x86_ops = ops;
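
For illustration only (not part of the patch): the shift arithmetic in
kvm_set_mmio_spte_mask() is easier to read with a concrete CPU plugged in.
maxphyaddr = 40 below is just an example value; the point is that bits
62:maxphyaddr (which include the reserved physical-address bits) plus the
present bit end up set, so touching such an spte raises a page fault with
the RSVD error-code bit.

#include <assert.h>
#include <stdint.h>

int main(void)
{
	int maxphyaddr = 40;	/* example value */
	uint64_t mask;

	/* Same expression as kvm_set_mmio_spte_mask(): bits 62:maxphyaddr ... */
	mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
	/* ... plus the present bit. */
	mask |= 1ull;

	assert(mask == 0x7fffff0000000001ull);
	return 0;
}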
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 56f3c704fd74..aefdda390f5e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -831,6 +831,13 @@ skip_lpage:
 
 	kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
 
+	/*
+	 * If a new memory slot is created, we need to clear all
+	 * mmio sptes.
+	 */
+	if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
+		kvm_arch_flush_shadow(kvm);
+
 	kvm_free_physmem_slot(&old, &new);
 	kfree(old_memslots);
 

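For illustration only (not part of the patch): the fast paths above lean on a
small per-vcpu cache of the last mmio translation -- vcpu_cache_mmio_info()
fills it, vcpu_match_mmio_gva()/vcpu_match_mmio_gpa() query it -- so that
quickly_check_mmio_pf() and vcpu_mmio_gva_to_gpa() can skip the shadow-page
walk on repeated accesses to the same page.  The sketch below reconstructs
that idea with made-up names and a plain struct; the real cache lives in the
vcpu (the mmio_gfn and access fields are visible in the x86.c hunk above) and
its exact invalidation rules are not shown.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((uint64_t)(1 << PAGE_SHIFT) - 1))

/* One-entry cache, reconstructed from how the callers above use it. */
struct mmio_cache {
	uint64_t gva;
	uint64_t gfn;
	unsigned access;
};

static void cache_mmio_info(struct mmio_cache *c, uint64_t gva,
			    uint64_t gfn, unsigned access)
{
	c->gva = gva & PAGE_MASK;
	c->gfn = gfn;
	c->access = access;
}

static bool match_mmio_gva(struct mmio_cache *c, uint64_t gva)
{
	return c->gva && c->gva == (gva & PAGE_MASK);
}

static bool match_mmio_gpa(struct mmio_cache *c, uint64_t gpa)
{
	return c->gfn && c->gfn == gpa >> PAGE_SHIFT;
}

int main(void)
{
	struct mmio_cache c = { 0 };

	cache_mmio_info(&c, 0xffffc90000001234ull, 0xfee00ull, 0x3);
	assert(match_mmio_gva(&c, 0xffffc90000001ff8ull));	/* same page */
	assert(match_mmio_gpa(&c, 0xfee00020ull));		/* same gfn */
	assert(!match_mmio_gva(&c, 0xffffc90000002000ull));	/* different page */
	return 0;
}
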
From 4f0226482d20f104e943ee9e6f1218b573953f63 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 12 Jul 2011 03:34:24 +0800
Subject: [PATCH 142/143] KVM: MMU: trace mmio page fault

Add tracepoints to trace mmio page faults

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c      |  5 +++++
 arch/x86/kvm/mmutrace.h | 48 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/trace.h    | 23 ++++++++++++++++++++
 arch/x86/kvm/x86.c      |  5 ++++-
 4 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4e22df6f93ec..9335e1bf72ad 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -211,6 +211,7 @@ static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
 {
 	access &= ACC_WRITE_MASK | ACC_USER_MASK;
 
+	trace_mark_mmio_spte(sptep, gfn, access);
 	mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
 }
 
@@ -1940,6 +1941,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 		kvm_mmu_isolate_pages(invalid_list);
 		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
 		list_del_init(invalid_list);
+
+		trace_kvm_mmu_delay_free_pages(sp);
 		call_rcu(&sp->rcu, free_pages_rcu);
 		return;
 	}
@@ -2938,6 +2941,8 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 
 		if (direct)
 			addr = 0;
+
+		trace_handle_mmio_page_fault(addr, gfn, access);
 		vcpu_cache_mmio_info(vcpu, addr, gfn, access);
 		return 1;
 	}
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b60b4fdb3eda..eed67f34146d 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -196,6 +196,54 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
 	TP_ARGS(sp)
 );
 
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
+	TP_PROTO(struct kvm_mmu_page *sp),
+
+	TP_ARGS(sp)
+);
+
+TRACE_EVENT(
+	mark_mmio_spte,
+	TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
+	TP_ARGS(sptep, gfn, access),
+
+	TP_STRUCT__entry(
+		__field(void *, sptep)
+		__field(gfn_t, gfn)
+		__field(unsigned, access)
+	),
+
+	TP_fast_assign(
+		__entry->sptep = sptep;
+		__entry->gfn = gfn;
+		__entry->access = access;
+	),
+
+	TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn,
+		  __entry->access)
+);
+
+TRACE_EVENT(
+	handle_mmio_page_fault,
+	TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
+	TP_ARGS(addr, gfn, access),
+
+	TP_STRUCT__entry(
+		__field(u64, addr)
+		__field(gfn_t, gfn)
+		__field(unsigned, access)
+	),
+
+	TP_fast_assign(
+		__entry->addr = addr;
+		__entry->gfn = gfn;
+		__entry->access = access;
+	),
+
+	TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
+		  __entry->access)
+);
+
 TRACE_EVENT(
 	kvm_mmu_audit,
 	TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 624f8cb46a6b..3ff898c104f7 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -698,6 +698,29 @@ TRACE_EVENT(kvm_emulate_insn,
 #define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
 #define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
 
+TRACE_EVENT(
+	vcpu_match_mmio,
+	TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
+	TP_ARGS(gva, gpa, write, gpa_match),
+
+	TP_STRUCT__entry(
+		__field(gva_t, gva)
+		__field(gpa_t, gpa)
+		__field(bool, write)
+		__field(bool, gpa_match)
+		),
+
+	TP_fast_assign(
+		__entry->gva = gva;
+		__entry->gpa = gpa;
+		__entry->write = write;
+		__entry->gpa_match = gpa_match
+		),
+
+	TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
+		  __entry->write ? "Write" : "Read",
+		  __entry->gpa_match ? "GPA" : "GVA")
+);
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2c9661f230a9..84a28ea45fa4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4021,6 +4021,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 		  vcpu->arch.access)) {
 		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
 					(gva & (PAGE_SIZE - 1));
+		trace_vcpu_match_mmio(gva, *gpa, write, false);
 		return 1;
 	}
 
@@ -4036,8 +4037,10 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 	if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 		return 1;
 
-	if (vcpu_match_mmio_gpa(vcpu, *gpa))
+	if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
+		trace_vcpu_match_mmio(gva, *gpa, write, true);
 		return 1;
+	}
 
 	return 0;
 }
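
For illustration only (not part of the patch): the payload each new tracepoint
emits is fixed by its TP_printk() string.  The snippet below simply formats
sample lines with made-up values so the formats are easy to picture; real
ftrace output additionally carries the usual task/CPU/timestamp prefix.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Made-up sample values; only the format strings mirror the TP_printk()s above. */
	uint64_t spte = 0;
	void *sptep = &spte;
	unsigned long long gfn = 0xfee00, addr = 0xfee00030, gpa = 0xfee00030;
	unsigned access = 0x6;

	printf("mark_mmio_spte: sptep:%p gfn %llx access %x\n", sptep, gfn, access);
	printf("handle_mmio_page_fault: addr:%llx gfn %llx access %x\n",
	       addr, gfn, access);
	printf("vcpu_match_mmio: gva %#lx gpa %#llx %s %s\n",
	       (unsigned long)0xffffc90000000030ull, gpa, "Write", "GPA");
	return 0;
}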

From 3f68b0318bbbd61bf08478ab99a149f0d9e5156e Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Thu, 14 Jul 2011 13:27:03 -0600
Subject: [PATCH 143/143] KVM: IOMMU: Disable device assignment without
 interrupt remapping

IOMMU interrupt remapping support provides a further layer of
isolation for device assignment by preventing a malicious guest's
arbitrary DMA writes to the interrupt block from reaching the
host.  By default, we should require that the platform provides
interrupt remapping support, with an opt-in mechanism for the
existing behavior.

Both AMD IOMMU and Intel VT-d2 hardware support interrupt
remapping; however, we currently only have software support on
the Intel side.  Users wishing to re-enable device assignment
when interrupt remapping is not supported on the platform can
use the "allow_unsafe_assigned_interrupts=1" module option.

[avi: break long lines]

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/iommu.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 62a9caf0563c..78c80f67f535 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -30,6 +30,12 @@
 #include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 
+static int allow_unsafe_assigned_interrupts;
+module_param_named(allow_unsafe_assigned_interrupts,
+		   allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(allow_unsafe_assigned_interrupts,
+ "Enable device assignment on platforms without interrupt remapping support.");
+
 static int kvm_iommu_unmap_memslots(struct kvm *kvm);
 static void kvm_iommu_put_pages(struct kvm *kvm,
 				gfn_t base_gfn, unsigned long npages);
@@ -231,6 +237,18 @@ int kvm_iommu_map_guest(struct kvm *kvm)
 	if (!kvm->arch.iommu_domain)
 		return -ENOMEM;
 
+	if (!allow_unsafe_assigned_interrupts &&
+	    !iommu_domain_has_cap(kvm->arch.iommu_domain,
+				  IOMMU_CAP_INTR_REMAP)) {
+		printk(KERN_WARNING "%s: No interrupt remapping support,"
+		       " disallowing device assignment."
+		       " Re-enble with \"allow_unsafe_assigned_interrupts=1\""
+		       " module option.\n", __func__);
+		iommu_domain_free(kvm->arch.iommu_domain);
+		kvm->arch.iommu_domain = NULL;
+		return -EPERM;
+	}
+
 	r = kvm_iommu_map_memslots(kvm);
 	if (r)
 		goto out_unmap;