2005-09-26 10:04:21 +04:00
/*
* This program is used to generate definitions needed by
* assembly language modules.
*
* We use the technique used in the OSF Mach kernel code:
* generate asm statements containing #defines,
* compile this file to assembler, and then extract the
* #defines from the assembly-language output.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
# include <linux/signal.h>
# include <linux/sched.h>
# include <linux/kernel.h>
# include <linux/errno.h>
# include <linux/string.h>
# include <linux/types.h>
# include <linux/mman.h>
# include <linux/mm.h>
2007-05-03 16:31:38 +04:00
# include <linux/suspend.h>
2008-02-05 08:16:48 +03:00
# include <linux/hrtimer.h>
2005-09-28 18:35:31 +04:00
# ifdef CONFIG_PPC64
2005-09-26 10:04:21 +04:00
# include <linux/time.h>
# include <linux/hardirq.h>
2005-09-28 18:35:31 +04:00
# endif
2008-04-29 12:04:08 +04:00
# include <linux/kbuild.h>
2005-09-28 18:35:31 +04:00
2005-09-26 10:04:21 +04:00
# include <asm/io.h>
# include <asm/page.h>
# include <asm/pgtable.h>
# include <asm/processor.h>
# include <asm/cputable.h>
# include <asm/thread_info.h>
2005-10-26 11:05:24 +04:00
# include <asm/rtas.h>
2005-11-11 13:15:21 +03:00
# include <asm/vdso_datapage.h>
2005-09-26 10:04:21 +04:00
# ifdef CONFIG_PPC64
# include <asm/paca.h>
# include <asm/lppaca.h>
# include <asm/cache.h>
# include <asm/compat.h>
2006-08-09 11:00:30 +04:00
# include <asm/mmu.h>
2006-09-13 22:32:39 +04:00
# include <asm/hvcall.h>
KVM: PPC: Implement H_CEDE hcall for book3s_hv in real-mode code
With a KVM guest operating in SMT4 mode (i.e. 4 hardware threads per
core), whenever a CPU goes idle, we have to pull all the other
hardware threads in the core out of the guest, because the H_CEDE
hcall is handled in the kernel. This is inefficient.
This adds code to book3s_hv_rmhandlers.S to handle the H_CEDE hcall
in real mode. When a guest vcpu does an H_CEDE hcall, we now only
exit to the kernel if all the other vcpus in the same core are also
idle. Otherwise we mark this vcpu as napping, save state that could
be lost in nap mode (mainly GPRs and FPRs), and execute the nap
instruction. When the thread wakes up, because of a decrementer or
external interrupt, we come back in at kvm_start_guest (from the
system reset interrupt vector), find the `napping' flag set in the
paca, and go to the resume path.
This has some other ramifications. First, when starting a core, we
now start all the threads, both those that are immediately runnable and
those that are idle. This is so that we don't have to pull all the
threads out of the guest when an idle thread gets a decrementer interrupt
and wants to start running. In fact the idle threads will all start
with the H_CEDE hcall returning; being idle they will just do another
H_CEDE immediately and go to nap mode.
This required some changes to kvmppc_run_core() and kvmppc_run_vcpu().
These functions have been restructured to make them simpler and clearer.
We introduce a level of indirection in the wait queue that gets woken
when external and decrementer interrupts get generated for a vcpu, so
that we can have the 4 vcpus in a vcore using the same wait queue.
We need this because the 4 vcpus are being handled by one thread.
Secondly, when we need to exit from the guest to the kernel, we now
have to generate an IPI for any napping threads, because an HDEC
interrupt doesn't wake up a napping thread.
Thirdly, we now need to be able to handle virtual external interrupts
and decrementer interrupts becoming pending while a thread is napping,
and deliver those interrupts to the guest when the thread wakes.
This is done in kvmppc_cede_reentry, just before fast_guest_return.
Finally, since we are not using the generic kvm_vcpu_block for book3s_hv,
and hence not calling kvm_arch_vcpu_runnable, we can remove the #ifdef
from kvm_arch_vcpu_runnable.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-07-23 11:42:46 +04:00
# include <asm/xics.h>
2005-09-26 10:04:21 +04:00
# endif
2011-09-19 21:45:04 +04:00
# ifdef CONFIG_PPC_POWERNV
# include <asm/opal.h>
# endif
2010-08-30 14:01:56 +04:00
# if defined(CONFIG_KVM) || defined(CONFIG_KVM_GUEST)
2009-01-04 01:23:08 +03:00
# include <linux/kvm_host.h>
2010-04-16 02:11:44 +04:00
# endif
2010-08-30 14:01:56 +04:00
# if defined(CONFIG_KVM) && defined(CONFIG_PPC_BOOK3S)
# include <asm/kvm_book3s.h>
2008-11-05 18:36:18 +03:00
# endif
2005-09-26 10:04:21 +04:00
2009-07-28 05:59:34 +04:00
# ifdef CONFIG_PPC32
2008-04-30 14:23:21 +04:00
# if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
# include "head_booke.h"
# endif
2009-07-28 05:59:34 +04:00
# endif
2008-04-30 14:23:21 +04:00
2009-10-17 03:48:40 +04:00
# if defined(CONFIG_PPC_FSL_BOOK3E)
2008-12-09 06:34:55 +03:00
# include "../mm/mmu_decl.h"
# endif
2005-09-26 10:04:21 +04:00
int main ( void )
{
2005-09-28 18:35:31 +04:00
DEFINE ( THREAD , offsetof ( struct task_struct , thread ) ) ;
DEFINE ( MM , offsetof ( struct task_struct , mm ) ) ;
2008-12-18 22:13:24 +03:00
DEFINE ( MMCONTEXTID , offsetof ( struct mm_struct , context . id ) ) ;
2005-09-26 10:04:21 +04:00
# ifdef CONFIG_PPC64
2005-09-28 18:35:31 +04:00
DEFINE ( AUDITCONTEXT , offsetof ( struct task_struct , audit_context ) ) ;
powerpc: Allow perf_counters to access user memory at interrupt time
This provides a mechanism to allow the perf_counters code to access
user memory in a PMU interrupt routine. Such an access can cause
various kinds of interrupt: SLB miss, MMU hash table miss, segment
table miss, or TLB miss, depending on the processor. This commit
only deals with 64-bit classic/server processors, which use an MMU
hash table. 32-bit processors are already able to access user memory
at interrupt time. Since we don't soft-disable on 32-bit, we avoid
the possibility of reentering hash_page or the TLB miss handlers,
since they run with interrupts disabled.
On 64-bit processors, an SLB miss interrupt on a user address will
update the slb_cache and slb_cache_ptr fields in the paca. This is
OK except in the case where a PMU interrupt occurs in switch_slb,
which also accesses those fields. To prevent this, we hard-disable
interrupts in switch_slb. Interrupts are already soft-disabled at
this point, and will get hard-enabled when they get soft-enabled
later.
This also reworks slb_flush_and_rebolt: to avoid hard-disabling twice,
and to make sure that it clears the slb_cache_ptr when called from
other callers than switch_slb, the existing routine is renamed to
__slb_flush_and_rebolt, which is called by switch_slb and the new
version of slb_flush_and_rebolt.
Similarly, switch_stab (used on POWER3 and RS64 processors) gets a
hard_irq_disable() to protect the per-cpu variables used there and
in ste_allocate.
If an MMU hashtable miss interrupt occurs, normally we would call
hash_page to look up the Linux PTE for the address and create a HPTE.
However, hash_page is fairly complex and takes some locks, so to
avoid the possibility of deadlock, we check the preemption count
to see if we are in a (pseudo-)NMI handler, and if so, we don't call
hash_page but instead treat it like a bad access that will get
reported up through the exception table mechanism. An interrupt
whose handler runs even though the interrupt occurred when
soft-disabled (such as the PMU interrupt) is considered a pseudo-NMI
handler, which should use nmi_enter()/nmi_exit() rather than
irq_enter()/irq_exit().
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
2009-08-17 09:17:54 +04:00
DEFINE ( SIGSEGV , SIGSEGV ) ;
DEFINE ( NMI_MASK , NMI_MASK ) ;
2011-03-02 18:18:48 +03:00
DEFINE ( THREAD_DSCR , offsetof ( struct thread_struct , dscr ) ) ;
2012-09-03 20:51:10 +04:00
DEFINE ( THREAD_DSCR_INHERIT , offsetof ( struct thread_struct , dscr_inherit ) ) ;
2012-12-07 01:49:56 +04:00
DEFINE ( TASKTHREADPPR , offsetof ( struct task_struct , thread . ppr ) ) ;
2005-09-28 18:35:31 +04:00
# else
2007-05-09 13:35:17 +04:00
DEFINE ( THREAD_INFO , offsetof ( struct task_struct , stack ) ) ;
2013-09-24 09:17:21 +04:00
DEFINE ( THREAD_INFO_GAP , _ALIGN_UP ( sizeof ( struct thread_info ) , 16 ) ) ;
DEFINE ( KSP_LIMIT , offsetof ( struct thread_struct , ksp_limit ) ) ;
2005-09-28 18:35:31 +04:00
# endif /* CONFIG_PPC64 */
2005-09-26 10:04:21 +04:00
DEFINE ( KSP , offsetof ( struct thread_struct , ksp ) ) ;
DEFINE ( PT_REGS , offsetof ( struct thread_struct , regs ) ) ;
2011-04-23 01:48:27 +04:00
# ifdef CONFIG_BOOKE
DEFINE ( THREAD_NORMSAVES , offsetof ( struct thread_struct , normsave [ 0 ] ) ) ;
# endif
2005-09-26 10:04:21 +04:00
DEFINE ( THREAD_FPEXC_MODE , offsetof ( struct thread_struct , fpexc_mode ) ) ;
2013-09-10 14:20:42 +04:00
DEFINE ( THREAD_FPSTATE , offsetof ( struct thread_struct , fp_state ) ) ;
2013-09-10 14:21:10 +04:00
DEFINE ( THREAD_FPSAVEAREA , offsetof ( struct thread_struct , fp_save_area ) ) ;
2013-09-10 14:20:42 +04:00
DEFINE ( FPSTATE_FPSCR , offsetof ( struct thread_fp_state , fpscr ) ) ;
2005-09-26 10:04:21 +04:00
# ifdef CONFIG_ALTIVEC
2013-09-10 14:20:42 +04:00
DEFINE ( THREAD_VRSTATE , offsetof ( struct thread_struct , vr_state ) ) ;
2013-09-10 14:21:10 +04:00
DEFINE ( THREAD_VRSAVEAREA , offsetof ( struct thread_struct , vr_save_area ) ) ;
2005-09-26 10:04:21 +04:00
DEFINE ( THREAD_VRSAVE , offsetof ( struct thread_struct , vrsave ) ) ;
DEFINE ( THREAD_USED_VR , offsetof ( struct thread_struct , used_vr ) ) ;
2013-09-10 14:20:42 +04:00
DEFINE ( VRSTATE_VSCR , offsetof ( struct thread_vr_state , vscr ) ) ;
2005-09-26 10:04:21 +04:00
# endif /* CONFIG_ALTIVEC */
2008-06-25 08:07:18 +04:00
# ifdef CONFIG_VSX
DEFINE ( THREAD_USED_VSR , offsetof ( struct thread_struct , used_vsr ) ) ;
# endif /* CONFIG_VSX */
2005-09-28 18:35:31 +04:00
# ifdef CONFIG_PPC64
DEFINE ( KSP_VSID , offsetof ( struct thread_struct , ksp_vsid ) ) ;
# else /* CONFIG_PPC64 */
DEFINE ( PGDIR , offsetof ( struct thread_struct , pgdir ) ) ;
2005-09-26 10:04:21 +04:00
# ifdef CONFIG_SPE
DEFINE ( THREAD_EVR0 , offsetof ( struct thread_struct , evr [ 0 ] ) ) ;
DEFINE ( THREAD_ACC , offsetof ( struct thread_struct , acc ) ) ;
DEFINE ( THREAD_SPEFSCR , offsetof ( struct thread_struct , spefscr ) ) ;
DEFINE ( THREAD_USED_SPE , offsetof ( struct thread_struct , used_spe ) ) ;
# endif /* CONFIG_SPE */
2005-09-28 18:35:31 +04:00
# endif /* CONFIG_PPC64 */
2013-05-22 08:20:59 +04:00
# if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
2013-07-04 10:15:46 +04:00
DEFINE ( THREAD_DBCR0 , offsetof ( struct thread_struct , debug . dbcr0 ) ) ;
2013-05-22 08:20:59 +04:00
# endif
2010-04-16 02:11:51 +04:00
# ifdef CONFIG_KVM_BOOK3S_32_HANDLER
DEFINE ( THREAD_KVM_SVCPU , offsetof ( struct thread_struct , kvm_shadow_vcpu ) ) ;
# endif
2013-01-16 02:20:42 +04:00
# if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
2011-12-20 19:34:43 +04:00
DEFINE ( THREAD_KVM_VCPU , offsetof ( struct thread_struct , kvm_vcpu ) ) ;
# endif
2005-09-28 18:35:31 +04:00
2013-02-07 19:46:58 +04:00
# ifdef CONFIG_PPC_BOOK3S_64
DEFINE ( THREAD_TAR , offsetof ( struct thread_struct , tar ) ) ;
2013-05-01 00:17:04 +04:00
DEFINE ( THREAD_BESCR , offsetof ( struct thread_struct , bescr ) ) ;
DEFINE ( THREAD_EBBHR , offsetof ( struct thread_struct , ebbhr ) ) ;
DEFINE ( THREAD_EBBRR , offsetof ( struct thread_struct , ebbrr ) ) ;
2013-05-21 20:31:12 +04:00
DEFINE ( THREAD_SIAR , offsetof ( struct thread_struct , siar ) ) ;
DEFINE ( THREAD_SDAR , offsetof ( struct thread_struct , sdar ) ) ;
DEFINE ( THREAD_SIER , offsetof ( struct thread_struct , sier ) ) ;
DEFINE ( THREAD_MMCR0 , offsetof ( struct thread_struct , mmcr0 ) ) ;
DEFINE ( THREAD_MMCR2 , offsetof ( struct thread_struct , mmcr2 ) ) ;
2013-02-07 19:46:58 +04:00
# endif
2013-02-13 20:21:32 +04:00
# ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2013-02-13 20:21:34 +04:00
DEFINE ( PACATMSCRATCH , offsetof ( struct paca_struct , tm_scratch ) ) ;
2013-02-13 20:21:32 +04:00
DEFINE ( THREAD_TM_TFHAR , offsetof ( struct thread_struct , tm_tfhar ) ) ;
DEFINE ( THREAD_TM_TEXASR , offsetof ( struct thread_struct , tm_texasr ) ) ;
DEFINE ( THREAD_TM_TFIAR , offsetof ( struct thread_struct , tm_tfiar ) ) ;
2013-08-09 11:29:31 +04:00
DEFINE ( THREAD_TM_TAR , offsetof ( struct thread_struct , tm_tar ) ) ;
DEFINE ( THREAD_TM_PPR , offsetof ( struct thread_struct , tm_ppr ) ) ;
DEFINE ( THREAD_TM_DSCR , offsetof ( struct thread_struct , tm_dscr ) ) ;
2013-02-13 20:21:32 +04:00
DEFINE ( PT_CKPT_REGS , offsetof ( struct thread_struct , ckpt_regs ) ) ;
2013-09-10 14:20:42 +04:00
DEFINE ( THREAD_TRANSACT_VRSTATE , offsetof ( struct thread_struct ,
transact_vr ) ) ;
2013-02-13 20:21:32 +04:00
DEFINE ( THREAD_TRANSACT_VRSAVE , offsetof ( struct thread_struct ,
transact_vrsave ) ) ;
2013-09-10 14:20:42 +04:00
DEFINE ( THREAD_TRANSACT_FPSTATE , offsetof ( struct thread_struct ,
transact_fp ) ) ;
2013-02-13 20:21:32 +04:00
/* Local pt_regs on stack for Transactional Memory funcs. */
DEFINE ( TM_FRAME_SIZE , STACK_FRAME_OVERHEAD +
sizeof ( struct pt_regs ) + 16 ) ;
# endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
2013-02-07 19:46:58 +04:00
2005-09-28 18:35:31 +04:00
DEFINE ( TI_FLAGS , offsetof ( struct thread_info , flags ) ) ;
2006-04-18 15:49:11 +04:00
DEFINE ( TI_LOCAL_FLAGS , offsetof ( struct thread_info , local_flags ) ) ;
2005-09-28 18:35:31 +04:00
DEFINE ( TI_PREEMPT , offsetof ( struct thread_info , preempt_count ) ) ;
DEFINE ( TI_TASK , offsetof ( struct thread_info , task ) ) ;
DEFINE ( TI_CPU , offsetof ( struct thread_info , cpu ) ) ;
# ifdef CONFIG_PPC64
DEFINE ( DCACHEL1LINESIZE , offsetof ( struct ppc64_caches , dline_size ) ) ;
DEFINE ( DCACHEL1LOGLINESIZE , offsetof ( struct ppc64_caches , log_dline_size ) ) ;
DEFINE ( DCACHEL1LINESPERPAGE , offsetof ( struct ppc64_caches , dlines_per_page ) ) ;
DEFINE ( ICACHEL1LINESIZE , offsetof ( struct ppc64_caches , iline_size ) ) ;
DEFINE ( ICACHEL1LOGLINESIZE , offsetof ( struct ppc64_caches , log_iline_size ) ) ;
DEFINE ( ICACHEL1LINESPERPAGE , offsetof ( struct ppc64_caches , ilines_per_page ) ) ;
/* paca */
DEFINE ( PACA_SIZE , sizeof ( struct paca_struct ) ) ;
KVM: PPC: book3s_hv: Add support for PPC970-family processors
This adds support for running KVM guests in supervisor mode on those
PPC970 processors that have a usable hypervisor mode. Unfortunately,
Apple G5 machines have supervisor mode disabled (MSR[HV] is forced to
1), but the YDL PowerStation does have a usable hypervisor mode.
There are several differences between the PPC970 and POWER7 in how
guests are managed. These differences are accommodated using the
CPU_FTR_ARCH_201 (PPC970) and CPU_FTR_ARCH_206 (POWER7) CPU feature
bits. Notably, on PPC970:
* The LPCR, LPID or RMOR registers don't exist, and the functions of
those registers are provided by bits in HID4 and one bit in HID0.
* External interrupts can be directed to the hypervisor, but unlike
POWER7 they are masked by MSR[EE] in non-hypervisor modes and use
SRR0/1 not HSRR0/1.
* There is no virtual RMA (VRMA) mode; the guest must use an RMO
(real mode offset) area.
* The TLB entries are not tagged with the LPID, so it is necessary to
flush the whole TLB on partition switch. Furthermore, when switching
partitions we have to ensure that no other CPU is executing the tlbie
or tlbsync instructions in either the old or the new partition,
otherwise undefined behaviour can occur.
* The PMU has 8 counters (PMC registers) rather than 6.
* The DSCR, PURR, SPURR, AMR, AMOR, UAMOR registers don't exist.
* The SLB has 64 entries rather than 32.
* There is no mediated external interrupt facility, so if we switch to
a guest that has a virtual external interrupt pending but the guest
has MSR[EE] = 0, we have to arrange to have an interrupt pending for
it so that we can get control back once it re-enables interrupts. We
do that by sending ourselves an IPI with smp_send_reschedule after
hard-disabling interrupts.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:40:08 +04:00
DEFINE ( PACA_LOCK_TOKEN , offsetof ( struct paca_struct , lock_token ) ) ;
2005-09-28 18:35:31 +04:00
DEFINE ( PACAPACAINDEX , offsetof ( struct paca_struct , paca_index ) ) ;
DEFINE ( PACAPROCSTART , offsetof ( struct paca_struct , cpu_start ) ) ;
DEFINE ( PACAKSAVE , offsetof ( struct paca_struct , kstack ) ) ;
DEFINE ( PACACURRENT , offsetof ( struct paca_struct , __current ) ) ;
DEFINE ( PACASAVEDMSR , offsetof ( struct paca_struct , saved_msr ) ) ;
DEFINE ( PACASTABRR , offsetof ( struct paca_struct , stab_rr ) ) ;
DEFINE ( PACAR1 , offsetof ( struct paca_struct , saved_r1 ) ) ;
DEFINE ( PACATOC , offsetof ( struct paca_struct , kernel_toc ) ) ;
2008-08-30 05:40:24 +04:00
DEFINE ( PACAKBASE , offsetof ( struct paca_struct , kernelbase ) ) ;
DEFINE ( PACAKMSR , offsetof ( struct paca_struct , kernel_msr ) ) ;
[POWERPC] Lazy interrupt disabling for 64-bit machines
This implements a lazy strategy for disabling interrupts. This means
that local_irq_disable() et al. just clear the 'interrupts are
enabled' flag in the paca. If an interrupt comes along, the interrupt
entry code notices that interrupts are supposed to be disabled, and
clears the EE bit in SRR1, clears the 'interrupts are hard-enabled'
flag in the paca, and returns. This means that interrupts only
actually get disabled in the processor when an interrupt comes along.
When interrupts are enabled by local_irq_enable() et al., the code
sets the interrupts-enabled flag in the paca, and then checks whether
interrupts got hard-disabled. If so, it also sets the EE bit in the
MSR to hard-enable the interrupts.
This has the potential to improve performance, and also makes it
easier to make a kernel that can boot on iSeries and on other 64-bit
machines, since this lazy-disable strategy is very similar to the
soft-disable strategy that iSeries already uses.
This version renames paca->proc_enabled to paca->soft_enabled, and
changes a couple of soft-disables in the kexec code to hard-disables,
which should fix the crash that Michael Ellerman saw. This doesn't
yet use a reserved CR field for the soft_enabled and hard_enabled
flags. This applies on top of Stephen Rothwell's patches to make it
possible to build a combined iSeries/other kernel.
Signed-off-by: Paul Mackerras <paulus@samba.org>
2006-10-04 10:47:49 +04:00
DEFINE ( PACASOFTIRQEN , offsetof ( struct paca_struct , soft_enabled ) ) ;
powerpc: Rework lazy-interrupt handling
The current implementation of lazy interrupts handling has some
issues that this tries to address.
We don't do the various workarounds we need to do when re-enabling
interrupts in some cases such as when returning from an interrupt
and thus we may still lose or get delayed decrementer or doorbell
interrupts.
The current scheme also makes it much harder to handle the external
"edge" interrupts provided by some BookE processors when using the
EPR facility (External Proxy) and the Freescale Hypervisor.
Additionally, we tend to keep interrupts hard disabled in a number
of cases, such as decrementer interrupts, external interrupts, or
when a masked decrementer interrupt is pending. This is sub-optimal.
This is an attempt at fixing it all in one go by reworking the way
we do the lazy interrupt disabling from the ground up.
The base idea is to replace the "hard_enabled" field with a
"irq_happened" field in which we store a bit mask of what interrupt
occurred while soft-disabled.
When re-enabling, either via arch_local_irq_restore() or when returning
from an interrupt, we can now decide what to do by testing bits in that
field.
We then implement replaying of the missed interrupts either by
re-using the existing exception frame (in exception exit case) or via
the creation of a new one from an assembly trampoline (in the
arch_local_irq_enable case).
This removes the need to play with the decrementer to try to create
fake interrupts, among others.
In addition, this adds a few refinements:
- We no longer hard disable decrementer interrupts that occur
while soft-disabled. We now simply bump the decrementer back to max
(on BookS) or leave it stopped (on BookE) and continue with hard interrupts
enabled, which means that we'll potentially get better sample quality from
performance monitor interrupts.
- Timer, decrementer and doorbell interrupts now hard-enable
shortly after removing the source of the interrupt, which means
they no longer run entirely hard disabled. Again, this will improve
perf sample quality.
- On Book3E 64-bit, we now make the performance monitor interrupt
act as an NMI like Book3S (the necessary C code for that to work
appears to already be present in the FSL perf code, notably calling
nmi_enter instead of irq_enter). (This also fixes a bug where BookE
perfmon interrupts could clobber r14 ... oops)
- We could make "masked" decrementer interrupts act as NMIs when doing
timer-based perf sampling to improve the sample quality.
Signed-off-by-yet: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
v2:
- Add hard-enable to decrementer, timer and doorbells
- Fix CR clobber in masked irq handling on BookE
- Make embedded perf interrupt act as an NMI
- Add a PACA_HAPPENED_EE_EDGE for use by FSL if they want
to retrigger an interrupt without preventing hard-enable
v3:
- Fix or vs. ori bug on Book3E
- Fix enabling of interrupts for some exceptions on Book3E
v4:
- Fix resend of doorbells on return from interrupt on Book3E
v5:
- Rebased on top of my latest series, which involves some significant
rework of some aspects of the patch.
v6:
- 32-bit compile fix
- more compile fixes with various .config combos
- factor out the asm code to soft-disable interrupts
- remove the C wrapper around preempt_schedule_irq
v7:
- Fix a bug with hard irq state tracking on native power7
2012-03-06 11:27:59 +04:00
DEFINE ( PACAIRQHAPPENED , offsetof ( struct paca_struct , irq_happened ) ) ;
2005-09-28 18:35:31 +04:00
DEFINE ( PACACONTEXTID , offsetof ( struct paca_struct , context . id ) ) ;
2007-05-08 10:27:27 +04:00
# ifdef CONFIG_PPC_MM_SLICES
DEFINE ( PACALOWSLICESPSIZE , offsetof ( struct paca_struct ,
context . low_slices_psize ) ) ;
DEFINE ( PACAHIGHSLICEPSIZE , offsetof ( struct paca_struct ,
context . high_slices_psize ) ) ;
DEFINE ( MMUPSIZEDEFSIZE , sizeof ( struct mmu_psize_def ) ) ;
2009-06-03 01:17:41 +04:00
# endif /* CONFIG_PPC_MM_SLICES */
2009-07-24 03:15:42 +04:00
# ifdef CONFIG_PPC_BOOK3E
DEFINE ( PACAPGD , offsetof ( struct paca_struct , pgd ) ) ;
DEFINE ( PACA_KERNELPGD , offsetof ( struct paca_struct , kernel_pgd ) ) ;
DEFINE ( PACA_EXGEN , offsetof ( struct paca_struct , exgen ) ) ;
DEFINE ( PACA_EXTLB , offsetof ( struct paca_struct , extlb ) ) ;
DEFINE ( PACA_EXMC , offsetof ( struct paca_struct , exmc ) ) ;
DEFINE ( PACA_EXCRIT , offsetof ( struct paca_struct , excrit ) ) ;
DEFINE ( PACA_EXDBG , offsetof ( struct paca_struct , exdbg ) ) ;
DEFINE ( PACA_MC_STACK , offsetof ( struct paca_struct , mc_kstack ) ) ;
DEFINE ( PACA_CRIT_STACK , offsetof ( struct paca_struct , crit_kstack ) ) ;
DEFINE ( PACA_DBG_STACK , offsetof ( struct paca_struct , dbg_kstack ) ) ;
2013-10-12 04:22:38 +04:00
DEFINE ( PACA_TCD_PTR , offsetof ( struct paca_struct , tcd_ptr ) ) ;
DEFINE ( TCD_ESEL_NEXT ,
offsetof ( struct tlb_core_data , esel_next ) ) ;
DEFINE ( TCD_ESEL_MAX ,
offsetof ( struct tlb_core_data , esel_max ) ) ;
DEFINE ( TCD_ESEL_FIRST ,
offsetof ( struct tlb_core_data , esel_first ) ) ;
DEFINE ( TCD_LOCK , offsetof ( struct tlb_core_data , lock ) ) ;
2009-07-24 03:15:42 +04:00
# endif /* CONFIG_PPC_BOOK3E */
2009-06-03 01:17:41 +04:00
# ifdef CONFIG_PPC_STD_MMU_64
DEFINE ( PACASTABREAL , offsetof ( struct paca_struct , stab_real ) ) ;
DEFINE ( PACASTABVIRT , offsetof ( struct paca_struct , stab_addr ) ) ;
DEFINE ( PACASLBCACHE , offsetof ( struct paca_struct , slb_cache ) ) ;
DEFINE ( PACASLBCACHEPTR , offsetof ( struct paca_struct , slb_cache_ptr ) ) ;
DEFINE ( PACAVMALLOCSLLP , offsetof ( struct paca_struct , vmalloc_sllp ) ) ;
# ifdef CONFIG_PPC_MM_SLICES
2007-05-08 10:27:27 +04:00
DEFINE ( MMUPSIZESLLP , offsetof ( struct mmu_psize_def , sllp ) ) ;
# else
DEFINE ( PACACONTEXTSLLP , offsetof ( struct paca_struct , context . sllp ) ) ;
# endif /* CONFIG_PPC_MM_SLICES */
2005-09-28 18:35:31 +04:00
DEFINE ( PACA_EXGEN , offsetof ( struct paca_struct , exgen ) ) ;
DEFINE ( PACA_EXMC , offsetof ( struct paca_struct , exmc ) ) ;
DEFINE ( PACA_EXSLB , offsetof ( struct paca_struct , exslb ) ) ;
2006-01-13 02:26:42 +03:00
DEFINE ( PACALPPACAPTR , offsetof ( struct paca_struct , lppaca_ptr ) ) ;
2006-08-07 10:19:19 +04:00
DEFINE ( PACA_SLBSHADOWPTR , offsetof ( struct paca_struct , slb_shadow_ptr ) ) ;
2006-08-09 11:00:30 +04:00
DEFINE ( SLBSHADOW_STACKVSID ,
offsetof ( struct slb_shadow , save_area [ SLB_NUM_BOLTED - 1 ] . vsid ) ) ;
DEFINE ( SLBSHADOW_STACKESID ,
offsetof ( struct slb_shadow , save_area [ SLB_NUM_BOLTED - 1 ] . esid ) ) ;
powerpc: Account time using timebase rather than PURR
Currently, when CONFIG_VIRT_CPU_ACCOUNTING is enabled, we use the
PURR register for measuring the user and system time used by
processes, as well as other related times such as hardirq and
softirq times. This turns out to be quite confusing for users
because it means that a program will often be measured as taking
less time when run on a multi-threaded processor (SMT2 or SMT4 mode)
than it does when run on a single-threaded processor (ST mode), even
though the program takes longer to finish. The discrepancy is
accounted for as stolen time, which is also confusing, particularly
when there are no other partitions running.
This changes the accounting to use the timebase instead, meaning that
the reported user and system times are the actual number of real-time
seconds that the program was executing on the processor thread,
regardless of which SMT mode the processor is in. Thus a program will
generally show greater user and system times when run on a
multi-threaded processor than on a single-threaded processor.
On pSeries systems on POWER5 or later processors, we measure the
stolen time (time when this partition wasn't running) using the
hypervisor dispatch trace log. We check for new entries in the
log on every entry from user mode and on every transition from
kernel process context to soft or hard IRQ context (i.e. when
account_system_vtime() gets called). So that we can correctly
distinguish time stolen from user time and time stolen from system
time, without having to check the log on every exit to user mode,
we store separate timestamps for exit to user mode and entry from
user mode.
On systems that have a SPURR (POWER6 and POWER7), we read the SPURR
in account_system_vtime() (as before), and then apportion the SPURR
ticks since the last time we read it between scaled user time and
scaled system time according to the relative proportions of user
time and system time over the same interval. This avoids having to
read the SPURR on every kernel entry and exit. On systems that have
PURR but not SPURR (i.e., POWER5), we do the same using the PURR
rather than the SPURR.
This disables the DTL user interface in /sys/kernel/debug/powerpc/dtl
for now since it conflicts with the use of the dispatch trace log
by the time accounting code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2010-08-26 23:56:43 +04:00
DEFINE ( SLBSHADOW_SAVEAREA , offsetof ( struct slb_shadow , save_area ) ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
DEFINE ( LPPACA_PMCINUSE , offsetof ( struct lppaca , pmcregs_in_use ) ) ;
powerpc: Account time using timebase rather than PURR
Currently, when CONFIG_VIRT_CPU_ACCOUNTING is enabled, we use the
PURR register for measuring the user and system time used by
processes, as well as other related times such as hardirq and
softirq times. This turns out to be quite confusing for users
because it means that a program will often be measured as taking
less time when run on a multi-threaded processor (SMT2 or SMT4 mode)
than it does when run on a single-threaded processor (ST mode), even
though the program takes longer to finish. The discrepancy is
accounted for as stolen time, which is also confusing, particularly
when there are no other partitions running.
This changes the accounting to use the timebase instead, meaning that
the reported user and system times are the actual number of real-time
seconds that the program was executing on the processor thread,
regardless of which SMT mode the processor is in. Thus a program will
generally show greater user and system times when run on a
multi-threaded processor than on a single-threaded processor.
On pSeries systems on POWER5 or later processors, we measure the
stolen time (time when this partition wasn't running) using the
hypervisor dispatch trace log. We check for new entries in the
log on every entry from user mode and on every transition from
kernel process context to soft or hard IRQ context (i.e. when
account_system_vtime() gets called). So that we can correctly
distinguish time stolen from user time and time stolen from system
time, without having to check the log on every exit to user mode,
we store separate timestamps for exit to user mode and entry from
user mode.
On systems that have a SPURR (POWER6 and POWER7), we read the SPURR
in account_system_vtime() (as before), and then apportion the SPURR
ticks since the last time we read it between scaled user time and
scaled system time according to the relative proportions of user
time and system time over the same interval. This avoids having to
read the SPURR on every kernel entry and exit. On systems that have
PURR but not SPURR (i.e., POWER5), we do the same using the PURR
rather than the SPURR.
This disables the DTL user interface in /sys/kernel/debug/powerpc/dtl
for now since it conflicts with the use of the dispatch trace log
by the time accounting code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2010-08-26 23:56:43 +04:00
DEFINE ( LPPACA_DTLIDX , offsetof ( struct lppaca , dtl_idx ) ) ;
2011-06-29 04:22:05 +04:00
DEFINE ( LPPACA_YIELDCOUNT , offsetof ( struct lppaca , yield_count ) ) ;
powerpc: Account time using timebase rather than PURR
Currently, when CONFIG_VIRT_CPU_ACCOUNTING is enabled, we use the
PURR register for measuring the user and system time used by
processes, as well as other related times such as hardirq and
softirq times. This turns out to be quite confusing for users
because it means that a program will often be measured as taking
less time when run on a multi-threaded processor (SMT2 or SMT4 mode)
than it does when run on a single-threaded processor (ST mode), even
though the program takes longer to finish. The discrepancy is
accounted for as stolen time, which is also confusing, particularly
when there are no other partitions running.
This changes the accounting to use the timebase instead, meaning that
the reported user and system times are the actual number of real-time
seconds that the program was executing on the processor thread,
regardless of which SMT mode the processor is in. Thus a program will
generally show greater user and system times when run on a
multi-threaded processor than on a single-threaded processor.
On pSeries systems on POWER5 or later processors, we measure the
stolen time (time when this partition wasn't running) using the
hypervisor dispatch trace log. We check for new entries in the
log on every entry from user mode and on every transition from
kernel process context to soft or hard IRQ context (i.e. when
account_system_vtime() gets called). So that we can correctly
distinguish time stolen from user time and time stolen from system
time, without having to check the log on every exit to user mode,
we store separate timestamps for exit to user mode and entry from
user mode.
On systems that have a SPURR (POWER6 and POWER7), we read the SPURR
in account_system_vtime() (as before), and then apportion the SPURR
ticks since the last time we read it between scaled user time and
scaled system time according to the relative proportions of user
time and system time over the same interval. This avoids having to
read the SPURR on every kernel entry and exit. On systems that have
PURR but not SPURR (i.e., POWER5), we do the same using the PURR
rather than the SPURR.
This disables the DTL user interface in /sys/kernel/debug/powerpc/dtl
for now since it conflicts with the use of the dispatch trace log
by the time accounting code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2010-08-26 23:56:43 +04:00
DEFINE ( PACA_DTL_RIDX , offsetof ( struct paca_struct , dtl_ridx ) ) ;
2009-06-03 01:17:41 +04:00
# endif /* CONFIG_PPC_STD_MMU_64 */
DEFINE ( PACAEMERGSP , offsetof ( struct paca_struct , emergency_sp ) ) ;
powerpc/book3s: handle machine check in Linux host.
Move machine check entry point into Linux. So far we were dependent on
firmware to decode MCE error details and hand over the high-level info to the OS.
This patch introduces early machine check routine that saves the MCE
information (srr1, srr0, dar and dsisr) to the emergency stack. We allocate
stack frame on emergency stack and set the r1 accordingly. This allows us to be
prepared to take another exception without losing context. One thing to note
here is that, if we get another machine check while the ME bit is off, we risk a
checkstop. Hence we restrict ourselves to saving only the MCE information and
the registers saved in the PACA_EXMC save area before we turn the ME bit on. We use
paca->in_mce flag to differentiate between first entry and nested machine check
entry which helps proper use of emergency stack. We increment paca->in_mce
every time we enter the early machine check handler and decrement it while
leaving. When we enter the early machine check handler for the first time (paca->in_mce ==
0), we are sure nobody is using MC emergency stack and allocate a stack frame
at the start of the emergency stack. During subsequent entry (paca->in_mce >
0), we know that r1 points inside emergency stack and we allocate separate
stack frame accordingly. This prevents us from clobbering MCE information
during nested machine checks.
The early machine check handler changes are placed under CPU_FTR_HVMODE
section. This makes sure that the early machine check handler will get executed
only in hypervisor kernel.
This is the code flow:
Machine Check Interrupt
|
V
0x200 vector ME=0, IR=0, DR=0
|
V
+-----------------------------------------------+
|machine_check_pSeries_early: | ME=0, IR=0, DR=0
| Alloc frame on emergency stack |
| Save srr1, srr0, dar and dsisr on stack |
+-----------------------------------------------+
|
(ME=1, IR=0, DR=0, RFID)
|
V
machine_check_handle_early ME=1, IR=0, DR=0
|
V
+-----------------------------------------------+
| machine_check_early (r3=pt_regs) | ME=1, IR=0, DR=0
| Things to do: (in next patches) |
| Flush SLB for SLB errors |
| Flush TLB for TLB errors |
| Decode and save MCE info |
+-----------------------------------------------+
|
(Fall through existing exception handler routine.)
|
V
machine_check_pSeries ME=1, IR=0, DR=0
|
(ME=1, IR=1, DR=1, RFID)
|
V
machine_check_common ME=1, IR=1, DR=1
.
.
.
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2013-10-30 18:34:08 +04:00
# ifdef CONFIG_PPC_BOOK3S_64
DEFINE ( PACAMCEMERGSP , offsetof ( struct paca_struct , mc_emergency_sp ) ) ;
DEFINE ( PACA_IN_MCE , offsetof ( struct paca_struct , in_mce ) ) ;
# endif
2009-06-03 01:17:41 +04:00
DEFINE ( PACAHWCPUID , offsetof ( struct paca_struct , hw_cpu_id ) ) ;
2010-05-13 23:40:11 +04:00
DEFINE ( PACAKEXECSTATE , offsetof ( struct paca_struct , kexec_state ) ) ;
powerpc: Account time using timebase rather than PURR
Currently, when CONFIG_VIRT_CPU_ACCOUNTING is enabled, we use the
PURR register for measuring the user and system time used by
processes, as well as other related times such as hardirq and
softirq times. This turns out to be quite confusing for users
because it means that a program will often be measured as taking
less time when run on a multi-threaded processor (SMT2 or SMT4 mode)
than it does when run on a single-threaded processor (ST mode), even
though the program takes longer to finish. The discrepancy is
accounted for as stolen time, which is also confusing, particularly
when there are no other partitions running.
This changes the accounting to use the timebase instead, meaning that
the reported user and system times are the actual number of real-time
seconds that the program was executing on the processor thread,
regardless of which SMT mode the processor is in. Thus a program will
generally show greater user and system times when run on a
multi-threaded processor than on a single-threaded processor.
On pSeries systems on POWER5 or later processors, we measure the
stolen time (time when this partition wasn't running) using the
hypervisor dispatch trace log. We check for new entries in the
log on every entry from user mode and on every transition from
kernel process context to soft or hard IRQ context (i.e. when
account_system_vtime() gets called). So that we can correctly
distinguish time stolen from user time and time stolen from system
time, without having to check the log on every exit to user mode,
we store separate timestamps for exit to user mode and entry from
user mode.
On systems that have a SPURR (POWER6 and POWER7), we read the SPURR
in account_system_vtime() (as before), and then apportion the SPURR
ticks since the last time we read it between scaled user time and
scaled system time according to the relative proportions of user
time and system time over the same interval. This avoids having to
read the SPURR on every kernel entry and exit. On systems that have
PURR but not SPURR (i.e., POWER5), we do the same using the PURR
rather than the SPURR.
This disables the DTL user interface in /sys/kernel/debug/powerpc/dtl
for now since it conflicts with the use of the dispatch trace log
by the time accounting code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2010-08-26 23:56:43 +04:00
DEFINE ( PACA_STARTTIME , offsetof ( struct paca_struct , starttime ) ) ;
DEFINE ( PACA_STARTTIME_USER , offsetof ( struct paca_struct , starttime_user ) ) ;
2009-06-03 01:17:41 +04:00
DEFINE ( PACA_USER_TIME , offsetof ( struct paca_struct , user_time ) ) ;
DEFINE ( PACA_SYSTEM_TIME , offsetof ( struct paca_struct , system_time ) ) ;
DEFINE ( PACA_TRAP_SAVE , offsetof ( struct paca_struct , trap_save ) ) ;
2011-12-05 23:47:26 +04:00
DEFINE ( PACA_NAPSTATELOST , offsetof ( struct paca_struct , nap_state_lost ) ) ;
2012-09-06 06:49:44 +04:00
DEFINE ( PACA_SPRG3 , offsetof ( struct paca_struct , sprg3 ) ) ;
2005-10-26 11:05:24 +04:00
# endif /* CONFIG_PPC64 */
2005-09-28 18:35:31 +04:00
/* RTAS */
DEFINE ( RTASBASE , offsetof ( struct rtas_t , base ) ) ;
DEFINE ( RTASENTRY , offsetof ( struct rtas_t , entry ) ) ;
2005-09-26 10:04:21 +04:00
/* Interrupt register frame */
2008-04-24 00:33:49 +04:00
DEFINE ( INT_FRAME_SIZE , STACK_INT_FRAME_SIZE ) ;
2005-09-26 10:04:21 +04:00
DEFINE ( SWITCH_FRAME_SIZE , STACK_FRAME_OVERHEAD + sizeof ( struct pt_regs ) ) ;
2010-04-16 02:11:55 +04:00
# ifdef CONFIG_PPC64
2005-09-28 18:35:31 +04:00
/* Create extra stack space for SRR0 and SRR1 when calling prom/rtas. */
DEFINE ( PROM_FRAME_SIZE , STACK_FRAME_OVERHEAD + sizeof ( struct pt_regs ) + 16 ) ;
DEFINE ( RTAS_FRAME_SIZE , STACK_FRAME_OVERHEAD + sizeof ( struct pt_regs ) + 16 ) ;
2006-09-07 03:23:12 +04:00
/* hcall statistics */
DEFINE ( HCALL_STAT_SIZE , sizeof ( struct hcall_stats ) ) ;
DEFINE ( HCALL_STAT_CALLS , offsetof ( struct hcall_stats , num_calls ) ) ;
DEFINE ( HCALL_STAT_TB , offsetof ( struct hcall_stats , tb_total ) ) ;
DEFINE ( HCALL_STAT_PURR , offsetof ( struct hcall_stats , purr_total ) ) ;
2005-09-28 18:35:31 +04:00
# endif /* CONFIG_PPC64 */
2005-09-26 10:04:21 +04:00
DEFINE ( GPR0 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 0 ] ) ) ;
DEFINE ( GPR1 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 1 ] ) ) ;
DEFINE ( GPR2 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 2 ] ) ) ;
DEFINE ( GPR3 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 3 ] ) ) ;
DEFINE ( GPR4 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 4 ] ) ) ;
DEFINE ( GPR5 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 5 ] ) ) ;
DEFINE ( GPR6 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 6 ] ) ) ;
DEFINE ( GPR7 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 7 ] ) ) ;
DEFINE ( GPR8 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 8 ] ) ) ;
DEFINE ( GPR9 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 9 ] ) ) ;
DEFINE ( GPR10 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 10 ] ) ) ;
DEFINE ( GPR11 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 11 ] ) ) ;
DEFINE ( GPR12 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 12 ] ) ) ;
DEFINE ( GPR13 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 13 ] ) ) ;
2005-09-28 18:35:31 +04:00
# ifndef CONFIG_PPC64
2005-09-26 10:04:21 +04:00
DEFINE ( GPR14 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 14 ] ) ) ;
DEFINE ( GPR15 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 15 ] ) ) ;
DEFINE ( GPR16 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 16 ] ) ) ;
DEFINE ( GPR17 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 17 ] ) ) ;
DEFINE ( GPR18 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 18 ] ) ) ;
DEFINE ( GPR19 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 19 ] ) ) ;
DEFINE ( GPR20 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 20 ] ) ) ;
DEFINE ( GPR21 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 21 ] ) ) ;
DEFINE ( GPR22 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 22 ] ) ) ;
DEFINE ( GPR23 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 23 ] ) ) ;
DEFINE ( GPR24 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 24 ] ) ) ;
DEFINE ( GPR25 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 25 ] ) ) ;
DEFINE ( GPR26 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 26 ] ) ) ;
DEFINE ( GPR27 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 27 ] ) ) ;
DEFINE ( GPR28 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 28 ] ) ) ;
DEFINE ( GPR29 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 29 ] ) ) ;
DEFINE ( GPR30 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 30 ] ) ) ;
DEFINE ( GPR31 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , gpr [ 31 ] ) ) ;
2005-09-28 18:35:31 +04:00
# endif /* CONFIG_PPC64 */
2005-09-26 10:04:21 +04:00
/*
* Note : these symbols include _ because they overlap with special
* register names
*/
DEFINE ( _NIP , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , nip ) ) ;
DEFINE ( _MSR , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , msr ) ) ;
DEFINE ( _CTR , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , ctr ) ) ;
DEFINE ( _LINK , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , link ) ) ;
DEFINE ( _CCR , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , ccr ) ) ;
DEFINE ( _XER , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , xer ) ) ;
DEFINE ( _DAR , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , dar ) ) ;
DEFINE ( _DSISR , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , dsisr ) ) ;
2005-09-28 18:35:31 +04:00
DEFINE ( ORIG_GPR3 , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , orig_gpr3 ) ) ;
DEFINE ( RESULT , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , result ) ) ;
2005-10-28 16:45:25 +04:00
DEFINE ( _TRAP , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , trap ) ) ;
2005-09-28 18:35:31 +04:00
# ifndef CONFIG_PPC64
DEFINE ( _MQ , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , mq ) ) ;
/*
* The PowerPC 400 - class & Book - E processors have neither the DAR
* nor the DSISR SPRs . Hence , we overload them to hold the similar
* DEAR and ESR SPRs for such processors . For critical interrupts
* we use them to hold SRR0 and SRR1 .
2005-09-26 10:04:21 +04:00
*/
DEFINE ( _DEAR , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , dar ) ) ;
DEFINE ( _ESR , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , dsisr ) ) ;
2005-09-28 18:35:31 +04:00
# else /* CONFIG_PPC64 */
DEFINE ( SOFTE , STACK_FRAME_OVERHEAD + offsetof ( struct pt_regs , softe ) ) ;
/* These are _only_ to be used with {PROM,RTAS}_FRAME_SIZE!!! */
DEFINE ( _SRR0 , STACK_FRAME_OVERHEAD + sizeof ( struct pt_regs ) ) ;
DEFINE ( _SRR1 , STACK_FRAME_OVERHEAD + sizeof ( struct pt_regs ) + 8 ) ;
# endif /* CONFIG_PPC64 */
2009-07-28 05:59:34 +04:00
# if defined(CONFIG_PPC32)
2008-04-30 14:23:21 +04:00
# if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
DEFINE ( EXC_LVL_SIZE , STACK_EXC_LVL_FRAME_SIZE ) ;
DEFINE ( MAS0 , STACK_INT_FRAME_SIZE + offsetof ( struct exception_regs , mas0 ) ) ;
/* we overload MMUCR for 44x on MAS0 since they are mutually exclusive */
DEFINE ( MMUCR , STACK_INT_FRAME_SIZE + offsetof ( struct exception_regs , mas0 ) ) ;
DEFINE ( MAS1 , STACK_INT_FRAME_SIZE + offsetof ( struct exception_regs , mas1 ) ) ;
DEFINE ( MAS2 , STACK_INT_FRAME_SIZE + offsetof ( struct exception_regs , mas2 ) ) ;
DEFINE ( MAS3 , STACK_INT_FRAME_SIZE + offsetof ( struct exception_regs , mas3 ) ) ;
DEFINE ( MAS6 , STACK_INT_FRAME_SIZE + offsetof ( struct exception_regs , mas6 ) ) ;
DEFINE ( MAS7 , STACK_INT_FRAME_SIZE + offsetof ( struct exception_regs , mas7 ) ) ;
DEFINE ( _SRR0 , STACK_INT_FRAME_SIZE + offsetof ( struct exception_regs , srr0 ) ) ;
DEFINE ( _SRR1 , STACK_INT_FRAME_SIZE + offsetof ( struct exception_regs , srr1 ) ) ;
DEFINE ( _CSRR0 , STACK_INT_FRAME_SIZE + offsetof ( struct exception_regs , csrr0 ) ) ;
DEFINE ( _CSRR1 , STACK_INT_FRAME_SIZE + offsetof ( struct exception_regs , csrr1 ) ) ;
DEFINE ( _DSRR0 , STACK_INT_FRAME_SIZE + offsetof ( struct exception_regs , dsrr0 ) ) ;
DEFINE ( _DSRR1 , STACK_INT_FRAME_SIZE + offsetof ( struct exception_regs , dsrr1 ) ) ;
DEFINE ( SAVED_KSP_LIMIT , STACK_INT_FRAME_SIZE + offsetof ( struct exception_regs , saved_ksp_limit ) ) ;
# endif
2009-07-28 05:59:34 +04:00
# endif
2005-09-26 10:04:21 +04:00
DEFINE ( CLONE_VM , CLONE_VM ) ;
DEFINE ( CLONE_UNTRACED , CLONE_UNTRACED ) ;
2005-09-28 18:35:31 +04:00
# ifndef CONFIG_PPC64
2005-09-26 10:04:21 +04:00
DEFINE ( MM_PGD , offsetof ( struct mm_struct , pgd ) ) ;
2005-09-28 18:35:31 +04:00
# endif /* ! CONFIG_PPC64 */
2005-09-26 10:04:21 +04:00
/* About the CPU features table */
DEFINE ( CPU_SPEC_FEATURES , offsetof ( struct cpu_spec , cpu_features ) ) ;
DEFINE ( CPU_SPEC_SETUP , offsetof ( struct cpu_spec , cpu_setup ) ) ;
2006-08-11 09:07:08 +04:00
DEFINE ( CPU_SPEC_RESTORE , offsetof ( struct cpu_spec , cpu_restore ) ) ;
2005-09-26 10:04:21 +04:00
2005-09-28 18:35:31 +04:00
DEFINE ( pbe_address , offsetof ( struct pbe , address ) ) ;
DEFINE ( pbe_orig_address , offsetof ( struct pbe , orig_address ) ) ;
DEFINE ( pbe_next , offsetof ( struct pbe , next ) ) ;
2005-09-26 10:04:21 +04:00
2007-05-03 16:31:38 +04:00
# ifndef CONFIG_PPC64
2005-10-11 16:08:12 +04:00
DEFINE ( TASK_SIZE , TASK_SIZE ) ;
2005-09-28 18:35:31 +04:00
DEFINE ( NUM_USER_SEGMENTS , TASK_SIZE > > 28 ) ;
2005-11-11 13:15:21 +03:00
# endif /* ! CONFIG_PPC64 */
2005-09-26 10:04:21 +04:00
2005-11-11 13:15:21 +03:00
/* datapage offsets for use by vdso */
DEFINE ( CFG_TB_ORIG_STAMP , offsetof ( struct vdso_data , tb_orig_stamp ) ) ;
DEFINE ( CFG_TB_TICKS_PER_SEC , offsetof ( struct vdso_data , tb_ticks_per_sec ) ) ;
DEFINE ( CFG_TB_TO_XS , offsetof ( struct vdso_data , tb_to_xs ) ) ;
DEFINE ( CFG_STAMP_XSEC , offsetof ( struct vdso_data , stamp_xsec ) ) ;
DEFINE ( CFG_TB_UPDATE_COUNT , offsetof ( struct vdso_data , tb_update_count ) ) ;
DEFINE ( CFG_TZ_MINUTEWEST , offsetof ( struct vdso_data , tz_minuteswest ) ) ;
DEFINE ( CFG_TZ_DSTTIME , offsetof ( struct vdso_data , tz_dsttime ) ) ;
DEFINE ( CFG_SYSCALL_MAP32 , offsetof ( struct vdso_data , syscall_map_32 ) ) ;
DEFINE ( WTOM_CLOCK_SEC , offsetof ( struct vdso_data , wtom_clock_sec ) ) ;
DEFINE ( WTOM_CLOCK_NSEC , offsetof ( struct vdso_data , wtom_clock_nsec ) ) ;
2008-10-28 02:56:03 +03:00
DEFINE ( STAMP_XTIME , offsetof ( struct vdso_data , stamp_xtime ) ) ;
powerpc: Rework VDSO gettimeofday to prevent time going backwards
Currently it is possible for userspace to see the result of
gettimeofday() going backwards by 1 microsecond, assuming that
userspace is using the gettimeofday() in the VDSO. The VDSO
gettimeofday() algorithm computes the time in "xsecs", which are
units of 2^-20 seconds, or approximately 0.954 microseconds,
using the algorithm
now = (timebase - tb_orig_stamp) * tb_to_xs + stamp_xsec
and then converts the time in xsecs to seconds and microseconds.
The kernel updates the tb_orig_stamp and stamp_xsec values every
tick in update_vsyscall(). If the length of the tick is not an
integer number of xsecs, then some precision is lost in converting
the current time to xsecs. For example, with CONFIG_HZ=1000, the
tick is 1ms long, which is 1048.576 xsecs. That means that
stamp_xsec will advance by either 1048 or 1049 on each tick.
With the right conditions, it is possible for userspace to get
(timebase - tb_orig_stamp) * tb_to_xs being 1049 if the kernel is
slightly late in updating the vdso_datapage, and then for stamp_xsec
to advance by 1048 when the kernel does update it, and for userspace
to then see (timebase - tb_orig_stamp) * tb_to_xs being zero due to
integer truncation. The result is that time appears to go backwards
by 1 microsecond.
To fix this we change the VDSO gettimeofday to use a new field in the
VDSO datapage which stores the nanoseconds part of the time as a
fractional number of seconds in a 0.32 binary fraction format.
(Or put another way, as a 32-bit number in units of 0.23283 ns.)
This is convenient because we can use the mulhwu instruction to
convert it to either microseconds or nanoseconds.
Since it turns out that computing the time of day using this new field
is simpler than either using stamp_xsec (as gettimeofday does) or
stamp_xtime.tv_nsec (as clock_gettime does), this converts both
gettimeofday and clock_gettime to use the new field. The existing
__do_get_tspec function is converted to use the new field and take
a parameter in r7 that indicates the desired resolution, 1,000,000
for microseconds or 1,000,000,000 for nanoseconds. The __do_get_xsec
function is then unused and is deleted.
The new algorithm is
now = ((timebase - tb_orig_stamp) << 12) * tb_to_xs
+ (stamp_xtime_seconds << 32) + stamp_sec_fraction
with 'now' in units of 2^-32 seconds. That is then converted to
seconds and either microseconds or nanoseconds with
seconds = now >> 32
partseconds = ((now & 0xffffffff) * resolution) >> 32
The 32-bit VDSO code also makes a further simplification: it ignores
the bottom 32 bits of the tb_to_xs value, which is a 0.64 format binary
fraction. Doing so gets rid of 4 multiply instructions. Assuming
a timebase frequency of 1GHz or less and an update interval of no
more than 10ms, the upper 32 bits of tb_to_xs will be at least
4503599, so the error from ignoring the low 32 bits will be at most
2.2ns, which is more than an order of magnitude less than the time
taken to do gettimeofday or clock_gettime on our fastest processors,
so there is no possibility of seeing inconsistent values due to this.
This also moves update_gtod() down next to its only caller, and makes
update_vsyscall use the time passed in via the wall_time argument rather
than accessing xtime directly. At present, wall_time always points to
xtime, but that could change in future.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2010-06-20 23:03:08 +04:00
DEFINE ( STAMP_SEC_FRAC , offsetof ( struct vdso_data , stamp_sec_fraction ) ) ;
2007-11-20 04:24:45 +03:00
DEFINE ( CFG_ICACHE_BLOCKSZ , offsetof ( struct vdso_data , icache_block_size ) ) ;
DEFINE ( CFG_DCACHE_BLOCKSZ , offsetof ( struct vdso_data , dcache_block_size ) ) ;
DEFINE ( CFG_ICACHE_LOGBLOCKSZ , offsetof ( struct vdso_data , icache_log_block_size ) ) ;
DEFINE ( CFG_DCACHE_LOGBLOCKSZ , offsetof ( struct vdso_data , dcache_log_block_size ) ) ;
2005-11-11 13:15:21 +03:00
# ifdef CONFIG_PPC64
DEFINE ( CFG_SYSCALL_MAP64 , offsetof ( struct vdso_data , syscall_map_64 ) ) ;
2005-09-26 10:04:21 +04:00
DEFINE ( TVAL64_TV_SEC , offsetof ( struct timeval , tv_sec ) ) ;
DEFINE ( TVAL64_TV_USEC , offsetof ( struct timeval , tv_usec ) ) ;
DEFINE ( TVAL32_TV_SEC , offsetof ( struct compat_timeval , tv_sec ) ) ;
DEFINE ( TVAL32_TV_USEC , offsetof ( struct compat_timeval , tv_usec ) ) ;
2005-11-14 06:55:58 +03:00
DEFINE ( TSPC64_TV_SEC , offsetof ( struct timespec , tv_sec ) ) ;
DEFINE ( TSPC64_TV_NSEC , offsetof ( struct timespec , tv_nsec ) ) ;
2005-11-11 13:15:21 +03:00
DEFINE ( TSPC32_TV_SEC , offsetof ( struct compat_timespec , tv_sec ) ) ;
DEFINE ( TSPC32_TV_NSEC , offsetof ( struct compat_timespec , tv_nsec ) ) ;
# else
DEFINE ( TVAL32_TV_SEC , offsetof ( struct timeval , tv_sec ) ) ;
DEFINE ( TVAL32_TV_USEC , offsetof ( struct timeval , tv_usec ) ) ;
2005-11-14 06:55:58 +03:00
DEFINE ( TSPC32_TV_SEC , offsetof ( struct timespec , tv_sec ) ) ;
DEFINE ( TSPC32_TV_NSEC , offsetof ( struct timespec , tv_nsec ) ) ;
2005-11-11 13:15:21 +03:00
# endif
/* timeval/timezone offsets for use by vdso */
2005-09-26 10:04:21 +04:00
DEFINE ( TZONE_TZ_MINWEST , offsetof ( struct timezone , tz_minuteswest ) ) ;
DEFINE ( TZONE_TZ_DSTTIME , offsetof ( struct timezone , tz_dsttime ) ) ;
2005-11-11 13:15:21 +03:00
/* Other bits used by the vdso */
DEFINE ( CLOCK_REALTIME , CLOCK_REALTIME ) ;
DEFINE ( CLOCK_MONOTONIC , CLOCK_MONOTONIC ) ;
DEFINE ( NSEC_PER_SEC , NSEC_PER_SEC ) ;
2008-02-08 01:24:52 +03:00
DEFINE ( CLOCK_REALTIME_RES , MONOTONIC_RES_NSEC ) ;
2005-11-11 13:15:21 +03:00
2007-01-01 21:45:34 +03:00
# ifdef CONFIG_BUG
DEFINE ( BUG_ENTRY_SIZE , sizeof ( struct bug_entry ) ) ;
# endif
2007-08-20 08:58:36 +04:00
2007-09-18 11:22:59 +04:00
DEFINE ( PGD_TABLE_SIZE , PGD_TABLE_SIZE ) ;
2008-09-24 20:01:24 +04:00
DEFINE ( PTE_SIZE , sizeof ( pte_t ) ) ;
2007-12-06 22:11:04 +03:00
2008-04-17 08:28:09 +04:00
# ifdef CONFIG_KVM
DEFINE ( VCPU_HOST_STACK , offsetof ( struct kvm_vcpu , arch . host_stack ) ) ;
DEFINE ( VCPU_HOST_PID , offsetof ( struct kvm_vcpu , arch . host_pid ) ) ;
2011-12-20 19:34:43 +04:00
DEFINE ( VCPU_GUEST_PID , offsetof ( struct kvm_vcpu , arch . pid ) ) ;
2008-04-17 08:28:09 +04:00
DEFINE ( VCPU_GPRS , offsetof ( struct kvm_vcpu , arch . gpr ) ) ;
2011-04-28 02:24:10 +04:00
DEFINE ( VCPU_VRSAVE , offsetof ( struct kvm_vcpu , arch . vrsave ) ) ;
2013-10-15 13:43:02 +04:00
DEFINE ( VCPU_FPRS , offsetof ( struct kvm_vcpu , arch . fp . fpr ) ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
# ifdef CONFIG_ALTIVEC
2013-10-15 13:43:02 +04:00
DEFINE ( VCPU_VRS , offsetof ( struct kvm_vcpu , arch . vr . vr ) ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
# endif
DEFINE ( VCPU_XER , offsetof ( struct kvm_vcpu , arch . xer ) ) ;
DEFINE ( VCPU_CTR , offsetof ( struct kvm_vcpu , arch . ctr ) ) ;
DEFINE ( VCPU_LR , offsetof ( struct kvm_vcpu , arch . lr ) ) ;
2014-01-08 14:25:21 +04:00
DEFINE ( VCPU_TAR , offsetof ( struct kvm_vcpu , arch . tar ) ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
DEFINE ( VCPU_CR , offsetof ( struct kvm_vcpu , arch . cr ) ) ;
DEFINE ( VCPU_PC , offsetof ( struct kvm_vcpu , arch . pc ) ) ;
2013-10-07 20:47:52 +04:00
# ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
DEFINE ( VCPU_MSR , offsetof ( struct kvm_vcpu , arch . shregs . msr ) ) ;
DEFINE ( VCPU_SRR0 , offsetof ( struct kvm_vcpu , arch . shregs . srr0 ) ) ;
DEFINE ( VCPU_SRR1 , offsetof ( struct kvm_vcpu , arch . shregs . srr1 ) ) ;
DEFINE ( VCPU_SPRG0 , offsetof ( struct kvm_vcpu , arch . shregs . sprg0 ) ) ;
DEFINE ( VCPU_SPRG1 , offsetof ( struct kvm_vcpu , arch . shregs . sprg1 ) ) ;
DEFINE ( VCPU_SPRG2 , offsetof ( struct kvm_vcpu , arch . shregs . sprg2 ) ) ;
DEFINE ( VCPU_SPRG3 , offsetof ( struct kvm_vcpu , arch . shregs . sprg3 ) ) ;
# endif
2013-07-11 15:49:43 +04:00
DEFINE ( VCPU_SHARED_SPRG3 , offsetof ( struct kvm_vcpu_arch_shared , sprg3 ) ) ;
KVM: PPC: Paravirtualize SPRG4-7, ESR, PIR, MASn
This allows additional registers to be accessed by the guest
in PR-mode KVM without trapping.
SPRG4-7 are readable from userspace. On booke, KVM will sync
these registers when it enters the guest, so that accesses from
guest userspace will work. The guest kernel, OTOH, must consistently
use either the real registers or the shared area between exits. This
also applies to the already-paravirted SPRG3.
On non-booke, it's not clear to what extent SPRG4-7 are supported
(they're not architected for book3s, but exist on at least some classic
chips). They are copied in the get/set regs ioctls, but I do not see any
non-booke emulation. I also do not see any syncing with real registers
(in PR-mode) including the user-readable SPRG3. This patch should not
make that situation any worse.
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
2011-11-09 04:23:30 +04:00
DEFINE ( VCPU_SHARED_SPRG4 , offsetof ( struct kvm_vcpu_arch_shared , sprg4 ) ) ;
DEFINE ( VCPU_SHARED_SPRG5 , offsetof ( struct kvm_vcpu_arch_shared , sprg5 ) ) ;
DEFINE ( VCPU_SHARED_SPRG6 , offsetof ( struct kvm_vcpu_arch_shared , sprg6 ) ) ;
DEFINE ( VCPU_SHARED_SPRG7 , offsetof ( struct kvm_vcpu_arch_shared , sprg7 ) ) ;
2008-07-25 22:54:53 +04:00
DEFINE ( VCPU_SHADOW_PID , offsetof ( struct kvm_vcpu , arch . shadow_pid ) ) ;
2011-06-15 03:35:14 +04:00
DEFINE ( VCPU_SHADOW_PID1 , offsetof ( struct kvm_vcpu , arch . shadow_pid1 ) ) ;
2010-07-29 16:47:42 +04:00
DEFINE ( VCPU_SHARED , offsetof ( struct kvm_vcpu , arch . shared ) ) ;
2010-07-29 16:47:43 +04:00
DEFINE ( VCPU_SHARED_MSR , offsetof ( struct kvm_vcpu_arch_shared , msr ) ) ;
2011-06-15 03:34:29 +04:00
DEFINE ( VCPU_SHADOW_MSR , offsetof ( struct kvm_vcpu , arch . shadow_msr ) ) ;
2008-04-17 08:28:09 +04:00
KVM: PPC: Paravirtualize SPRG4-7, ESR, PIR, MASn
This allows additional registers to be accessed by the guest
in PR-mode KVM without trapping.
SPRG4-7 are readable from userspace. On booke, KVM will sync
these registers when it enters the guest, so that accesses from
guest userspace will work. The guest kernel, OTOH, must consistently
use either the real registers or the shared area between exits. This
also applies to the already-paravirted SPRG3.
On non-booke, it's not clear to what extent SPRG4-7 are supported
(they're not architected for book3s, but exist on at least some classic
chips). They are copied in the get/set regs ioctls, but I do not see any
non-booke emulation. I also do not see any syncing with real registers
(in PR-mode) including the user-readable SPRG3. This patch should not
make that situation any worse.
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
2011-11-09 04:23:30 +04:00
DEFINE ( VCPU_SHARED_MAS0 , offsetof ( struct kvm_vcpu_arch_shared , mas0 ) ) ;
DEFINE ( VCPU_SHARED_MAS1 , offsetof ( struct kvm_vcpu_arch_shared , mas1 ) ) ;
DEFINE ( VCPU_SHARED_MAS2 , offsetof ( struct kvm_vcpu_arch_shared , mas2 ) ) ;
DEFINE ( VCPU_SHARED_MAS7_3 , offsetof ( struct kvm_vcpu_arch_shared , mas7_3 ) ) ;
DEFINE ( VCPU_SHARED_MAS4 , offsetof ( struct kvm_vcpu_arch_shared , mas4 ) ) ;
DEFINE ( VCPU_SHARED_MAS6 , offsetof ( struct kvm_vcpu_arch_shared , mas6 ) ) ;
2011-12-20 19:34:43 +04:00
DEFINE ( VCPU_KVM , offsetof ( struct kvm_vcpu , kvm ) ) ;
DEFINE ( KVM_LPID , offsetof ( struct kvm , arch . lpid ) ) ;
2010-04-16 02:11:42 +04:00
/* book3s */
2013-10-07 20:47:52 +04:00
# ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
DEFINE ( KVM_SDR1 , offsetof ( struct kvm , arch . sdr1 ) ) ;
DEFINE ( KVM_HOST_LPID , offsetof ( struct kvm , arch . host_lpid ) ) ;
DEFINE ( KVM_HOST_LPCR , offsetof ( struct kvm , arch . host_lpcr ) ) ;
DEFINE ( KVM_HOST_SDR1 , offsetof ( struct kvm , arch . host_sdr1 ) ) ;
DEFINE ( KVM_TLBIE_LOCK , offsetof ( struct kvm , arch . tlbie_lock ) ) ;
KVM: PPC: Book3S HV: Improve handling of local vs. global TLB invalidations
When we change or remove a HPT (hashed page table) entry, we can do
either a global TLB invalidation (tlbie) that works across the whole
machine, or a local invalidation (tlbiel) that only affects this core.
Currently we do local invalidations if the VM has only one vcpu or if
the guest requests it with the H_LOCAL flag, though the guest Linux
kernel currently doesn't ever use H_LOCAL. Then, to cope with the
possibility that vcpus moving around to different physical cores might
expose stale TLB entries, there is some code in kvmppc_hv_entry to
flush the whole TLB of entries for this VM if either this vcpu is now
running on a different physical core from where it last ran, or if this
physical core last ran a different vcpu.
There are a number of problems on POWER7 with this as it stands:
- The TLB invalidation is done per thread, whereas it only needs to be
done per core, since the TLB is shared between the threads.
- With the possibility of the host paging out guest pages, the use of
H_LOCAL by an SMP guest is dangerous since the guest could possibly
retain and use a stale TLB entry pointing to a page that had been
removed from the guest.
- The TLB invalidations that we do when a vcpu moves from one physical
core to another are unnecessary in the case of an SMP guest that isn't
using H_LOCAL.
- The optimization of using local invalidations rather than global should
apply to guests with one virtual core, not just one vcpu.
(None of this applies on PPC970, since there we always have to
invalidate the whole TLB when entering and leaving the guest, and we
can't support paging out guest memory.)
To fix these problems and simplify the code, we now maintain a simple
cpumask of which cpus need to flush the TLB on entry to the guest.
(This is indexed by cpu, though we only ever use the bits for thread
0 of each core.) Whenever we do a local TLB invalidation, we set the
bits for every cpu except the bit for thread 0 of the core that we're
currently running on. Whenever we enter a guest, we test and clear the
bit for our core, and flush the TLB if it was set.
On initial startup of the VM, and when resetting the HPT, we set all the
bits in the need_tlb_flush cpumask, since any core could potentially have
stale TLB entries from the previous VM to use the same LPID, or the
previous contents of the HPT.
Then, we maintain a count of the number of online virtual cores, and use
that when deciding whether to use a local invalidation rather than the
number of online vcpus. The code to make that decision is extracted out
into a new function, global_invalidates(). For multi-core guests on
POWER7 (i.e. when we are using mmu notifiers), we now never do local
invalidations regardless of the H_LOCAL flag.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2012-11-22 03:28:08 +04:00
DEFINE ( KVM_NEED_FLUSH , offsetof ( struct kvm , arch . need_tlb_flush . bits ) ) ;
KVM: PPC: Allocate RMAs (Real Mode Areas) at boot for use by guests
This adds infrastructure which will be needed to allow book3s_hv KVM to
run on older POWER processors, including PPC970, which don't support
the Virtual Real Mode Area (VRMA) facility, but only the Real Mode
Offset (RMO) facility. These processors require a physically
contiguous, aligned area of memory for each guest. When the guest does
an access in real mode (MMU off), the address is compared against a
limit value, and if it is lower, the address is ORed with an offset
value (from the Real Mode Offset Register (RMOR)) and the result becomes
the real address for the access. The size of the RMA has to be one of
a set of supported values, which usually includes 64MB, 128MB, 256MB
and some larger powers of 2.
Since we are unlikely to be able to allocate 64MB or more of physically
contiguous memory after the kernel has been running for a while, we
allocate a pool of RMAs at boot time using the bootmem allocator. The
size and number of the RMAs can be set using the kvm_rma_size=xx and
kvm_rma_count=xx kernel command line options.
KVM exports a new capability, KVM_CAP_PPC_RMA, to signal the availability
of the pool of preallocated RMAs. The capability value is 1 if the
processor can use an RMA but doesn't require one (because it supports
the VRMA facility), or 2 if the processor requires an RMA for each guest.
This adds a new ioctl, KVM_ALLOCATE_RMA, which allocates an RMA from the
pool and returns a file descriptor which can be used to map the RMA. It
also returns the size of the RMA in the argument structure.
Having an RMA means we will get multiple KVM_SET_USER_MEMORY_REGION
ioctl calls from userspace. To cope with this, we now preallocate the
kvm->arch.ram_pginfo array when the VM is created with a size sufficient
for up to 64GB of guest memory. Subsequently we will get rid of this
array and use memory associated with each memslot instead.
This moves most of the code that translates the user addresses into
host pfns (page frame numbers) out of kvmppc_prepare_vrma up one level
to kvmppc_core_prepare_memory_region. Also, instead of having to look
up the VMA for each page in order to check the page size, we now check
that the pages we get are compound pages of 16MB. However, if we are
adding memory that is mapped to an RMA, we don't bother with calling
get_user_pages_fast and instead just offset from the base pfn for the
RMA.
Typically the RMA gets added after vcpus are created, which makes it
inconvenient to have the LPCR (logical partition control register) value
in the vcpu->arch struct, since the LPCR controls whether the processor
uses RMA or VRMA for the guest. This moves the LPCR value into the
kvm->arch struct and arranges for the MER (mediated external request)
bit, which is the only bit that varies between vcpus, to be set in
assembly code when going into the guest if there is a pending external
interrupt request.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:25:44 +04:00
DEFINE ( KVM_LPCR , offsetof ( struct kvm , arch . lpcr ) ) ;
DEFINE ( KVM_RMOR , offsetof ( struct kvm , arch . rmor ) ) ;
KVM: PPC: Implement MMIO emulation support for Book3S HV guests
This provides the low-level support for MMIO emulation in Book3S HV
guests. When the guest tries to map a page which is not covered by
any memslot, that page is taken to be an MMIO emulation page. Instead
of inserting a valid HPTE, we insert an HPTE that has the valid bit
clear but another hypervisor software-use bit set, which we call
HPTE_V_ABSENT, to indicate that this is an absent page. An
absent page is treated much like a valid page as far as guest hcalls
(H_ENTER, H_REMOVE, H_READ etc.) are concerned, except of course that
an absent HPTE doesn't need to be invalidated with tlbie since it
was never valid as far as the hardware is concerned.
When the guest accesses a page for which there is an absent HPTE, it
will take a hypervisor data storage interrupt (HDSI) since we now set
the VPM1 bit in the LPCR. Our HDSI handler for HPTE-not-present faults
looks up the hash table and if it finds an absent HPTE mapping the
requested virtual address, will switch to kernel mode and handle the
fault in kvmppc_book3s_hv_page_fault(), which at present just calls
kvmppc_hv_emulate_mmio() to set up the MMIO emulation.
This is based on an earlier patch by Benjamin Herrenschmidt, but has since
been heavily reworked.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
2011-12-12 16:36:37 +04:00
DEFINE ( KVM_VRMA_SLB_V , offsetof ( struct kvm , arch . vrma_slb_v ) ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
DEFINE ( VCPU_DSISR , offsetof ( struct kvm_vcpu , arch . shregs . dsisr ) ) ;
DEFINE ( VCPU_DAR , offsetof ( struct kvm_vcpu , arch . shregs . dar ) ) ;
2012-03-06 01:42:25 +04:00
DEFINE ( VCPU_VPA , offsetof ( struct kvm_vcpu , arch . vpa . pinned_addr ) ) ;
KVM: PPC: Book3S HV: Report VPA and DTL modifications in dirty map
At present, the KVM_GET_DIRTY_LOG ioctl doesn't report modifications
done by the host to the virtual processor areas (VPAs) and dispatch
trace logs (DTLs) registered by the guest. This is because those
modifications are done either in real mode or in the host kernel
context, and in neither case does the access go through the guest's
HPT, and thus no change (C) bit gets set in the guest's HPT.
However, the changes done by the host do need to be tracked so that
the modified pages get transferred when doing live migration. In
order to track these modifications, this adds a dirty flag to the
struct representing the VPA/DTL areas, and arranges to set the flag
when the VPA/DTL gets modified by the host. Then, when we are
collecting the dirty log, we also check the dirty flags for the
VPA and DTL for each vcpu and set the relevant bit in the dirty log
if necessary. Doing this also means we now need to keep track of
the guest physical address of the VPA/DTL areas.
So as not to lose track of modifications to a VPA/DTL area when it gets
unregistered, or when a new area gets registered in its place, we need
to transfer the dirty state to the rmap chain. This adds code to
kvmppc_unpin_guest_page() to do that if the area was dirty. To simplify
that code, we now require that all VPA, DTL and SLB shadow buffer areas
fit within a single host page. Guests already comply with this
requirement because pHyp requires that these areas not cross a 4k
boundary.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2013-04-18 23:51:04 +04:00
DEFINE ( VCPU_VPA_DIRTY , offsetof ( struct kvm_vcpu , arch . vpa . dirty ) ) ;
2014-01-08 14:25:30 +04:00
DEFINE ( VCPU_INTR_MSR , offsetof ( struct kvm_vcpu , arch . intr_msr ) ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
# endif
2010-04-16 02:11:42 +04:00
# ifdef CONFIG_PPC_BOOK3S
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
DEFINE ( VCPU_VCPUID , offsetof ( struct kvm_vcpu , vcpu_id ) ) ;
DEFINE ( VCPU_PURR , offsetof ( struct kvm_vcpu , arch . purr ) ) ;
DEFINE ( VCPU_SPURR , offsetof ( struct kvm_vcpu , arch . spurr ) ) ;
2014-01-08 14:25:21 +04:00
DEFINE ( VCPU_IC , offsetof ( struct kvm_vcpu , arch . ic ) ) ;
DEFINE ( VCPU_VTB , offsetof ( struct kvm_vcpu , arch . vtb ) ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
DEFINE ( VCPU_DSCR , offsetof ( struct kvm_vcpu , arch . dscr ) ) ;
DEFINE ( VCPU_AMR , offsetof ( struct kvm_vcpu , arch . amr ) ) ;
DEFINE ( VCPU_UAMOR , offsetof ( struct kvm_vcpu , arch . uamor ) ) ;
2014-01-08 14:25:21 +04:00
DEFINE ( VCPU_IAMR , offsetof ( struct kvm_vcpu , arch . iamr ) ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
DEFINE ( VCPU_CTRL , offsetof ( struct kvm_vcpu , arch . ctrl ) ) ;
DEFINE ( VCPU_DABR , offsetof ( struct kvm_vcpu , arch . dabr ) ) ;
KVM: PPC: Book3S HV: Add support for DABRX register on POWER7
The DABRX (DABR extension) register on POWER7 processors provides finer
control over which accesses cause a data breakpoint interrupt. It
contains 3 bits which indicate whether to enable accesses in user,
kernel and hypervisor modes respectively to cause data breakpoint
interrupts, plus one bit that enables both real mode and virtual mode
accesses to cause interrupts. Currently, KVM sets DABRX to allow
both kernel and user accesses to cause interrupts while in the guest.
This adds support for the guest to specify other values for DABRX.
PAPR defines a H_SET_XDABR hcall to allow the guest to set both DABR
and DABRX with one call. This adds a real-mode implementation of
H_SET_XDABR, which shares most of its code with the existing H_SET_DABR
implementation. To support this, we add a per-vcpu field to store the
DABRX value plus code to get and set it via the ONE_REG interface.
For Linux guests to use this new hcall, userspace needs to add
"hcall-xdabr" to the set of strings in the /chosen/hypertas-functions
property in the device tree. If userspace does this and then migrates
the guest to a host where the kernel doesn't include this patch, then
userspace will need to implement H_SET_XDABR by writing the specified
DABR value to the DABR using the ONE_REG interface. In that case, the
old kernel will set DABRX to DABRX_USER | DABRX_KERNEL. That should
still work correctly, at least for Linux guests, since Linux guests
cope with getting data breakpoint interrupts in modes that weren't
requested by just ignoring the interrupt, and Linux guests never set
DABRX_BTI.
The other thing this does is to make H_SET_DABR and H_SET_XDABR work
on POWER8, which has the DAWR and DAWRX instead of DABR/X. Guests that
know about POWER8 should use H_SET_MODE rather than H_SET_[X]DABR, but
guests running in POWER7 compatibility mode will still use H_SET_[X]DABR.
For them, this adds the logic to convert DABR/X values into DAWR/X values
on POWER8.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2014-01-08 14:25:29 +04:00
DEFINE ( VCPU_DABRX , offsetof ( struct kvm_vcpu , arch . dabrx ) ) ;
2014-01-08 14:25:21 +04:00
DEFINE ( VCPU_DAWR , offsetof ( struct kvm_vcpu , arch . dawr ) ) ;
DEFINE ( VCPU_DAWRX , offsetof ( struct kvm_vcpu , arch . dawrx ) ) ;
DEFINE ( VCPU_CIABR , offsetof ( struct kvm_vcpu , arch . ciabr ) ) ;
2009-10-30 08:47:18 +03:00
DEFINE ( VCPU_HFLAGS , offsetof ( struct kvm_vcpu , arch . hflags ) ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
DEFINE ( VCPU_DEC , offsetof ( struct kvm_vcpu , arch . dec ) ) ;
DEFINE ( VCPU_DEC_EXPIRES , offsetof ( struct kvm_vcpu , arch . dec_expires ) ) ;
KVM: PPC: Allocate RMAs (Real Mode Areas) at boot for use by guests
This adds infrastructure which will be needed to allow book3s_hv KVM to
run on older POWER processors, including PPC970, which don't support
the Virtual Real Mode Area (VRMA) facility, but only the Real Mode
Offset (RMO) facility. These processors require a physically
contiguous, aligned area of memory for each guest. When the guest does
an access in real mode (MMU off), the address is compared against a
limit value, and if it is lower, the address is ORed with an offset
value (from the Real Mode Offset Register (RMOR)) and the result becomes
the real address for the access. The size of the RMA has to be one of
a set of supported values, which usually includes 64MB, 128MB, 256MB
and some larger powers of 2.
Since we are unlikely to be able to allocate 64MB or more of physically
contiguous memory after the kernel has been running for a while, we
allocate a pool of RMAs at boot time using the bootmem allocator. The
size and number of the RMAs can be set using the kvm_rma_size=xx and
kvm_rma_count=xx kernel command line options.
KVM exports a new capability, KVM_CAP_PPC_RMA, to signal the availability
of the pool of preallocated RMAs. The capability value is 1 if the
processor can use an RMA but doesn't require one (because it supports
the VRMA facility), or 2 if the processor requires an RMA for each guest.
This adds a new ioctl, KVM_ALLOCATE_RMA, which allocates an RMA from the
pool and returns a file descriptor which can be used to map the RMA. It
also returns the size of the RMA in the argument structure.
Having an RMA means we will get multiple KVM_SET_USER_MEMORY_REGION
ioctl calls from userspace. To cope with this, we now preallocate the
kvm->arch.ram_pginfo array when the VM is created with a size sufficient
for up to 64GB of guest memory. Subsequently we will get rid of this
array and use memory associated with each memslot instead.
This moves most of the code that translates the user addresses into
host pfns (page frame numbers) out of kvmppc_prepare_vrma up one level
to kvmppc_core_prepare_memory_region. Also, instead of having to look
up the VMA for each page in order to check the page size, we now check
that the pages we get are compound pages of 16MB. However, if we are
adding memory that is mapped to an RMA, we don't bother with calling
get_user_pages_fast and instead just offset from the base pfn for the
RMA.
Typically the RMA gets added after vcpus are created, which makes it
inconvenient to have the LPCR (logical partition control register) value
in the vcpu->arch struct, since the LPCR controls whether the processor
uses RMA or VRMA for the guest. This moves the LPCR value into the
kvm->arch struct and arranges for the MER (mediated external request)
bit, which is the only bit that varies between vcpus, to be set in
assembly code when going into the guest if there is a pending external
interrupt request.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:25:44 +04:00
DEFINE ( VCPU_PENDING_EXC , offsetof ( struct kvm_vcpu , arch . pending_exceptions ) ) ;
KVM: PPC: Implement H_CEDE hcall for book3s_hv in real-mode code
With a KVM guest operating in SMT4 mode (i.e. 4 hardware threads per
core), whenever a CPU goes idle, we have to pull all the other
hardware threads in the core out of the guest, because the H_CEDE
hcall is handled in the kernel. This is inefficient.
This adds code to book3s_hv_rmhandlers.S to handle the H_CEDE hcall
in real mode. When a guest vcpu does an H_CEDE hcall, we now only
exit to the kernel if all the other vcpus in the same core are also
idle. Otherwise we mark this vcpu as napping, save state that could
be lost in nap mode (mainly GPRs and FPRs), and execute the nap
instruction. When the thread wakes up, because of a decrementer or
external interrupt, we come back in at kvm_start_guest (from the
system reset interrupt vector), find the `napping' flag set in the
paca, and go to the resume path.
This has some other ramifications. First, when starting a core, we
now start all the threads, both those that are immediately runnable and
those that are idle. This is so that we don't have to pull all the
threads out of the guest when an idle thread gets a decrementer interrupt
and wants to start running. In fact the idle threads will all start
with the H_CEDE hcall returning; being idle they will just do another
H_CEDE immediately and go to nap mode.
This required some changes to kvmppc_run_core() and kvmppc_run_vcpu().
These functions have been restructured to make them simpler and clearer.
We introduce a level of indirection in the wait queue that gets woken
when external and decrementer interrupts get generated for a vcpu, so
that we can have the 4 vcpus in a vcore using the same wait queue.
We need this because the 4 vcpus are being handled by one thread.
Secondly, when we need to exit from the guest to the kernel, we now
have to generate an IPI for any napping threads, because an HDEC
interrupt doesn't wake up a napping thread.
Thirdly, we now need to be able to handle virtual external interrupts
and decrementer interrupts becoming pending while a thread is napping,
and deliver those interrupts to the guest when the thread wakes.
This is done in kvmppc_cede_reentry, just before fast_guest_return.
Finally, since we are not using the generic kvm_vcpu_block for book3s_hv,
and hence not calling kvm_arch_vcpu_runnable, we can remove the #ifdef
from kvm_arch_vcpu_runnable.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-07-23 11:42:46 +04:00
DEFINE ( VCPU_CEDED , offsetof ( struct kvm_vcpu , arch . ceded ) ) ;
DEFINE ( VCPU_PRODDED , offsetof ( struct kvm_vcpu , arch . prodded ) ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
DEFINE ( VCPU_MMCR , offsetof ( struct kvm_vcpu , arch . mmcr ) ) ;
DEFINE ( VCPU_PMC , offsetof ( struct kvm_vcpu , arch . pmc ) ) ;
2014-01-08 14:25:21 +04:00
DEFINE ( VCPU_SPMC , offsetof ( struct kvm_vcpu , arch . spmc ) ) ;
2013-09-06 07:11:18 +04:00
DEFINE ( VCPU_SIAR , offsetof ( struct kvm_vcpu , arch . siar ) ) ;
DEFINE ( VCPU_SDAR , offsetof ( struct kvm_vcpu , arch . sdar ) ) ;
2014-01-08 14:25:21 +04:00
DEFINE ( VCPU_SIER , offsetof ( struct kvm_vcpu , arch . sier ) ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
DEFINE ( VCPU_SLB , offsetof ( struct kvm_vcpu , arch . slb ) ) ;
DEFINE ( VCPU_SLB_MAX , offsetof ( struct kvm_vcpu , arch . slb_max ) ) ;
DEFINE ( VCPU_SLB_NR , offsetof ( struct kvm_vcpu , arch . slb_nr ) ) ;
DEFINE ( VCPU_FAULT_DSISR , offsetof ( struct kvm_vcpu , arch . fault_dsisr ) ) ;
DEFINE ( VCPU_FAULT_DAR , offsetof ( struct kvm_vcpu , arch . fault_dar ) ) ;
DEFINE ( VCPU_LAST_INST , offsetof ( struct kvm_vcpu , arch . last_inst ) ) ;
DEFINE ( VCPU_TRAP , offsetof ( struct kvm_vcpu , arch . trap ) ) ;
2013-02-04 22:10:51 +04:00
DEFINE ( VCPU_CFAR , offsetof ( struct kvm_vcpu , arch . cfar ) ) ;
2013-09-20 08:52:39 +04:00
DEFINE ( VCPU_PPR , offsetof ( struct kvm_vcpu , arch . ppr ) ) ;
2014-01-08 14:25:21 +04:00
DEFINE ( VCPU_FSCR , offsetof ( struct kvm_vcpu , arch . fscr ) ) ;
DEFINE ( VCPU_PSPB , offsetof ( struct kvm_vcpu , arch . pspb ) ) ;
DEFINE ( VCPU_EBBHR , offsetof ( struct kvm_vcpu , arch . ebbhr ) ) ;
DEFINE ( VCPU_EBBRR , offsetof ( struct kvm_vcpu , arch . ebbrr ) ) ;
DEFINE ( VCPU_BESCR , offsetof ( struct kvm_vcpu , arch . bescr ) ) ;
DEFINE ( VCPU_CSIGR , offsetof ( struct kvm_vcpu , arch . csigr ) ) ;
DEFINE ( VCPU_TACR , offsetof ( struct kvm_vcpu , arch . tacr ) ) ;
DEFINE ( VCPU_TCSCR , offsetof ( struct kvm_vcpu , arch . tcscr ) ) ;
DEFINE ( VCPU_ACOP , offsetof ( struct kvm_vcpu , arch . acop ) ) ;
DEFINE ( VCPU_WORT , offsetof ( struct kvm_vcpu , arch . wort ) ) ;
KVM: PPC: Book3S PR: Keep volatile reg values in vcpu rather than shadow_vcpu
Currently PR-style KVM keeps the volatile guest register values
(R0 - R13, CR, LR, CTR, XER, PC) in a shadow_vcpu struct rather than
the main kvm_vcpu struct. For 64-bit, the shadow_vcpu exists in two
places, a kmalloc'd struct and in the PACA, and it gets copied back
and forth in kvmppc_core_vcpu_load/put(), because the real-mode code
can't rely on being able to access the kmalloc'd struct.
This changes the code to copy the volatile values into the shadow_vcpu
as one of the last things done before entering the guest. Similarly
the values are copied back out of the shadow_vcpu to the kvm_vcpu
immediately after exiting the guest. We arrange for interrupts to be
still disabled at this point so that we can't get preempted on 64-bit
and end up copying values from the wrong PACA.
This means that the accessor functions in kvm_book3s.h for these
registers are greatly simplified, and are the same between PR and HV KVM.
In places where accesses to shadow_vcpu fields are now replaced by
accesses to the kvm_vcpu, we can also remove the svcpu_get/put pairs.
Finally, on 64-bit, we don't need the kmalloc'd struct at all any more.
With this, the time to read the PVR one million times in a loop went
from 567.7ms to 575.5ms (averages of 6 values), an increase of about
1.4% for this worst-case test for guest entries and exits.  The
standard deviation of the measurements is about 11ms, so the
difference is only marginally significant statistically.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2013-09-20 08:52:43 +04:00
DEFINE ( VCPU_SHADOW_SRR1 , offsetof ( struct kvm_vcpu , arch . shadow_srr1 ) ) ;
KVM: PPC: Allow book3s_hv guests to use SMT processor modes
This lifts the restriction that book3s_hv guests can only run one
hardware thread per core, and allows them to use up to 4 threads
per core on POWER7. The host still has to run single-threaded.
This capability is advertised to qemu through a new KVM_CAP_PPC_SMT
capability. The return value of the ioctl querying this capability
is the number of vcpus per virtual CPU core (vcore), currently 4.
To use this, the host kernel should be booted with all threads
active, and then all the secondary threads should be offlined.
This will put the secondary threads into nap mode. KVM will then
wake them from nap mode and use them for running guest code (while
they are still offline). To wake the secondary threads, we send
them an IPI using a new xics_wake_cpu() function, implemented in
arch/powerpc/sysdev/xics/icp-native.c. In other words, at this stage
we assume that the platform has a XICS interrupt controller and
we are using icp-native.c to drive it. Since the woken thread will
need to acknowledge and clear the IPI, we also export the base
physical address of the XICS registers using kvmppc_set_xics_phys()
for use in the low-level KVM book3s code.
When a vcpu is created, it is assigned to a virtual CPU core.
The vcore number is obtained by dividing the vcpu number by the
number of threads per core in the host. This number is exported
to userspace via the KVM_CAP_PPC_SMT capability. If qemu wishes
to run the guest in single-threaded mode, it should make all vcpu
numbers be multiples of the number of threads per core.
We distinguish three states of a vcpu: runnable (i.e., ready to execute
the guest), blocked (that is, idle), and busy in host. We currently
implement a policy that the vcore can run only when all its threads
are runnable or blocked. This way, if a vcpu needs to execute elsewhere
in the kernel or in qemu, it can do so without being starved of CPU
by the other vcpus.
When a vcore starts to run, it executes in the context of one of the
vcpu threads. The other vcpu threads all go to sleep and stay asleep
until something happens requiring the vcpu thread to return to qemu,
or to wake up to run the vcore (this can happen when another vcpu
thread goes from busy in host state to blocked).
It can happen that a vcpu goes from blocked to runnable state (e.g.
because of an interrupt), and the vcore it belongs to is already
running. In that case it can start to run immediately as long as
none of the vcpus in the vcore have started to exit the guest.
We send the next free thread in the vcore an IPI to get it to start
to execute the guest. It synchronizes with the other threads via
the vcore->entry_exit_count field to make sure that it doesn't go
into the guest if the other vcpus are exiting by the time that it
is ready to actually enter the guest.
Note that there is no fixed relationship between the hardware thread
number and the vcpu number. Hardware threads are assigned to vcpus
as they become runnable, so we will always use the lower-numbered
hardware threads in preference to higher-numbered threads if not all
the vcpus in the vcore are runnable, regardless of which vcpus are
runnable.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:23:08 +04:00
DEFINE ( VCORE_ENTRY_EXIT , offsetof ( struct kvmppc_vcore , entry_exit_count ) ) ;
DEFINE ( VCORE_NAP_COUNT , offsetof ( struct kvmppc_vcore , nap_count ) ) ;
DEFINE ( VCORE_IN_GUEST , offsetof ( struct kvmppc_vcore , in_guest ) ) ;
KVM: PPC: Implement H_CEDE hcall for book3s_hv in real-mode code
With a KVM guest operating in SMT4 mode (i.e. 4 hardware threads per
core), whenever a CPU goes idle, we have to pull all the other
hardware threads in the core out of the guest, because the H_CEDE
hcall is handled in the kernel. This is inefficient.
This adds code to book3s_hv_rmhandlers.S to handle the H_CEDE hcall
in real mode. When a guest vcpu does an H_CEDE hcall, we now only
exit to the kernel if all the other vcpus in the same core are also
idle. Otherwise we mark this vcpu as napping, save state that could
be lost in nap mode (mainly GPRs and FPRs), and execute the nap
instruction. When the thread wakes up, because of a decrementer or
external interrupt, we come back in at kvm_start_guest (from the
system reset interrupt vector), find the `napping' flag set in the
paca, and go to the resume path.
This has some other ramifications. First, when starting a core, we
now start all the threads, both those that are immediately runnable and
those that are idle. This is so that we don't have to pull all the
threads out of the guest when an idle thread gets a decrementer interrupt
and wants to start running. In fact the idle threads will all start
with the H_CEDE hcall returning; being idle they will just do another
H_CEDE immediately and go to nap mode.
This required some changes to kvmppc_run_core() and kvmppc_run_vcpu().
These functions have been restructured to make them simpler and clearer.
We introduce a level of indirection in the wait queue that gets woken
when external and decrementer interrupts get generated for a vcpu, so
that we can have the 4 vcpus in a vcore using the same wait queue.
We need this because the 4 vcpus are being handled by one thread.
Secondly, when we need to exit from the guest to the kernel, we now
have to generate an IPI for any napping threads, because an HDEC
interrupt doesn't wake up a napping thread.
Thirdly, we now need to be able to handle virtual external interrupts
and decrementer interrupts becoming pending while a thread is napping,
and deliver those interrupts to the guest when the thread wakes.
This is done in kvmppc_cede_reentry, just before fast_guest_return.
Finally, since we are not using the generic kvm_vcpu_block for book3s_hv,
and hence not calling kvm_arch_vcpu_runnable, we can remove the #ifdef
from kvm_arch_vcpu_runnable.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-07-23 11:42:46 +04:00
DEFINE ( VCORE_NAPPING_THREADS , offsetof ( struct kvmppc_vcore , napping_threads ) ) ;
KVM: PPC: Book3S HV: Align physical and virtual CPU thread numbers
On a threaded processor such as POWER7, we group VCPUs into virtual
cores and arrange that the VCPUs in a virtual core run on the same
physical core. Currently we don't enforce any correspondence between
virtual thread numbers within a virtual core and physical thread
numbers. Physical threads are allocated starting at 0 on a first-come
first-served basis to runnable virtual threads (VCPUs).
POWER8 implements a new "msgsndp" instruction which guest kernels can
use to interrupt other threads in the same core or sub-core. Since
the instruction takes the destination physical thread ID as a parameter,
it becomes necessary to align the physical thread IDs with the virtual
thread IDs, that is, to make sure virtual thread N within a virtual
core always runs on physical thread N.
This means that it's possible that thread 0, which is where we call
__kvmppc_vcore_entry, may end up running some other vcpu than the
one whose task called kvmppc_run_core(), or it may end up running
no vcpu at all, if for example thread 0 of the virtual core is
currently executing in userspace. However, we do need thread 0
to be responsible for switching the MMU -- a previous version of
this patch that had other threads switching the MMU was found to
be responsible for occasional memory corruption and machine check
interrupts in the guest on POWER7 machines.
To accommodate this, we no longer pass the vcpu pointer to
__kvmppc_vcore_entry, but instead let the assembly code load it from
the PACA. Since the assembly code will need to know the kvm pointer
and the thread ID for threads which don't have a vcpu, we move the
thread ID into the PACA and we add a kvm pointer to the virtual core
structure.
In the case where thread 0 has no vcpu to run, it still calls into
kvmppc_hv_entry in order to do the MMU switch, and then naps until
either its vcpu is ready to run in the guest, or some other thread
needs to exit the guest. In the latter case, thread 0 jumps to the
code that switches the MMU back to the host. This control flow means
that now we switch the MMU before loading any guest vcpu state.
Similarly, on guest exit we now save all the guest vcpu state before
switching the MMU back to the host. This has required substantial
code movement, making the diff rather large.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2014-01-08 14:25:20 +04:00
DEFINE ( VCORE_KVM , offsetof ( struct kvmppc_vcore , kvm ) ) ;
KVM: PPC: Book3S HV: Implement timebase offset for guests
This allows guests to have a different timebase origin from the host.
This is needed for migration, where a guest can migrate from one host
to another and the two hosts might have a different timebase origin.
However, the timebase seen by the guest must not go backwards, and
should go forwards only by a small amount corresponding to the time
taken for the migration.
Therefore this provides a new per-vcpu value accessed via the one_reg
interface using the new KVM_REG_PPC_TB_OFFSET identifier. This value
defaults to 0 and is not modified by KVM. On entering the guest, this
value is added onto the timebase, and on exiting the guest, it is
subtracted from the timebase.
This is only supported for recent POWER hardware which has the TBU40
(timebase upper 40 bits) register. Writing to the TBU40 register only
alters the upper 40 bits of the timebase, leaving the lower 24 bits
unchanged. This provides a way to modify the timebase for guest
migration without disturbing the synchronization of the timebase
registers across CPU cores. The kernel rounds up the value given
to a multiple of 2^24.
Timebase values stored in KVM structures (struct kvm_vcpu, struct
kvmppc_vcore, etc.) are stored as host timebase values. The timebase
values in the dispatch trace log need to be guest timebase values,
however, since that is read directly by the guest. This moves the
setting of vcpu->arch.dec_expires on guest exit to a point after we
have restored the host timebase so that vcpu->arch.dec_expires is a
host timebase value.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2013-09-06 07:17:46 +04:00
DEFINE ( VCORE_TB_OFFSET , offsetof ( struct kvmppc_vcore , tb_offset ) ) ;
2013-09-20 08:52:38 +04:00
DEFINE ( VCORE_LPCR , offsetof ( struct kvmppc_vcore , lpcr ) ) ;
2013-09-21 08:35:02 +04:00
DEFINE ( VCORE_PCR , offsetof ( struct kvmppc_vcore , pcr ) ) ;
2014-01-08 14:25:21 +04:00
DEFINE ( VCORE_DPDES , offsetof ( struct kvmppc_vcore , dpdes ) ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
DEFINE ( VCPU_SLB_E , offsetof ( struct kvmppc_slb , orige ) ) ;
DEFINE ( VCPU_SLB_V , offsetof ( struct kvmppc_slb , origv ) ) ;
DEFINE ( VCPU_SLB_SIZE , sizeof ( struct kvmppc_slb ) ) ;
2014-01-08 14:25:32 +04:00
# ifdef CONFIG_PPC_TRANSACTIONAL_MEM
DEFINE ( VCPU_TFHAR , offsetof ( struct kvm_vcpu , arch . tfhar ) ) ;
DEFINE ( VCPU_TFIAR , offsetof ( struct kvm_vcpu , arch . tfiar ) ) ;
DEFINE ( VCPU_TEXASR , offsetof ( struct kvm_vcpu , arch . texasr ) ) ;
DEFINE ( VCPU_GPR_TM , offsetof ( struct kvm_vcpu , arch . gpr_tm ) ) ;
DEFINE ( VCPU_FPRS_TM , offsetof ( struct kvm_vcpu , arch . fp_tm . fpr ) ) ;
DEFINE ( VCPU_VRS_TM , offsetof ( struct kvm_vcpu , arch . vr_tm . vr ) ) ;
DEFINE ( VCPU_VRSAVE_TM , offsetof ( struct kvm_vcpu , arch . vrsave_tm ) ) ;
DEFINE ( VCPU_CR_TM , offsetof ( struct kvm_vcpu , arch . cr_tm ) ) ;
DEFINE ( VCPU_LR_TM , offsetof ( struct kvm_vcpu , arch . lr_tm ) ) ;
DEFINE ( VCPU_CTR_TM , offsetof ( struct kvm_vcpu , arch . ctr_tm ) ) ;
DEFINE ( VCPU_AMR_TM , offsetof ( struct kvm_vcpu , arch . amr_tm ) ) ;
DEFINE ( VCPU_PPR_TM , offsetof ( struct kvm_vcpu , arch . ppr_tm ) ) ;
DEFINE ( VCPU_DSCR_TM , offsetof ( struct kvm_vcpu , arch . dscr_tm ) ) ;
DEFINE ( VCPU_TAR_TM , offsetof ( struct kvm_vcpu , arch . tar_tm ) ) ;
# endif
2011-06-29 04:20:58 +04:00
# ifdef CONFIG_PPC_BOOK3S_64
2013-10-07 20:47:51 +04:00
# ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
KVM: PPC: Book3S PR: Keep volatile reg values in vcpu rather than shadow_vcpu
Currently PR-style KVM keeps the volatile guest register values
(R0 - R13, CR, LR, CTR, XER, PC) in a shadow_vcpu struct rather than
the main kvm_vcpu struct. For 64-bit, the shadow_vcpu exists in two
places, a kmalloc'd struct and in the PACA, and it gets copied back
and forth in kvmppc_core_vcpu_load/put(), because the real-mode code
can't rely on being able to access the kmalloc'd struct.
This changes the code to copy the volatile values into the shadow_vcpu
as one of the last things done before entering the guest. Similarly
the values are copied back out of the shadow_vcpu to the kvm_vcpu
immediately after exiting the guest. We arrange for interrupts to be
still disabled at this point so that we can't get preempted on 64-bit
and end up copying values from the wrong PACA.
This means that the accessor functions in kvm_book3s.h for these
registers are greatly simplified, and are the same between PR and HV KVM.
In places where accesses to shadow_vcpu fields are now replaced by
accesses to the kvm_vcpu, we can also remove the svcpu_get/put pairs.
Finally, on 64-bit, we don't need the kmalloc'd struct at all any more.
With this, the time to read the PVR one million times in a loop went
from 567.7ms to 575.5ms (averages of 6 values), an increase of about
1.4% for this worst-case test for guest entries and exits.  The
standard deviation of the measurements is about 11ms, so the
difference is only marginally significant statistically.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2013-09-20 08:52:43 +04:00
DEFINE ( PACA_SVCPU , offsetof ( struct paca_struct , shadow_vcpu ) ) ;
2011-06-29 04:20:58 +04:00
# define SVCPU_FIELD(x, f) DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f))
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
# else
# define SVCPU_FIELD(x, f)
# endif
2011-06-29 04:20:58 +04:00
# define HSTATE_FIELD(x, f) DEFINE(x, offsetof(struct paca_struct, kvm_hstate.f))
# else /* 32-bit */
# define SVCPU_FIELD(x, f) DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, f))
# define HSTATE_FIELD(x, f) DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, hstate.f))
# endif
SVCPU_FIELD ( SVCPU_CR , cr ) ;
SVCPU_FIELD ( SVCPU_XER , xer ) ;
SVCPU_FIELD ( SVCPU_CTR , ctr ) ;
SVCPU_FIELD ( SVCPU_LR , lr ) ;
SVCPU_FIELD ( SVCPU_PC , pc ) ;
SVCPU_FIELD ( SVCPU_R0 , gpr [ 0 ] ) ;
SVCPU_FIELD ( SVCPU_R1 , gpr [ 1 ] ) ;
SVCPU_FIELD ( SVCPU_R2 , gpr [ 2 ] ) ;
SVCPU_FIELD ( SVCPU_R3 , gpr [ 3 ] ) ;
SVCPU_FIELD ( SVCPU_R4 , gpr [ 4 ] ) ;
SVCPU_FIELD ( SVCPU_R5 , gpr [ 5 ] ) ;
SVCPU_FIELD ( SVCPU_R6 , gpr [ 6 ] ) ;
SVCPU_FIELD ( SVCPU_R7 , gpr [ 7 ] ) ;
SVCPU_FIELD ( SVCPU_R8 , gpr [ 8 ] ) ;
SVCPU_FIELD ( SVCPU_R9 , gpr [ 9 ] ) ;
SVCPU_FIELD ( SVCPU_R10 , gpr [ 10 ] ) ;
SVCPU_FIELD ( SVCPU_R11 , gpr [ 11 ] ) ;
SVCPU_FIELD ( SVCPU_R12 , gpr [ 12 ] ) ;
SVCPU_FIELD ( SVCPU_R13 , gpr [ 13 ] ) ;
SVCPU_FIELD ( SVCPU_FAULT_DSISR , fault_dsisr ) ;
SVCPU_FIELD ( SVCPU_FAULT_DAR , fault_dar ) ;
SVCPU_FIELD ( SVCPU_LAST_INST , last_inst ) ;
SVCPU_FIELD ( SVCPU_SHADOW_SRR1 , shadow_srr1 ) ;
2010-04-16 02:11:44 +04:00
# ifdef CONFIG_PPC_BOOK3S_32
2011-06-29 04:20:58 +04:00
SVCPU_FIELD ( SVCPU_SR , sr ) ;
2010-04-16 02:11:44 +04:00
# endif
2011-06-29 04:20:58 +04:00
# ifdef CONFIG_PPC64
SVCPU_FIELD ( SVCPU_SLB , slb ) ;
SVCPU_FIELD ( SVCPU_SLB_MAX , slb_max ) ;
# endif
HSTATE_FIELD ( HSTATE_HOST_R1 , host_r1 ) ;
HSTATE_FIELD ( HSTATE_HOST_R2 , host_r2 ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
HSTATE_FIELD ( HSTATE_HOST_MSR , host_msr ) ;
2011-06-29 04:20:58 +04:00
HSTATE_FIELD ( HSTATE_VMHANDLER , vmhandler ) ;
HSTATE_FIELD ( HSTATE_SCRATCH0 , scratch0 ) ;
HSTATE_FIELD ( HSTATE_SCRATCH1 , scratch1 ) ;
2013-11-11 17:59:47 +04:00
HSTATE_FIELD ( HSTATE_SCRATCH2 , scratch2 ) ;
2011-06-29 04:20:58 +04:00
HSTATE_FIELD ( HSTATE_IN_GUEST , in_guest ) ;
2011-07-23 11:41:44 +04:00
HSTATE_FIELD ( HSTATE_RESTORE_HID5 , restore_hid5 ) ;
KVM: PPC: Implement H_CEDE hcall for book3s_hv in real-mode code
With a KVM guest operating in SMT4 mode (i.e. 4 hardware threads per
core), whenever a CPU goes idle, we have to pull all the other
hardware threads in the core out of the guest, because the H_CEDE
hcall is handled in the kernel. This is inefficient.
This adds code to book3s_hv_rmhandlers.S to handle the H_CEDE hcall
in real mode. When a guest vcpu does an H_CEDE hcall, we now only
exit to the kernel if all the other vcpus in the same core are also
idle. Otherwise we mark this vcpu as napping, save state that could
be lost in nap mode (mainly GPRs and FPRs), and execute the nap
instruction. When the thread wakes up, because of a decrementer or
external interrupt, we come back in at kvm_start_guest (from the
system reset interrupt vector), find the `napping' flag set in the
paca, and go to the resume path.
This has some other ramifications. First, when starting a core, we
now start all the threads, both those that are immediately runnable and
those that are idle. This is so that we don't have to pull all the
threads out of the guest when an idle thread gets a decrementer interrupt
and wants to start running. In fact the idle threads will all start
with the H_CEDE hcall returning; being idle they will just do another
H_CEDE immediately and go to nap mode.
This required some changes to kvmppc_run_core() and kvmppc_run_vcpu().
These functions have been restructured to make them simpler and clearer.
We introduce a level of indirection in the wait queue that gets woken
when external and decrementer interrupts get generated for a vcpu, so
that we can have the 4 vcpus in a vcore using the same wait queue.
We need this because the 4 vcpus are being handled by one thread.
Secondly, when we need to exit from the guest to the kernel, we now
have to generate an IPI for any napping threads, because an HDEC
interrupt doesn't wake up a napping thread.
Thirdly, we now need to be able to handle virtual external interrupts
and decrementer interrupts becoming pending while a thread is napping,
and deliver those interrupts to the guest when the thread wakes.
This is done in kvmppc_cede_reentry, just before fast_guest_return.
Finally, since we are not using the generic kvm_vcpu_block for book3s_hv,
and hence not calling kvm_arch_vcpu_runnable, we can remove the #ifdef
from kvm_arch_vcpu_runnable.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-07-23 11:42:46 +04:00
HSTATE_FIELD ( HSTATE_NAPPING , napping ) ;
2011-06-29 04:20:58 +04:00
2013-10-07 20:47:52 +04:00
# ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
2012-03-06 01:42:25 +04:00
HSTATE_FIELD ( HSTATE_HWTHREAD_REQ , hwthread_req ) ;
HSTATE_FIELD ( HSTATE_HWTHREAD_STATE , hwthread_state ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
HSTATE_FIELD ( HSTATE_KVM_VCPU , kvm_vcpu ) ;
KVM: PPC: Allow book3s_hv guests to use SMT processor modes
This lifts the restriction that book3s_hv guests can only run one
hardware thread per core, and allows them to use up to 4 threads
per core on POWER7. The host still has to run single-threaded.
This capability is advertised to qemu through a new KVM_CAP_PPC_SMT
capability. The return value of the ioctl querying this capability
is the number of vcpus per virtual CPU core (vcore), currently 4.
To use this, the host kernel should be booted with all threads
active, and then all the secondary threads should be offlined.
This will put the secondary threads into nap mode. KVM will then
wake them from nap mode and use them for running guest code (while
they are still offline). To wake the secondary threads, we send
them an IPI using a new xics_wake_cpu() function, implemented in
arch/powerpc/sysdev/xics/icp-native.c. In other words, at this stage
we assume that the platform has a XICS interrupt controller and
we are using icp-native.c to drive it. Since the woken thread will
need to acknowledge and clear the IPI, we also export the base
physical address of the XICS registers using kvmppc_set_xics_phys()
for use in the low-level KVM book3s code.
When a vcpu is created, it is assigned to a virtual CPU core.
The vcore number is obtained by dividing the vcpu number by the
number of threads per core in the host. This number is exported
to userspace via the KVM_CAP_PPC_SMT capability. If qemu wishes
to run the guest in single-threaded mode, it should make all vcpu
numbers be multiples of the number of threads per core.
We distinguish three states of a vcpu: runnable (i.e., ready to execute
the guest), blocked (that is, idle), and busy in host. We currently
implement a policy that the vcore can run only when all its threads
are runnable or blocked. This way, if a vcpu needs to execute elsewhere
in the kernel or in qemu, it can do so without being starved of CPU
by the other vcpus.
When a vcore starts to run, it executes in the context of one of the
vcpu threads. The other vcpu threads all go to sleep and stay asleep
until something happens requiring the vcpu thread to return to qemu,
or to wake up to run the vcore (this can happen when another vcpu
thread goes from busy in host state to blocked).
It can happen that a vcpu goes from blocked to runnable state (e.g.
because of an interrupt), and the vcore it belongs to is already
running. In that case it can start to run immediately as long as
none of the vcpus in the vcore have started to exit the guest.
We send the next free thread in the vcore an IPI to get it to start
to execute the guest. It synchronizes with the other threads via
the vcore->entry_exit_count field to make sure that it doesn't go
into the guest if the other vcpus are exiting by the time that it
is ready to actually enter the guest.
Note that there is no fixed relationship between the hardware thread
number and the vcpu number. Hardware threads are assigned to vcpus
as they become runnable, so we will always use the lower-numbered
hardware threads in preference to higher-numbered threads if not all
the vcpus in the vcore are runnable, regardless of which vcpus are
runnable.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:23:08 +04:00
HSTATE_FIELD ( HSTATE_KVM_VCORE , kvm_vcore ) ;
HSTATE_FIELD ( HSTATE_XICS_PHYS , xics_phys ) ;
2013-04-18 00:30:50 +04:00
HSTATE_FIELD ( HSTATE_SAVED_XIRR , saved_xirr ) ;
HSTATE_FIELD ( HSTATE_HOST_IPI , host_ipi ) ;
KVM: PPC: Book3S HV: Align physical and virtual CPU thread numbers
On a threaded processor such as POWER7, we group VCPUs into virtual
cores and arrange that the VCPUs in a virtual core run on the same
physical core. Currently we don't enforce any correspondence between
virtual thread numbers within a virtual core and physical thread
numbers. Physical threads are allocated starting at 0 on a first-come
first-served basis to runnable virtual threads (VCPUs).
POWER8 implements a new "msgsndp" instruction which guest kernels can
use to interrupt other threads in the same core or sub-core. Since
the instruction takes the destination physical thread ID as a parameter,
it becomes necessary to align the physical thread IDs with the virtual
thread IDs, that is, to make sure virtual thread N within a virtual
core always runs on physical thread N.
This means that it's possible that thread 0, which is where we call
__kvmppc_vcore_entry, may end up running some other vcpu than the
one whose task called kvmppc_run_core(), or it may end up running
no vcpu at all, if for example thread 0 of the virtual core is
currently executing in userspace. However, we do need thread 0
to be responsible for switching the MMU -- a previous version of
this patch that had other threads switching the MMU was found to
be responsible for occasional memory corruption and machine check
interrupts in the guest on POWER7 machines.
To accommodate this, we no longer pass the vcpu pointer to
__kvmppc_vcore_entry, but instead let the assembly code load it from
the PACA. Since the assembly code will need to know the kvm pointer
and the thread ID for threads which don't have a vcpu, we move the
thread ID into the PACA and we add a kvm pointer to the virtual core
structure.
In the case where thread 0 has no vcpu to run, it still calls into
kvmppc_hv_entry in order to do the MMU switch, and then naps until
either its vcpu is ready to run in the guest, or some other thread
needs to exit the guest. In the latter case, thread 0 jumps to the
code that switches the MMU back to the host. This control flow means
that now we switch the MMU before loading any guest vcpu state.
Similarly, on guest exit we now save all the guest vcpu state before
switching the MMU back to the host. This has required substantial
code movement, making the diff rather large.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2014-01-08 14:25:20 +04:00
HSTATE_FIELD ( HSTATE_PTID , ptid ) ;
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
HSTATE_FIELD ( HSTATE_MMCR , host_mmcr ) ;
HSTATE_FIELD ( HSTATE_PMC , host_pmc ) ;
HSTATE_FIELD ( HSTATE_PURR , host_purr ) ;
HSTATE_FIELD ( HSTATE_SPURR , host_spurr ) ;
HSTATE_FIELD ( HSTATE_DSCR , host_dscr ) ;
HSTATE_FIELD ( HSTATE_DABR , dabr ) ;
HSTATE_FIELD ( HSTATE_DECEXP , dec_expires ) ;
KVM: PPC: Implement H_CEDE hcall for book3s_hv in real-mode code
With a KVM guest operating in SMT4 mode (i.e. 4 hardware threads per
core), whenever a CPU goes idle, we have to pull all the other
hardware threads in the core out of the guest, because the H_CEDE
hcall is handled in the kernel. This is inefficient.
This adds code to book3s_hv_rmhandlers.S to handle the H_CEDE hcall
in real mode. When a guest vcpu does an H_CEDE hcall, we now only
exit to the kernel if all the other vcpus in the same core are also
idle. Otherwise we mark this vcpu as napping, save state that could
be lost in nap mode (mainly GPRs and FPRs), and execute the nap
instruction. When the thread wakes up, because of a decrementer or
external interrupt, we come back in at kvm_start_guest (from the
system reset interrupt vector), find the `napping' flag set in the
paca, and go to the resume path.
This has some other ramifications. First, when starting a core, we
now start all the threads, both those that are immediately runnable and
those that are idle. This is so that we don't have to pull all the
threads out of the guest when an idle thread gets a decrementer interrupt
and wants to start running. In fact the idle threads will all start
with the H_CEDE hcall returning; being idle they will just do another
H_CEDE immediately and go to nap mode.
This required some changes to kvmppc_run_core() and kvmppc_run_vcpu().
These functions have been restructured to make them simpler and clearer.
We introduce a level of indirection in the wait queue that gets woken
when external and decrementer interrupts get generated for a vcpu, so
that we can have the 4 vcpus in a vcore using the same wait queue.
We need this because the 4 vcpus are being handled by one thread.
Secondly, when we need to exit from the guest to the kernel, we now
have to generate an IPI for any napping threads, because an HDEC
interrupt doesn't wake up a napping thread.
Thirdly, we now need to be able to handle virtual external interrupts
and decrementer interrupts becoming pending while a thread is napping,
and deliver those interrupts to the guest when the thread wakes.
This is done in kvmppc_cede_reentry, just before fast_guest_return.
Finally, since we are not using the generic kvm_vcpu_block for book3s_hv,
and hence not calling kvm_arch_vcpu_runnable, we can remove the #ifdef
from kvm_arch_vcpu_runnable.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-07-23 11:42:46 +04:00
DEFINE ( IPI_PRIORITY , IPI_PRIORITY ) ;
2013-10-07 20:47:52 +04:00
# endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
2013-02-04 22:10:51 +04:00
# ifdef CONFIG_PPC_BOOK3S_64
HSTATE_FIELD ( HSTATE_CFAR , cfar ) ;
2013-09-20 08:52:39 +04:00
HSTATE_FIELD ( HSTATE_PPR , ppr ) ;
2013-02-04 22:10:51 +04:00
# endif /* CONFIG_PPC_BOOK3S_64 */
2011-06-29 04:20:58 +04:00
# else /* CONFIG_PPC_BOOK3S */
2010-01-08 04:58:03 +03:00
DEFINE ( VCPU_CR , offsetof ( struct kvm_vcpu , arch . cr ) ) ;
DEFINE ( VCPU_XER , offsetof ( struct kvm_vcpu , arch . xer ) ) ;
2010-04-16 02:11:44 +04:00
DEFINE ( VCPU_LR , offsetof ( struct kvm_vcpu , arch . lr ) ) ;
DEFINE ( VCPU_CTR , offsetof ( struct kvm_vcpu , arch . ctr ) ) ;
DEFINE ( VCPU_PC , offsetof ( struct kvm_vcpu , arch . pc ) ) ;
DEFINE ( VCPU_LAST_INST , offsetof ( struct kvm_vcpu , arch . last_inst ) ) ;
DEFINE ( VCPU_FAULT_DEAR , offsetof ( struct kvm_vcpu , arch . fault_dear ) ) ;
DEFINE ( VCPU_FAULT_ESR , offsetof ( struct kvm_vcpu , arch . fault_esr ) ) ;
2013-02-27 22:13:10 +04:00
DEFINE ( VCPU_CRIT_SAVE , offsetof ( struct kvm_vcpu , arch . crit_save ) ) ;
2010-04-16 02:11:42 +04:00
# endif /* CONFIG_PPC_BOOK3S */
2011-06-29 04:20:58 +04:00
# endif /* CONFIG_KVM */
2010-07-29 16:47:57 +04:00
# ifdef CONFIG_KVM_GUEST
DEFINE ( KVM_MAGIC_SCRATCH1 , offsetof ( struct kvm_vcpu_arch_shared ,
scratch1 ) ) ;
DEFINE ( KVM_MAGIC_SCRATCH2 , offsetof ( struct kvm_vcpu_arch_shared ,
scratch2 ) ) ;
DEFINE ( KVM_MAGIC_SCRATCH3 , offsetof ( struct kvm_vcpu_arch_shared ,
scratch3 ) ) ;
DEFINE ( KVM_MAGIC_INT , offsetof ( struct kvm_vcpu_arch_shared ,
int_pending ) ) ;
DEFINE ( KVM_MAGIC_MSR , offsetof ( struct kvm_vcpu_arch_shared , msr ) ) ;
DEFINE ( KVM_MAGIC_CRITICAL , offsetof ( struct kvm_vcpu_arch_shared ,
critical ) ) ;
2010-08-03 12:39:35 +04:00
DEFINE ( KVM_MAGIC_SR , offsetof ( struct kvm_vcpu_arch_shared , sr ) ) ;
2010-07-29 16:47:57 +04:00
# endif
2008-12-11 04:55:41 +03:00
# ifdef CONFIG_44x
DEFINE ( PGD_T_LOG2 , PGD_T_LOG2 ) ;
DEFINE ( PTE_T_LOG2 , PTE_T_LOG2 ) ;
# endif
2009-10-17 03:48:40 +04:00
# ifdef CONFIG_PPC_FSL_BOOK3E
2010-05-13 23:38:21 +04:00
DEFINE ( TLBCAM_SIZE , sizeof ( struct tlbcam ) ) ;
DEFINE ( TLBCAM_MAS0 , offsetof ( struct tlbcam , MAS0 ) ) ;
DEFINE ( TLBCAM_MAS1 , offsetof ( struct tlbcam , MAS1 ) ) ;
DEFINE ( TLBCAM_MAS2 , offsetof ( struct tlbcam , MAS2 ) ) ;
DEFINE ( TLBCAM_MAS3 , offsetof ( struct tlbcam , MAS3 ) ) ;
DEFINE ( TLBCAM_MAS7 , offsetof ( struct tlbcam , MAS7 ) ) ;
# endif
2008-04-17 08:28:09 +04:00
2011-06-15 03:34:31 +04:00
# if defined(CONFIG_KVM) && defined(CONFIG_SPE)
DEFINE ( VCPU_EVR , offsetof ( struct kvm_vcpu , arch . evr [ 0 ] ) ) ;
DEFINE ( VCPU_ACC , offsetof ( struct kvm_vcpu , arch . acc ) ) ;
DEFINE ( VCPU_SPEFSCR , offsetof ( struct kvm_vcpu , arch . spefscr ) ) ;
DEFINE ( VCPU_HOST_SPEFSCR , offsetof ( struct kvm_vcpu , arch . host_spefscr ) ) ;
# endif
2011-12-20 19:34:43 +04:00
# ifdef CONFIG_KVM_BOOKE_HV
DEFINE ( VCPU_HOST_MAS4 , offsetof ( struct kvm_vcpu , arch . host_mas4 ) ) ;
DEFINE ( VCPU_HOST_MAS6 , offsetof ( struct kvm_vcpu , arch . host_mas6 ) ) ;
DEFINE ( VCPU_EPLC , offsetof ( struct kvm_vcpu , arch . eplc ) ) ;
# endif
2008-12-03 00:51:57 +03:00
# ifdef CONFIG_KVM_EXIT_TIMING
DEFINE ( VCPU_TIMING_EXIT_TBU , offsetof ( struct kvm_vcpu ,
arch . timing_exit . tv32 . tbu ) ) ;
DEFINE ( VCPU_TIMING_EXIT_TBL , offsetof ( struct kvm_vcpu ,
arch . timing_exit . tv32 . tbl ) ) ;
DEFINE ( VCPU_TIMING_LAST_ENTER_TBU , offsetof ( struct kvm_vcpu ,
arch . timing_last_enter . tv32 . tbu ) ) ;
DEFINE ( VCPU_TIMING_LAST_ENTER_TBL , offsetof ( struct kvm_vcpu ,
arch . timing_last_enter . tv32 . tbl ) ) ;
# endif
2011-09-19 21:45:04 +04:00
# ifdef CONFIG_PPC_POWERNV
DEFINE ( OPAL_MC_GPR3 , offsetof ( struct opal_machine_check_event , gpr3 ) ) ;
DEFINE ( OPAL_MC_SRR0 , offsetof ( struct opal_machine_check_event , srr0 ) ) ;
DEFINE ( OPAL_MC_SRR1 , offsetof ( struct opal_machine_check_event , srr1 ) ) ;
DEFINE ( PACA_OPAL_MC_EVT , offsetof ( struct paca_struct , opal_mc_evt ) ) ;
# endif
2005-09-26 10:04:21 +04:00
return 0 ;
}