2019-06-04 10:11:33 +02:00
// SPDX-License-Identifier: GPL-2.0-only
2009-10-30 05:47:10 +00:00
/*
 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
 *
 * Authors:
 *    Alexander Graf <agraf@suse.de>
 *    Kevin Wolf <mail@kevin-wolf.de>
 *
 * Description:
 * This file is derived from arch/powerpc/kvm/44x.c,
 * by Hollis Blanchard <hollisb@us.ibm.com>.
 */
# include <linux/kvm_host.h>
# include <linux/err.h>
2011-05-27 10:46:24 -04:00
# include <linux/export.h>
2010-04-27 15:49:17 +10:00
# include <linux/slab.h>
2013-12-09 13:53:42 +01:00
# include <linux/module.h>
# include <linux/miscdevice.h>
2017-04-05 17:54:51 +10:00
# include <linux/gfp.h>
# include <linux/sched.h>
# include <linux/vmalloc.h>
# include <linux/highmem.h>
2009-10-30 05:47:10 +00:00
# include <asm/reg.h>
# include <asm/cputable.h>
# include <asm/cacheflush.h>
2016-12-24 11:46:01 -08:00
# include <linux/uaccess.h>
2009-10-30 05:47:10 +00:00
# include <asm/io.h>
# include <asm/kvm_ppc.h>
# include <asm/kvm_book3s.h>
# include <asm/mmu_context.h>
2011-06-29 00:16:42 +00:00
# include <asm/page.h>
2017-04-05 17:54:56 +10:00
# include <asm/xive.h>
2009-10-30 05:47:10 +00:00
2013-10-07 22:18:01 +05:30
# include "book3s.h"
2011-06-29 00:17:33 +00:00
# include "trace.h"
2009-10-30 05:47:10 +00:00
/* #define EXIT_DEBUG */
2010-04-16 00:11:53 +02:00
2021-06-18 22:27:05 +00:00
/*
 * Descriptor table for the per-VM statistics: the generic KVM VM entries
 * expanded by KVM_GENERIC_VM_STATS(), followed by the num_2M_pages and
 * num_1G_pages entries declared with STATS_DESC_ICOUNTER.
 */
const struct _kvm_stats_desc kvm_vm_stats_desc [ ] = {
KVM_GENERIC_VM_STATS ( ) ,
STATS_DESC_ICOUNTER ( VM , num_2M_pages ) ,
STATS_DESC_ICOUNTER ( VM , num_1G_pages )
} ;
/*
 * Header describing the layout of the per-VM statistics.
 *
 * num_desc is the number of entries in kvm_vm_stats_desc. The offsets are
 * computed so that the id starts right after the header, the descriptors
 * start KVM_STATS_NAME_SIZE bytes after the id offset, and the data starts
 * immediately after the whole descriptor array.
 */
const struct kvm_stats_header kvm_vm_stats_header = {
. name_size = KVM_STATS_NAME_SIZE ,
. num_desc = ARRAY_SIZE ( kvm_vm_stats_desc ) ,
. id_offset = sizeof ( struct kvm_stats_header ) ,
. desc_offset = sizeof ( struct kvm_stats_header ) + KVM_STATS_NAME_SIZE ,
. data_offset = sizeof ( struct kvm_stats_header ) + KVM_STATS_NAME_SIZE +
sizeof ( kvm_vm_stats_desc ) ,
} ;
2021-06-18 22:27:06 +00:00
/*
 * Descriptor table for the per-vcpu statistics: the generic KVM vcpu
 * entries expanded by KVM_GENERIC_VCPU_STATS(), followed by one
 * STATS_DESC_COUNTER entry per counter listed below (exit reasons,
 * load/store and page-fault counters, queued interrupts, pass-through
 * counters).
 */
const struct _kvm_stats_desc kvm_vcpu_stats_desc [ ] = {
KVM_GENERIC_VCPU_STATS ( ) ,
STATS_DESC_COUNTER ( VCPU , sum_exits ) ,
STATS_DESC_COUNTER ( VCPU , mmio_exits ) ,
STATS_DESC_COUNTER ( VCPU , signal_exits ) ,
STATS_DESC_COUNTER ( VCPU , light_exits ) ,
STATS_DESC_COUNTER ( VCPU , itlb_real_miss_exits ) ,
STATS_DESC_COUNTER ( VCPU , itlb_virt_miss_exits ) ,
STATS_DESC_COUNTER ( VCPU , dtlb_real_miss_exits ) ,
STATS_DESC_COUNTER ( VCPU , dtlb_virt_miss_exits ) ,
STATS_DESC_COUNTER ( VCPU , syscall_exits ) ,
STATS_DESC_COUNTER ( VCPU , isi_exits ) ,
STATS_DESC_COUNTER ( VCPU , dsi_exits ) ,
STATS_DESC_COUNTER ( VCPU , emulated_inst_exits ) ,
STATS_DESC_COUNTER ( VCPU , dec_exits ) ,
STATS_DESC_COUNTER ( VCPU , ext_intr_exits ) ,
STATS_DESC_COUNTER ( VCPU , halt_successful_wait ) ,
STATS_DESC_COUNTER ( VCPU , dbell_exits ) ,
STATS_DESC_COUNTER ( VCPU , gdbell_exits ) ,
STATS_DESC_COUNTER ( VCPU , ld ) ,
STATS_DESC_COUNTER ( VCPU , st ) ,
STATS_DESC_COUNTER ( VCPU , pf_storage ) ,
STATS_DESC_COUNTER ( VCPU , pf_instruc ) ,
STATS_DESC_COUNTER ( VCPU , sp_storage ) ,
STATS_DESC_COUNTER ( VCPU , sp_instruc ) ,
STATS_DESC_COUNTER ( VCPU , queue_intr ) ,
STATS_DESC_COUNTER ( VCPU , ld_slow ) ,
STATS_DESC_COUNTER ( VCPU , st_slow ) ,
STATS_DESC_COUNTER ( VCPU , pthru_all ) ,
STATS_DESC_COUNTER ( VCPU , pthru_host ) ,
STATS_DESC_COUNTER ( VCPU , pthru_bad_aff )
} ;
/*
 * Header describing the layout of the per-vcpu statistics.
 *
 * num_desc is the number of entries in kvm_vcpu_stats_desc. The offsets are
 * computed so that the id starts right after the header, the descriptors
 * start KVM_STATS_NAME_SIZE bytes after the id offset, and the data starts
 * immediately after the whole descriptor array.
 */
const struct kvm_stats_header kvm_vcpu_stats_header = {
. name_size = KVM_STATS_NAME_SIZE ,
. num_desc = ARRAY_SIZE ( kvm_vcpu_stats_desc ) ,
. id_offset = sizeof ( struct kvm_stats_header ) ,
. desc_offset = sizeof ( struct kvm_stats_header ) + KVM_STATS_NAME_SIZE ,
. data_offset = sizeof ( struct kvm_stats_header ) + KVM_STATS_NAME_SIZE +
sizeof ( kvm_vcpu_stats_desc ) ,
} ;
2013-10-07 22:17:56 +05:30
static inline void kvmppc_update_int_pending ( struct kvm_vcpu * vcpu ,
unsigned long pending_now , unsigned long old_pending )
{
2013-10-07 22:18:02 +05:30
if ( is_kvmppc_hv_enabled ( vcpu - > kvm ) )
2013-10-07 22:17:56 +05:30
return ;
if ( pending_now )
2014-04-24 13:46:24 +02:00
kvmppc_set_int_pending ( vcpu , 1 ) ;
2013-10-07 22:17:56 +05:30
else if ( old_pending )
2014-04-24 13:46:24 +02:00
kvmppc_set_int_pending ( vcpu , 0 ) ;
2013-10-07 22:17:56 +05:30
}
static inline bool kvmppc_critical_section ( struct kvm_vcpu * vcpu )
{
ulong crit_raw ;
ulong crit_r1 ;
bool crit ;
2013-10-07 22:18:02 +05:30
if ( is_kvmppc_hv_enabled ( vcpu - > kvm ) )
2013-10-07 22:17:56 +05:30
return false ;
2014-04-24 13:46:24 +02:00
crit_raw = kvmppc_get_critical ( vcpu ) ;
2013-10-07 22:17:56 +05:30
crit_r1 = kvmppc_get_gpr ( vcpu , 1 ) ;
/* Truncate crit indicators in 32 bit mode */
2014-04-24 13:46:24 +02:00
if ( ! ( kvmppc_get_msr ( vcpu ) & MSR_SF ) ) {
2013-10-07 22:17:56 +05:30
crit_raw & = 0xffffffff ;
crit_r1 & = 0xffffffff ;
}
/* Critical section when crit == r1 */
crit = ( crit_raw = = crit_r1 ) ;
/* ... and we're in supervisor mode */
2014-04-24 13:46:24 +02:00
crit = crit & & ! ( kvmppc_get_msr ( vcpu ) & MSR_PR ) ;
2013-10-07 22:17:56 +05:30
return crit ;
}
2009-10-30 05:47:10 +00:00
/*
 * Inject interrupt vector @vec with @flags into @vcpu by dispatching to the
 * inject_interrupt hook of the VM's kvm_ops.
 */
void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
{
	vcpu->kvm->arch.kvm_ops->inject_interrupt(vcpu, vec, flags);
}
2009-12-21 20:21:23 +01:00
static int kvmppc_book3s_vec2irqprio ( unsigned int vec )
2009-10-30 05:47:10 +00:00
{
unsigned int prio ;
switch ( vec ) {
case 0x100 : prio = BOOK3S_IRQPRIO_SYSTEM_RESET ; break ;
case 0x200 : prio = BOOK3S_IRQPRIO_MACHINE_CHECK ; break ;
case 0x300 : prio = BOOK3S_IRQPRIO_DATA_STORAGE ; break ;
case 0x380 : prio = BOOK3S_IRQPRIO_DATA_SEGMENT ; break ;
case 0x400 : prio = BOOK3S_IRQPRIO_INST_STORAGE ; break ;
case 0x480 : prio = BOOK3S_IRQPRIO_INST_SEGMENT ; break ;
case 0x500 : prio = BOOK3S_IRQPRIO_EXTERNAL ; break ;
case 0x600 : prio = BOOK3S_IRQPRIO_ALIGNMENT ; break ;
case 0x700 : prio = BOOK3S_IRQPRIO_PROGRAM ; break ;
case 0x800 : prio = BOOK3S_IRQPRIO_FP_UNAVAIL ; break ;
case 0x900 : prio = BOOK3S_IRQPRIO_DECREMENTER ; break ;
case 0xc00 : prio = BOOK3S_IRQPRIO_SYSCALL ; break ;
case 0xd00 : prio = BOOK3S_IRQPRIO_DEBUG ; break ;
case 0xf20 : prio = BOOK3S_IRQPRIO_ALTIVEC ; break ;
case 0xf40 : prio = BOOK3S_IRQPRIO_VSX ; break ;
2014-04-29 16:48:44 +02:00
case 0xf60 : prio = BOOK3S_IRQPRIO_FAC_UNAVAIL ; break ;
2009-10-30 05:47:10 +00:00
default : prio = BOOK3S_IRQPRIO_MAX ; break ;
}
2009-12-21 20:21:23 +01:00
return prio ;
}
2013-04-17 20:30:26 +00:00
void kvmppc_book3s_dequeue_irqprio ( struct kvm_vcpu * vcpu ,
2009-12-21 20:21:24 +01:00
unsigned int vec )
{
2011-06-29 00:17:58 +00:00
unsigned long old_pending = vcpu - > arch . pending_exceptions ;
2009-12-21 20:21:24 +01:00
clear_bit ( kvmppc_book3s_vec2irqprio ( vec ) ,
& vcpu - > arch . pending_exceptions ) ;
2010-08-05 12:24:40 +02:00
2011-06-29 00:17:58 +00:00
kvmppc_update_int_pending ( vcpu , vcpu - > arch . pending_exceptions ,
old_pending ) ;
2009-12-21 20:21:24 +01:00
}
2009-12-21 20:21:23 +01:00
void kvmppc_book3s_queue_irqprio ( struct kvm_vcpu * vcpu , unsigned int vec )
{
vcpu - > stat . queue_intr + + ;
set_bit ( kvmppc_book3s_vec2irqprio ( vec ) ,
& vcpu - > arch . pending_exceptions ) ;
2009-10-30 05:47:10 +00:00
# ifdef EXIT_DEBUG
printk ( KERN_INFO " Queueing interrupt %x \n " , vec ) ;
# endif
}
2013-10-07 22:17:59 +05:30
EXPORT_SYMBOL_GPL ( kvmppc_book3s_queue_irqprio ) ;
2009-10-30 05:47:10 +00:00
KVM: PPC: Book3S HV: Simplify machine check handling
This makes the handling of machine check interrupts that occur inside
a guest simpler and more robust, with less done in assembler code and
in real mode.
Now, when a machine check occurs inside a guest, we always get the
machine check event struct and put a copy in the vcpu struct for the
vcpu where the machine check occurred. We no longer call
machine_check_queue_event() from kvmppc_realmode_mc_power7(), because
on POWER8, when a vcpu is running on an offline secondary thread and
we call machine_check_queue_event(), that calls irq_work_queue(),
which doesn't work because the CPU is offline, but instead triggers
the WARN_ON(lazy_irq_pending()) in pnv_smp_cpu_kill_self() (which
fires again and again because nothing clears the condition).
All that machine_check_queue_event() actually does is to cause the
event to be printed to the console. For a machine check occurring in
the guest, we now print the event in kvmppc_handle_exit_hv()
instead.
The assembly code at label machine_check_realmode now just calls C
code and then continues exiting the guest. We no longer either
synthesize a machine check for the guest in assembly code or return
to the guest without a machine check.
The code in kvmppc_handle_exit_hv() is extended to handle the case
where the guest is not FWNMI-capable. In that case we now always
synthesize a machine check interrupt for the guest. Previously, if
the host thinks it has recovered the machine check fully, it would
return to the guest without any notification that the machine check
had occurred. If the machine check was caused by some action of the
guest (such as creating duplicate SLB entries), it is much better to
tell the guest that it has caused a problem. Therefore we now always
generate a machine check interrupt for guests that are not
FWNMI-capable.
Reviewed-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
Reviewed-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2019-02-21 13:38:49 +11:00
/*
 * Raise a machine check in the guest. The interrupt is not queued: it is
 * injected right away, with @flags passed through unchanged.
 */
void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags)
{
	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_MACHINE_CHECK, flags);
}
EXPORT_SYMBOL_GPL ( kvmppc_core_queue_machine_check ) ;
KVM: PPC: Book3S HV P9: Stop handling hcalls in real-mode in the P9 path
In the interest of minimising the amount of code that is run in
"real-mode", don't handle hcalls in real mode in the P9 path. This
requires some new handlers for H_CEDE and xics-on-xive to be added
before xive is pulled or cede logic is checked.
This introduces a change in radix guest behaviour where radix guests
that execute 'sc 1' in userspace now get a privilege fault whereas
previously the 'sc 1' would be reflected as a syscall interrupt to the
guest kernel. That reflection is only required for hash guests that run
PR KVM.
Background:
In POWER8 and earlier processors, it is very expensive to exit from the
HV real mode context of a guest hypervisor interrupt, and switch to host
virtual mode. On those processors, guest->HV interrupts reach the
hypervisor with the MMU off because the MMU is loaded with guest context
(LPCR, SDR1, SLB), and the other threads in the sub-core need to be
pulled out of the guest too. Then the primary must save off guest state,
invalidate SLB and ERAT, and load up host state before the MMU can be
enabled to run in host virtual mode (~= regular Linux mode).
Hash guests also require a lot of hcalls to run due to the nature of the
MMU architecture and paravirtualisation design. The XICS interrupt
controller requires hcalls to run.
So KVM traditionally tries hard to avoid the full exit, by handling
hcalls and other interrupts in real mode as much as possible.
By contrast, POWER9 has independent MMU context per-thread, and in radix
mode the hypervisor is in host virtual memory mode when the HV interrupt
is taken. Radix guests do not require significant hcalls to manage their
translations, and xive guests don't need hcalls to handle interrupts. So
it's much less important for performance to handle hcalls in real mode on
POWER9.
One caveat is that the TCE hcalls are performance critical, real-mode
variants introduced for POWER8 in order to achieve 10GbE performance.
Real mode TCE hcalls were found to be less important on POWER9, which
was able to drive 40GbE networking without them (using the virt mode
hcalls) but performance is still important. These hcalls will benefit
from subsequent guest entry/exit optimisation including possibly a
faster "partial exit" that does not entirely switch to host context to
handle the hcall.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210528090752.3542186-14-npiggin@gmail.com
2021-05-28 19:07:33 +10:00
/* Inject a system call interrupt into @vcpu immediately, with no flags. */
void kvmppc_core_queue_syscall(struct kvm_vcpu *vcpu)
{
	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_SYSCALL, 0);
}
EXPORT_SYMBOL ( kvmppc_core_queue_syscall ) ;
2010-01-08 02:58:07 +01:00
/*
 * Raise a program interrupt in the guest. The interrupt is not queued: it
 * is injected right away, with @flags passed through unchanged.
 */
void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
{
	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags);
}
2013-10-07 22:17:59 +05:30
EXPORT_SYMBOL_GPL ( kvmppc_core_queue_program ) ;
2009-10-30 05:47:10 +00:00
2017-03-22 21:02:08 +11:00
/*
 * Raise an FP-unavailable interrupt in the guest; it is injected right away
 * rather than queued.
 */
void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu)
{
	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, 0);
}
/*
 * Raise an AltiVec-unavailable interrupt in the guest; it is injected right
 * away rather than queued.
 */
void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu)
{
	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_ALTIVEC, 0);
}
/*
 * Raise a VSX-unavailable interrupt in the guest; it is injected right away
 * rather than queued.
 */
void kvmppc_core_queue_vsx_unavail(struct kvm_vcpu *vcpu)
{
	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_VSX, 0);
}
2009-10-30 05:47:10 +00:00
/* Mark a decrementer interrupt as pending on @vcpu. */
void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
{
	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER);
}
2013-10-07 22:17:59 +05:30
EXPORT_SYMBOL_GPL ( kvmppc_core_queue_dec ) ;
2009-10-30 05:47:10 +00:00
int kvmppc_core_pending_dec ( struct kvm_vcpu * vcpu )
{
2011-05-11 00:38:50 +00:00
return test_bit ( BOOK3S_IRQPRIO_DECREMENTER , & vcpu - > arch . pending_exceptions ) ;
2009-10-30 05:47:10 +00:00
}
2013-10-07 22:17:59 +05:30
EXPORT_SYMBOL_GPL ( kvmppc_core_pending_dec ) ;
2009-10-30 05:47:10 +00:00
2009-12-21 20:21:24 +01:00
/* Withdraw a pending decrementer interrupt from @vcpu. */
void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
{
	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER);
}
2013-10-07 22:17:59 +05:30
EXPORT_SYMBOL_GPL ( kvmppc_core_dequeue_dec ) ;
2009-12-21 20:21:24 +01:00
2009-10-30 05:47:10 +00:00
void kvmppc_core_queue_external ( struct kvm_vcpu * vcpu ,
struct kvm_interrupt * irq )
{
2018-10-08 16:30:48 +11:00
/*
* This case ( KVM_INTERRUPT_SET ) should never actually arise for
* a pseries guest ( because pseries guests expect their interrupt
* controllers to continue asserting an external interrupt request
* until it is acknowledged at the interrupt controller ) , but is
* included to avoid ABI breakage and potentially for other
* sorts of guest .
*
* There is a subtlety here : HV KVM does not test the
* external_oneshot flag in the code that synthesizes
* external interrupts for the guest just before entering
* the guest . That is OK even if userspace did do a
* KVM_INTERRUPT_SET on a pseries guest vcpu , because the
* caller ( kvm_vcpu_ioctl_interrupt ) does a kvm_vcpu_kick ( )
* which ends up doing a smp_send_reschedule ( ) , which will
* pull the guest all the way out to the host , meaning that
* we will call kvmppc_core_prepare_to_enter ( ) before entering
* the guest again , and that will handle the external_oneshot
* flag correctly .
*/
if ( irq - > irq = = KVM_INTERRUPT_SET )
vcpu - > arch . external_oneshot = 1 ;
kvmppc_book3s_queue_irqprio ( vcpu , BOOK3S_INTERRUPT_EXTERNAL ) ;
2009-10-30 05:47:10 +00:00
}
2013-02-14 14:00:25 +00:00
void kvmppc_core_dequeue_external ( struct kvm_vcpu * vcpu )
2010-03-24 21:48:18 +01:00
{
kvmppc_book3s_dequeue_irqprio ( vcpu , BOOK3S_INTERRUPT_EXTERNAL ) ;
}
2014-06-18 21:56:55 +02:00
/*
 * Raise a data storage interrupt in the guest.
 *
 * @dar is stored as the guest's DAR and @flags as its DSISR before the
 * interrupt is injected; the injection itself carries no flags.
 */
void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
				    ulong flags)
{
	kvmppc_set_dar(vcpu, dar);
	kvmppc_set_dsisr(vcpu, flags);

	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
}
2018-06-07 18:04:37 +10:00
EXPORT_SYMBOL_GPL ( kvmppc_core_queue_data_storage ) ;
2014-06-18 21:56:55 +02:00
/*
 * Raise an instruction storage interrupt in the guest, injecting it right
 * away with @flags passed through unchanged.
 */
void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags)
{
	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE, flags);
}
2018-06-07 18:04:37 +10:00
EXPORT_SYMBOL_GPL ( kvmppc_core_queue_inst_storage ) ;
2014-06-18 21:56:55 +02:00
2015-05-22 09:25:02 +02:00
static int kvmppc_book3s_irqprio_deliver ( struct kvm_vcpu * vcpu ,
unsigned int priority )
2009-10-30 05:47:10 +00:00
{
int deliver = 1 ;
int vec = 0 ;
2011-06-29 00:17:58 +00:00
bool crit = kvmppc_critical_section ( vcpu ) ;
2009-10-30 05:47:10 +00:00
switch ( priority ) {
case BOOK3S_IRQPRIO_DECREMENTER :
2014-04-24 13:46:24 +02:00
deliver = ( kvmppc_get_msr ( vcpu ) & MSR_EE ) & & ! crit ;
2009-10-30 05:47:10 +00:00
vec = BOOK3S_INTERRUPT_DECREMENTER ;
break ;
case BOOK3S_IRQPRIO_EXTERNAL :
2014-04-24 13:46:24 +02:00
deliver = ( kvmppc_get_msr ( vcpu ) & MSR_EE ) & & ! crit ;
2009-10-30 05:47:10 +00:00
vec = BOOK3S_INTERRUPT_EXTERNAL ;
break ;
case BOOK3S_IRQPRIO_SYSTEM_RESET :
vec = BOOK3S_INTERRUPT_SYSTEM_RESET ;
break ;
case BOOK3S_IRQPRIO_MACHINE_CHECK :
vec = BOOK3S_INTERRUPT_MACHINE_CHECK ;
break ;
case BOOK3S_IRQPRIO_DATA_STORAGE :
vec = BOOK3S_INTERRUPT_DATA_STORAGE ;
break ;
case BOOK3S_IRQPRIO_INST_STORAGE :
vec = BOOK3S_INTERRUPT_INST_STORAGE ;
break ;
case BOOK3S_IRQPRIO_DATA_SEGMENT :
vec = BOOK3S_INTERRUPT_DATA_SEGMENT ;
break ;
case BOOK3S_IRQPRIO_INST_SEGMENT :
vec = BOOK3S_INTERRUPT_INST_SEGMENT ;
break ;
case BOOK3S_IRQPRIO_ALIGNMENT :
vec = BOOK3S_INTERRUPT_ALIGNMENT ;
break ;
case BOOK3S_IRQPRIO_PROGRAM :
vec = BOOK3S_INTERRUPT_PROGRAM ;
break ;
case BOOK3S_IRQPRIO_VSX :
vec = BOOK3S_INTERRUPT_VSX ;
break ;
case BOOK3S_IRQPRIO_ALTIVEC :
vec = BOOK3S_INTERRUPT_ALTIVEC ;
break ;
case BOOK3S_IRQPRIO_FP_UNAVAIL :
vec = BOOK3S_INTERRUPT_FP_UNAVAIL ;
break ;
case BOOK3S_IRQPRIO_SYSCALL :
vec = BOOK3S_INTERRUPT_SYSCALL ;
break ;
case BOOK3S_IRQPRIO_DEBUG :
vec = BOOK3S_INTERRUPT_TRACE ;
break ;
case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR :
vec = BOOK3S_INTERRUPT_PERFMON ;
break ;
2014-04-29 16:48:44 +02:00
case BOOK3S_IRQPRIO_FAC_UNAVAIL :
vec = BOOK3S_INTERRUPT_FAC_UNAVAIL ;
break ;
2009-10-30 05:47:10 +00:00
default :
deliver = 0 ;
printk ( KERN_ERR " KVM: Unknown interrupt: 0x%x \n " , priority ) ;
break ;
}
#if 0
printk ( KERN_INFO " Deliver interrupt 0x%x? %x \n " , vec , deliver ) ;
# endif
if ( deliver )
2011-06-29 00:18:52 +00:00
kvmppc_inject_interrupt ( vcpu , vec , 0 ) ;
2009-10-30 05:47:10 +00:00
return deliver ;
}
2010-08-30 10:44:15 +02:00
/*
* This function determines if an irqprio should be cleared once issued .
*/
static bool clear_irqprio ( struct kvm_vcpu * vcpu , unsigned int priority )
{
switch ( priority ) {
case BOOK3S_IRQPRIO_DECREMENTER :
/* DEC interrupts get cleared by mtdec */
return false ;
2018-10-08 16:30:48 +11:00
case BOOK3S_IRQPRIO_EXTERNAL :
/*
* External interrupts get cleared by userspace
* except when set by the KVM_INTERRUPT ioctl with
* KVM_INTERRUPT_SET ( not KVM_INTERRUPT_SET_LEVEL ) .
*/
if ( vcpu - > arch . external_oneshot ) {
vcpu - > arch . external_oneshot = 0 ;
return true ;
}
2010-08-30 10:44:15 +02:00
return false ;
}
return true ;
}
2012-02-16 14:07:37 +00:00
int kvmppc_core_prepare_to_enter ( struct kvm_vcpu * vcpu )
2009-10-30 05:47:10 +00:00
{
unsigned long * pending = & vcpu - > arch . pending_exceptions ;
2010-07-29 14:47:51 +02:00
unsigned long old_pending = vcpu - > arch . pending_exceptions ;
2009-10-30 05:47:10 +00:00
unsigned int priority ;
# ifdef EXIT_DEBUG
if ( vcpu - > arch . pending_exceptions )
printk ( KERN_EMERG " KVM: Check pending: %lx \n " , vcpu - > arch . pending_exceptions ) ;
# endif
priority = __ffs ( * pending ) ;
2010-04-16 00:11:56 +02:00
while ( priority < BOOK3S_IRQPRIO_MAX ) {
2009-12-21 20:21:24 +01:00
if ( kvmppc_book3s_irqprio_deliver ( vcpu , priority ) & &
2010-08-30 10:44:15 +02:00
clear_irqprio ( vcpu , priority ) ) {
2009-10-30 05:47:10 +00:00
clear_bit ( priority , & vcpu - > arch . pending_exceptions ) ;
break ;
}
priority = find_next_bit ( pending ,
BITS_PER_BYTE * sizeof ( * pending ) ,
priority + 1 ) ;
}
2010-07-29 14:47:51 +02:00
/* Tell the guest about our interrupt status */
2011-06-29 00:17:58 +00:00
kvmppc_update_int_pending ( vcpu , * pending , old_pending ) ;
2012-02-16 14:07:37 +00:00
return 0 ;
2009-10-30 05:47:10 +00:00
}
2013-10-07 22:17:59 +05:30
EXPORT_SYMBOL_GPL ( kvmppc_core_prepare_to_enter ) ;
2009-10-30 05:47:10 +00:00
kvm: rename pfn_t to kvm_pfn_t
To date, we have implemented two I/O usage models for persistent memory,
PMEM (a persistent "ram disk") and DAX (mmap persistent memory into
userspace). This series adds a third, DAX-GUP, that allows DAX mappings
to be the target of direct-i/o. It allows userspace to coordinate
DMA/RDMA from/to persistent memory.
The implementation leverages the ZONE_DEVICE mm-zone that went into
4.3-rc1 (also discussed at kernel summit) to flag pages that are owned
and dynamically mapped by a device driver. The pmem driver, after
mapping a persistent memory range into the system memmap via
devm_memremap_pages(), arranges for DAX to distinguish pfn-only versus
page-backed pmem-pfns via flags in the new pfn_t type.
The DAX code, upon seeing a PFN_DEV+PFN_MAP flagged pfn, flags the
resulting pte(s) inserted into the process page tables with a new
_PAGE_DEVMAP flag. Later, when get_user_pages() is walking ptes it keys
off _PAGE_DEVMAP to pin the device hosting the page range active.
Finally, get_page() and put_page() are modified to take references
against the device driver established page mapping.
Finally, this need for "struct page" for persistent memory requires
memory capacity to store the memmap array. Given that the memmap array for a
large pool of persistent memory may exhaust available DRAM, introduce a
mechanism to allocate the memmap from persistent memory. The new
"struct vmem_altmap *" parameter to devm_memremap_pages() enables
arch_add_memory() to use reserved pmem capacity rather than the page
allocator.
This patch (of 18):
The core has developed a need for a "pfn_t" type [1]. Move the existing
pfn_t in KVM to kvm_pfn_t [2].
[1]: https://lists.01.org/pipermail/linux-nvdimm/2015-September/002199.html
[2]: https://lists.01.org/pipermail/linux-nvdimm/2015-September/002218.html
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-15 16:56:11 -08:00
/*
 * Resolve guest physical address @gpa to a host page frame number.
 *
 * If the vcpu has a magic page configured (magic_page_pa, masked with
 * KVM_PAM and truncated to 32 bits when MSR_SF is clear) and the 4K-aligned
 * @gpa matches it, the pfn of the page backing vcpu->arch.shared is
 * returned: a reference is taken on that page with get_page() and
 * *@writable, when provided, is set to true.
 *
 * Every other address is looked up with gfn_to_pfn_prot(), which receives
 * @writing and @writable unchanged.
 */
kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
			bool *writable)
{
	ulong magic_pa = vcpu->arch.magic_page_pa & KVM_PAM;
	gpa_t page_gpa = gpa & ~0xFFFULL;

	if (!(kvmppc_get_msr(vcpu) & MSR_SF))
		magic_pa = (uint32_t)magic_pa;

	/* Magic page override */
	if (unlikely(magic_pa) && unlikely((page_gpa & KVM_PAM) == magic_pa)) {
		ulong shared_va = ((ulong)vcpu->arch.shared) & PAGE_MASK;
		kvm_pfn_t pfn;

		pfn = (kvm_pfn_t)virt_to_phys((void *)shared_va) >> PAGE_SHIFT;
		get_page(pfn_to_page(pfn));

		if (writable)
			*writable = true;

		return pfn;
	}

	return gfn_to_pfn_prot(vcpu->kvm, gpa >> PAGE_SHIFT, writing, writable);
}
2014-07-13 16:37:12 +02:00
EXPORT_SYMBOL_GPL ( kvmppc_gpa_to_pfn ) ;
2010-07-29 14:47:54 +02:00
2014-06-20 13:52:36 +02:00
/*
 * Translate the guest effective address @eaddr and describe the result
 * in @pte.
 *
 * @xlid says whether this is a data or an instruction access and @xlrw
 * whether it is a read or a write.  When the MSR relocation bit matching
 * the access type (MSR_DR for data, MSR_IR for instructions) is set, the
 * work is handed to the vcpu's mmu.xlate() hook and its result is
 * returned.  Otherwise @pte is filled in directly from @eaddr with read,
 * write and execute all permitted, and 0 is returned.
 */
int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, enum xlate_instdata xlid,
		 enum xlate_readwrite xlrw, struct kvmppc_pte *pte)
{
	bool is_data = (xlid == XLATE_DATA);
	bool is_store = (xlrw == XLATE_WRITE);
	int translation_on = (kvmppc_get_msr(vcpu) &
			      (is_data ? MSR_DR : MSR_IR));

	/* Relocation is enabled for this access type: ask the MMU model. */
	if (translation_on)
		return vcpu->arch.mmu.xlate(vcpu, eaddr, pte, is_data,
					    is_store);

	/* Relocation is off: derive the pte straight from the address. */
	pte->eaddr = eaddr;
	pte->raddr = eaddr & KVM_PAM;
	pte->vpage = VSID_REAL | (eaddr >> 12);
	pte->may_read = true;
	pte->may_write = true;
	pte->may_execute = true;

	/*
	 * Instruction access while only data relocation is enabled
	 * (MSR_DR set, MSR_IR clear): if the vcpu has the split hack active
	 * and the address lies in the SPLIT_HACK_OFFS window, drop the
	 * SPLIT_HACK_MASK bits from the real address.
	 */
	if ((kvmppc_get_msr(vcpu) & (MSR_IR | MSR_DR)) == MSR_DR &&
	    !is_data &&
	    (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) &&
	    ((eaddr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS))
		pte->raddr &= ~SPLIT_HACK_MASK;

	return 0;
}
2018-05-21 13:24:21 +08:00
/*
 * Fetch one 32-bit instruction from guest memory into @inst.
 *
 * The fetch address is the vcpu's current PC, or PC - 4 when @type is
 * INST_SC (presumably because the PC has already moved past the sc
 * instruction -- confirm against the callers).
 *
 * Returns EMULATE_DONE when kvmppc_ld() reports EMULATE_DONE; any other
 * outcome is reported as EMULATE_AGAIN.
 */
int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
			  enum instruction_fetch_type type, u32 *inst)
{
	ulong fetch_addr = kvmppc_get_pc(vcpu);

	if (type == INST_SC)
		fetch_addr -= 4;

	if (kvmppc_ld(vcpu, &fetch_addr, sizeof(u32), inst, false) !=
	    EMULATE_DONE)
		return EMULATE_AGAIN;

	return EMULATE_DONE;
}
EXPORT_SYMBOL_GPL ( kvmppc_load_last_inst ) ;
2012-08-08 20:38:19 +00:00
/*
 * Sub-architecture vcpu initialisation hook.  Nothing needs to be set up
 * here, so this always reports success (0).
 */
int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
{
	return 0;
}
/* Sub-architecture vcpu teardown hook; intentionally a no-op. */
void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	/* nothing to release */
}
2013-10-07 22:17:53 +05:30
int kvm_arch_vcpu_ioctl_get_sregs ( struct kvm_vcpu * vcpu ,
struct kvm_sregs * sregs )
{
2017-12-04 21:35:28 +01:00
int ret ;
vcpu_load ( vcpu ) ;
ret = vcpu - > kvm - > arch . kvm_ops - > get_sregs ( vcpu , sregs ) ;
vcpu_put ( vcpu ) ;
return ret ;
2013-10-07 22:17:53 +05:30
}
int kvm_arch_vcpu_ioctl_set_sregs ( struct kvm_vcpu * vcpu ,
struct kvm_sregs * sregs )
{
2017-12-04 21:35:29 +01:00
int ret ;
vcpu_load ( vcpu ) ;
ret = vcpu - > kvm - > arch . kvm_ops - > set_sregs ( vcpu , sregs ) ;
vcpu_put ( vcpu ) ;
return ret ;
2013-10-07 22:17:53 +05:30
}
2009-10-30 05:47:10 +00:00
int kvm_arch_vcpu_ioctl_get_regs ( struct kvm_vcpu * vcpu , struct kvm_regs * regs )
{
int i ;
2010-04-16 00:11:40 +02:00
regs - > pc = kvmppc_get_pc ( vcpu ) ;
2010-01-08 02:58:02 +01:00
regs - > cr = kvmppc_get_cr ( vcpu ) ;
2010-04-16 00:11:40 +02:00
regs - > ctr = kvmppc_get_ctr ( vcpu ) ;
regs - > lr = kvmppc_get_lr ( vcpu ) ;
2010-01-08 02:58:02 +01:00
regs - > xer = kvmppc_get_xer ( vcpu ) ;
2014-04-24 13:46:24 +02:00
regs - > msr = kvmppc_get_msr ( vcpu ) ;
regs - > srr0 = kvmppc_get_srr0 ( vcpu ) ;
regs - > srr1 = kvmppc_get_srr1 ( vcpu ) ;
2009-10-30 05:47:10 +00:00
regs - > pid = vcpu - > arch . pid ;
2014-04-24 13:46:24 +02:00
regs - > sprg0 = kvmppc_get_sprg0 ( vcpu ) ;
regs - > sprg1 = kvmppc_get_sprg1 ( vcpu ) ;
regs - > sprg2 = kvmppc_get_sprg2 ( vcpu ) ;
regs - > sprg3 = kvmppc_get_sprg3 ( vcpu ) ;
regs - > sprg4 = kvmppc_get_sprg4 ( vcpu ) ;
regs - > sprg5 = kvmppc_get_sprg5 ( vcpu ) ;
regs - > sprg6 = kvmppc_get_sprg6 ( vcpu ) ;
regs - > sprg7 = kvmppc_get_sprg7 ( vcpu ) ;
2009-10-30 05:47:10 +00:00
for ( i = 0 ; i < ARRAY_SIZE ( regs - > gpr ) ; i + + )
2010-01-08 02:58:01 +01:00
regs - > gpr [ i ] = kvmppc_get_gpr ( vcpu , i ) ;
2009-10-30 05:47:10 +00:00
return 0 ;
}
int kvm_arch_vcpu_ioctl_set_regs ( struct kvm_vcpu * vcpu , struct kvm_regs * regs )
{
int i ;
2010-04-16 00:11:40 +02:00
kvmppc_set_pc ( vcpu , regs - > pc ) ;
2010-01-08 02:58:02 +01:00
kvmppc_set_cr ( vcpu , regs - > cr ) ;
2010-04-16 00:11:40 +02:00
kvmppc_set_ctr ( vcpu , regs - > ctr ) ;
kvmppc_set_lr ( vcpu , regs - > lr ) ;
2010-01-08 02:58:02 +01:00
kvmppc_set_xer ( vcpu , regs - > xer ) ;
2009-10-30 05:47:10 +00:00
kvmppc_set_msr ( vcpu , regs - > msr ) ;
2014-04-24 13:46:24 +02:00
kvmppc_set_srr0 ( vcpu , regs - > srr0 ) ;
kvmppc_set_srr1 ( vcpu , regs - > srr1 ) ;
kvmppc_set_sprg0 ( vcpu , regs - > sprg0 ) ;
kvmppc_set_sprg1 ( vcpu , regs - > sprg1 ) ;
kvmppc_set_sprg2 ( vcpu , regs - > sprg2 ) ;
kvmppc_set_sprg3 ( vcpu , regs - > sprg3 ) ;
kvmppc_set_sprg4 ( vcpu , regs - > sprg4 ) ;
kvmppc_set_sprg5 ( vcpu , regs - > sprg5 ) ;
kvmppc_set_sprg6 ( vcpu , regs - > sprg6 ) ;
kvmppc_set_sprg7 ( vcpu , regs - > sprg7 ) ;
2009-10-30 05:47:10 +00:00
2010-01-08 02:58:01 +01:00
for ( i = 0 ; i < ARRAY_SIZE ( regs - > gpr ) ; i + + )
kvmppc_set_gpr ( vcpu , i , regs - > gpr [ i ] ) ;
2009-10-30 05:47:10 +00:00
return 0 ;
}
int kvm_arch_vcpu_ioctl_get_fpu ( struct kvm_vcpu * vcpu , struct kvm_fpu * fpu )
{
2020-09-11 12:53:45 +02:00
return - EOPNOTSUPP ;
2009-10-30 05:47:10 +00:00
}
int kvm_arch_vcpu_ioctl_set_fpu ( struct kvm_vcpu * vcpu , struct kvm_fpu * fpu )
{
2020-09-11 12:53:45 +02:00
return - EOPNOTSUPP ;
2009-10-30 05:47:10 +00:00
}
2014-08-20 16:36:24 +03:00
/*
 * Read the single guest register identified by @id into @val.
 *
 * The get_one_reg hook of the VM's kvm_ops table gets the first chance.
 * Anything it returns other than -EINVAL is final.  On -EINVAL the
 * registers handled in the switch below are tried instead.
 *
 * Returns 0 on success, -ENXIO when the register exists but the backing
 * feature or device is absent, -EINVAL when @id is not handled here, or
 * whatever the hook / kvmppc_xive_native_get_vp() returned.
 */
int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
		       union kvmppc_one_reg *val)
{
	long int idx;
	int rc;

	rc = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, id, val);
	if (rc != -EINVAL)
		return rc;

	/* The hook did not recognise @id; fall back to the common set. */
	rc = 0;
	switch (id) {
	case KVM_REG_PPC_DAR:
		*val = get_reg_val(id, kvmppc_get_dar(vcpu));
		break;
	case KVM_REG_PPC_DSISR:
		*val = get_reg_val(id, kvmppc_get_dsisr(vcpu));
		break;
	case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
		idx = id - KVM_REG_PPC_FPR0;
		*val = get_reg_val(id, VCPU_FPR(vcpu, idx));
		break;
	case KVM_REG_PPC_FPSCR:
		*val = get_reg_val(id, vcpu->arch.fp.fpscr);
		break;
#ifdef CONFIG_VSX
	case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
		if (!cpu_has_feature(CPU_FTR_VSX)) {
			rc = -ENXIO;
			break;
		}
		/* Both halves of the register come from fp.fpr[idx]. */
		idx = id - KVM_REG_PPC_VSR0;
		val->vsxval[0] = vcpu->arch.fp.fpr[idx][0];
		val->vsxval[1] = vcpu->arch.fp.fpr[idx][1];
		break;
#endif /* CONFIG_VSX */
	case KVM_REG_PPC_DEBUG_INST:
		*val = get_reg_val(id, INS_TW);
		break;
#ifdef CONFIG_KVM_XICS
	case KVM_REG_PPC_ICP_STATE:
		/* Needs either an XICS ICP or a XIVE vcpu to read from. */
		if (!(vcpu->arch.icp || vcpu->arch.xive_vcpu)) {
			rc = -ENXIO;
			break;
		}
		if (xics_on_xive())
			*val = get_reg_val(id, kvmppc_xive_get_icp(vcpu));
		else
			*val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
		break;
#endif /* CONFIG_KVM_XICS */
#ifdef CONFIG_KVM_XIVE
	case KVM_REG_PPC_VP_STATE:
		/* Only readable with a XIVE vcpu and XIVE enabled. */
		if (!vcpu->arch.xive_vcpu || !xive_enabled())
			rc = -ENXIO;
		else
			rc = kvmppc_xive_native_get_vp(vcpu, val);
		break;
#endif /* CONFIG_KVM_XIVE */
	case KVM_REG_PPC_FSCR:
		*val = get_reg_val(id, vcpu->arch.fscr);
		break;
	case KVM_REG_PPC_TAR:
		*val = get_reg_val(id, vcpu->arch.tar);
		break;
	case KVM_REG_PPC_EBBHR:
		*val = get_reg_val(id, vcpu->arch.ebbhr);
		break;
	case KVM_REG_PPC_EBBRR:
		*val = get_reg_val(id, vcpu->arch.ebbrr);
		break;
	case KVM_REG_PPC_BESCR:
		*val = get_reg_val(id, vcpu->arch.bescr);
		break;
	case KVM_REG_PPC_IC:
		*val = get_reg_val(id, vcpu->arch.ic);
		break;
	default:
		rc = -EINVAL;
		break;
	}

	return rc;
}
2014-08-20 16:36:24 +03:00
/*
 * Write the single guest register identified by @id from @val.
 *
 * The set_one_reg hook of the VM's kvm_ops table gets the first chance.
 * Anything it returns other than -EINVAL is final.  On -EINVAL the
 * registers handled in the switch below are tried instead.
 *
 * Returns 0 on success, -ENXIO when the register exists but the backing
 * feature or device is absent, -EINVAL when @id is not handled here, or
 * whatever the hook or the XICS/XIVE setter returned.
 */
int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
		       union kvmppc_one_reg *val)
{
	long int idx;
	int rc;

	rc = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, id, val);
	if (rc != -EINVAL)
		return rc;

	/* The hook did not recognise @id; fall back to the common set. */
	rc = 0;
	switch (id) {
	case KVM_REG_PPC_DAR:
		kvmppc_set_dar(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_DSISR:
		kvmppc_set_dsisr(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
		idx = id - KVM_REG_PPC_FPR0;
		VCPU_FPR(vcpu, idx) = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_FPSCR:
		vcpu->arch.fp.fpscr = set_reg_val(id, *val);
		break;
#ifdef CONFIG_VSX
	case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
		if (!cpu_has_feature(CPU_FTR_VSX)) {
			rc = -ENXIO;
			break;
		}
		/* Both halves of the register live in fp.fpr[idx]. */
		idx = id - KVM_REG_PPC_VSR0;
		vcpu->arch.fp.fpr[idx][0] = val->vsxval[0];
		vcpu->arch.fp.fpr[idx][1] = val->vsxval[1];
		break;
#endif /* CONFIG_VSX */
#ifdef CONFIG_KVM_XICS
	case KVM_REG_PPC_ICP_STATE:
		/* Needs either an XICS ICP or a XIVE vcpu to write to. */
		if (!(vcpu->arch.icp || vcpu->arch.xive_vcpu)) {
			rc = -ENXIO;
			break;
		}
		if (xics_on_xive())
			rc = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val));
		else
			rc = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
		break;
#endif /* CONFIG_KVM_XICS */
#ifdef CONFIG_KVM_XIVE
	case KVM_REG_PPC_VP_STATE:
		/* Only writable with a XIVE vcpu and XIVE enabled. */
		if (!vcpu->arch.xive_vcpu || !xive_enabled())
			rc = -ENXIO;
		else
			rc = kvmppc_xive_native_set_vp(vcpu, val);
		break;
#endif /* CONFIG_KVM_XIVE */
	case KVM_REG_PPC_FSCR:
		vcpu->arch.fscr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TAR:
		vcpu->arch.tar = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_EBBHR:
		vcpu->arch.ebbhr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_EBBRR:
		vcpu->arch.ebbrr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_BESCR:
		vcpu->arch.bescr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_IC:
		vcpu->arch.ic = set_reg_val(id, *val);
		break;
	default:
		rc = -EINVAL;
		break;
	}

	return rc;
}
2013-10-07 22:17:53 +05:30
void kvmppc_core_vcpu_load ( struct kvm_vcpu * vcpu , int cpu )
{
2013-10-07 22:18:01 +05:30
vcpu - > kvm - > arch . kvm_ops - > vcpu_load ( vcpu , cpu ) ;
2013-10-07 22:17:53 +05:30
}
void kvmppc_core_vcpu_put ( struct kvm_vcpu * vcpu )
{
2013-10-07 22:18:01 +05:30
vcpu - > kvm - > arch . kvm_ops - > vcpu_put ( vcpu ) ;
2013-10-07 22:17:53 +05:30
}
/* Set the guest MSR to @msr via the set_msr hook of the VM's kvm_ops. */
void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
{
	vcpu->kvm->arch.kvm_ops->set_msr(vcpu, msr);
}
2013-10-07 22:17:59 +05:30
EXPORT_SYMBOL_GPL ( kvmppc_set_msr ) ;
2013-10-07 22:17:53 +05:30
2020-04-27 12:35:11 +08:00
int kvmppc_vcpu_run ( struct kvm_vcpu * vcpu )
2013-10-07 22:17:53 +05:30
{
2020-04-27 12:35:11 +08:00
return vcpu - > kvm - > arch . kvm_ops - > vcpu_run ( vcpu ) ;
2013-10-07 22:17:53 +05:30
}
2009-10-30 05:47:10 +00:00
/*
 * Address translation ioctl.  No translation is performed here: @tr is
 * left untouched and 0 is returned.
 */
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
				  struct kvm_translation *tr)
{
	return 0;
}
2013-04-08 00:32:12 +00:00
int kvm_arch_vcpu_ioctl_set_guest_debug ( struct kvm_vcpu * vcpu ,
struct kvm_guest_debug * dbg )
{
2017-12-04 21:35:33 +01:00
vcpu_load ( vcpu ) ;
2014-09-09 22:37:35 +05:30
vcpu - > guest_debug = dbg - > control ;
2017-12-04 21:35:33 +01:00
vcpu_put ( vcpu ) ;
2014-09-09 22:37:35 +05:30
return 0 ;
2013-04-08 00:32:12 +00:00
}
2014-09-01 17:19:56 +03:00
/* Queue a decrementer interrupt for @vcpu, then kick the vcpu. */
void kvmppc_decrementer_func(struct kvm_vcpu *vcpu)
{
	kvmppc_core_queue_dec(vcpu);
	kvm_vcpu_kick(vcpu);
}
2013-10-07 22:17:53 +05:30
2019-12-18 13:55:00 -08:00
int kvmppc_core_vcpu_create ( struct kvm_vcpu * vcpu )
2013-10-07 22:17:53 +05:30
{
2019-12-18 13:55:00 -08:00
return vcpu - > kvm - > arch . kvm_ops - > vcpu_create ( vcpu ) ;
2013-10-07 22:17:53 +05:30
}
void kvmppc_core_vcpu_free ( struct kvm_vcpu * vcpu )
{
2013-10-07 22:18:01 +05:30
vcpu - > kvm - > arch . kvm_ops - > vcpu_free ( vcpu ) ;
2013-10-07 22:17:53 +05:30
}
int kvmppc_core_check_requests ( struct kvm_vcpu * vcpu )
{
2013-10-07 22:18:01 +05:30
return vcpu - > kvm - > arch . kvm_ops - > check_requests ( vcpu ) ;
2013-10-07 22:17:53 +05:30
}
2020-02-18 13:07:29 -08:00
/* Dirty-log synchronisation hook; intentionally a no-op here. */
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
	/* nothing to synchronise */
}
2013-10-07 22:17:53 +05:30
int kvm_vm_ioctl_get_dirty_log ( struct kvm * kvm , struct kvm_dirty_log * log )
{
2013-10-07 22:18:01 +05:30
return kvm - > arch . kvm_ops - > get_dirty_log ( kvm , log ) ;
2013-10-07 22:17:53 +05:30
}
2020-02-18 13:07:27 -08:00
void kvmppc_core_free_memslot ( struct kvm * kvm , struct kvm_memory_slot * slot )
2013-10-07 22:17:53 +05:30
{
2020-02-18 13:07:27 -08:00
kvm - > arch . kvm_ops - > free_memslot ( slot ) ;
2013-10-07 22:17:53 +05:30
}
void kvmppc_core_flush_memslot ( struct kvm * kvm , struct kvm_memory_slot * memslot )
{
2013-10-07 22:18:01 +05:30
kvm - > arch . kvm_ops - > flush_memslot ( kvm , memslot ) ;
2013-10-07 22:17:53 +05:30
}
int kvmppc_core_prepare_memory_region ( struct kvm * kvm ,
struct kvm_memory_slot * memslot ,
2020-02-18 13:07:18 -08:00
const struct kvm_userspace_memory_region * mem ,
enum kvm_mr_change change )
2013-10-07 22:17:53 +05:30
{
2020-02-18 13:07:18 -08:00
return kvm - > arch . kvm_ops - > prepare_memory_region ( kvm , memslot , mem ,
change ) ;
2013-10-07 22:17:53 +05:30
}
void kvmppc_core_commit_memory_region ( struct kvm * kvm ,
2015-05-18 13:59:39 +02:00
const struct kvm_userspace_memory_region * mem ,
2015-05-18 13:20:23 +02:00
const struct kvm_memory_slot * old ,
2018-12-12 15:15:30 +11:00
const struct kvm_memory_slot * new ,
enum kvm_mr_change change )
2013-10-07 22:17:53 +05:30
{
2018-12-12 15:15:30 +11:00
kvm - > arch . kvm_ops - > commit_memory_region ( kvm , mem , old , new , change ) ;
2013-10-07 22:17:53 +05:30
}
2021-04-01 17:56:53 -07:00
bool kvm_unmap_gfn_range ( struct kvm * kvm , struct kvm_gfn_range * range )
2013-10-07 22:17:53 +05:30
{
2021-04-01 17:56:53 -07:00
return kvm - > arch . kvm_ops - > unmap_gfn_range ( kvm , range ) ;
2013-10-07 22:17:53 +05:30
}
2021-04-01 17:56:53 -07:00
bool kvm_age_gfn ( struct kvm * kvm , struct kvm_gfn_range * range )
2013-10-07 22:17:53 +05:30
{
2021-04-01 17:56:53 -07:00
return kvm - > arch . kvm_ops - > age_gfn ( kvm , range ) ;
2013-10-07 22:17:53 +05:30
}
2021-04-01 17:56:53 -07:00
bool kvm_test_age_gfn ( struct kvm * kvm , struct kvm_gfn_range * range )
2013-10-07 22:17:53 +05:30
{
2021-04-01 17:56:53 -07:00
return kvm - > arch . kvm_ops - > test_age_gfn ( kvm , range ) ;
2013-10-07 22:17:53 +05:30
}
2021-04-01 17:56:53 -07:00
bool kvm_set_spte_gfn ( struct kvm * kvm , struct kvm_gfn_range * range )
2013-10-07 22:17:53 +05:30
{
2021-04-01 17:56:53 -07:00
return kvm - > arch . kvm_ops - > set_spte_gfn ( kvm , range ) ;
2013-10-07 22:17:53 +05:30
}
int kvmppc_core_init_vm ( struct kvm * kvm )
{
# ifdef CONFIG_PPC64
2016-02-15 12:55:05 +11:00
INIT_LIST_HEAD_RCU ( & kvm - > arch . spapr_tce_tables ) ;
2013-10-07 22:17:53 +05:30
INIT_LIST_HEAD ( & kvm - > arch . rtas_tokens ) ;
2019-05-29 11:54:00 +10:00
mutex_init ( & kvm - > arch . rtas_token_lock ) ;
2013-10-07 22:17:53 +05:30
# endif
2013-10-07 22:18:01 +05:30
return kvm - > arch . kvm_ops - > init_vm ( kvm ) ;
2013-10-07 22:17:53 +05:30
}
void kvmppc_core_destroy_vm ( struct kvm * kvm )
{
2013-10-07 22:18:01 +05:30
kvm - > arch . kvm_ops - > destroy_vm ( kvm ) ;
2013-10-07 22:17:53 +05:30
# ifdef CONFIG_PPC64
kvmppc_rtas_tokens_free ( kvm ) ;
WARN_ON ( ! list_empty ( & kvm - > arch . spapr_tce_tables ) ) ;
# endif
2019-04-18 12:39:42 +02:00
# ifdef CONFIG_KVM_XICS
/*
2020-08-10 12:08:05 +02:00
* Free the XIVE and XICS devices which are not directly freed by the
2019-04-18 12:39:42 +02:00
* device ' release ' method
*/
kfree ( kvm - > arch . xive_devices . native ) ;
kvm - > arch . xive_devices . native = NULL ;
kfree ( kvm - > arch . xive_devices . xics_on_xive ) ;
kvm - > arch . xive_devices . xics_on_xive = NULL ;
2020-08-10 12:08:05 +02:00
kfree ( kvm - > arch . xics_device ) ;
kvm - > arch . xics_device = NULL ;
2019-04-18 12:39:42 +02:00
# endif /* CONFIG_KVM_XICS */
2013-10-07 22:17:53 +05:30
}
kvmppc: Implement H_LOGICAL_CI_{LOAD,STORE} in KVM
On POWER, storage caching is usually configured via the MMU - attributes
such as cache-inhibited are stored in the TLB and the hashed page table.
This makes correctly performing cache inhibited IO accesses awkward when
the MMU is turned off (real mode). Some CPU models provide special
registers to control the cache attributes of real mode load and stores but
this is not at all consistent. This is a problem in particular for SLOF,
the firmware used on KVM guests, which runs entirely in real mode, but
which needs to do IO to load the kernel.
To simplify this qemu implements two special hypercalls, H_LOGICAL_CI_LOAD
and H_LOGICAL_CI_STORE which simulate a cache-inhibited load or store to
a logical address (aka guest physical address). SLOF uses these for IO.
However, because these are implemented within qemu, not the host kernel,
these bypass any IO devices emulated within KVM itself. The simplest way
to see this problem is to attempt to boot a KVM guest from a virtio-blk
device with iothread / dataplane enabled. The iothread code relies on an
in kernel implementation of the virtio queue notification, which is not
triggered by the IO hcalls, and so the guest will stall in SLOF unable to
load the guest OS.
This patch addresses this by providing in-kernel implementations of the
2 hypercalls, which correctly scan the KVM IO bus. Any access to an
address not handled by the KVM IO bus will cause a VM exit, hitting the
qemu implementation as before.
Note that a userspace change is also required, in order to enable these
new hcall implementations with KVM_CAP_PPC_ENABLE_HCALL.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[agraf: fix compilation]
Signed-off-by: Alexander Graf <agraf@suse.de>
2015-02-05 11:53:25 +11:00
int kvmppc_h_logical_ci_load ( struct kvm_vcpu * vcpu )
{
unsigned long size = kvmppc_get_gpr ( vcpu , 4 ) ;
unsigned long addr = kvmppc_get_gpr ( vcpu , 5 ) ;
u64 buf ;
2015-09-18 08:57:28 +02:00
int srcu_idx ;
kvmppc: Implement H_LOGICAL_CI_{LOAD,STORE} in KVM
On POWER, storage caching is usually configured via the MMU - attributes
such as cache-inhibited are stored in the TLB and the hashed page table.
This makes correctly performing cache inhibited IO accesses awkward when
the MMU is turned off (real mode). Some CPU models provide special
registers to control the cache attributes of real mode load and stores but
this is not at all consistent. This is a problem in particular for SLOF,
the firmware used on KVM guests, which runs entirely in real mode, but
which needs to do IO to load the kernel.
To simplify this qemu implements two special hypercalls, H_LOGICAL_CI_LOAD
and H_LOGICAL_CI_STORE which simulate a cache-inhibited load or store to
a logical address (aka guest physical address). SLOF uses these for IO.
However, because these are implemented within qemu, not the host kernel,
these bypass any IO devices emulated within KVM itself. The simplest way
to see this problem is to attempt to boot a KVM guest from a virtio-blk
device with iothread / dataplane enabled. The iothread code relies on an
in kernel implementation of the virtio queue notification, which is not
triggered by the IO hcalls, and so the guest will stall in SLOF unable to
load the guest OS.
This patch addresses this by providing in-kernel implementations of the
2 hypercalls, which correctly scan the KVM IO bus. Any access to an
address not handled by the KVM IO bus will cause a VM exit, hitting the
qemu implementation as before.
Note that a userspace change is also required, in order to enable these
new hcall implementations with KVM_CAP_PPC_ENABLE_HCALL.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[agraf: fix compilation]
Signed-off-by: Alexander Graf <agraf@suse.de>
2015-02-05 11:53:25 +11:00
int ret ;
if ( ! is_power_of_2 ( size ) | | ( size > sizeof ( buf ) ) )
return H_TOO_HARD ;
2015-09-18 08:57:28 +02:00
srcu_idx = srcu_read_lock ( & vcpu - > kvm - > srcu ) ;
kvmppc: Implement H_LOGICAL_CI_{LOAD,STORE} in KVM
On POWER, storage caching is usually configured via the MMU - attributes
such as cache-inhibited are stored in the TLB and the hashed page table.
This makes correctly performing cache inhibited IO accesses awkward when
the MMU is turned off (real mode). Some CPU models provide special
registers to control the cache attributes of real mode load and stores but
this is not at all consistent. This is a problem in particular for SLOF,
the firmware used on KVM guests, which runs entirely in real mode, but
which needs to do IO to load the kernel.
To simplify this qemu implements two special hypercalls, H_LOGICAL_CI_LOAD
and H_LOGICAL_CI_STORE which simulate a cache-inhibited load or store to
a logical address (aka guest physical address). SLOF uses these for IO.
However, because these are implemented within qemu, not the host kernel,
these bypass any IO devices emulated within KVM itself. The simplest way
to see this problem is to attempt to boot a KVM guest from a virtio-blk
device with iothread / dataplane enabled. The iothread code relies on an
in kernel implementation of the virtio queue notification, which is not
triggered by the IO hcalls, and so the guest will stall in SLOF unable to
load the guest OS.
This patch addresses this by providing in-kernel implementations of the
2 hypercalls, which correctly scan the KVM IO bus. Any access to an
address not handled by the KVM IO bus will cause a VM exit, hitting the
qemu implementation as before.
Note that a userspace change is also required, in order to enable these
new hcall implementations with KVM_CAP_PPC_ENABLE_HCALL.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[agraf: fix compilation]
Signed-off-by: Alexander Graf <agraf@suse.de>
2015-02-05 11:53:25 +11:00
ret = kvm_io_bus_read ( vcpu , KVM_MMIO_BUS , addr , size , & buf ) ;
2015-09-18 08:57:28 +02:00
srcu_read_unlock ( & vcpu - > kvm - > srcu , srcu_idx ) ;
kvmppc: Implement H_LOGICAL_CI_{LOAD,STORE} in KVM
On POWER, storage caching is usually configured via the MMU - attributes
such as cache-inhibited are stored in the TLB and the hashed page table.
This makes correctly performing cache inhibited IO accesses awkward when
the MMU is turned off (real mode). Some CPU models provide special
registers to control the cache attributes of real mode load and stores but
this is not at all consistent. This is a problem in particular for SLOF,
the firmware used on KVM guests, which runs entirely in real mode, but
which needs to do IO to load the kernel.
To simplify this qemu implements two special hypercalls, H_LOGICAL_CI_LOAD
and H_LOGICAL_CI_STORE which simulate a cache-inhibited load or store to
a logical address (aka guest physical address). SLOF uses these for IO.
However, because these are implemented within qemu, not the host kernel,
these bypass any IO devices emulated within KVM itself. The simplest way
to see this problem is to attempt to boot a KVM guest from a virtio-blk
device with iothread / dataplane enabled. The iothread code relies on an
in kernel implementation of the virtio queue notification, which is not
triggered by the IO hcalls, and so the guest will stall in SLOF unable to
load the guest OS.
This patch addresses this by providing in-kernel implementations of the
2 hypercalls, which correctly scan the KVM IO bus. Any access to an
address not handled by the KVM IO bus will cause a VM exit, hitting the
qemu implementation as before.
Note that a userspace change is also required, in order to enable these
new hcall implementations with KVM_CAP_PPC_ENABLE_HCALL.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[agraf: fix compilation]
Signed-off-by: Alexander Graf <agraf@suse.de>
2015-02-05 11:53:25 +11:00
if ( ret ! = 0 )
return H_TOO_HARD ;
switch ( size ) {
case 1 :
kvmppc_set_gpr ( vcpu , 4 , * ( u8 * ) & buf ) ;
break ;
case 2 :
kvmppc_set_gpr ( vcpu , 4 , be16_to_cpu ( * ( __be16 * ) & buf ) ) ;
break ;
case 4 :
kvmppc_set_gpr ( vcpu , 4 , be32_to_cpu ( * ( __be32 * ) & buf ) ) ;
break ;
case 8 :
kvmppc_set_gpr ( vcpu , 4 , be64_to_cpu ( * ( __be64 * ) & buf ) ) ;
break ;
default :
BUG ( ) ;
}
return H_SUCCESS ;
}
EXPORT_SYMBOL_GPL ( kvmppc_h_logical_ci_load ) ;
int kvmppc_h_logical_ci_store ( struct kvm_vcpu * vcpu )
{
unsigned long size = kvmppc_get_gpr ( vcpu , 4 ) ;
unsigned long addr = kvmppc_get_gpr ( vcpu , 5 ) ;
unsigned long val = kvmppc_get_gpr ( vcpu , 6 ) ;
u64 buf ;
2015-09-18 08:57:28 +02:00
int srcu_idx ;
kvmppc: Implement H_LOGICAL_CI_{LOAD,STORE} in KVM
On POWER, storage caching is usually configured via the MMU - attributes
such as cache-inhibited are stored in the TLB and the hashed page table.
This makes correctly performing cache inhibited IO accesses awkward when
the MMU is turned off (real mode). Some CPU models provide special
registers to control the cache attributes of real mode load and stores but
this is not at all consistent. This is a problem in particular for SLOF,
the firmware used on KVM guests, which runs entirely in real mode, but
which needs to do IO to load the kernel.
To simplify this qemu implements two special hypercalls, H_LOGICAL_CI_LOAD
and H_LOGICAL_CI_STORE which simulate a cache-inhibited load or store to
a logical address (aka guest physical address). SLOF uses these for IO.
However, because these are implemented within qemu, not the host kernel,
these bypass any IO devices emulated within KVM itself. The simplest way
to see this problem is to attempt to boot a KVM guest from a virtio-blk
device with iothread / dataplane enabled. The iothread code relies on an
in kernel implementation of the virtio queue notification, which is not
triggered by the IO hcalls, and so the guest will stall in SLOF unable to
load the guest OS.
This patch addresses this by providing in-kernel implementations of the
2 hypercalls, which correctly scan the KVM IO bus. Any access to an
address not handled by the KVM IO bus will cause a VM exit, hitting the
qemu implementation as before.
Note that a userspace change is also required, in order to enable these
new hcall implementations with KVM_CAP_PPC_ENABLE_HCALL.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[agraf: fix compilation]
Signed-off-by: Alexander Graf <agraf@suse.de>
2015-02-05 11:53:25 +11:00
int ret ;
switch ( size ) {
case 1 :
* ( u8 * ) & buf = val ;
break ;
case 2 :
* ( __be16 * ) & buf = cpu_to_be16 ( val ) ;
break ;
case 4 :
* ( __be32 * ) & buf = cpu_to_be32 ( val ) ;
break ;
case 8 :
* ( __be64 * ) & buf = cpu_to_be64 ( val ) ;
break ;
default :
return H_TOO_HARD ;
}
2015-09-18 08:57:28 +02:00
srcu_idx = srcu_read_lock ( & vcpu - > kvm - > srcu ) ;
kvmppc: Implement H_LOGICAL_CI_{LOAD,STORE} in KVM
On POWER, storage caching is usually configured via the MMU - attributes
such as cache-inhibited are stored in the TLB and the hashed page table.
This makes correctly performing cache inhibited IO accesses awkward when
the MMU is turned off (real mode). Some CPU models provide special
registers to control the cache attributes of real mode load and stores but
this is not at all consistent. This is a problem in particular for SLOF,
the firmware used on KVM guests, which runs entirely in real mode, but
which needs to do IO to load the kernel.
To simplify this qemu implements two special hypercalls, H_LOGICAL_CI_LOAD
and H_LOGICAL_CI_STORE which simulate a cache-inhibited load or store to
a logical address (aka guest physical address). SLOF uses these for IO.
However, because these are implemented within qemu, not the host kernel,
these bypass any IO devices emulated within KVM itself. The simplest way
to see this problem is to attempt to boot a KVM guest from a virtio-blk
device with iothread / dataplane enabled. The iothread code relies on an
in kernel implementation of the virtio queue notification, which is not
triggered by the IO hcalls, and so the guest will stall in SLOF unable to
load the guest OS.
This patch addresses this by providing in-kernel implementations of the
2 hypercalls, which correctly scan the KVM IO bus. Any access to an
address not handled by the KVM IO bus will cause a VM exit, hitting the
qemu implementation as before.
Note that a userspace change is also required, in order to enable these
new hcall implementations with KVM_CAP_PPC_ENABLE_HCALL.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[agraf: fix compilation]
Signed-off-by: Alexander Graf <agraf@suse.de>
2015-02-05 11:53:25 +11:00
ret = kvm_io_bus_write ( vcpu , KVM_MMIO_BUS , addr , size , & buf ) ;
2015-09-18 08:57:28 +02:00
srcu_read_unlock ( & vcpu - > kvm - > srcu , srcu_idx ) ;
kvmppc: Implement H_LOGICAL_CI_{LOAD,STORE} in KVM
On POWER, storage caching is usually configured via the MMU - attributes
such as cache-inhibited are stored in the TLB and the hashed page table.
This makes correctly performing cache inhibited IO accesses awkward when
the MMU is turned off (real mode). Some CPU models provide special
registers to control the cache attributes of real mode load and stores but
this is not at all consistent. This is a problem in particular for SLOF,
the firmware used on KVM guests, which runs entirely in real mode, but
which needs to do IO to load the kernel.
To simplify this qemu implements two special hypercalls, H_LOGICAL_CI_LOAD
and H_LOGICAL_CI_STORE which simulate a cache-inhibited load or store to
a logical address (aka guest physical address). SLOF uses these for IO.
However, because these are implemented within qemu, not the host kernel,
these bypass any IO devices emulated within KVM itself. The simplest way
to see this problem is to attempt to boot a KVM guest from a virtio-blk
device with iothread / dataplane enabled. The iothread code relies on an
in kernel implementation of the virtio queue notification, which is not
triggered by the IO hcalls, and so the guest will stall in SLOF unable to
load the guest OS.
This patch addresses this by providing in-kernel implementations of the
2 hypercalls, which correctly scan the KVM IO bus. Any access to an
address not handled by the KVM IO bus will cause a VM exit, hitting the
qemu implementation as before.
Note that a userspace change is also required, in order to enable these
new hcall implementations with KVM_CAP_PPC_ENABLE_HCALL.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[agraf: fix compilation]
Signed-off-by: Alexander Graf <agraf@suse.de>
2015-02-05 11:53:25 +11:00
if ( ret ! = 0 )
return H_TOO_HARD ;
return H_SUCCESS ;
}
EXPORT_SYMBOL_GPL ( kvmppc_h_logical_ci_store ) ;
2013-10-07 22:17:53 +05:30
/*
 * Processor compatibility hook for book3s.
 *
 * This always reports success (0): the actual compatibility check is
 * performed while loading the HV or PR module.
 */
int kvmppc_core_check_processor_compat(void)
{
	return 0;
}
2014-06-02 11:03:00 +10:00
int kvmppc_book3s_hcall_implemented ( struct kvm * kvm , unsigned long hcall )
{
return kvm - > arch . kvm_ops - > hcall_implemented ( hcall ) ;
}
2017-04-05 17:54:56 +10:00
# ifdef CONFIG_KVM_XICS
/*
 * Set the level of interrupt @irq for @kvm, dispatching to whichever
 * in-kernel interrupt controller emulation is in use.
 *
 * xics_on_xive() selects the XICS-on-XIVE emulation.  Per the change
 * that introduced it, it returns false in a guest, so a nested
 * hypervisor uses the plain XICS emulation.
 *
 * Returns whatever the selected backend returns.
 */
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
		bool line_status)
{
	if (!xics_on_xive())
		return kvmppc_xics_set_irq(kvm, irq_source_id, irq, level,
					   line_status);

	return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level,
				   line_status);
}
/*
 * Set-irq entry point taking a routing entry: forwards the entry's GSI
 * to kvm_set_irq() and returns its result.
 */
int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry,
			      struct kvm *kvm, int irq_source_id,
			      int level, bool line_status)
{
	int rc;

	rc = kvm_set_irq(kvm, irq_source_id, irq_entry->gsi, level,
			 line_status);

	return rc;
}
/*
 * Callback stored in a routing entry's ->set field by kvm_irq_map_gsi();
 * it drives the entry's GSI through kvm_set_irq().
 */
static int kvmppc_book3s_set_irq(struct kvm_kernel_irq_routing_entry *e,
				 struct kvm *kvm, int irq_source_id, int level,
				 bool line_status)
{
	int rc;

	rc = kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status);

	return rc;
}
int kvm_irq_map_gsi ( struct kvm * kvm ,
struct kvm_kernel_irq_routing_entry * entries , int gsi )
{
entries - > gsi = gsi ;
entries - > type = KVM_IRQ_ROUTING_IRQCHIP ;
entries - > set = kvmppc_book3s_set_irq ;
entries - > irqchip . irqchip = 0 ;
entries - > irqchip . pin = gsi ;
return 1 ;
}
/*
 * Map an (irqchip, pin) pair to a GSI.  The pin number is returned
 * unchanged; @kvm and @irqchip are not consulted.
 */
int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
	int gsi = pin;

	return gsi;
}
# endif /* CONFIG_KVM_XICS */
2013-10-07 22:18:01 +05:30
/*
 * Module init for the book3s KVM core.
 *
 * Registers with the generic KVM core via kvm_init(), initialises the PR
 * backend when it is built into this module
 * (CONFIG_KVM_BOOK3S_32_HANDLER), and registers the in-kernel interrupt
 * controller device ops.
 *
 * With CONFIG_KVM_XIVE, xics_on_xive() chooses the XICS-on-XIVE
 * emulation (plus the native XIVE device when
 * kvmppc_xive_native_supported() says so); otherwise the plain XICS
 * emulation is registered.  Per the change that introduced
 * xics_on_xive(), it returns false in a guest, so nested hypervisors get
 * the plain XICS emulation.
 *
 * Returns 0 on success or the error from kvm_init() /
 * kvmppc_book3s_init_pr().
 */
static int kvmppc_book3s_init(void)
{
	int r;

	r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
	if (r)
		return r;

#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
	r = kvmppc_book3s_init_pr();
	if (r) {
		/*
		 * Returning an error fails the module load, so undo
		 * kvm_init() here rather than leaving the KVM core
		 * registered, and do not register any device ops.
		 */
		kvm_exit();
		return r;
	}
#endif

#ifdef CONFIG_KVM_XICS
#ifdef CONFIG_KVM_XIVE
	if (xics_on_xive()) {
		kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS);
		if (kvmppc_xive_native_supported())
			kvm_register_device_ops(&kvm_xive_native_ops,
						KVM_DEV_TYPE_XIVE);
	} else
#endif
		kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS);
#endif
	return 0;
}
/*
 * Module exit: tear down the PR backend first when it is part of this
 * module (CONFIG_KVM_BOOK3S_32_HANDLER), then unregister from the
 * generic KVM core.
 */
static void kvmppc_book3s_exit(void)
{
#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
	kvmppc_book3s_exit_pr();
#endif

	kvm_exit();
}
2013-10-07 22:18:01 +05:30
/* Hook the book3s core into module load/unload. */
module_init(kvmppc_book3s_init);
module_exit(kvmppc_book3s_exit);

/*
 * On 32bit this is our one and only kernel module, so it carries the
 * aliases for the KVM misc device itself.  The devname alias must be
 * exactly "devname:kvm", with no padding around the string.
 */
#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
MODULE_ALIAS_MISCDEV(KVM_MINOR);
MODULE_ALIAS("devname:kvm");
#endif