/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However AMD doesn't support fixed counters;
 * - There are three types of index to access perf counters (PMC):
 *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This normally is used by the RDPMC instruction.
 *      For instance the AMD RDPMC instruction uses 0000_0003h in ECX to
 *      access C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism,
 *      except that it also supports fixed counters. idx can be used as an
 *      index into both gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *      code. Each pmc, stored in the kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed). The mapping between pmc and
 *      perf counters is as follows:
 *      * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *               [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
 */
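
/*
 * Worked example of the three index types above (illustrative values only,
 * assuming an Intel vCPU with 4 gp and 3 fixed counters):
 *   - msr: the guest reaches gp counter 2 through MSR_IA32_PERFCTR0 + 2 and
 *     fixed counter 1 through MSR_CORE_PERF_FIXED_CTR1;
 *   - idx: RDPMC with ECX = 2 selects gp counter 2, while RDPMC with
 *     ECX = 0x40000001 (bit 30 set) selects fixed counter 1;
 *   - pmc: gp counter 2 has kvm_pmc.idx == 2, fixed counter 1 has
 *     kvm_pmc.idx == INTEL_PMC_IDX_FIXED + 1.
 */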

static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);

	kvm_pmu_deliver_pmi(vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	if (!test_and_set_bit(pmc->idx,
			      (unsigned long *)&pmu->reprogram_pmi)) {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
	}
}

static void kvm_perf_overflow_intr(struct perf_event *perf_event,
				   struct perf_sample_data *data,
				   struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	if (!test_and_set_bit(pmc->idx,
			      (unsigned long *)&pmu->reprogram_pmi)) {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);

		/*
		 * Inject PMI. If the vcpu was in guest mode during the NMI,
		 * the PMI can be injected on the next guest-mode re-entry.
		 * Otherwise we can't be sure that the vcpu wasn't executing
		 * a hlt instruction at the time of the vmexit and is not
		 * going to re-enter guest mode until woken up. So we should
		 * wake it, but this is impossible from NMI context. Do it
		 * from irq work instead.
		 */
		if (!kvm_is_in_guest())
			irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
		else
			kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
	}
}

static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
				  unsigned config, bool exclude_user,
				  bool exclude_kernel, bool intr,
				  bool in_tx, bool in_tx_cp)
{
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};

	attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);

	if (in_tx)
		attr.config |= HSW_IN_TX;
	if (in_tx_cp) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
		 * period. Just clear the sample period so at least
		 * allocating the counter doesn't fail.
		 */
		attr.sample_period = 0;
		attr.config |= HSW_IN_TX_CHECKPOINTED;
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 intr ? kvm_perf_overflow_intr :
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		printk_once("kvm_pmu: event creation failed %ld\n",
			    PTR_ERR(event));
		return;
	}

	pmc->perf_event = event;
	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
}
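
/*
 * Illustrative note on the eventsel layout decoded below (per the x86
 * architectural perfmon definition): bits 7:0 hold the event select,
 * bits 15:8 the unit mask, bit 16 selects user-mode counting, bit 17
 * selects OS-mode counting, bit 20 enables the overflow interrupt and
 * bit 22 enables the counter.
 */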
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
	unsigned config, type = PERF_TYPE_RAW;
	u8 event_select, unit_mask;

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

	pmc->eventsel = eventsel;

	pmc_stop_counter(pmc);

	if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
		return;

	event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
	unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;

	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
			  ARCH_PERFMON_EVENTSEL_INV |
			  ARCH_PERFMON_EVENTSEL_CMASK |
			  HSW_IN_TX |
			  HSW_IN_TX_CHECKPOINTED))) {
		config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc),
							       event_select,
							       unit_mask);
		if (config != PERF_COUNT_HW_MAX)
			type = PERF_TYPE_HARDWARE;
	}

	if (type == PERF_TYPE_RAW)
		config = eventsel & X86_RAW_EVENT_MASK;

	pmc_reprogram_counter(pmc, type, config,
			      !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
			      !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
			      eventsel & ARCH_PERFMON_EVENTSEL_INT,
			      (eventsel & HSW_IN_TX),
			      (eventsel & HSW_IN_TX_CHECKPOINTED));
}
EXPORT_SYMBOL_GPL(reprogram_gp_counter);
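
/*
 * Illustrative note: ctrl is the 4-bit nibble for this fixed counter taken
 * from the guest's MSR_CORE_PERF_FIXED_CTR_CTRL. Within that nibble, bit 0
 * enables counting in ring 0, bit 1 enables counting in rings above 0, and
 * bit 3 requests a PMI on overflow; bit 2 (AnyThread) is not used here.
 */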
void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
{
	unsigned en_field = ctrl & 0x3;
	bool pmi = ctrl & 0x8;

	pmc_stop_counter(pmc);

	if (!en_field || !pmc_is_enabled(pmc))
		return;

	pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
			      kvm_x86_ops->pmu_ops->find_fixed_event(idx),
			      !(en_field & 0x2), /* exclude user */
			      !(en_field & 0x1), /* exclude kernel */
			      pmi, false, false);
}
EXPORT_SYMBOL_GPL(reprogram_fixed_counter);

void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
{
	struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);

	if (!pmc)
		return;

	if (pmc_is_gp(pmc))
		reprogram_gp_counter(pmc, pmc->eventsel);
	else {
		int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
		u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);

		reprogram_fixed_counter(pmc, ctrl, idx);
	}
}
EXPORT_SYMBOL_GPL(reprogram_counter);

void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u64 bitmask;
	int bit;

	bitmask = pmu->reprogram_pmi;

	for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
		struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit);

		if (unlikely(!pmc || !pmc->perf_event)) {
			clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
			continue;
		}

		reprogram_counter(pmu, bit);
	}
}

/* check if idx is a valid index to access PMU */
int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
{
	return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boot_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boot_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}
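
/*
 * Illustrative note: RDPMC's "fast read" convention is assumed below --
 * when bit 31 of the guest's ECX is set, only the low 32 bits of the
 * counter are returned, which is why the mask is narrowed to ~0u.
 */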
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	bool fast_mode = idx & (1u << 31);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = fast_mode ? ~0u : ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx, &mask);
	if (!pmc)
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu))
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
{
	return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data);
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info);
}

/* refresh PMU settings. This function generally is called when underlying
 * settings are changed (such as changes of PMU CPUID by guest VMs), which
 * should rarely happen.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->pmu_ops->refresh(vcpu);
}

void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	irq_work_sync(&pmu->irq_work);
	kvm_x86_ops->pmu_ops->reset(vcpu);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	kvm_x86_ops->pmu_ops->init(vcpu);
	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
	kvm_pmu_refresh(vcpu);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}