// SPDX-License-Identifier: GPL-2.0
/*
 * KVM dirty page logging test
 *
 * Copyright (C) 2018, Red Hat, Inc.
 */

#define _GNU_SOURCE /* for program_invocation_name */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>
#include <semaphore.h>
#include <sys/types.h>
#include <signal.h>
#include <errno.h>
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <asm/barrier.h>

#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"

#define VCPU_ID				1

/* The memory slot index to track dirty pages */
#define TEST_MEM_SLOT_INDEX		1

/* Default guest test virtual memory offset */
#define DEFAULT_GUEST_TEST_MEM		0xc0000000

/* How many pages to dirty for each guest loop */
#define TEST_PAGES_PER_LOOP		1024

/* How many host loops to run (one KVM_GET_DIRTY_LOG for each loop) */
#define TEST_HOST_LOOP_N		32UL

/* Interval for each host loop (ms) */
#define TEST_HOST_LOOP_INTERVAL		10UL

/* Dirty bitmaps are always little endian, so we need to swap on big endian */
#if defined(__s390x__)
# define BITOP_LE_SWIZZLE	((BITS_PER_LONG-1) & ~0x7)
# define test_bit_le(nr, addr) \
		test_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define set_bit_le(nr, addr) \
		set_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define clear_bit_le(nr, addr) \
		clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define test_and_set_bit_le(nr, addr) \
		test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define test_and_clear_bit_le(nr, addr) \
		test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
#else
# define test_bit_le		test_bit
# define set_bit_le		set_bit
# define clear_bit_le		clear_bit
# define test_and_set_bit_le	test_and_set_bit
# define test_and_clear_bit_le	test_and_clear_bit
#endif

#define TEST_DIRTY_RING_COUNT		65536

#define SIG_IPI SIGUSR1

/*
 * Guest/Host shared variables. Ensure addr_gva2hva() and/or
 * sync_global_to/from_guest() are used when accessing from
 * the host. READ/WRITE_ONCE() should also be used with anything
 * that may change.
 */
static uint64_t host_page_size;
static uint64_t guest_page_size;
static uint64_t guest_num_pages;
static uint64_t random_array[TEST_PAGES_PER_LOOP];
static uint64_t iteration;

/*
 * Guest physical memory offset of the testing memory slot.
 * This will be set to the topmost valid physical address minus
 * the test memory size.
 */
static uint64_t guest_test_phys_mem;

/*
 * Guest virtual memory offset of the testing memory slot.
 * Must not conflict with identity mapped test code.
 */
static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;

/*
 * Continuously write to the first 8 bytes of random pages within the
 * testing memory region.
 */
static void guest_code(void)
{
	uint64_t addr;
	int i;

	/*
	 * On s390x, all pages of a 1M segment are initially marked as dirty
	 * when a page of the segment is written to for the very first time.
	 * To compensate for this specialty in this test, we need to touch
	 * all pages during the first iteration.
	 */
	for (i = 0; i < guest_num_pages; i++) {
		addr = guest_test_virt_mem + i * guest_page_size;
		*(uint64_t *)addr = READ_ONCE(iteration);
	}

	while (true) {
		for (i = 0; i < TEST_PAGES_PER_LOOP; i++) {
			addr = guest_test_virt_mem;
			addr += (READ_ONCE(random_array[i]) % guest_num_pages)
				* guest_page_size;
			addr &= ~(host_page_size - 1);
			*(uint64_t *)addr = READ_ONCE(iteration);
		}

		/* Tell the host that we need more random numbers */
		GUEST_SYNC(1);
	}
}

/* Host variables */
static bool host_quit;

/* Points to the test VM memory region on which we track dirty logs */
static void *host_test_mem;
static uint64_t host_num_pages;

/* For statistics only */
static uint64_t host_dirty_count;
static uint64_t host_clear_count;
static uint64_t host_track_next_count;

/* Whether dirty ring reset is requested, or finished */
static sem_t dirty_ring_vcpu_stop;
static sem_t dirty_ring_vcpu_cont;

/*
 * This is updated by the vcpu thread to tell the host whether it's a
 * ring-full event.  It should only be read after a sem_wait() of
 * dirty_ring_vcpu_stop and before the vcpu continues to run.
 */
static bool dirty_ring_vcpu_ring_full;

/*
 * This is only used for verifying the dirty pages.  The dirty ring has a
 * tricky case when it just got full: KVM does a userspace exit due to the
 * ring being full.  When that happens, the very last PFN is set, but the
 * data is not actually changed (the guest WRITE is not really applied
 * yet), because we found that the dirty ring is full, refused to continue
 * the vcpu, and recorded the dirty gfn with the old contents.
 *
 * For this specific case, it's safe to skip checking this pfn for this
 * bit, because it's a redundant bit, and when the write happens later the
 * bit will be set again.  We use this variable to always keep track of the
 * latest dirty gfn we've collected, so that if a mismatch of data is found
 * later in the verifying process, we let it pass.
 */
static uint64_t dirty_ring_last_page;

enum log_mode_t {
	/* Only use KVM_GET_DIRTY_LOG for logging */
	LOG_MODE_DIRTY_LOG = 0,

	/* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */
	LOG_MODE_CLEAR_LOG = 1,

	/* Use dirty ring for logging */
	LOG_MODE_DIRTY_RING = 2,

	LOG_MODE_NUM,

	/* Run all supported modes */
	LOG_MODE_ALL = LOG_MODE_NUM,
};

/* Mode of logging to test.  Default is to run all supported modes */
static enum log_mode_t host_log_mode_option = LOG_MODE_ALL;
/* Logging mode for current run */
static enum log_mode_t host_log_mode;
static pthread_t vcpu_thread;
static uint32_t test_dirty_ring_count = TEST_DIRTY_RING_COUNT;

static void vcpu_kick(void)
{
	pthread_kill(vcpu_thread, SIG_IPI);
}

/*
 * In our test we do signal tricks, let's use a better version of
 * sem_wait to avoid signal interrupts
 */
static void sem_wait_until(sem_t *sem)
{
	int ret;

	do
		ret = sem_wait(sem);
	while (ret == -1 && errno == EINTR);
}

static bool clear_log_supported(void)
{
	return kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
}

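/*
 * Enable manual dirty log protection for the VM, so that
 * KVM_GET_DIRTY_LOG no longer write-protects the returned pages and the
 * test must clear them explicitly with KVM_CLEAR_DIRTY_LOG.
 */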
static void clear_log_create_vm_done(struct kvm_vm *vm)
{
	struct kvm_enable_cap cap = {};
	u64 manual_caps;

	manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
	TEST_ASSERT(manual_caps, "MANUAL_CAPS is zero!");
	manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
			KVM_DIRTY_LOG_INITIALLY_SET);
	cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2;
	cap.args[0] = manual_caps;
	vm_enable_cap(vm, &cap);
}

static void dirty_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
					  void *bitmap, uint32_t num_pages)
{
	kvm_vm_get_dirty_log(vm, slot, bitmap);
}

static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
					  void *bitmap, uint32_t num_pages)
{
	kvm_vm_get_dirty_log(vm, slot, bitmap);
	kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages);
}

static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
{
	struct kvm_run *run = vcpu_state(vm, VCPU_ID);

	TEST_ASSERT(ret == 0 || (ret == -1 && err == EINTR),
		    "vcpu run failed: errno=%d", err);

	TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC,
		    "Invalid guest sync status: exit_reason=%s\n",
		    exit_reason_str(run->exit_reason));
}

static bool dirty_ring_supported(void)
{
	return kvm_check_cap(KVM_CAP_DIRTY_LOG_RING);
}

static void dirty_ring_create_vm_done(struct kvm_vm *vm)
{
	/*
	 * Switch to dirty ring mode after VM creation but before any
	 * of the vcpus are created.
	 */
	vm_enable_dirty_ring(vm, test_dirty_ring_count *
			     sizeof(struct kvm_dirty_gfn));
}

static inline bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
{
	return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
}

static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
{
	gfn->flags = KVM_DIRTY_GFN_F_RESET;
}

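/*
 * Harvest the vcpu's dirty ring starting at *fetch_index: mark every
 * dirtied gfn in the caller's bitmap, flag the ring entry as collected,
 * and return how many entries were harvested.
 */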
static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns,
				       int slot, void *bitmap,
				       uint32_t num_pages, uint32_t *fetch_index)
{
	struct kvm_dirty_gfn *cur;
	uint32_t count = 0;

	while (true) {
		cur = &dirty_gfns[*fetch_index % test_dirty_ring_count];
		if (!dirty_gfn_is_dirtied(cur))
			break;
		TEST_ASSERT(cur->slot == slot, "Slot number didn't match: "
			    "%u != %u", cur->slot, slot);
		TEST_ASSERT(cur->offset < num_pages, "Offset overflow: "
			    "0x%llx >= 0x%x", cur->offset, num_pages);
		//pr_info("fetch 0x%x page %llu\n", *fetch_index, cur->offset);
		set_bit_le(cur->offset, bitmap);
		dirty_ring_last_page = cur->offset;
		dirty_gfn_set_collected(cur);
		(*fetch_index)++;
		count++;
	}

	return count;
}

static void dirty_ring_wait_vcpu(void)
{
	/* This makes sure that the hardware PML cache is flushed */
	vcpu_kick();
	sem_wait_until(&dirty_ring_vcpu_stop);
}

static void dirty_ring_continue_vcpu(void)
{
	pr_info("Notifying vcpu to continue\n");
	sem_post(&dirty_ring_vcpu_cont);
}

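/*
 * Collect dirty pages in dirty-ring mode: stop the vcpu, harvest its
 * ring into the bitmap, reset the ring, then let the vcpu continue.
 */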
static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot,
					   void *bitmap, uint32_t num_pages)
{
	/* We only have one vcpu */
	static uint32_t fetch_index = 0;
	uint32_t count = 0, cleared;
	bool continued_vcpu = false;

	dirty_ring_wait_vcpu();

	if (!dirty_ring_vcpu_ring_full) {
		/*
		 * This is not a ring-full event, it's safe to allow
		 * vcpu to continue
		 */
		dirty_ring_continue_vcpu();
		continued_vcpu = true;
	}

	/* Only have one vcpu */
	count = dirty_ring_collect_one(vcpu_map_dirty_ring(vm, VCPU_ID),
				       slot, bitmap, num_pages, &fetch_index);

	cleared = kvm_vm_reset_dirty_ring(vm);

	/* Cleared pages should be the same as collected */
	TEST_ASSERT(cleared == count, "Reset dirty pages (%u) mismatch "
		    "with collected (%u)", cleared, count);

	if (!continued_vcpu) {
		TEST_ASSERT(dirty_ring_vcpu_ring_full,
			    "Didn't continue vcpu even without ring full");
		dirty_ring_continue_vcpu();
	}

	pr_info("Iteration %ld collected %u pages\n", iteration, count);
}

static void dirty_ring_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
{
	struct kvm_run *run = vcpu_state(vm, VCPU_ID);

	/* A ucall-sync or ring-full event is allowed */
	if (get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC) {
		/* We should allow this to continue */
		;
	} else if (run->exit_reason == KVM_EXIT_DIRTY_RING_FULL ||
		   (ret == -1 && err == EINTR)) {
		/* Update the flag first before pause */
		WRITE_ONCE(dirty_ring_vcpu_ring_full,
			   run->exit_reason == KVM_EXIT_DIRTY_RING_FULL);
		sem_post(&dirty_ring_vcpu_stop);
		pr_info("vcpu stops because %s...\n",
			dirty_ring_vcpu_ring_full ?
			"dirty ring is full" : "vcpu is kicked out");
		sem_wait_until(&dirty_ring_vcpu_cont);
		pr_info("vcpu continues now.\n");
	} else {
		TEST_ASSERT(false, "Invalid guest sync status: "
			    "exit_reason=%s\n",
			    exit_reason_str(run->exit_reason));
	}
}

static void dirty_ring_before_vcpu_join(void)
{
	/* Kick another round of vcpu just to make sure it will quit */
	sem_post(&dirty_ring_vcpu_cont);
}

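/* The set of hooks each dirty logging backend must (or may) provide */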
struct log_mode {
	const char *name;
	/* Return true if this mode is supported, otherwise false */
	bool (*supported)(void);
	/* Hook when the vm creation is done (before vcpu creation) */
	void (*create_vm_done)(struct kvm_vm *vm);
	/* Hook to collect the dirty pages into the bitmap provided */
	void (*collect_dirty_pages)(struct kvm_vm *vm, int slot,
				    void *bitmap, uint32_t num_pages);
	/* Hook to call after each vcpu run */
	void (*after_vcpu_run)(struct kvm_vm *vm, int ret, int err);
	void (*before_vcpu_join)(void);
} log_modes[LOG_MODE_NUM] = {
	{
		.name = "dirty-log",
		.collect_dirty_pages = dirty_log_collect_dirty_pages,
		.after_vcpu_run = default_after_vcpu_run,
	},
	{
		.name = "clear-log",
		.supported = clear_log_supported,
		.create_vm_done = clear_log_create_vm_done,
		.collect_dirty_pages = clear_log_collect_dirty_pages,
		.after_vcpu_run = default_after_vcpu_run,
	},
	{
		.name = "dirty-ring",
		.supported = dirty_ring_supported,
		.create_vm_done = dirty_ring_create_vm_done,
		.collect_dirty_pages = dirty_ring_collect_dirty_pages,
		.before_vcpu_join = dirty_ring_before_vcpu_join,
		.after_vcpu_run = dirty_ring_after_vcpu_run,
	},
};

/*
 * We use this bitmap to track pages that should have their dirty bit set
 * in the _next_ iteration.  For example, if we detected that a page value
 * changed to the current iteration but at the same time the page bit is
 * cleared in the latest bitmap, then the system must report that write in
 * the next get dirty log call.
 */
static unsigned long *host_bmap_track;

static void log_modes_dump(void)
{
	int i;

	printf("all");
	for (i = 0; i < LOG_MODE_NUM; i++)
		printf(", %s", log_modes[i].name);
	printf("\n");
}

static bool log_mode_supported(void)
{
	struct log_mode *mode = &log_modes[host_log_mode];

	if (mode->supported)
		return mode->supported();

	return true;
}

static void log_mode_create_vm_done(struct kvm_vm *vm)
{
	struct log_mode *mode = &log_modes[host_log_mode];

	if (mode->create_vm_done)
		mode->create_vm_done(vm);
}

static void log_mode_collect_dirty_pages(struct kvm_vm *vm, int slot,
					 void *bitmap, uint32_t num_pages)
{
	struct log_mode *mode = &log_modes[host_log_mode];

	TEST_ASSERT(mode->collect_dirty_pages != NULL,
		    "collect_dirty_pages() is required for any log mode!");

	mode->collect_dirty_pages(vm, slot, bitmap, num_pages);
}

static void log_mode_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
{
	struct log_mode *mode = &log_modes[host_log_mode];

	if (mode->after_vcpu_run)
		mode->after_vcpu_run(vm, ret, err);
}

static void log_mode_before_vcpu_join(void)
{
	struct log_mode *mode = &log_modes[host_log_mode];

	if (mode->before_vcpu_join)
		mode->before_vcpu_join();
}

static void generate_random_array(uint64_t *guest_array, uint64_t size)
{
	uint64_t i;

	for (i = 0; i < size; i++)
		guest_array[i] = random();
}

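/*
 * The vcpu thread: feed the guest fresh random page indexes and keep
 * calling KVM_RUN until the main thread asks us to quit.
 */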
static void *vcpu_worker(void *data)
{
	int ret, vcpu_fd;
	struct kvm_vm *vm = data;
	uint64_t *guest_array;
	uint64_t pages_count = 0;
	struct kvm_signal_mask *sigmask = alloca(offsetof(struct kvm_signal_mask, sigset)
						 + sizeof(sigset_t));
	sigset_t *sigset = (sigset_t *) &sigmask->sigset;

	vcpu_fd = vcpu_get_fd(vm, VCPU_ID);

	/*
	 * SIG_IPI is unblocked atomically while in KVM_RUN.  It causes the
	 * ioctl to return with -EINTR, but it is still pending and we need
	 * to accept it with the sigwait.
	 */
	sigmask->len = 8;
	pthread_sigmask(0, NULL, sigset);
	vcpu_ioctl(vm, VCPU_ID, KVM_SET_SIGNAL_MASK, sigmask);
	sigaddset(sigset, SIG_IPI);
	pthread_sigmask(SIG_BLOCK, sigset, NULL);

	sigemptyset(sigset);
	sigaddset(sigset, SIG_IPI);

	guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array);

	while (!READ_ONCE(host_quit)) {
		/* Clear any existing kick signals */
		generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
		pages_count += TEST_PAGES_PER_LOOP;
		/* Let the guest dirty the random pages */
		ret = ioctl(vcpu_fd, KVM_RUN, NULL);
		if (ret == -1 && errno == EINTR) {
			int sig = -1;

			sigwait(sigset, &sig);
			assert(sig == SIG_IPI);
		}
		log_mode_after_vcpu_run(vm, ret, errno);
	}

	pr_info("Dirtied %"PRIu64" pages\n", pages_count);

	return NULL;
}

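/*
 * Verify one iteration of dirty logging: every page reported dirty must
 * hold the current or previous iteration value, and every clean page
 * must hold a value no newer than the current iteration.
 */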
static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap)
{
	uint64_t step = vm_num_host_pages(mode, 1);
	uint64_t page;
	uint64_t *value_ptr;
	uint64_t min_iter = 0;

	for (page = 0; page < host_num_pages; page += step) {
		value_ptr = host_test_mem + page * host_page_size;

		/* If this is a special page that we were tracking... */
		if (test_and_clear_bit_le(page, host_bmap_track)) {
			host_track_next_count++;
			TEST_ASSERT(test_bit_le(page, bmap),
				    "Page %"PRIu64" should have its dirty bit "
				    "set in this iteration but it is missing",
				    page);
		}

		if (test_and_clear_bit_le(page, bmap)) {
			bool matched;

			host_dirty_count++;

			/*
			 * If the bit is set, the value written onto
			 * the corresponding page should be either the
			 * previous iteration number or the current one.
			 */
			matched = (*value_ptr == iteration ||
				   *value_ptr == iteration - 1);

			if (host_log_mode == LOG_MODE_DIRTY_RING && !matched) {
				if (*value_ptr == iteration - 2 && min_iter <= iteration - 2) {
					/*
					 * Short answer: this case is special
					 * only for dirty ring test where the
					 * page is the last page before a kvm
					 * dirty ring full in iteration N-2.
					 *
					 * Long answer: Assuming ring size R,
					 * one possible condition is:
					 *
					 *      main thr       vcpu thr
					 *      --------       --------
					 *    iter=1
					 *                   write 1 to page 0~(R-1)
					 *                   full, vmexit
					 *    collect 0~(R-1)
					 *    kick vcpu
					 *                   write 1 to (R-1)~(2R-2)
					 *                   full, vmexit
					 *    iter=2
					 *    collect (R-1)~(2R-2)
					 *    kick vcpu
					 *                   write 1 to (2R-2)
					 *                   (NOTE!!! "1" cached in cpu reg)
					 *                   write 2 to (2R-1)~(3R-3)
					 *                   full, vmexit
					 *    iter=3
					 *    collect (2R-2)~(3R-3)
					 *    (here if we read value on page
					 *     "2R-2" is 1, while iter=3!!!)
					 *
					 * This however can only happen once per iteration.
					 */
					min_iter = iteration - 1;
					continue;
				} else if (page == dirty_ring_last_page) {
					/*
					 * Please refer to comments in
					 * dirty_ring_last_page.
					 */
					continue;
				}
			}

			TEST_ASSERT(matched,
				    "Set page %"PRIu64" value %"PRIu64
				    " incorrect (iteration=%"PRIu64")",
				    page, *value_ptr, iteration);
		} else {
			host_clear_count++;
			/*
			 * If cleared, the value written can be any
			 * value smaller than or equal to the iteration
			 * number.  Note that the value can be exactly
			 * (iteration-1) if that write can happen
			 * like this:
			 *
			 * (1) increase loop count to "iteration-1"
			 * (2) write to page P happens (with value
			 *     "iteration-1")
			 * (3) get dirty log for "iteration-1"; we'll
			 *     see that page P bit is set (dirtied),
			 *     and not set the bit in host_bmap_track
			 * (4) increase loop count to "iteration"
			 *     (which is current iteration)
			 * (5) get dirty log for current iteration,
			 *     we'll see that page P is cleared, with
			 *     value "iteration-1".
			 */
			TEST_ASSERT(*value_ptr <= iteration,
				    "Clear page %"PRIu64" value %"PRIu64
				    " incorrect (iteration=%"PRIu64")",
				    page, *value_ptr, iteration);
			if (*value_ptr == iteration) {
				/*
				 * This page is _just_ modified; it
				 * should report its dirtiness in the
				 * next run
				 */
				set_bit_le(page, host_bmap_track);
			}
		}
	}
}

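/*
 * Create a VM with a single vcpu running guest_code, leaving room in the
 * page tables for the extra test memory, and give the active log mode a
 * chance to configure the VM before the vcpu is added.
 */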
static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
				uint64_t extra_mem_pages, void *guest_code)
{
	struct kvm_vm *vm;
	uint64_t extra_pg_pages = extra_mem_pages / 512 * 2;

	pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));

	vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
	kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
#ifdef __x86_64__
	vm_create_irqchip(vm);
#endif
	log_mode_create_vm_done(vm);
	vm_vcpu_add_default(vm, vcpuid, guest_code);
	return vm;
}

#define DIRTY_MEM_BITS 30 /* 1G */
#define PAGE_SHIFT_4K  12

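/*
 * Run the dirty logging test for one guest mode with the currently
 * selected host_log_mode: set up the test memslot, spawn the vcpu
 * thread, and collect/verify the dirty log once per interval.
 */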
static void run_test(enum vm_guest_mode mode, unsigned long iterations,
		     unsigned long interval, uint64_t phys_offset)
{
	struct kvm_vm *vm;
	unsigned long *bmap;

	if (!log_mode_supported()) {
		print_skip("Log mode '%s' not supported",
			   log_modes[host_log_mode].name);
		return;
	}

	/*
	 * We reserve page tables for twice the extra dirty memory, which
	 * will definitely cover the original (1G+) test range.  Here we do
	 * the calculation with 4K page size, which is the smallest, so the
	 * page number will be enough for all archs (e.g., a 64K page size
	 * guest will need even less memory for page tables).
	 */
	vm = create_vm(mode, VCPU_ID,
		       2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K),
		       guest_code);

	guest_page_size = vm_get_page_size(vm);
	/*
	 * A little more than 1G of guest page sized pages.  Cover the
	 * case where the size is not aligned to 64 pages.
	 */
	guest_num_pages = (1ul << (DIRTY_MEM_BITS -
				   vm_get_page_shift(vm))) + 3;
	guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);

	host_page_size = getpagesize();
	host_num_pages = vm_num_host_pages(mode, guest_num_pages);

	if (!phys_offset) {
		guest_test_phys_mem = (vm_get_max_gfn(vm) -
				       guest_num_pages) * guest_page_size;
		guest_test_phys_mem &= ~(host_page_size - 1);
	} else {
		guest_test_phys_mem = phys_offset;
	}

#ifdef __s390x__
	/* Align to 1M (segment size) */
	guest_test_phys_mem &= ~((1 << 20) - 1);
#endif

	pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);

	bmap = bitmap_alloc(host_num_pages);
	host_bmap_track = bitmap_alloc(host_num_pages);

	/* Add an extra memory slot for testing dirty logging */
	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
				    guest_test_phys_mem,
				    TEST_MEM_SLOT_INDEX,
				    guest_num_pages,
				    KVM_MEM_LOG_DIRTY_PAGES);

	/* Do mapping for the dirty track memory slot */
	virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);

	/* Cache the HVA pointer of the region */
	host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);

	ucall_init(vm, NULL);

	/* Export the shared variables to the guest */
	sync_global_to_guest(vm, host_page_size);
	sync_global_to_guest(vm, guest_page_size);
	sync_global_to_guest(vm, guest_test_virt_mem);
	sync_global_to_guest(vm, guest_num_pages);

	/* Start the iterations */
	iteration = 1;
	sync_global_to_guest(vm, iteration);
	host_quit = false;
	host_dirty_count = 0;
	host_clear_count = 0;
	host_track_next_count = 0;

	pthread_create(&vcpu_thread, NULL, vcpu_worker, vm);

	while (iteration < iterations) {
		/* Give the vcpu thread some time to dirty some pages */
		usleep(interval * 1000);
		log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX,
					     bmap, host_num_pages);
		vm_dirty_log_verify(mode, bmap);
		iteration++;
		sync_global_to_guest(vm, iteration);
	}

	/* Tell the vcpu thread to quit */
	host_quit = true;
	log_mode_before_vcpu_join();
	pthread_join(vcpu_thread, NULL);

	pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), "
		"track_next (%"PRIu64")\n", host_dirty_count, host_clear_count,
		host_track_next_count);

	free(bmap);
	free(host_bmap_track);
	ucall_uninit(vm);
	kvm_vm_free(vm);
}

struct guest_mode {
	bool supported;
	bool enabled;
};

static struct guest_mode guest_modes[NUM_VM_MODES];

#define guest_mode_init(mode, supported, enabled) ({ \
	guest_modes[mode] = (struct guest_mode){ supported, enabled }; \
})

static void help(char *name)
{
	int i;

	puts("");
	printf("usage: %s [-h] [-i iterations] [-I interval] "
	       "[-p offset] [-m mode]\n", name);
	puts("");
	printf(" -c: specify dirty ring size, in number of entries\n");
	printf("     (only useful for dirty-ring test; default: %"PRIu32")\n",
	       TEST_DIRTY_RING_COUNT);
	printf(" -i: specify iteration counts (default: %"PRIu64")\n",
	       TEST_HOST_LOOP_N);
	printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n",
	       TEST_HOST_LOOP_INTERVAL);
	printf(" -p: specify guest physical test memory offset\n"
	       "     Warning: a low offset can conflict with the loaded test code.\n");
	printf(" -M: specify the host logging mode "
	       "(default: run all log modes).  Supported modes:\n\t");
	log_modes_dump();
	printf(" -m: specify the guest mode ID to test "
	       "(default: test all supported modes)\n"
	       "     This option may be used multiple times.\n"
	       "     Guest mode IDs:\n");
	for (i = 0; i < NUM_VM_MODES; ++i) {
		printf("     %d:    %s%s\n", i, vm_guest_mode_string(i),
		       guest_modes[i].supported ? " (supported)" : "");
	}
	puts("");
	exit(0);
}

int main(int argc, char *argv[])
{
	unsigned long iterations = TEST_HOST_LOOP_N;
	unsigned long interval = TEST_HOST_LOOP_INTERVAL;
	bool mode_selected = false;
	uint64_t phys_offset = 0;
	unsigned int mode;
	int opt, i, j;

	sem_init(&dirty_ring_vcpu_stop, 0, 0);
	sem_init(&dirty_ring_vcpu_cont, 0, 0);

#ifdef __x86_64__
	guest_mode_init(VM_MODE_PXXV48_4K, true, true);
#endif
#ifdef __aarch64__
	guest_mode_init(VM_MODE_P40V48_4K, true, true);
	guest_mode_init(VM_MODE_P40V48_64K, true, true);

	{
		unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);

		if (limit >= 52)
			guest_mode_init(VM_MODE_P52V48_64K, true, true);
		if (limit >= 48) {
			guest_mode_init(VM_MODE_P48V48_4K, true, true);
			guest_mode_init(VM_MODE_P48V48_64K, true, true);
		}
	}
#endif
#ifdef __s390x__
	guest_mode_init(VM_MODE_P40V48_4K, true, true);
#endif

	while ((opt = getopt(argc, argv, "c:hi:I:p:m:M:")) != -1) {
		switch (opt) {
		case 'c':
			test_dirty_ring_count = strtol(optarg, NULL, 10);
			break;
		case 'i':
			iterations = strtol(optarg, NULL, 10);
			break;
		case 'I':
			interval = strtol(optarg, NULL, 10);
			break;
		case 'p':
			phys_offset = strtoull(optarg, NULL, 0);
			break;
		case 'm':
			if (!mode_selected) {
				for (i = 0; i < NUM_VM_MODES; ++i)
					guest_modes[i].enabled = false;
				mode_selected = true;
			}
			mode = strtoul(optarg, NULL, 10);
			TEST_ASSERT(mode < NUM_VM_MODES,
				    "Guest mode ID %d too big", mode);
			guest_modes[mode].enabled = true;
			break;
		case 'M':
			if (!strcmp(optarg, "all")) {
				host_log_mode_option = LOG_MODE_ALL;
				break;
			}
			for (i = 0; i < LOG_MODE_NUM; i++) {
				if (!strcmp(optarg, log_modes[i].name)) {
					pr_info("Setting log mode to: '%s'\n",
						optarg);
					host_log_mode_option = i;
					break;
				}
			}
			if (i == LOG_MODE_NUM) {
				printf("Log mode '%s' invalid. Please choose "
				       "from: ", optarg);
				log_modes_dump();
				exit(1);
			}
			break;
		case 'h':
		default:
			help(argv[0]);
			break;
		}
	}

	TEST_ASSERT(iterations > 2, "Iterations must be greater than two");
	TEST_ASSERT(interval > 0, "Interval must be greater than zero");

	pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n",
		iterations, interval);

	srandom(time(0));

	for (i = 0; i < NUM_VM_MODES; ++i) {
		if (!guest_modes[i].enabled)
			continue;
		TEST_ASSERT(guest_modes[i].supported,
			    "Guest mode ID %d (%s) not supported.",
			    i, vm_guest_mode_string(i));
		if (host_log_mode_option == LOG_MODE_ALL) {
			/* Run each log mode */
			for (j = 0; j < LOG_MODE_NUM; j++) {
				pr_info("Testing Log Mode '%s'\n",
					log_modes[j].name);
				host_log_mode = j;
				run_test(i, iterations, interval, phys_offset);
			}
		} else {
			host_log_mode = host_log_mode_option;
			run_test(i, iterations, interval, phys_offset);
		}
	}

	return 0;
}