2007-07-26 21:41:02 +04:00
/*P:400 This contains run_guest() which actually calls into the Host<->Guest
 * Switcher and analyzes the return, such as determining if the Guest wants the
 * Host to do something.  This file also contains useful helper routines, and a
 * couple of non-obvious setup and teardown pieces which were implemented after
 * days of debugging pain. :*/
2007-07-19 12:49:23 +04:00
# include <linux/module.h>
# include <linux/stringify.h>
# include <linux/stddef.h>
# include <linux/io.h>
# include <linux/mm.h>
# include <linux/vmalloc.h>
# include <linux/cpu.h>
# include <linux/freezer.h>
# include <asm/paravirt.h>
# include <asm/desc.h>
# include <asm/pgtable.h>
# include <asm/uaccess.h>
# include <asm/poll.h>
# include <asm/highmem.h>
# include <asm/asm-offsets.h>
# include <asm/i387.h>
# include "lg.h"
/* Found in switcher.S: the bounds of the Switcher text, its switch_to_guest
 * entry point, and the table of its interrupt handler addresses. */
extern char start_switcher_text [ ] , end_switcher_text [ ] , switch_to_guest [ ] ;
extern unsigned long default_idt_entries [ ] ;
/* Every guest maps the core switcher code: this is the size of that code,
 * rounded up to whole pages. */
# define SHARED_SWITCHER_PAGES \
DIV_ROUND_UP ( end_switcher_text - start_switcher_text , PAGE_SIZE )
/* Pages for switcher itself, then two pages per cpu */
# define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
/* We map at -4M for ease of mapping into the guest (one PTE page). */
# define SWITCHER_ADDR 0xFFC00000
/* The virtual area reserved at SWITCHER_ADDR by map_switcher(). */
static struct vm_struct * switcher_vma ;
/* Array of TOTAL_SWITCHER_PAGES page pointers mapped into that area. */
static struct page * * switcher_page ;
/* Set by init() when it turns PGE off, so fini() knows to turn it back on. */
static int cpu_had_pge ;
/* Operand for the "lcall" in run_guest_once(): map_switcher() fills it with
 * the relocated address of switch_to_guest and the LGUEST_CS selector. */
static struct {
unsigned long offset ;
unsigned short segment ;
} lguest_entry ;
/* This One Big lock protects all inter-guest data structures. */
DEFINE_MUTEX ( lguest_lock ) ;
/* The Guest whose state was last copied into each CPU's pages; checked by
 * copy_in_guest_info() to decide whether everything must be re-copied. */
static DEFINE_PER_CPU ( struct lguest * , last_guest ) ;
/* FIXME: Make dynamic. */
# define MAX_LGUEST_GUESTS 16
/* Fixed table of Guest slots; find_free_guest() treats a slot whose ->tsk is
 * unset as free. */
struct lguest lguests [ MAX_LGUEST_GUESTS ] ;
/* Distance from where switcher.S was linked to where map_switcher() copied it:
 * add this to a compiled-in Switcher address to get its high-mapped twin. */
static unsigned long switcher_offset(void)
{
	unsigned long linked_at = (unsigned long)start_switcher_text;

	return SWITCHER_ADDR - linked_at;
}
/* This cpu's struct lguest_pages. */
static struct lguest_pages * lguest_pages ( unsigned int cpu )
{
return & ( ( ( struct lguest_pages * )
( SWITCHER_ADDR + SHARED_SWITCHER_PAGES * PAGE_SIZE ) ) [ cpu ] ) ;
}
2007-07-26 21:41:04 +04:00
/*H:010 We need to set up the Switcher at a high virtual address. Remember the
* Switcher is a few hundred bytes of assembler code which actually changes the
* CPU to run the Guest , and then changes back to the Host when a trap or
* interrupt happens .
*
* The Switcher code must be at the same virtual address in the Guest as the
* Host since it will be running as the switchover occurs .
*
* Trying to map memory at a particular address is an unusual thing to do , so
* it ' s not a simple one - liner . We also set up the per - cpu parts of the
* Switcher here .
*/
2007-07-19 12:49:23 +04:00
static __init int map_switcher ( void )
{
int i , err ;
struct page * * pagep ;
2007-07-26 21:41:04 +04:00
/*
* Map the Switcher in to high memory .
*
* It turns out that if we choose the address 0xFFC00000 ( 4 MB under the
* top virtual address ) , it makes setting up the page tables really
* easy .
*/
/* We allocate an array of "struct page"s. map_vm_area() wants the
* pages in this form , rather than just an array of pointers . */
2007-07-19 12:49:23 +04:00
switcher_page = kmalloc ( sizeof ( switcher_page [ 0 ] ) * TOTAL_SWITCHER_PAGES ,
GFP_KERNEL ) ;
if ( ! switcher_page ) {
err = - ENOMEM ;
goto out ;
}
2007-07-26 21:41:04 +04:00
/* Now we actually allocate the pages. The Guest will see these pages,
* so we make sure they ' re zeroed . */
2007-07-19 12:49:23 +04:00
for ( i = 0 ; i < TOTAL_SWITCHER_PAGES ; i + + ) {
unsigned long addr = get_zeroed_page ( GFP_KERNEL ) ;
if ( ! addr ) {
err = - ENOMEM ;
goto free_some_pages ;
}
switcher_page [ i ] = virt_to_page ( addr ) ;
}
2007-07-26 21:41:04 +04:00
/* Now we reserve the "virtual memory area" we want: 0xFFC00000
* ( SWITCHER_ADDR ) . We might not get it in theory , but in practice
* it ' s worked so far . */
2007-07-19 12:49:23 +04:00
switcher_vma = __get_vm_area ( TOTAL_SWITCHER_PAGES * PAGE_SIZE ,
VM_ALLOC , SWITCHER_ADDR , VMALLOC_END ) ;
if ( ! switcher_vma ) {
err = - ENOMEM ;
printk ( " lguest: could not map switcher pages high \n " ) ;
goto free_pages ;
}
2007-07-26 21:41:04 +04:00
/* This code actually sets up the pages we've allocated to appear at
* SWITCHER_ADDR . map_vm_area ( ) takes the vma we allocated above , the
* kind of pages we ' re mapping ( kernel pages ) , and a pointer to our
* array of struct pages . It increments that pointer , but we don ' t
* care . */
2007-07-19 12:49:23 +04:00
pagep = switcher_page ;
err = map_vm_area ( switcher_vma , PAGE_KERNEL , & pagep ) ;
if ( err ) {
printk ( " lguest: map_vm_area failed: %i \n " , err ) ;
goto free_vma ;
}
2007-07-26 21:41:04 +04:00
/* Now the switcher is mapped at the right address, we can't fail!
* Copy in the compiled - in Switcher code ( from switcher . S ) . */
2007-07-19 12:49:23 +04:00
memcpy ( switcher_vma - > addr , start_switcher_text ,
end_switcher_text - start_switcher_text ) ;
2007-07-26 21:41:04 +04:00
/* Most of the switcher.S doesn't care that it's been moved; on Intel,
* jumps are relative , and it doesn ' t access any references to external
* code or data .
*
* The only exception is the interrupt handlers in switcher . S : their
* addresses are placed in a table ( default_idt_entries ) , so we need to
* update the table with the new addresses . switcher_offset ( ) is a
* convenience function which returns the distance between the builtin
* switcher code and the high - mapped copy we just made . */
2007-07-19 12:49:23 +04:00
for ( i = 0 ; i < IDT_ENTRIES ; i + + )
default_idt_entries [ i ] + = switcher_offset ( ) ;
2007-07-26 21:41:04 +04:00
/*
* Set up the Switcher ' s per - cpu areas .
*
* Each CPU gets two pages of its own within the high - mapped region
* ( aka . " struct lguest_pages " ) . Much of this can be initialized now ,
* but some depends on what Guest we are running ( which is set up in
* copy_in_guest_info ( ) ) .
*/
2007-07-19 12:49:23 +04:00
for_each_possible_cpu ( i ) {
2007-07-26 21:41:04 +04:00
/* lguest_pages() returns this CPU's two pages. */
2007-07-19 12:49:23 +04:00
struct lguest_pages * pages = lguest_pages ( i ) ;
2007-07-26 21:41:04 +04:00
/* This is a convenience pointer to make the code fit one
* statement to a line . */
2007-07-19 12:49:23 +04:00
struct lguest_ro_state * state = & pages - > state ;
2007-07-26 21:41:04 +04:00
/* The Global Descriptor Table: the Host has a different one
* for each CPU . We keep a descriptor for the GDT which says
* where it is and how big it is ( the size is actually the last
* byte , not the size , hence the " -1 " ) . */
2007-07-19 12:49:23 +04:00
state - > host_gdt_desc . size = GDT_SIZE - 1 ;
state - > host_gdt_desc . address = ( long ) get_cpu_gdt_table ( i ) ;
2007-07-26 21:41:04 +04:00
/* All CPUs on the Host use the same Interrupt Descriptor
* Table , so we just use store_idt ( ) , which gets this CPU ' s IDT
* descriptor . */
2007-07-19 12:49:23 +04:00
store_idt ( & state - > host_idt_desc ) ;
2007-07-26 21:41:04 +04:00
/* The descriptors for the Guest's GDT and IDT can be filled
* out now , too . We copy the GDT & IDT into - > guest_gdt and
* - > guest_idt before actually running the Guest . */
2007-07-19 12:49:23 +04:00
state - > guest_idt_desc . size = sizeof ( state - > guest_idt ) - 1 ;
state - > guest_idt_desc . address = ( long ) & state - > guest_idt ;
state - > guest_gdt_desc . size = sizeof ( state - > guest_gdt ) - 1 ;
state - > guest_gdt_desc . address = ( long ) & state - > guest_gdt ;
2007-07-26 21:41:04 +04:00
/* We know where we want the stack to be when the Guest enters
* the switcher : in pages - > regs . The stack grows upwards , so
* we start it at the end of that structure . */
2007-07-19 12:49:23 +04:00
state - > guest_tss . esp0 = ( long ) ( & pages - > regs + 1 ) ;
2007-07-26 21:41:04 +04:00
/* And this is the GDT entry to use for the stack: we keep a
* couple of special LGUEST entries . */
2007-07-19 12:49:23 +04:00
state - > guest_tss . ss0 = LGUEST_DS ;
2007-07-26 21:41:04 +04:00
/* x86 can have a finegrained bitmap which indicates what I/O
* ports the process can use . We set it to the end of our
* structure , meaning " none " . */
2007-07-19 12:49:23 +04:00
state - > guest_tss . io_bitmap_base = sizeof ( state - > guest_tss ) ;
2007-07-26 21:41:04 +04:00
/* Some GDT entries are the same across all Guests, so we can
* set them up now . */
2007-07-19 12:49:23 +04:00
setup_default_gdt_entries ( state ) ;
2007-07-26 21:41:04 +04:00
/* Most IDT entries are the same for all Guests, too.*/
2007-07-19 12:49:23 +04:00
setup_default_idt_entries ( state , default_idt_entries ) ;
2007-07-26 21:41:04 +04:00
/* The Host needs to be able to use the LGUEST segments on this
* CPU , too , so put them in the Host GDT . */
2007-07-19 12:49:23 +04:00
get_cpu_gdt_table ( i ) [ GDT_ENTRY_LGUEST_CS ] = FULL_EXEC_SEGMENT ;
get_cpu_gdt_table ( i ) [ GDT_ENTRY_LGUEST_DS ] = FULL_SEGMENT ;
}
2007-07-26 21:41:04 +04:00
/* In the Switcher, we want the %cs segment register to use the
* LGUEST_CS GDT entry : we ' ve put that in the Host and Guest GDTs , so
* it will be undisturbed when we switch . To change % cs and jump we
* need this structure to feed to Intel ' s " lcall " instruction . */
2007-07-19 12:49:23 +04:00
lguest_entry . offset = ( long ) switch_to_guest + switcher_offset ( ) ;
lguest_entry . segment = LGUEST_CS ;
printk ( KERN_INFO " lguest: mapped switcher at %p \n " ,
switcher_vma - > addr ) ;
2007-07-26 21:41:04 +04:00
/* And we succeeded... */
2007-07-19 12:49:23 +04:00
return 0 ;
free_vma :
vunmap ( switcher_vma - > addr ) ;
free_pages :
i = TOTAL_SWITCHER_PAGES ;
free_some_pages :
for ( - - i ; i > = 0 ; i - - )
__free_pages ( switcher_page [ i ] , 0 ) ;
kfree ( switcher_page ) ;
out :
return err ;
}
2007-07-26 21:41:04 +04:00
/*:*/
2007-07-19 12:49:23 +04:00
2007-07-26 21:41:04 +04:00
/* Cleaning up the mapping when the module is unloaded is almost...
* too easy . */
2007-07-19 12:49:23 +04:00
static void unmap_switcher ( void )
{
unsigned int i ;
2007-07-26 21:41:04 +04:00
/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
2007-07-19 12:49:23 +04:00
vunmap ( switcher_vma - > addr ) ;
2007-07-26 21:41:04 +04:00
/* Now we just need to free the pages we copied the switcher into */
2007-07-19 12:49:23 +04:00
for ( i = 0 ; i < TOTAL_SWITCHER_PAGES ; i + + )
__free_pages ( switcher_page [ i ] , 0 ) ;
}
2007-07-26 21:41:04 +04:00
/*H:130 Our Guest is usually so well behaved; it never tries to do things it
* isn ' t allowed to . Unfortunately , " struct paravirt_ops " isn ' t quite
* complete , because it doesn ' t contain replacements for the Intel I / O
* instructions . As a result , the Guest sometimes fumbles across one during
* the boot process as it probes for various things which are usually attached
* to a PC .
*
* When the Guest uses one of these instructions , we get trap # 13 ( General
* Protection Fault ) and come here . We see if it ' s one of those troublesome
* instructions and skip over it . We return true if we did . */
2007-07-19 12:49:23 +04:00
static int emulate_insn ( struct lguest * lg )
{
u8 insn ;
unsigned int insnlen = 0 , in = 0 , shift = 0 ;
2007-07-26 21:41:04 +04:00
/* The eip contains the *virtual* address of the Guest's instruction:
* guest_pa just subtracts the Guest ' s page_offset . */
2007-07-19 12:49:23 +04:00
unsigned long physaddr = guest_pa ( lg , lg - > regs - > eip ) ;
2007-07-26 21:41:04 +04:00
/* The guest_pa() function only works for Guest kernel addresses, but
* that ' s all we ' re trying to do anyway . */
2007-07-19 12:49:23 +04:00
if ( lg - > regs - > eip < lg - > page_offset )
return 0 ;
2007-07-26 21:41:04 +04:00
/* Decoding x86 instructions is icky. */
2007-07-19 12:49:23 +04:00
lgread ( lg , & insn , physaddr , 1 ) ;
2007-07-26 21:41:04 +04:00
/* 0x66 is an "operand prefix". It means it's using the upper 16 bits
of the eax register . */
2007-07-19 12:49:23 +04:00
if ( insn = = 0x66 ) {
shift = 16 ;
2007-07-26 21:41:04 +04:00
/* The instruction is 1 byte so far, read the next byte. */
2007-07-19 12:49:23 +04:00
insnlen = 1 ;
lgread ( lg , & insn , physaddr + insnlen , 1 ) ;
}
2007-07-26 21:41:04 +04:00
/* We can ignore the lower bit for the moment and decode the 4 opcodes
* we need to emulate . */
2007-07-19 12:49:23 +04:00
switch ( insn & 0xFE ) {
case 0xE4 : /* in <next byte>,%al */
insnlen + = 2 ;
in = 1 ;
break ;
case 0xEC : /* in (%dx),%al */
insnlen + = 1 ;
in = 1 ;
break ;
case 0xE6 : /* out %al,<next byte> */
insnlen + = 2 ;
break ;
case 0xEE : /* out %al,(%dx) */
insnlen + = 1 ;
break ;
default :
2007-07-26 21:41:04 +04:00
/* OK, we don't know what this is, can't emulate. */
2007-07-19 12:49:23 +04:00
return 0 ;
}
2007-07-26 21:41:04 +04:00
/* If it was an "IN" instruction, they expect the result to be read
* into % eax , so we change % eax . We always return all - ones , which
* traditionally means " there's nothing there " . */
2007-07-19 12:49:23 +04:00
if ( in ) {
/* Lower bit tells is whether it's a 16 or 32 bit access */
if ( insn & 0x1 )
lg - > regs - > eax = 0xFFFFFFFF ;
else
lg - > regs - > eax | = ( 0xFFFF < < shift ) ;
}
2007-07-26 21:41:04 +04:00
/* Finally, we've "done" the instruction, so move past it. */
2007-07-19 12:49:23 +04:00
lg - > regs - > eip + = insnlen ;
2007-07-26 21:41:04 +04:00
/* Success! */
2007-07-19 12:49:23 +04:00
return 1 ;
}
2007-07-26 21:41:04 +04:00
/*:*/
2007-07-19 12:49:23 +04:00
2007-07-26 21:41:03 +04:00
/*L:305
* Dealing With Guest Memory .
*
* When the Guest gives us ( what it thinks is ) a physical address , we can use
* the normal copy_from_user ( ) & copy_to_user ( ) on that address : remember ,
* Guest physical = = Launcher virtual .
*
* But we can ' t trust the Guest : it might be trying to access the Launcher
* code . We have to check that the range is below the pfn_limit the Launcher
* gave us . We have to make sure that addr + len doesn ' t give us a false
* positive by overflowing , too . */
2007-07-19 12:49:23 +04:00
int lguest_address_ok ( const struct lguest * lg ,
unsigned long addr , unsigned long len )
{
return ( addr + len ) / PAGE_SIZE < lg - > pfn_limit & & ( addr + len > = addr ) ;
}
2007-07-26 21:41:03 +04:00
/* This is a convenient routine to get a 32-bit value from the Guest (a very
* common operation ) . Here we can see how useful the kill_lguest ( ) routine we
* met in the Launcher can be : we return a random value ( 0 ) instead of needing
* to return an error . */
2007-07-19 12:49:23 +04:00
u32 lgread_u32 ( struct lguest * lg , unsigned long addr )
{
u32 val = 0 ;
2007-07-26 21:41:03 +04:00
/* Don't let them access lguest binary. */
2007-07-19 12:49:23 +04:00
if ( ! lguest_address_ok ( lg , addr , sizeof ( val ) )
| | get_user ( val , ( u32 __user * ) addr ) ! = 0 )
kill_guest ( lg , " bad read address %#lx " , addr ) ;
return val ;
}
2007-07-26 21:41:03 +04:00
/* Same thing for writing a value. */
2007-07-19 12:49:23 +04:00
void lgwrite_u32 ( struct lguest * lg , unsigned long addr , u32 val )
{
if ( ! lguest_address_ok ( lg , addr , sizeof ( val ) )
| | put_user ( val , ( u32 __user * ) addr ) ! = 0 )
kill_guest ( lg , " bad write address %#lx " , addr ) ;
}
2007-07-26 21:41:03 +04:00
/* This routine is more generic, and copies a range of Guest bytes into a
* buffer . If the copy_from_user ( ) fails , we fill the buffer with zeroes , so
* the caller doesn ' t end up using uninitialized kernel memory . */
2007-07-19 12:49:23 +04:00
void lgread ( struct lguest * lg , void * b , unsigned long addr , unsigned bytes )
{
if ( ! lguest_address_ok ( lg , addr , bytes )
| | copy_from_user ( b , ( void __user * ) addr , bytes ) ! = 0 ) {
/* copy_from_user should do this, but as we rely on it... */
memset ( b , 0 , bytes ) ;
kill_guest ( lg , " bad read address %#lx len %u " , addr , bytes ) ;
}
}
2007-07-26 21:41:03 +04:00
/* Similarly, our generic routine to copy into a range of Guest bytes. */
2007-07-19 12:49:23 +04:00
void lgwrite ( struct lguest * lg , unsigned long addr , const void * b ,
unsigned bytes )
{
if ( ! lguest_address_ok ( lg , addr , bytes )
| | copy_to_user ( ( void __user * ) addr , b , bytes ) ! = 0 )
kill_guest ( lg , " bad write address %#lx len %u " , addr , bytes ) ;
}
2007-07-26 21:41:03 +04:00
/* (end of memory access helper routines) :*/
2007-07-19 12:49:23 +04:00
/* Sets bit 3 (value 8) of cr0 on this CPU unless it is already set.
 * run_guest() calls this when the Guest has asked for the TS bit. */
static void set_ts(void)
{
	u32 cr0 = read_cr0();

	/* Already set: skip the write. */
	if (cr0 & 8)
		return;

	write_cr0(cr0 | 8);
}
2007-07-26 21:41:04 +04:00
/*S:010
* We are getting close to the Switcher .
*
* Remember that each CPU has two pages which are visible to the Guest when it
* runs on that CPU . This has to contain the state for that Guest : we copy the
* state in just before we run the Guest .
*
* Each Guest has " changed " flags which indicate what has changed in the Guest
* since it last ran . We saw this set in interrupts_and_traps . c and
* segments . c .
*/
2007-07-19 12:49:23 +04:00
static void copy_in_guest_info ( struct lguest * lg , struct lguest_pages * pages )
{
2007-07-26 21:41:04 +04:00
/* Copying all this data can be quite expensive. We usually run the
* same Guest we ran last time ( and that Guest hasn ' t run anywhere else
* meanwhile ) . If that ' s not the case , we pretend everything in the
* Guest has changed . */
2007-07-19 12:49:23 +04:00
if ( __get_cpu_var ( last_guest ) ! = lg | | lg - > last_pages ! = pages ) {
__get_cpu_var ( last_guest ) = lg ;
lg - > last_pages = pages ;
lg - > changed = CHANGED_ALL ;
}
2007-07-26 21:41:04 +04:00
/* These copies are pretty cheap, so we do them unconditionally: */
/* Save the current Host top-level page directory. */
2007-07-19 12:49:23 +04:00
pages - > state . host_cr3 = __pa ( current - > mm - > pgd ) ;
2007-07-26 21:41:04 +04:00
/* Set up the Guest's page tables to see this CPU's pages (and no
* other CPU ' s pages ) . */
2007-07-19 12:49:23 +04:00
map_switcher_in_guest ( lg , pages ) ;
2007-07-26 21:41:04 +04:00
/* Set up the two "TSS" members which tell the CPU what stack to use
* for traps which do directly into the Guest ( ie . traps at privilege
* level 1 ) . */
2007-07-19 12:49:23 +04:00
pages - > state . guest_tss . esp1 = lg - > esp1 ;
pages - > state . guest_tss . ss1 = lg - > ss1 ;
2007-07-26 21:41:04 +04:00
/* Copy direct-to-Guest trap entries. */
2007-07-19 12:49:23 +04:00
if ( lg - > changed & CHANGED_IDT )
copy_traps ( lg , pages - > state . guest_idt , default_idt_entries ) ;
2007-07-26 21:41:04 +04:00
/* Copy all GDT entries which the Guest can change. */
2007-07-19 12:49:23 +04:00
if ( lg - > changed & CHANGED_GDT )
copy_gdt ( lg , pages - > state . guest_gdt ) ;
/* If only the TLS entries have changed, copy them. */
else if ( lg - > changed & CHANGED_GDT_TLS )
copy_gdt_tls ( lg , pages - > state . guest_gdt ) ;
2007-07-26 21:41:04 +04:00
/* Mark the Guest as unchanged for next time. */
2007-07-19 12:49:23 +04:00
lg - > changed = 0 ;
}
2007-07-26 21:41:04 +04:00
/* Finally: the code to actually call into the Switcher to run the Guest. */
2007-07-19 12:49:23 +04:00
static void run_guest_once ( struct lguest * lg , struct lguest_pages * pages )
{
2007-07-26 21:41:04 +04:00
/* This is a dummy value we need for GCC's sake. */
2007-07-19 12:49:23 +04:00
unsigned int clobber ;
2007-07-26 21:41:04 +04:00
/* Copy the guest-specific information into this CPU's "struct
* lguest_pages " . */
2007-07-19 12:49:23 +04:00
copy_in_guest_info ( lg , pages ) ;
2007-08-09 14:57:13 +04:00
/* Set the trap number to 256 (impossible value). If we fault while
* switching to the Guest ( bad segment registers or bug ) , this will
* cause us to abort the Guest . */
lg - > regs - > trapnum = 256 ;
2007-07-26 21:41:04 +04:00
/* Now: we push the "eflags" register on the stack, then do an "lcall".
* This is how we change from using the kernel code segment to using
* the dedicated lguest code segment , as well as jumping into the
* Switcher .
*
* The lcall also pushes the old code segment ( KERNEL_CS ) onto the
* stack , then the address of this call . This stack layout happens to
* exactly match the stack of an interrupt . . . */
2007-07-19 12:49:23 +04:00
asm volatile ( " pushf; lcall *lguest_entry "
2007-07-26 21:41:04 +04:00
/* This is how we tell GCC that %eax ("a") and %ebx ("b")
* are changed by this routine . The " = " means output . */
2007-07-19 12:49:23 +04:00
: " =a " ( clobber ) , " =b " ( clobber )
2007-07-26 21:41:04 +04:00
/* %eax contains the pages pointer. ("0" refers to the
* 0 - th argument above , ie " a " ) . % ebx contains the
* physical address of the Guest ' s top - level page
* directory . */
2007-07-19 12:49:23 +04:00
: " 0 " ( pages ) , " 1 " ( __pa ( lg - > pgdirs [ lg - > pgdidx ] . pgdir ) )
2007-07-26 21:41:04 +04:00
/* We tell gcc that all these registers could change,
* which means we don ' t have to save and restore them in
* the Switcher . */
2007-07-19 12:49:23 +04:00
: " memory " , " %edx " , " %ecx " , " %edi " , " %esi " ) ;
}
2007-07-26 21:41:04 +04:00
/*:*/
2007-07-19 12:49:23 +04:00
2007-07-26 21:41:04 +04:00
/*H:030 Let's jump straight to the the main loop which runs the Guest.
* Remember , this is called by the Launcher reading / dev / lguest , and we keep
* going around and around until something interesting happens . */
2007-07-19 12:49:23 +04:00
int run_guest ( struct lguest * lg , unsigned long __user * user )
{
2007-07-26 21:41:04 +04:00
/* We stop running once the Guest is dead. */
2007-07-19 12:49:23 +04:00
while ( ! lg - > dead ) {
2007-07-26 21:41:04 +04:00
/* We need to initialize this, otherwise gcc complains. It's
* not ( yet ) clever enough to see that it ' s initialized when we
* need it . */
2007-07-19 12:49:23 +04:00
unsigned int cr2 = 0 ; /* Damn gcc */
2007-07-26 21:41:04 +04:00
/* First we run any hypercalls the Guest wants done: either in
* the hypercall ring in " struct lguest_data " , or directly by
* using int 31 ( LGUEST_TRAP_ENTRY ) . */
2007-07-19 12:49:23 +04:00
do_hypercalls ( lg ) ;
2007-07-26 21:41:04 +04:00
/* It's possible the Guest did a SEND_DMA hypercall to the
* Launcher , in which case we return from the read ( ) now . */
2007-07-19 12:49:23 +04:00
if ( lg - > dma_is_pending ) {
if ( put_user ( lg - > pending_dma , user ) | |
put_user ( lg - > pending_key , user + 1 ) )
return - EFAULT ;
return sizeof ( unsigned long ) * 2 ;
}
2007-07-26 21:41:04 +04:00
/* Check for signals */
2007-07-19 12:49:23 +04:00
if ( signal_pending ( current ) )
return - ERESTARTSYS ;
/* If Waker set break_out, return to Launcher. */
if ( lg - > break_out )
return - EAGAIN ;
2007-07-26 21:41:04 +04:00
/* Check if there are any interrupts which can be delivered
* now : if so , this sets up the hander to be executed when we
* next run the Guest . */
2007-07-19 12:49:23 +04:00
maybe_do_interrupt ( lg ) ;
2007-07-26 21:41:04 +04:00
/* All long-lived kernel loops need to check with this horrible
* thing called the freezer . If the Host is trying to suspend ,
* it stops us . */
2007-07-19 12:49:23 +04:00
try_to_freeze ( ) ;
2007-07-26 21:41:04 +04:00
/* Just make absolutely sure the Guest is still alive. One of
* those hypercalls could have been fatal , for example . */
2007-07-19 12:49:23 +04:00
if ( lg - > dead )
break ;
2007-07-26 21:41:04 +04:00
/* If the Guest asked to be stopped, we sleep. The Guest's
* clock timer or LHCALL_BREAK from the Waker will wake us . */
2007-07-19 12:49:23 +04:00
if ( lg - > halted ) {
set_current_state ( TASK_INTERRUPTIBLE ) ;
schedule ( ) ;
continue ;
}
2007-07-26 21:41:04 +04:00
/* OK, now we're ready to jump into the Guest. First we put up
* the " Do Not Disturb " sign : */
2007-07-19 12:49:23 +04:00
local_irq_disable ( ) ;
2007-07-26 21:41:04 +04:00
/* Remember the awfully-named TS bit? If the Guest has asked
* to set it we set it now , so we can trap and pass that trap
* to the Guest if it uses the FPU . */
2007-07-19 12:49:23 +04:00
if ( lg - > ts )
set_ts ( ) ;
2007-07-26 21:41:04 +04:00
/* SYSENTER is an optimized way of doing system calls. We
* can ' t allow it because it always jumps to privilege level 0.
* A normal Guest won ' t try it because we don ' t advertise it in
* CPUID , but a malicious Guest ( or malicious Guest userspace
* program ) could , so we tell the CPU to disable it before
* running the Guest . */
2007-07-19 12:49:23 +04:00
if ( boot_cpu_has ( X86_FEATURE_SEP ) )
wrmsr ( MSR_IA32_SYSENTER_CS , 0 , 0 ) ;
2007-07-26 21:41:04 +04:00
/* Now we actually run the Guest. It will pop back out when
* something interesting happens , and we can examine its
* registers to see what it was doing . */
2007-07-19 12:49:23 +04:00
run_guest_once ( lg , lguest_pages ( raw_smp_processor_id ( ) ) ) ;
2007-07-26 21:41:04 +04:00
/* The "regs" pointer contains two extra entries which are not
* really registers : a trap number which says what interrupt or
* trap made the switcher code come back , and an error code
* which some traps set . */
/* If the Guest page faulted, then the cr2 register will tell
* us the bad virtual address . We have to grab this now ,
* because once we re - enable interrupts an interrupt could
* fault and thus overwrite cr2 , or we could even move off to a
* different CPU . */
2007-07-19 12:49:23 +04:00
if ( lg - > regs - > trapnum = = 14 )
cr2 = read_cr2 ( ) ;
2007-07-26 21:41:04 +04:00
/* Similarly, if we took a trap because the Guest used the FPU,
* we have to restore the FPU it expects to see . */
2007-07-19 12:49:23 +04:00
else if ( lg - > regs - > trapnum = = 7 )
math_state_restore ( ) ;
2007-07-26 21:41:04 +04:00
/* Restore SYSENTER if it's supposed to be on. */
2007-07-19 12:49:23 +04:00
if ( boot_cpu_has ( X86_FEATURE_SEP ) )
wrmsr ( MSR_IA32_SYSENTER_CS , __KERNEL_CS , 0 ) ;
2007-07-26 21:41:04 +04:00
/* Now we're ready to be interrupted or moved to other CPUs */
2007-07-19 12:49:23 +04:00
local_irq_enable ( ) ;
2007-07-26 21:41:04 +04:00
/* OK, so what happened? */
2007-07-19 12:49:23 +04:00
switch ( lg - > regs - > trapnum ) {
case 13 : /* We've intercepted a GPF. */
2007-07-26 21:41:04 +04:00
/* Check if this was one of those annoying IN or OUT
* instructions which we need to emulate . If so , we
* just go back into the Guest after we ' ve done it . */
2007-07-19 12:49:23 +04:00
if ( lg - > regs - > errcode = = 0 ) {
if ( emulate_insn ( lg ) )
continue ;
}
break ;
case 14 : /* We've intercepted a page fault. */
2007-07-26 21:41:04 +04:00
/* The Guest accessed a virtual address that wasn't
* mapped . This happens a lot : we don ' t actually set
* up most of the page tables for the Guest at all when
* we start : as it runs it asks for more and more , and
* we set them up as required . In this case , we don ' t
* even tell the Guest that the fault happened .
*
* The errcode tells whether this was a read or a
* write , and whether kernel or userspace code . */
2007-07-19 12:49:23 +04:00
if ( demand_page ( lg , cr2 , lg - > regs - > errcode ) )
continue ;
2007-07-26 21:41:04 +04:00
/* OK, it's really not there (or not OK): the Guest
* needs to know . We write out the cr2 value so it
* knows where the fault occurred .
*
* Note that if the Guest were really messed up , this
* could happen before it ' s done the INITIALIZE
* hypercall , so lg - > lguest_data will be NULL , so
* & lg - > lguest_data - > cr2 will be address 8. Writing
* into that address won ' t hurt the Host at all ,
* though . */
2007-07-19 12:49:23 +04:00
if ( put_user ( cr2 , & lg - > lguest_data - > cr2 ) )
kill_guest ( lg , " Writing cr2 " ) ;
break ;
case 7 : /* We've intercepted a Device Not Available fault. */
2007-07-26 21:41:04 +04:00
/* If the Guest doesn't want to know, we already
* restored the Floating Point Unit , so we just
* continue without telling it . */
2007-07-19 12:49:23 +04:00
if ( ! lg - > ts )
continue ;
break ;
2007-07-26 21:41:04 +04:00
case 32 . . . 255 :
/* These values mean a real interrupt occurred, in
* which case the Host handler has already been run .
* We just do a friendly check if another process
* should now be run , then fall through to loop
* around : */
2007-07-19 12:49:23 +04:00
cond_resched ( ) ;
case LGUEST_TRAP_ENTRY : /* Handled at top of loop */
continue ;
}
2007-07-26 21:41:04 +04:00
/* If we get here, it's a trap the Guest wants to know
* about . */
2007-07-19 12:49:23 +04:00
if ( deliver_trap ( lg , lg - > regs - > trapnum ) )
continue ;
2007-07-26 21:41:04 +04:00
/* If the Guest doesn't have a handler (either it hasn't
* registered any yet , or it ' s one of the faults we don ' t let
* it handle ) , it dies with a cryptic error message . */
2007-07-19 12:49:23 +04:00
kill_guest ( lg , " unhandled trap %li at %#lx (%#lx) " ,
lg - > regs - > trapnum , lg - > regs - > eip ,
lg - > regs - > trapnum = = 14 ? cr2 : lg - > regs - > errcode ) ;
}
2007-07-26 21:41:04 +04:00
/* The Guest is dead => "No such file or directory" */
2007-07-19 12:49:23 +04:00
return - ENOENT ;
}
2007-07-26 21:41:04 +04:00
/* Now we can look at each of the routines this calls, in increasing order of
 * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
 * deliver_trap() and demand_page().  After all those, we'll be ready to
 * examine the Switcher, and our philosophical understanding of the Host/Guest
 * duality will be complete. :*/
2007-07-19 12:49:23 +04:00
/* Returns the index of the first entry in lguests[] whose ->tsk is unset, or
 * -1 if all MAX_LGUEST_GUESTS entries are taken. */
int find_free_guest(void)
{
	unsigned int slot = 0;

	while (slot < MAX_LGUEST_GUESTS) {
		if (!lguests[slot].tsk)
			return slot;
		slot++;
	}
	return -1;
}
/* Sets (@on non-NULL) or clears (@on NULL) the PGE bit in cr4 on the CPU this
 * runs on.  The void * argument lets it be handed to on_each_cpu(). */
static void adjust_pge(void *on)
{
	if (!on) {
		write_cr4(read_cr4() & ~X86_CR4_PGE);
		return;
	}
	write_cr4(read_cr4() | X86_CR4_PGE);
}
2007-07-26 21:41:04 +04:00
/*H:000
* Welcome to the Host !
*
* By this point your brain has been tickled by the Guest code and numbed by
* the Launcher code ; prepare for it to be stretched by the Host code . This is
* the heart . Let ' s begin at the initialization routine for the Host ' s lg
* module .
*/
2007-07-19 12:49:23 +04:00
static int __init init ( void )
{
int err ;
2007-07-26 21:41:04 +04:00
/* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
2007-07-19 12:49:23 +04:00
if ( paravirt_enabled ( ) ) {
printk ( " lguest is afraid of %s \n " , paravirt_ops . name ) ;
return - EPERM ;
}
2007-07-26 21:41:04 +04:00
/* First we put the Switcher up in very high virtual memory. */
2007-07-19 12:49:23 +04:00
err = map_switcher ( ) ;
if ( err )
return err ;
2007-07-26 21:41:04 +04:00
/* Now we set up the pagetable implementation for the Guests. */
2007-07-19 12:49:23 +04:00
err = init_pagetables ( switcher_page , SHARED_SWITCHER_PAGES ) ;
if ( err ) {
unmap_switcher ( ) ;
return err ;
}
2007-07-26 21:41:04 +04:00
/* The I/O subsystem needs some things initialized. */
2007-07-19 12:49:23 +04:00
lguest_io_init ( ) ;
2007-07-26 21:41:04 +04:00
/* /dev/lguest needs to be registered. */
2007-07-19 12:49:23 +04:00
err = lguest_device_init ( ) ;
if ( err ) {
free_pagetables ( ) ;
unmap_switcher ( ) ;
return err ;
}
2007-07-26 21:41:04 +04:00
/* Finally, we need to turn off "Page Global Enable". PGE is an
* optimization where page table entries are specially marked to show
* they never change . The Host kernel marks all the kernel pages this
* way because it ' s always present , even when userspace is running .
*
* Lguest breaks this : unbeknownst to the rest of the Host kernel , we
* switch to the Guest kernel . If you don ' t disable this on all CPUs ,
* you ' ll get really weird bugs that you ' ll chase for two days .
*
* I used to turn PGE off every time we switched to the Guest and back
* on when we return , but that slowed the Switcher down noticibly . */
/* We don't need the complexity of CPUs coming and going while we're
* doing this . */
2007-07-19 12:49:23 +04:00
lock_cpu_hotplug ( ) ;
if ( cpu_has_pge ) { /* We have a broader idea of "global". */
2007-07-26 21:41:04 +04:00
/* Remember that this was originally set (for cleanup). */
2007-07-19 12:49:23 +04:00
cpu_had_pge = 1 ;
2007-07-26 21:41:04 +04:00
/* adjust_pge is a helper function which sets or unsets the PGE
* bit on its CPU , depending on the argument ( 0 = = unset ) . */
2007-07-19 12:49:23 +04:00
on_each_cpu ( adjust_pge , ( void * ) 0 , 0 , 1 ) ;
2007-07-26 21:41:04 +04:00
/* Turn off the feature in the global feature set. */
2007-07-19 12:49:23 +04:00
clear_bit ( X86_FEATURE_PGE , boot_cpu_data . x86_capability ) ;
}
unlock_cpu_hotplug ( ) ;
2007-07-26 21:41:04 +04:00
/* All good! */
2007-07-19 12:49:23 +04:00
return 0 ;
}
2007-07-26 21:41:04 +04:00
/* Cleaning up is just the same code, backwards. With a little French. */
2007-07-19 12:49:23 +04:00
static void __exit fini ( void )
{
lguest_device_remove ( ) ;
free_pagetables ( ) ;
unmap_switcher ( ) ;
2007-07-26 21:41:04 +04:00
/* If we had PGE before we started, turn it back on now. */
2007-07-19 12:49:23 +04:00
lock_cpu_hotplug ( ) ;
if ( cpu_had_pge ) {
set_bit ( X86_FEATURE_PGE , boot_cpu_data . x86_capability ) ;
2007-07-26 21:41:04 +04:00
/* adjust_pge's argument "1" means set PGE. */
2007-07-19 12:49:23 +04:00
on_each_cpu ( adjust_pge , ( void * ) 1 , 0 , 1 ) ;
}
unlock_cpu_hotplug ( ) ;
}
2007-07-26 21:41:04 +04:00
/* The Host side of lguest can be a module.  This is a nice way for people to
 * play with it. */
2007-07-19 12:49:23 +04:00
/* init() runs when the module is loaded, fini() when it is removed. */
module_init ( init ) ;
module_exit ( fini ) ;
/* Module metadata: license and author. */
MODULE_LICENSE ( " GPL " ) ;
MODULE_AUTHOR ( " Rusty Russell <rusty@rustcorp.com.au> " ) ;