2007-10-22 11:03:28 +10:00
/*
 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
 * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
2009-07-30 16:03:45 -06:00
/*P:450
 * This file contains the x86-specific lguest code.  It used to be all
 * mixed in with drivers/lguest/core.c but several foolhardy code slashers
 * wrestled most of the dependencies out to here in preparation for porting
 * lguest to other architectures (see what I mean by foolhardy?).
 *
 * This also contains a couple of non-obvious setup and teardown pieces which
 * were implemented after days of debugging pain.
:*/
2007-10-22 11:03:28 +10:00
# include <linux/kernel.h>
# include <linux/start_kernel.h>
# include <linux/string.h>
# include <linux/console.h>
# include <linux/screen_info.h>
# include <linux/irq.h>
# include <linux/interrupt.h>
# include <linux/clocksource.h>
# include <linux/clockchips.h>
# include <linux/cpu.h>
# include <linux/lguest.h>
# include <linux/lguest_launcher.h>
# include <asm/paravirt.h>
# include <asm/param.h>
# include <asm/page.h>
# include <asm/pgtable.h>
# include <asm/desc.h>
# include <asm/setup.h>
# include <asm/lguest.h>
# include <asm/uaccess.h>
# include <asm/i387.h>
# include "../lg.h"
/*
 * Set to 1 by lguest_arch_host_init() when it found the PGE feature enabled
 * and switched it off; lguest_arch_host_fini() checks it to decide whether
 * PGE has to be turned back on.
 */
static int cpu_had_pge ;
/*
 * Operand for the "lcall *lguest_entry" in run_guest_once(): an offset
 * followed by a segment selector.  lguest_arch_host_init() fills it in with
 * the relocated address of switch_to_guest and the LGUEST_CS selector.
 * The field order is what the indirect far call reads, so do not rearrange it.
 */
static struct {
unsigned long offset ;
unsigned short segment ;
} lguest_entry ;
/*
 * How far the Switcher moved: the distance from the address switcher.S was
 * linked at (start_switcher_text) to the place we copied it (SWITCHER_ADDR).
 */
static unsigned long switcher_offset(void)
{
	unsigned long linked_at = (unsigned long)start_switcher_text;

	return SWITCHER_ADDR - linked_at;
}
/* This cpu's struct lguest_pages. */
static struct lguest_pages * lguest_pages ( unsigned int cpu )
{
return & ( ( ( struct lguest_pages * )
( SWITCHER_ADDR + SHARED_SWITCHER_PAGES * PAGE_SIZE ) ) [ cpu ] ) ;
}
2009-10-29 22:34:14 +09:00
/*
 * Per-CPU pointer to the lg_cpu whose state copy_in_guest_info() most
 * recently installed on this CPU; used there to detect when a different
 * Guest CPU is about to run and everything must be copied again.
 */
static DEFINE_PER_CPU ( struct lg_cpu * , lg_last_cpu ) ;
2007-10-22 11:03:28 +10:00
/*S:010
2007-10-25 15:02:50 +10:00
* We approach the Switcher .
2007-10-22 11:03:28 +10:00
*
* Remember that each CPU has two pages which are visible to the Guest when it
* runs on that CPU . This has to contain the state for that Guest : we copy the
* state in just before we run the Guest .
*
* Each Guest has " changed " flags which indicate what has changed in the Guest
* since it last ran . We saw this set in interrupts_and_traps . c and
* segments . c .
*/
2008-01-07 11:05:25 -02:00
static void copy_in_guest_info ( struct lg_cpu * cpu , struct lguest_pages * pages )
2007-10-22 11:03:28 +10:00
{
2009-07-30 16:03:45 -06:00
/*
* Copying all this data can be quite expensive . We usually run the
2007-10-22 11:03:28 +10:00
* same Guest we ran last time ( and that Guest hasn ' t run anywhere else
* meanwhile ) . If that ' s not the case , we pretend everything in the
2009-07-30 16:03:45 -06:00
* Guest has changed .
*/
2009-10-29 22:34:14 +09:00
if ( __get_cpu_var ( lg_last_cpu ) ! = cpu | | cpu - > last_pages ! = pages ) {
__get_cpu_var ( lg_last_cpu ) = cpu ;
2008-01-17 19:13:26 -02:00
cpu - > last_pages = pages ;
2008-01-17 19:14:46 -02:00
cpu - > changed = CHANGED_ALL ;
2007-10-22 11:03:28 +10:00
}
2009-07-30 16:03:45 -06:00
/*
* These copies are pretty cheap , so we do them unconditionally : */
/* Save the current Host top-level page directory.
*/
2007-10-22 11:03:28 +10:00
pages - > state . host_cr3 = __pa ( current - > mm - > pgd ) ;
2009-07-30 16:03:45 -06:00
/*
* Set up the Guest ' s page tables to see this CPU ' s pages ( and no
* other CPU ' s pages ) .
*/
2008-01-07 11:05:30 -02:00
map_switcher_in_guest ( cpu , pages ) ;
2009-07-30 16:03:45 -06:00
/*
* Set up the two " TSS " members which tell the CPU what stack to use
2007-10-22 11:03:28 +10:00
* for traps which do directly into the Guest ( ie . traps at privilege
2009-07-30 16:03:45 -06:00
* level 1 ) .
*/
2008-01-31 18:00:47 +11:00
pages - > state . guest_tss . sp1 = cpu - > esp1 ;
2008-01-07 11:05:35 -02:00
pages - > state . guest_tss . ss1 = cpu - > ss1 ;
2007-10-22 11:03:28 +10:00
/* Copy direct-to-Guest trap entries. */
2008-01-17 19:14:46 -02:00
if ( cpu - > changed & CHANGED_IDT )
2008-01-07 11:05:33 -02:00
copy_traps ( cpu , pages - > state . guest_idt , default_idt_entries ) ;
2007-10-22 11:03:28 +10:00
/* Copy all GDT entries which the Guest can change. */
2008-01-17 19:14:46 -02:00
if ( cpu - > changed & CHANGED_GDT )
2008-01-07 11:05:33 -02:00
copy_gdt ( cpu , pages - > state . guest_gdt ) ;
2007-10-22 11:03:28 +10:00
/* If only the TLS entries have changed, copy them. */
2008-01-17 19:14:46 -02:00
else if ( cpu - > changed & CHANGED_GDT_TLS )
2008-01-07 11:05:33 -02:00
copy_gdt_tls ( cpu , pages - > state . guest_gdt ) ;
2007-10-22 11:03:28 +10:00
/* Mark the Guest as unchanged for next time. */
2008-01-17 19:14:46 -02:00
cpu - > changed = 0 ;
2007-10-22 11:03:28 +10:00
}
/*
 * Finally: the code which actually calls into the Switcher to run the Guest.
 *
 * @cpu:   the Guest CPU to run.
 * @pages: the struct lguest_pages of the Host CPU we are running on.
 *
 * Returns once the Switcher comes back to the Host; the caller then inspects
 * cpu->regs->trapnum to find out why.
 */
2008-01-07 11:05:25 -02:00
static void run_guest_once ( struct lg_cpu * cpu , struct lguest_pages * pages )
2007-10-22 11:03:28 +10:00
{
/* Dummy output operand: only there so GCC knows %eax and %ebx are overwritten. */
unsigned int clobber ;
2009-07-30 16:03:45 -06:00
/*
 * Copy the Guest-specific information into this CPU's
 * struct lguest_pages.
 */
2008-01-07 11:05:25 -02:00
copy_in_guest_info ( cpu , pages ) ;
2007-10-22 11:03:28 +10:00
2009-07-30 16:03:45 -06:00
/*
 * Set the trap number to 256 (impossible value).  If we fault while
 * switching to the Guest (bad segment registers or bug), this will
 * cause us to abort the Guest.
 */
2008-01-07 11:05:32 -02:00
cpu - > regs - > trapnum = 256 ;
2007-10-22 11:03:28 +10:00
2009-07-30 16:03:45 -06:00
/*
 * Now: we push the "eflags" register on the stack, then do an "lcall".
 * This is how we change from using the kernel code segment to using
 * the dedicated lguest code segment, as well as jumping into the
 * Switcher.
 *
 * The lcall also pushes the old code segment (KERNEL_CS) onto the
 * stack, then the address of this call.  This stack layout happens to
 * exactly match the stack layout created by an interrupt...
 */
2007-10-22 11:03:28 +10:00
asm volatile ( " pushf; lcall *lguest_entry "
2009-07-30 16:03:45 -06:00
/*
 * Outputs: this is how we tell GCC that %eax ("a") and %ebx ("b")
 * are changed by this routine.  The "=" means output.
 */
2007-10-22 11:03:28 +10:00
: " =a " ( clobber ) , " =b " ( clobber )
2009-07-30 16:03:45 -06:00
/*
 * Inputs: %eax contains the pages pointer ("0" refers to the 0-th
 * operand above, ie "a").  %ebx ("1", ie "b") contains the physical
 * address of the Guest's top-level page directory.
 */
2008-01-17 19:19:42 -02:00
: " 0 " ( pages ) , " 1 " ( __pa ( cpu - > lg - > pgdirs [ cpu - > cpu_pgd ] . pgdir ) )
2009-07-30 16:03:45 -06:00
/*
 * Clobbers: we tell GCC that memory and all these registers could
 * change, which means we don't have to save and restore them in
 * the Switcher.
 */
2007-10-22 11:03:28 +10:00
: " memory " , " %edx " , " %ecx " , " %edi " , " %esi " ) ;
}
/*:*/
2009-07-30 16:03:45 -06:00
/*M:002
 * There are hooks in the scheduler which we can register to tell when we
 * get kicked off the CPU (preempt_notifier_register()).  This would allow us
 * to lazily disable SYSENTER which would regain some performance, and should
 * also simplify copy_in_guest_info().  Note that we'd still need to restore
 * things when we exit to Launcher userspace, but that's fairly easy.
 *
 * We could also try using these hooks for PGE, but that might be too expensive.
 *
 * The hooks were designed for KVM, but we can also put them to good use.
:*/
2007-10-25 15:02:50 +10:00
2009-07-30 16:03:45 -06:00
/*H:040
* This is the i386 - specific code to setup and run the Guest . Interrupts
* are disabled : we own the CPU .
*/
2008-01-07 11:05:25 -02:00
void lguest_arch_run_guest ( struct lg_cpu * cpu )
2007-10-22 11:03:28 +10:00
{
2009-07-30 16:03:45 -06:00
/*
* Remember the awfully - named TS bit ? If the Guest has asked to set it
2007-10-25 15:02:50 +10:00
* we set it now , so we can trap and pass that trap to the Guest if it
2009-07-30 16:03:45 -06:00
* uses the FPU .
*/
2008-01-07 11:05:35 -02:00
if ( cpu - > ts )
2008-06-19 09:41:22 -07:00
unlazy_fpu ( current ) ;
2007-10-22 11:03:28 +10:00
2009-07-30 16:03:45 -06:00
/*
* SYSENTER is an optimized way of doing system calls . We can ' t allow
2007-10-25 15:02:50 +10:00
* it because it always jumps to privilege level 0. A normal Guest
* won ' t try it because we don ' t advertise it in CPUID , but a malicious
* Guest ( or malicious Guest userspace program ) could , so we tell the
2009-07-30 16:03:45 -06:00
* CPU to disable it before running the Guest .
*/
2007-10-22 11:03:28 +10:00
if ( boot_cpu_has ( X86_FEATURE_SEP ) )
wrmsr ( MSR_IA32_SYSENTER_CS , 0 , 0 ) ;
2009-07-30 16:03:45 -06:00
/*
* Now we actually run the Guest . It will return when something
2007-10-25 15:02:50 +10:00
* interesting happens , and we can examine its registers to see what it
2009-07-30 16:03:45 -06:00
* was doing .
*/
2008-01-07 11:05:25 -02:00
run_guest_once ( cpu , lguest_pages ( raw_smp_processor_id ( ) ) ) ;
2007-10-22 11:03:28 +10:00
2009-07-30 16:03:45 -06:00
/*
* Note that the " regs " structure contains two extra entries which are
2007-10-25 15:02:50 +10:00
* not really registers : a trap number which says what interrupt or
* trap made the switcher code come back , and an error code which some
2009-07-30 16:03:45 -06:00
* traps set .
*/
2007-10-22 11:03:28 +10:00
2008-06-19 09:41:22 -07:00
/* Restore SYSENTER if it's supposed to be on. */
if ( boot_cpu_has ( X86_FEATURE_SEP ) )
wrmsr ( MSR_IA32_SYSENTER_CS , __KERNEL_CS , 0 ) ;
2009-07-30 16:03:45 -06:00
/*
* If the Guest page faulted , then the cr2 register will tell us the
2007-10-25 15:02:50 +10:00
* bad virtual address . We have to grab this now , because once we
* re - enable interrupts an interrupt could fault and thus overwrite
2009-07-30 16:03:45 -06:00
* cr2 , or we could even move off to a different CPU .
*/
2008-01-07 11:05:32 -02:00
if ( cpu - > regs - > trapnum = = 14 )
2008-01-07 11:05:33 -02:00
cpu - > arch . last_pagefault = read_cr2 ( ) ;
2009-07-30 16:03:45 -06:00
/*
* Similarly , if we took a trap because the Guest used the FPU ,
2008-06-19 09:41:22 -07:00
* we have to restore the FPU it expects to see .
* math_state_restore ( ) may sleep and we may even move off to
* a different CPU . So all the critical stuff should be done
2009-07-30 16:03:45 -06:00
* before this .
*/
2008-01-07 11:05:32 -02:00
else if ( cpu - > regs - > trapnum = = 7 )
2007-10-22 11:03:28 +10:00
math_state_restore ( ) ;
}
2009-07-30 16:03:45 -06:00
/*H:130
* Now we ' ve examined the hypercall code ; our Guest can make requests .
2007-10-25 15:02:50 +10:00
* Our Guest is usually so well behaved ; it never tries to do things it isn ' t
* allowed to , and uses hypercalls instead . Unfortunately , Linux ' s paravirtual
* infrastructure isn ' t quite complete , because it doesn ' t contain replacements
* for the Intel I / O instructions . As a result , the Guest sometimes fumbles
* across one during the boot process as it probes for various things which are
* usually attached to a PC .
2007-10-22 11:03:28 +10:00
*
2007-10-25 15:02:50 +10:00
* When the Guest uses one of these instructions , we get a trap ( General
2007-10-22 11:03:28 +10:00
* Protection Fault ) and come here . We see if it ' s one of those troublesome
2009-07-30 16:03:45 -06:00
* instructions and skip over it . We return true if we did .
*/
2008-01-07 11:05:31 -02:00
static int emulate_insn ( struct lg_cpu * cpu )
2007-10-22 11:03:28 +10:00
{
u8 insn ;
unsigned int insnlen = 0 , in = 0 , shift = 0 ;
2009-07-30 16:03:45 -06:00
/*
* The eip contains the * virtual * address of the Guest ' s instruction :
* guest_pa just subtracts the Guest ' s page_offset .
*/
2008-01-07 11:05:37 -02:00
unsigned long physaddr = guest_pa ( cpu , cpu - > regs - > eip ) ;
2007-10-22 11:03:28 +10:00
2009-07-30 16:03:45 -06:00
/*
* This must be the Guest kernel trying to do something , not userspace !
2007-10-22 11:03:36 +10:00
* The bottom two bits of the CS segment register are the privilege
2009-07-30 16:03:45 -06:00
* level .
*/
2008-01-07 11:05:32 -02:00
if ( ( cpu - > regs - > cs & 3 ) ! = GUEST_PL )
2007-10-22 11:03:28 +10:00
return 0 ;
/* Decoding x86 instructions is icky. */
2008-01-17 19:19:42 -02:00
insn = lgread ( cpu , physaddr , u8 ) ;
2007-10-22 11:03:28 +10:00
2010-04-14 21:43:53 -06:00
/*
* Around 2.6 .33 , the kernel started using an emulation for the
* cmpxchg8b instruction in early boot on many configurations . This
* code isn ' t paravirtualized , and it tries to disable interrupts .
* Ignore it , which will Mostly Work .
*/
if ( insn = = 0xfa ) {
/* "cli", or Clear Interrupt Enable instruction. Skip it. */
cpu - > regs - > eip + + ;
return 1 ;
}
2009-07-30 16:03:45 -06:00
/*
* 0x66 is an " operand prefix " . It means it ' s using the upper 16 bits
* of the eax register .
*/
2007-10-22 11:03:28 +10:00
if ( insn = = 0x66 ) {
shift = 16 ;
/* The instruction is 1 byte so far, read the next byte. */
insnlen = 1 ;
2008-01-17 19:19:42 -02:00
insn = lgread ( cpu , physaddr + insnlen , u8 ) ;
2007-10-22 11:03:28 +10:00
}
2009-07-30 16:03:45 -06:00
/*
* We can ignore the lower bit for the moment and decode the 4 opcodes
* we need to emulate .
*/
2007-10-22 11:03:28 +10:00
switch ( insn & 0xFE ) {
case 0xE4 : /* in <next byte>,%al */
insnlen + = 2 ;
in = 1 ;
break ;
case 0xEC : /* in (%dx),%al */
insnlen + = 1 ;
in = 1 ;
break ;
case 0xE6 : /* out %al,<next byte> */
insnlen + = 2 ;
break ;
case 0xEE : /* out %al,(%dx) */
insnlen + = 1 ;
break ;
default :
/* OK, we don't know what this is, can't emulate. */
return 0 ;
}
2009-07-30 16:03:45 -06:00
/*
* If it was an " IN " instruction , they expect the result to be read
2007-10-22 11:03:28 +10:00
* into % eax , so we change % eax . We always return all - ones , which
2009-07-30 16:03:45 -06:00
* traditionally means " there's nothing there " .
*/
2007-10-22 11:03:28 +10:00
if ( in ) {
/* Lower bit tells is whether it's a 16 or 32 bit access */
if ( insn & 0x1 )
2008-01-07 11:05:32 -02:00
cpu - > regs - > eax = 0xFFFFFFFF ;
2007-10-22 11:03:28 +10:00
else
2008-01-07 11:05:32 -02:00
cpu - > regs - > eax | = ( 0xFFFF < < shift ) ;
2007-10-22 11:03:28 +10:00
}
/* Finally, we've "done" the instruction, so move past it. */
2008-01-07 11:05:32 -02:00
cpu - > regs - > eip + = insnlen ;
2007-10-22 11:03:28 +10:00
/* Success! */
return 1 ;
}
2009-07-30 16:03:45 -06:00
/*
* Our hypercalls mechanism used to be based on direct software interrupts .
2009-03-14 13:37:52 -02:00
* After Anthony ' s " Refactor hypercall infrastructure " kvm patch , we decided to
* change over to using kvm hypercalls .
*
* KVM_HYPERCALL is actually a " vmcall " instruction , which generates an invalid
* opcode fault ( fault 6 ) on non - VT cpus , so the easiest solution seemed to be
* an * emulation approach * : if the fault was really produced by an hypercall
* ( is_hypercall ( ) does exactly this check ) , we can just call the corresponding
* hypercall host implementation function .
*
* But these invalid opcode faults are notably slower than software interrupts .
* So we implemented the * patching ( or rewriting ) approach * : every time we hit
* the KVM_HYPERCALL opcode in Guest code , we patch it to the old " int 0x1f "
* opcode , so next time the Guest calls this hypercall it will use the
* faster trap mechanism .
*
* Matias even benchmarked it to convince you : this shows the average cycle
* cost of a hypercall . For each alternative solution mentioned above we ' ve
* made 5 runs of the benchmark :
*
* 1 ) direct software interrupt : 2915 , 2789 , 2764 , 2721 , 2898
* 2 ) emulation technique : 3410 , 3681 , 3466 , 3392 , 3780
* 3 ) patching ( rewrite ) technique : 2977 , 2975 , 2891 , 2637 , 2884
*
* One two - line function is worth a 20 % hypercall speed boost !
*/
static void rewrite_hypercall ( struct lg_cpu * cpu )
{
2009-07-30 16:03:45 -06:00
/*
* This are the opcodes we use to patch the Guest . The opcode for " int
2009-03-14 13:37:52 -02:00
* $ 0x1f " is " 0xcd 0x1f " but vmcall instruction is 3 bytes long, so we
2009-07-30 16:03:45 -06:00
* complete the sequence with a NOP ( 0x90 ) .
*/
2009-03-14 13:37:52 -02:00
u8 insn [ 3 ] = { 0xcd , 0x1f , 0x90 } ;
__lgwrite ( cpu , guest_pa ( cpu , cpu - > regs - > eip ) , insn , sizeof ( insn ) ) ;
2009-07-30 16:03:45 -06:00
/*
* The above write might have caused a copy of that page to be made
2009-04-08 17:58:39 -03:00
* ( if it was read - only ) . We need to make sure the Guest has
* up - to - date pagetables . As this doesn ' t happen often , we can just
2009-07-30 16:03:45 -06:00
* drop them all .
*/
2009-04-08 17:58:39 -03:00
guest_pagetable_clear_all ( cpu ) ;
2009-03-14 13:37:52 -02:00
}
static bool is_hypercall ( struct lg_cpu * cpu )
{
u8 insn [ 3 ] ;
2009-07-30 16:03:45 -06:00
/*
* This must be the Guest kernel trying to do something .
2009-03-14 13:37:52 -02:00
* The bottom two bits of the CS segment register are the privilege
2009-07-30 16:03:45 -06:00
* level .
*/
2009-03-14 13:37:52 -02:00
if ( ( cpu - > regs - > cs & 3 ) ! = GUEST_PL )
return false ;
/* Is it a vmcall? */
__lgread ( cpu , insn , guest_pa ( cpu , cpu - > regs - > eip ) , sizeof ( insn ) ) ;
return insn [ 0 ] = = 0x0f & & insn [ 1 ] = = 0x01 & & insn [ 2 ] = = 0xc1 ;
}
2007-10-22 11:03:28 +10:00
/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
2008-01-07 11:05:27 -02:00
void lguest_arch_handle_trap ( struct lg_cpu * cpu )
2007-10-22 11:03:28 +10:00
{
2008-01-07 11:05:32 -02:00
switch ( cpu - > regs - > trapnum ) {
2007-10-25 15:02:50 +10:00
case 13 : /* We've intercepted a General Protection Fault. */
2009-07-30 16:03:45 -06:00
/*
* Check if this was one of those annoying IN or OUT
2007-10-25 15:02:50 +10:00
* instructions which we need to emulate . If so , we just go
2009-07-30 16:03:45 -06:00
* back into the Guest after we ' ve done it .
*/
2008-01-07 11:05:32 -02:00
if ( cpu - > regs - > errcode = = 0 ) {
2008-01-07 11:05:31 -02:00
if ( emulate_insn ( cpu ) )
2007-10-22 11:03:28 +10:00
return ;
}
2009-07-30 16:03:45 -06:00
/*
* If KVM is active , the vmcall instruction triggers a General
* Protection Fault . Normally it triggers an invalid opcode
* fault ( 6 ) :
*/
2009-05-26 20:54:41 +09:30
case 6 :
2009-07-30 16:03:45 -06:00
/*
* We need to check if ring = = GUEST_PL and faulting
* instruction = = vmcall .
*/
2009-05-26 20:54:41 +09:30
if ( is_hypercall ( cpu ) ) {
rewrite_hypercall ( cpu ) ;
return ;
}
2007-10-22 11:03:28 +10:00
break ;
2007-10-25 15:02:50 +10:00
case 14 : /* We've intercepted a Page Fault. */
2009-07-30 16:03:45 -06:00
/*
* The Guest accessed a virtual address that wasn ' t mapped .
2008-03-28 11:05:53 -05:00
* This happens a lot : we don ' t actually set up most of the page
* tables for the Guest at all when we start : as it runs it asks
* for more and more , and we set them up as required . In this
* case , we don ' t even tell the Guest that the fault happened .
2007-10-25 15:02:50 +10:00
*
* The errcode tells whether this was a read or a write , and
2009-07-30 16:03:45 -06:00
* whether kernel or userspace code .
*/
2008-01-07 11:05:37 -02:00
if ( demand_page ( cpu , cpu - > arch . last_pagefault ,
cpu - > regs - > errcode ) )
2007-10-22 11:03:28 +10:00
return ;
2009-07-30 16:03:45 -06:00
/*
* OK , it ' s really not there ( or not OK ) : the Guest needs to
2007-10-25 15:02:50 +10:00
* know . We write out the cr2 value so it knows where the
* fault occurred .
*
* Note that if the Guest were really messed up , this could
* happen before it ' s done the LHCALL_LGUEST_INIT hypercall , so
2009-07-30 16:03:45 -06:00
* lg - > lguest_data could be NULL
*/
2008-01-17 19:19:42 -02:00
if ( cpu - > lg - > lguest_data & &
put_user ( cpu - > arch . last_pagefault ,
& cpu - > lg - > lguest_data - > cr2 ) )
kill_guest ( cpu , " Writing cr2 " ) ;
2007-10-22 11:03:28 +10:00
break ;
case 7 : /* We've intercepted a Device Not Available fault. */
2009-07-30 16:03:45 -06:00
/*
* If the Guest doesn ' t want to know , we already restored the
* Floating Point Unit , so we just continue without telling it .
*/
2008-01-07 11:05:35 -02:00
if ( ! cpu - > ts )
2007-10-22 11:03:28 +10:00
return ;
break ;
case 32 . . . 255 :
2009-07-30 16:03:45 -06:00
/*
* These values mean a real interrupt occurred , in which case
2009-03-14 13:37:52 -02:00
* the Host handler has already been run . We just do a
2007-10-22 11:03:30 +10:00
* friendly check if another process should now be run , then
2009-07-30 16:03:45 -06:00
* return to run the Guest again
*/
2007-10-22 11:03:28 +10:00
cond_resched ( ) ;
2007-10-22 11:03:30 +10:00
return ;
case LGUEST_TRAP_ENTRY :
2009-07-30 16:03:45 -06:00
/*
* Our ' struct hcall_args ' maps directly over our regs : we set
* up the pointer now to indicate a hypercall is pending .
*/
2008-01-07 11:05:32 -02:00
cpu - > hcall = ( struct hcall_args * ) cpu - > regs ;
2007-10-22 11:03:28 +10:00
return ;
}
/* We didn't handle the trap, so it needs to go to the Guest. */
2008-01-07 11:05:32 -02:00
if ( ! deliver_trap ( cpu , cpu - > regs - > trapnum ) )
2009-07-30 16:03:45 -06:00
/*
* If the Guest doesn ' t have a handler ( either it hasn ' t
2007-10-22 11:03:28 +10:00
* registered any yet , or it ' s one of the faults we don ' t let
2009-07-30 16:03:45 -06:00
* it handle ) , it dies with this cryptic error message .
*/
2008-01-17 19:19:42 -02:00
kill_guest ( cpu , " unhandled trap %li at %#lx (%#lx) " ,
2008-01-07 11:05:32 -02:00
cpu - > regs - > trapnum , cpu - > regs - > eip ,
2008-01-07 11:05:33 -02:00
cpu - > regs - > trapnum = = 14 ? cpu - > arch . last_pagefault
2008-01-07 11:05:32 -02:00
: cpu - > regs - > errcode ) ;
2007-10-22 11:03:28 +10:00
}
2009-07-30 16:03:45 -06:00
/*
 * Now we can look at each of the routines this calls, in increasing order of
 * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
 * deliver_trap() and demand_page().  After all those, we'll be ready to
 * examine the Switcher, and our philosophical understanding of the Host/Guest
 * duality will be complete.
:*/
2007-10-22 11:03:28 +10:00
/*
 * Set (non-NULL @on) or clear (NULL @on) the PGE bit in cr4 on the CPU this
 * runs on.  The void * signature lets it be passed to on_each_cpu().
 */
static void adjust_pge(void *on)
{
	write_cr4(on ? (read_cr4() | X86_CR4_PGE)
		     : (read_cr4() & ~X86_CR4_PGE));
}
2009-07-30 16:03:45 -06:00
/*H:020
* Now the Switcher is mapped and every thing else is ready , we need to do
* some more i386 - specific initialization .
*/
2007-10-22 11:03:28 +10:00
void __init lguest_arch_host_init ( void )
{
int i ;
2009-07-30 16:03:45 -06:00
/*
* Most of the i386 / switcher . S doesn ' t care that it ' s been moved ; on
2007-10-22 11:03:28 +10:00
* Intel , jumps are relative , and it doesn ' t access any references to
* external code or data .
*
* The only exception is the interrupt handlers in switcher . S : their
* addresses are placed in a table ( default_idt_entries ) , so we need to
* update the table with the new addresses . switcher_offset ( ) is a
2008-03-28 11:05:53 -05:00
* convenience function which returns the distance between the
2009-07-30 16:03:45 -06:00
* compiled - in switcher code and the high - mapped copy we just made .
*/
2007-10-22 11:03:28 +10:00
for ( i = 0 ; i < IDT_ENTRIES ; i + + )
default_idt_entries [ i ] + = switcher_offset ( ) ;
/*
* Set up the Switcher ' s per - cpu areas .
*
* Each CPU gets two pages of its own within the high - mapped region
* ( aka . " struct lguest_pages " ) . Much of this can be initialized now ,
* but some depends on what Guest we are running ( which is set up in
* copy_in_guest_info ( ) ) .
*/
for_each_possible_cpu ( i ) {
/* lguest_pages() returns this CPU's two pages. */
struct lguest_pages * pages = lguest_pages ( i ) ;
2009-07-30 16:03:45 -06:00
/* This is a convenience pointer to make the code neater. */
2007-10-22 11:03:28 +10:00
struct lguest_ro_state * state = & pages - > state ;
2009-07-30 16:03:45 -06:00
/*
* The Global Descriptor Table : the Host has a different one
2007-10-22 11:03:28 +10:00
* for each CPU . We keep a descriptor for the GDT which says
* where it is and how big it is ( the size is actually the last
2009-07-30 16:03:45 -06:00
* byte , not the size , hence the " -1 " ) .
*/
2007-10-22 11:03:28 +10:00
state - > host_gdt_desc . size = GDT_SIZE - 1 ;
state - > host_gdt_desc . address = ( long ) get_cpu_gdt_table ( i ) ;
2009-07-30 16:03:45 -06:00
/*
* All CPUs on the Host use the same Interrupt Descriptor
2007-10-22 11:03:28 +10:00
* Table , so we just use store_idt ( ) , which gets this CPU ' s IDT
2009-07-30 16:03:45 -06:00
* descriptor .
*/
2007-10-22 11:03:28 +10:00
store_idt ( & state - > host_idt_desc ) ;
2009-07-30 16:03:45 -06:00
/*
* The descriptors for the Guest ' s GDT and IDT can be filled
2007-10-22 11:03:28 +10:00
* out now , too . We copy the GDT & IDT into - > guest_gdt and
2009-07-30 16:03:45 -06:00
* - > guest_idt before actually running the Guest .
*/
2007-10-22 11:03:28 +10:00
state - > guest_idt_desc . size = sizeof ( state - > guest_idt ) - 1 ;
state - > guest_idt_desc . address = ( long ) & state - > guest_idt ;
state - > guest_gdt_desc . size = sizeof ( state - > guest_gdt ) - 1 ;
state - > guest_gdt_desc . address = ( long ) & state - > guest_gdt ;
2009-07-30 16:03:45 -06:00
/*
* We know where we want the stack to be when the Guest enters
2008-03-28 11:05:53 -05:00
* the Switcher : in pages - > regs . The stack grows upwards , so
2009-07-30 16:03:45 -06:00
* we start it at the end of that structure .
*/
2008-01-30 13:31:02 +01:00
state - > guest_tss . sp0 = ( long ) ( & pages - > regs + 1 ) ;
2009-07-30 16:03:45 -06:00
/*
* And this is the GDT entry to use for the stack : we keep a
* couple of special LGUEST entries .
*/
2007-10-22 11:03:28 +10:00
state - > guest_tss . ss0 = LGUEST_DS ;
2009-07-30 16:03:45 -06:00
/*
* x86 can have a finegrained bitmap which indicates what I / O
2007-10-22 11:03:28 +10:00
* ports the process can use . We set it to the end of our
2009-07-30 16:03:45 -06:00
* structure , meaning " none " .
*/
2007-10-22 11:03:28 +10:00
state - > guest_tss . io_bitmap_base = sizeof ( state - > guest_tss ) ;
2009-07-30 16:03:45 -06:00
/*
* Some GDT entries are the same across all Guests , so we can
* set them up now .
*/
2007-10-22 11:03:28 +10:00
setup_default_gdt_entries ( state ) ;
/* Most IDT entries are the same for all Guests, too.*/
setup_default_idt_entries ( state , default_idt_entries ) ;
2009-07-30 16:03:45 -06:00
/*
* The Host needs to be able to use the LGUEST segments on this
* CPU , too , so put them in the Host GDT .
*/
2007-10-22 11:03:28 +10:00
get_cpu_gdt_table ( i ) [ GDT_ENTRY_LGUEST_CS ] = FULL_EXEC_SEGMENT ;
get_cpu_gdt_table ( i ) [ GDT_ENTRY_LGUEST_DS ] = FULL_SEGMENT ;
}
2009-07-30 16:03:45 -06:00
/*
* In the Switcher , we want the % cs segment register to use the
2007-10-22 11:03:28 +10:00
* LGUEST_CS GDT entry : we ' ve put that in the Host and Guest GDTs , so
* it will be undisturbed when we switch . To change % cs and jump we
2009-07-30 16:03:45 -06:00
* need this structure to feed to Intel ' s " lcall " instruction .
*/
2007-10-22 11:03:28 +10:00
lguest_entry . offset = ( long ) switch_to_guest + switcher_offset ( ) ;
lguest_entry . segment = LGUEST_CS ;
2009-07-30 16:03:45 -06:00
/*
* Finally , we need to turn off " Page Global Enable " . PGE is an
2007-10-22 11:03:28 +10:00
* optimization where page table entries are specially marked to show
* they never change . The Host kernel marks all the kernel pages this
* way because it ' s always present , even when userspace is running .
*
* Lguest breaks this : unbeknownst to the rest of the Host kernel , we
* switch to the Guest kernel . If you don ' t disable this on all CPUs ,
* you ' ll get really weird bugs that you ' ll chase for two days .
*
* I used to turn PGE off every time we switched to the Guest and back
2009-07-30 16:03:45 -06:00
* on when we return , but that slowed the Switcher down noticibly .
*/
2007-10-22 11:03:28 +10:00
2009-07-30 16:03:45 -06:00
/*
* We don ' t need the complexity of CPUs coming and going while we ' re
* doing this .
*/
2008-01-25 21:08:02 +01:00
get_online_cpus ( ) ;
2007-10-22 11:03:28 +10:00
if ( cpu_has_pge ) { /* We have a broader idea of "global". */
/* Remember that this was originally set (for cleanup). */
cpu_had_pge = 1 ;
2009-07-30 16:03:45 -06:00
/*
* adjust_pge is a helper function which sets or unsets the PGE
* bit on its CPU , depending on the argument ( 0 = = unset ) .
*/
2008-05-09 09:39:44 +02:00
on_each_cpu ( adjust_pge , ( void * ) 0 , 1 ) ;
2007-10-22 11:03:28 +10:00
/* Turn off the feature in the global feature set. */
2008-06-09 16:22:48 -07:00
clear_cpu_cap ( & boot_cpu_data , X86_FEATURE_PGE ) ;
2007-10-22 11:03:28 +10:00
}
2008-01-25 21:08:02 +01:00
put_online_cpus ( ) ;
2007-10-22 11:03:28 +10:00
} ;
/*:*/
/*
 * Undo lguest_arch_host_init()'s PGE change: if the Host had PGE before we
 * started, advertise the feature again and switch it back on everywhere.
 */
void __exit lguest_arch_host_fini(void)
{
	/* Keep the set of online CPUs stable while we touch each of them. */
	get_online_cpus();
	if (!cpu_had_pge)
		goto out;

	set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
	/* adjust_pge's argument "1" means set PGE. */
	on_each_cpu(adjust_pge, (void *)1, 1);
out:
	put_online_cpus();
}
2007-10-22 11:03:31 +10:00
/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
2008-01-07 11:05:27 -02:00
int lguest_arch_do_hcall ( struct lg_cpu * cpu , struct hcall_args * args )
2007-10-22 11:03:31 +10:00
{
switch ( args - > arg0 ) {
2009-04-19 23:14:00 -06:00
case LHCALL_LOAD_GDT_ENTRY :
load_guest_gdt_entry ( cpu , args - > arg1 , args - > arg2 , args - > arg3 ) ;
2007-10-22 11:03:31 +10:00
break ;
case LHCALL_LOAD_IDT_ENTRY :
2008-01-07 11:05:33 -02:00
load_guest_idt_entry ( cpu , args - > arg1 , args - > arg2 , args - > arg3 ) ;
2007-10-22 11:03:31 +10:00
break ;
case LHCALL_LOAD_TLS :
2008-01-07 11:05:33 -02:00
guest_load_tls ( cpu , args - > arg1 ) ;
2007-10-22 11:03:31 +10:00
break ;
default :
/* Bad Guest. Bad! */
return - EIO ;
}
return 0 ;
}
/*H:126 i386-specific hypercall initialization: */
2008-01-07 11:05:27 -02:00
int lguest_arch_init_hypercalls ( struct lg_cpu * cpu )
2007-10-22 11:03:31 +10:00
{
u32 tsc_speed ;
2009-07-30 16:03:45 -06:00
/*
* The pointer to the Guest ' s " struct lguest_data " is the only argument .
* We check that address now .
*/
2008-01-17 19:19:42 -02:00
if ( ! lguest_address_ok ( cpu - > lg , cpu - > hcall - > arg1 ,
sizeof ( * cpu - > lg - > lguest_data ) ) )
2007-10-22 11:03:31 +10:00
return - EFAULT ;
2009-07-30 16:03:45 -06:00
/*
* Having checked it , we simply set lg - > lguest_data to point straight
2007-10-22 11:03:31 +10:00
* into the Launcher ' s memory at the right place and then use
* copy_to_user / from_user from now on , instead of lgread / write . I put
* this in to show that I ' m not immune to writing stupid
2009-07-30 16:03:45 -06:00
* optimizations .
*/
2008-01-17 19:19:42 -02:00
cpu - > lg - > lguest_data = cpu - > lg - > mem_base + cpu - > hcall - > arg1 ;
2007-10-22 11:03:31 +10:00
2009-07-30 16:03:45 -06:00
/*
* We insist that the Time Stamp Counter exist and doesn ' t change with
2007-10-22 11:03:31 +10:00
* cpu frequency . Some devious chip manufacturers decided that TSC
* changes could be handled in software . I decided that time going
* backwards might be good for benchmarks , but it ' s bad for users .
*
* We also insist that the TSC be stable : the kernel detects unreliable
2009-07-30 16:03:45 -06:00
* TSCs for its own purposes , and we use that here .
*/
2007-10-22 11:03:31 +10:00
if ( boot_cpu_has ( X86_FEATURE_CONSTANT_TSC ) & & ! check_tsc_unstable ( ) )
tsc_speed = tsc_khz ;
else
tsc_speed = 0 ;
2008-01-17 19:19:42 -02:00
if ( put_user ( tsc_speed , & cpu - > lg - > lguest_data - > tsc_khz ) )
2007-10-22 11:03:31 +10:00
return - EFAULT ;
2007-10-22 11:03:35 +10:00
/* The interrupt code might not like the system call vector. */
2008-01-17 19:19:42 -02:00
if ( ! check_syscall_vector ( cpu - > lg ) )
kill_guest ( cpu , " bad syscall vector " ) ;
2007-10-22 11:03:35 +10:00
2007-10-22 11:03:31 +10:00
return 0 ;
}
2008-03-28 11:05:53 -05:00
/*:*/
2007-10-22 11:03:32 +10:00
2009-07-30 16:03:45 -06:00
/*L:030
* lguest_arch_setup_regs ( )
2007-10-22 11:03:32 +10:00
*
* Most of the Guest ' s registers are left alone : we used get_zeroed_page ( ) to
2009-07-30 16:03:45 -06:00
* allocate the structure , so they will be 0.
*/
2008-01-07 11:05:32 -02:00
void lguest_arch_setup_regs ( struct lg_cpu * cpu , unsigned long start )
2007-10-22 11:03:32 +10:00
{
2008-01-07 11:05:32 -02:00
struct lguest_regs * regs = cpu - > regs ;
2007-10-22 11:03:32 +10:00
2009-07-30 16:03:45 -06:00
/*
* There are four " segment " registers which the Guest needs to boot :
2007-10-22 11:03:32 +10:00
* The " code segment " register ( cs ) refers to the kernel code segment
* __KERNEL_CS , and the " data " , " extra " and " stack " segment registers
* refer to the kernel data segment __KERNEL_DS .
*
* The privilege level is packed into the lower bits . The Guest runs
2009-07-30 16:03:45 -06:00
* at privilege level 1 ( GUEST_PL ) .
*/
2007-10-22 11:03:32 +10:00
regs - > ds = regs - > es = regs - > ss = __KERNEL_DS | GUEST_PL ;
regs - > cs = __KERNEL_CS | GUEST_PL ;
2009-07-30 16:03:45 -06:00
/*
* The " eflags " register contains miscellaneous flags . Bit 1 ( 0x002 )
2007-10-22 11:03:32 +10:00
* is supposed to always be " 1 " . Bit 9 ( 0x200 ) controls whether
* interrupts are enabled . We always leave interrupts enabled while
2009-07-30 16:03:45 -06:00
* running the Guest .
*/
2007-10-25 14:09:53 +10:00
regs - > eflags = X86_EFLAGS_IF | 0x2 ;
2007-10-22 11:03:32 +10:00
2009-07-30 16:03:45 -06:00
/*
* The " Extended Instruction Pointer " register says where the Guest is
* running .
*/
2007-10-22 11:03:32 +10:00
regs - > eip = start ;
2009-07-30 16:03:45 -06:00
/*
* % esi points to our boot information , at physical address 0 , so don ' t
* touch it .
*/
2007-10-25 15:02:50 +10:00
2009-07-30 16:03:45 -06:00
/* There are a couple of GDT entries the Guest expects at boot. */
2008-01-07 11:05:33 -02:00
setup_guest_gdt ( cpu ) ;
2007-10-22 11:03:32 +10:00
}