2007-07-26 10:41:02 -07:00
/*P:700 The pagetable code, on the other hand, still shows the scars of
* previous encounters . It ' s functional , and as neat as it can be in the
* circumstances , but be wary , for these things are subtle and break easily .
* The Guest provides a virtual to physical mapping , but we can neither trust
2008-03-28 11:05:53 -05:00
* it nor use it : we verify and convert it here then point the CPU to the
* converted Guest pages when running the Guest . : */
2007-07-26 10:41:02 -07:00
/* Copyright (C) Rusty Russell IBM Corporation 2006.
2007-07-19 01:49:23 -07:00
* GPL v2 and any later version */
# include <linux/mm.h>
# include <linux/types.h>
# include <linux/spinlock.h>
# include <linux/random.h>
# include <linux/percpu.h>
# include <asm/tlbflush.h>
2007-10-22 11:03:36 +10:00
# include <asm/uaccess.h>
2008-09-29 01:40:07 -03:00
# include <asm/bootparam.h>
2007-07-19 01:49:23 -07:00
# include "lg.h"
2007-07-26 10:41:05 -07:00
/*M:008 We hold reference to pages, which prevents them from being swapped.
* It ' d be nice to have a callback in the " struct mm_struct " when Linux wants
* to swap out . If we had this , and a shrinker callback to trim PTE pages , we
* could probably consider launching Guests as non - root . : */
2007-07-26 10:41:04 -07:00
/*H:300
* The Page Table Code
*
* We use two - level page tables for the Guest . If you ' re not entirely
* comfortable with virtual addresses , physical addresses and page tables then
2007-10-25 15:02:50 +10:00
* I recommend you review arch / x86 / lguest / boot . c ' s " Page Table Handling " ( with
* diagrams ! ) .
2007-07-26 10:41:04 -07:00
*
* The Guest keeps page tables , but we maintain the actual ones here : these are
* called " shadow " page tables . Which is a very Guest - centric name : these are
* the real page tables the CPU uses , although we keep them up to date to
* reflect the Guest ' s . ( See what I mean about weird naming ? Since when do
* shadows reflect anything ? )
*
* Anyway , this is the most complicated part of the Host code . There are seven
* parts to this :
2007-10-25 15:02:50 +10:00
* ( i ) Looking up a page table entry when the Guest faults ,
* ( ii ) Making sure the Guest stack is mapped ,
* ( iii ) Setting up a page table entry when the Guest tells us one has changed ,
2007-07-26 10:41:04 -07:00
* ( iv ) Switching page tables ,
2007-10-25 15:02:50 +10:00
* ( v ) Flushing ( throwing away ) page tables ,
2007-07-26 10:41:04 -07:00
* ( vi ) Mapping the Switcher when the Guest is about to run ,
* ( vii ) Setting up the page tables initially .
: */
/* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is
* conveniently placed at the top 4 MB , so it uses a separate , complete PTE
* page . */
2007-10-22 11:03:33 +10:00
/* Index of the last top-level (PGD) slot: the one set aside for the Switcher. */
# define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/* We actually need a separate PTE page for each CPU. Remember that after the
* Switcher code itself comes two pages for each CPU , and we don ' t want this
* CPU ' s guest to see the pages of any other CPU . */
2007-10-22 11:03:33 +10:00
/* Per-CPU pointer to that CPU's own Switcher PTE page. */
static DEFINE_PER_CPU ( pte_t * , switcher_pte_pages ) ;
2007-07-19 01:49:23 -07:00
/* Shorthand: the Switcher PTE page pointer belonging to the given CPU. */
# define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
2007-10-25 15:02:50 +10:00
/*H:320 The page table code is curly enough to need helper functions to keep it
* clear and clean .
2007-07-26 10:41:04 -07:00
*
2007-10-22 11:03:33 +10:00
* There are two functions which return pointers to the shadow ( aka " real " )
2007-07-26 10:41:04 -07:00
* page tables .
*
* spgd_addr ( ) takes the virtual address and returns a pointer to the top - level
2007-10-25 15:02:50 +10:00
* page directory entry ( PGD ) for that address . Since we keep track of several
* page tables , the " i " argument tells us which one we ' re interested in ( it ' s
2007-07-26 10:41:04 -07:00
* usually the current one ) . */
2008-01-17 19:19:42 -02:00
static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
{
	unsigned int slot = pgd_index(vaddr);

	/* The Switcher's slot is off limits to the Guest: kill it, and fall
	 * back to entry 0 so we still hand back a usable pointer. */
	if (slot >= SWITCHER_PGD_INDEX) {
		kill_guest(cpu, " attempt to access switcher pages ");
		slot = 0;
	}

	/* Return a pointer to the slot'th pgd entry of the i'th page table. */
	return cpu->lg->pgdirs[i].pgdir + slot;
}
2007-10-25 15:02:50 +10:00
/* This routine then takes the page directory entry returned above, which
* contains the address of the page table entry ( PTE ) page . It then returns a
* pointer to the PTE entry for the given address . */
2008-01-17 19:09:49 -02:00
static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr)
{
	pte_t *pte_page;

	/* Callers must only hand us a present PGD entry. */
	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));

	/* The PGD entry holds the page frame number of the PTE page; index
	 * into that page with the page number of vaddr. */
	pte_page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
	return pte_page + ((vaddr >> PAGE_SHIFT) % PTRS_PER_PTE);
}
2007-07-26 10:41:04 -07:00
/* These two functions are just like the above two, except they access the
 * Guest page tables.  Hence they return a Guest address. */
2008-01-07 11:05:37 -02:00
static unsigned long gpgd_addr ( struct lg_cpu * cpu , unsigned long vaddr )
2007-07-19 01:49:23 -07:00
{
2007-10-22 11:03:33 +10:00
unsigned int index = vaddr > > ( PGDIR_SHIFT ) ;
2008-01-07 11:05:37 -02:00
return cpu - > lg - > pgdirs [ cpu - > cpu_pgd ] . gpgdir + index * sizeof ( pgd_t ) ;
2007-07-19 01:49:23 -07:00
}
2008-01-17 19:18:08 -02:00
/* Given a (present) Guest PGD entry, return the Guest address of the PTE
 * which covers vaddr inside the PTE page that entry points to. */
static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr)
{
	unsigned long pte_slot = (vaddr >> PAGE_SHIFT) % PTRS_PER_PTE;
	unsigned long pte_page;

	/* A non-present PGD entry has no PTE page to look into. */
	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));

	pte_page = pgd_pfn(gpgd) << PAGE_SHIFT;
	return pte_page + pte_slot * sizeof(pte_t);
}
2008-03-28 11:05:53 -05:00
/*:*/
2008-08-12 17:52:53 -05:00
/*M:014 get_pfn is slow: we could probably try to grab batches of pages here as
* an optimization ( ie . pre - faulting ) . : */
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/*H:350 This routine takes a page number given by the Guest and converts it to
* an actual , physical page number . It can fail for several reasons : the
* virtual address might not be mapped by the Launcher , the write flag is set
* and the page is read - only , or the write flag was set and the page was
* shared so had to be copied , but we ran out of memory .
*
2008-03-28 11:05:53 -05:00
* This holds a reference to the page , so release_pte ( ) is careful to put that
* back . */
2007-07-19 01:49:23 -07:00
static unsigned long get_pfn ( unsigned long virtpfn , int write )
{
struct page * page ;
2008-08-12 17:52:53 -05:00
/* gup me one page at this address please! */
if ( get_user_pages_fast ( virtpfn < < PAGE_SHIFT , 1 , write , & page ) = = 1 )
return page_to_pfn ( page ) ;
2007-07-26 10:41:04 -07:00
/* This value indicates failure. */
2008-08-12 17:52:53 -05:00
return - 1UL ;
2007-07-19 01:49:23 -07:00
}
2007-07-26 10:41:04 -07:00
/*H:340 Converting a Guest page table entry to a shadow (ie. real) page table
* entry can be a little tricky . The flags are ( almost ) the same , but the
* Guest PTE contains a virtual page number : the CPU needs the real page
* number . */
2008-01-17 19:19:42 -02:00
static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
{
	/* The Guest's pages sit at an offset inside the Launcher, so Guest
	 * page numbers are relative to this base. */
	unsigned long launcher_base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
	/* The Guest sets the global flag because it believes it is using PGE;
	 * we don't use the global bit, so it is dropped from the shadow. */
	unsigned long shadow_flags = pte_flags(gpte) & ~_PAGE_GLOBAL;
	/* Kept in an "unsigned long" so the -1UL failure value from get_pfn()
	 * can be recognised. */
	unsigned long real_pfn;

	real_pfn = get_pfn(launcher_base + pte_pfn(gpte), write);
	if (real_pfn == -1UL) {
		kill_guest(cpu, " failed to get page %lu ", pte_pfn(gpte));
		/* Clear the flags so release_pte() never treats this entry as
		 * a valid one when the shadow tables are torn down. */
		shadow_flags = 0;
	}

	/* Assemble the shadow PTE from the real page number and the flags. */
	return pfn_pte(real_pfn, __pgprot(shadow_flags));
}
2007-07-26 10:41:04 -07:00
/*H:460 And to complete the chain, release_pte() looks like this: */
2007-10-22 11:03:33 +10:00
static void release_pte(pte_t pte)
{
	/* Only present entries hold a page reference. */
	if (!(pte_flags(pte) & _PAGE_PRESENT))
		return;

	/* get_pfn() took a reference via get_user_pages_fast(): drop it. */
	put_page(pfn_to_page(pte_pfn(pte)));
}
2007-07-26 10:41:04 -07:00
/*:*/
2007-07-19 01:49:23 -07:00
2008-01-17 19:19:42 -02:00
/* Kill the Guest if its PTE has _PAGE_PSE set or names a page frame at or
 * beyond the Guest's pfn_limit. */
static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
{
	bool has_pse = pte_flags(gpte) & _PAGE_PSE;
	bool past_limit = pte_pfn(gpte) >= cpu->lg->pfn_limit;

	if (has_pse || past_limit)
		kill_guest(cpu, " bad page table entry ");
}
2008-01-17 19:19:42 -02:00
/* Kill the Guest if its PGD entry carries any flag outside _PAGE_TABLE or
 * names a page frame at or beyond the Guest's pfn_limit. */
static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
{
	bool acceptable = !(pgd_flags(gpgd) & ~_PAGE_TABLE) &&
			  pgd_pfn(gpgd) < cpu->lg->pfn_limit;

	if (!acceptable)
		kill_guest(cpu, " bad page directory entry ");
}
2007-07-26 10:41:04 -07:00
/*H:330
2007-10-25 15:02:50 +10:00
* ( i ) Looking up a page table entry when the Guest faults .
2007-07-26 10:41:04 -07:00
*
* We saw this call in run_guest ( ) : when we see a page fault in the Guest , we
* come here . That ' s because we only set up the shadow page tables lazily as
* they ' re needed , so we get page faults all the time and quietly fix them up
* and return to the Guest without it knowing .
*
* If we fixed up the fault ( ie . we mapped the address ) , this routine returns
2007-10-25 15:02:50 +10:00
* true . Otherwise , it was a real fault and we need to tell the Guest . */
2009-03-18 13:38:35 -03:00
bool demand_page ( struct lg_cpu * cpu , unsigned long vaddr , int errcode )
2007-07-19 01:49:23 -07:00
{
2007-10-22 11:03:33 +10:00
pgd_t gpgd ;
pgd_t * spgd ;
2007-07-19 01:49:23 -07:00
unsigned long gpte_ptr ;
2007-10-22 11:03:33 +10:00
pte_t gpte ;
pte_t * spte ;
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/* First step: get the top-level Guest page table entry. */
2008-01-17 19:19:42 -02:00
gpgd = lgread ( cpu , gpgd_addr ( cpu , vaddr ) , pgd_t ) ;
2007-07-26 10:41:04 -07:00
/* Toplevel not present? We can't map it in. */
2007-10-22 11:03:33 +10:00
if ( ! ( pgd_flags ( gpgd ) & _PAGE_PRESENT ) )
2009-03-18 13:38:35 -03:00
return false ;
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/* Now look at the matching shadow entry. */
2008-01-17 19:19:42 -02:00
spgd = spgd_addr ( cpu , cpu - > cpu_pgd , vaddr ) ;
2007-10-22 11:03:33 +10:00
if ( ! ( pgd_flags ( * spgd ) & _PAGE_PRESENT ) ) {
2007-07-26 10:41:04 -07:00
/* No shadow entry: allocate a new shadow PTE page. */
2007-07-19 01:49:23 -07:00
unsigned long ptepage = get_zeroed_page ( GFP_KERNEL ) ;
2007-07-26 10:41:04 -07:00
/* This is not really the Guest's fault, but killing it is
* simple for this corner case . */
2007-07-19 01:49:23 -07:00
if ( ! ptepage ) {
2008-01-17 19:19:42 -02:00
kill_guest ( cpu , " out of memory allocating pte page " ) ;
2009-03-18 13:38:35 -03:00
return false ;
2007-07-19 01:49:23 -07:00
}
2007-07-26 10:41:04 -07:00
/* We check that the Guest pgd is OK. */
2008-01-17 19:19:42 -02:00
check_gpgd ( cpu , gpgd ) ;
2007-07-26 10:41:04 -07:00
/* And we copy the flags to the shadow PGD entry. The page
* number in the shadow PGD is the page we just allocated . */
2007-10-22 11:03:33 +10:00
* spgd = __pgd ( __pa ( ptepage ) | pgd_flags ( gpgd ) ) ;
2007-07-19 01:49:23 -07:00
}
2007-07-26 10:41:04 -07:00
/* OK, now we look at the lower level in the Guest page table: keep its
* address , because we might update it later . */
2008-01-17 19:18:08 -02:00
gpte_ptr = gpte_addr ( gpgd , vaddr ) ;
2008-01-17 19:19:42 -02:00
gpte = lgread ( cpu , gpte_ptr , pte_t ) ;
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/* If this page isn't in the Guest page tables, we can't page it in. */
2007-10-22 11:03:33 +10:00
if ( ! ( pte_flags ( gpte ) & _PAGE_PRESENT ) )
2009-03-18 13:38:35 -03:00
return false ;
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/* Check they're not trying to write to a page the Guest wants
* read - only ( bit 2 of errcode = = write ) . */
2007-10-22 11:03:33 +10:00
if ( ( errcode & 2 ) & & ! ( pte_flags ( gpte ) & _PAGE_RW ) )
2009-03-18 13:38:35 -03:00
return false ;
2007-07-19 01:49:23 -07:00
2007-10-25 15:02:50 +10:00
/* User access to a kernel-only page? (bit 3 == user access) */
2007-10-22 11:03:33 +10:00
if ( ( errcode & 4 ) & & ! ( pte_flags ( gpte ) & _PAGE_USER ) )
2009-03-18 13:38:35 -03:00
return false ;
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/* Check that the Guest PTE flags are OK, and the page number is below
* the pfn_limit ( ie . not mapping the Launcher binary ) . */
2008-01-17 19:19:42 -02:00
check_gpte ( cpu , gpte ) ;
2007-10-25 15:02:50 +10:00
2007-07-26 10:41:04 -07:00
/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
2007-10-22 11:03:33 +10:00
gpte = pte_mkyoung ( gpte ) ;
2007-07-19 01:49:23 -07:00
if ( errcode & 2 )
2007-10-22 11:03:33 +10:00
gpte = pte_mkdirty ( gpte ) ;
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/* Get the pointer to the shadow PTE entry we're going to set. */
2008-01-17 19:09:49 -02:00
spte = spte_addr ( * spgd , vaddr ) ;
2007-07-26 10:41:04 -07:00
/* If there was a valid shadow PTE entry here before, we release it.
* This can happen with a write to a previously read - only entry . */
2007-07-19 01:49:23 -07:00
release_pte ( * spte ) ;
2007-07-26 10:41:04 -07:00
/* If this is a write, we insist that the Guest page is writable (the
* final arg to gpte_to_spte ( ) ) . */
2007-10-22 11:03:33 +10:00
if ( pte_dirty ( gpte ) )
2008-01-17 19:19:42 -02:00
* spte = gpte_to_spte ( cpu , gpte , 1 ) ;
2007-10-22 11:03:33 +10:00
else
2007-07-26 10:41:04 -07:00
/* If this is a read, don't set the "writable" bit in the page
* table entry , even if the Guest says it ' s writable . That way
2007-10-25 15:02:50 +10:00
* we will come back here when a write does actually occur , so
* we can update the Guest ' s _PAGE_DIRTY flag . */
2008-01-17 19:19:42 -02:00
* spte = gpte_to_spte ( cpu , pte_wrprotect ( gpte ) , 0 ) ;
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/* Finally, we write the Guest PTE entry back: we've set the
* _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags . */
2008-01-17 19:19:42 -02:00
lgwrite ( cpu , gpte_ptr , pte_t , gpte ) ;
2007-07-26 10:41:04 -07:00
2007-10-25 15:02:50 +10:00
/* The fault is fixed, the page table is populated, the mapping
* manipulated , the result returned and the code complete . A small
* delay and a trace of alliteration are the only indications the Guest
* has that a page fault occurred at all . */
2009-03-18 13:38:35 -03:00
return true ;
2007-07-19 01:49:23 -07:00
}
2007-10-25 15:02:50 +10:00
/*H:360
* ( ii ) Making sure the Guest stack is mapped .
2007-07-26 10:41:04 -07:00
*
2007-10-25 15:02:50 +10:00
* Remember that direct traps into the Guest need a mapped Guest kernel stack .
* pin_stack_pages ( ) calls us here : we could simply call demand_page ( ) , but as
* we ' ve seen that logic is quite long , and usually the stack pages are already
* mapped , so it ' s overkill .
2007-07-26 10:41:04 -07:00
*
* This is a quick version which answers the question : is this virtual address
* mapped by the shadow page tables , and is it writable ? */
2009-03-18 13:38:35 -03:00
static bool page_writable ( struct lg_cpu * cpu , unsigned long vaddr )
2007-07-19 01:49:23 -07:00
{
2007-10-22 11:03:33 +10:00
pgd_t * spgd ;
2007-07-19 01:49:23 -07:00
unsigned long flags ;
2007-10-25 15:02:50 +10:00
/* Look at the current top level entry: is it present? */
2008-01-17 19:19:42 -02:00
spgd = spgd_addr ( cpu , cpu - > cpu_pgd , vaddr ) ;
2007-10-22 11:03:33 +10:00
if ( ! ( pgd_flags ( * spgd ) & _PAGE_PRESENT ) )
2009-03-18 13:38:35 -03:00
return false ;
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/* Check the flags on the pte entry itself: it must be present and
* writable . */
2008-01-17 19:09:49 -02:00
flags = pte_flags ( * ( spte_addr ( * spgd , vaddr ) ) ) ;
2007-10-22 11:03:33 +10:00
2007-07-19 01:49:23 -07:00
return ( flags & ( _PAGE_PRESENT | _PAGE_RW ) ) = = ( _PAGE_PRESENT | _PAGE_RW ) ;
}
2007-07-26 10:41:04 -07:00
/* So, when pin_stack_pages() asks us to pin a page, we check if it's already
* in the page tables , and if not , we call demand_page ( ) with error code 2
* ( meaning " write " ) . */
2008-01-07 11:05:37 -02:00
void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
{
	/* Usual case: already mapped writable in the shadow page tables. */
	if (page_writable(cpu, vaddr))
		return;

	/* Otherwise fault it in as though the Guest had written to it
	 * (error code 2 means "write"). */
	if (demand_page(cpu, vaddr, 2))
		return;

	kill_guest(cpu, " bad stack page %#lx ", vaddr);
}
2007-07-26 10:41:04 -07:00
/*H:450 If we chase down the release_pgd() code, it looks like this: */
2007-10-22 11:03:33 +10:00
static void release_pgd(struct lguest *lg, pgd_t *spgd)
{
	pte_t *pte_page;
	unsigned int n;

	/* A non-present entry owns nothing. */
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
		return;

	/* Page number -> physical address -> kernel virtual address gives us
	 * the PTE page this entry points to. */
	pte_page = __va(pgd_pfn(*spgd) << PAGE_SHIFT);

	/* Release each entry in the page, then the page of PTEs itself. */
	for (n = 0; n < PTRS_PER_PTE; n++)
		release_pte(pte_page[n]);
	free_page((unsigned long)pte_page);

	/* Zero the PGD entry so it can never be released twice. */
	*spgd = __pgd(0);
}
2007-10-25 15:02:50 +10:00
/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings()
* hypercall and once in new_pgdir ( ) when we re - used a top - level pgdir page .
* It simply releases every PTE page from 0 up to the Guest ' s kernel address . */
2007-07-19 01:49:23 -07:00
static void flush_user_mappings ( struct lguest * lg , int idx )
{
unsigned int i ;
2007-07-26 10:41:04 -07:00
/* Release every pgd entry up to the kernel's address. */
2007-10-22 11:03:36 +10:00
for ( i = 0 ; i < pgd_index ( lg - > kernel_address ) ; i + + )
2007-07-19 01:49:23 -07:00
release_pgd ( lg , lg - > pgdirs [ idx ] . pgdir + i ) ;
}
2007-10-25 15:02:50 +10:00
/*H:440 (v) Flushing (throwing away) page tables,
*
* The Guest has a hypercall to throw away the page tables : it ' s used when a
* large number of mappings have been changed . */
2008-01-07 11:05:37 -02:00
void guest_pagetable_flush_user ( struct lg_cpu * cpu )
2007-07-19 01:49:23 -07:00
{
2007-07-26 10:41:04 -07:00
/* Drop the userspace part of the current page table. */
2008-01-07 11:05:37 -02:00
flush_user_mappings ( cpu - > lg , cpu - > cpu_pgd ) ;
2007-07-19 01:49:23 -07:00
}
2007-07-26 10:41:04 -07:00
/*:*/
2007-07-19 01:49:23 -07:00
2007-10-22 11:03:36 +10:00
/* We walk down the guest page tables to get a guest-physical address */
2008-01-07 11:05:37 -02:00
unsigned long guest_pa ( struct lg_cpu * cpu , unsigned long vaddr )
2007-10-22 11:03:36 +10:00
{
pgd_t gpgd ;
pte_t gpte ;
/* First step: get the top-level Guest page table entry. */
2008-01-17 19:19:42 -02:00
gpgd = lgread ( cpu , gpgd_addr ( cpu , vaddr ) , pgd_t ) ;
2007-10-22 11:03:36 +10:00
/* Toplevel not present? We can't map it in. */
2009-03-30 21:55:23 -06:00
if ( ! ( pgd_flags ( gpgd ) & _PAGE_PRESENT ) ) {
2008-01-17 19:19:42 -02:00
kill_guest ( cpu , " Bad address %#lx " , vaddr ) ;
2009-03-30 21:55:23 -06:00
return - 1UL ;
}
2007-10-22 11:03:36 +10:00
2008-01-17 19:19:42 -02:00
gpte = lgread ( cpu , gpte_addr ( gpgd , vaddr ) , pte_t ) ;
2007-10-22 11:03:36 +10:00
if ( ! ( pte_flags ( gpte ) & _PAGE_PRESENT ) )
2008-01-17 19:19:42 -02:00
kill_guest ( cpu , " Bad address %#lx " , vaddr ) ;
2007-10-22 11:03:36 +10:00
return pte_pfn ( gpte ) * PAGE_SIZE | ( vaddr & ~ PAGE_MASK ) ;
}
2007-07-26 10:41:04 -07:00
/* We keep several page tables. This is a simple routine to find the page
* table ( if any ) corresponding to this top - level address the Guest has given
* us . */
2007-07-19 01:49:23 -07:00
static unsigned int find_pgdir ( struct lguest * lg , unsigned long pgtable )
{
unsigned int i ;
for ( i = 0 ; i < ARRAY_SIZE ( lg - > pgdirs ) ; i + + )
2008-03-11 09:35:57 -05:00
if ( lg - > pgdirs [ i ] . pgdir & & lg - > pgdirs [ i ] . gpgdir = = pgtable )
2007-07-19 01:49:23 -07:00
break ;
return i ;
}
2007-07-26 10:41:04 -07:00
/*H:435 And this is us, creating the new page directory. If we really do
* allocate a new one ( and so the kernel parts are not there ) , we set
* blank_pgdir . */
2008-01-07 11:05:37 -02:00
static unsigned int new_pgdir ( struct lg_cpu * cpu ,
2007-10-22 11:03:34 +10:00
unsigned long gpgdir ,
2007-07-19 01:49:23 -07:00
int * blank_pgdir )
{
unsigned int next ;
2007-07-26 10:41:04 -07:00
/* We pick one entry at random to throw out. Choosing the Least
* Recently Used might be better , but this is easy . */
2008-01-17 19:19:42 -02:00
next = random32 ( ) % ARRAY_SIZE ( cpu - > lg - > pgdirs ) ;
2007-07-26 10:41:04 -07:00
/* If it's never been allocated at all before, try now. */
2008-01-17 19:19:42 -02:00
if ( ! cpu - > lg - > pgdirs [ next ] . pgdir ) {
cpu - > lg - > pgdirs [ next ] . pgdir =
( pgd_t * ) get_zeroed_page ( GFP_KERNEL ) ;
2007-07-26 10:41:04 -07:00
/* If the allocation fails, just keep using the one we have */
2008-01-17 19:19:42 -02:00
if ( ! cpu - > lg - > pgdirs [ next ] . pgdir )
2008-01-07 11:05:37 -02:00
next = cpu - > cpu_pgd ;
2007-07-19 01:49:23 -07:00
else
2007-07-26 10:41:04 -07:00
/* This is a blank page, so there are no kernel
* mappings : caller must map the stack ! */
2007-07-19 01:49:23 -07:00
* blank_pgdir = 1 ;
}
2007-07-26 10:41:04 -07:00
/* Record which Guest toplevel this shadows. */
2008-01-17 19:19:42 -02:00
cpu - > lg - > pgdirs [ next ] . gpgdir = gpgdir ;
2007-07-19 01:49:23 -07:00
/* Release all the non-kernel mappings. */
2008-01-17 19:19:42 -02:00
flush_user_mappings ( cpu - > lg , next ) ;
2007-07-19 01:49:23 -07:00
return next ;
}
2007-07-26 10:41:04 -07:00
/*H:430 (iv) Switching page tables
*
2007-10-25 15:02:50 +10:00
 * Now we've seen all the page table setting and manipulation, let's see
 * what happens when the Guest changes page tables (ie. changes the top-level
 * pgdir).  This occurs on almost every context switch. */
2008-01-07 11:05:35 -02:00
void guest_new_pagetable ( struct lg_cpu * cpu , unsigned long pgtable )
2007-07-19 01:49:23 -07:00
{
int newpgdir , repin = 0 ;
2007-07-26 10:41:04 -07:00
/* Look to see if we have this one already. */
2008-01-17 19:19:42 -02:00
newpgdir = find_pgdir ( cpu - > lg , pgtable ) ;
2007-07-26 10:41:04 -07:00
/* If not, we allocate or mug an existing one: if it's a fresh one,
* repin gets set to 1. */
2008-01-17 19:19:42 -02:00
if ( newpgdir = = ARRAY_SIZE ( cpu - > lg - > pgdirs ) )
2008-01-07 11:05:37 -02:00
newpgdir = new_pgdir ( cpu , pgtable , & repin ) ;
2007-07-26 10:41:04 -07:00
/* Change the current pgd index to the new one. */
2008-01-07 11:05:37 -02:00
cpu - > cpu_pgd = newpgdir ;
2007-07-26 10:41:04 -07:00
/* If it was completely blank, we map in the Guest kernel stack */
2007-07-19 01:49:23 -07:00
if ( repin )
2008-01-07 11:05:35 -02:00
pin_stack_pages ( cpu ) ;
2007-07-19 01:49:23 -07:00
}
2007-07-26 10:41:04 -07:00
/*H:470 Finally, a routine which throws away everything: all PGD entries in all
2007-10-25 15:02:50 +10:00
* the shadow page tables , including the Guest ' s kernel mappings . This is used
* when we destroy the Guest . */
2007-07-19 01:49:23 -07:00
static void release_all_pagetables ( struct lguest * lg )
{
unsigned int i , j ;
2007-07-26 10:41:04 -07:00
/* Every shadow pagetable this Guest has */
2007-07-19 01:49:23 -07:00
for ( i = 0 ; i < ARRAY_SIZE ( lg - > pgdirs ) ; i + + )
if ( lg - > pgdirs [ i ] . pgdir )
2007-07-26 10:41:04 -07:00
/* Every PGD entry except the Switcher at the top */
2007-07-19 01:49:23 -07:00
for ( j = 0 ; j < SWITCHER_PGD_INDEX ; j + + )
release_pgd ( lg , lg - > pgdirs [ i ] . pgdir + j ) ;
}
2007-07-26 10:41:04 -07:00
/* We also throw away everything when a Guest tells us it's changed a kernel
* mapping . Since kernel mappings are in every page table , it ' s easiest to
2007-10-25 15:02:50 +10:00
* throw them all away . This traps the Guest in amber for a while as
* everything faults back in , but it ' s rare . */
2008-01-07 11:05:35 -02:00
void guest_pagetable_clear_all ( struct lg_cpu * cpu )
2007-07-19 01:49:23 -07:00
{
2008-01-07 11:05:35 -02:00
release_all_pagetables ( cpu - > lg ) ;
2007-07-26 10:41:04 -07:00
/* We need the Guest kernel stack mapped again. */
2008-01-07 11:05:35 -02:00
pin_stack_pages ( cpu ) ;
2007-07-19 01:49:23 -07:00
}
2007-10-25 15:02:50 +10:00
/*:*/
/*M:009 Since we throw away all mappings when a kernel mapping changes, our
* performance sucks for guests using highmem . In fact , a guest with
* PAGE_OFFSET 0xc0000000 ( the default ) and more than about 700 MB of RAM is
* usually slower than a Guest with less memory .
*
* This , of course , cannot be fixed . It would take some kind of . . . well , I
* don ' t know , but the term " puissant code-fu " comes to mind . : */
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/*H:420 This is the routine which actually sets the page table entry for the
 * "idx"'th shadow page table.
*
* Normally , we can just throw out the old entry and replace it with 0 : if they
* use it demand_page ( ) will put the new entry in . We need to do this anyway :
* The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
* is read from , and _PAGE_DIRTY when it ' s written to .
*
* But Avi Kivity pointed out that most Operating Systems ( Linux included ) set
* these bits on PTEs immediately anyway . This is done to save the CPU from
* having to update them , but it helps us the same way : if they set
* _PAGE_ACCESSED then we can put a read - only PTE entry in immediately , and if
* they set _PAGE_DIRTY then we can put a writable PTE entry in immediately .
*/
2008-01-17 19:19:42 -02:00
static void do_set_pte(struct lg_cpu *cpu, int idx,
		       unsigned long vaddr, pte_t gpte)
{
	/* Find the shadow page directory entry covering vaddr. */
	pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
	pte_t *spte;

	/* No top level present means there is no entry to update. */
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
		return;

	/* Whatever was there before goes away first. */
	spte = spte_addr(*spgd, vaddr);
	release_pte(*spte);

	/* Neither accessed nor dirty: blank the entry and let demand_page()
	 * fault it in later. */
	if (!(pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED))) {
		*spte = __pte(0);
		return;
	}

	/* The Guest already marked it accessed or dirty, so install the new
	 * entry right away; it is writable only if marked dirty. */
	check_gpte(cpu, gpte);
	*spte = gpte_to_spte(cpu, gpte, pte_flags(gpte) & _PAGE_DIRTY);
}
2007-07-26 10:41:04 -07:00
/*H:410 Updating a PTE entry is a little trickier.
*
* We keep track of several different page tables ( the Guest uses one for each
* process , so it makes sense to cache at least a few ) . Each of these have
* identical kernel parts : ie . every mapping above PAGE_OFFSET is the same for
* all processes . So when the page table above that address changes , we update
* all the page tables , not just the current one . This is rare .
*
2008-03-28 11:05:53 -05:00
* The benefit is that when we have to track a new page table , we can keep all
* the kernel mappings . This speeds up context switch immensely . */
2008-01-17 19:19:42 -02:00
void guest_set_pte(struct lg_cpu *cpu,
		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
	struct lguest *lg = cpu->lg;
	unsigned int i;

	if (vaddr < lg->kernel_address) {
		/* User mapping: only matters if we shadow that page table. */
		i = find_pgdir(lg, gpgdir);
		if (i != ARRAY_SIZE(lg->pgdirs))
			do_set_pte(cpu, i, vaddr, gpte);
		return;
	}

	/* Kernel mapping: update every shadow page table we have allocated.
	 * Slow, but it doesn't happen often. */
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
		if (lg->pgdirs[i].pgdir)
			do_set_pte(cpu, i, vaddr, gpte);
	}
}
2007-07-26 10:41:04 -07:00
/*H:400
2007-10-25 15:02:50 +10:00
* ( iii ) Setting up a page table entry when the Guest tells us one has changed .
2007-07-26 10:41:04 -07:00
*
* Just like we did in interrupts_and_traps . c , it makes sense for us to deal
* with the other side of page tables while we ' re here : what happens when the
* Guest asks for a page table to be updated ?
*
* We already saw that demand_page ( ) will fill in the shadow page tables when
* needed , so we can simply remove shadow page table entries whenever the Guest
* tells us they ' ve changed . When the Guest tries to use the new entry it will
* fault and demand_page ( ) will fix it up .
*
 * So with that in mind here's our code to update a (top-level) PGD entry:
*/
2007-10-22 11:03:34 +10:00
void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{
	unsigned int shadow;

	/* The kernel seems to try to initialize this early on: attempts to
	 * map over the Switcher are ignored. */
	if (idx < SWITCHER_PGD_INDEX) {
		/* If we keep a shadow of that page table, throw away the
		 * matching top-level entry. */
		shadow = find_pgdir(lg, gpgdir);
		if (shadow < ARRAY_SIZE(lg->pgdirs))
			release_pgd(lg, &lg->pgdirs[shadow].pgdir[idx]);
	}
}
2008-09-29 01:40:07 -03:00
/* Once we know how much memory we have we can construct simple identity
* ( which set virtual = = physical ) and linear mappings
* which will get the Guest far enough into the boot to create its own .
*
* We lay them out of the way , just below the initrd ( which is why we need to
* know its size here ) . */
static unsigned long setup_pagetables ( struct lguest * lg ,
unsigned long mem ,
unsigned long initrd_size )
{
pgd_t __user * pgdir ;
pte_t __user * linear ;
unsigned int mapped_pages , i , linear_pages , phys_linear ;
unsigned long mem_base = ( unsigned long ) lg - > mem_base ;
/* We have mapped_pages frames to map, so we need
* linear_pages page tables to map them . */
mapped_pages = mem / PAGE_SIZE ;
linear_pages = ( mapped_pages + PTRS_PER_PTE - 1 ) / PTRS_PER_PTE ;
/* We put the toplevel page directory page at the top of memory. */
pgdir = ( pgd_t * ) ( mem + mem_base - initrd_size - PAGE_SIZE ) ;
/* Now we use the next linear_pages pages as pte pages */
linear = ( void * ) pgdir - linear_pages * PAGE_SIZE ;
/* Linear mapping is easy: put every page's address into the
* mapping in order . */
for ( i = 0 ; i < mapped_pages ; i + + ) {
pte_t pte ;
pte = pfn_pte ( i , __pgprot ( _PAGE_PRESENT | _PAGE_RW | _PAGE_USER ) ) ;
if ( copy_to_user ( & linear [ i ] , & pte , sizeof ( pte ) ) ! = 0 )
return - EFAULT ;
}
/* The top level points to the linear page table pages above.
* We setup the identity and linear mappings here . */
phys_linear = ( unsigned long ) linear - mem_base ;
for ( i = 0 ; i < mapped_pages ; i + = PTRS_PER_PTE ) {
pgd_t pgd ;
pgd = __pgd ( ( phys_linear + i * sizeof ( pte_t ) ) |
( _PAGE_PRESENT | _PAGE_RW | _PAGE_USER ) ) ;
if ( copy_to_user ( & pgdir [ i / PTRS_PER_PTE ] , & pgd , sizeof ( pgd ) )
| | copy_to_user ( & pgdir [ pgd_index ( PAGE_OFFSET )
+ i / PTRS_PER_PTE ] ,
& pgd , sizeof ( pgd ) ) )
return - EFAULT ;
}
/* We return the top level (guest-physical) address: remember where
* this is . */
return ( unsigned long ) pgdir - mem_base ;
}
2007-07-26 10:41:04 -07:00
/*H:500 (vii) Setting up the page tables initially.
*
* When a Guest is first created , the Launcher tells us where the toplevel of
* its first page table is . We set some things up here : */
2008-09-29 01:40:07 -03:00
int init_guest_pagetable ( struct lguest * lg )
2007-07-19 01:49:23 -07:00
{
2008-09-29 01:40:07 -03:00
u64 mem ;
u32 initrd_size ;
struct boot_params __user * boot = ( struct boot_params * ) lg - > mem_base ;
/* Get the Guest memory size and the ramdisk size from the boot header
* located at lg - > mem_base ( Guest address 0 ) . */
if ( copy_from_user ( & mem , & boot - > e820_map [ 0 ] . size , sizeof ( mem ) )
| | get_user ( initrd_size , & boot - > hdr . ramdisk_size ) )
return - EFAULT ;
2007-07-26 10:41:04 -07:00
/* We start on the first shadow page table, and give it a blank PGD
* page . */
2008-09-29 01:40:07 -03:00
lg - > pgdirs [ 0 ] . gpgdir = setup_pagetables ( lg , mem , initrd_size ) ;
if ( IS_ERR_VALUE ( lg - > pgdirs [ 0 ] . gpgdir ) )
return lg - > pgdirs [ 0 ] . gpgdir ;
2008-01-07 11:05:37 -02:00
lg - > pgdirs [ 0 ] . pgdir = ( pgd_t * ) get_zeroed_page ( GFP_KERNEL ) ;
if ( ! lg - > pgdirs [ 0 ] . pgdir )
2007-07-19 01:49:23 -07:00
return - ENOMEM ;
2008-01-07 11:05:37 -02:00
lg - > cpus [ 0 ] . cpu_pgd = 0 ;
2007-07-19 01:49:23 -07:00
return 0 ;
}
2007-10-22 11:03:36 +10:00
/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
2008-01-17 19:19:42 -02:00
void page_table_guest_data_init ( struct lg_cpu * cpu )
2007-10-22 11:03:36 +10:00
{
/* We get the kernel address: above this is all kernel memory. */
2008-01-17 19:19:42 -02:00
if ( get_user ( cpu - > lg - > kernel_address ,
& cpu - > lg - > lguest_data - > kernel_address )
2007-10-22 11:03:36 +10:00
/* We tell the Guest that it can't use the top 4MB of virtual
* addresses used by the Switcher . */
2008-01-17 19:19:42 -02:00
| | put_user ( 4U * 1024 * 1024 , & cpu - > lg - > lguest_data - > reserve_mem )
| | put_user ( cpu - > lg - > pgdirs [ 0 ] . gpgdir , & cpu - > lg - > lguest_data - > pgdir ) )
kill_guest ( cpu , " bad guest page %p " , cpu - > lg - > lguest_data ) ;
2007-10-22 11:03:36 +10:00
/* In flush_user_mappings() we loop from 0 to
* " pgd_index(lg->kernel_address) " . This assumes it won ' t hit the
* Switcher mappings , so check that now . */
2008-01-17 19:19:42 -02:00
if ( pgd_index ( cpu - > lg - > kernel_address ) > = SWITCHER_PGD_INDEX )
kill_guest ( cpu , " bad kernel address %#lx " ,
cpu - > lg - > kernel_address ) ;
2007-10-22 11:03:36 +10:00
}
2007-07-26 10:41:04 -07:00
/* When a Guest dies, our cleanup is fairly simple. */
2007-07-19 01:49:23 -07:00
void free_guest_pagetable ( struct lguest * lg )
{
unsigned int i ;
2007-07-26 10:41:04 -07:00
/* Throw away all page table pages. */
2007-07-19 01:49:23 -07:00
release_all_pagetables ( lg ) ;
2007-07-26 10:41:04 -07:00
/* Now free the top levels: free_page() can handle 0 just fine. */
2007-07-19 01:49:23 -07:00
for ( i = 0 ; i < ARRAY_SIZE ( lg - > pgdirs ) ; i + + )
free_page ( ( long ) lg - > pgdirs [ i ] . pgdir ) ;
}
2007-07-26 10:41:04 -07:00
/*H:480 (vi) Mapping the Switcher when the Guest is about to run.
*
2007-10-25 15:02:50 +10:00
* The Switcher and the two pages for this CPU need to be visible in the
2007-07-26 10:41:04 -07:00
* Guest ( and not the pages for other CPUs ) . We have the appropriate PTE pages
2007-10-25 15:02:50 +10:00
* for each CPU already set up , we just need to hook them in now we know which
* Guest is about to run on this CPU . */
2008-01-07 11:05:30 -02:00
void map_switcher_in_guest ( struct lg_cpu * cpu , struct lguest_pages * pages )
2007-07-19 01:49:23 -07:00
{
2007-10-22 11:03:33 +10:00
pte_t * switcher_pte_page = __get_cpu_var ( switcher_pte_pages ) ;
pgd_t switcher_pgd ;
pte_t regs_pte ;
2008-01-07 11:05:32 -02:00
unsigned long pfn ;
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/* Make the last PGD entry for this Guest point to the Switcher's PTE
* page for this CPU ( with appropriate flags ) . */
2008-01-18 23:59:08 -02:00
switcher_pgd = __pgd ( __pa ( switcher_pte_page ) | __PAGE_KERNEL ) ;
2007-10-22 11:03:33 +10:00
2008-01-07 11:05:37 -02:00
cpu - > lg - > pgdirs [ cpu - > cpu_pgd ] . pgdir [ SWITCHER_PGD_INDEX ] = switcher_pgd ;
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/* We also change the Switcher PTE page. When we're running the Guest,
* we want the Guest ' s " regs " page to appear where the first Switcher
* page for this CPU is . This is an optimization : when the Switcher
* saves the Guest registers , it saves them into the first page of this
* CPU ' s " struct lguest_pages " : if we make sure the Guest ' s register
* page is already mapped there , we don ' t have to copy them out
* again . */
2008-01-07 11:05:32 -02:00
pfn = __pa ( cpu - > regs_page ) > > PAGE_SHIFT ;
2008-01-18 23:59:08 -02:00
regs_pte = pfn_pte ( pfn , __pgprot ( __PAGE_KERNEL ) ) ;
2007-10-22 11:03:33 +10:00
switcher_pte_page [ ( unsigned long ) pages / PAGE_SIZE % PTRS_PER_PTE ] = regs_pte ;
2007-07-19 01:49:23 -07:00
}
2007-07-26 10:41:04 -07:00
/*:*/
2007-07-19 01:49:23 -07:00
static void free_switcher_pte_pages ( void )
{
unsigned int i ;
for_each_possible_cpu ( i )
free_page ( ( long ) switcher_pte_page ( i ) ) ;
}
2007-07-26 10:41:04 -07:00
/*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given
* the CPU number and the " struct page " s for the Switcher code itself .
*
* Currently the Switcher is less than a page long , so " pages " is always 1. */
2007-07-19 01:49:23 -07:00
static __init void populate_switcher_pte_page ( unsigned int cpu ,
struct page * switcher_page [ ] ,
unsigned int pages )
{
unsigned int i ;
2007-10-22 11:03:33 +10:00
pte_t * pte = switcher_pte_page ( cpu ) ;
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/* The first entries are easy: they map the Switcher code. */
2007-07-19 01:49:23 -07:00
for ( i = 0 ; i < pages ; i + + ) {
2007-10-22 11:03:33 +10:00
pte [ i ] = mk_pte ( switcher_page [ i ] ,
__pgprot ( _PAGE_PRESENT | _PAGE_ACCESSED ) ) ;
2007-07-19 01:49:23 -07:00
}
2007-07-26 10:41:04 -07:00
/* The only other thing we map is this CPU's pair of pages. */
2007-07-19 01:49:23 -07:00
i = pages + cpu * 2 ;
2007-07-26 10:41:04 -07:00
/* First page (Guest registers) is writable from the Guest */
2007-10-22 11:03:33 +10:00
pte [ i ] = pfn_pte ( page_to_pfn ( switcher_page [ i ] ) ,
__pgprot ( _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_RW ) ) ;
2007-07-26 10:41:04 -07:00
/* The second page contains the "struct lguest_ro_state", and is
* read - only . */
2007-10-22 11:03:33 +10:00
pte [ i + 1 ] = pfn_pte ( page_to_pfn ( switcher_page [ i + 1 ] ) ,
__pgprot ( _PAGE_PRESENT | _PAGE_ACCESSED ) ) ;
2007-07-19 01:49:23 -07:00
}
2007-10-25 15:02:50 +10:00
/* We've made it through the page table code. Perhaps our tired brains are
* still processing the details , or perhaps we ' re simply glad it ' s over .
*
2008-03-28 11:05:53 -05:00
* If nothing else , note that all this complexity in juggling shadow page tables
* in sync with the Guest ' s page tables is for one reason : for most Guests this
* page table dance determines how bad performance will be . This is why Xen
* uses exotic direct Guest pagetable manipulation , and why both Intel and AMD
* have implemented shadow page table support directly into hardware .
2007-10-25 15:02:50 +10:00
*
* There is just one file remaining in the Host . */
2007-07-26 10:41:04 -07:00
/*H:510 At boot or module load time, init_pagetables() allocates and populates
* the Switcher PTE page for each CPU . */
2007-07-19 01:49:23 -07:00
/*
 * init_pagetables - allocate and populate the Switcher PTE page of each CPU.
 * @switcher_page: the "struct page"s backing the Switcher mappings
 * @pages:         number of Switcher code pages
 *
 * Returns 0 on success.  If an allocation fails, the per-CPU PTE pages are
 * freed again and -ENOMEM is returned.
 */
__init int init_pagetables(struct page **switcher_page, unsigned int pages)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		pte_t *table = (pte_t *)get_zeroed_page(GFP_KERNEL);

		/* Record the result even on failure, as the cleanup walks
		 * every possible CPU's slot. */
		switcher_pte_page(cpu) = table;
		if (!table) {
			free_switcher_pte_pages();
			return -ENOMEM;
		}
		populate_switcher_pte_page(cpu, switcher_page, pages);
	}
	return 0;
}
2007-07-26 10:41:04 -07:00
/*:*/
2007-07-19 01:49:23 -07:00
2007-07-26 10:41:04 -07:00
/* Cleaning up simply involves freeing the PTE page for each CPU. */
2007-07-19 01:49:23 -07:00
/* Public teardown entry point: all there is to release is the per-CPU
 * Switcher PTE pages. */
void free_pagetables(void)
{
	free_switcher_pte_pages();
}