/*P:700
 * The pagetable code, on the other hand, still shows the scars of
 * previous encounters.  It's functional, and as neat as it can be in the
 * circumstances, but be wary, for these things are subtle and break easily.
 * The Guest provides a virtual to physical mapping, but we can neither trust
 * it nor use it: we verify and convert it here then point the CPU to the
 * converted Guest pages when running the Guest.
:*/

/* Copyright (C) Rusty Russell IBM Corporation 2006.
 * GPL v2 and any later version */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/percpu.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/bootparam.h>
#include "lg.h"

/*M:008
 * We hold reference to pages, which prevents them from being swapped.
 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
 * to swap out.  If we had this, and a shrinker callback to trim PTE pages, we
 * could probably consider launching Guests as non-root.
:*/

/*H:300
 * The Page Table Code
 *
 * We use two-level page tables for the Guest.  If you're not entirely
 * comfortable with virtual addresses, physical addresses and page tables then
 * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with
 * diagrams!).
 *
 * The Guest keeps page tables, but we maintain the actual ones here: these are
 * called "shadow" page tables.  Which is a very Guest-centric name: these are
 * the real page tables the CPU uses, although we keep them up to date to
 * reflect the Guest's.  (See what I mean about weird naming?  Since when do
 * shadows reflect anything?)
 *
 * Anyway, this is the most complicated part of the Host code.  There are seven
 * parts to this:
 *  (i) Looking up a page table entry when the Guest faults,
 *  (ii) Making sure the Guest stack is mapped,
 *  (iii) Setting up a page table entry when the Guest tells us one has changed,
 *  (iv) Switching page tables,
 *  (v) Flushing (throwing away) page tables,
 *  (vi) Mapping the Switcher when the Guest is about to run,
 *  (vii) Setting up the page tables initially.
:*/
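
/*
 * (A rough sketch of the two-level case, assuming the usual 32-bit non-PAE
 * layout: the top 10 bits of a virtual address index the PGD, the next 10
 * bits index the PTE page, and the low 12 bits are the offset inside the
 * 4096-byte page.
 *
 *   31            22 21            12 11            0
 *  +----------------+----------------+---------------+
 *  |   PGD index    |   PTE index    |  page offset  |
 *  +----------------+----------------+---------------+
 *
 * Ten bits per level gives the 1024 entries mentioned below.)
 */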

/*
 * 1024 entries in a page table page maps 1024 pages: 4MB.  The Switcher is
 * conveniently placed at the top 4MB, so it uses a separate, complete PTE
 * page.
 */
#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
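
/*
 * (Arithmetic check: PTRS_PER_PGD is 1024 without PAE, so SWITCHER_PGD_INDEX
 * is 1023, and slot 1023 covers virtual addresses from 1023 * 4MB =
 * 0xFFC00000 upwards: exactly the top 4MB where the Switcher sits.)
 */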

/*
 * For PAE we need the PMD index as well.  We use the last 2MB, so we
 * will need the last pmd entry of the last pmd page.
 */
#ifdef CONFIG_X86_PAE
#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
#define RESERVE_MEM 2U
#define CHECK_GPGD_MASK _PAGE_PRESENT
#else
#define RESERVE_MEM 4U
#define CHECK_GPGD_MASK _PAGE_TABLE
#endif
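
/*
 * (Why only 2MB for PAE?  A PAE PTE page holds 512 eight-byte entries, each
 * mapping a 4KB page, so one PMD slot spans 512 * 4KB = 2MB.  Reserving the
 * Switcher's PTE page therefore costs the top 2MB rather than 4MB.)
 */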

/*
 * We actually need a separate PTE page for each CPU.  Remember that after the
 * Switcher code itself comes two pages for each CPU, and we don't want this
 * CPU's guest to see the pages of any other CPU.
 */
static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)

/*H:320
 * The page table code is curly enough to need helper functions to keep it
 * clear and clean.
 *
 * There are two functions which return pointers to the shadow (aka "real")
 * page tables.
 *
 * spgd_addr() takes the virtual address and returns a pointer to the top-level
 * page directory entry (PGD) for that address.  Since we keep track of several
 * page tables, the "i" argument tells us which one we're interested in (it's
 * usually the current one).
 */
static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
{
        unsigned int index = pgd_index(vaddr);

#ifndef CONFIG_X86_PAE
        /* We kill any Guest trying to touch the Switcher addresses. */
        if (index >= SWITCHER_PGD_INDEX) {
                kill_guest(cpu, "attempt to access switcher pages");
                index = 0;
        }
#endif
        /* Return a pointer index'th pgd entry for the i'th page table. */
        return &cpu->lg->pgdirs[i].pgdir[index];
}

#ifdef CONFIG_X86_PAE
/*
 * This routine then takes the PGD entry given above, which contains the
 * address of the PMD page.  It then returns a pointer to the PMD entry for the
 * given address.
 */
static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
{
        unsigned int index = pmd_index(vaddr);
        pmd_t *page;

        /* We kill any Guest trying to touch the Switcher addresses. */
        if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
            index >= SWITCHER_PMD_INDEX) {
                kill_guest(cpu, "attempt to access switcher pages");
                index = 0;
        }

        /* You should never call this if the PGD entry wasn't valid */
        BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
        page = __va(pgd_pfn(spgd) << PAGE_SHIFT);

        return &page[index];
}
#endif

/*
 * This routine then takes the page directory entry returned above, which
 * contains the address of the page table entry (PTE) page.  It then returns a
 * pointer to the PTE entry for the given address.
 */
static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
{
#ifdef CONFIG_X86_PAE
        pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
        pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);

        /* You should never call this if the PMD entry wasn't valid */
        BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
#else
        pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
        /* You should never call this if the PGD entry wasn't valid */
        BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
#endif

        return &page[pte_index(vaddr)];
}

/*
 * These two functions are just like the above two, except they access the
 * Guest page tables.  Hence they return a Guest address.
 */
static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
{
        unsigned int index = vaddr >> (PGDIR_SHIFT);
        return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
}

#ifdef CONFIG_X86_PAE
static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
{
        unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
        BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
        return gpage + pmd_index(vaddr) * sizeof(pmd_t);
}

static unsigned long gpte_addr(struct lg_cpu *cpu,
                               pmd_t gpmd, unsigned long vaddr)
{
        unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT;

        BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
        return gpage + pte_index(vaddr) * sizeof(pte_t);
}
#else
static unsigned long gpte_addr(struct lg_cpu *cpu,
                               pgd_t gpgd, unsigned long vaddr)
{
        unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;

        BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
        return gpage + pte_index(vaddr) * sizeof(pte_t);
}
#endif
/*:*/

/*M:014
 * get_pfn is slow: we could probably try to grab batches of pages here as
 * an optimization (ie. pre-faulting).
:*/

/*H:350
 * This routine takes a page number given by the Guest and converts it to
 * an actual, physical page number.  It can fail for several reasons: the
 * virtual address might not be mapped by the Launcher, the write flag is set
 * and the page is read-only, or the write flag was set and the page was
 * shared so had to be copied, but we ran out of memory.
 *
 * This holds a reference to the page, so release_pte() is careful to put that
 * back.
 */
static unsigned long get_pfn(unsigned long virtpfn, int write)
{
        struct page *page;

        /* gup me one page at this address please! */
        if (get_user_pages_fast(virtpfn << PAGE_SHIFT, 1, write, &page) == 1)
                return page_to_pfn(page);

        /* This value indicates failure. */
        return -1UL;
}

/*H:340
 * Converting a Guest page table entry to a shadow (ie. real) page table
 * entry can be a little tricky.  The flags are (almost) the same, but the
 * Guest PTE contains a virtual page number: the CPU needs the real page
 * number.
 */
static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
{
        unsigned long pfn, base, flags;

        /*
         * The Guest sets the global flag, because it thinks that it is using
         * PGE.  We only told it to use PGE so it would tell us whether it was
         * flushing a kernel mapping or a userspace mapping.  We don't actually
         * use the global bit, so throw it away.
         */
        flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);

        /* The Guest's pages are offset inside the Launcher. */
        base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;

        /*
         * We need a temporary "unsigned long" variable to hold the answer from
         * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
         * fit in spte.pfn.  get_pfn() finds the real physical number of the
         * page, given the virtual number.
         */
        pfn = get_pfn(base + pte_pfn(gpte), write);
        if (pfn == -1UL) {
                kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
                /*
                 * When we destroy the Guest, we'll go through the shadow page
                 * tables and release_pte() them.  Make sure we don't think
                 * this one is valid!
                 */
                flags = 0;
        }
        /* Now we assemble our shadow PTE from the page number and flags. */
        return pfn_pte(pfn, __pgprot(flags));
}

/*H:460 And to complete the chain, release_pte() looks like this: */
static void release_pte(pte_t pte)
{
        /*
         * Remember that get_user_pages_fast() took a reference to the page, in
         * get_pfn()?  We have to put it back now.
         */
        if (pte_flags(pte) & _PAGE_PRESENT)
                put_page(pte_page(pte));
}
/*:*/

static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
{
        if ((pte_flags(gpte) & _PAGE_PSE) ||
            pte_pfn(gpte) >= cpu->lg->pfn_limit)
                kill_guest(cpu, "bad page table entry");
}

static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
{
        if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
            (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
                kill_guest(cpu, "bad page directory entry");
}

#ifdef CONFIG_X86_PAE
static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
{
        if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
            (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
                kill_guest(cpu, "bad page middle directory entry");
}
#endif

/*H:330
 * (i) Looking up a page table entry when the Guest faults.
 *
 * We saw this call in run_guest(): when we see a page fault in the Guest, we
 * come here.  That's because we only set up the shadow page tables lazily as
 * they're needed, so we get page faults all the time and quietly fix them up
 * and return to the Guest without it knowing.
 *
 * If we fixed up the fault (ie. we mapped the address), this routine returns
 * true.  Otherwise, it was a real fault and we need to tell the Guest.
 */
bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
{
        pgd_t gpgd;
        pgd_t *spgd;
        unsigned long gpte_ptr;
        pte_t gpte;
        pte_t *spte;

#ifdef CONFIG_X86_PAE
        pmd_t *spmd;
        pmd_t gpmd;
#endif

        /* First step: get the top-level Guest page table entry. */
        gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
        /* Toplevel not present?  We can't map it in. */
        if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
                return false;

        /* Now look at the matching shadow entry. */
        spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
        if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
                /* No shadow entry: allocate a new shadow PTE page. */
                unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
                /*
                 * This is not really the Guest's fault, but killing it is
                 * simple for this corner case.
                 */
                if (!ptepage) {
                        kill_guest(cpu, "out of memory allocating pte page");
                        return false;
                }
                /* We check that the Guest pgd is OK. */
                check_gpgd(cpu, gpgd);
                /*
                 * And we copy the flags to the shadow PGD entry.  The page
                 * number in the shadow PGD is the page we just allocated.
                 */
                set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
        }

#ifdef CONFIG_X86_PAE
        gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
        /* Middle level not present?  We can't map it in. */
        if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
                return false;

        /* Now look at the matching shadow entry. */
        spmd = spmd_addr(cpu, *spgd, vaddr);
        if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
                /* No shadow entry: allocate a new shadow PTE page. */
                unsigned long ptepage = get_zeroed_page(GFP_KERNEL);

                /*
                 * This is not really the Guest's fault, but killing it is
                 * simple for this corner case.
                 */
                if (!ptepage) {
                        kill_guest(cpu, "out of memory allocating pte page");
                        return false;
                }

                /* We check that the Guest pmd is OK. */
                check_gpmd(cpu, gpmd);

                /*
                 * And we copy the flags to the shadow PMD entry.  The page
                 * number in the shadow PMD is the page we just allocated.
                 */
                native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
        }

        /*
         * OK, now we look at the lower level in the Guest page table: keep its
         * address, because we might update it later.
         */
        gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
#else
        /*
         * OK, now we look at the lower level in the Guest page table: keep its
         * address, because we might update it later.
         */
        gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
#endif

        gpte = lgread(cpu, gpte_ptr, pte_t);

        /* If this page isn't in the Guest page tables, we can't page it in. */
        if (!(pte_flags(gpte) & _PAGE_PRESENT))
                return false;

        /*
         * Check they're not trying to write to a page the Guest wants
         * read-only (bit 2 of errcode == write).
         */
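        /*
         * (For reference: the x86 page fault error code is a bitmask where 1
         * means the faulting page was present, 2 means it was a write, and 4
         * means the access came from userspace.  The comments here count
         * those bits from 1, hence "bit 2" and "bit 3".)
         */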
        if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
                return false;

        /* User access to a kernel-only page? (bit 3 == user access) */
        if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
                return false;

        /*
         * Check that the Guest PTE flags are OK, and the page number is below
         * the pfn_limit (ie. not mapping the Launcher binary).
         */
        check_gpte(cpu, gpte);

        /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
        gpte = pte_mkyoung(gpte);
        if (errcode & 2)
                gpte = pte_mkdirty(gpte);

        /* Get the pointer to the shadow PTE entry we're going to set. */
        spte = spte_addr(cpu, *spgd, vaddr);

        /*
         * If there was a valid shadow PTE entry here before, we release it.
         * This can happen with a write to a previously read-only entry.
         */
        release_pte(*spte);

        /*
         * If this is a write, we insist that the Guest page is writable (the
         * final arg to gpte_to_spte()).
         */
        if (pte_dirty(gpte))
                *spte = gpte_to_spte(cpu, gpte, 1);
        else
                /*
                 * If this is a read, don't set the "writable" bit in the page
                 * table entry, even if the Guest says it's writable.  That way
                 * we will come back here when a write does actually occur, so
                 * we can update the Guest's _PAGE_DIRTY flag.
                 */
                native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));

        /*
         * Finally, we write the Guest PTE entry back: we've set the
         * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
         */
        lgwrite(cpu, gpte_ptr, pte_t, gpte);

        /*
         * The fault is fixed, the page table is populated, the mapping
         * manipulated, the result returned and the code complete.  A small
         * delay and a trace of alliteration are the only indications the Guest
         * has that a page fault occurred at all.
         */
        return true;
}

/*H:360
 * (ii) Making sure the Guest stack is mapped.
 *
 * Remember that direct traps into the Guest need a mapped Guest kernel stack.
 * pin_stack_pages() calls us here: we could simply call demand_page(), but as
 * we've seen that logic is quite long, and usually the stack pages are already
 * mapped, so it's overkill.
 *
 * This is a quick version which answers the question: is this virtual address
 * mapped by the shadow page tables, and is it writable?
 */
static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
{
        pgd_t *spgd;
        unsigned long flags;

#ifdef CONFIG_X86_PAE
        pmd_t *spmd;
#endif
        /* Look at the current top level entry: is it present? */
        spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
        if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
                return false;

#ifdef CONFIG_X86_PAE
        spmd = spmd_addr(cpu, *spgd, vaddr);
        if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
                return false;
#endif

        /*
         * Check the flags on the pte entry itself: it must be present and
         * writable.
         */
        flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));

        return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}

/*
 * So, when pin_stack_pages() asks us to pin a page, we check if it's already
 * in the page tables, and if not, we call demand_page() with error code 2
 * (meaning "write").
 */
void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
{
        if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
                kill_guest(cpu, "bad stack page %#lx", vaddr);
}

#ifdef CONFIG_X86_PAE
static void release_pmd(pmd_t *spmd)
{
        /* If the entry's not present, there's nothing to release. */
        if (pmd_flags(*spmd) & _PAGE_PRESENT) {
                unsigned int i;
                pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
                /* For each entry in the page, we might need to release it. */
                for (i = 0; i < PTRS_PER_PTE; i++)
                        release_pte(ptepage[i]);
                /* Now we can free the page of PTEs */
                free_page((long)ptepage);
                /* And zero out the PMD entry so we never release it twice. */
                native_set_pmd(spmd, __pmd(0));
        }
}

static void release_pgd(pgd_t *spgd)
{
        /* If the entry's not present, there's nothing to release. */
        if (pgd_flags(*spgd) & _PAGE_PRESENT) {
                unsigned int i;
                pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);

                for (i = 0; i < PTRS_PER_PMD; i++)
                        release_pmd(&pmdpage[i]);

                /* Now we can free the page of PMDs */
                free_page((long)pmdpage);
                /* And zero out the PGD entry so we never release it twice. */
                set_pgd(spgd, __pgd(0));
        }
}

#else /* !CONFIG_X86_PAE */
/*H:450 If we chase down the release_pgd() code, it looks like this: */
static void release_pgd(pgd_t *spgd)
{
        /* If the entry's not present, there's nothing to release. */
        if (pgd_flags(*spgd) & _PAGE_PRESENT) {
                unsigned int i;
                /*
                 * Converting the pfn to find the actual PTE page is easy: turn
                 * the page number into a physical address, then convert to a
                 * virtual address (easy for kernel pages like this one).
                 */
                pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
                /* For each entry in the page, we might need to release it. */
                for (i = 0; i < PTRS_PER_PTE; i++)
                        release_pte(ptepage[i]);
                /* Now we can free the page of PTEs */
                free_page((long)ptepage);
                /* And zero out the PGD entry so we never release it twice. */
                *spgd = __pgd(0);
        }
}
#endif

/*H:445
 * We saw flush_user_mappings() twice: once from the flush_user_mappings()
 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
 * It simply releases every PTE page from 0 up to the Guest's kernel address.
 */
static void flush_user_mappings(struct lguest *lg, int idx)
{
        unsigned int i;
        /* Release every pgd entry up to the kernel's address. */
        for (i = 0; i < pgd_index(lg->kernel_address); i++)
                release_pgd(lg->pgdirs[idx].pgdir + i);
}

/*H:440
 * (v) Flushing (throwing away) page tables,
 *
 * The Guest has a hypercall to throw away the page tables: it's used when a
 * large number of mappings have been changed.
 */
void guest_pagetable_flush_user(struct lg_cpu *cpu)
{
        /* Drop the userspace part of the current page table. */
        flush_user_mappings(cpu->lg, cpu->cpu_pgd);
}
/*:*/

/* We walk down the guest page tables to get a guest-physical address */
unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
{
        pgd_t gpgd;
        pte_t gpte;
#ifdef CONFIG_X86_PAE
        pmd_t gpmd;
#endif
        /* First step: get the top-level Guest page table entry. */
        gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
        /* Toplevel not present?  We can't map it in. */
        if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) {
                kill_guest(cpu, "Bad address %#lx", vaddr);
                return -1UL;
        }

#ifdef CONFIG_X86_PAE
        gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
        if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
                kill_guest(cpu, "Bad address %#lx", vaddr);
        gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
#else
        gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
#endif
        if (!(pte_flags(gpte) & _PAGE_PRESENT))
                kill_guest(cpu, "Bad address %#lx", vaddr);

        return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
}

/*
 * We keep several page tables.  This is a simple routine to find the page
 * table (if any) corresponding to this top-level address the Guest has given
 * us.
 */
static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
{
        unsigned int i;
        for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
                if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable)
                        break;
        return i;
}

/*H:435
 * And this is us, creating the new page directory.  If we really do
 * allocate a new one (and so the kernel parts are not there), we set
 * blank_pgdir.
 */
static unsigned int new_pgdir(struct lg_cpu *cpu,
                              unsigned long gpgdir,
                              int *blank_pgdir)
{
        unsigned int next;
#ifdef CONFIG_X86_PAE
        pmd_t *pmd_table;
#endif

        /*
         * We pick one entry at random to throw out.  Choosing the Least
         * Recently Used might be better, but this is easy.
         */
        next = random32() % ARRAY_SIZE(cpu->lg->pgdirs);
        /* If it's never been allocated at all before, try now. */
        if (!cpu->lg->pgdirs[next].pgdir) {
                cpu->lg->pgdirs[next].pgdir =
                                        (pgd_t *)get_zeroed_page(GFP_KERNEL);
                /* If the allocation fails, just keep using the one we have */
                if (!cpu->lg->pgdirs[next].pgdir)
                        next = cpu->cpu_pgd;
                else {
#ifdef CONFIG_X86_PAE
                        /*
                         * In PAE mode, allocate a pmd page and populate the
                         * last pgd entry.
                         */
                        pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
                        if (!pmd_table) {
                                free_page((long)cpu->lg->pgdirs[next].pgdir);
                                set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
                                next = cpu->cpu_pgd;
                        } else {
                                set_pgd(cpu->lg->pgdirs[next].pgdir +
                                        SWITCHER_PGD_INDEX,
                                        __pgd(__pa(pmd_table) | _PAGE_PRESENT));
                                /*
                                 * This is a blank page, so there are no kernel
                                 * mappings: caller must map the stack!
                                 */
                                *blank_pgdir = 1;
                        }
#else
                        *blank_pgdir = 1;
#endif
                }
        }
        /* Record which Guest toplevel this shadows. */
        cpu->lg->pgdirs[next].gpgdir = gpgdir;
        /* Release all the non-kernel mappings. */
        flush_user_mappings(cpu->lg, next);

        return next;
}

/*H:430
 * (iv) Switching page tables
 *
 * Now we've seen all the page table setting and manipulation, let's see
 * what happens when the Guest changes page tables (ie. changes the top-level
 * pgdir).  This occurs on almost every context switch.
 */
void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
{
        int newpgdir, repin = 0;

        /* Look to see if we have this one already. */
        newpgdir = find_pgdir(cpu->lg, pgtable);
        /*
         * If not, we allocate or mug an existing one: if it's a fresh one,
         * repin gets set to 1.
         */
        if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
                newpgdir = new_pgdir(cpu, pgtable, &repin);
        /* Change the current pgd index to the new one. */
        cpu->cpu_pgd = newpgdir;
        /* If it was completely blank, we map in the Guest kernel stack */
        if (repin)
                pin_stack_pages(cpu);
}

/*H:470
 * Finally, a routine which throws away everything: all PGD entries in all
 * the shadow page tables, including the Guest's kernel mappings.  This is used
 * when we destroy the Guest.
 */
static void release_all_pagetables(struct lguest *lg)
{
        unsigned int i, j;

        /* Every shadow pagetable this Guest has */
        for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
                if (lg->pgdirs[i].pgdir) {
#ifdef CONFIG_X86_PAE
                        pgd_t *spgd;
                        pmd_t *pmdpage;
                        unsigned int k;

                        /* Get the last pmd page. */
                        spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
                        pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);

                        /*
                         * And release the pmd entries of that pmd page,
                         * except for the switcher pmd.
                         */
                        for (k = 0; k < SWITCHER_PMD_INDEX; k++)
                                release_pmd(&pmdpage[k]);
#endif
                        /* Every PGD entry except the Switcher at the top */
                        for (j = 0; j < SWITCHER_PGD_INDEX; j++)
                                release_pgd(lg->pgdirs[i].pgdir + j);
                }
}

/*
 * We also throw away everything when a Guest tells us it's changed a kernel
 * mapping.  Since kernel mappings are in every page table, it's easiest to
 * throw them all away.  This traps the Guest in amber for a while as
 * everything faults back in, but it's rare.
 */
void guest_pagetable_clear_all(struct lg_cpu *cpu)
{
        release_all_pagetables(cpu->lg);
        /* We need the Guest kernel stack mapped again. */
        pin_stack_pages(cpu);
}
/*:*/

/*M:009
 * Since we throw away all mappings when a kernel mapping changes, our
 * performance sucks for guests using highmem.  In fact, a guest with
 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
 * usually slower than a Guest with less memory.
 *
 * This, of course, cannot be fixed.  It would take some kind of... well, I
 * don't know, but the term "puissant code-fu" comes to mind.
:*/

/*H:420
 * This is the routine which actually sets the page table entry for the
 * "idx"'th shadow page table.
 *
 * Normally, we can just throw out the old entry and replace it with 0: if they
 * use it demand_page() will put the new entry in.  We need to do this anyway:
 * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
 * is read from, and _PAGE_DIRTY when it's written to.
 *
 * But Avi Kivity pointed out that most Operating Systems (Linux included) set
 * these bits on PTEs immediately anyway.  This is done to save the CPU from
 * having to update them, but it helps us the same way: if they set
 * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
 */
static void do_set_pte(struct lg_cpu *cpu, int idx,
                       unsigned long vaddr, pte_t gpte)
{
        /* Look up the matching shadow page directory entry. */
        pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
#ifdef CONFIG_X86_PAE
        pmd_t *spmd;
#endif

        /* If the top level isn't present, there's no entry to update. */
        if (pgd_flags(*spgd) & _PAGE_PRESENT) {
#ifdef CONFIG_X86_PAE
                spmd = spmd_addr(cpu, *spgd, vaddr);
                if (pmd_flags(*spmd) & _PAGE_PRESENT) {
#endif
                        /* Otherwise, start by releasing the existing entry. */
                        pte_t *spte = spte_addr(cpu, *spgd, vaddr);
                        release_pte(*spte);

                        /*
                         * If they're setting this entry as dirty or accessed,
                         * we might as well put that entry they've given us in
                         * now.  This shaves 10% off a copy-on-write
                         * micro-benchmark.
                         */
                        if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
                                check_gpte(cpu, gpte);
                                native_set_pte(spte,
                                        gpte_to_spte(cpu, gpte,
                                                pte_flags(gpte) & _PAGE_DIRTY));
                        } else {
                                /*
                                 * Otherwise kill it and we can demand_page()
                                 * it in later.
                                 */
                                native_set_pte(spte, __pte(0));
                        }
#ifdef CONFIG_X86_PAE
                }
#endif
        }
}

/*H:410
 * Updating a PTE entry is a little trickier.
 *
 * We keep track of several different page tables (the Guest uses one for each
 * process, so it makes sense to cache at least a few).  Each of these have
 * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
 * all processes.  So when the page table above that address changes, we update
 * all the page tables, not just the current one.  This is rare.
 *
 * The benefit is that when we have to track a new page table, we can keep all
 * the kernel mappings.  This speeds up context switch immensely.
 */
void guest_set_pte(struct lg_cpu *cpu,
                   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
        /*
         * Kernel mappings must be changed on all top levels.  Slow, but doesn't
         * happen often.
         */
        if (vaddr >= cpu->lg->kernel_address) {
                unsigned int i;
                for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
                        if (cpu->lg->pgdirs[i].pgdir)
                                do_set_pte(cpu, i, vaddr, gpte);
        } else {
                /* Is this page table one we have a shadow for? */
                int pgdir = find_pgdir(cpu->lg, gpgdir);
                if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
                        /* If so, do the update. */
                        do_set_pte(cpu, pgdir, vaddr, gpte);
        }
}

/*H:400
 * (iii) Setting up a page table entry when the Guest tells us one has changed.
 *
 * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
 * with the other side of page tables while we're here: what happens when the
 * Guest asks for a page table to be updated?
 *
 * We already saw that demand_page() will fill in the shadow page tables when
 * needed, so we can simply remove shadow page table entries whenever the Guest
 * tells us they've changed.  When the Guest tries to use the new entry it will
 * fault and demand_page() will fix it up.
 *
 * So with that in mind here's our code to update a (top-level) PGD entry:
 */
void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{
        int pgdir;

        if (idx >= SWITCHER_PGD_INDEX)
                return;

        /* If they're talking about a page table we have a shadow for... */
        pgdir = find_pgdir(lg, gpgdir);
        if (pgdir < ARRAY_SIZE(lg->pgdirs))
                /* ... throw it away. */
                release_pgd(lg->pgdirs[pgdir].pgdir + idx);
}

#ifdef CONFIG_X86_PAE
void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
{
        guest_pagetable_clear_all(&lg->cpus[0]);
}
#endif

/*
 * Once we know how much memory we have we can construct simple identity (which
 * set virtual == physical) and linear mappings which will get the Guest far
 * enough into the boot to create its own.
 *
 * We lay them out of the way, just below the initrd (which is why we need to
 * know its size here).
 */
static unsigned long setup_pagetables(struct lguest *lg,
                                      unsigned long mem,
                                      unsigned long initrd_size)
{
        pgd_t __user *pgdir;
        pte_t __user *linear;
        unsigned long mem_base = (unsigned long)lg->mem_base;
        unsigned int mapped_pages, i, linear_pages;
#ifdef CONFIG_X86_PAE
        pmd_t __user *pmds;
        unsigned int j;
        pgd_t pgd;
        pmd_t pmd;
#else
        unsigned int phys_linear;
#endif

        /*
         * We have mapped_pages frames to map, so we need linear_pages page
         * tables to map them.
         */
        mapped_pages = mem / PAGE_SIZE;
        linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;

        /* We put the toplevel page directory page at the top of memory. */
        pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE);

        /* Now we use the next linear_pages pages as pte pages */
        linear = (void *)pgdir - linear_pages * PAGE_SIZE;

#ifdef CONFIG_X86_PAE
        pmds = (void *)linear - PAGE_SIZE;
#endif
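
        /*
         * (A sketch of the resulting layout at the top of Guest memory, not
         * to scale; the pmds page only exists for PAE:
         *
         *      mem ->  +--------------------+
         *              |       initrd       |
         *              +--------------------+
         *              |   toplevel pgdir   |  (one page)
         *              +--------------------+  <- pgdir
         *              |  linear PTE pages  |
         *              +--------------------+  <- linear
         *              |    [pmds page]     |
         *              +--------------------+
         * )
         */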

        /*
         * Linear mapping is easy: put every page's address into the
         * mapping in order.
         */
        for (i = 0; i < mapped_pages; i++) {
                pte_t pte;
                pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
                if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0)
                        return -EFAULT;
        }

        /*
         * The top level points to the linear page table pages above.
         * We setup the identity and linear mappings here.
         */
#ifdef CONFIG_X86_PAE
        for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
             i += PTRS_PER_PTE, j++) {
                native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
                        - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));

                if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
                        return -EFAULT;
        }

        set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT));
        if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
                return -EFAULT;
        if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0)
                return -EFAULT;
#else
        phys_linear = (unsigned long)linear - mem_base;
        for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
                pgd_t pgd;
                pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
                            (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));

                if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
                    || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
                                           + i / PTRS_PER_PTE],
                                    &pgd, sizeof(pgd)))
                        return -EFAULT;
        }
#endif

        /*
         * We return the top level (guest-physical) address: remember where
         * this is.
         */
        return (unsigned long)pgdir - mem_base;
}

/*H:500
 * (vii) Setting up the page tables initially.
 *
 * When a Guest is first created, the Launcher tells us where the toplevel of
 * its first page table is.  We set some things up here:
 */
int init_guest_pagetable(struct lguest *lg)
{
        u64 mem;
        u32 initrd_size;
        struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
#ifdef CONFIG_X86_PAE
        pgd_t *pgd;
        pmd_t *pmd_table;
#endif
        /*
         * Get the Guest memory size and the ramdisk size from the boot header
         * located at lg->mem_base (Guest address 0).
         */
        if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
            || get_user(initrd_size, &boot->hdr.ramdisk_size))
                return -EFAULT;

        /*
         * We start on the first shadow page table, and give it a blank PGD
         * page.
         */
        lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
        if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
                return lg->pgdirs[0].gpgdir;
        lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
        if (!lg->pgdirs[0].pgdir)
                return -ENOMEM;

#ifdef CONFIG_X86_PAE
        pgd = lg->pgdirs[0].pgdir;
        pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
        if (!pmd_table)
                return -ENOMEM;

        set_pgd(pgd + SWITCHER_PGD_INDEX,
                __pgd(__pa(pmd_table) | _PAGE_PRESENT));
#endif
        lg->cpus[0].cpu_pgd = 0;
        return 0;
}

/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
void page_table_guest_data_init(struct lg_cpu *cpu)
{
        /* We get the kernel address: above this is all kernel memory. */
        if (get_user(cpu->lg->kernel_address,
                     &cpu->lg->lguest_data->kernel_address)
                /*
                 * We tell the Guest that it can't use the top 2 or 4 MB
                 * of virtual addresses used by the Switcher.
                 */
                || put_user(RESERVE_MEM * 1024 * 1024,
                            &cpu->lg->lguest_data->reserve_mem)
                || put_user(cpu->lg->pgdirs[0].gpgdir,
                            &cpu->lg->lguest_data->pgdir))
                kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);

        /*
         * In flush_user_mappings() we loop from 0 to
         * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
         * Switcher mappings, so check that now.
         */
#ifdef CONFIG_X86_PAE
        if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
            pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
#else
        if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
#endif
                kill_guest(cpu, "bad kernel address %#lx",
                           cpu->lg->kernel_address);
}

/* When a Guest dies, our cleanup is fairly simple. */
void free_guest_pagetable(struct lguest *lg)
{
        unsigned int i;

        /* Throw away all page table pages. */
        release_all_pagetables(lg);
        /* Now free the top levels: free_page() can handle 0 just fine. */
        for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
                free_page((long)lg->pgdirs[i].pgdir);
}

/*H:480
 * (vi) Mapping the Switcher when the Guest is about to run.
 *
 * The Switcher and the two pages for this CPU need to be visible in the
 * Guest (and not the pages for other CPUs).  We have the appropriate PTE pages
 * for each CPU already set up, we just need to hook them in now we know which
 * Guest is about to run on this CPU.
 */
void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
{
        pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
        pte_t regs_pte;
        unsigned long pfn;

#ifdef CONFIG_X86_PAE
        pmd_t switcher_pmd;
        pmd_t *pmd_table;

        native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >>
                       PAGE_SHIFT, PAGE_KERNEL_EXEC));

        pmd_table = __va(pgd_pfn(
                cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
                                                                << PAGE_SHIFT);
        native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
#else
        pgd_t switcher_pgd;

        /*
         * Make the last PGD entry for this Guest point to the Switcher's PTE
         * page for this CPU (with appropriate flags).
         */
        switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);

        cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;

#endif
        /*
         * We also change the Switcher PTE page.  When we're running the Guest,
         * we want the Guest's "regs" page to appear where the first Switcher
         * page for this CPU is.  This is an optimization: when the Switcher
         * saves the Guest registers, it saves them into the first page of this
         * CPU's "struct lguest_pages": if we make sure the Guest's register
         * page is already mapped there, we don't have to copy them out
         * again.
         */
        pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
        native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL));
        native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)],
                       regs_pte);
}
/*:*/

static void free_switcher_pte_pages(void)
{
        unsigned int i;

        for_each_possible_cpu(i)
                free_page((long)switcher_pte_page(i));
}

/*H:520
 * Setting up the Switcher PTE page for given CPU is fairly easy, given
 * the CPU number and the "struct page"s for the Switcher code itself.
 *
 * Currently the Switcher is less than a page long, so "pages" is always 1.
 */
static __init void populate_switcher_pte_page(unsigned int cpu,
                                              struct page *switcher_page[],
                                              unsigned int pages)
{
        unsigned int i;
        pte_t *pte = switcher_pte_page(cpu);

        /* The first entries are easy: they map the Switcher code. */
        for (i = 0; i < pages; i++) {
                native_set_pte(&pte[i], mk_pte(switcher_page[i],
                               __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
        }

        /* The only other thing we map is this CPU's pair of pages. */
        i = pages + cpu*2;

        /* First page (Guest registers) is writable from the Guest */
        native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
                       __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));

        /*
         * The second page contains the "struct lguest_ro_state", and is
         * read-only.
         */
        native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
                       __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
}

/*
 * We've made it through the page table code.  Perhaps our tired brains are
 * still processing the details, or perhaps we're simply glad it's over.
 *
 * If nothing else, note that all this complexity in juggling shadow page tables
 * in sync with the Guest's page tables is for one reason: for most Guests this
 * page table dance determines how bad performance will be.  This is why Xen
 * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD
 * have implemented shadow page table support directly into hardware.
 *
 * There is just one file remaining in the Host.
 */

/*H:510
 * At boot or module load time, init_pagetables() allocates and populates
 * the Switcher PTE page for each CPU.
 */
__init int init_pagetables(struct page **switcher_page, unsigned int pages)
{
        unsigned int i;

        for_each_possible_cpu(i) {
                switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
                if (!switcher_pte_page(i)) {
                        free_switcher_pte_pages();
                        return -ENOMEM;
                }
                populate_switcher_pte_page(i, switcher_page, pages);
        }
        return 0;
}
/*:*/

/* Cleaning up simply involves freeing the PTE page for each CPU. */
void free_pagetables(void)
{
        free_switcher_pte_pages();
}