2009-02-09 23:05:49 +03:00
/******************************************************************************
 * privcmd.c
 *
 * Interface to privileged domain-0 commands.
 *
 * Copyright (c) 2002-2004, K A Fraser, B Dragovic
 */
# include <linux/kernel.h>
2011-12-16 20:34:33 +04:00
# include <linux/module.h>
2009-02-09 23:05:49 +03:00
# include <linux/sched.h>
# include <linux/slab.h>
# include <linux/string.h>
# include <linux/errno.h>
# include <linux/mm.h>
# include <linux/mman.h>
# include <linux/uaccess.h>
# include <linux/swap.h>
# include <linux/highmem.h>
# include <linux/pagemap.h>
# include <linux/seq_file.h>
2011-12-16 20:34:33 +04:00
# include <linux/miscdevice.h>
2009-02-09 23:05:49 +03:00
# include <asm/pgalloc.h>
# include <asm/pgtable.h>
# include <asm/tlb.h>
# include <asm/xen/hypervisor.h>
# include <asm/xen/hypercall.h>
# include <xen/xen.h>
# include <xen/privcmd.h>
# include <xen/interface/xen.h>
# include <xen/features.h>
# include <xen/page.h>
2009-05-21 13:09:46 +04:00
# include <xen/xen-ops.h>
2012-10-18 04:11:21 +04:00
# include <xen/balloon.h>
2009-05-20 18:42:14 +04:00
2011-12-16 20:34:33 +04:00
# include "privcmd.h"
/*
 * Module license tag.
 * NOTE(review): the literal includes surrounding spaces as written here -
 * confirm against the intended license string.
 */
MODULE_LICENSE ( " GPL " ) ;
2012-10-18 04:11:21 +04:00
/*
 * Sentinel stored in vma->vm_private_data by
 * privcmd_enforce_singleshot_mapping() to mark a VMA as claimed;
 * alloc_empty_pages() expects to find it before installing a page array.
 */
# define PRIV_VMA_LOCKED ((void *)1)
2009-02-09 23:05:49 +03:00
/*
 * Forward declaration: the mmap ioctl handlers call this helper before
 * its definition further down in this file.
 */
# ifndef HAVE_ARCH_PRIVCMD_MMAP
static int privcmd_enforce_singleshot_mapping ( struct vm_area_struct * vma ) ;
# endif
/*
 * Handle IOCTL_PRIVCMD_HYPERCALL.
 *
 * Copies a struct privcmd_hypercall from user space and forwards its op
 * code and five arguments to privcmd_call().
 *
 * Returns -EFAULT when the request cannot be read from @udata, otherwise
 * the value produced by privcmd_call().
 */
static long privcmd_ioctl_hypercall(void __user *udata)
{
	struct privcmd_hypercall req;
	long result;

	if (copy_from_user(&req, udata, sizeof(req)) != 0)
		return -EFAULT;

	result = privcmd_call(req.op,
			      req.arg[0], req.arg[1], req.arg[2],
			      req.arg[3], req.arg[4]);

	return result;
}
static void free_page_list ( struct list_head * pages )
{
struct page * p , * n ;
list_for_each_entry_safe ( p , n , pages , lru )
__free_page ( p ) ;
INIT_LIST_HEAD ( pages ) ;
}
/*
* Given an array of items in userspace , return a list of pages
* containing the data . If copying fails , either because of memory
* allocation failure or a problem reading user memory , return an
* error code ; its up to the caller to dispose of any partial list .
*/
static int gather_array ( struct list_head * pagelist ,
unsigned nelem , size_t size ,
2012-08-31 17:59:30 +04:00
const void __user * data )
2009-02-09 23:05:49 +03:00
{
unsigned pageidx ;
void * pagedata ;
int ret ;
if ( size > PAGE_SIZE )
return 0 ;
pageidx = PAGE_SIZE ;
pagedata = NULL ; /* quiet, gcc */
while ( nelem - - ) {
if ( pageidx > PAGE_SIZE - size ) {
struct page * page = alloc_page ( GFP_KERNEL ) ;
ret = - ENOMEM ;
if ( page = = NULL )
goto fail ;
pagedata = page_address ( page ) ;
list_add_tail ( & page - > lru , pagelist ) ;
pageidx = 0 ;
}
ret = - EFAULT ;
if ( copy_from_user ( pagedata + pageidx , data , size ) )
goto fail ;
data + = size ;
pageidx + = size ;
}
ret = 0 ;
fail :
return ret ;
}
/*
 * Invoke @fn on each of the @nelem elements (each @size bytes) of an
 * array that is spread over a list of pages.
 *
 * @pos is the list head; it is advanced with pos->next before the first
 * element is read and again whenever the current page has no room for
 * another element.  @state is passed through to @fn unchanged.
 *
 * Stops at the first non-zero return from @fn and returns that value;
 * returns 0 if every call returned 0 (or @nelem is 0).
 */
static int traverse_pages(unsigned nelem, size_t size,
			  struct list_head *pos,
			  int (*fn)(void *data, void *state),
			  void *state)
{
	void *kaddr = NULL;
	/* Starting "full" makes the first element step onto the first page. */
	unsigned offset = PAGE_SIZE;
	int rc = 0;

	BUG_ON(size > PAGE_SIZE);

	for (; nelem != 0; nelem--) {
		if (offset > PAGE_SIZE - size) {
			pos = pos->next;
			kaddr = page_address(list_entry(pos, struct page, lru));
			offset = 0;
		}

		rc = fn(kaddr + offset, state);
		if (rc != 0)
			return rc;

		offset += size;
	}

	return rc;
}
/* Cursor threaded through mmap_mfn_range() calls by privcmd_ioctl_mmap(). */
struct mmap_mfn_state {
	unsigned long va;		/* address the next chunk must start at */
	struct vm_area_struct *vma;	/* VMA being populated */
	domid_t domain;			/* domain id handed to the remap call */
};
static int mmap_mfn_range ( void * data , void * state )
{
struct privcmd_mmap_entry * msg = data ;
struct mmap_mfn_state * st = state ;
struct vm_area_struct * vma = st - > vma ;
int rc ;
/* Do not allow range to wrap the address space. */
if ( ( msg - > npages > ( LONG_MAX > > PAGE_SHIFT ) ) | |
( ( unsigned long ) ( msg - > npages < < PAGE_SHIFT ) > = - st - > va ) )
return - EINVAL ;
/* Range chunks must be contiguous in va space. */
if ( ( msg - > va ! = st - > va ) | |
( ( msg - > va + ( msg - > npages < < PAGE_SHIFT ) ) > vma - > vm_end ) )
return - EINVAL ;
2009-05-21 13:09:46 +04:00
rc = xen_remap_domain_mfn_range ( vma ,
msg - > va & PAGE_MASK ,
msg - > mfn , msg - > npages ,
vma - > vm_page_prot ,
2012-10-18 00:37:49 +04:00
st - > domain , NULL ) ;
2009-02-09 23:05:49 +03:00
if ( rc < 0 )
return rc ;
st - > va + = msg - > npages < < PAGE_SHIFT ;
return 0 ;
}
/*
 * Handle IOCTL_PRIVCMD_MMAP.
 *
 * Reads a struct privcmd_mmap from @udata, gathers its entry array into
 * kernel pages and maps each entry via mmap_mfn_range() while holding
 * mmap_sem for writing.  The first entry's va must be the start of a VMA
 * that privcmd_enforce_singleshot_mapping() accepts.
 *
 * Returns -ENOSYS on auto-translated guests (only the batch ioctl is
 * supported there), -EFAULT if the request cannot be read, -EINVAL if no
 * suitable VMA is found, an error from gather_array() or from the
 * per-entry mapping, and 0 otherwise (including when no entries were
 * gathered).
 */
static long privcmd_ioctl_mmap(void __user *udata)
{
	struct mm_struct *mm = current->mm;
	struct privcmd_mmap_entry *first;
	struct privcmd_mmap mmapcmd;
	struct mmap_mfn_state state;
	struct vm_area_struct *vma;
	LIST_HEAD(pagelist);
	int rc;

	/* We only support privcmd_ioctl_mmap_batch for auto translated. */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return -ENOSYS;

	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)) != 0)
		return -EFAULT;

	rc = gather_array(&pagelist,
			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
			  mmapcmd.entry);
	if (rc != 0 || list_empty(&pagelist))
		goto out;

	down_write(&mm->mmap_sem);

	/* The first gathered entry names the VMA to populate. */
	first = page_address(list_first_entry(&pagelist, struct page, lru));
	vma = find_vma(mm, first->va);

	rc = -EINVAL;
	if (vma && (first->va == vma->vm_start) &&
	    privcmd_enforce_singleshot_mapping(vma)) {
		state.va = vma->vm_start;
		state.vma = vma;
		state.domain = mmapcmd.dom;

		rc = traverse_pages(mmapcmd.num,
				    sizeof(struct privcmd_mmap_entry),
				    &pagelist,
				    mmap_mfn_range, &state);
	}

	up_write(&mm->mmap_sem);

out:
	free_page_list(&pagelist);

	return rc;
}
struct mmap_batch_state {
domid_t domain ;
unsigned long va ;
struct vm_area_struct * vma ;
2012-10-18 04:11:21 +04:00
int index ;
2012-08-31 17:59:30 +04:00
/* A tristate:
* 0 for no errors
* 1 if at least one error has happened ( and no
* - ENOENT errors have happened )
* - ENOENT if at least 1 - ENOENT has happened .
*/
int global_error ;
2013-01-15 07:35:40 +04:00
int version ;
2012-08-31 17:59:30 +04:00
/* User-space mfn array to store errors in the second pass for V1. */
xen_pfn_t __user * user_mfn ;
2013-01-15 07:35:40 +04:00
/* User-space int array to store errors in the second pass for V2. */
int __user * user_err ;
2009-02-09 23:05:49 +03:00
} ;
2012-10-18 04:11:21 +04:00
/* auto translated dom0 note: if domU being created is PV, then mfn is
* mfn ( addr on bus ) . If it ' s auto xlated , then mfn is pfn ( input to HAP ) .
*/
2009-02-09 23:05:49 +03:00
static int mmap_batch_fn ( void * data , void * state )
{
xen_pfn_t * mfnp = data ;
struct mmap_batch_state * st = state ;
2012-10-18 04:11:21 +04:00
struct vm_area_struct * vma = st - > vma ;
struct page * * pages = vma - > vm_private_data ;
struct page * cur_page = NULL ;
2012-08-31 17:59:30 +04:00
int ret ;
2012-10-18 04:11:21 +04:00
if ( xen_feature ( XENFEAT_auto_translated_physmap ) )
cur_page = pages [ st - > index + + ] ;
2012-08-31 17:59:30 +04:00
ret = xen_remap_domain_mfn_range ( st - > vma , st - > va & PAGE_MASK , * mfnp , 1 ,
2012-10-18 00:37:49 +04:00
st - > vma - > vm_page_prot , st - > domain ,
2012-10-18 04:11:21 +04:00
& cur_page ) ;
2009-02-09 23:05:49 +03:00
2012-08-31 17:59:30 +04:00
/* Store error code for second pass. */
2013-01-15 07:35:40 +04:00
if ( st - > version = = 1 ) {
if ( ret < 0 ) {
/*
* V1 encodes the error codes in the 32 bit top nibble of the
* mfn ( with its known limitations vis - a - vis 64 bit callers ) .
*/
* mfnp | = ( ret = = - ENOENT ) ?
PRIVCMD_MMAPBATCH_PAGED_ERROR :
PRIVCMD_MMAPBATCH_MFN_ERROR ;
}
} else { /* st->version == 2 */
* ( ( int * ) mfnp ) = ret ;
}
2012-08-31 17:59:30 +04:00
/* And see if it affects the global_error. */
if ( ret < 0 ) {
if ( ret = = - ENOENT )
st - > global_error = - ENOENT ;
else {
/* Record that at least one error has happened. */
if ( st - > global_error = = 0 )
st - > global_error = 1 ;
}
2009-02-09 23:05:49 +03:00
}
st - > va + = PAGE_SIZE ;
return 0 ;
}
2013-01-15 07:35:40 +04:00
static int mmap_return_errors ( void * data , void * state )
2009-02-09 23:05:49 +03:00
{
struct mmap_batch_state * st = state ;
2012-08-31 17:59:30 +04:00
2013-01-15 07:35:40 +04:00
if ( st - > version = = 1 ) {
xen_pfn_t mfnp = * ( ( xen_pfn_t * ) data ) ;
if ( mfnp & PRIVCMD_MMAPBATCH_MFN_ERROR )
return __put_user ( mfnp , st - > user_mfn + + ) ;
else
st - > user_mfn + + ;
} else { /* st->version == 2 */
int err = * ( ( int * ) data ) ;
if ( err )
return __put_user ( err , st - > user_err + + ) ;
else
st - > user_err + + ;
}
return 0 ;
2009-02-09 23:05:49 +03:00
}
2012-10-18 04:11:21 +04:00
/* Allocate pfns that are then mapped with gmfns from foreign domid. Update
* the vma with the page info to use later .
* Returns : 0 if success , otherwise - errno
*/
static int alloc_empty_pages ( struct vm_area_struct * vma , int numpgs )
{
int rc ;
struct page * * pages ;
pages = kcalloc ( numpgs , sizeof ( pages [ 0 ] ) , GFP_KERNEL ) ;
if ( pages = = NULL )
return - ENOMEM ;
rc = alloc_xenballooned_pages ( numpgs , pages , 0 ) ;
if ( rc ! = 0 ) {
pr_warn ( " %s Could not alloc %d pfns rc:%d \n " , __func__ ,
numpgs , rc ) ;
kfree ( pages ) ;
return - ENOMEM ;
}
BUG_ON ( vma - > vm_private_data ! = PRIV_VMA_LOCKED ) ;
vma - > vm_private_data = pages ;
return 0 ;
}
2009-03-08 14:10:00 +03:00
/*
 * Forward declaration: privcmd_ioctl_mmap_batch() compares vma->vm_ops
 * against this table before its definition further down in this file.
 */
static struct vm_operations_struct privcmd_vm_ops ;
2012-08-31 17:59:30 +04:00
/*
 * Handle IOCTL_PRIVCMD_MMAPBATCH (@version 1) and
 * IOCTL_PRIVCMD_MMAPBATCH_V2 (@version 2).
 *
 * Reads the request from @udata, gathers the frame array into kernel
 * pages and maps each frame into the privcmd VMA that exactly covers
 * [m.addr, m.addr + m.num pages).  A first pass maps the frames under
 * mmap_sem; if any frame failed, a second pass copies the per-frame
 * errors back to user space (into m.arr for V1, m.err for V2).
 *
 * Returns 0 on success; -EFAULT if user memory is inaccessible; -EINVAL
 * for an unknown version, a bad count or an unsuitable VMA; an error from
 * gather_array() or alloc_empty_pages(); an error from the copy-back
 * pass; or -ENOENT if the copy-back succeeded and at least one frame
 * failed with -ENOENT.
 */
static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
{
	struct mm_struct *mm = current->mm;
	struct privcmd_mmapbatch_v2 m;
	struct mmap_batch_state state;
	struct vm_area_struct *vma;
	unsigned long nr_pages;
	LIST_HEAD(pagelist);
	int ret;

	if (version == 1) {
		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
			return -EFAULT;
		/* Returns per-frame error in m.arr. */
		m.err = NULL;
		if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr)))
			return -EFAULT;
	} else if (version == 2) {
		if (copy_from_user(&m, udata,
				   sizeof(struct privcmd_mmapbatch_v2)))
			return -EFAULT;
		/* Returns per-frame error code in m.err. */
		if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err))))
			return -EFAULT;
	} else {
		return -EINVAL;
	}

	nr_pages = m.num;
	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
		return -EINVAL;

	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr);
	if (ret)
		goto out;

	if (list_empty(&pagelist)) {
		ret = -EINVAL;
		goto out;
	}

	/* V2: zero the error array now so only actual errors are copied back. */
	if (version == 2 && clear_user(m.err, sizeof(int) * m.num)) {
		ret = -EFAULT;
		goto out;
	}

	down_write(&mm->mmap_sem);

	/* The request must cover exactly one not-yet-used privcmd VMA. */
	vma = find_vma(mm, m.addr);
	if (!vma ||
	    vma->vm_ops != &privcmd_vm_ops ||
	    (m.addr != vma->vm_start) ||
	    ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
	    !privcmd_enforce_singleshot_mapping(vma)) {
		up_write(&mm->mmap_sem);
		ret = -EINVAL;
		goto out;
	}

	if (xen_feature(XENFEAT_auto_translated_physmap)) {
		ret = alloc_empty_pages(vma, m.num);
		if (ret < 0) {
			up_write(&mm->mmap_sem);
			goto out;
		}
	}

	state.domain = m.dom;
	state.vma = vma;
	state.va = m.addr;
	state.index = 0;
	state.global_error = 0;
	state.version = version;

	/* First pass: map every frame.  mmap_batch_fn guarantees ret == 0. */
	ret = traverse_pages(m.num, sizeof(xen_pfn_t),
			     &pagelist, mmap_batch_fn, &state);
	BUG_ON(ret);

	up_write(&mm->mmap_sem);

	ret = 0;
	if (state.global_error) {
		/* Second pass: write back the per-frame errors. */
		state.user_mfn = (xen_pfn_t *)m.arr;
		state.user_err = m.err;
		ret = traverse_pages(m.num, sizeof(xen_pfn_t),
				     &pagelist, mmap_return_errors, &state);
	}

	/*
	 * If we have not had any EFAULT-like global errors then set the
	 * global error to -ENOENT if necessary.
	 */
	if ((ret == 0) && (state.global_error == -ENOENT))
		ret = -ENOENT;

out:
	free_page_list(&pagelist);

	return ret;
}
/*
 * unlocked_ioctl entry point: dispatch @cmd to the matching handler with
 * @data reinterpreted as a user-space pointer.
 *
 * Returns the handler's result, or -EINVAL for an unrecognised @cmd.
 * The handler's result passes through an int before being returned.
 */
static long privcmd_ioctl(struct file *file,
			  unsigned int cmd, unsigned long data)
{
	void __user *udata = (void __user *)data;
	int ret;

	if (cmd == IOCTL_PRIVCMD_HYPERCALL)
		ret = privcmd_ioctl_hypercall(udata);
	else if (cmd == IOCTL_PRIVCMD_MMAP)
		ret = privcmd_ioctl_mmap(udata);
	else if (cmd == IOCTL_PRIVCMD_MMAPBATCH)
		ret = privcmd_ioctl_mmap_batch(udata, 1);
	else if (cmd == IOCTL_PRIVCMD_MMAPBATCH_V2)
		ret = privcmd_ioctl_mmap_batch(udata, 2);
	else
		ret = -EINVAL;

	return ret;
}
2012-10-18 04:11:21 +04:00
static void privcmd_close ( struct vm_area_struct * vma )
{
struct page * * pages = vma - > vm_private_data ;
int numpgs = ( vma - > vm_end - vma - > vm_start ) > > PAGE_SHIFT ;
if ( ! xen_feature ( XENFEAT_auto_translated_physmap | | ! numpgs | | ! pages ) )
return ;
xen_unmap_domain_mfn_range ( vma , numpgs , pages ) ;
free_xenballooned_pages ( numpgs , pages ) ;
kfree ( pages ) ;
}
2009-02-09 23:05:49 +03:00
static int privcmd_fault ( struct vm_area_struct * vma , struct vm_fault * vmf )
{
2009-03-06 20:56:59 +03:00
printk ( KERN_DEBUG " privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p \n " ,
vma , vma - > vm_start , vma - > vm_end ,
vmf - > pgoff , vmf - > virtual_address ) ;
2009-02-09 23:05:49 +03:00
return VM_FAULT_SIGBUS ;
}
static struct vm_operations_struct privcmd_vm_ops = {
2012-10-18 04:11:21 +04:00
. close = privcmd_close ,
2009-02-09 23:05:49 +03:00
. fault = privcmd_fault
} ;
static int privcmd_mmap ( struct file * file , struct vm_area_struct * vma )
{
2010-11-11 23:37:43 +03:00
/* DONTCOPY is essential for Xen because copy_page_range doesn't know
* how to recreate these mappings */
mm: kill vma flag VM_RESERVED and mm->reserved_vm counter
A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA,
currently it lost original meaning but still has some effects:
| effect | alternative flags
-+------------------------+---------------------------------------------
1| account as reserved_vm | VM_IO
2| skip in core dump | VM_IO, VM_DONTDUMP
3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP
4| do not mlock | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP
This patch removes reserved_vm counter from mm_struct. Seems like nobody
cares about it, it does not exported into userspace directly, it only
reduces total_vm showed in proc.
Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP.
remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP.
remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP.
[akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup]
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Venkatesh Pallipadi <venki@google.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-10-09 03:29:02 +04:00
vma - > vm_flags | = VM_IO | VM_PFNMAP | VM_DONTCOPY |
VM_DONTEXPAND | VM_DONTDUMP ;
2009-02-09 23:05:49 +03:00
vma - > vm_ops = & privcmd_vm_ops ;
vma - > vm_private_data = NULL ;
return 0 ;
}
static int privcmd_enforce_singleshot_mapping ( struct vm_area_struct * vma )
{
2012-10-18 04:11:21 +04:00
return ! cmpxchg ( & vma - > vm_private_data , NULL , PRIV_VMA_LOCKED ) ;
2009-02-09 23:05:49 +03:00
}
2011-12-16 20:34:33 +04:00
const struct file_operations xen_privcmd_fops = {
. owner = THIS_MODULE ,
2009-02-09 23:05:49 +03:00
. unlocked_ioctl = privcmd_ioctl ,
. mmap = privcmd_mmap ,
} ;
2011-12-16 20:34:33 +04:00
EXPORT_SYMBOL_GPL ( xen_privcmd_fops ) ;
/*
 * Misc device through which xen_privcmd_fops is exposed, using a
 * dynamically assigned minor number.
 * NOTE(review): the name literal is reproduced exactly as written in this
 * file, including its surrounding spaces - confirm the intended node name.
 */
static struct miscdevice privcmd_dev = {
	.name	= " xen/privcmd ",
	.minor	= MISC_DYNAMIC_MINOR,
	.fops	= &xen_privcmd_fops,
};
/*
 * Module init: register the privcmd misc device.
 *
 * Returns -ENODEV when not running as a Xen domain, the misc_register()
 * error (after logging it) if registration fails, and 0 on success.
 */
static int __init privcmd_init(void)
{
	int err;

	if (!xen_domain())
		return -ENODEV;

	err = misc_register(&privcmd_dev);
	if (err)
		printk(KERN_ERR " Could not register Xen privcmd device \n ");

	return err;
}
/* Module exit: unregister the privcmd misc device. */
static void __exit privcmd_exit(void)
{
	misc_deregister(&privcmd_dev);
}
/* Register privcmd_init() and privcmd_exit() as the module's init and exit hooks. */
module_init ( privcmd_init ) ;
module_exit ( privcmd_exit ) ;