2012-03-16 01:58:34 +04:00
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
2016-02-15 04:55:09 +03:00
 * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
2012-03-16 01:58:34 +04:00
*/
# include <linux/types.h>
# include <linux/string.h>
# include <linux/kvm.h>
# include <linux/kvm_host.h>
# include <linux/highmem.h>
# include <linux/gfp.h>
# include <linux/slab.h>
2017-02-08 20:51:30 +03:00
# include <linux/sched/signal.h>
2012-03-16 01:58:34 +04:00
# include <linux/hugetlb.h>
# include <linux/list.h>
# include <linux/anon_inodes.h>
2017-03-22 07:21:56 +03:00
# include <linux/iommu.h>
# include <linux/file.h>
2012-03-16 01:58:34 +04:00
# include <asm/kvm_ppc.h>
# include <asm/kvm_book3s.h>
2016-03-01 10:29:20 +03:00
# include <asm/book3s/64/mmu-hash.h>
2012-03-16 01:58:34 +04:00
# include <asm/hvcall.h>
# include <asm/synch.h>
# include <asm/ppc-opcode.h>
# include <asm/kvm_host.h>
# include <asm/udbg.h>
2016-02-15 04:55:07 +03:00
# include <asm/iommu.h>
2016-02-15 04:55:09 +03:00
# include <asm/tce.h>
2017-03-22 07:21:56 +03:00
# include <asm/mmu_context.h>
2012-03-16 01:58:34 +04:00
2016-03-01 09:54:38 +03:00
static unsigned long kvmppc_tce_pages ( unsigned long iommu_pages )
2012-03-16 01:58:34 +04:00
{
2016-03-01 09:54:38 +03:00
return ALIGN ( iommu_pages * sizeof ( u64 ) , PAGE_SIZE ) / PAGE_SIZE ;
2012-03-16 01:58:34 +04:00
}
2016-02-15 04:55:06 +03:00
/*
 * Total number of pages accounted for a TCE table that has @tce_pages data
 * pages: the data pages themselves plus the pages taken by the descriptor
 * (struct kvmppc_spapr_tce_table followed by one struct page pointer per
 * data page), with the descriptor size rounded up to whole pages.
 */
static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
{
	unsigned long ptr_bytes = tce_pages * sizeof(struct page *);
	unsigned long desc_bytes = sizeof(struct kvmppc_spapr_tce_table) +
			ptr_bytes;
	unsigned long desc_pages = ALIGN(desc_bytes, PAGE_SIZE) / PAGE_SIZE;

	return tce_pages + desc_pages;
}
/*
 * Charge (@inc == true) or uncharge (@inc == false) @stt_pages pages
 * against current->mm->locked_vm, holding mmap_sem for writing.
 *
 * Charging fails with -ENOMEM when the new total would exceed
 * RLIMIT_MEMLOCK and the caller lacks CAP_IPC_LOCK; locked_vm is left
 * untouched in that case.  Uncharging more than is currently accounted
 * triggers a one-time warning and is clamped so locked_vm cannot wrap.
 *
 * Returns 0 on success (including when there is no mm to account
 * against), -ENOMEM when the limit would be exceeded.
 */
static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
{
	long ret = 0;

	if (!current || !current->mm)
		return ret; /* process exited */

	down_write(&current->mm->mmap_sem);

	if (inc) {
		unsigned long locked, lock_limit;

		locked = current->mm->locked_vm + stt_pages;
		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
			ret = -ENOMEM;
		else
			current->mm->locked_vm += stt_pages;
	} else {
		if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
			stt_pages = current->mm->locked_vm;

		current->mm->locked_vm -= stt_pages;
	}

	/*
	 * '+' / '-' are single-character constants matching the %c
	 * conversion; the strings carry no padding around their contents.
	 */
	pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
			inc ? '+' : '-',
			stt_pages << PAGE_SHIFT,
			current->mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

	up_write(&current->mm->mmap_sem);

	return ret;
}
2017-03-22 07:21:56 +03:00
static void kvm_spapr_tce_iommu_table_free ( struct rcu_head * head )
{
struct kvmppc_spapr_tce_iommu_table * stit = container_of ( head ,
struct kvmppc_spapr_tce_iommu_table , rcu ) ;
iommu_tce_table_put ( stit - > tbl ) ;
kfree ( stit ) ;
}
static void kvm_spapr_tce_liobn_put ( struct kref * kref )
{
struct kvmppc_spapr_tce_iommu_table * stit = container_of ( kref ,
struct kvmppc_spapr_tce_iommu_table , kref ) ;
list_del_rcu ( & stit - > next ) ;
call_rcu ( & stit - > rcu , kvm_spapr_tce_iommu_table_free ) ;
}
extern void kvm_spapr_tce_release_iommu_group ( struct kvm * kvm ,
struct iommu_group * grp )
{
int i ;
struct kvmppc_spapr_tce_table * stt ;
struct kvmppc_spapr_tce_iommu_table * stit , * tmp ;
struct iommu_table_group * table_group = NULL ;
list_for_each_entry_rcu ( stt , & kvm - > arch . spapr_tce_tables , list ) {
table_group = iommu_group_get_iommudata ( grp ) ;
if ( WARN_ON ( ! table_group ) )
continue ;
list_for_each_entry_safe ( stit , tmp , & stt - > iommu_tables , next ) {
for ( i = 0 ; i < IOMMU_TABLE_GROUP_MAX_TABLES ; + + i ) {
if ( table_group - > tables [ i ] ! = stit - > tbl )
continue ;
kref_put ( & stit - > kref , kvm_spapr_tce_liobn_put ) ;
return ;
}
}
}
}
extern long kvm_spapr_tce_attach_iommu_group ( struct kvm * kvm , int tablefd ,
struct iommu_group * grp )
{
struct kvmppc_spapr_tce_table * stt = NULL ;
bool found = false ;
struct iommu_table * tbl = NULL ;
struct iommu_table_group * table_group ;
long i ;
struct kvmppc_spapr_tce_iommu_table * stit ;
struct fd f ;
f = fdget ( tablefd ) ;
if ( ! f . file )
return - EBADF ;
list_for_each_entry_rcu ( stt , & kvm - > arch . spapr_tce_tables , list ) {
if ( stt = = f . file - > private_data ) {
found = true ;
break ;
}
}
fdput ( f ) ;
if ( ! found )
return - EINVAL ;
table_group = iommu_group_get_iommudata ( grp ) ;
if ( WARN_ON ( ! table_group ) )
return - EFAULT ;
for ( i = 0 ; i < IOMMU_TABLE_GROUP_MAX_TABLES ; + + i ) {
struct iommu_table * tbltmp = table_group - > tables [ i ] ;
if ( ! tbltmp )
continue ;
2018-05-14 13:00:28 +03:00
/* Make sure hardware table parameters are compatible */
if ( ( tbltmp - > it_page_shift < = stt - > page_shift ) & &
( tbltmp - > it_offset < < tbltmp - > it_page_shift = =
stt - > offset < < stt - > page_shift ) & &
2018-06-20 11:42:58 +03:00
( tbltmp - > it_size < < tbltmp - > it_page_shift > =
2018-05-14 13:00:28 +03:00
stt - > size < < stt - > page_shift ) ) {
2017-03-22 07:21:56 +03:00
/*
* Reference the table to avoid races with
* add / remove DMA windows .
*/
tbl = iommu_tce_table_get ( tbltmp ) ;
break ;
}
}
if ( ! tbl )
return - EINVAL ;
list_for_each_entry_rcu ( stit , & stt - > iommu_tables , next ) {
if ( tbl ! = stit - > tbl )
continue ;
if ( ! kref_get_unless_zero ( & stit - > kref ) ) {
/* stit is being destroyed */
iommu_tce_table_put ( tbl ) ;
return - ENOTTY ;
}
/*
* The table is already known to this KVM , we just increased
* its KVM reference counter and can return .
*/
return 0 ;
}
stit = kzalloc ( sizeof ( * stit ) , GFP_KERNEL ) ;
if ( ! stit ) {
iommu_tce_table_put ( tbl ) ;
return - ENOMEM ;
}
stit - > tbl = tbl ;
kref_init ( & stit - > kref ) ;
list_add_rcu ( & stit - > next , & stt - > iommu_tables ) ;
return 0 ;
}
2016-02-15 04:55:05 +03:00
static void release_spapr_tce_table ( struct rcu_head * head )
2012-03-16 01:58:34 +04:00
{
2016-02-15 04:55:05 +03:00
struct kvmppc_spapr_tce_table * stt = container_of ( head ,
struct kvmppc_spapr_tce_table , rcu ) ;
2016-03-01 09:54:38 +03:00
unsigned long i , npages = kvmppc_tce_pages ( stt - > size ) ;
2012-03-16 01:58:34 +04:00
2016-02-15 04:55:06 +03:00
for ( i = 0 ; i < npages ; i + + )
2012-03-16 01:58:34 +04:00
__free_page ( stt - > pages [ i ] ) ;
2016-02-15 04:55:05 +03:00
kfree ( stt ) ;
2012-03-16 01:58:34 +04:00
}
2018-05-10 21:27:19 +03:00
static vm_fault_t kvm_spapr_tce_fault ( struct vm_fault * vmf )
2012-03-16 01:58:34 +04:00
{
2017-02-25 01:56:41 +03:00
struct kvmppc_spapr_tce_table * stt = vmf - > vma - > vm_file - > private_data ;
2012-03-16 01:58:34 +04:00
struct page * page ;
2016-03-01 09:54:38 +03:00
if ( vmf - > pgoff > = kvmppc_tce_pages ( stt - > size ) )
2012-03-16 01:58:34 +04:00
return VM_FAULT_SIGBUS ;
page = stt - > pages [ vmf - > pgoff ] ;
get_page ( page ) ;
vmf - > page = page ;
return 0 ;
}
/* VMA operations for a mapped TCE table: only a fault handler is provided. */
static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
. fault = kvm_spapr_tce_fault ,
} ;
static int kvm_spapr_tce_mmap ( struct file * file , struct vm_area_struct * vma )
{
vma - > vm_ops = & kvm_spapr_tce_vm_ops ;
return 0 ;
}
static int kvm_spapr_tce_release ( struct inode * inode , struct file * filp )
{
struct kvmppc_spapr_tce_table * stt = filp - > private_data ;
2017-03-22 07:21:56 +03:00
struct kvmppc_spapr_tce_iommu_table * stit , * tmp ;
KVM: PPC: Book3S HV: Protect updates to spapr_tce_tables list
Al Viro pointed out that while one thread of a process is executing
in kvm_vm_ioctl_create_spapr_tce(), another thread could guess the
file descriptor returned by anon_inode_getfd() and close() it before
the first thread has added it to the kvm->arch.spapr_tce_tables list.
That highlights a more general problem: there is no mutual exclusion
between writers to the spapr_tce_tables list, leading to the
possibility of the list becoming corrupted, which could cause a
host kernel crash.
To fix the mutual exclusion problem, we add a mutex_lock/unlock
pair around the list_del_rce in kvm_spapr_tce_release(). Also,
this moves the call to anon_inode_getfd() inside the region
protected by the kvm->lock mutex, after we have done the check for
a duplicate LIOBN. This means that if another thread does guess the
file descriptor and closes it, its call to kvm_spapr_tce_release()
will not do any harm because it will have to wait until the first
thread has released kvm->lock. With this, there are no failure
points in kvm_vm_ioctl_create_spapr_tce() after the call to
anon_inode_getfd().
The other things that the second thread could do with the guessed
file descriptor are to mmap it or to pass it as a parameter to a
KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE ioctl on a KVM device fd. An mmap
call won't cause any harm because kvm_spapr_tce_mmap() and
kvm_spapr_tce_fault() don't access the spapr_tce_tables list or
the kvmppc_spapr_tce_table.list field, and the fields that they do use
have been properly initialized by the time of the anon_inode_getfd()
call.
The KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE ioctl calls
kvm_spapr_tce_attach_iommu_group(), which scans the spapr_tce_tables
list looking for the kvmppc_spapr_tce_table struct corresponding to
the fd given as the parameter. Either it will find the new entry
or it won't; if it doesn't, it just returns an error, and if it
does, it will function normally. So, in each case there is no
harmful effect.
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2017-08-28 07:31:24 +03:00
struct kvm * kvm = stt - > kvm ;
2012-03-16 01:58:34 +04:00
KVM: PPC: Book3S HV: Protect updates to spapr_tce_tables list
Al Viro pointed out that while one thread of a process is executing
in kvm_vm_ioctl_create_spapr_tce(), another thread could guess the
file descriptor returned by anon_inode_getfd() and close() it before
the first thread has added it to the kvm->arch.spapr_tce_tables list.
That highlights a more general problem: there is no mutual exclusion
between writers to the spapr_tce_tables list, leading to the
possibility of the list becoming corrupted, which could cause a
host kernel crash.
To fix the mutual exclusion problem, we add a mutex_lock/unlock
pair around the list_del_rce in kvm_spapr_tce_release(). Also,
this moves the call to anon_inode_getfd() inside the region
protected by the kvm->lock mutex, after we have done the check for
a duplicate LIOBN. This means that if another thread does guess the
file descriptor and closes it, its call to kvm_spapr_tce_release()
will not do any harm because it will have to wait until the first
thread has released kvm->lock. With this, there are no failure
points in kvm_vm_ioctl_create_spapr_tce() after the call to
anon_inode_getfd().
The other things that the second thread could do with the guessed
file descriptor are to mmap it or to pass it as a parameter to a
KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE ioctl on a KVM device fd. An mmap
call won't cause any harm because kvm_spapr_tce_mmap() and
kvm_spapr_tce_fault() don't access the spapr_tce_tables list or
the kvmppc_spapr_tce_table.list field, and the fields that they do use
have been properly initialized by the time of the anon_inode_getfd()
call.
The KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE ioctl calls
kvm_spapr_tce_attach_iommu_group(), which scans the spapr_tce_tables
list looking for the kvmppc_spapr_tce_table struct corresponding to
the fd given as the parameter. Either it will find the new entry
or it won't; if it doesn't, it just returns an error, and if it
does, it will function normally. So, in each case there is no
harmful effect.
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2017-08-28 07:31:24 +03:00
mutex_lock ( & kvm - > lock ) ;
2016-02-15 04:55:05 +03:00
list_del_rcu ( & stt - > list ) ;
KVM: PPC: Book3S HV: Protect updates to spapr_tce_tables list
Al Viro pointed out that while one thread of a process is executing
in kvm_vm_ioctl_create_spapr_tce(), another thread could guess the
file descriptor returned by anon_inode_getfd() and close() it before
the first thread has added it to the kvm->arch.spapr_tce_tables list.
That highlights a more general problem: there is no mutual exclusion
between writers to the spapr_tce_tables list, leading to the
possibility of the list becoming corrupted, which could cause a
host kernel crash.
To fix the mutual exclusion problem, we add a mutex_lock/unlock
pair around the list_del_rce in kvm_spapr_tce_release(). Also,
this moves the call to anon_inode_getfd() inside the region
protected by the kvm->lock mutex, after we have done the check for
a duplicate LIOBN. This means that if another thread does guess the
file descriptor and closes it, its call to kvm_spapr_tce_release()
will not do any harm because it will have to wait until the first
thread has released kvm->lock. With this, there are no failure
points in kvm_vm_ioctl_create_spapr_tce() after the call to
anon_inode_getfd().
The other things that the second thread could do with the guessed
file descriptor are to mmap it or to pass it as a parameter to a
KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE ioctl on a KVM device fd. An mmap
call won't cause any harm because kvm_spapr_tce_mmap() and
kvm_spapr_tce_fault() don't access the spapr_tce_tables list or
the kvmppc_spapr_tce_table.list field, and the fields that they do use
have been properly initialized by the time of the anon_inode_getfd()
call.
The KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE ioctl calls
kvm_spapr_tce_attach_iommu_group(), which scans the spapr_tce_tables
list looking for the kvmppc_spapr_tce_table struct corresponding to
the fd given as the parameter. Either it will find the new entry
or it won't; if it doesn't, it just returns an error, and if it
does, it will function normally. So, in each case there is no
harmful effect.
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2017-08-28 07:31:24 +03:00
mutex_unlock ( & kvm - > lock ) ;
2016-02-15 04:55:05 +03:00
2017-03-22 07:21:56 +03:00
list_for_each_entry_safe ( stit , tmp , & stt - > iommu_tables , next ) {
WARN_ON ( ! kref_read ( & stit - > kref ) ) ;
while ( 1 ) {
if ( kref_put ( & stit - > kref , kvm_spapr_tce_liobn_put ) )
break ;
}
}
2016-02-15 04:55:05 +03:00
kvm_put_kvm ( stt - > kvm ) ;
2016-02-15 04:55:06 +03:00
kvmppc_account_memlimit (
2016-03-01 09:54:38 +03:00
kvmppc_stt_pages ( kvmppc_tce_pages ( stt - > size ) ) , false ) ;
2016-02-15 04:55:05 +03:00
call_rcu ( & stt - > rcu , release_spapr_tce_table ) ;
2012-03-16 01:58:34 +04:00
return 0 ;
}
2013-04-05 03:09:41 +04:00
static const struct file_operations kvm_spapr_tce_fops = {
2012-03-16 01:58:34 +04:00
. mmap = kvm_spapr_tce_mmap ,
. release = kvm_spapr_tce_release ,
} ;
long kvm_vm_ioctl_create_spapr_tce ( struct kvm * kvm ,
2016-03-01 09:54:40 +03:00
struct kvm_create_spapr_tce_64 * args )
2012-03-16 01:58:34 +04:00
{
struct kvmppc_spapr_tce_table * stt = NULL ;
2017-08-24 12:14:47 +03:00
struct kvmppc_spapr_tce_table * siter ;
2018-06-20 11:42:58 +03:00
unsigned long npages , size = args - > size ;
2012-03-16 01:58:34 +04:00
int ret = - ENOMEM ;
int i ;
2018-05-14 13:00:29 +03:00
if ( ! args - > size | | args - > page_shift < 12 | | args - > page_shift > 34 | |
( args - > offset + args - > size > ( ULLONG_MAX > > args - > page_shift ) ) )
2016-03-01 09:54:40 +03:00
return - EINVAL ;
2016-03-01 09:54:38 +03:00
npages = kvmppc_tce_pages ( size ) ;
2016-02-15 04:55:06 +03:00
ret = kvmppc_account_memlimit ( kvmppc_stt_pages ( npages ) , true ) ;
2017-08-24 12:14:47 +03:00
if ( ret )
return ret ;
2012-03-16 01:58:34 +04:00
2017-02-08 19:20:01 +03:00
ret = - ENOMEM ;
2012-03-16 01:58:34 +04:00
stt = kzalloc ( sizeof ( * stt ) + npages * sizeof ( struct page * ) ,
GFP_KERNEL ) ;
if ( ! stt )
2017-08-24 12:14:47 +03:00
goto fail_acct ;
2012-03-16 01:58:34 +04:00
stt - > liobn = args - > liobn ;
2016-03-01 09:54:40 +03:00
stt - > page_shift = args - > page_shift ;
stt - > offset = args - > offset ;
2016-03-01 09:54:38 +03:00
stt - > size = size ;
2012-03-16 01:58:34 +04:00
stt - > kvm = kvm ;
2017-03-22 07:21:56 +03:00
INIT_LIST_HEAD_RCU ( & stt - > iommu_tables ) ;
2012-03-16 01:58:34 +04:00
for ( i = 0 ; i < npages ; i + + ) {
stt - > pages [ i ] = alloc_page ( GFP_KERNEL | __GFP_ZERO ) ;
if ( ! stt - > pages [ i ] )
goto fail ;
}
mutex_lock ( & kvm - > lock ) ;
2017-08-24 12:14:47 +03:00
/* Check this LIOBN hasn't been previously allocated */
ret = 0 ;
list_for_each_entry ( siter , & kvm - > arch . spapr_tce_tables , list ) {
if ( siter - > liobn = = args - > liobn ) {
ret = - EBUSY ;
break ;
}
}
KVM: PPC: Book3S HV: Protect updates to spapr_tce_tables list
Al Viro pointed out that while one thread of a process is executing
in kvm_vm_ioctl_create_spapr_tce(), another thread could guess the
file descriptor returned by anon_inode_getfd() and close() it before
the first thread has added it to the kvm->arch.spapr_tce_tables list.
That highlights a more general problem: there is no mutual exclusion
between writers to the spapr_tce_tables list, leading to the
possibility of the list becoming corrupted, which could cause a
host kernel crash.
To fix the mutual exclusion problem, we add a mutex_lock/unlock
pair around the list_del_rce in kvm_spapr_tce_release(). Also,
this moves the call to anon_inode_getfd() inside the region
protected by the kvm->lock mutex, after we have done the check for
a duplicate LIOBN. This means that if another thread does guess the
file descriptor and closes it, its call to kvm_spapr_tce_release()
will not do any harm because it will have to wait until the first
thread has released kvm->lock. With this, there are no failure
points in kvm_vm_ioctl_create_spapr_tce() after the call to
anon_inode_getfd().
The other things that the second thread could do with the guessed
file descriptor are to mmap it or to pass it as a parameter to a
KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE ioctl on a KVM device fd. An mmap
call won't cause any harm because kvm_spapr_tce_mmap() and
kvm_spapr_tce_fault() don't access the spapr_tce_tables list or
the kvmppc_spapr_tce_table.list field, and the fields that they do use
have been properly initialized by the time of the anon_inode_getfd()
call.
The KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE ioctl calls
kvm_spapr_tce_attach_iommu_group(), which scans the spapr_tce_tables
list looking for the kvmppc_spapr_tce_table struct corresponding to
the fd given as the parameter. Either it will find the new entry
or it won't; if it doesn't, it just returns an error, and if it
does, it will function normally. So, in each case there is no
harmful effect.
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2017-08-28 07:31:24 +03:00
if ( ! ret )
ret = anon_inode_getfd ( " kvm-spapr-tce " , & kvm_spapr_tce_fops ,
stt , O_RDWR | O_CLOEXEC ) ;
if ( ret > = 0 ) {
2017-08-24 12:14:47 +03:00
list_add_rcu ( & stt - > list , & kvm - > arch . spapr_tce_tables ) ;
kvm_get_kvm ( kvm ) ;
}
2012-03-16 01:58:34 +04:00
mutex_unlock ( & kvm - > lock ) ;
KVM: PPC: Book3S HV: Protect updates to spapr_tce_tables list
Al Viro pointed out that while one thread of a process is executing
in kvm_vm_ioctl_create_spapr_tce(), another thread could guess the
file descriptor returned by anon_inode_getfd() and close() it before
the first thread has added it to the kvm->arch.spapr_tce_tables list.
That highlights a more general problem: there is no mutual exclusion
between writers to the spapr_tce_tables list, leading to the
possibility of the list becoming corrupted, which could cause a
host kernel crash.
To fix the mutual exclusion problem, we add a mutex_lock/unlock
pair around the list_del_rce in kvm_spapr_tce_release(). Also,
this moves the call to anon_inode_getfd() inside the region
protected by the kvm->lock mutex, after we have done the check for
a duplicate LIOBN. This means that if another thread does guess the
file descriptor and closes it, its call to kvm_spapr_tce_release()
will not do any harm because it will have to wait until the first
thread has released kvm->lock. With this, there are no failure
points in kvm_vm_ioctl_create_spapr_tce() after the call to
anon_inode_getfd().
The other things that the second thread could do with the guessed
file descriptor are to mmap it or to pass it as a parameter to a
KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE ioctl on a KVM device fd. An mmap
call won't cause any harm because kvm_spapr_tce_mmap() and
kvm_spapr_tce_fault() don't access the spapr_tce_tables list or
the kvmppc_spapr_tce_table.list field, and the fields that they do use
have been properly initialized by the time of the anon_inode_getfd()
call.
The KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE ioctl calls
kvm_spapr_tce_attach_iommu_group(), which scans the spapr_tce_tables
list looking for the kvmppc_spapr_tce_table struct corresponding to
the fd given as the parameter. Either it will find the new entry
or it won't; if it doesn't, it just returns an error, and if it
does, it will function normally. So, in each case there is no
harmful effect.
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2017-08-28 07:31:24 +03:00
if ( ret > = 0 )
return ret ;
2012-03-16 01:58:34 +04:00
2017-08-24 12:14:47 +03:00
fail :
for ( i = 0 ; i < npages ; i + + )
if ( stt - > pages [ i ] )
__free_page ( stt - > pages [ i ] ) ;
kfree ( stt ) ;
fail_acct :
kvmppc_account_memlimit ( kvmppc_stt_pages ( npages ) , false ) ;
2012-03-16 01:58:34 +04:00
return ret ;
}
2016-02-15 04:55:09 +03:00
2018-09-10 11:29:10 +03:00
static long kvmppc_tce_validate ( struct kvmppc_spapr_tce_table * stt ,
unsigned long tce )
{
unsigned long gpa = tce & ~ ( TCE_PCI_READ | TCE_PCI_WRITE ) ;
enum dma_data_direction dir = iommu_tce_direction ( tce ) ;
struct kvmppc_spapr_tce_iommu_table * stit ;
unsigned long ua = 0 ;
/* Allow userspace to poison TCE table */
if ( dir = = DMA_NONE )
return H_SUCCESS ;
if ( iommu_tce_check_gpa ( stt - > page_shift , gpa ) )
return H_TOO_HARD ;
2018-09-10 11:29:12 +03:00
if ( kvmppc_tce_to_ua ( stt - > kvm , tce , & ua , NULL ) )
2018-09-10 11:29:10 +03:00
return H_TOO_HARD ;
list_for_each_entry_rcu ( stit , & stt - > iommu_tables , next ) {
unsigned long hpa = 0 ;
struct mm_iommu_table_group_mem_t * mem ;
long shift = stit - > tbl - > it_page_shift ;
mem = mm_iommu_lookup ( stt - > kvm - > mm , ua , 1ULL < < shift ) ;
if ( ! mem )
return H_TOO_HARD ;
if ( mm_iommu_ua_to_hpa ( mem , ua , shift , & hpa ) )
return H_TOO_HARD ;
}
return H_SUCCESS ;
}
2017-03-22 07:21:56 +03:00
static void kvmppc_clear_tce ( struct iommu_table * tbl , unsigned long entry )
{
unsigned long hpa = 0 ;
enum dma_data_direction dir = DMA_NONE ;
iommu_tce_xchg ( tbl , entry , & hpa , & dir ) ;
}
static long kvmppc_tce_iommu_mapped_dec ( struct kvm * kvm ,
struct iommu_table * tbl , unsigned long entry )
{
struct mm_iommu_table_group_mem_t * mem = NULL ;
const unsigned long pgsize = 1ULL < < tbl - > it_page_shift ;
2018-07-04 09:13:46 +03:00
__be64 * pua = IOMMU_TABLE_USERSPACE_ENTRY ( tbl , entry ) ;
2017-03-22 07:21:56 +03:00
if ( ! pua )
/* it_userspace allocation might be delayed */
return H_TOO_HARD ;
2018-07-04 09:13:46 +03:00
mem = mm_iommu_lookup ( kvm - > mm , be64_to_cpu ( * pua ) , pgsize ) ;
2017-03-22 07:21:56 +03:00
if ( ! mem )
return H_TOO_HARD ;
mm_iommu_mapped_dec ( mem ) ;
2018-07-04 09:13:46 +03:00
* pua = cpu_to_be64 ( 0 ) ;
2017-03-22 07:21:56 +03:00
return H_SUCCESS ;
}
2018-05-14 13:00:28 +03:00
static long kvmppc_tce_iommu_do_unmap ( struct kvm * kvm ,
2017-03-22 07:21:56 +03:00
struct iommu_table * tbl , unsigned long entry )
{
enum dma_data_direction dir = DMA_NONE ;
unsigned long hpa = 0 ;
long ret ;
if ( WARN_ON_ONCE ( iommu_tce_xchg ( tbl , entry , & hpa , & dir ) ) )
2018-09-10 11:29:09 +03:00
return H_TOO_HARD ;
2017-03-22 07:21:56 +03:00
if ( dir = = DMA_NONE )
return H_SUCCESS ;
ret = kvmppc_tce_iommu_mapped_dec ( kvm , tbl , entry ) ;
if ( ret ! = H_SUCCESS )
iommu_tce_xchg ( tbl , entry , & hpa , & dir ) ;
return ret ;
}
2018-05-14 13:00:28 +03:00
static long kvmppc_tce_iommu_unmap ( struct kvm * kvm ,
struct kvmppc_spapr_tce_table * stt , struct iommu_table * tbl ,
unsigned long entry )
{
unsigned long i , ret = H_SUCCESS ;
unsigned long subpages = 1ULL < < ( stt - > page_shift - tbl - > it_page_shift ) ;
unsigned long io_entry = entry * subpages ;
for ( i = 0 ; i < subpages ; + + i ) {
ret = kvmppc_tce_iommu_do_unmap ( kvm , tbl , io_entry + i ) ;
if ( ret ! = H_SUCCESS )
break ;
}
return ret ;
}
long kvmppc_tce_iommu_do_map ( struct kvm * kvm , struct iommu_table * tbl ,
2017-03-22 07:21:56 +03:00
unsigned long entry , unsigned long ua ,
enum dma_data_direction dir )
{
long ret ;
2018-07-04 09:13:46 +03:00
unsigned long hpa ;
__be64 * pua = IOMMU_TABLE_USERSPACE_ENTRY ( tbl , entry ) ;
2017-03-22 07:21:56 +03:00
struct mm_iommu_table_group_mem_t * mem ;
if ( ! pua )
/* it_userspace allocation might be delayed */
return H_TOO_HARD ;
mem = mm_iommu_lookup ( kvm - > mm , ua , 1ULL < < tbl - > it_page_shift ) ;
if ( ! mem )
/* This only handles v2 IOMMU type, v1 is handled via ioctl() */
return H_TOO_HARD ;
2018-07-17 10:19:13 +03:00
if ( WARN_ON_ONCE ( mm_iommu_ua_to_hpa ( mem , ua , tbl - > it_page_shift , & hpa ) ) )
2018-09-10 11:29:09 +03:00
return H_TOO_HARD ;
2017-03-22 07:21:56 +03:00
if ( mm_iommu_mapped_inc ( mem ) )
2018-09-10 11:29:09 +03:00
return H_TOO_HARD ;
2017-03-22 07:21:56 +03:00
ret = iommu_tce_xchg ( tbl , entry , & hpa , & dir ) ;
if ( WARN_ON_ONCE ( ret ) ) {
mm_iommu_mapped_dec ( mem ) ;
2018-09-10 11:29:09 +03:00
return H_TOO_HARD ;
2017-03-22 07:21:56 +03:00
}
if ( dir ! = DMA_NONE )
kvmppc_tce_iommu_mapped_dec ( kvm , tbl , entry ) ;
2018-07-04 09:13:46 +03:00
* pua = cpu_to_be64 ( ua ) ;
2017-03-22 07:21:56 +03:00
return 0 ;
}
2018-05-14 13:00:28 +03:00
static long kvmppc_tce_iommu_map ( struct kvm * kvm ,
struct kvmppc_spapr_tce_table * stt , struct iommu_table * tbl ,
unsigned long entry , unsigned long ua ,
enum dma_data_direction dir )
{
unsigned long i , pgoff , ret = H_SUCCESS ;
unsigned long subpages = 1ULL < < ( stt - > page_shift - tbl - > it_page_shift ) ;
unsigned long io_entry = entry * subpages ;
for ( i = 0 , pgoff = 0 ; i < subpages ;
+ + i , pgoff + = IOMMU_PAGE_SIZE ( tbl ) ) {
ret = kvmppc_tce_iommu_do_map ( kvm , tbl ,
io_entry + i , ua + pgoff , dir ) ;
if ( ret ! = H_SUCCESS )
break ;
}
return ret ;
}
2016-03-18 05:50:42 +03:00
long kvmppc_h_put_tce ( struct kvm_vcpu * vcpu , unsigned long liobn ,
unsigned long ioba , unsigned long tce )
{
2017-03-22 07:21:53 +03:00
struct kvmppc_spapr_tce_table * stt ;
2017-03-22 07:21:56 +03:00
long ret , idx ;
struct kvmppc_spapr_tce_iommu_table * stit ;
unsigned long entry , ua = 0 ;
enum dma_data_direction dir ;
2016-03-18 05:50:42 +03:00
/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
/* liobn, ioba, tce); */
2017-03-22 07:21:53 +03:00
stt = kvmppc_find_table ( vcpu - > kvm , liobn ) ;
2016-03-18 05:50:42 +03:00
if ( ! stt )
return H_TOO_HARD ;
ret = kvmppc_ioba_validate ( stt , ioba , 1 ) ;
if ( ret ! = H_SUCCESS )
return ret ;
ret = kvmppc_tce_validate ( stt , tce ) ;
if ( ret ! = H_SUCCESS )
return ret ;
2017-03-22 07:21:56 +03:00
dir = iommu_tce_direction ( tce ) ;
2017-10-11 08:00:34 +03:00
idx = srcu_read_lock ( & vcpu - > kvm - > srcu ) ;
2018-09-10 11:29:12 +03:00
if ( ( dir ! = DMA_NONE ) & & kvmppc_tce_to_ua ( vcpu - > kvm , tce , & ua , NULL ) ) {
2017-10-11 08:00:34 +03:00
ret = H_PARAMETER ;
goto unlock_exit ;
}
2017-03-22 07:21:56 +03:00
entry = ioba > > stt - > page_shift ;
list_for_each_entry_lockless ( stit , & stt - > iommu_tables , next ) {
2017-10-11 08:00:34 +03:00
if ( dir = = DMA_NONE )
2018-05-14 13:00:28 +03:00
ret = kvmppc_tce_iommu_unmap ( vcpu - > kvm , stt ,
2017-03-22 07:21:56 +03:00
stit - > tbl , entry ) ;
2017-10-11 08:00:34 +03:00
else
2018-05-14 13:00:28 +03:00
ret = kvmppc_tce_iommu_map ( vcpu - > kvm , stt , stit - > tbl ,
2017-03-22 07:21:56 +03:00
entry , ua , dir ) ;
2018-09-10 11:29:11 +03:00
if ( ret ! = H_SUCCESS ) {
kvmppc_clear_tce ( stit - > tbl , entry ) ;
2017-10-11 08:00:34 +03:00
goto unlock_exit ;
2018-09-10 11:29:11 +03:00
}
2017-03-22 07:21:56 +03:00
}
kvmppc_tce_put ( stt , entry , tce ) ;
2016-03-18 05:50:42 +03:00
2017-10-11 08:00:34 +03:00
unlock_exit :
srcu_read_unlock ( & vcpu - > kvm - > srcu , idx ) ;
return ret ;
2016-03-18 05:50:42 +03:00
}
EXPORT_SYMBOL_GPL ( kvmppc_h_put_tce ) ;
2016-02-15 04:55:09 +03:00
long kvmppc_h_put_tce_indirect ( struct kvm_vcpu * vcpu ,
unsigned long liobn , unsigned long ioba ,
unsigned long tce_list , unsigned long npages )
{
struct kvmppc_spapr_tce_table * stt ;
long i , ret = H_SUCCESS , idx ;
unsigned long entry , ua = 0 ;
2016-07-12 03:54:48 +03:00
u64 __user * tces ;
u64 tce ;
2017-03-22 07:21:56 +03:00
struct kvmppc_spapr_tce_iommu_table * stit ;
2016-02-15 04:55:09 +03:00
2017-03-22 07:21:53 +03:00
stt = kvmppc_find_table ( vcpu - > kvm , liobn ) ;
2016-02-15 04:55:09 +03:00
if ( ! stt )
return H_TOO_HARD ;
2016-03-01 09:54:38 +03:00
entry = ioba > > stt - > page_shift ;
2016-02-15 04:55:09 +03:00
/*
* SPAPR spec says that the maximum size of the list is 512 TCEs
* so the whole table fits in 4 K page
*/
if ( npages > 512 )
return H_PARAMETER ;
if ( tce_list & ( SZ_4K - 1 ) )
return H_PARAMETER ;
ret = kvmppc_ioba_validate ( stt , ioba , npages ) ;
if ( ret ! = H_SUCCESS )
return ret ;
idx = srcu_read_lock ( & vcpu - > kvm - > srcu ) ;
2018-09-10 11:29:12 +03:00
if ( kvmppc_tce_to_ua ( vcpu - > kvm , tce_list , & ua , NULL ) ) {
2016-02-15 04:55:09 +03:00
ret = H_TOO_HARD ;
goto unlock_exit ;
}
tces = ( u64 __user * ) ua ;
for ( i = 0 ; i < npages ; + + i ) {
if ( get_user ( tce , tces + i ) ) {
ret = H_TOO_HARD ;
goto unlock_exit ;
}
tce = be64_to_cpu ( tce ) ;
ret = kvmppc_tce_validate ( stt , tce ) ;
if ( ret ! = H_SUCCESS )
goto unlock_exit ;
2018-09-10 11:29:08 +03:00
}
for ( i = 0 ; i < npages ; + + i ) {
/*
* This looks unsafe , because we validate , then regrab
* the TCE from userspace which could have been changed by
* another thread .
*
* But it actually is safe , because the relevant checks will be
* re - executed in the following code . If userspace tries to
* change this dodgily it will result in a messier failure mode
* but won ' t threaten the host .
*/
if ( get_user ( tce , tces + i ) ) {
ret = H_TOO_HARD ;
goto unlock_exit ;
}
tce = be64_to_cpu ( tce ) ;
2016-02-15 04:55:09 +03:00
2018-09-10 11:29:12 +03:00
if ( kvmppc_tce_to_ua ( vcpu - > kvm , tce , & ua , NULL ) )
2017-03-22 07:21:56 +03:00
return H_PARAMETER ;
list_for_each_entry_lockless ( stit , & stt - > iommu_tables , next ) {
2018-05-14 13:00:28 +03:00
ret = kvmppc_tce_iommu_map ( vcpu - > kvm , stt ,
2017-03-22 07:21:56 +03:00
stit - > tbl , entry + i , ua ,
iommu_tce_direction ( tce ) ) ;
2018-09-10 11:29:11 +03:00
if ( ret ! = H_SUCCESS ) {
kvmppc_clear_tce ( stit - > tbl , entry ) ;
2017-03-22 07:21:56 +03:00
goto unlock_exit ;
2018-09-10 11:29:11 +03:00
}
2017-03-22 07:21:56 +03:00
}
2016-02-15 04:55:09 +03:00
kvmppc_tce_put ( stt , entry + i , tce ) ;
}
unlock_exit :
srcu_read_unlock ( & vcpu - > kvm - > srcu , idx ) ;
return ret ;
}
EXPORT_SYMBOL_GPL ( kvmppc_h_put_tce_indirect ) ;
2016-03-18 05:50:42 +03:00
long kvmppc_h_stuff_tce ( struct kvm_vcpu * vcpu ,
unsigned long liobn , unsigned long ioba ,
unsigned long tce_value , unsigned long npages )
{
struct kvmppc_spapr_tce_table * stt ;
long i , ret ;
2017-03-22 07:21:56 +03:00
struct kvmppc_spapr_tce_iommu_table * stit ;
2016-03-18 05:50:42 +03:00
2017-03-22 07:21:53 +03:00
stt = kvmppc_find_table ( vcpu - > kvm , liobn ) ;
2016-03-18 05:50:42 +03:00
if ( ! stt )
return H_TOO_HARD ;
ret = kvmppc_ioba_validate ( stt , ioba , npages ) ;
if ( ret ! = H_SUCCESS )
return ret ;
/* Check permission bits only to allow userspace poison TCE for debug */
if ( tce_value & ( TCE_PCI_WRITE | TCE_PCI_READ ) )
return H_PARAMETER ;
2017-03-22 07:21:56 +03:00
list_for_each_entry_lockless ( stit , & stt - > iommu_tables , next ) {
2018-05-14 13:00:27 +03:00
unsigned long entry = ioba > > stt - > page_shift ;
2017-03-22 07:21:56 +03:00
for ( i = 0 ; i < npages ; + + i ) {
2018-05-14 13:00:28 +03:00
ret = kvmppc_tce_iommu_unmap ( vcpu - > kvm , stt ,
2017-03-22 07:21:56 +03:00
stit - > tbl , entry + i ) ;
if ( ret = = H_SUCCESS )
continue ;
if ( ret = = H_TOO_HARD )
return ret ;
WARN_ON_ONCE ( 1 ) ;
kvmppc_clear_tce ( stit - > tbl , entry ) ;
}
}
2016-03-18 05:50:42 +03:00
for ( i = 0 ; i < npages ; + + i , ioba + = ( 1ULL < < stt - > page_shift ) )
kvmppc_tce_put ( stt , ioba > > stt - > page_shift , tce_value ) ;
return H_SUCCESS ;
}
EXPORT_SYMBOL_GPL ( kvmppc_h_stuff_tce ) ;