2022-11-29 16:29:33 -04:00
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
*
* The io_pagetable is the top-level data structure that maps IOVAs to PFNs. The
* PFNs can be placed into an iommu_domain, or returned to the caller as a page
* list for access by an in-kernel user.
*
* The data structure uses the iopt_pages to optimize the storage of the PFNs
* between the domains and xarray.
*/
# include <linux/iommufd.h>
# include <linux/lockdep.h>
# include <linux/iommu.h>
# include <linux/sched/mm.h>
# include <linux/err.h>
# include <linux/slab.h>
# include <linux/errno.h>
# include "io_pagetable.h"
# include "double_span.h"
/*
 * One element of a pages list handed to the map/get helpers in this file.
 * Each element names a byte slice of an iopt_pages and, while a mapping is
 * being built, the area that will cover that slice.
 */
struct iopt_pages_list {
	/* Backing pages; a reference is dropped by iopt_free_pages_list() */
	struct iopt_pages *pages;
	/* Area being constructed for this slice, NULL when none is attached */
	struct iopt_area *area;
	/* Linkage into the caller's list head */
	struct list_head next;
	/* Offset in bytes of the slice from the start of @pages */
	unsigned long start_byte;
	/* Size of the slice in bytes */
	unsigned long length;
};
struct iopt_area * iopt_area_contig_init ( struct iopt_area_contig_iter * iter ,
struct io_pagetable * iopt ,
unsigned long iova ,
unsigned long last_iova )
{
lockdep_assert_held ( & iopt - > iova_rwsem ) ;
iter - > cur_iova = iova ;
iter - > last_iova = last_iova ;
iter - > area = iopt_area_iter_first ( iopt , iova , iova ) ;
if ( ! iter - > area )
return NULL ;
if ( ! iter - > area - > pages ) {
iter - > area = NULL ;
return NULL ;
}
return iter - > area ;
}
struct iopt_area * iopt_area_contig_next ( struct iopt_area_contig_iter * iter )
{
unsigned long last_iova ;
if ( ! iter - > area )
return NULL ;
last_iova = iopt_area_last_iova ( iter - > area ) ;
if ( iter - > last_iova < = last_iova )
return NULL ;
iter - > cur_iova = last_iova + 1 ;
iter - > area = iopt_area_iter_next ( iter - > area , iter - > cur_iova ,
iter - > last_iova ) ;
if ( ! iter - > area )
return NULL ;
if ( iter - > cur_iova ! = iopt_area_iova ( iter - > area ) | |
! iter - > area - > pages ) {
iter - > area = NULL ;
return NULL ;
}
return iter - > area ;
}
static bool __alloc_iova_check_hole ( struct interval_tree_double_span_iter * span ,
unsigned long length ,
unsigned long iova_alignment ,
unsigned long page_offset )
{
if ( span - > is_used | | span - > last_hole - span - > start_hole < length - 1 )
return false ;
span - > start_hole = ALIGN ( span - > start_hole , iova_alignment ) |
page_offset ;
if ( span - > start_hole > span - > last_hole | |
span - > last_hole - span - > start_hole < length - 1 )
return false ;
return true ;
}
static bool __alloc_iova_check_used ( struct interval_tree_span_iter * span ,
unsigned long length ,
unsigned long iova_alignment ,
unsigned long page_offset )
{
if ( span - > is_hole | | span - > last_used - span - > start_used < length - 1 )
return false ;
span - > start_used = ALIGN ( span - > start_used , iova_alignment ) |
page_offset ;
if ( span - > start_used > span - > last_used | |
span - > last_used - span - > start_used < length - 1 )
return false ;
return true ;
}
/*
* Automatically find a block of IOVA that is not being used and not reserved .
* Does not return a 0 IOVA even if it is valid .
*/
static int iopt_alloc_iova ( struct io_pagetable * iopt , unsigned long * iova ,
unsigned long uptr , unsigned long length )
{
unsigned long page_offset = uptr % PAGE_SIZE ;
struct interval_tree_double_span_iter used_span ;
struct interval_tree_span_iter allowed_span ;
unsigned long iova_alignment ;
lockdep_assert_held ( & iopt - > iova_rwsem ) ;
/* Protect roundup_pow-of_two() from overflow */
if ( length = = 0 | | length > = ULONG_MAX / 2 )
return - EOVERFLOW ;
/*
* Keep alignment present in the uptr when building the IOVA , this
* increases the chance we can map a THP .
*/
if ( ! uptr )
iova_alignment = roundup_pow_of_two ( length ) ;
else
iova_alignment = min_t ( unsigned long ,
roundup_pow_of_two ( length ) ,
1UL < < __ffs64 ( uptr ) ) ;
if ( iova_alignment < iopt - > iova_alignment )
return - EINVAL ;
interval_tree_for_each_span ( & allowed_span , & iopt - > allowed_itree ,
PAGE_SIZE , ULONG_MAX - PAGE_SIZE ) {
if ( RB_EMPTY_ROOT ( & iopt - > allowed_itree . rb_root ) ) {
allowed_span . start_used = PAGE_SIZE ;
allowed_span . last_used = ULONG_MAX - PAGE_SIZE ;
allowed_span . is_hole = false ;
}
if ( ! __alloc_iova_check_used ( & allowed_span , length ,
iova_alignment , page_offset ) )
continue ;
interval_tree_for_each_double_span (
& used_span , & iopt - > reserved_itree , & iopt - > area_itree ,
allowed_span . start_used , allowed_span . last_used ) {
if ( ! __alloc_iova_check_hole ( & used_span , length ,
iova_alignment ,
page_offset ) )
continue ;
* iova = used_span . start_hole ;
return 0 ;
}
}
return - ENOSPC ;
}
static int iopt_check_iova ( struct io_pagetable * iopt , unsigned long iova ,
unsigned long length )
{
unsigned long last ;
lockdep_assert_held ( & iopt - > iova_rwsem ) ;
if ( ( iova & ( iopt - > iova_alignment - 1 ) ) )
return - EINVAL ;
if ( check_add_overflow ( iova , length - 1 , & last ) )
return - EOVERFLOW ;
/* No reserved IOVA intersects the range */
if ( iopt_reserved_iter_first ( iopt , iova , last ) )
return - EINVAL ;
/* Check that there is not already a mapping in the range */
if ( iopt_area_iter_first ( iopt , iova , last ) )
return - EEXIST ;
return 0 ;
}
/*
* The area takes a slice of the pages from start_bytes to start_byte + length
*/
static int iopt_insert_area ( struct io_pagetable * iopt , struct iopt_area * area ,
struct iopt_pages * pages , unsigned long iova ,
unsigned long start_byte , unsigned long length ,
int iommu_prot )
{
lockdep_assert_held_write ( & iopt - > iova_rwsem ) ;
if ( ( iommu_prot & IOMMU_WRITE ) & & ! pages - > writable )
return - EPERM ;
area - > iommu_prot = iommu_prot ;
area - > page_offset = start_byte % PAGE_SIZE ;
if ( area - > page_offset & ( iopt - > iova_alignment - 1 ) )
return - EINVAL ;
area - > node . start = iova ;
if ( check_add_overflow ( iova , length - 1 , & area - > node . last ) )
return - EOVERFLOW ;
area - > pages_node . start = start_byte / PAGE_SIZE ;
if ( check_add_overflow ( start_byte , length - 1 , & area - > pages_node . last ) )
return - EOVERFLOW ;
area - > pages_node . last = area - > pages_node . last / PAGE_SIZE ;
if ( WARN_ON ( area - > pages_node . last > = pages - > npages ) )
return - EOVERFLOW ;
/*
* The area is inserted with a NULL pages indicating it is not fully
* initialized yet .
*/
area - > iopt = iopt ;
interval_tree_insert ( & area - > node , & iopt - > area_itree ) ;
return 0 ;
}
static int iopt_alloc_area_pages ( struct io_pagetable * iopt ,
struct list_head * pages_list ,
unsigned long length , unsigned long * dst_iova ,
int iommu_prot , unsigned int flags )
{
struct iopt_pages_list * elm ;
unsigned long iova ;
int rc = 0 ;
list_for_each_entry ( elm , pages_list , next ) {
elm - > area = kzalloc ( sizeof ( * elm - > area ) , GFP_KERNEL_ACCOUNT ) ;
if ( ! elm - > area )
return - ENOMEM ;
}
down_write ( & iopt - > iova_rwsem ) ;
if ( ( length & ( iopt - > iova_alignment - 1 ) ) | | ! length ) {
rc = - EINVAL ;
goto out_unlock ;
}
if ( flags & IOPT_ALLOC_IOVA ) {
/* Use the first entry to guess the ideal IOVA alignment */
elm = list_first_entry ( pages_list , struct iopt_pages_list ,
next ) ;
rc = iopt_alloc_iova (
iopt , dst_iova ,
( uintptr_t ) elm - > pages - > uptr + elm - > start_byte , length ) ;
if ( rc )
goto out_unlock ;
} else {
rc = iopt_check_iova ( iopt , * dst_iova , length ) ;
if ( rc )
goto out_unlock ;
}
/*
* Areas are created with a NULL pages so that the IOVA space is
* reserved and we can unlock the iova_rwsem .
*/
iova = * dst_iova ;
list_for_each_entry ( elm , pages_list , next ) {
rc = iopt_insert_area ( iopt , elm - > area , elm - > pages , iova ,
elm - > start_byte , elm - > length , iommu_prot ) ;
if ( rc )
goto out_unlock ;
iova + = elm - > length ;
}
out_unlock :
up_write ( & iopt - > iova_rwsem ) ;
return rc ;
}
static void iopt_abort_area ( struct iopt_area * area )
{
if ( area - > iopt ) {
down_write ( & area - > iopt - > iova_rwsem ) ;
interval_tree_remove ( & area - > node , & area - > iopt - > area_itree ) ;
up_write ( & area - > iopt - > iova_rwsem ) ;
}
kfree ( area ) ;
}
void iopt_free_pages_list ( struct list_head * pages_list )
{
struct iopt_pages_list * elm ;
while ( ( elm = list_first_entry_or_null ( pages_list ,
struct iopt_pages_list , next ) ) ) {
if ( elm - > area )
iopt_abort_area ( elm - > area ) ;
if ( elm - > pages )
iopt_put_pages ( elm - > pages ) ;
list_del ( & elm - > next ) ;
kfree ( elm ) ;
}
}
/*
 * Fill the domains for every element on the list. If one element fails, the
 * elements that were already filled are unfilled again, in list order, and
 * the error is returned.
 */
static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *cur;
	struct iopt_pages_list *done;
	int rc;

	list_for_each_entry(cur, pages_list, next) {
		rc = iopt_area_fill_domains(cur->area, cur->pages);
		if (!rc)
			continue;

		/* Roll back everything that precedes the failing element */
		list_for_each_entry(done, pages_list, next) {
			if (done == cur)
				break;
			iopt_area_unfill_domains(done->area, done->pages);
		}
		return rc;
	}
	return 0;
}
int iopt_map_pages ( struct io_pagetable * iopt , struct list_head * pages_list ,
unsigned long length , unsigned long * dst_iova ,
int iommu_prot , unsigned int flags )
{
struct iopt_pages_list * elm ;
int rc ;
rc = iopt_alloc_area_pages ( iopt , pages_list , length , dst_iova ,
iommu_prot , flags ) ;
if ( rc )
return rc ;
down_read ( & iopt - > domains_rwsem ) ;
rc = iopt_fill_domains_pages ( pages_list ) ;
if ( rc )
goto out_unlock_domains ;
down_write ( & iopt - > iova_rwsem ) ;
list_for_each_entry ( elm , pages_list , next ) {
/*
* area - > pages must be set inside the domains_rwsem to ensure
* any newly added domains will get filled . Moves the reference
* in from the list .
*/
elm - > area - > pages = elm - > pages ;
elm - > pages = NULL ;
elm - > area = NULL ;
}
up_write ( & iopt - > iova_rwsem ) ;
out_unlock_domains :
up_read ( & iopt - > domains_rwsem ) ;
return rc ;
}
/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise is the iova to map to on input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only setup a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	/* Single on-stack list element, so it is never passed to kfree() */
	struct iopt_pages_list entry = {};
	LIST_HEAD(pages_list);
	int rc;

	entry.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(entry.pages))
		return PTR_ERR(entry.pages);

	/* Propagate the context's MM accounting mode onto the new pages */
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    entry.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		entry.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;

	entry.start_byte = uptr - entry.pages->uptr;
	entry.length = length;
	list_add(&entry.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (!rc)
		return 0;

	/* Release whatever iopt_map_pages() left attached to the element */
	if (entry.area)
		iopt_abort_area(entry.area);
	if (entry.pages)
		iopt_put_pages(entry.pages);
	return rc;
}
int iopt_get_pages ( struct io_pagetable * iopt , unsigned long iova ,
unsigned long length , struct list_head * pages_list )
{
struct iopt_area_contig_iter iter ;
unsigned long last_iova ;
struct iopt_area * area ;
int rc ;
if ( ! length )
return - EINVAL ;
if ( check_add_overflow ( iova , length - 1 , & last_iova ) )
return - EOVERFLOW ;
down_read ( & iopt - > iova_rwsem ) ;
iopt_for_each_contig_area ( & iter , area , iopt , iova , last_iova ) {
struct iopt_pages_list * elm ;
unsigned long last = min ( last_iova , iopt_area_last_iova ( area ) ) ;
elm = kzalloc ( sizeof ( * elm ) , GFP_KERNEL_ACCOUNT ) ;
if ( ! elm ) {
rc = - ENOMEM ;
goto err_free ;
}
elm - > start_byte = iopt_area_start_byte ( area , iter . cur_iova ) ;
elm - > pages = area - > pages ;
elm - > length = ( last - iter . cur_iova ) + 1 ;
kref_get ( & elm - > pages - > kref ) ;
list_add_tail ( & elm - > next , pages_list ) ;
}
if ( ! iopt_area_contig_done ( & iter ) ) {
rc = - ENOENT ;
goto err_free ;
}
up_read ( & iopt - > iova_rwsem ) ;
return 0 ;
err_free :
up_read ( & iopt - > iova_rwsem ) ;
iopt_free_pages_list ( pages_list ) ;
return rc ;
}
static int iopt_unmap_iova_range ( struct io_pagetable * iopt , unsigned long start ,
unsigned long last , unsigned long * unmapped )
{
struct iopt_area * area ;
unsigned long unmapped_bytes = 0 ;
int rc = - ENOENT ;
/*
* The domains_rwsem must be held in read mode any time any area - > pages
* is NULL . This prevents domain attach / detatch from running
* concurrently with cleaning up the area .
*/
2022-11-29 16:29:37 -04:00
again :
2022-11-29 16:29:33 -04:00
down_read ( & iopt - > domains_rwsem ) ;
down_write ( & iopt - > iova_rwsem ) ;
while ( ( area = iopt_area_iter_first ( iopt , start , last ) ) ) {
unsigned long area_last = iopt_area_last_iova ( area ) ;
unsigned long area_first = iopt_area_iova ( area ) ;
struct iopt_pages * pages ;
/* Userspace should not race map/unmap's of the same area */
if ( ! area - > pages ) {
rc = - EBUSY ;
goto out_unlock_iova ;
}
if ( area_first < start | | area_last > last ) {
rc = - ENOENT ;
goto out_unlock_iova ;
}
/*
* num_accesses writers must hold the iova_rwsem too , so we can
* safely read it under the write side of the iovam_rwsem
* without the pages - > mutex .
*/
if ( area - > num_accesses ) {
start = area_first ;
area - > prevent_access = true ;
up_write ( & iopt - > iova_rwsem ) ;
up_read ( & iopt - > domains_rwsem ) ;
2022-11-29 16:29:37 -04:00
iommufd_access_notify_unmap ( iopt , area_first ,
iopt_area_length ( area ) ) ;
if ( WARN_ON ( READ_ONCE ( area - > num_accesses ) ) )
return - EDEADLOCK ;
goto again ;
2022-11-29 16:29:33 -04:00
}
pages = area - > pages ;
area - > pages = NULL ;
up_write ( & iopt - > iova_rwsem ) ;
iopt_area_unfill_domains ( area , pages ) ;
iopt_abort_area ( area ) ;
iopt_put_pages ( pages ) ;
unmapped_bytes + = area_last - area_first + 1 ;
down_write ( & iopt - > iova_rwsem ) ;
}
if ( unmapped_bytes )
rc = 0 ;
out_unlock_iova :
up_write ( & iopt - > iova_rwsem ) ;
up_read ( & iopt - > domains_rwsem ) ;
if ( unmapped )
* unmapped = unmapped_bytes ;
return rc ;
}
/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 *
 * Returns -EINVAL for a zero length and -EOVERFLOW when iova + length - 1
 * wraps; otherwise the result of unmapping the inclusive range.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long range_end;

	if (length == 0)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &range_end))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, range_end, unmapped);
}
/*
 * Unmap the whole IOVA space [0, ULONG_MAX]. An empty table is not an error:
 * -ENOENT from the range unmap is translated to 0.
 */
int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	int rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);

	/* If the IOVAs are empty then unmap all succeeds */
	return rc == -ENOENT ? 0 : rc;
}
/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova ( struct io_pagetable * iopt ,
struct rb_root_cached * allowed_iova )
{
struct iopt_allowed * allowed ;
down_write ( & iopt - > iova_rwsem ) ;
swap ( * allowed_iova , iopt - > allowed_itree ) ;
for ( allowed = iopt_allowed_iter_first ( iopt , 0 , ULONG_MAX ) ; allowed ;
allowed = iopt_allowed_iter_next ( allowed , 0 , ULONG_MAX ) ) {
if ( iopt_reserved_iter_first ( iopt , allowed - > node . start ,
allowed - > node . last ) ) {
swap ( * allowed_iova , iopt - > allowed_itree ) ;
up_write ( & iopt - > iova_rwsem ) ;
return - EADDRINUSE ;
}
}
up_write ( & iopt - > iova_rwsem ) ;
return 0 ;
}
int iopt_reserve_iova ( struct io_pagetable * iopt , unsigned long start ,
unsigned long last , void * owner )
{
struct iopt_reserved * reserved ;
lockdep_assert_held_write ( & iopt - > iova_rwsem ) ;
if ( iopt_area_iter_first ( iopt , start , last ) | |
iopt_allowed_iter_first ( iopt , start , last ) )
return - EADDRINUSE ;
reserved = kzalloc ( sizeof ( * reserved ) , GFP_KERNEL_ACCOUNT ) ;
if ( ! reserved )
return - ENOMEM ;
reserved - > node . start = start ;
reserved - > node . last = last ;
reserved - > owner = owner ;
interval_tree_insert ( & reserved - > node , & iopt - > reserved_itree ) ;
return 0 ;
}
static void __iopt_remove_reserved_iova ( struct io_pagetable * iopt , void * owner )
{
struct iopt_reserved * reserved , * next ;
lockdep_assert_held_write ( & iopt - > iova_rwsem ) ;
for ( reserved = iopt_reserved_iter_first ( iopt , 0 , ULONG_MAX ) ; reserved ;
reserved = next ) {
next = iopt_reserved_iter_next ( reserved , 0 , ULONG_MAX ) ;
if ( reserved - > owner = = owner ) {
interval_tree_remove ( & reserved - > node ,
& iopt - > reserved_itree ) ;
kfree ( reserved ) ;
}
}
}
void iopt_remove_reserved_iova ( struct io_pagetable * iopt , void * owner )
{
down_write ( & iopt - > iova_rwsem ) ;
__iopt_remove_reserved_iova ( iopt , owner ) ;
up_write ( & iopt - > iova_rwsem ) ;
}
void iopt_init_table ( struct io_pagetable * iopt )
{
init_rwsem ( & iopt - > iova_rwsem ) ;
init_rwsem ( & iopt - > domains_rwsem ) ;
iopt - > area_itree = RB_ROOT_CACHED ;
iopt - > allowed_itree = RB_ROOT_CACHED ;
iopt - > reserved_itree = RB_ROOT_CACHED ;
xa_init_flags ( & iopt - > domains , XA_FLAGS_ACCOUNT ) ;
xa_init_flags ( & iopt - > access_list , XA_FLAGS_ALLOC ) ;
/*
* iopt ' s start as SW tables that can use the entire size_t IOVA space
* due to the use of size_t in the APIs . They have no alignment
* restriction .
*/
iopt - > iova_alignment = 1 ;
}
void iopt_destroy_table ( struct io_pagetable * iopt )
{
struct interval_tree_node * node ;
while ( ( node = interval_tree_iter_first ( & iopt - > allowed_itree , 0 ,
ULONG_MAX ) ) ) {
interval_tree_remove ( node , & iopt - > allowed_itree ) ;
kfree ( container_of ( node , struct iopt_allowed , node ) ) ;
}
WARN_ON ( ! RB_EMPTY_ROOT ( & iopt - > reserved_itree . rb_root ) ) ;
WARN_ON ( ! xa_empty ( & iopt - > domains ) ) ;
WARN_ON ( ! xa_empty ( & iopt - > access_list ) ) ;
WARN_ON ( ! RB_EMPTY_ROOT ( & iopt - > area_itree . rb_root ) ) ;
}
/**
* iopt_unfill_domain ( ) - Unfill a domain with PFNs
* @ iopt : io_pagetable to act on
* @ domain : domain to unfill
*
* This is used when removing a domain from the iopt . Every area in the iopt
* will be unmapped from the domain . The domain must already be removed from the
* domains xarray .
*/
static void iopt_unfill_domain ( struct io_pagetable * iopt ,
struct iommu_domain * domain )
{
struct iopt_area * area ;
lockdep_assert_held ( & iopt - > iova_rwsem ) ;
lockdep_assert_held_write ( & iopt - > domains_rwsem ) ;
/*
* Some other domain is holding all the pfns still , rapidly unmap this
* domain .
*/
if ( iopt - > next_domain_id ! = 0 ) {
/* Pick an arbitrary remaining domain to act as storage */
struct iommu_domain * storage_domain =
xa_load ( & iopt - > domains , 0 ) ;
for ( area = iopt_area_iter_first ( iopt , 0 , ULONG_MAX ) ; area ;
area = iopt_area_iter_next ( area , 0 , ULONG_MAX ) ) {
struct iopt_pages * pages = area - > pages ;
if ( ! pages )
continue ;
mutex_lock ( & pages - > mutex ) ;
if ( area - > storage_domain = = domain )
area - > storage_domain = storage_domain ;
mutex_unlock ( & pages - > mutex ) ;
iopt_area_unmap_domain ( area , domain ) ;
}
return ;
}
for ( area = iopt_area_iter_first ( iopt , 0 , ULONG_MAX ) ; area ;
area = iopt_area_iter_next ( area , 0 , ULONG_MAX ) ) {
struct iopt_pages * pages = area - > pages ;
if ( ! pages )
continue ;
mutex_lock ( & pages - > mutex ) ;
interval_tree_remove ( & area - > pages_node , & pages - > domains_itree ) ;
WARN_ON ( area - > storage_domain ! = domain ) ;
area - > storage_domain = NULL ;
iopt_area_unfill_domain ( area , pages , domain ) ;
mutex_unlock ( & pages - > mutex ) ;
}
}
/**
* iopt_fill_domain ( ) - Fill a domain with PFNs
* @ iopt : io_pagetable to act on
* @ domain : domain to fill
*
* Fill the domain with PFNs from every area in the iopt . On failure the domain
* is left unchanged .
*/
static int iopt_fill_domain ( struct io_pagetable * iopt ,
struct iommu_domain * domain )
{
struct iopt_area * end_area ;
struct iopt_area * area ;
int rc ;
lockdep_assert_held ( & iopt - > iova_rwsem ) ;
lockdep_assert_held_write ( & iopt - > domains_rwsem ) ;
for ( area = iopt_area_iter_first ( iopt , 0 , ULONG_MAX ) ; area ;
area = iopt_area_iter_next ( area , 0 , ULONG_MAX ) ) {
struct iopt_pages * pages = area - > pages ;
if ( ! pages )
continue ;
mutex_lock ( & pages - > mutex ) ;
rc = iopt_area_fill_domain ( area , domain ) ;
if ( rc ) {
mutex_unlock ( & pages - > mutex ) ;
goto out_unfill ;
}
if ( ! area - > storage_domain ) {
WARN_ON ( iopt - > next_domain_id ! = 0 ) ;
area - > storage_domain = domain ;
interval_tree_insert ( & area - > pages_node ,
& pages - > domains_itree ) ;
}
mutex_unlock ( & pages - > mutex ) ;
}
return 0 ;
out_unfill :
end_area = area ;
for ( area = iopt_area_iter_first ( iopt , 0 , ULONG_MAX ) ; area ;
area = iopt_area_iter_next ( area , 0 , ULONG_MAX ) ) {
struct iopt_pages * pages = area - > pages ;
if ( area = = end_area )
break ;
if ( ! pages )
continue ;
mutex_lock ( & pages - > mutex ) ;
if ( iopt - > next_domain_id = = 0 ) {
interval_tree_remove ( & area - > pages_node ,
& pages - > domains_itree ) ;
area - > storage_domain = NULL ;
}
iopt_area_unfill_domain ( area , pages , domain ) ;
mutex_unlock ( & pages - > mutex ) ;
}
return rc ;
}
/* All existing area's conform to an increased page size */
static int iopt_check_iova_alignment ( struct io_pagetable * iopt ,
unsigned long new_iova_alignment )
{
unsigned long align_mask = new_iova_alignment - 1 ;
struct iopt_area * area ;
lockdep_assert_held ( & iopt - > iova_rwsem ) ;
lockdep_assert_held ( & iopt - > domains_rwsem ) ;
for ( area = iopt_area_iter_first ( iopt , 0 , ULONG_MAX ) ; area ;
area = iopt_area_iter_next ( area , 0 , ULONG_MAX ) )
if ( ( iopt_area_iova ( area ) & align_mask ) | |
( iopt_area_length ( area ) & align_mask ) | |
( area - > page_offset & align_mask ) )
return - EADDRINUSE ;
return 0 ;
}
int iopt_table_add_domain ( struct io_pagetable * iopt ,
struct iommu_domain * domain )
{
const struct iommu_domain_geometry * geometry = & domain - > geometry ;
struct iommu_domain * iter_domain ;
unsigned int new_iova_alignment ;
unsigned long index ;
int rc ;
down_write ( & iopt - > domains_rwsem ) ;
down_write ( & iopt - > iova_rwsem ) ;
xa_for_each ( & iopt - > domains , index , iter_domain ) {
if ( WARN_ON ( iter_domain = = domain ) ) {
rc = - EEXIST ;
goto out_unlock ;
}
}
/*
* The io page size drives the iova_alignment . Internally the iopt_pages
* works in PAGE_SIZE units and we adjust when mapping sub - PAGE_SIZE
* objects into the iommu_domain .
*
* A iommu_domain must always be able to accept PAGE_SIZE to be
* compatible as we can ' t guarantee higher contiguity .
*/
new_iova_alignment = max_t ( unsigned long ,
1UL < < __ffs ( domain - > pgsize_bitmap ) ,
iopt - > iova_alignment ) ;
if ( new_iova_alignment > PAGE_SIZE ) {
rc = - EINVAL ;
goto out_unlock ;
}
if ( new_iova_alignment ! = iopt - > iova_alignment ) {
rc = iopt_check_iova_alignment ( iopt , new_iova_alignment ) ;
if ( rc )
goto out_unlock ;
}
/* No area exists that is outside the allowed domain aperture */
if ( geometry - > aperture_start ! = 0 ) {
rc = iopt_reserve_iova ( iopt , 0 , geometry - > aperture_start - 1 ,
domain ) ;
if ( rc )
goto out_reserved ;
}
if ( geometry - > aperture_end ! = ULONG_MAX ) {
rc = iopt_reserve_iova ( iopt , geometry - > aperture_end + 1 ,
ULONG_MAX , domain ) ;
if ( rc )
goto out_reserved ;
}
rc = xa_reserve ( & iopt - > domains , iopt - > next_domain_id , GFP_KERNEL ) ;
if ( rc )
goto out_reserved ;
rc = iopt_fill_domain ( iopt , domain ) ;
if ( rc )
goto out_release ;
iopt - > iova_alignment = new_iova_alignment ;
xa_store ( & iopt - > domains , iopt - > next_domain_id , domain , GFP_KERNEL ) ;
iopt - > next_domain_id + + ;
up_write ( & iopt - > iova_rwsem ) ;
up_write ( & iopt - > domains_rwsem ) ;
return 0 ;
out_release :
xa_release ( & iopt - > domains , iopt - > next_domain_id ) ;
out_reserved :
__iopt_remove_reserved_iova ( iopt , domain ) ;
out_unlock :
up_write ( & iopt - > iova_rwsem ) ;
up_write ( & iopt - > domains_rwsem ) ;
return rc ;
}
static int iopt_calculate_iova_alignment ( struct io_pagetable * iopt )
{
unsigned long new_iova_alignment ;
struct iommufd_access * access ;
struct iommu_domain * domain ;
unsigned long index ;
lockdep_assert_held_write ( & iopt - > iova_rwsem ) ;
lockdep_assert_held ( & iopt - > domains_rwsem ) ;
/* See batch_iommu_map_small() */
if ( iopt - > disable_large_pages )
new_iova_alignment = PAGE_SIZE ;
else
new_iova_alignment = 1 ;
xa_for_each ( & iopt - > domains , index , domain )
new_iova_alignment = max_t ( unsigned long ,
1UL < < __ffs ( domain - > pgsize_bitmap ) ,
new_iova_alignment ) ;
xa_for_each ( & iopt - > access_list , index , access )
new_iova_alignment = max_t ( unsigned long ,
access - > iova_alignment ,
new_iova_alignment ) ;
if ( new_iova_alignment > iopt - > iova_alignment ) {
int rc ;
rc = iopt_check_iova_alignment ( iopt , new_iova_alignment ) ;
if ( rc )
return rc ;
}
iopt - > iova_alignment = new_iova_alignment ;
return 0 ;
}
void iopt_table_remove_domain ( struct io_pagetable * iopt ,
struct iommu_domain * domain )
{
struct iommu_domain * iter_domain = NULL ;
unsigned long index ;
down_write ( & iopt - > domains_rwsem ) ;
down_write ( & iopt - > iova_rwsem ) ;
xa_for_each ( & iopt - > domains , index , iter_domain )
if ( iter_domain = = domain )
break ;
if ( WARN_ON ( iter_domain ! = domain ) | | index > = iopt - > next_domain_id )
goto out_unlock ;
/*
* Compress the xarray to keep it linear by swapping the entry to erase
* with the tail entry and shrinking the tail .
*/
iopt - > next_domain_id - - ;
iter_domain = xa_erase ( & iopt - > domains , iopt - > next_domain_id ) ;
if ( index ! = iopt - > next_domain_id )
xa_store ( & iopt - > domains , index , iter_domain , GFP_KERNEL ) ;
iopt_unfill_domain ( iopt , domain ) ;
__iopt_remove_reserved_iova ( iopt , domain ) ;
WARN_ON ( iopt_calculate_iova_alignment ( iopt ) ) ;
out_unlock :
up_write ( & iopt - > iova_rwsem ) ;
up_write ( & iopt - > domains_rwsem ) ;
}
/**
* iopt_area_split - Split an area into two parts at iova
* @ area : The area to split
* @ iova : Becomes the last of a new area
*
* This splits an area into two . It is part of the VFIO compatibility to allow
* poking a hole in the mapping . The two areas continue to point at the same
* iopt_pages , just with different starting bytes .
*/
static int iopt_area_split ( struct iopt_area * area , unsigned long iova )
{
unsigned long alignment = area - > iopt - > iova_alignment ;
unsigned long last_iova = iopt_area_last_iova ( area ) ;
unsigned long start_iova = iopt_area_iova ( area ) ;
unsigned long new_start = iova + 1 ;
struct io_pagetable * iopt = area - > iopt ;
struct iopt_pages * pages = area - > pages ;
struct iopt_area * lhs ;
struct iopt_area * rhs ;
int rc ;
lockdep_assert_held_write ( & iopt - > iova_rwsem ) ;
if ( iova = = start_iova | | iova = = last_iova )
return 0 ;
if ( ! pages | | area - > prevent_access )
return - EBUSY ;
if ( new_start & ( alignment - 1 ) | |
iopt_area_start_byte ( area , new_start ) & ( alignment - 1 ) )
return - EINVAL ;
lhs = kzalloc ( sizeof ( * area ) , GFP_KERNEL_ACCOUNT ) ;
if ( ! lhs )
return - ENOMEM ;
rhs = kzalloc ( sizeof ( * area ) , GFP_KERNEL_ACCOUNT ) ;
if ( ! rhs ) {
rc = - ENOMEM ;
goto err_free_lhs ;
}
mutex_lock ( & pages - > mutex ) ;
/*
* Splitting is not permitted if an access exists , we don ' t track enough
* information to split existing accesses .
*/
if ( area - > num_accesses ) {
rc = - EINVAL ;
goto err_unlock ;
}
/*
* Splitting is not permitted if a domain could have been mapped with
* huge pages .
*/
if ( area - > storage_domain & & ! iopt - > disable_large_pages ) {
rc = - EINVAL ;
goto err_unlock ;
}
interval_tree_remove ( & area - > node , & iopt - > area_itree ) ;
rc = iopt_insert_area ( iopt , lhs , area - > pages , start_iova ,
iopt_area_start_byte ( area , start_iova ) ,
( new_start - 1 ) - start_iova + 1 ,
area - > iommu_prot ) ;
if ( WARN_ON ( rc ) )
goto err_insert ;
rc = iopt_insert_area ( iopt , rhs , area - > pages , new_start ,
iopt_area_start_byte ( area , new_start ) ,
last_iova - new_start + 1 , area - > iommu_prot ) ;
if ( WARN_ON ( rc ) )
goto err_remove_lhs ;
lhs - > storage_domain = area - > storage_domain ;
lhs - > pages = area - > pages ;
rhs - > storage_domain = area - > storage_domain ;
rhs - > pages = area - > pages ;
kref_get ( & rhs - > pages - > kref ) ;
kfree ( area ) ;
mutex_unlock ( & pages - > mutex ) ;
/*
* No change to domains or accesses because the pages hasn ' t been
* changed
*/
return 0 ;
err_remove_lhs :
interval_tree_remove ( & lhs - > node , & iopt - > area_itree ) ;
err_insert :
interval_tree_insert ( & area - > node , & iopt - > area_itree ) ;
err_unlock :
mutex_unlock ( & pages - > mutex ) ;
kfree ( rhs ) ;
err_free_lhs :
kfree ( lhs ) ;
return rc ;
}
/*
 * Split areas so that each IOVA in @iovas becomes the last IOVA of an area.
 * IOVAs that are not covered by any area are skipped. Stops at the first
 * split that fails and returns its error; returns 0 otherwise.
 */
int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
		  size_t num_iovas)
{
	int rc = 0;
	/* Same type as num_iovas: avoids a signed/unsigned comparison */
	size_t i;

	down_write(&iopt->iova_rwsem);
	for (i = 0; i < num_iovas; i++) {
		struct iopt_area *area;

		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
		if (!area)
			continue;
		rc = iopt_area_split(area, iovas[i]);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	return rc;
}
void iopt_enable_large_pages ( struct io_pagetable * iopt )
{
int rc ;
down_write ( & iopt - > domains_rwsem ) ;
down_write ( & iopt - > iova_rwsem ) ;
WRITE_ONCE ( iopt - > disable_large_pages , false ) ;
rc = iopt_calculate_iova_alignment ( iopt ) ;
WARN_ON ( rc ) ;
up_write ( & iopt - > iova_rwsem ) ;
up_write ( & iopt - > domains_rwsem ) ;
}
int iopt_disable_large_pages ( struct io_pagetable * iopt )
{
int rc = 0 ;
down_write ( & iopt - > domains_rwsem ) ;
down_write ( & iopt - > iova_rwsem ) ;
if ( iopt - > disable_large_pages )
goto out_unlock ;
/* Won't do it if domains already have pages mapped in them */
if ( ! xa_empty ( & iopt - > domains ) & &
! RB_EMPTY_ROOT ( & iopt - > area_itree . rb_root ) ) {
rc = - EINVAL ;
goto out_unlock ;
}
WRITE_ONCE ( iopt - > disable_large_pages , true ) ;
rc = iopt_calculate_iova_alignment ( iopt ) ;
if ( rc )
WRITE_ONCE ( iopt - > disable_large_pages , false ) ;
out_unlock :
up_write ( & iopt - > iova_rwsem ) ;
up_write ( & iopt - > domains_rwsem ) ;
return rc ;
}
int iopt_add_access ( struct io_pagetable * iopt , struct iommufd_access * access )
{
int rc ;
down_write ( & iopt - > domains_rwsem ) ;
down_write ( & iopt - > iova_rwsem ) ;
rc = xa_alloc ( & iopt - > access_list , & access - > iopt_access_list_id , access ,
xa_limit_16b , GFP_KERNEL_ACCOUNT ) ;
if ( rc )
goto out_unlock ;
rc = iopt_calculate_iova_alignment ( iopt ) ;
if ( rc ) {
xa_erase ( & iopt - > access_list , access - > iopt_access_list_id ) ;
goto out_unlock ;
}
out_unlock :
up_write ( & iopt - > iova_rwsem ) ;
up_write ( & iopt - > domains_rwsem ) ;
return rc ;
}
void iopt_remove_access ( struct io_pagetable * iopt ,
struct iommufd_access * access )
{
down_write ( & iopt - > domains_rwsem ) ;
down_write ( & iopt - > iova_rwsem ) ;
WARN_ON ( xa_erase ( & iopt - > access_list , access - > iopt_access_list_id ) ! =
access ) ;
WARN_ON ( iopt_calculate_iova_alignment ( iopt ) ) ;
up_write ( & iopt - > iova_rwsem ) ;
up_write ( & iopt - > domains_rwsem ) ;
}
/*
 * Narrow the valid_iova_itree to include reserved ranges from a group.
 *
 * Every reserved region reported for @group, except those of type
 * IOMMU_RESV_DIRECT_RELAXABLE, is reserved in @iopt with @device as owner.
 * When @sw_msi_start is not NULL it receives the start of a software MSI
 * region, or 0 once a region of type IOMMU_RESV_MSI is seen.
 *
 * Returns 0 on success. On failure every reservation owned by @device is
 * removed again and the error is returned.
 */
int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt,
					  struct device *device,
					  struct iommu_group *group,
					  phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	struct iommu_resv_region *tmp;
	LIST_HEAD(group_resv_regions);
	int rc;

	down_write(&iopt->iova_rwsem);
	rc = iommu_get_group_resv_regions(group, &group_resv_regions);
	/*
	 * Even on failure, free whatever was already placed on the list so a
	 * partially built list is not leaked. Freeing an empty list is a no-op.
	 */
	if (rc)
		goto out_free_resv;

	list_for_each_entry(resv, &group_resv_regions, list) {
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		/*
		 * The presence of any 'real' MSI regions should take precedence
		 * over the software-managed one if the IOMMU driver happens to
		 * advertise both types.
		 */
		if (sw_msi_start && resv->type == IOMMU_RESV_MSI) {
			*sw_msi_start = 0;
			sw_msi_start = NULL;
		}
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI)
			*sw_msi_start = resv->start;

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, device);
		if (rc)
			goto out_reserved;
	}
	rc = 0;
	goto out_free_resv;

out_reserved:
	__iopt_remove_reserved_iova(iopt, device);
out_free_resv:
	list_for_each_entry_safe(resv, tmp, &group_resv_regions, list)
		kfree(resv);
	up_write(&iopt->iova_rwsem);
	return rc;
}