syzkaller found a race where IOMMUFD_DESTROY increments the refcount: obj = iommufd_get_object(ucmd->ictx, cmd->id, IOMMUFD_OBJ_ANY); if (IS_ERR(obj)) return PTR_ERR(obj); iommufd_ref_to_users(obj); /* See iommufd_ref_to_users() */ if (!iommufd_object_destroy_user(ucmd->ictx, obj)) As part of the sequence to join the two existing primitives together. Allowing the refcount to be elevated without holding the destroy_rwsem violates the assumption that all temporary refcount elevations are protected by destroy_rwsem. Racing IOMMUFD_DESTROY with iommufd_object_destroy_user() will cause spurious failures: WARNING: CPU: 0 PID: 3076 at drivers/iommu/iommufd/device.c:477 iommufd_access_destroy+0x18/0x20 drivers/iommu/iommufd/device.c:478 Modules linked in: CPU: 0 PID: 3076 Comm: syz-executor.0 Not tainted 6.3.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/03/2023 RIP: 0010:iommufd_access_destroy+0x18/0x20 drivers/iommu/iommufd/device.c:477 Code: e8 3d 4e 00 00 84 c0 74 01 c3 0f 0b c3 0f 1f 44 00 00 f3 0f 1e fa 48 89 fe 48 8b bf a8 00 00 00 e8 1d 4e 00 00 84 c0 74 01 c3 <0f> 0b c3 0f 1f 44 00 00 41 57 41 56 41 55 4c 8d ae d0 00 00 00 41 RSP: 0018:ffffc90003067e08 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff888109ea0300 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000000 RDI: 00000000ffffffff RBP: 0000000000000004 R08: 0000000000000000 R09: ffff88810bbb3500 R10: ffff88810bbb3e48 R11: 0000000000000000 R12: ffffc90003067e88 R13: ffffc90003067ea8 R14: ffff888101249800 R15: 00000000fffffffe FS: 00007ff7254fe6c0(0000) GS:ffff888237c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000555557262da8 CR3: 000000010a6fd000 CR4: 0000000000350ef0 Call Trace: <TASK> iommufd_test_create_access drivers/iommu/iommufd/selftest.c:596 [inline] iommufd_test+0x71c/0xcf0 drivers/iommu/iommufd/selftest.c:813 iommufd_fops_ioctl+0x10f/0x1b0 drivers/iommu/iommufd/main.c:337 vfs_ioctl 
fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl fs/ioctl.c:856 [inline] __x64_sys_ioctl+0x84/0xc0 fs/ioctl.c:856 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0x80 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd The solution is to not increment the refcount on the IOMMUFD_DESTROY path at all. Instead use the xa_lock to serialize everything. The refcount check == 1 and xa_erase can be done under a single critical region. This avoids the need for any refcount incrementing. It has the downside that if userspace races destroy with other operations it will get an EBUSY instead of waiting, but this kind of racing is already dangerous. Fixes: 2ff4bed7fee7 ("iommufd: File descriptor, context, kconfig and makefiles") Link: https://lore.kernel.org/r/2-v1-85aacb2af554+bc-iommufd_syz3_jgg@nvidia.com Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reported-by: syzbot+7574ebfe589049630608@syzkaller.appspotmail.com Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
342 lines
11 KiB
C
342 lines
11 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
|
|
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
|
|
*/
|
|
#ifndef __IOMMUFD_PRIVATE_H
|
|
#define __IOMMUFD_PRIVATE_H
|
|
|
|
#include <linux/rwsem.h>
|
|
#include <linux/xarray.h>
|
|
#include <linux/refcount.h>
|
|
#include <linux/uaccess.h>
|
|
|
|
struct iommu_domain;
|
|
struct iommu_group;
|
|
struct iommu_option;
|
|
struct iommufd_device;
|
|
|
|
struct iommufd_ctx {
|
|
struct file *file;
|
|
struct xarray objects;
|
|
|
|
u8 account_mode;
|
|
/* Compatibility with VFIO no iommu */
|
|
u8 no_iommu_mode;
|
|
struct iommufd_ioas *vfio_ioas;
|
|
};
|
|
|
|
/*
|
|
* The IOVA to PFN map. The map automatically copies the PFNs into multiple
|
|
* domains and permits sharing of PFNs between io_pagetable instances. This
|
|
* supports both a design where IOAS's are 1:1 with a domain (eg because the
|
|
* domain is HW customized), or where the IOAS is 1:N with multiple generic
|
|
* domains. The io_pagetable holds an interval tree of iopt_areas which point
|
|
* to shared iopt_pages which hold the pfns mapped to the page table.
|
|
*
|
|
* The locking order is domains_rwsem -> iova_rwsem -> pages::mutex
|
|
*/
|
|
struct io_pagetable {
|
|
struct rw_semaphore domains_rwsem;
|
|
struct xarray domains;
|
|
struct xarray access_list;
|
|
unsigned int next_domain_id;
|
|
|
|
struct rw_semaphore iova_rwsem;
|
|
struct rb_root_cached area_itree;
|
|
/* IOVA that cannot become reserved, struct iopt_allowed */
|
|
struct rb_root_cached allowed_itree;
|
|
/* IOVA that cannot be allocated, struct iopt_reserved */
|
|
struct rb_root_cached reserved_itree;
|
|
u8 disable_large_pages;
|
|
unsigned long iova_alignment;
|
|
};
|
|
|
|
void iopt_init_table(struct io_pagetable *iopt);
|
|
void iopt_destroy_table(struct io_pagetable *iopt);
|
|
int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
|
|
unsigned long length, struct list_head *pages_list);
|
|
void iopt_free_pages_list(struct list_head *pages_list);
|
|
enum {
|
|
IOPT_ALLOC_IOVA = 1 << 0,
|
|
};
|
|
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
|
|
unsigned long *iova, void __user *uptr,
|
|
unsigned long length, int iommu_prot,
|
|
unsigned int flags);
|
|
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
|
|
unsigned long length, unsigned long *dst_iova,
|
|
int iommu_prot, unsigned int flags);
|
|
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
|
|
unsigned long length, unsigned long *unmapped);
|
|
int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped);
|
|
|
|
void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
|
|
unsigned long length);
|
|
int iopt_table_add_domain(struct io_pagetable *iopt,
|
|
struct iommu_domain *domain);
|
|
void iopt_table_remove_domain(struct io_pagetable *iopt,
|
|
struct iommu_domain *domain);
|
|
int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt,
|
|
struct device *device,
|
|
struct iommu_group *group,
|
|
phys_addr_t *sw_msi_start);
|
|
int iopt_set_allow_iova(struct io_pagetable *iopt,
|
|
struct rb_root_cached *allowed_iova);
|
|
int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
|
|
unsigned long last, void *owner);
|
|
void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner);
|
|
int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
|
|
size_t num_iovas);
|
|
void iopt_enable_large_pages(struct io_pagetable *iopt);
|
|
int iopt_disable_large_pages(struct io_pagetable *iopt);
|
|
|
|
struct iommufd_ucmd {
|
|
struct iommufd_ctx *ictx;
|
|
void __user *ubuffer;
|
|
u32 user_size;
|
|
void *cmd;
|
|
};
|
|
|
|
int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
|
|
unsigned long arg);
|
|
|
|
/* Copy the response in ucmd->cmd back to userspace. */
|
|
static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
|
|
size_t cmd_len)
|
|
{
|
|
if (copy_to_user(ucmd->ubuffer, ucmd->cmd,
|
|
min_t(size_t, ucmd->user_size, cmd_len)))
|
|
return -EFAULT;
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Type tag stored in struct iommufd_object::type and passed to
 * iommufd_get_object(). IOMMUFD_OBJ_ANY shares the value of IOMMUFD_OBJ_NONE.
 * The selftest type exists only when CONFIG_IOMMUFD_TEST is set.
 */
enum iommufd_object_type {
	IOMMUFD_OBJ_NONE,
	IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
	IOMMUFD_OBJ_DEVICE,
	IOMMUFD_OBJ_HW_PAGETABLE,
	IOMMUFD_OBJ_IOAS,
	IOMMUFD_OBJ_ACCESS,
#ifdef CONFIG_IOMMUFD_TEST
	IOMMUFD_OBJ_SELFTEST,
#endif
};
|
|
|
|
/* Base struct for all objects with a userspace ID handle. */
|
|
struct iommufd_object {
|
|
struct rw_semaphore destroy_rwsem;
|
|
refcount_t users;
|
|
enum iommufd_object_type type;
|
|
unsigned int id;
|
|
};
|
|
|
|
static inline bool iommufd_lock_obj(struct iommufd_object *obj)
|
|
{
|
|
if (!down_read_trylock(&obj->destroy_rwsem))
|
|
return false;
|
|
if (!refcount_inc_not_zero(&obj->users)) {
|
|
up_read(&obj->destroy_rwsem);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
|
|
enum iommufd_object_type type);
|
|
static inline void iommufd_put_object(struct iommufd_object *obj)
|
|
{
|
|
refcount_dec(&obj->users);
|
|
up_read(&obj->destroy_rwsem);
|
|
}
|
|
|
|
/**
|
|
* iommufd_ref_to_users() - Switch from destroy_rwsem to users refcount
|
|
* protection
|
|
* @obj - Object to release
|
|
*
|
|
* Objects have two refcount protections (destroy_rwsem and the refcount_t
|
|
* users). Holding either of these will prevent the object from being destroyed.
|
|
*
|
|
* Depending on the use case, one protection or the other is appropriate. In
|
|
* most cases references are being protected by the destroy_rwsem. This allows
|
|
* orderly destruction of the object because iommufd_object_destroy_user() will
|
|
* wait for it to become unlocked. However, as a rwsem, it cannot be held across
|
|
* a system call return. So cases that have longer term needs must switch
|
|
* to the weaker users refcount_t.
|
|
*
|
|
* With users protection iommufd_object_destroy_user() will return false,
|
|
* refusing to destroy the object, causing -EBUSY to userspace.
|
|
*/
|
|
static inline void iommufd_ref_to_users(struct iommufd_object *obj)
|
|
{
|
|
up_read(&obj->destroy_rwsem);
|
|
/* iommufd_lock_obj() obtains users as well */
|
|
}
|
|
void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj);
|
|
void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
|
|
struct iommufd_object *obj);
|
|
void iommufd_object_finalize(struct iommufd_ctx *ictx,
|
|
struct iommufd_object *obj);
|
|
void __iommufd_object_destroy_user(struct iommufd_ctx *ictx,
|
|
struct iommufd_object *obj, bool allow_fail);
|
|
static inline void iommufd_object_destroy_user(struct iommufd_ctx *ictx,
|
|
struct iommufd_object *obj)
|
|
{
|
|
__iommufd_object_destroy_user(ictx, obj, false);
|
|
}
|
|
static inline void iommufd_object_deref_user(struct iommufd_ctx *ictx,
|
|
struct iommufd_object *obj)
|
|
{
|
|
__iommufd_object_destroy_user(ictx, obj, true);
|
|
}
|
|
|
|
struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
|
|
size_t size,
|
|
enum iommufd_object_type type);
|
|
|
|
#define iommufd_object_alloc(ictx, ptr, type) \
|
|
container_of(_iommufd_object_alloc( \
|
|
ictx, \
|
|
sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \
|
|
offsetof(typeof(*(ptr)), \
|
|
obj) != 0), \
|
|
type), \
|
|
typeof(*(ptr)), obj)
|
|
|
|
/*
|
|
* The IO Address Space (IOAS) pagetable is a virtual page table backed by the
|
|
* io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The
|
|
* mapping is copied into all of the associated domains and made available to
|
|
* in-kernel users.
|
|
*
|
|
* Every iommu_domain that is created is wrapped in a iommufd_hw_pagetable
|
|
* object. When we go to attach a device to an IOAS we need to get an
|
|
* iommu_domain and wrapping iommufd_hw_pagetable for it.
|
|
*
|
|
* An iommu_domain & iommfd_hw_pagetable will be automatically selected
|
|
* for a device based on the hwpt_list. If no suitable iommu_domain
|
|
* is found a new iommu_domain will be created.
|
|
*/
|
|
struct iommufd_ioas {
|
|
struct iommufd_object obj;
|
|
struct io_pagetable iopt;
|
|
struct mutex mutex;
|
|
struct list_head hwpt_list;
|
|
};
|
|
|
|
static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ctx *ictx,
|
|
u32 id)
|
|
{
|
|
return container_of(iommufd_get_object(ictx, id,
|
|
IOMMUFD_OBJ_IOAS),
|
|
struct iommufd_ioas, obj);
|
|
}
|
|
|
|
/* IOAS object allocation and destruction */
struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx);
int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd);
void iommufd_ioas_destroy(struct iommufd_object *obj);

/* IOAS handlers taking a struct iommufd_ucmd */
int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd);
int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd);
int iommufd_ioas_map(struct iommufd_ucmd *ucmd);
int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
int iommufd_option_rlimit_mode(struct iommu_option *cmd,
			       struct iommufd_ctx *ictx);

int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
|
|
|
|
/*
|
|
* A HW pagetable is called an iommu_domain inside the kernel. This user object
|
|
* allows directly creating and inspecting the domains. Domains that have kernel
|
|
* owned page tables will be associated with an iommufd_ioas that provides the
|
|
* IOVA to PFN map.
|
|
*/
|
|
struct iommufd_hw_pagetable {
|
|
struct iommufd_object obj;
|
|
struct iommufd_ioas *ioas;
|
|
struct iommu_domain *domain;
|
|
bool auto_domain : 1;
|
|
bool enforce_cache_coherency : 1;
|
|
bool msi_cookie : 1;
|
|
/* Head at iommufd_ioas::hwpt_list */
|
|
struct list_head hwpt_item;
|
|
struct mutex devices_lock;
|
|
struct list_head devices;
|
|
};
|
|
|
|
struct iommufd_hw_pagetable *
|
|
iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
|
|
struct iommufd_device *idev, bool immediate_attach);
|
|
int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
|
|
struct iommufd_device *idev);
|
|
void iommufd_hw_pagetable_detach(struct iommufd_hw_pagetable *hwpt,
|
|
struct iommufd_device *idev);
|
|
void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
|
|
|
|
/*
|
|
* A iommufd_device object represents the binding relationship between a
|
|
* consuming driver and the iommufd. These objects are created/destroyed by
|
|
* external drivers, not by userspace.
|
|
*/
|
|
struct iommufd_device {
|
|
struct iommufd_object obj;
|
|
struct iommufd_ctx *ictx;
|
|
struct iommufd_hw_pagetable *hwpt;
|
|
/* Head at iommufd_hw_pagetable::devices */
|
|
struct list_head devices_item;
|
|
/* always the physical device */
|
|
struct device *dev;
|
|
struct iommu_group *group;
|
|
bool enforce_cache_coherency;
|
|
};
|
|
|
|
void iommufd_device_destroy(struct iommufd_object *obj);
|
|
|
|
struct iommufd_access {
|
|
struct iommufd_object obj;
|
|
struct iommufd_ctx *ictx;
|
|
struct iommufd_ioas *ioas;
|
|
const struct iommufd_access_ops *ops;
|
|
void *data;
|
|
unsigned long iova_alignment;
|
|
u32 iopt_access_list_id;
|
|
};
|
|
|
|
int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access);
|
|
void iopt_remove_access(struct io_pagetable *iopt,
|
|
struct iommufd_access *access);
|
|
void iommufd_access_destroy_object(struct iommufd_object *obj);
|
|
|
|
#ifdef CONFIG_IOMMUFD_TEST
|
|
int iommufd_test(struct iommufd_ucmd *ucmd);
|
|
void iommufd_selftest_destroy(struct iommufd_object *obj);
|
|
extern size_t iommufd_test_memory_limit;
|
|
void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
|
|
unsigned int ioas_id, u64 *iova, u32 *flags);
|
|
bool iommufd_should_fail(void);
|
|
void __init iommufd_test_init(void);
|
|
void iommufd_test_exit(void);
|
|
bool iommufd_selftest_is_mock_dev(struct device *dev);
|
|
#else
|
|
static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
|
|
unsigned int ioas_id,
|
|
u64 *iova, u32 *flags)
|
|
{
|
|
}
|
|
static inline bool iommufd_should_fail(void)
|
|
{
|
|
return false;
|
|
}
|
|
static inline void __init iommufd_test_init(void)
|
|
{
|
|
}
|
|
static inline void iommufd_test_exit(void)
|
|
{
|
|
}
|
|
static inline bool iommufd_selftest_is_mock_dev(struct device *dev)
|
|
{
|
|
return false;
|
|
}
|
|
#endif
|
|
#endif
|