From d525a5b8cf39791b47a3d61b36dcb43e1d6fbde8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:11:57 -0300 Subject: [PATCH 01/34] iommufd: Move isolated msi enforcement to iommufd_device_bind() With the recent rework this no longer needs to be done at domain attachment time, we know if the device is usable by iommufd when we bind it. The value of msi_device_has_isolated_msi() is not allowed to change while a driver is bound. Link: https://lore.kernel.org/r/1-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 38 ++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 59fec5783eb9..d4cc39426309 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -60,6 +60,26 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, if (!group) return ERR_PTR(-ENODEV); + /* + * For historical compat with VFIO the insecure interrupt path is + * allowed if the module parameter is set. Secure/Isolated means that a + * MemWr operation from the device (eg a simple DMA) cannot trigger an + * interrupt outside this iommufd context. + */ + if (!iommufd_selftest_is_mock_dev(dev) && + !iommu_group_has_isolated_msi(group)) { + if (!allow_unsafe_interrupts) { + rc = -EPERM; + goto out_group_put; + } + + dev_warn( + dev, + "MSI interrupts are not secure, they cannot be isolated by the platform. " + "Check that platform features like interrupt remapping are enabled. " + "Use the \"allow_unsafe_interrupts\" module parameter to override\n"); + } + rc = iommu_device_claim_dma_owner(dev, ictx); if (rc) goto out_group_put; @@ -188,24 +208,6 @@ static int iommufd_device_setup_msi(struct iommufd_device *idev, */ hwpt->msi_cookie = true; } - - /* - * For historical compat with VFIO the insecure interrupt path is - * allowed if the module parameter is set. Insecure means that a MemWr - * operation from the device (eg a simple DMA) cannot trigger an - * interrupt outside this iommufd context. - */ - if (!iommufd_selftest_is_mock_dev(idev->dev) && - !iommu_group_has_isolated_msi(idev->group)) { - if (!allow_unsafe_interrupts) - return -EPERM; - - dev_warn( - idev->dev, - "MSI interrupts are not secure, they cannot be isolated by the platform. " - "Check that platform features like interrupt remapping are enabled. " - "Use the \"allow_unsafe_interrupts\" module parameter to override\n"); - } return 0; } From 3a3329a7f14a7a0a8c30a12c7ed9f1f77f8efaa1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:11:58 -0300 Subject: [PATCH 02/34] iommufd: Add iommufd_group When the hwpt to device attachment is fairly static we could get away with the simple approach of keeping track of the groups via a device list. But with replace this is infeasible. Add an automatically managed struct that is 1:1 with the iommu_group per-ictx so we can store the necessary tracking information there. 
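As a rough usage sketch (not part of the patch; it simply mirrors the iommufd_device_bind() changes in the hunk below, with error unwinding trimmed):

    struct iommufd_group *igroup;

    igroup = iommufd_get_group(ictx, dev);  /* take a ref on the per-ictx group, creating it on first use */
    if (IS_ERR(igroup))
            return ERR_CAST(igroup);
    /* ... the reference is stored in the iommufd_device ... */
    iommufd_put_group(igroup);              /* dropped again on unbind or on the error path */
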
Link: https://lore.kernel.org/r/2-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 144 +++++++++++++++++++++--- drivers/iommu/iommufd/iommufd_private.h | 9 +- drivers/iommu/iommufd/main.c | 2 + 3 files changed, 137 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index d4cc39426309..dbd517d97df8 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -15,13 +15,121 @@ MODULE_PARM_DESC( "Allow IOMMUFD to bind to devices even if the platform cannot isolate " "the MSI interrupt window. Enabling this is a security weakness."); +static void iommufd_group_release(struct kref *kref) +{ + struct iommufd_group *igroup = + container_of(kref, struct iommufd_group, ref); + + xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup, + NULL, GFP_KERNEL); + iommu_group_put(igroup->group); + kfree(igroup); +} + +static void iommufd_put_group(struct iommufd_group *group) +{ + kref_put(&group->ref, iommufd_group_release); +} + +static bool iommufd_group_try_get(struct iommufd_group *igroup, + struct iommu_group *group) +{ + if (!igroup) + return false; + /* + * group ID's cannot be re-used until the group is put back which does + * not happen if we could get an igroup pointer under the xa_lock. + */ + if (WARN_ON(igroup->group != group)) + return false; + return kref_get_unless_zero(&igroup->ref); +} + +/* + * iommufd needs to store some more data for each iommu_group, we keep a + * parallel xarray indexed by iommu_group id to hold this instead of putting it + * in the core structure. To keep things simple the iommufd_group memory is + * unique within the iommufd_ctx. This makes it easy to check there are no + * memory leaks. + */ +static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx, + struct device *dev) +{ + struct iommufd_group *new_igroup; + struct iommufd_group *cur_igroup; + struct iommufd_group *igroup; + struct iommu_group *group; + unsigned int id; + + group = iommu_group_get(dev); + if (!group) + return ERR_PTR(-ENODEV); + + id = iommu_group_id(group); + + xa_lock(&ictx->groups); + igroup = xa_load(&ictx->groups, id); + if (iommufd_group_try_get(igroup, group)) { + xa_unlock(&ictx->groups); + iommu_group_put(group); + return igroup; + } + xa_unlock(&ictx->groups); + + new_igroup = kzalloc(sizeof(*new_igroup), GFP_KERNEL); + if (!new_igroup) { + iommu_group_put(group); + return ERR_PTR(-ENOMEM); + } + + kref_init(&new_igroup->ref); + /* group reference moves into new_igroup */ + new_igroup->group = group; + + /* + * The ictx is not additionally refcounted here becase all objects using + * an igroup must put it before their destroy completes. + */ + new_igroup->ictx = ictx; + + /* + * We dropped the lock so igroup is invalid. NULL is a safe and likely + * value to assume for the xa_cmpxchg algorithm. 
+ */ + cur_igroup = NULL; + xa_lock(&ictx->groups); + while (true) { + igroup = __xa_cmpxchg(&ictx->groups, id, cur_igroup, new_igroup, + GFP_KERNEL); + if (xa_is_err(igroup)) { + xa_unlock(&ictx->groups); + iommufd_put_group(new_igroup); + return ERR_PTR(xa_err(igroup)); + } + + /* new_group was successfully installed */ + if (cur_igroup == igroup) { + xa_unlock(&ictx->groups); + return new_igroup; + } + + /* Check again if the current group is any good */ + if (iommufd_group_try_get(igroup, group)) { + xa_unlock(&ictx->groups); + iommufd_put_group(new_igroup); + return igroup; + } + cur_igroup = igroup; + } +} + void iommufd_device_destroy(struct iommufd_object *obj) { struct iommufd_device *idev = container_of(obj, struct iommufd_device, obj); iommu_device_release_dma_owner(idev->dev); - iommu_group_put(idev->group); + iommufd_put_group(idev->igroup); if (!iommufd_selftest_is_mock_dev(idev->dev)) iommufd_ctx_put(idev->ictx); } @@ -46,7 +154,7 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, struct device *dev, u32 *id) { struct iommufd_device *idev; - struct iommu_group *group; + struct iommufd_group *igroup; int rc; /* @@ -56,9 +164,9 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) return ERR_PTR(-EINVAL); - group = iommu_group_get(dev); - if (!group) - return ERR_PTR(-ENODEV); + igroup = iommufd_get_group(ictx, dev); + if (IS_ERR(igroup)) + return ERR_CAST(igroup); /* * For historical compat with VFIO the insecure interrupt path is @@ -67,7 +175,7 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, * interrupt outside this iommufd context. */ if (!iommufd_selftest_is_mock_dev(dev) && - !iommu_group_has_isolated_msi(group)) { + !iommu_group_has_isolated_msi(igroup->group)) { if (!allow_unsafe_interrupts) { rc = -EPERM; goto out_group_put; @@ -97,8 +205,8 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY); /* The calling driver is a user until iommufd_device_unbind() */ refcount_inc(&idev->obj.users); - /* group refcount moves into iommufd_device */ - idev->group = group; + /* igroup refcount moves into iommufd_device */ + idev->igroup = igroup; /* * If the caller fails after this success it must call @@ -113,7 +221,7 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, out_release_owner: iommu_device_release_dma_owner(dev); out_group_put: - iommu_group_put(group); + iommufd_put_group(igroup); return ERR_PTR(rc); } EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD); @@ -138,7 +246,8 @@ bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group) xa_lock(&ictx->objects); xa_for_each(&ictx->objects, index, obj) { if (obj->type == IOMMUFD_OBJ_DEVICE && - container_of(obj, struct iommufd_device, obj)->group == group) { + container_of(obj, struct iommufd_device, obj) + ->igroup->group == group) { xa_unlock(&ictx->objects); return true; } @@ -212,14 +321,14 @@ static int iommufd_device_setup_msi(struct iommufd_device *idev, } static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt, - struct iommu_group *group) + struct iommufd_group *igroup) { struct iommufd_device *cur_dev; lockdep_assert_held(&hwpt->devices_lock); list_for_each_entry(cur_dev, &hwpt->devices, devices_item) - if (cur_dev->group == group) + if (cur_dev->igroup->group == igroup->group) return true; return false; } @@ -253,7 +362,8 @@ int 
iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, } rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev, - idev->group, &sw_msi_start); + idev->igroup->group, + &sw_msi_start); if (rc) return rc; @@ -265,8 +375,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * FIXME: Hack around missing a device-centric iommu api, only attach to * the group once for the first device that is in the group. */ - if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) { - rc = iommu_attach_group(hwpt->domain, idev->group); + if (!iommufd_hw_pagetable_has_group(hwpt, idev->igroup)) { + rc = iommu_attach_group(hwpt->domain, idev->igroup->group); if (rc) goto err_unresv; } @@ -279,8 +389,8 @@ err_unresv: void iommufd_hw_pagetable_detach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev) { - if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) - iommu_detach_group(hwpt->domain, idev->group); + if (!iommufd_hw_pagetable_has_group(hwpt, idev->igroup)) + iommu_detach_group(hwpt->domain, idev->igroup->group); iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 3dcaf86aab97..285ac4db8b45 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -17,6 +17,7 @@ struct iommufd_device; struct iommufd_ctx { struct file *file; struct xarray objects; + struct xarray groups; u8 account_mode; /* Compatibility with VFIO no iommu */ @@ -262,6 +263,12 @@ void iommufd_hw_pagetable_detach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); +struct iommufd_group { + struct kref ref; + struct iommufd_ctx *ictx; + struct iommu_group *group; +}; + /* * A iommufd_device object represents the binding relationship between a * consuming driver and the iommufd. These objects are created/destroyed by @@ -270,12 +277,12 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); struct iommufd_device { struct iommufd_object obj; struct iommufd_ctx *ictx; + struct iommufd_group *igroup; struct iommufd_hw_pagetable *hwpt; /* Head at iommufd_hw_pagetable::devices */ struct list_head devices_item; /* always the physical device */ struct device *dev; - struct iommu_group *group; bool enforce_cache_coherency; }; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 4bbb20dff430..34fefc0a7833 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -183,6 +183,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) } xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); + xa_init(&ictx->groups); ictx->file = filp; filp->private_data = ictx; return 0; @@ -218,6 +219,7 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp) if (WARN_ON(!destroyed)) break; } + WARN_ON(!xa_empty(&ictx->groups)); kfree(ictx); return 0; } From 91a2e17e243f70e0aec29facf44946439fce9195 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:11:59 -0300 Subject: [PATCH 03/34] iommufd: Replace the hwpt->devices list with iommufd_group The devices list was used as a simple way to avoid having per-group information. Now that this seems to be unavoidable, just commit to per-group information fully and remove the devices list from the HWPT. 
The iommufd_group stores the currently assigned HWPT for the entire group and we can manage the per-device attach/detach with a list in the iommufd_group. For destruction the flow is organized to make the following patches easier, the actual call to iommufd_object_destroy_user() is done at the top of the call chain without holding any locks. The HWPT to be destroyed is returned out from the locked region to make this possible. Later patches create locking that requires this. Link: https://lore.kernel.org/r/3-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 100 +++++++++++------------- drivers/iommu/iommufd/hw_pagetable.c | 22 +----- drivers/iommu/iommufd/iommufd_private.h | 13 ++- 3 files changed, 54 insertions(+), 81 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index dbd517d97df8..dbbfd50c0fce 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -20,9 +20,12 @@ static void iommufd_group_release(struct kref *kref) struct iommufd_group *igroup = container_of(kref, struct iommufd_group, ref); + WARN_ON(igroup->hwpt || !list_empty(&igroup->device_list)); + xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup, NULL, GFP_KERNEL); iommu_group_put(igroup->group); + mutex_destroy(&igroup->lock); kfree(igroup); } @@ -83,6 +86,8 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx, } kref_init(&new_igroup->ref); + mutex_init(&new_igroup->lock); + INIT_LIST_HEAD(&new_igroup->device_list); /* group reference moves into new_igroup */ new_igroup->group = group; @@ -320,29 +325,18 @@ static int iommufd_device_setup_msi(struct iommufd_device *idev, return 0; } -static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt, - struct iommufd_group *igroup) -{ - struct iommufd_device *cur_dev; - - lockdep_assert_held(&hwpt->devices_lock); - - list_for_each_entry(cur_dev, &hwpt->devices, devices_item) - if (cur_dev->igroup->group == igroup->group) - return true; - return false; -} - int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev) { phys_addr_t sw_msi_start = PHYS_ADDR_MAX; int rc; - lockdep_assert_held(&hwpt->devices_lock); + mutex_lock(&idev->igroup->lock); - if (WARN_ON(idev->hwpt)) - return -EINVAL; + if (idev->igroup->hwpt != NULL && idev->igroup->hwpt != hwpt) { + rc = -EINVAL; + goto err_unlock; + } /* * Try to upgrade the domain we have, it is an iommu driver bug to @@ -356,8 +350,9 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, hwpt->domain->ops->enforce_cache_coherency( hwpt->domain); if (!hwpt->enforce_cache_coherency) { - WARN_ON(list_empty(&hwpt->devices)); - return -EINVAL; + WARN_ON(list_empty(&idev->igroup->device_list)); + rc = -EINVAL; + goto err_unlock; } } @@ -365,51 +360,52 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, idev->igroup->group, &sw_msi_start); if (rc) - return rc; + goto err_unlock; rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start); if (rc) goto err_unresv; /* - * FIXME: Hack around missing a device-centric iommu api, only attach to - * the group once for the first device that is in the group. + * Only attach to the group once for the first device that is in the + * group. All the other devices will follow this attachment. 
The user + * should attach every device individually to the hwpt as the per-device + * reserved regions are only updated during individual device + * attachment. */ - if (!iommufd_hw_pagetable_has_group(hwpt, idev->igroup)) { + if (list_empty(&idev->igroup->device_list)) { rc = iommu_attach_group(hwpt->domain, idev->igroup->group); if (rc) goto err_unresv; + idev->igroup->hwpt = hwpt; } + refcount_inc(&hwpt->obj.users); + list_add_tail(&idev->group_item, &idev->igroup->device_list); + mutex_unlock(&idev->igroup->lock); return 0; err_unresv: iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); +err_unlock: + mutex_unlock(&idev->igroup->lock); return rc; } -void iommufd_hw_pagetable_detach(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) +struct iommufd_hw_pagetable * +iommufd_hw_pagetable_detach(struct iommufd_device *idev) { - if (!iommufd_hw_pagetable_has_group(hwpt, idev->igroup)) + struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt; + + mutex_lock(&idev->igroup->lock); + list_del(&idev->group_item); + if (list_empty(&idev->igroup->device_list)) { iommu_detach_group(hwpt->domain, idev->igroup->group); + idev->igroup->hwpt = NULL; + } iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); -} + mutex_unlock(&idev->igroup->lock); -static int iommufd_device_do_attach(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt) -{ - int rc; - - mutex_lock(&hwpt->devices_lock); - rc = iommufd_hw_pagetable_attach(hwpt, idev); - if (rc) - goto out_unlock; - - idev->hwpt = hwpt; - refcount_inc(&hwpt->obj.users); - list_add(&idev->devices_item, &hwpt->devices); -out_unlock: - mutex_unlock(&hwpt->devices_lock); - return rc; + /* Caller must destroy hwpt */ + return hwpt; } /* @@ -418,7 +414,7 @@ out_unlock: * Automatic domain selection will never pick a manually created domain. 
*/ static int iommufd_device_auto_get_domain(struct iommufd_device *idev, - struct iommufd_ioas *ioas) + struct iommufd_ioas *ioas, u32 *pt_id) { struct iommufd_hw_pagetable *hwpt; int rc; @@ -435,7 +431,7 @@ static int iommufd_device_auto_get_domain(struct iommufd_device *idev, if (!iommufd_lock_obj(&hwpt->obj)) continue; - rc = iommufd_device_do_attach(idev, hwpt); + rc = iommufd_hw_pagetable_attach(hwpt, idev); iommufd_put_object(&hwpt->obj); /* @@ -445,6 +441,7 @@ static int iommufd_device_auto_get_domain(struct iommufd_device *idev, */ if (rc == -EINVAL) continue; + *pt_id = hwpt->obj.id; goto out_unlock; } @@ -454,6 +451,7 @@ static int iommufd_device_auto_get_domain(struct iommufd_device *idev, goto out_unlock; } hwpt->auto_domain = true; + *pt_id = hwpt->obj.id; mutex_unlock(&ioas->mutex); iommufd_object_finalize(idev->ictx, &hwpt->obj); @@ -489,7 +487,7 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) struct iommufd_hw_pagetable *hwpt = container_of(pt_obj, struct iommufd_hw_pagetable, obj); - rc = iommufd_device_do_attach(idev, hwpt); + rc = iommufd_hw_pagetable_attach(hwpt, idev); if (rc) goto out_put_pt_obj; break; @@ -498,7 +496,7 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) struct iommufd_ioas *ioas = container_of(pt_obj, struct iommufd_ioas, obj); - rc = iommufd_device_auto_get_domain(idev, ioas); + rc = iommufd_device_auto_get_domain(idev, ioas, pt_id); if (rc) goto out_put_pt_obj; break; @@ -509,7 +507,6 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) } refcount_inc(&idev->obj.users); - *pt_id = idev->hwpt->obj.id; rc = 0; out_put_pt_obj: @@ -527,14 +524,9 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); */ void iommufd_device_detach(struct iommufd_device *idev) { - struct iommufd_hw_pagetable *hwpt = idev->hwpt; - - mutex_lock(&hwpt->devices_lock); - list_del(&idev->devices_item); - idev->hwpt = NULL; - iommufd_hw_pagetable_detach(hwpt, idev); - mutex_unlock(&hwpt->devices_lock); + struct iommufd_hw_pagetable *hwpt; + hwpt = iommufd_hw_pagetable_detach(idev); if (hwpt->auto_domain) iommufd_object_destroy_user(idev->ictx, &hwpt->obj); else diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 6cdb6749d359..bdb76cdb1dc3 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -11,8 +11,6 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) struct iommufd_hw_pagetable *hwpt = container_of(obj, struct iommufd_hw_pagetable, obj); - WARN_ON(!list_empty(&hwpt->devices)); - if (!list_empty(&hwpt->hwpt_item)) { mutex_lock(&hwpt->ioas->mutex); list_del(&hwpt->hwpt_item); @@ -25,7 +23,6 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) iommu_domain_free(hwpt->domain); refcount_dec(&hwpt->ioas->obj.users); - mutex_destroy(&hwpt->devices_lock); } /** @@ -52,9 +49,7 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, if (IS_ERR(hwpt)) return hwpt; - INIT_LIST_HEAD(&hwpt->devices); INIT_LIST_HEAD(&hwpt->hwpt_item); - mutex_init(&hwpt->devices_lock); /* Pairs with iommufd_hw_pagetable_destroy() */ refcount_inc(&ioas->obj.users); hwpt->ioas = ioas; @@ -65,8 +60,6 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } - mutex_lock(&hwpt->devices_lock); - /* * immediate_attach exists only to accommodate iommu drivers that cannot * directly allocate a domain. 
These drivers do not finish creating the @@ -76,29 +69,18 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, if (immediate_attach) { rc = iommufd_hw_pagetable_attach(hwpt, idev); if (rc) - goto out_unlock; + goto out_abort; } rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain); if (rc) goto out_detach; list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list); - - if (immediate_attach) { - /* See iommufd_device_do_attach() */ - refcount_inc(&hwpt->obj.users); - idev->hwpt = hwpt; - list_add(&idev->devices_item, &hwpt->devices); - } - - mutex_unlock(&hwpt->devices_lock); return hwpt; out_detach: if (immediate_attach) - iommufd_hw_pagetable_detach(hwpt, idev); -out_unlock: - mutex_unlock(&hwpt->devices_lock); + iommufd_hw_pagetable_detach(idev); out_abort: iommufd_object_abort_and_destroy(ictx, &hwpt->obj); return ERR_PTR(rc); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 285ac4db8b45..2d3fd2672939 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -250,8 +250,6 @@ struct iommufd_hw_pagetable { bool msi_cookie : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; - struct mutex devices_lock; - struct list_head devices; }; struct iommufd_hw_pagetable * @@ -259,14 +257,17 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, bool immediate_attach); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); -void iommufd_hw_pagetable_detach(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); +struct iommufd_hw_pagetable * +iommufd_hw_pagetable_detach(struct iommufd_device *idev); void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); struct iommufd_group { struct kref ref; + struct mutex lock; struct iommufd_ctx *ictx; struct iommu_group *group; + struct iommufd_hw_pagetable *hwpt; + struct list_head device_list; }; /* @@ -278,9 +279,7 @@ struct iommufd_device { struct iommufd_object obj; struct iommufd_ctx *ictx; struct iommufd_group *igroup; - struct iommufd_hw_pagetable *hwpt; - /* Head at iommufd_hw_pagetable::devices */ - struct list_head devices_item; + struct list_head group_item; /* always the physical device */ struct device *dev; bool enforce_cache_coherency; From 8d0e2e9d93d2b25b62c7fb6faf8c3cc31c6c6626 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:00 -0300 Subject: [PATCH 04/34] iommu: Export iommu_get_resv_regions() iommufd wants to use this in the next patch. For some reason the iommu_put_resv_regions() was already exported. Link: https://lore.kernel.org/r/4-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index caaf563d38ae..b0804fbad6f9 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2642,6 +2642,14 @@ int iommu_set_pgtable_quirks(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_set_pgtable_quirks); +/** + * iommu_get_resv_regions - get reserved regions + * @dev: device for which to get reserved regions + * @list: reserved region list for device + * + * This returns a list of reserved IOVA regions specific to this device. + * A domain user should not map IOVA in these ranges. 
+ */ void iommu_get_resv_regions(struct device *dev, struct list_head *list) { const struct iommu_ops *ops = dev_iommu_ops(dev); @@ -2649,9 +2657,10 @@ void iommu_get_resv_regions(struct device *dev, struct list_head *list) if (ops->get_resv_regions) ops->get_resv_regions(dev, list); } +EXPORT_SYMBOL_GPL(iommu_get_resv_regions); /** - * iommu_put_resv_regions - release resered regions + * iommu_put_resv_regions - release reserved regions * @dev: device for which to free reserved regions * @list: reserved region list for device * From 34f327a985ff11e538aa52b3e4842328815ce5de Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:01 -0300 Subject: [PATCH 05/34] iommufd: Keep track of each device's reserved regions instead of groups The driver facing API in the iommu core makes the reserved regions per-device. An algorithm in the core code consolidates the regions of all the devices in a group to return the group view. To allow for devices to be hotplugged into the group iommufd would re-load the entire group's reserved regions for each device, just in case they changed. Further iommufd already has to deal with duplicated/overlapping reserved regions as it must union all the groups together. Thus simplify all of this to just use the device reserved regions interface directly from the iommu driver. Link: https://lore.kernel.org/r/5-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Suggested-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 5 ++--- drivers/iommu/iommufd/io_pagetable.c | 27 ++++++++++--------------- drivers/iommu/iommufd/iommufd_private.h | 7 +++---- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index dbbfd50c0fce..7fd5c184b560 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -356,9 +356,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, } } - rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev, - idev->igroup->group, - &sw_msi_start); + rc = iopt_table_enforce_dev_resv_regions( + &hwpt->ioas->iopt, idev->dev, &sw_msi_start); if (rc) goto err_unlock; diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 724c4c574241..112bea5a84e4 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -1169,25 +1169,22 @@ void iopt_remove_access(struct io_pagetable *iopt, up_write(&iopt->domains_rwsem); } -/* Narrow the valid_iova_itree to include reserved ranges from a group. */ -int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, - struct device *device, - struct iommu_group *group, - phys_addr_t *sw_msi_start) +/* Narrow the valid_iova_itree to include reserved ranges from a device. 
*/ +int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, + struct device *dev, + phys_addr_t *sw_msi_start) { struct iommu_resv_region *resv; - struct iommu_resv_region *tmp; - LIST_HEAD(group_resv_regions); + LIST_HEAD(resv_regions); unsigned int num_hw_msi = 0; unsigned int num_sw_msi = 0; int rc; down_write(&iopt->iova_rwsem); - rc = iommu_get_group_resv_regions(group, &group_resv_regions); - if (rc) - goto out_unlock; + /* FIXME: drivers allocate memory but there is no failure propogated */ + iommu_get_resv_regions(dev, &resv_regions); - list_for_each_entry(resv, &group_resv_regions, list) { + list_for_each_entry(resv, &resv_regions, list) { if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE) continue; @@ -1199,7 +1196,7 @@ int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, } rc = iopt_reserve_iova(iopt, resv->start, - resv->length - 1 + resv->start, device); + resv->length - 1 + resv->start, dev); if (rc) goto out_reserved; } @@ -1214,11 +1211,9 @@ int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, goto out_free_resv; out_reserved: - __iopt_remove_reserved_iova(iopt, device); + __iopt_remove_reserved_iova(iopt, dev); out_free_resv: - list_for_each_entry_safe(resv, tmp, &group_resv_regions, list) - kfree(resv); -out_unlock: + iommu_put_resv_regions(dev, &resv_regions); up_write(&iopt->iova_rwsem); return rc; } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 2d3fd2672939..206c0212ebfd 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -76,10 +76,9 @@ int iopt_table_add_domain(struct io_pagetable *iopt, struct iommu_domain *domain); void iopt_table_remove_domain(struct io_pagetable *iopt, struct iommu_domain *domain); -int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, - struct device *device, - struct iommu_group *group, - phys_addr_t *sw_msi_start); +int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, + struct device *dev, + phys_addr_t *sw_msi_start); int iopt_set_allow_iova(struct io_pagetable *iopt, struct rb_root_cached *allowed_iova); int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, From 269c5238c5b134ca8b18b98814b87ec995a59968 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:02 -0300 Subject: [PATCH 06/34] iommufd: Use the iommufd_group to avoid duplicate MSI setup This only needs to be done once per group, not once per device. The once per device was a way to make the device list work. Since we are abandoning this we can optimize things a bit. Link: https://lore.kernel.org/r/6-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 7fd5c184b560..eb90d6f9e10f 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -361,10 +361,6 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, if (rc) goto err_unlock; - rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start); - if (rc) - goto err_unresv; - /* * Only attach to the group once for the first device that is in the * group. All the other devices will follow this attachment. The user @@ -373,6 +369,10 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * attachment. 
*/ if (list_empty(&idev->igroup->device_list)) { + rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start); + if (rc) + goto err_unresv; + rc = iommu_attach_group(hwpt->domain, idev->igroup->group); if (rc) goto err_unresv; From 1d149ab2e0062a98b3676d6ee0aaa34be16f4d4f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:03 -0300 Subject: [PATCH 07/34] iommufd: Make sw_msi_start a group global The sw_msi_start is only set by the ARM drivers and it is always constant. Due to the way vfio/iommufd allow domains to be re-used between devices we have a built in assumption that there is only one value for sw_msi_start and it is global to the system. To make replace simpler where we may not reparse the iommu_get_resv_regions() move the sw_msi_start to the iommufd_group so it is always available once any HWPT has been attached. Link: https://lore.kernel.org/r/7-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 14 +++++++------- drivers/iommu/iommufd/iommufd_private.h | 1 + 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index eb90d6f9e10f..4b41ef5292a5 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -88,6 +88,7 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx, kref_init(&new_igroup->ref); mutex_init(&new_igroup->lock); INIT_LIST_HEAD(&new_igroup->device_list); + new_igroup->sw_msi_start = PHYS_ADDR_MAX; /* group reference moves into new_igroup */ new_igroup->group = group; @@ -292,10 +293,10 @@ u32 iommufd_device_to_id(struct iommufd_device *idev) } EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD); -static int iommufd_device_setup_msi(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - phys_addr_t sw_msi_start) +static int iommufd_group_setup_msi(struct iommufd_group *igroup, + struct iommufd_hw_pagetable *hwpt) { + phys_addr_t sw_msi_start = igroup->sw_msi_start; int rc; /* @@ -328,7 +329,6 @@ static int iommufd_device_setup_msi(struct iommufd_device *idev, int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev) { - phys_addr_t sw_msi_start = PHYS_ADDR_MAX; int rc; mutex_lock(&idev->igroup->lock); @@ -356,8 +356,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, } } - rc = iopt_table_enforce_dev_resv_regions( - &hwpt->ioas->iopt, idev->dev, &sw_msi_start); + rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev, + &idev->igroup->sw_msi_start); if (rc) goto err_unlock; @@ -369,7 +369,7 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * attachment. 
*/ if (list_empty(&idev->igroup->device_list)) { - rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start); + rc = iommufd_group_setup_msi(idev->igroup, hwpt); if (rc) goto err_unresv; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 206c0212ebfd..6e11d6ef71ee 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -267,6 +267,7 @@ struct iommufd_group { struct iommu_group *group; struct iommufd_hw_pagetable *hwpt; struct list_head device_list; + phys_addr_t sw_msi_start; }; /* From d03f1336fd91b87ad218dd398265332b1cd2c68c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:04 -0300 Subject: [PATCH 08/34] iommufd: Move putting a hwpt to a helper function Next patch will need to call this from two places. Link: https://lore.kernel.org/r/8-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 6 +----- drivers/iommu/iommufd/iommufd_private.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 4b41ef5292a5..89cef2d2a2b1 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -526,11 +526,7 @@ void iommufd_device_detach(struct iommufd_device *idev) struct iommufd_hw_pagetable *hwpt; hwpt = iommufd_hw_pagetable_detach(idev); - if (hwpt->auto_domain) - iommufd_object_destroy_user(idev->ictx, &hwpt->obj); - else - refcount_dec(&hwpt->obj.users); - + iommufd_hw_pagetable_put(idev->ictx, hwpt); refcount_dec(&idev->obj.users); } EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 6e11d6ef71ee..8bdbef8bd878 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -260,6 +260,16 @@ struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev); void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); +static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, + struct iommufd_hw_pagetable *hwpt) +{ + lockdep_assert_not_held(&hwpt->ioas->mutex); + if (hwpt->auto_domain) + iommufd_object_destroy_user(ictx, &hwpt->obj); + else + refcount_dec(&hwpt->obj.users); +} + struct iommufd_group { struct kref ref; struct mutex lock; From 17bad52708b457c4a0cbd7195ad813d6c4308b82 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:05 -0300 Subject: [PATCH 09/34] iommufd: Add enforced_cache_coherency to iommufd_hw_pagetable_alloc() Logically the HWPT should have the coherency set properly for the device that it is being created for when it is created. This was happening implicitly if the immediate_attach was set because iommufd_hw_pagetable_attach() does it as the first thing. Do it unconditionally so !immediate_attach works properly. 
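As an illustration (copied in simplified form from the hunk below, not a new interface), iommufd_hw_pagetable_alloc() now upgrades coherency before the domain is populated, regardless of immediate_attach:

    if (idev->enforce_cache_coherency) {
            rc = iommufd_hw_pagetable_enforce_cc(hwpt);
            if (WARN_ON(rc))
                    goto out_abort;
    }
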
Link: https://lore.kernel.org/r/9-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 19 ++++------------- drivers/iommu/iommufd/hw_pagetable.c | 27 +++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 1 + 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 89cef2d2a2b1..08bba99f2fb0 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -338,22 +338,11 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, goto err_unlock; } - /* - * Try to upgrade the domain we have, it is an iommu driver bug to - * report IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail - * enforce_cache_coherency when there are no devices attached to the - * domain. - */ - if (idev->enforce_cache_coherency && !hwpt->enforce_cache_coherency) { - if (hwpt->domain->ops->enforce_cache_coherency) - hwpt->enforce_cache_coherency = - hwpt->domain->ops->enforce_cache_coherency( - hwpt->domain); - if (!hwpt->enforce_cache_coherency) { - WARN_ON(list_empty(&idev->igroup->device_list)); - rc = -EINVAL; + /* Try to upgrade the domain we have */ + if (idev->enforce_cache_coherency) { + rc = iommufd_hw_pagetable_enforce_cc(hwpt); + if (rc) goto err_unlock; - } } rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev, diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index bdb76cdb1dc3..e0699d7f4c64 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -25,6 +25,20 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) refcount_dec(&hwpt->ioas->obj.users); } +int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) +{ + if (hwpt->enforce_cache_coherency) + return 0; + + if (hwpt->domain->ops->enforce_cache_coherency) + hwpt->enforce_cache_coherency = + hwpt->domain->ops->enforce_cache_coherency( + hwpt->domain); + if (!hwpt->enforce_cache_coherency) + return -EINVAL; + return 0; +} + /** * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device * @ictx: iommufd context @@ -60,6 +74,19 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } + /* + * Set the coherency mode before we do iopt_table_add_domain() as some + * iommus have a per-PTE bit that controls it and need to decide before + * doing any maps. It is an iommu driver bug to report + * IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail enforce_cache_coherency on + * a new domain. + */ + if (idev->enforce_cache_coherency) { + rc = iommufd_hw_pagetable_enforce_cc(hwpt); + if (WARN_ON(rc)) + goto out_abort; + } + /* * immediate_attach exists only to accommodate iommu drivers that cannot * directly allocate a domain. 
These drivers do not finish creating the diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 8bdbef8bd878..a6210f51b956 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -254,6 +254,7 @@ struct iommufd_hw_pagetable { struct iommufd_hw_pagetable * iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, bool immediate_attach); +int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); struct iommufd_hw_pagetable * From 70eadc7fc7ef29bfe0e361376983822b5e36dd67 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:06 -0300 Subject: [PATCH 10/34] iommufd: Allow a hwpt to be aborted after allocation During creation the hwpt must have the ioas->mutex held until the object is finalized. This means we need to be able to call iommufd_object_abort_and_destroy() while holding the mutex. Since iommufd_hw_pagetable_destroy() also needs the mutex this is problematic. Fix it by creating a special abort op for the object that can assume the caller is holding the lock, as required by the contract. The next patch will add another iommufd_object_abort_and_destroy() for a hwpt. Fixes: e8d57210035b ("iommufd: Add kAPI toward external drivers for physical devices") Link: https://lore.kernel.org/r/10-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 19 +++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 1 + drivers/iommu/iommufd/main.c | 7 ++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index e0699d7f4c64..2087b51d9807 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -25,6 +25,21 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) refcount_dec(&hwpt->ioas->obj.users); } +void iommufd_hw_pagetable_abort(struct iommufd_object *obj) +{ + struct iommufd_hw_pagetable *hwpt = + container_of(obj, struct iommufd_hw_pagetable, obj); + + /* The ioas->mutex must be held until finalize is called. */ + lockdep_assert_held(&hwpt->ioas->mutex); + + if (!list_empty(&hwpt->hwpt_item)) { + list_del_init(&hwpt->hwpt_item); + iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + } + iommufd_hw_pagetable_destroy(obj); +} + int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) { if (hwpt->enforce_cache_coherency) @@ -49,6 +64,10 @@ int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) * Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT * will be linked to the given ioas and upon return the underlying iommu_domain * is fully popoulated. + * + * The caller must hold the ioas->mutex until after + * iommufd_object_abort_and_destroy() or iommufd_object_finalize() is called on + * the returned hwpt. 
*/ struct iommufd_hw_pagetable * iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index a6210f51b956..47fed9feefd0 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -260,6 +260,7 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev); void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); +void iommufd_hw_pagetable_abort(struct iommufd_object *obj); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 34fefc0a7833..438f55e6aa88 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -24,6 +24,7 @@ struct iommufd_object_ops { void (*destroy)(struct iommufd_object *obj); + void (*abort)(struct iommufd_object *obj); }; static const struct iommufd_object_ops iommufd_object_ops[]; static struct miscdevice vfio_misc_dev; @@ -95,7 +96,10 @@ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj) { - iommufd_object_ops[obj->type].destroy(obj); + if (iommufd_object_ops[obj->type].abort) + iommufd_object_ops[obj->type].abort(obj); + else + iommufd_object_ops[obj->type].destroy(obj); iommufd_object_abort(ictx, obj); } @@ -425,6 +429,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { }, [IOMMUFD_OBJ_HW_PAGETABLE] = { .destroy = iommufd_hw_pagetable_destroy, + .abort = iommufd_hw_pagetable_abort, }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { From 31422dff187b243c58f3a97d16bbe9e9ada639fe Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:07 -0300 Subject: [PATCH 11/34] iommufd: Fix locking around hwpt allocation Due to the auto_domains mechanism the ioas->mutex must be held until the hwpt is completely setup by iommufd_object_abort_and_destroy() or iommufd_object_finalize(). This prevents a concurrent iommufd_device_auto_get_domain() from seeing an incompletely initialized object through the ioas->hwpt_list. To make this more consistent move the unlock until after finalize. 
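The resulting ordering in iommufd_device_auto_get_domain() looks like the simplified sketch below (taken from the hunk that follows):

    mutex_lock(&ioas->mutex);
    hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev, true);
    ...
    hwpt->auto_domain = true;
    *pt_id = hwpt->obj.id;
    iommufd_object_finalize(idev->ictx, &hwpt->obj);
    mutex_unlock(&ioas->mutex);     /* drop the mutex only after the object is finalized */
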
Fixes: e8d57210035b ("iommufd: Add kAPI toward external drivers for physical devices") Link: https://lore.kernel.org/r/11-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 08bba99f2fb0..9b9244c5b0ce 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -441,8 +441,8 @@ static int iommufd_device_auto_get_domain(struct iommufd_device *idev, hwpt->auto_domain = true; *pt_id = hwpt->obj.id; - mutex_unlock(&ioas->mutex); iommufd_object_finalize(idev->ictx, &hwpt->obj); + mutex_unlock(&ioas->mutex); return 0; out_unlock: mutex_unlock(&ioas->mutex); From ea2d6124b52389aa837403f5debf73620d3f441f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:08 -0300 Subject: [PATCH 12/34] iommufd: Reorganize iommufd_device_attach into iommufd_device_change_pt The code flow for first time attaching a PT and replacing a PT is very similar except for the lowest do_attach step. Reorganize this so that the do_attach step is a function pointer. Replace requires destroying the old HWPT once it is replaced. This destruction cannot be done under all the locks that are held in the function pointer, so the signature allows returning a HWPT which will be destroyed by the caller after everything is unlocked. Link: https://lore.kernel.org/r/12-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 165 +++++++++++++++++++++++---------- 1 file changed, 114 insertions(+), 51 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 9b9244c5b0ce..138bcdb7f14f 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -396,16 +396,41 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev) return hwpt; } +static struct iommufd_hw_pagetable * +iommufd_device_do_attach(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt) +{ + int rc; + + rc = iommufd_hw_pagetable_attach(hwpt, idev); + if (rc) + return ERR_PTR(rc); + return NULL; +} + +typedef struct iommufd_hw_pagetable *(*attach_fn)( + struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt); + /* * When automatically managing the domains we search for a compatible domain in * the iopt and if one is found use it, otherwise create a new domain. * Automatic domain selection will never pick a manually created domain. */ -static int iommufd_device_auto_get_domain(struct iommufd_device *idev, - struct iommufd_ioas *ioas, u32 *pt_id) +static struct iommufd_hw_pagetable * +iommufd_device_auto_get_domain(struct iommufd_device *idev, + struct iommufd_ioas *ioas, u32 *pt_id, + attach_fn do_attach) { + /* + * iommufd_hw_pagetable_attach() is called by + * iommufd_hw_pagetable_alloc() in immediate attachment mode, same as + * iommufd_device_do_attach(). So if we are in this mode then we prefer + * to use the immediate_attach path as it supports drivers that can't + * directly allocate a domain. 
+ */ + bool immediate_attach = do_attach == iommufd_device_do_attach; + struct iommufd_hw_pagetable *destroy_hwpt; struct iommufd_hw_pagetable *hwpt; - int rc; /* * There is no differentiation when domains are allocated, so any domain @@ -419,38 +444,101 @@ static int iommufd_device_auto_get_domain(struct iommufd_device *idev, if (!iommufd_lock_obj(&hwpt->obj)) continue; - rc = iommufd_hw_pagetable_attach(hwpt, idev); - iommufd_put_object(&hwpt->obj); - - /* - * -EINVAL means the domain is incompatible with the device. - * Other error codes should propagate to userspace as failure. - * Success means the domain is attached. - */ - if (rc == -EINVAL) - continue; + destroy_hwpt = (*do_attach)(idev, hwpt); + if (IS_ERR(destroy_hwpt)) { + iommufd_put_object(&hwpt->obj); + /* + * -EINVAL means the domain is incompatible with the + * device. Other error codes should propagate to + * userspace as failure. Success means the domain is + * attached. + */ + if (PTR_ERR(destroy_hwpt) == -EINVAL) + continue; + goto out_unlock; + } *pt_id = hwpt->obj.id; + iommufd_put_object(&hwpt->obj); goto out_unlock; } - hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev, true); + hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev, + immediate_attach); if (IS_ERR(hwpt)) { - rc = PTR_ERR(hwpt); + destroy_hwpt = ERR_CAST(hwpt); goto out_unlock; } + + if (!immediate_attach) { + destroy_hwpt = (*do_attach)(idev, hwpt); + if (IS_ERR(destroy_hwpt)) + goto out_abort; + } else { + destroy_hwpt = NULL; + } + hwpt->auto_domain = true; *pt_id = hwpt->obj.id; iommufd_object_finalize(idev->ictx, &hwpt->obj); mutex_unlock(&ioas->mutex); - return 0; + return destroy_hwpt; + +out_abort: + iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj); out_unlock: mutex_unlock(&ioas->mutex); - return rc; + return destroy_hwpt; +} + +static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, + attach_fn do_attach) +{ + struct iommufd_hw_pagetable *destroy_hwpt; + struct iommufd_object *pt_obj; + + pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) + return PTR_ERR(pt_obj); + + switch (pt_obj->type) { + case IOMMUFD_OBJ_HW_PAGETABLE: { + struct iommufd_hw_pagetable *hwpt = + container_of(pt_obj, struct iommufd_hw_pagetable, obj); + + destroy_hwpt = (*do_attach)(idev, hwpt); + if (IS_ERR(destroy_hwpt)) + goto out_put_pt_obj; + break; + } + case IOMMUFD_OBJ_IOAS: { + struct iommufd_ioas *ioas = + container_of(pt_obj, struct iommufd_ioas, obj); + + destroy_hwpt = iommufd_device_auto_get_domain(idev, ioas, pt_id, + do_attach); + if (IS_ERR(destroy_hwpt)) + goto out_put_pt_obj; + break; + } + default: + destroy_hwpt = ERR_PTR(-EINVAL); + goto out_put_pt_obj; + } + iommufd_put_object(pt_obj); + + /* This destruction has to be after we unlock everything */ + if (destroy_hwpt) + iommufd_hw_pagetable_put(idev->ictx, destroy_hwpt); + return 0; + +out_put_pt_obj: + iommufd_put_object(pt_obj); + return PTR_ERR(destroy_hwpt); } /** - * iommufd_device_attach - Connect a device from an iommu_domain + * iommufd_device_attach - Connect a device to an iommu_domain * @idev: device to attach * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE * Output the IOMMUFD_OBJ_HW_PAGETABLE ID @@ -463,43 +551,18 @@ out_unlock: */ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) { - struct iommufd_object *pt_obj; int rc; - pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY); - if (IS_ERR(pt_obj)) - return PTR_ERR(pt_obj); - - switch (pt_obj->type) { - case 
IOMMUFD_OBJ_HW_PAGETABLE: { - struct iommufd_hw_pagetable *hwpt = - container_of(pt_obj, struct iommufd_hw_pagetable, obj); - - rc = iommufd_hw_pagetable_attach(hwpt, idev); - if (rc) - goto out_put_pt_obj; - break; - } - case IOMMUFD_OBJ_IOAS: { - struct iommufd_ioas *ioas = - container_of(pt_obj, struct iommufd_ioas, obj); - - rc = iommufd_device_auto_get_domain(idev, ioas, pt_id); - if (rc) - goto out_put_pt_obj; - break; - } - default: - rc = -EINVAL; - goto out_put_pt_obj; - } + rc = iommufd_device_change_pt(idev, pt_id, &iommufd_device_do_attach); + if (rc) + return rc; + /* + * Pairs with iommufd_device_detach() - catches caller bugs attempting + * to destroy a device with an attachment. + */ refcount_inc(&idev->obj.users); - rc = 0; - -out_put_pt_obj: - iommufd_put_object(pt_obj); - return rc; + return 0; } EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); From addb665924f39f01dc1d6b2de3d21ebbc1232de6 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 17 Jul 2023 15:12:09 -0300 Subject: [PATCH 13/34] iommu: Introduce a new iommu_group_replace_domain() API qemu has a need to replace the translations associated with a domain when the guest does large-scale operations like switching between an IDENTITY domain and, say, dma-iommu.c. Currently, it does this by replacing all the mappings in a single domain, but this is very inefficient and means that domains have to be per-device rather than per-translation. Provide a high-level API to allow replacements of one domain with another. This is similar to a detach/attach cycle except it doesn't force the group to go to the blocking domain in-between. By removing this forced blocking domain the iommu driver has the opportunity to implement a non-disruptive replacement of the domain to the greatest extent its hardware allows. This allows the qemu emulation of the vIOMMU to be more complete, as real hardware often has a non-distruptive replacement capability. It could be possible to address this by simply removing the protection from the iommu_attach_group(), but it is not so clear if that is safe for the few users. Thus, add a new API to serve this new purpose. All drivers are already required to support changing between active UNMANAGED domains when using their attach_dev ops. This API is expected to be used only by IOMMUFD, so add to the iommu-priv header and mark it as IOMMUFD_INTERNAL. 
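A hedged sketch of the intended caller pattern; it matches how the later iommufd_device_do_replace() patch in this series uses the API:

    rc = iommu_group_replace_domain(igroup->group, hwpt->domain);
    if (rc)
            goto err_unresv;        /* the group stays attached to its previous domain */
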
Link: https://lore.kernel.org/r/13-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Tested-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu-priv.h | 10 ++++++++++ drivers/iommu/iommu.c | 27 +++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 drivers/iommu/iommu-priv.h diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h new file mode 100644 index 000000000000..7c8011bfd153 --- /dev/null +++ b/drivers/iommu/iommu-priv.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __LINUX_IOMMU_PRIV_H +#define __LINUX_IOMMU_PRIV_H + +#include + +int iommu_group_replace_domain(struct iommu_group *group, + struct iommu_domain *new_domain); + +#endif /* __LINUX_IOMMU_PRIV_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b0804fbad6f9..587619676573 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -34,6 +34,7 @@ #include #include "dma-iommu.h" +#include "iommu-priv.h" #include "iommu-sva.h" @@ -2114,6 +2115,32 @@ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_attach_group); +/** + * iommu_group_replace_domain - replace the domain that a group is attached to + * @new_domain: new IOMMU domain to replace with + * @group: IOMMU group that will be attached to the new domain + * + * This API allows the group to switch domains without being forced to go to + * the blocking domain in-between. + * + * If the currently attached domain is a core domain (e.g. a default_domain), + * it will act just like the iommu_attach_group(). + */ +int iommu_group_replace_domain(struct iommu_group *group, + struct iommu_domain *new_domain) +{ + int ret; + + if (!new_domain) + return -EINVAL; + + mutex_lock(&group->mutex); + ret = __iommu_group_set_domain(group, new_domain); + mutex_unlock(&group->mutex); + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommu_group_replace_domain, IOMMUFD_INTERNAL); + static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, From e88d4ec154a87884086d3e38f2dfc66aec11bf8c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:10 -0300 Subject: [PATCH 14/34] iommufd: Add iommufd_device_replace() Replace allows all the devices in a group to move in one step to a new HWPT. Further, the HWPT move is done without going through a blocking domain so that the IOMMU driver can implement some level of non-distruption to ongoing DMA if that has meaning for it (eg for future special driver domains) Replace uses a lot of the same logic as normal attach, except the actual domain change over has different restrictions, and we are careful to sequence things so that failure is going to leave everything the way it was, and not get trapped in a blocking domain or something if there is ENOMEM. 
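From an external driver's point of view the call is expected to look roughly like this sketch (the caller and the ID value are hypothetical; only iommufd_device_replace() and the pt_id in/out convention come from this patch):

    u32 pt_id = new_ioas_or_hwpt_id;        /* hypothetical ID picked by the caller */

    rc = iommufd_device_replace(idev, &pt_id);
    if (rc)
            return rc;      /* the existing attachment is left unchanged */
    /* on success pt_id now holds the IOMMUFD_OBJ_HW_PAGETABLE ID in use */
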
Link: https://lore.kernel.org/r/14-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 101 +++++++++++++++++++++++++++++++++ drivers/iommu/iommufd/main.c | 1 + 2 files changed, 102 insertions(+) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 138bcdb7f14f..57c0e81f5073 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -4,6 +4,7 @@ #include #include #include +#include "../iommu-priv.h" #include "io_pagetable.h" #include "iommufd_private.h" @@ -408,6 +409,84 @@ iommufd_device_do_attach(struct iommufd_device *idev, return NULL; } +static struct iommufd_hw_pagetable * +iommufd_device_do_replace(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt) +{ + struct iommufd_group *igroup = idev->igroup; + struct iommufd_hw_pagetable *old_hwpt; + unsigned int num_devices = 0; + struct iommufd_device *cur; + int rc; + + mutex_lock(&idev->igroup->lock); + + if (igroup->hwpt == NULL) { + rc = -EINVAL; + goto err_unlock; + } + + if (hwpt == igroup->hwpt) { + mutex_unlock(&idev->igroup->lock); + return NULL; + } + + /* Try to upgrade the domain we have */ + list_for_each_entry(cur, &igroup->device_list, group_item) { + num_devices++; + if (cur->enforce_cache_coherency) { + rc = iommufd_hw_pagetable_enforce_cc(hwpt); + if (rc) + goto err_unlock; + } + } + + old_hwpt = igroup->hwpt; + if (hwpt->ioas != old_hwpt->ioas) { + list_for_each_entry(cur, &igroup->device_list, group_item) { + rc = iopt_table_enforce_dev_resv_regions( + &hwpt->ioas->iopt, cur->dev, NULL); + if (rc) + goto err_unresv; + } + } + + rc = iommufd_group_setup_msi(idev->igroup, hwpt); + if (rc) + goto err_unresv; + + rc = iommu_group_replace_domain(igroup->group, hwpt->domain); + if (rc) + goto err_unresv; + + if (hwpt->ioas != old_hwpt->ioas) { + list_for_each_entry(cur, &igroup->device_list, group_item) + iopt_remove_reserved_iova(&old_hwpt->ioas->iopt, + cur->dev); + } + + igroup->hwpt = hwpt; + + /* + * Move the refcounts held by the device_list to the new hwpt. Retain a + * refcount for this thread as the caller will free it. + */ + refcount_add(num_devices, &hwpt->obj.users); + if (num_devices > 1) + WARN_ON(refcount_sub_and_test(num_devices - 1, + &old_hwpt->obj.users)); + mutex_unlock(&idev->igroup->lock); + + /* Caller must destroy old_hwpt */ + return old_hwpt; +err_unresv: + list_for_each_entry(cur, &igroup->device_list, group_item) + iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev); +err_unlock: + mutex_unlock(&idev->igroup->lock); + return ERR_PTR(rc); +} + typedef struct iommufd_hw_pagetable *(*attach_fn)( struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt); @@ -566,6 +645,28 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) } EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); +/** + * iommufd_device_replace - Change the device's iommu_domain + * @idev: device to change + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE + * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * + * This is the same as:: + * + * iommufd_device_detach(); + * iommufd_device_attach(); + * + * If it fails then no change is made to the attachment. The iommu driver may + * implement this so there is no disruption in translation. This can only be + * called if iommufd_device_attach() has already succeeded. 
+ */ +int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id) +{ + return iommufd_device_change_pt(idev, pt_id, + &iommufd_device_do_replace); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, IOMMUFD); + /** * iommufd_device_detach - Disconnect a device to an iommu_domain * @idev: device to detach diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 438f55e6aa88..521225c6a071 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -490,5 +490,6 @@ module_exit(iommufd_exit); MODULE_ALIAS_MISCDEV(VFIO_MINOR); MODULE_ALIAS("devname:vfio/vfio"); #endif +MODULE_IMPORT_NS(IOMMUFD_INTERNAL); MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); MODULE_LICENSE("GPL"); From 83f7bc6fdfd2fca11f35c061194d60f88acb3086 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:11 -0300 Subject: [PATCH 15/34] iommufd: Make destroy_rwsem use a lock class per object type The selftest invokes things like replace under the object lock of its idev which protects the idev in a similar way to a real user. Unfortunately this triggers lockdep. A lock class per type will solve the problem. Link: https://lore.kernel.org/r/15-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_private.h | 1 + drivers/iommu/iommufd/main.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 47fed9feefd0..9394f0e6692a 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -119,6 +119,7 @@ enum iommufd_object_type { #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif + IOMMUFD_OBJ_MAX, }; /* Base struct for all objects with a userspace ID handle. */ diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 521225c6a071..56cdf3c796a7 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -33,6 +33,7 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum iommufd_object_type type) { + static struct lock_class_key obj_keys[IOMMUFD_OBJ_MAX]; struct iommufd_object *obj; int rc; @@ -40,7 +41,15 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, if (!obj) return ERR_PTR(-ENOMEM); obj->type = type; - init_rwsem(&obj->destroy_rwsem); + /* + * In most cases the destroy_rwsem is obtained with try so it doesn't + * interact with lockdep, however on destroy we have to sleep. This + * means if we have to destroy an object while holding a get on another + * object it triggers lockdep. Using one locking class per object type + * is a simple and reasonable way to avoid this. + */ + __init_rwsem(&obj->destroy_rwsem, "iommufd_object::destroy_rwsem", + &obj_keys[type]); refcount_set(&obj->users, 1); /* From fa1ffdb9e2937d04657c33a146aa0d2cd39abba0 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 17 Jul 2023 15:12:12 -0300 Subject: [PATCH 16/34] iommufd/selftest: Test iommufd_device_replace() Allow the selftest to call the function on the mock idev, add some tests to exercise it. 
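As a hedged aside on the destroy_rwsem lock-class change above (illustration only, the names here are made up): a static per-type key array lets two objects of different types nest the same-named lock without lockdep reporting recursive locking:

	enum obj_type { OBJ_TYPE_A, OBJ_TYPE_B, OBJ_TYPE_MAX };

	struct obj {
		struct rw_semaphore destroy_rwsem;
	};

	static void obj_init(struct obj *obj, enum obj_type type)
	{
		static struct lock_class_key keys[OBJ_TYPE_MAX];

		/* One lockdep class per object type, as in the patch */
		__init_rwsem(&obj->destroy_rwsem, "obj::destroy_rwsem",
			     &keys[type]);
	}
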
Link: https://lore.kernel.org/r/16-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_test.h | 4 ++ drivers/iommu/iommufd/selftest.c | 39 +++++++++++++++++ include/linux/iommufd.h | 1 + tools/testing/selftests/iommu/iommufd.c | 37 ++++++++++++++-- .../selftests/iommu/iommufd_fail_nth.c | 42 +++++++++++++++++++ tools/testing/selftests/iommu/iommufd_utils.h | 30 +++++++++++++ 6 files changed, 149 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index b3d69cca7729..e3f1035cbd64 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -17,6 +17,7 @@ enum { IOMMU_TEST_OP_ACCESS_PAGES, IOMMU_TEST_OP_ACCESS_RW, IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT, + IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE, }; enum { @@ -52,6 +53,9 @@ struct iommu_test_cmd { __u32 out_stdev_id; __u32 out_hwpt_id; } mock_domain; + struct { + __u32 pt_id; + } mock_domain_replace; struct { __aligned_u64 iova; __aligned_u64 length; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 74c2076105d4..eb33dffb955d 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -455,6 +455,42 @@ out_sobj: return rc; } +/* Replace the mock domain with a manually allocated hw_pagetable */ +static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, + unsigned int device_id, u32 pt_id, + struct iommu_test_cmd *cmd) +{ + struct iommufd_object *dev_obj; + struct selftest_obj *sobj; + int rc; + + /* + * Prefer to use the OBJ_SELFTEST because the destroy_rwsem will ensure + * it doesn't race with detach, which is not allowed. 
+ */ + dev_obj = + iommufd_get_object(ucmd->ictx, device_id, IOMMUFD_OBJ_SELFTEST); + if (IS_ERR(dev_obj)) + return PTR_ERR(dev_obj); + + sobj = container_of(dev_obj, struct selftest_obj, obj); + if (sobj->type != TYPE_IDEV) { + rc = -EINVAL; + goto out_dev_obj; + } + + rc = iommufd_device_replace(sobj->idev.idev, &pt_id); + if (rc) + goto out_dev_obj; + + cmd->mock_domain_replace.pt_id = pt_id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_dev_obj: + iommufd_put_object(dev_obj); + return rc; +} + /* Add an additional reserved IOVA to the IOAS */ static int iommufd_test_add_reserved(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, @@ -948,6 +984,9 @@ int iommufd_test(struct iommufd_ucmd *ucmd) cmd->add_reserved.length); case IOMMU_TEST_OP_MOCK_DOMAIN: return iommufd_test_mock_domain(ucmd, cmd); + case IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE: + return iommufd_test_mock_domain_replace( + ucmd, cmd->id, cmd->mock_domain_replace.pt_id, cmd); case IOMMU_TEST_OP_MD_CHECK_MAP: return iommufd_test_md_check_pa( ucmd, cmd->id, cmd->check_map.iova, diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 9657c58813dc..0ac60256b659 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -23,6 +23,7 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, void iommufd_device_unbind(struct iommufd_device *idev); int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id); +int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id); void iommufd_device_detach(struct iommufd_device *idev); struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index e4a6b33cfde4..96881ecca5c7 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -9,9 +9,6 @@ #include "iommufd_utils.h" -static void *buffer; - -static unsigned long PAGE_SIZE; static unsigned long HUGEPAGE_SIZE; #define MOCK_PAGE_SIZE (PAGE_SIZE / 2) @@ -1035,6 +1032,7 @@ FIXTURE(iommufd_mock_domain) uint32_t ioas_id; uint32_t hwpt_id; uint32_t hwpt_ids[2]; + uint32_t stdev_ids[2]; int mmap_flags; size_t mmap_buf_size; }; @@ -1056,7 +1054,8 @@ FIXTURE_SETUP(iommufd_mock_domain) ASSERT_GE(ARRAY_SIZE(self->hwpt_ids), variant->mock_domains); for (i = 0; i != variant->mock_domains; i++) - test_cmd_mock_domain(self->ioas_id, NULL, &self->hwpt_ids[i]); + test_cmd_mock_domain(self->ioas_id, &self->stdev_ids[i], + &self->hwpt_ids[i]); self->hwpt_id = self->hwpt_ids[0]; self->mmap_flags = MAP_SHARED | MAP_ANONYMOUS; @@ -1308,6 +1307,36 @@ TEST_F(iommufd_mock_domain, user_copy) test_ioctl_destroy(ioas_id); } +TEST_F(iommufd_mock_domain, replace) +{ + uint32_t ioas_id; + + test_ioctl_ioas_alloc(&ioas_id); + + test_cmd_mock_domain_replace(self->stdev_ids[0], ioas_id); + + /* + * Replacing the IOAS causes the prior HWPT to be deallocated, thus we + * should get enoent when we try to use it. 
+ */ + if (variant->mock_domains == 1) + test_err_mock_domain_replace(ENOENT, self->stdev_ids[0], + self->hwpt_ids[0]); + + test_cmd_mock_domain_replace(self->stdev_ids[0], ioas_id); + if (variant->mock_domains >= 2) { + test_cmd_mock_domain_replace(self->stdev_ids[0], + self->hwpt_ids[1]); + test_cmd_mock_domain_replace(self->stdev_ids[0], + self->hwpt_ids[1]); + test_cmd_mock_domain_replace(self->stdev_ids[0], + self->hwpt_ids[0]); + } + + test_cmd_mock_domain_replace(self->stdev_ids[0], self->ioas_id); + test_ioctl_destroy(ioas_id); +} + /* VFIO compatibility IOCTLs */ TEST_F(iommufd, simple_ioctls) diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index d9afcb23810e..96fb2f04623f 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -41,6 +41,8 @@ static int writeat(int dfd, const char *fn, const char *val) static __attribute__((constructor)) void setup_buffer(void) { + PAGE_SIZE = sysconf(_SC_PAGE_SIZE); + BUFFER_SIZE = 2*1024*1024; buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE, @@ -569,4 +571,44 @@ TEST_FAIL_NTH(basic_fail_nth, access_pin_domain) return 0; } +/* device.c */ +TEST_FAIL_NTH(basic_fail_nth, device) +{ + uint32_t ioas_id; + uint32_t ioas_id2; + uint32_t stdev_id; + __u64 iova; + + self->fd = open("/dev/iommu", O_RDWR); + if (self->fd == -1) + return -1; + + if (_test_ioctl_ioas_alloc(self->fd, &ioas_id)) + return -1; + + if (_test_ioctl_ioas_alloc(self->fd, &ioas_id2)) + return -1; + + iova = MOCK_APERTURE_START; + if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, PAGE_SIZE, &iova, + IOMMU_IOAS_MAP_FIXED_IOVA | + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -1; + if (_test_ioctl_ioas_map(self->fd, ioas_id2, buffer, PAGE_SIZE, &iova, + IOMMU_IOAS_MAP_FIXED_IOVA | + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -1; + + fail_nth_enable(); + + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, NULL)) + return -1; + + if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL)) + return -1; + return 0; +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 85d6662ef8e8..8b11bb700952 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -19,6 +19,8 @@ static void *buffer; static unsigned long BUFFER_SIZE; +static unsigned long PAGE_SIZE; + /* * Have the kernel check the refcount on pages. 
I don't know why a freshly * mmap'd anon non-compound page starts out with a ref of 3 @@ -66,6 +68,34 @@ static int _test_cmd_mock_domain(int fd, unsigned int ioas_id, __u32 *stdev_id, EXPECT_ERRNO(_errno, _test_cmd_mock_domain(self->fd, ioas_id, \ stdev_id, hwpt_id)) +static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id, + __u32 *hwpt_id) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE, + .id = stdev_id, + .mock_domain_replace = { + .pt_id = pt_id, + }, + }; + int ret; + + ret = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (ret) + return ret; + if (hwpt_id) + *hwpt_id = cmd.mock_domain_replace.pt_id; + return 0; +} + +#define test_cmd_mock_domain_replace(stdev_id, pt_id) \ + ASSERT_EQ(0, _test_cmd_mock_domain_replace(self->fd, stdev_id, pt_id, \ + NULL)) +#define test_err_mock_domain_replace(_errno, stdev_id, pt_id) \ + EXPECT_ERRNO(_errno, _test_cmd_mock_domain_replace(self->fd, stdev_id, \ + pt_id, NULL)) + static int _test_cmd_create_access(int fd, unsigned int ioas_id, __u32 *access_id, unsigned int flags) { From 7074d7bd67d495cb3f6fe7c7c96b357a3b9d4ec2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:13 -0300 Subject: [PATCH 17/34] iommufd: Add IOMMU_HWPT_ALLOC This allows userspace to manually create HWPTs on IOAS's and then use those HWPTs as inputs to iommufd_device_attach/replace(). Following series will extend this to allow creating iommu_domains with driver specific parameters. Link: https://lore.kernel.org/r/17-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 46 +++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 9 +++++ drivers/iommu/iommufd/main.c | 3 ++ include/uapi/linux/iommufd.h | 26 ++++++++++++++ 4 files changed, 84 insertions(+) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 2087b51d9807..cf2c1504e20d 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -3,6 +3,7 @@ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ #include +#include #include "iommufd_private.h" @@ -131,3 +132,48 @@ out_abort: iommufd_object_abort_and_destroy(ictx, &hwpt->obj); return ERR_PTR(rc); } + +int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) +{ + struct iommu_hwpt_alloc *cmd = ucmd->cmd; + struct iommufd_hw_pagetable *hwpt; + struct iommufd_device *idev; + struct iommufd_ioas *ioas; + int rc; + + if (cmd->flags || cmd->__reserved) + return -EOPNOTSUPP; + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + ioas = iommufd_get_ioas(ucmd->ictx, cmd->pt_id); + if (IS_ERR(ioas)) { + rc = PTR_ERR(ioas); + goto out_put_idev; + } + + mutex_lock(&ioas->mutex); + hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, idev, false); + if (IS_ERR(hwpt)) { + rc = PTR_ERR(hwpt); + goto out_unlock; + } + + cmd->out_hwpt_id = hwpt->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_hwpt; + iommufd_object_finalize(ucmd->ictx, &hwpt->obj); + goto out_unlock; + +out_hwpt: + iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj); +out_unlock: + mutex_unlock(&ioas->mutex); + iommufd_put_object(&ioas->obj); +out_put_idev: + iommufd_put_object(&idev->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 
9394f0e6692a..dba730129b8c 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -262,6 +262,7 @@ struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev); void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); void iommufd_hw_pagetable_abort(struct iommufd_object *obj); +int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) @@ -298,6 +299,14 @@ struct iommufd_device { bool enforce_cache_coherency; }; +static inline struct iommufd_device * +iommufd_get_device(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_DEVICE), + struct iommufd_device, obj); +} + void iommufd_device_destroy(struct iommufd_object *obj); struct iommufd_access { diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 56cdf3c796a7..94c498b8fdf6 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -265,6 +265,7 @@ static int iommufd_option(struct iommufd_ucmd *ucmd) union ucmd_buffer { struct iommu_destroy destroy; + struct iommu_hwpt_alloc hwpt; struct iommu_ioas_alloc alloc; struct iommu_ioas_allow_iovas allow_iovas; struct iommu_ioas_copy ioas_copy; @@ -296,6 +297,8 @@ struct iommufd_ioctl_op { } static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), + IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, + __reserved), IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, struct iommu_ioas_alloc, out_ioas_id), IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 98ebba80cfa1..8245c01adca6 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -45,6 +45,7 @@ enum { IOMMUFD_CMD_IOAS_UNMAP, IOMMUFD_CMD_OPTION, IOMMUFD_CMD_VFIO_IOAS, + IOMMUFD_CMD_HWPT_ALLOC, }; /** @@ -344,4 +345,29 @@ struct iommu_vfio_ioas { __u16 __reserved; }; #define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS) + +/** + * struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC) + * @size: sizeof(struct iommu_hwpt_alloc) + * @flags: Must be 0 + * @dev_id: The device to allocate this HWPT for + * @pt_id: The IOAS to connect this HWPT to + * @out_hwpt_id: The ID of the new HWPT + * @__reserved: Must be 0 + * + * Explicitly allocate a hardware page table object. This is the same object + * type that is returned by iommufd_device_attach() and represents the + * underlying iommu driver's iommu_domain kernel object. + * + * A HWPT will be created with the IOVA mappings from the given IOAS. + */ +struct iommu_hwpt_alloc { + __u32 size; + __u32 flags; + __u32 dev_id; + __u32 pt_id; + __u32 out_hwpt_id; + __u32 __reserved; +}; +#define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) #endif From 7a467e02b339a50ed2762edd72e763925ac2b0a3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:14 -0300 Subject: [PATCH 18/34] iommufd/selftest: Return the real idev id from selftest mock_domain Now that we actually call iommufd_device_bind() we can return the idev_id from that function to userspace for use in other APIs. 
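A hedged userspace sketch tying the two patches together: the id returned for the bound device feeds IOMMU_HWPT_ALLOC to create a HWPT on an existing IOAS. Field names follow struct iommu_hwpt_alloc added above; error handling is minimal:

	#include <sys/ioctl.h>
	#include <linux/iommufd.h>

	static int alloc_hwpt(int iommufd, __u32 dev_id, __u32 ioas_id,
			      __u32 *hwpt_id)
	{
		struct iommu_hwpt_alloc cmd = {
			.size = sizeof(cmd),
			.dev_id = dev_id,	/* e.g. out_idev_id from bind */
			.pt_id = ioas_id,	/* the IOAS to back the HWPT */
		};

		if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd))
			return -1;
		*hwpt_id = cmd.out_hwpt_id;
		return 0;
	}
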
Link: https://lore.kernel.org/r/18-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_test.h | 2 ++ drivers/iommu/iommufd/selftest.c | 1 + tools/testing/selftests/iommu/iommufd.c | 17 +++++++++-------- .../testing/selftests/iommu/iommufd_fail_nth.c | 18 ++++++++++-------- tools/testing/selftests/iommu/iommufd_utils.h | 12 +++++++----- 5 files changed, 29 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index e3f1035cbd64..dd9168a20ddf 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -52,6 +52,8 @@ struct iommu_test_cmd { struct { __u32 out_stdev_id; __u32 out_hwpt_id; + /* out_idev_id is the standard iommufd_bind object */ + __u32 out_idev_id; } mock_domain; struct { __u32 pt_id; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index eb33dffb955d..9d43334e4faf 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -443,6 +443,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, /* Userspace must destroy the device_id to destroy the object */ cmd->mock_domain.out_hwpt_id = pt_id; cmd->mock_domain.out_stdev_id = sobj->obj.id; + cmd->mock_domain.out_idev_id = idev_id; iommufd_object_finalize(ucmd->ictx, &sobj->obj); return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 96881ecca5c7..78841aba47d0 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -211,7 +211,7 @@ FIXTURE_SETUP(iommufd_ioas) for (i = 0; i != variant->mock_domains; i++) { test_cmd_mock_domain(self->ioas_id, &self->stdev_id, - &self->hwpt_id); + &self->hwpt_id, NULL); self->base_iova = MOCK_APERTURE_START; } } @@ -262,7 +262,7 @@ TEST_F(iommufd_ioas, hwpt_attach) { /* Create a device attached directly to a hwpt */ if (self->stdev_id) { - test_cmd_mock_domain(self->hwpt_id, NULL, NULL); + test_cmd_mock_domain(self->hwpt_id, NULL, NULL, NULL); } else { test_err_mock_domain(ENOENT, self->hwpt_id, NULL, NULL); } @@ -681,7 +681,7 @@ TEST_F(iommufd_ioas, access_pin) _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES), &access_cmd)); test_cmd_mock_domain(self->ioas_id, &mock_stdev_id, - &mock_hwpt_id); + &mock_hwpt_id, NULL); check_map_cmd.id = mock_hwpt_id; ASSERT_EQ(0, ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_MD_CHECK_MAP), @@ -836,7 +836,7 @@ TEST_F(iommufd_ioas, fork_gone) * If a domain already existed then everything was pinned within * the fork, so this copies from one domain to another. 
*/ - test_cmd_mock_domain(self->ioas_id, NULL, NULL); + test_cmd_mock_domain(self->ioas_id, NULL, NULL, NULL); check_access_rw(_metadata, self->fd, access_id, MOCK_APERTURE_START, 0); @@ -885,7 +885,7 @@ TEST_F(iommufd_ioas, fork_present) ASSERT_EQ(8, read(efd, &tmp, sizeof(tmp))); /* Read pages from the remote process */ - test_cmd_mock_domain(self->ioas_id, NULL, NULL); + test_cmd_mock_domain(self->ioas_id, NULL, NULL, NULL); check_access_rw(_metadata, self->fd, access_id, MOCK_APERTURE_START, 0); ASSERT_EQ(0, close(pipefds[1])); @@ -1033,6 +1033,7 @@ FIXTURE(iommufd_mock_domain) uint32_t hwpt_id; uint32_t hwpt_ids[2]; uint32_t stdev_ids[2]; + uint32_t idev_ids[2]; int mmap_flags; size_t mmap_buf_size; }; @@ -1055,7 +1056,7 @@ FIXTURE_SETUP(iommufd_mock_domain) for (i = 0; i != variant->mock_domains; i++) test_cmd_mock_domain(self->ioas_id, &self->stdev_ids[i], - &self->hwpt_ids[i]); + &self->hwpt_ids[i], &self->idev_ids[i]); self->hwpt_id = self->hwpt_ids[0]; self->mmap_flags = MAP_SHARED | MAP_ANONYMOUS; @@ -1249,7 +1250,7 @@ TEST_F(iommufd_mock_domain, all_aligns_copy) /* Add and destroy a domain while the area exists */ old_id = self->hwpt_ids[1]; test_cmd_mock_domain(self->ioas_id, &mock_stdev_id, - &self->hwpt_ids[1]); + &self->hwpt_ids[1], NULL); check_mock_iova(buf + start, iova, length); check_refs(buf + start / PAGE_SIZE * PAGE_SIZE, @@ -1458,7 +1459,7 @@ FIXTURE_SETUP(vfio_compat_mock_domain) /* Create what VFIO would consider a group */ test_ioctl_ioas_alloc(&self->ioas_id); - test_cmd_mock_domain(self->ioas_id, NULL, NULL); + test_cmd_mock_domain(self->ioas_id, NULL, NULL, NULL); /* Attach it to the vfio compat */ vfio_ioas_cmd.ioas_id = self->ioas_id; diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 96fb2f04623f..8ab20df4edc8 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -315,7 +315,7 @@ TEST_FAIL_NTH(basic_fail_nth, map_domain) fail_nth_enable(); - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL)) return -1; if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, 262144, &iova, @@ -326,7 +326,7 @@ TEST_FAIL_NTH(basic_fail_nth, map_domain) if (_test_ioctl_destroy(self->fd, stdev_id)) return -1; - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL)) return -1; return 0; } @@ -350,12 +350,13 @@ TEST_FAIL_NTH(basic_fail_nth, map_two_domains) if (_test_ioctl_set_temp_memory_limit(self->fd, 32)) return -1; - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL)) return -1; fail_nth_enable(); - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id2, &hwpt_id2)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id2, &hwpt_id2, + NULL)) return -1; if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, 262144, &iova, @@ -369,9 +370,10 @@ TEST_FAIL_NTH(basic_fail_nth, map_two_domains) if (_test_ioctl_destroy(self->fd, stdev_id2)) return -1; - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL)) return -1; - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id2, &hwpt_id2)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id2, &hwpt_id2, + NULL)) return -1; return 0; } @@ -528,7 +530,7 @@ 
TEST_FAIL_NTH(basic_fail_nth, access_pin_domain) if (_test_ioctl_set_temp_memory_limit(self->fd, 32)) return -1; - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL)) return -1; if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, BUFFER_SIZE, &iova, @@ -603,7 +605,7 @@ TEST_FAIL_NTH(basic_fail_nth, device) fail_nth_enable(); - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, NULL)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, NULL, NULL)) return -1; if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL)) diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 8b11bb700952..62e01412a767 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -41,7 +41,7 @@ static unsigned long PAGE_SIZE; }) static int _test_cmd_mock_domain(int fd, unsigned int ioas_id, __u32 *stdev_id, - __u32 *hwpt_id) + __u32 *hwpt_id, __u32 *idev_id) { struct iommu_test_cmd cmd = { .size = sizeof(cmd), @@ -59,14 +59,16 @@ static int _test_cmd_mock_domain(int fd, unsigned int ioas_id, __u32 *stdev_id, assert(cmd.id != 0); if (hwpt_id) *hwpt_id = cmd.mock_domain.out_hwpt_id; + if (idev_id) + *idev_id = cmd.mock_domain.out_idev_id; return 0; } -#define test_cmd_mock_domain(ioas_id, stdev_id, hwpt_id) \ - ASSERT_EQ(0, \ - _test_cmd_mock_domain(self->fd, ioas_id, stdev_id, hwpt_id)) +#define test_cmd_mock_domain(ioas_id, stdev_id, hwpt_id, idev_id) \ + ASSERT_EQ(0, _test_cmd_mock_domain(self->fd, ioas_id, stdev_id, \ + hwpt_id, idev_id)) #define test_err_mock_domain(_errno, ioas_id, stdev_id, hwpt_id) \ EXPECT_ERRNO(_errno, _test_cmd_mock_domain(self->fd, ioas_id, \ - stdev_id, hwpt_id)) + stdev_id, hwpt_id, NULL)) static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id, __u32 *hwpt_id) From 6583c865dec5493beb4c72fd724eb9d43d8d7ebb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Jul 2023 15:12:15 -0300 Subject: [PATCH 19/34] iommufd/selftest: Add a selftest for IOMMU_HWPT_ALLOC Test the basic flow. 
Link: https://lore.kernel.org/r/19-v8-6659224517ea+532-iommufd_alloc_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/io_pagetable.c | 3 +++ tools/testing/selftests/iommu/iommufd.c | 15 +++++++++++++ .../selftests/iommu/iommufd_fail_nth.c | 11 +++++++++- tools/testing/selftests/iommu/iommufd_utils.h | 21 +++++++++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 112bea5a84e4..4d095115c2d0 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -1180,6 +1180,9 @@ int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, unsigned int num_sw_msi = 0; int rc; + if (iommufd_should_fail()) + return -EINVAL; + down_write(&iopt->iova_rwsem); /* FIXME: drivers allocate memory but there is no failure propogated */ iommu_get_resv_regions(dev, &resv_regions); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 78841aba47d0..dc09c1de319f 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1338,6 +1338,21 @@ TEST_F(iommufd_mock_domain, replace) test_ioctl_destroy(ioas_id); } +TEST_F(iommufd_mock_domain, alloc_hwpt) +{ + int i; + + for (i = 0; i != variant->mock_domains; i++) { + uint32_t stddev_id; + uint32_t hwpt_id; + + test_cmd_hwpt_alloc(self->idev_ids[0], self->ioas_id, &hwpt_id); + test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); + test_ioctl_destroy(stddev_id); + test_ioctl_destroy(hwpt_id); + } +} + /* VFIO compatibility IOCTLs */ TEST_F(iommufd, simple_ioctls) diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 8ab20df4edc8..d4c552e56948 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -579,6 +579,8 @@ TEST_FAIL_NTH(basic_fail_nth, device) uint32_t ioas_id; uint32_t ioas_id2; uint32_t stdev_id; + uint32_t idev_id; + uint32_t hwpt_id; __u64 iova; self->fd = open("/dev/iommu", O_RDWR); @@ -605,11 +607,18 @@ TEST_FAIL_NTH(basic_fail_nth, device) fail_nth_enable(); - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, NULL, NULL)) + if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, NULL, + &idev_id)) + return -1; + + if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, &hwpt_id)) return -1; if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL)) return -1; + + if (_test_cmd_mock_domain_replace(self->fd, stdev_id, hwpt_id, NULL)) + return -1; return 0; } diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 62e01412a767..53b4d3f2d9fc 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -98,6 +98,27 @@ static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id, EXPECT_ERRNO(_errno, _test_cmd_mock_domain_replace(self->fd, stdev_id, \ pt_id, NULL)) +static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, + __u32 *hwpt_id) +{ + struct iommu_hwpt_alloc cmd = { + .size = sizeof(cmd), + .dev_id = device_id, + .pt_id = pt_id, + }; + int ret; + + ret = ioctl(fd, IOMMU_HWPT_ALLOC, &cmd); + if (ret) + return ret; + if (hwpt_id) + *hwpt_id = cmd.out_hwpt_id; + return 0; +} + +#define test_cmd_hwpt_alloc(device_id, pt_id, hwpt_id) \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, 
device_id, pt_id, hwpt_id)) + static int _test_cmd_create_access(int fd, unsigned int ioas_id, __u32 *access_id, unsigned int flags) { From 89e07fd4680985351559cdc916098ad72e034bfc Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 27 Jul 2023 23:33:23 -0700 Subject: [PATCH 20/34] vfio: Do not allow !ops->dma_unmap in vfio_pin/unpin_pages() A driver that doesn't implement ops->dma_unmap shouldn't be allowed to do vfio_pin/unpin_pages(), though it can use vfio_dma_rw() to access an iova range. Deny !ops->dma_unmap cases in vfio_pin/unpin_pages(). Link: https://lore.kernel.org/r/85d622729d8f2334b35d42f1c568df1ededb9171.1690523699.git.nicolinc@nvidia.com Suggested-by: Kevin Tian Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/vfio/vfio_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 902f06e52c48..0da8ed81a97d 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1483,6 +1483,8 @@ int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, /* group->container cannot change while a vfio device is open */ if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device))) return -EINVAL; + if (!device->ops->dma_unmap) + return -EINVAL; if (vfio_device_has_container(device)) return vfio_device_container_pin_pages(device, iova, npage, prot, pages); @@ -1520,6 +1522,8 @@ void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) { if (WARN_ON(!vfio_assert_device_open(device))) return; + if (WARN_ON(!device->ops->dma_unmap)) + return; if (vfio_device_has_container(device)) { vfio_device_container_unpin_pages(device, iova, npage); From 5d5c85ff6246c1b70b3b75a0b9d9fe749d0a5652 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 27 Jul 2023 23:33:24 -0700 Subject: [PATCH 21/34] iommufd: Allow passing in iopt_access_list_id to iopt_remove_access() This is a preparatory change for ioas replacement support for accesses. The replacement routine does an iopt_add_access() for a new IOAS first and then iopt_remove_access() for the old IOAS upon the success of the first call. However, the first call overrides the iopt_access_list_id in the access struct, resulting in iopt_remove_access() being unable to work on the old IOAS. Add an iopt_access_list_id as a parameter to iopt_remove_access, so the replacement routine can save the id before it gets overwritten. Pass the id in iopt_remove_access() for a proper cleanup. The existing callers should just pass in access->iopt_access_list_id. 
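A hedged sketch of the replacement sequencing this parameter enables (condensed from the helper added two patches later; locking and the unmap callback are elided):

	static int change_ioas_sketch(struct iommufd_access *access,
				      struct iommufd_ioas *cur_ioas,
				      struct iommufd_ioas *new_ioas)
	{
		u32 old_id = access->iopt_access_list_id;
		int rc;

		/* This overwrites access->iopt_access_list_id on success */
		rc = iopt_add_access(&new_ioas->iopt, access);
		if (rc)
			return rc;

		/* Tear down the old IOAS using the id saved beforehand */
		iopt_remove_access(&cur_ioas->iopt, access, old_id);
		return 0;
	}
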
Link: https://lore.kernel.org/r/7bb939b9e0102da0c099572bb3de78ab7622221e.1690523699.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 6 ++++-- drivers/iommu/iommufd/io_pagetable.c | 6 +++--- drivers/iommu/iommufd/iommufd_private.h | 3 ++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 57c0e81f5073..7a3e8660b902 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -690,7 +690,8 @@ void iommufd_access_destroy_object(struct iommufd_object *obj) container_of(obj, struct iommufd_access, obj); if (access->ioas) { - iopt_remove_access(&access->ioas->iopt, access); + iopt_remove_access(&access->ioas->iopt, access, + access->iopt_access_list_id); refcount_dec(&access->ioas->obj.users); access->ioas = NULL; } @@ -776,7 +777,8 @@ void iommufd_access_detach(struct iommufd_access *access) access->ops->unmap(access->data, 0, ULONG_MAX); mutex_lock(&access->ioas_lock); } - iopt_remove_access(&cur_ioas->iopt, access); + iopt_remove_access(&cur_ioas->iopt, access, + access->iopt_access_list_id); refcount_dec(&cur_ioas->obj.users); out: access->ioas_unpin = NULL; diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 4d095115c2d0..3a598182b761 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -1158,12 +1158,12 @@ out_unlock: } void iopt_remove_access(struct io_pagetable *iopt, - struct iommufd_access *access) + struct iommufd_access *access, + u32 iopt_access_list_id) { down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); - WARN_ON(xa_erase(&iopt->access_list, access->iopt_access_list_id) != - access); + WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access); WARN_ON(iopt_calculate_iova_alignment(iopt)); up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index dba730129b8c..8ba786bc95ff 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -323,7 +323,8 @@ struct iommufd_access { int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access); void iopt_remove_access(struct io_pagetable *iopt, - struct iommufd_access *access); + struct iommufd_access *access, + u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); #ifdef CONFIG_IOMMUFD_TEST From 9227da7816dd1a42e20d41e2244cb63c205477ca Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 27 Jul 2023 23:33:25 -0700 Subject: [PATCH 22/34] iommufd: Add iommufd_access_change_ioas(_id) helpers The complication of the mutex and refcount will be amplified after we introduce the replace support for accesses. So, add a preparatory change of a constitutive helper iommufd_access_change_ioas() and its wrapper iommufd_access_change_ioas_id(). They can simply take care of existing iommufd_access_attach() and iommufd_access_detach(), properly sequencing the refcount puts so that they are truely at the end of the sequence after we know the IOAS pointer is not required any more. 
Link: https://lore.kernel.org/r/da0c462532193b447329c4eb975a596f47e49b70.1690523699.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 109 +++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 38 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 7a3e8660b902..e5c408415e95 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -684,6 +684,70 @@ void iommufd_device_detach(struct iommufd_device *idev) } EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD); +/* + * On success, it will refcount_inc() at a valid new_ioas and refcount_dec() at + * a valid cur_ioas (access->ioas). A caller passing in a valid new_ioas should + * call iommufd_put_object() if it does an iommufd_get_object() for a new_ioas. + */ +static int iommufd_access_change_ioas(struct iommufd_access *access, + struct iommufd_ioas *new_ioas) +{ + u32 iopt_access_list_id = access->iopt_access_list_id; + struct iommufd_ioas *cur_ioas = access->ioas; + int rc; + + lockdep_assert_held(&access->ioas_lock); + + /* We are racing with a concurrent detach, bail */ + if (cur_ioas != access->ioas_unpin) + return -EBUSY; + + if (cur_ioas == new_ioas) + return 0; + + /* + * Set ioas to NULL to block any further iommufd_access_pin_pages(). + * iommufd_access_unpin_pages() can continue using access->ioas_unpin. + */ + access->ioas = NULL; + + if (new_ioas) { + rc = iopt_add_access(&new_ioas->iopt, access); + if (rc) { + access->ioas = cur_ioas; + return rc; + } + refcount_inc(&new_ioas->obj.users); + } + + if (cur_ioas) { + if (access->ops->unmap) { + mutex_unlock(&access->ioas_lock); + access->ops->unmap(access->data, 0, ULONG_MAX); + mutex_lock(&access->ioas_lock); + } + iopt_remove_access(&cur_ioas->iopt, access, iopt_access_list_id); + refcount_dec(&cur_ioas->obj.users); + } + + access->ioas = new_ioas; + access->ioas_unpin = new_ioas; + + return 0; +} + +static int iommufd_access_change_ioas_id(struct iommufd_access *access, u32 id) +{ + struct iommufd_ioas *ioas = iommufd_get_ioas(access->ictx, id); + int rc; + + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + rc = iommufd_access_change_ioas(access, ioas); + iommufd_put_object(&ioas->obj); + return rc; +} + void iommufd_access_destroy_object(struct iommufd_object *obj) { struct iommufd_access *access = @@ -761,60 +825,29 @@ EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD); void iommufd_access_detach(struct iommufd_access *access) { - struct iommufd_ioas *cur_ioas = access->ioas; - mutex_lock(&access->ioas_lock); - if (WARN_ON(!access->ioas)) - goto out; - /* - * Set ioas to NULL to block any further iommufd_access_pin_pages(). - * iommufd_access_unpin_pages() can continue using access->ioas_unpin. 
- */ - access->ioas = NULL; - - if (access->ops->unmap) { + if (WARN_ON(!access->ioas)) { mutex_unlock(&access->ioas_lock); - access->ops->unmap(access->data, 0, ULONG_MAX); - mutex_lock(&access->ioas_lock); + return; } - iopt_remove_access(&cur_ioas->iopt, access, - access->iopt_access_list_id); - refcount_dec(&cur_ioas->obj.users); -out: - access->ioas_unpin = NULL; + WARN_ON(iommufd_access_change_ioas(access, NULL)); mutex_unlock(&access->ioas_lock); } EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, IOMMUFD); int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id) { - struct iommufd_ioas *new_ioas; - int rc = 0; + int rc; mutex_lock(&access->ioas_lock); - if (WARN_ON(access->ioas || access->ioas_unpin)) { + if (WARN_ON(access->ioas)) { mutex_unlock(&access->ioas_lock); return -EINVAL; } - new_ioas = iommufd_get_ioas(access->ictx, ioas_id); - if (IS_ERR(new_ioas)) { - mutex_unlock(&access->ioas_lock); - return PTR_ERR(new_ioas); - } - - rc = iopt_add_access(&new_ioas->iopt, access); - if (rc) { - mutex_unlock(&access->ioas_lock); - iommufd_put_object(&new_ioas->obj); - return rc; - } - iommufd_ref_to_users(&new_ioas->obj); - - access->ioas = new_ioas; - access->ioas_unpin = new_ioas; + rc = iommufd_access_change_ioas_id(access, ioas_id); mutex_unlock(&access->ioas_lock); - return 0; + return rc; } EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD); From 6129b59fcdf374b5d82e1f4518884da13de38b1a Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 27 Jul 2023 23:33:26 -0700 Subject: [PATCH 23/34] iommufd: Use iommufd_access_change_ioas in iommufd_access_destroy_object Update iommufd_access_destroy_object() to call the new iommufd_access_change_ioas() helper. It is impossible to legitimately race iommufd_access_destroy_object() with iommufd_access_change_ioas() as iommufd_access_destroy_object() is only called once the refcount reache zero, so any concurrent iommufd_access_change_ioas() is already UAFing the memory. Link: https://lore.kernel.org/r/f9fbeca2cde7f8515da18d689b3e02a6a40a5e14.1690523699.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index e5c408415e95..c0b9cd97ec58 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -753,12 +753,10 @@ void iommufd_access_destroy_object(struct iommufd_object *obj) struct iommufd_access *access = container_of(obj, struct iommufd_access, obj); - if (access->ioas) { - iopt_remove_access(&access->ioas->iopt, access, - access->iopt_access_list_id); - refcount_dec(&access->ioas->obj.users); - access->ioas = NULL; - } + mutex_lock(&access->ioas_lock); + if (access->ioas) + WARN_ON(iommufd_access_change_ioas(access, NULL)); + mutex_unlock(&access->ioas_lock); iommufd_ctx_put(access->ictx); } From 70c16123d865046899c36e3655b2d611f38f51a3 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 27 Jul 2023 23:33:27 -0700 Subject: [PATCH 24/34] iommufd: Add iommufd_access_replace() API Taking advantage of the new iommufd_access_change_ioas_id helper, add an iommufd_access_replace() API for the VFIO emulated pathway to use. 
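A hedged sketch of the intended caller, the VFIO emulated pathway wired up later in this series (the already_attached flag stands in for the driver's own state):

	static int emulated_attach_or_replace(struct iommufd_access *access,
					      bool already_attached, u32 pt_id)
	{
		if (already_attached)
			return iommufd_access_replace(access, pt_id);
		return iommufd_access_attach(access, pt_id);
	}
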
Link: https://lore.kernel.org/r/a3267b924fd5f45e0d3a1dd13a9237e923563862.1690523699.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 15 +++++++++++++++ include/linux/iommufd.h | 1 + 2 files changed, 16 insertions(+) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index c0b9cd97ec58..4c37eeea2bcd 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -849,6 +849,21 @@ int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id) } EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD); +int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id) +{ + int rc; + + mutex_lock(&access->ioas_lock); + if (!access->ioas) { + mutex_unlock(&access->ioas_lock); + return -ENOENT; + } + rc = iommufd_access_change_ioas_id(access, ioas_id); + mutex_unlock(&access->ioas_lock); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, IOMMUFD); + /** * iommufd_access_notify_unmap - Notify users of an iopt to stop using it * @iopt: iopt to work on diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 0ac60256b659..ffc3a949f837 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -49,6 +49,7 @@ iommufd_access_create(struct iommufd_ctx *ictx, const struct iommufd_access_ops *ops, void *data, u32 *id); void iommufd_access_destroy(struct iommufd_access *access); int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id); +int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id); void iommufd_access_detach(struct iommufd_access *access); void iommufd_ctx_get(struct iommufd_ctx *ictx); From c154660b6e26c3f0670a49f43f1fafa5d65f6d39 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 27 Jul 2023 23:33:28 -0700 Subject: [PATCH 25/34] iommufd/selftest: Add IOMMU_TEST_OP_ACCESS_REPLACE_IOAS coverage Add a new IOMMU_TEST_OP_ACCESS_REPLACE_IOAS to allow replacing the access->ioas, corresponding to the iommufd_access_replace() helper. Then add replace coverage as a part of user_copy test case, which basically repeats the copy test after replacing the old ioas with a new one. 
Link: https://lore.kernel.org/r/a4897f93d41c34b972213243b8dbf4c3832842e4.1690523699.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_test.h | 4 +++ drivers/iommu/iommufd/selftest.c | 19 ++++++++++++ tools/testing/selftests/iommu/iommufd.c | 29 +++++++++++++++++-- tools/testing/selftests/iommu/iommufd_utils.h | 19 ++++++++++++ 4 files changed, 69 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index dd9168a20ddf..258de2253b61 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -18,6 +18,7 @@ enum { IOMMU_TEST_OP_ACCESS_RW, IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT, IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE, + IOMMU_TEST_OP_ACCESS_REPLACE_IOAS, }; enum { @@ -91,6 +92,9 @@ struct iommu_test_cmd { struct { __u32 limit; } memory_limit; + struct { + __u32 ioas_id; + } access_replace_ioas; }; __u32 last; }; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 9d43334e4faf..bb2cd54ca7b6 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -785,6 +785,22 @@ out_free_staccess: return rc; } +static int iommufd_test_access_replace_ioas(struct iommufd_ucmd *ucmd, + unsigned int access_id, + unsigned int ioas_id) +{ + struct selftest_access *staccess; + int rc; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + rc = iommufd_access_replace(staccess->access, ioas_id); + fput(staccess->file); + return rc; +} + /* Check that the pages in a page array match the pages in the user VA */ static int iommufd_test_check_pages(void __user *uptr, struct page **pages, size_t npages) @@ -1000,6 +1016,9 @@ int iommufd_test(struct iommufd_ucmd *ucmd) case IOMMU_TEST_OP_CREATE_ACCESS: return iommufd_test_create_access(ucmd, cmd->id, cmd->create_access.flags); + case IOMMU_TEST_OP_ACCESS_REPLACE_IOAS: + return iommufd_test_access_replace_ioas( + ucmd, cmd->id, cmd->access_replace_ioas.ioas_id); case IOMMU_TEST_OP_ACCESS_PAGES: return iommufd_test_access_pages( ucmd, cmd->id, cmd->access_pages.iova, diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index dc09c1de319f..8acd0af37aa5 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1283,7 +1283,13 @@ TEST_F(iommufd_mock_domain, user_copy) .dst_iova = MOCK_APERTURE_START, .length = BUFFER_SIZE, }; - unsigned int ioas_id; + struct iommu_ioas_unmap unmap_cmd = { + .size = sizeof(unmap_cmd), + .ioas_id = self->ioas_id, + .iova = MOCK_APERTURE_START, + .length = BUFFER_SIZE, + }; + unsigned int new_ioas_id, ioas_id; /* Pin the pages in an IOAS with no domains then copy to an IOAS with domains */ test_ioctl_ioas_alloc(&ioas_id); @@ -1301,11 +1307,30 @@ TEST_F(iommufd_mock_domain, user_copy) ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, ©_cmd)); check_mock_iova(buffer, MOCK_APERTURE_START, BUFFER_SIZE); + /* Now replace the ioas with a new one */ + test_ioctl_ioas_alloc(&new_ioas_id); + test_ioctl_ioas_map_id(new_ioas_id, buffer, BUFFER_SIZE, + ©_cmd.src_iova); + test_cmd_access_replace_ioas(access_cmd.id, new_ioas_id); + + /* Destroy the old ioas and cleanup copied mapping */ + ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_UNMAP, &unmap_cmd)); + test_ioctl_destroy(ioas_id); + + /* Then run the same test again with the new ioas */ + 
access_cmd.access_pages.iova = copy_cmd.src_iova; + ASSERT_EQ(0, + ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES), + &access_cmd)); + copy_cmd.src_ioas_id = new_ioas_id; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, ©_cmd)); + check_mock_iova(buffer, MOCK_APERTURE_START, BUFFER_SIZE); + test_cmd_destroy_access_pages( access_cmd.id, access_cmd.access_pages.out_access_pages_id); test_cmd_destroy_access(access_cmd.id); - test_ioctl_destroy(ioas_id); + test_ioctl_destroy(new_ioas_id); } TEST_F(iommufd_mock_domain, replace) diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 53b4d3f2d9fc..70353e68e599 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -119,6 +119,25 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, #define test_cmd_hwpt_alloc(device_id, pt_id, hwpt_id) \ ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, hwpt_id)) +static int _test_cmd_access_replace_ioas(int fd, __u32 access_id, + unsigned int ioas_id) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_ACCESS_REPLACE_IOAS, + .id = access_id, + .access_replace_ioas = { .ioas_id = ioas_id }, + }; + int ret; + + ret = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (ret) + return ret; + return 0; +} +#define test_cmd_access_replace_ioas(access_id, ioas_id) \ + ASSERT_EQ(0, _test_cmd_access_replace_ioas(self->fd, access_id, ioas_id)) + static int _test_cmd_create_access(int fd, unsigned int ioas_id, __u32 *access_id, unsigned int flags) { From c157fd88619946cd62af47c45214c479749dad8d Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 27 Jul 2023 23:33:29 -0700 Subject: [PATCH 26/34] vfio: Support IO page table replacement Now both the physical path and the emulated path can support an IO page table replacement. Call iommufd_device_replace/iommufd_access_replace(), when vdev->iommufd_attached is true. Also update the VFIO_DEVICE_ATTACH_IOMMUFD_PT kdoc in the uAPI header. 
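From userspace the replacement is just a second attach ioctl on the device fd. A hedged sketch (the argsz/flags field names are assumed from the vfio uAPI header; error handling elided):

	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	static int replace_hwpt(int device_fd, __u32 new_pt_id)
	{
		struct vfio_device_attach_iommufd_pt attach = {
			.argsz = sizeof(attach),
			.flags = 0,
			.pt_id = new_pt_id,
		};

		/* A second call while attached replaces the current hwpt */
		return ioctl(device_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach);
	}
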
Link: https://lore.kernel.org/r/b5f01956ff161f76aa52c95b0fa1ad6eaca95c4a.1690523699.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/vfio/iommufd.c | 11 ++++++----- include/uapi/linux/vfio.h | 6 ++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c index 4d84904fd927..82eba6966fa5 100644 --- a/drivers/vfio/iommufd.c +++ b/drivers/vfio/iommufd.c @@ -146,9 +146,9 @@ int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id) return -EINVAL; if (vdev->iommufd_attached) - return -EBUSY; - - rc = iommufd_device_attach(vdev->iommufd_device, pt_id); + rc = iommufd_device_replace(vdev->iommufd_device, pt_id); + else + rc = iommufd_device_attach(vdev->iommufd_device, pt_id); if (rc) return rc; vdev->iommufd_attached = true; @@ -223,8 +223,9 @@ int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id) lockdep_assert_held(&vdev->dev_set->lock); if (vdev->iommufd_attached) - return -EBUSY; - rc = iommufd_access_attach(vdev->iommufd_access, *pt_id); + rc = iommufd_access_replace(vdev->iommufd_access, *pt_id); + else + rc = iommufd_access_attach(vdev->iommufd_access, *pt_id); if (rc) return rc; vdev->iommufd_attached = true; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index fa06e3eb4955..537157ff8670 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -939,6 +939,12 @@ struct vfio_device_bind_iommufd { * Undo by VFIO_DEVICE_DETACH_IOMMUFD_PT or device fd close. This is only * allowed on cdev fds. * + * If a vfio device is currently attached to a valid hw_pagetable, without doing + * a VFIO_DEVICE_DETACH_IOMMUFD_PT, a second VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl + * passing in another hw_pagetable (hwpt) id is allowed. This action, also known + * as a hw_pagetable replacement, will replace the device's currently attached + * hw_pagetable with a new hw_pagetable corresponding to the given pt_id. + * * Return: 0 on success, -errno on failure. */ struct vfio_device_attach_iommufd_pt { From 23a1b46f15d57583927742738579363f179942b1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 2 Aug 2023 21:08:02 -0300 Subject: [PATCH 27/34] iommufd/selftest: Make the mock iommu driver into a real driver I've avoided doing this because there is no way to make this happen without an intrusion into the core code. Up till now this has avoided needing the core code's probe path with some hackery - but now that default domains are becoming mandatory it is unavoidable. This became a serious problem when the core code stopped allowing partially registered iommu drivers in commit 14891af3799e ("iommu: Move the iommu driver sysfs setup into iommu_init/deinit_device()") which breaks the selftest. That series was developed along with a second series that contained this patch so it was not noticed. Make it so that iommufd selftest can create a real iommu driver and bind it only to is own private bus. Add iommu_device_register_bus() as a core code helper to make this possible. It simply sets the right pointers and registers the notifier block. The mock driver then works like any normal driver should, with probe triggered by the bus ops When the bus->iommu_ops stuff is fully unwound we can probably do better here and remove this special case. 
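A hedged sketch of how a selftest-style driver might use the new helper against its own private bus (names loosely follow the mock driver; error handling abbreviated):

	static struct iommu_device mock_iommu;
	static struct notifier_block mock_nb;
	static struct bus_type mock_bus = {
		.name = "iommufd_mock",
	};

	static int __init mock_iommu_register(void)
	{
		int rc;

		rc = bus_register(&mock_bus);
		if (rc)
			return rc;

		/* Binds mock_ops to this bus only and hooks the notifier */
		rc = iommu_device_register_bus(&mock_iommu, &mock_ops,
					       &mock_bus, &mock_nb);
		if (rc)
			bus_unregister(&mock_bus);
		return rc;
	}
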
Link: https://lore.kernel.org/r/15-v6-e8114faedade+425-iommu_all_defdom_jgg@nvidia.com Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu-priv.h | 9 ++ drivers/iommu/iommu.c | 43 ++++++++ drivers/iommu/iommufd/iommufd_private.h | 5 +- drivers/iommu/iommufd/main.c | 8 +- drivers/iommu/iommufd/selftest.c | 138 ++++++++++++++---------- 5 files changed, 144 insertions(+), 59 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index 7c8011bfd153..e3e3b2015854 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. + */ #ifndef __LINUX_IOMMU_PRIV_H #define __LINUX_IOMMU_PRIV_H @@ -7,4 +9,11 @@ int iommu_group_replace_domain(struct iommu_group *group, struct iommu_domain *new_domain); +int iommu_device_register_bus(struct iommu_device *iommu, + const struct iommu_ops *ops, struct bus_type *bus, + struct notifier_block *nb); +void iommu_device_unregister_bus(struct iommu_device *iommu, + struct bus_type *bus, + struct notifier_block *nb); + #endif /* __LINUX_IOMMU_PRIV_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 587619676573..c9c370de9d3d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -37,6 +37,7 @@ #include "iommu-priv.h" #include "iommu-sva.h" +#include "iommu-priv.h" static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); @@ -288,6 +289,48 @@ void iommu_device_unregister(struct iommu_device *iommu) } EXPORT_SYMBOL_GPL(iommu_device_unregister); +#if IS_ENABLED(CONFIG_IOMMUFD_TEST) +void iommu_device_unregister_bus(struct iommu_device *iommu, + struct bus_type *bus, + struct notifier_block *nb) +{ + bus_unregister_notifier(bus, nb); + iommu_device_unregister(iommu); +} +EXPORT_SYMBOL_GPL(iommu_device_unregister_bus); + +/* + * Register an iommu driver against a single bus. This is only used by iommufd + * selftest to create a mock iommu driver. The caller must provide + * some memory to hold a notifier_block. 
+ */ +int iommu_device_register_bus(struct iommu_device *iommu, + const struct iommu_ops *ops, struct bus_type *bus, + struct notifier_block *nb) +{ + int err; + + iommu->ops = ops; + nb->notifier_call = iommu_bus_notifier; + err = bus_register_notifier(bus, nb); + if (err) + return err; + + spin_lock(&iommu_device_lock); + list_add_tail(&iommu->list, &iommu_device_list); + spin_unlock(&iommu_device_lock); + + bus->iommu_ops = ops; + err = bus_iommu_probe(bus); + if (err) { + iommu_device_unregister_bus(iommu, bus, nb); + return err; + } + return 0; +} +EXPORT_SYMBOL_GPL(iommu_device_register_bus); +#endif + static struct dev_iommu *dev_iommu_get(struct device *dev) { struct dev_iommu *param = dev->iommu; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 8ba786bc95ff..d82986ad10ae 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -334,7 +334,7 @@ extern size_t iommufd_test_memory_limit; void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, unsigned int ioas_id, u64 *iova, u32 *flags); bool iommufd_should_fail(void); -void __init iommufd_test_init(void); +int __init iommufd_test_init(void); void iommufd_test_exit(void); bool iommufd_selftest_is_mock_dev(struct device *dev); #else @@ -347,8 +347,9 @@ static inline bool iommufd_should_fail(void) { return false; } -static inline void __init iommufd_test_init(void) +static inline int __init iommufd_test_init(void) { + return 0; } static inline void iommufd_test_exit(void) { diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 94c498b8fdf6..b4c300912cb0 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -480,8 +480,14 @@ static int __init iommufd_init(void) if (ret) goto err_misc; } - iommufd_test_init(); + ret = iommufd_test_init(); + if (ret) + goto err_vfio_misc; return 0; + +err_vfio_misc: + if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) + misc_deregister(&vfio_misc_dev); err_misc: misc_deregister(&iommu_misc_dev); return ret; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index bb2cd54ca7b6..9223ff3f23f9 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -9,14 +9,17 @@ #include #include #include +#include #include +#include "../iommu-priv.h" #include "io_pagetable.h" #include "iommufd_private.h" #include "iommufd_test.h" static DECLARE_FAULT_ATTR(fail_iommufd); static struct dentry *dbgfs_root; +static struct platform_device *selftest_iommu_dev; size_t iommufd_test_memory_limit = 65536; @@ -135,7 +138,7 @@ static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED) return &mock_blocking_domain; - if (WARN_ON(iommu_domain_type != IOMMU_DOMAIN_UNMANAGED)) + if (iommu_domain_type != IOMMU_DOMAIN_UNMANAGED) return NULL; mock = kzalloc(sizeof(*mock), GFP_KERNEL); @@ -276,12 +279,22 @@ static void mock_domain_set_plaform_dma_ops(struct device *dev) */ } +static struct iommu_device mock_iommu_device = { +}; + +static struct iommu_device *mock_probe_device(struct device *dev) +{ + return &mock_iommu_device; +} + static const struct iommu_ops mock_ops = { .owner = THIS_MODULE, .pgsize_bitmap = MOCK_IO_PAGE_SIZE, .domain_alloc = mock_domain_alloc, .capable = mock_domain_capable, .set_platform_dma_ops = mock_domain_set_plaform_dma_ops, + .device_group = generic_device_group, + .probe_device = mock_probe_device, .default_domain_ops = &(struct 
iommu_domain_ops){ .free = mock_domain_free, @@ -292,10 +305,6 @@ static const struct iommu_ops mock_ops = { }, }; -static struct iommu_device mock_iommu_device = { - .ops = &mock_ops, -}; - static inline struct iommufd_hw_pagetable * get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, struct mock_iommu_domain **mock) @@ -316,22 +325,29 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, return hwpt; } -static struct bus_type iommufd_mock_bus_type = { - .name = "iommufd_mock", - .iommu_ops = &mock_ops, +struct mock_bus_type { + struct bus_type bus; + struct notifier_block nb; }; +static struct mock_bus_type iommufd_mock_bus_type = { + .bus = { + .name = "iommufd_mock", + }, +}; + +static atomic_t mock_dev_num; + static void mock_dev_release(struct device *dev) { struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + atomic_dec(&mock_dev_num); kfree(mdev); } static struct mock_dev *mock_dev_create(void) { - struct iommu_group *iommu_group; - struct dev_iommu *dev_iommu; struct mock_dev *mdev; int rc; @@ -341,51 +357,18 @@ static struct mock_dev *mock_dev_create(void) device_initialize(&mdev->dev); mdev->dev.release = mock_dev_release; - mdev->dev.bus = &iommufd_mock_bus_type; - - iommu_group = iommu_group_alloc(); - if (IS_ERR(iommu_group)) { - rc = PTR_ERR(iommu_group); - goto err_put; - } + mdev->dev.bus = &iommufd_mock_bus_type.bus; rc = dev_set_name(&mdev->dev, "iommufd_mock%u", - iommu_group_id(iommu_group)); + atomic_inc_return(&mock_dev_num)); if (rc) - goto err_group; - - /* - * The iommu core has no way to associate a single device with an iommu - * driver (heck currently it can't even support two iommu_drivers - * registering). Hack it together with an open coded dev_iommu_get(). - * Notice that the normal notifier triggered iommu release process also - * does not work here because this bus is not in iommu_buses. 
- */ - mdev->dev.iommu = kzalloc(sizeof(*dev_iommu), GFP_KERNEL); - if (!mdev->dev.iommu) { - rc = -ENOMEM; - goto err_group; - } - mutex_init(&mdev->dev.iommu->lock); - mdev->dev.iommu->iommu_dev = &mock_iommu_device; + goto err_put; rc = device_add(&mdev->dev); if (rc) - goto err_dev_iommu; - - rc = iommu_group_add_device(iommu_group, &mdev->dev); - if (rc) - goto err_del; - iommu_group_put(iommu_group); + goto err_put; return mdev; -err_del: - device_del(&mdev->dev); -err_dev_iommu: - kfree(mdev->dev.iommu); - mdev->dev.iommu = NULL; -err_group: - iommu_group_put(iommu_group); err_put: put_device(&mdev->dev); return ERR_PTR(rc); @@ -393,11 +376,7 @@ err_put: static void mock_dev_destroy(struct mock_dev *mdev) { - iommu_group_remove_device(&mdev->dev); - device_del(&mdev->dev); - kfree(mdev->dev.iommu); - mdev->dev.iommu = NULL; - put_device(&mdev->dev); + device_unregister(&mdev->dev); } bool iommufd_selftest_is_mock_dev(struct device *dev) @@ -444,9 +423,14 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, cmd->mock_domain.out_hwpt_id = pt_id; cmd->mock_domain.out_stdev_id = sobj->obj.id; cmd->mock_domain.out_idev_id = idev_id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_detach; iommufd_object_finalize(ucmd->ictx, &sobj->obj); - return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + return 0; +out_detach: + iommufd_device_detach(idev); out_unbind: iommufd_device_unbind(idev); out_mdev: @@ -1051,15 +1035,57 @@ bool iommufd_should_fail(void) return should_fail(&fail_iommufd, 1); } -void __init iommufd_test_init(void) +int __init iommufd_test_init(void) { + struct platform_device_info pdevinfo = { + .name = "iommufd_selftest_iommu", + }; + int rc; + dbgfs_root = fault_create_debugfs_attr("fail_iommufd", NULL, &fail_iommufd); - WARN_ON(bus_register(&iommufd_mock_bus_type)); + + selftest_iommu_dev = platform_device_register_full(&pdevinfo); + if (IS_ERR(selftest_iommu_dev)) { + rc = PTR_ERR(selftest_iommu_dev); + goto err_dbgfs; + } + + rc = bus_register(&iommufd_mock_bus_type.bus); + if (rc) + goto err_platform; + + rc = iommu_device_sysfs_add(&mock_iommu_device, + &selftest_iommu_dev->dev, NULL, "%s", + dev_name(&selftest_iommu_dev->dev)); + if (rc) + goto err_bus; + + rc = iommu_device_register_bus(&mock_iommu_device, &mock_ops, + &iommufd_mock_bus_type.bus, + &iommufd_mock_bus_type.nb); + if (rc) + goto err_sysfs; + return 0; + +err_sysfs: + iommu_device_sysfs_remove(&mock_iommu_device); +err_bus: + bus_unregister(&iommufd_mock_bus_type.bus); +err_platform: + platform_device_del(selftest_iommu_dev); +err_dbgfs: + debugfs_remove_recursive(dbgfs_root); + return rc; } void iommufd_test_exit(void) { + iommu_device_sysfs_remove(&mock_iommu_device); + iommu_device_unregister_bus(&mock_iommu_device, + &iommufd_mock_bus_type.bus, + &iommufd_mock_bus_type.nb); + bus_unregister(&iommufd_mock_bus_type.bus); + platform_device_del(selftest_iommu_dev); debugfs_remove_recursive(dbgfs_root); - bus_unregister(&iommufd_mock_bus_type); } From 65aaca1134025d437882535c9eb54903b9cd09c9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Aug 2023 20:24:34 -0300 Subject: [PATCH 28/34] iommufd: Remove iommufd_ref_to_users() This no longer has any callers, remove the function Kevin noticed that after commit 99f98a7c0d69 ("iommufd: IOMMUFD_DESTROY should not increase the refcount") there was only one other user and it turns out the rework in commit 9227da7816dd ("iommufd: Add iommufd_access_change_ioas(_id) helpers") got rid of the last one. 
Link: https://lore.kernel.org/r/0-v1-abb31bedd888+c1-iommufd_ref_to_users_jgg@nvidia.com Suggested-by: Kevin Tian Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_private.h | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 15596a08a057..5a45b8ba2e26 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -149,29 +149,6 @@ static inline void iommufd_put_object(struct iommufd_object *obj) up_read(&obj->destroy_rwsem); } -/** - * iommufd_ref_to_users() - Switch from destroy_rwsem to users refcount - * protection - * @obj - Object to release - * - * Objects have two refcount protections (destroy_rwsem and the refcount_t - * users). Holding either of these will prevent the object from being destroyed. - * - * Depending on the use case, one protection or the other is appropriate. In - * most cases references are being protected by the destroy_rwsem. This allows - * orderly destruction of the object because iommufd_object_destroy_user() will - * wait for it to become unlocked. However, as a rwsem, it cannot be held across - * a system call return. So cases that have longer term needs must switch - * to the weaker users refcount_t. - * - * With users protection iommufd_object_destroy_user() will return false, - * refusing to destroy the object, causing -EBUSY to userspace. - */ -static inline void iommufd_ref_to_users(struct iommufd_object *obj) -{ - up_read(&obj->destroy_rwsem); - /* iommufd_lock_obj() obtains users as well */ -} void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj); void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj); From 92766e1b953d6e419684b39f55dab574287dd144 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 18 Aug 2023 03:10:29 -0700 Subject: [PATCH 29/34] iommu: Move dev_iommu_ops() to private header dev_iommu_ops() is essentially only used in iommu subsystem, so move to a private header to avoid being abused by other drivers. Link: https://lore.kernel.org/r/20230818101033.4100-2-yi.l.liu@intel.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu-priv.h | 11 +++++++++++ include/linux/iommu.h | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index e3e3b2015854..2024a2313348 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -6,6 +6,17 @@ #include +static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) +{ + /* + * Assume that valid ops must be installed if iommu_probe_device() + * has succeeded. The device ops are essentially for internal use + * within the IOMMU subsystem itself, so we should be able to trust + * ourselves not to misuse the helper. 
+ */ + return dev->iommu->iommu_dev->ops; +} + int iommu_group_replace_domain(struct iommu_group *group, struct iommu_domain *new_domain); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d31642596675..e0245aa82b75 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -450,17 +450,6 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) }; } -static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) -{ - /* - * Assume that valid ops must be installed if iommu_probe_device() - * has succeeded. The device ops are essentially for internal use - * within the IOMMU subsystem itself, so we should be able to trust - * ourselves not to misuse the helper. - */ - return dev->iommu->iommu_dev->ops; -} - extern int bus_iommu_probe(const struct bus_type *bus); extern bool iommu_present(const struct bus_type *bus); extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); From 60fedb262bbc632ab58bfdec7f6e47b2f94992d3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 18 Aug 2023 03:10:30 -0700 Subject: [PATCH 30/34] iommu: Add new iommu op to get iommu hardware information Introduce a new iommu op to get the IOMMU hardware capabilities for iommufd. This information will be used by any vIOMMU driver which is owned by userspace. This op chooses to make the special parameters opaque to the core. This suits the current usage model where accessing any of the IOMMU device special parameters does require a userspace driver that matches the kernel driver. If a need for common parameters, implemented similarly by several drivers, arises then there's room in the design to grow a generic parameter set as well. No wrapper API is added as it is supposed to be used by iommufd only. Different IOMMU hardware would have different hardware information, so the information reported differs as well. To let the external user understand the difference, enum iommu_hw_info_type is defined. IOMMU drivers that can report hardware information should each have a unique iommu_hw_info_type and return it to the caller. For a driver that doesn't report hardware information, the caller just uses IOMMU_HW_INFO_TYPE_NONE if a type is required. Link: https://lore.kernel.org/r/20230818101033.4100-3-yi.l.liu@intel.com Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Co-developed-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 5 +++++ include/uapi/linux/iommufd.h | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e0245aa82b75..bd6a1110b294 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -228,6 +228,10 @@ struct iommu_iotlb_gather { /** * struct iommu_ops - iommu ops and capabilities * @capable: check capability + * @hw_info: report iommu hardware information. The data buffer returned by this + * op is allocated in the iommu driver and freed by the caller after + * use. The information type is one of enum iommu_hw_info_type defined + * in include/uapi/linux/iommufd.h.
* @domain_alloc: allocate iommu domain * @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling @@ -257,6 +261,7 @@ struct iommu_iotlb_gather { */ struct iommu_ops { bool (*capable)(struct device *dev, enum iommu_cap); + void *(*hw_info)(struct device *dev, u32 *length, u32 *type); /* Domain allocation and freeing by the iommu driver */ struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 8245c01adca6..ac11ace21edb 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -370,4 +370,13 @@ struct iommu_hwpt_alloc { __u32 __reserved; }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) + +/** + * enum iommu_hw_info_type - IOMMU Hardware Info Types + * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware + * info + */ +enum iommu_hw_info_type { + IOMMU_HW_INFO_TYPE_NONE, +}; #endif From 55dd4023cead250c89decf1a7a882c94cbf5765a Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 18 Aug 2023 03:10:31 -0700 Subject: [PATCH 31/34] iommufd: Add IOMMU_GET_HW_INFO Under nested IOMMU translation, userspace owns the stage-1 translation table (e.g. the stage-1 page table of Intel VT-d or the context table of ARM SMMUv3, etc.). Stage-1 translation tables are vendor specific, and need to be compatible with the underlying IOMMU hardware. Hence, userspace should know the IOMMU hardware capability before creating the stage-1 translation table and configuring it in the kernel. This adds the IOMMU_GET_HW_INFO ioctl to query the IOMMU hardware information (a.k.a. capability) for a given device. The returned data is vendor specific, and userspace needs to decode it with the structure indicated by the output @out_data_type field. As only physical devices have IOMMU hardware, this will return an error if the given device is not a physical device. Link: https://lore.kernel.org/r/20230818101033.4100-4-yi.l.liu@intel.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Co-developed-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 73 +++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 1 + drivers/iommu/iommufd/main.c | 3 + include/uapi/linux/iommufd.h | 39 +++++++++++++ 4 files changed, 116 insertions(+) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 90f88c295ce0..ce78c3671539 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "../iommu-priv.h" #include "io_pagetable.h" @@ -1119,3 +1120,75 @@ err_out: return rc; } EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD); + +int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) +{ + struct iommu_hw_info *cmd = ucmd->cmd; + void __user *user_ptr = u64_to_user_ptr(cmd->data_uptr); + const struct iommu_ops *ops; + struct iommufd_device *idev; + unsigned int data_len; + unsigned int copy_len; + void *data; + int rc; + + if (cmd->flags || cmd->__reserved) + return -EOPNOTSUPP; + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + ops = dev_iommu_ops(idev->dev); + if (ops->hw_info) { + data = ops->hw_info(idev->dev, &data_len, &cmd->out_data_type); + if (IS_ERR(data)) { + rc = PTR_ERR(data); + goto out_put; + } + + /* + * drivers that have hw_info callback should have a unique + * iommu_hw_info_type.
+ */ + if (WARN_ON_ONCE(cmd->out_data_type == + IOMMU_HW_INFO_TYPE_NONE)) { + rc = -ENODEV; + goto out_free; + } + } else { + cmd->out_data_type = IOMMU_HW_INFO_TYPE_NONE; + data_len = 0; + data = NULL; + } + + copy_len = min(cmd->data_len, data_len); + if (copy_to_user(user_ptr, data, copy_len)) { + rc = -EFAULT; + goto out_free; + } + + /* + * Zero the trailing bytes if the user buffer is bigger than the + * data size kernel actually has. + */ + if (copy_len < cmd->data_len) { + if (clear_user(user_ptr + copy_len, cmd->data_len - copy_len)) { + rc = -EFAULT; + goto out_free; + } + } + + /* + * We return the length the kernel supports so userspace may know what + * the kernel capability is. It could be larger than the input buffer. + */ + cmd->data_len = data_len; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_free: + kfree(data); +out_put: + iommufd_put_object(&idev->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 5a45b8ba2e26..2c58670011fe 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -296,6 +296,7 @@ iommufd_get_device(struct iommufd_ucmd *ucmd, u32 id) } void iommufd_device_destroy(struct iommufd_object *obj); +int iommufd_get_hw_info(struct iommufd_ucmd *ucmd); struct iommufd_access { struct iommufd_object obj; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 5f7e9fa45502..e71523cbd0de 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -305,6 +305,7 @@ static int iommufd_option(struct iommufd_ucmd *ucmd) union ucmd_buffer { struct iommu_destroy destroy; + struct iommu_hw_info info; struct iommu_hwpt_alloc hwpt; struct iommu_ioas_alloc alloc; struct iommu_ioas_allow_iovas allow_iovas; @@ -337,6 +338,8 @@ struct iommufd_ioctl_op { } static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), + IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info, + __reserved), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, __reserved), IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index ac11ace21edb..e01d5ac00dc3 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -46,6 +46,7 @@ enum { IOMMUFD_CMD_OPTION, IOMMUFD_CMD_VFIO_IOAS, IOMMUFD_CMD_HWPT_ALLOC, + IOMMUFD_CMD_GET_HW_INFO, }; /** @@ -379,4 +380,42 @@ struct iommu_hwpt_alloc { enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE, }; + +/** + * struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO) + * @size: sizeof(struct iommu_hw_info) + * @flags: Must be 0 + * @dev_id: The device bound to the iommufd + * @data_len: Input the length of a user buffer in bytes. Output the length of + * data that kernel supports + * @data_uptr: User pointer to a user-space buffer used by the kernel to fill + * the iommu type specific hardware information data + * @out_data_type: Output the iommu hardware info type as defined in the enum + * iommu_hw_info_type. + * @__reserved: Must be 0 + * + * Query an iommu type specific hardware information data from an iommu behind + * a given device that has been bound to iommufd. This hardware info data will + * be used to sync capabilities between the virtual iommu and the physical + * iommu, e.g. 
a nested translation setup needs to check the hardware info, so + * a guest stage-1 page table can be compatible with the physical iommu. + * + * To capture an iommu type specific hardware information data, @data_uptr and + * its length @data_len must be provided. Trailing bytes will be zeroed if the + * user buffer is larger than the data that kernel has. Otherwise, kernel only + * fills the buffer using the given length in @data_len. If the ioctl succeeds, + * @data_len will be updated to the length that kernel actually supports, + * @out_data_type will be filled to decode the data filled in the buffer + * pointed by @data_uptr. Input @data_len == zero is allowed. + */ +struct iommu_hw_info { + __u32 size; + __u32 flags; + __u32 dev_id; + __u32 data_len; + __aligned_u64 data_uptr; + __u32 out_data_type; + __u32 __reserved; +}; +#define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) #endif From af4fde93c3196ed2cd4e64109a9307a7bc471c5a Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 18 Aug 2023 03:10:32 -0700 Subject: [PATCH 32/34] iommufd/selftest: Add coverage for IOMMU_GET_HW_INFO ioctl Add a mock_domain_hw_info function and an iommu_test_hw_info data structure. This allows to test the IOMMU_GET_HW_INFO ioctl passing the test_reg value for the mock_dev. Link: https://lore.kernel.org/r/20230818101033.4100-5-yi.l.liu@intel.com Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_test.h | 9 +++ drivers/iommu/iommufd/selftest.c | 16 +++++ tools/testing/selftests/iommu/iommufd.c | 38 +++++++++++- .../selftests/iommu/iommufd_fail_nth.c | 4 ++ tools/testing/selftests/iommu/iommufd_utils.h | 62 +++++++++++++++++++ 5 files changed, 128 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 258de2253b61..3f3644375bf1 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -100,4 +100,13 @@ struct iommu_test_cmd { }; #define IOMMU_TEST_CMD _IO(IOMMUFD_TYPE, IOMMUFD_CMD_BASE + 32) +/* Mock structs for IOMMU_DEVICE_GET_HW_INFO ioctl */ +#define IOMMU_HW_INFO_TYPE_SELFTEST 0xfeedbeef +#define IOMMU_HW_INFO_SELFTEST_REGVAL 0xdeadbeef + +struct iommu_test_hw_info { + __u32 flags; + __u32 test_reg; +}; + #endif diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 9223ff3f23f9..90e7c5400282 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -131,6 +131,21 @@ static struct iommu_domain mock_blocking_domain = { .ops = &mock_blocking_ops, }; +static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type) +{ + struct iommu_test_hw_info *info; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + info->test_reg = IOMMU_HW_INFO_SELFTEST_REGVAL; + *length = sizeof(*info); + *type = IOMMU_HW_INFO_TYPE_SELFTEST; + + return info; +} + static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) { struct mock_iommu_domain *mock; @@ -290,6 +305,7 @@ static struct iommu_device *mock_probe_device(struct device *dev) static const struct iommu_ops mock_ops = { .owner = THIS_MODULE, .pgsize_bitmap = MOCK_IO_PAGE_SIZE, + .hw_info = mock_domain_hw_info, .domain_alloc = mock_domain_alloc, .capable = mock_domain_capable, .set_platform_dma_ops = mock_domain_set_plaform_dma_ops, diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 
8acd0af37aa5..33d08600be13 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -113,6 +113,7 @@ TEST_F(iommufd, cmd_length) } TEST_LENGTH(iommu_destroy, IOMMU_DESTROY); + TEST_LENGTH(iommu_hw_info, IOMMU_GET_HW_INFO); TEST_LENGTH(iommu_ioas_alloc, IOMMU_IOAS_ALLOC); TEST_LENGTH(iommu_ioas_iova_ranges, IOMMU_IOAS_IOVA_RANGES); TEST_LENGTH(iommu_ioas_allow_iovas, IOMMU_IOAS_ALLOW_IOVAS); @@ -185,6 +186,7 @@ FIXTURE(iommufd_ioas) uint32_t ioas_id; uint32_t stdev_id; uint32_t hwpt_id; + uint32_t device_id; uint64_t base_iova; }; @@ -211,7 +213,7 @@ FIXTURE_SETUP(iommufd_ioas) for (i = 0; i != variant->mock_domains; i++) { test_cmd_mock_domain(self->ioas_id, &self->stdev_id, - &self->hwpt_id, NULL); + &self->hwpt_id, &self->device_id); self->base_iova = MOCK_APERTURE_START; } } @@ -290,6 +292,40 @@ TEST_F(iommufd_ioas, ioas_area_auto_destroy) } } +TEST_F(iommufd_ioas, get_hw_info) +{ + struct iommu_test_hw_info buffer_exact; + struct iommu_test_hw_info_buffer_larger { + struct iommu_test_hw_info info; + uint64_t trailing_bytes; + } buffer_larger; + struct iommu_test_hw_info_buffer_smaller { + __u32 flags; + } buffer_smaller; + + if (self->device_id) { + /* Provide a zero-size user_buffer */ + test_cmd_get_hw_info(self->device_id, NULL, 0); + /* Provide a user_buffer with exact size */ + test_cmd_get_hw_info(self->device_id, &buffer_exact, sizeof(buffer_exact)); + /* + * Provide a user_buffer with size larger than the exact size to check if + * kernel zero the trailing bytes. + */ + test_cmd_get_hw_info(self->device_id, &buffer_larger, sizeof(buffer_larger)); + /* + * Provide a user_buffer with size smaller than the exact size to check if + * the fields within the size range still gets updated. + */ + test_cmd_get_hw_info(self->device_id, &buffer_smaller, sizeof(buffer_smaller)); + } else { + test_err_get_hw_info(ENOENT, self->device_id, + &buffer_exact, sizeof(buffer_exact)); + test_err_get_hw_info(ENOENT, self->device_id, + &buffer_larger, sizeof(buffer_larger)); + } +} + TEST_F(iommufd_ioas, area) { int i; diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index d4c552e56948..a220ca2a689d 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -576,6 +576,7 @@ TEST_FAIL_NTH(basic_fail_nth, access_pin_domain) /* device.c */ TEST_FAIL_NTH(basic_fail_nth, device) { + struct iommu_test_hw_info info; uint32_t ioas_id; uint32_t ioas_id2; uint32_t stdev_id; @@ -611,6 +612,9 @@ TEST_FAIL_NTH(basic_fail_nth, device) &idev_id)) return -1; + if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info))) + return -1; + if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, &hwpt_id)) return -1; diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 70353e68e599..e0753d03ecaa 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -21,6 +21,10 @@ static unsigned long BUFFER_SIZE; static unsigned long PAGE_SIZE; +#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) + /* * Have the kernel check the refcount on pages. 
I don't know why a freshly * mmap'd anon non-compound page starts out with a ref of 3 @@ -348,3 +352,61 @@ static void teardown_iommufd(int fd, struct __test_metadata *_metadata) }) #endif + +/* @data can be NULL */ +static int _test_cmd_get_hw_info(int fd, __u32 device_id, + void *data, size_t data_len) +{ + struct iommu_test_hw_info *info = (struct iommu_test_hw_info *)data; + struct iommu_hw_info cmd = { + .size = sizeof(cmd), + .dev_id = device_id, + .data_len = data_len, + .data_uptr = (uint64_t)data, + }; + int ret; + + ret = ioctl(fd, IOMMU_GET_HW_INFO, &cmd); + if (ret) + return ret; + + assert(cmd.out_data_type == IOMMU_HW_INFO_TYPE_SELFTEST); + + /* + * The struct iommu_test_hw_info should be the one defined + * by the current kernel. + */ + assert(cmd.data_len == sizeof(struct iommu_test_hw_info)); + + /* + * Trailing bytes should be 0 if user buffer is larger than + * the data that kernel reports. + */ + if (data_len > cmd.data_len) { + char *ptr = (char *)(data + cmd.data_len); + int idx = 0; + + while (idx < data_len - cmd.data_len) { + assert(!*(ptr + idx)); + idx++; + } + } + + if (info) { + if (data_len >= offsetofend(struct iommu_test_hw_info, test_reg)) + assert(info->test_reg == IOMMU_HW_INFO_SELFTEST_REGVAL); + if (data_len >= offsetofend(struct iommu_test_hw_info, flags)) + assert(!info->flags); + } + + return 0; +} + +#define test_cmd_get_hw_info(device_id, data, data_len) \ + ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, \ + data, data_len)) + +#define test_err_get_hw_info(_errno, device_id, data, data_len) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_get_hw_info(self->fd, device_id, \ + data, data_len)) From 55243393b06ced5378dcdbb7d51bd8a8edc69294 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 18 Aug 2023 03:10:33 -0700 Subject: [PATCH 33/34] iommu/vt-d: Implement hw_info for iommu capability query Add intel_iommu_hw_info() to report cap_reg and ecap_reg information. 
Link: https://lore.kernel.org/r/20230818101033.4100-6-yi.l.liu@intel.com Signed-off-by: Lu Baolu Acked-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/intel/iommu.c | 19 +++++++++++++++++++ include/uapi/linux/iommufd.h | 23 +++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 5c8c5cdc36cf..9e6f78830ece 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "iommu.h" #include "../dma-iommu.h" @@ -4732,8 +4733,26 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) intel_pasid_tear_down_entry(iommu, dev, pasid, false); } +static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct iommu_hw_info_vtd *vtd; + + vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); + if (!vtd) + return ERR_PTR(-ENOMEM); + + vtd->cap_reg = iommu->cap; + vtd->ecap_reg = iommu->ecap; + *length = sizeof(*vtd); + *type = IOMMU_HW_INFO_TYPE_INTEL_VTD; + return vtd; +} + const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, + .hw_info = intel_iommu_hw_info, .domain_alloc = intel_iommu_domain_alloc, .probe_device = intel_iommu_probe_device, .probe_finalize = intel_iommu_probe_finalize, diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index e01d5ac00dc3..b4ba0c0cbab6 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -372,13 +372,36 @@ struct iommu_hwpt_alloc { }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) +/** + * struct iommu_hw_info_vtd - Intel VT-d hardware information + * + * @flags: Must be 0 + * @__reserved: Must be 0 + * + * @cap_reg: Value of Intel VT-d capability register defined in VT-d spec + * section 11.4.2 Capability Register. + * @ecap_reg: Value of Intel VT-d capability register defined in VT-d spec + * section 11.4.3 Extended Capability Register. + * + * User needs to understand the Intel VT-d specification to decode the + * register value. + */ +struct iommu_hw_info_vtd { + __u32 flags; + __u32 __reserved; + __aligned_u64 cap_reg; + __aligned_u64 ecap_reg; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware * info + * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE, + IOMMU_HW_INFO_TYPE_INTEL_VTD, }; /** From eb501c2d96cfce6b42528e8321ea085ec605e790 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 16 Aug 2023 16:13:18 +0800 Subject: [PATCH 34/34] iommufd/selftest: Don't leak the platform device memory when unloading the module It should call platform_device_unregister() instead of platform_device_del() to unregister and free the device. 
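[Editor's note, based on the platform core rather than this patch: platform_device_unregister() pairs the bus removal with the final reference drop, which is what a device created by platform_device_register_full() needs on teardown; platform_device_del() alone leaves the allocation pinned.]

	/* what the fixed teardown amounts to */
	platform_device_del(selftest_iommu_dev);	/* detach from the platform bus */
	platform_device_put(selftest_iommu_dev);	/* drop the last reference, freeing the device */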
Fixes: 23a1b46f15d5 ("iommufd/selftest: Make the mock iommu driver into a real driver") Link: https://lore.kernel.org/r/20230816081318.1232865-1-yangyingliang@huawei.com Signed-off-by: Yang Yingliang Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/selftest.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 90e7c5400282..56506d5753f1 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -1089,7 +1089,7 @@ err_sysfs: err_bus: bus_unregister(&iommufd_mock_bus_type.bus); err_platform: - platform_device_del(selftest_iommu_dev); + platform_device_unregister(selftest_iommu_dev); err_dbgfs: debugfs_remove_recursive(dbgfs_root); return rc; @@ -1102,6 +1102,6 @@ void iommufd_test_exit(void) &iommufd_mock_bus_type.bus, &iommufd_mock_bus_type.nb); bus_unregister(&iommufd_mock_bus_type.bus); - platform_device_del(selftest_iommu_dev); + platform_device_unregister(selftest_iommu_dev); debugfs_remove_recursive(dbgfs_root); }