From 07321a6d087d4ec9866cfb0c8b53692a59758976 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 19 Jul 2024 18:50:11 +0200 Subject: [PATCH 01/16] hw/vfio/container: Fix SIGSEV on vfio_container_instance_finalize() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In vfio_connect_container's error path, the base container is removed twice form the VFIOAddressSpace QLIST: first on the listener_release_exit label and second, on free_container_exit label, through object_unref(container), which calls vfio_container_instance_finalize(). Let's remove the first instance. Fixes: 938026053f4 ("vfio/container: Switch to QOM") Signed-off-by: Eric Auger Reviewed-by: Cédric Le Goater Reviewed-by: Zhenzhong Duan --- hw/vfio/container.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 38a9df3496..ce9a858e56 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -656,7 +656,6 @@ static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as, return true; listener_release_exit: QLIST_REMOVE(group, container_next); - QLIST_REMOVE(bcontainer, next); vfio_kvm_device_del_group(group); memory_listener_unregister(&bcontainer->listener); if (vioc->release) { From 13e522f644e2b15fa857028a33e6a3b75e45158d Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 19 Jul 2024 13:04:49 +0100 Subject: [PATCH 02/16] vfio/pci: Extract mdev check into an helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to skip initialization of the HostIOMMUDevice for mdev, extract the checks that validate if a device is an mdev into helpers. A vfio_device_is_mdev() is created, and subsystems consult VFIODevice::mdev to check if it's mdev or not. Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger --- hw/vfio/helpers.c | 14 ++++++++++++++ hw/vfio/pci.c | 12 +++--------- include/hw/vfio/vfio-common.h | 2 ++ 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index b14edd46ed..7e23e9080c 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -675,3 +675,17 @@ int vfio_device_get_aw_bits(VFIODevice *vdev) return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX; } + +bool vfio_device_is_mdev(VFIODevice *vbasedev) +{ + g_autofree char *subsys = NULL; + g_autofree char *tmp = NULL; + + if (!vbasedev->sysfsdev) { + return false; + } + + tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); + subsys = realpath(tmp, NULL); + return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); +} diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index e03d9f3ba5..b34e91468a 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2963,12 +2963,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) ERRP_GUARD(); VFIOPCIDevice *vdev = VFIO_PCI(pdev); VFIODevice *vbasedev = &vdev->vbasedev; - char *subsys; int i, ret; - bool is_mdev; char uuid[UUID_STR_LEN]; g_autofree char *name = NULL; - g_autofree char *tmp = NULL; if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { if (!(~vdev->host.domain || ~vdev->host.bus || @@ -2997,14 +2994,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) * stays in sync with the active working set of the guest driver. Prevent * the x-balloon-allowed option unless this is minimally an mdev device. */ - tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); - subsys = realpath(tmp, NULL); - is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); - free(subsys); + vbasedev->mdev = vfio_device_is_mdev(vbasedev); - trace_vfio_mdev(vbasedev->name, is_mdev); + trace_vfio_mdev(vbasedev->name, vbasedev->mdev); - if (vbasedev->ram_block_discard_allowed && !is_mdev) { + if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) { error_setg(errp, "x-balloon-allowed only potentially compatible " "with mdev devices"); goto error; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index e8ddf92bb1..98acae8c1c 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -116,6 +116,7 @@ typedef struct VFIODevice { DeviceState *dev; int fd; int type; + bool mdev; bool reset_works; bool needs_reset; bool no_mmap; @@ -231,6 +232,7 @@ void vfio_region_exit(VFIORegion *region); void vfio_region_finalize(VFIORegion *region); void vfio_reset_handler(void *opaque); struct vfio_device_info *vfio_get_device_info(int fd); +bool vfio_device_is_mdev(VFIODevice *vbasedev); bool vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); void vfio_detach_device(VFIODevice *vbasedev); From 9f17604195c03580b16c3cc5631c86f3894dd442 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 19 Jul 2024 13:04:50 +0100 Subject: [PATCH 03/16] vfio/iommufd: Don't initialize nor set a HOST_IOMMU_DEVICE with mdev mdevs aren't "physical" devices and when asking for backing IOMMU info, it fails the entire provisioning of the guest. Fix that by skipping HostIOMMUDevice initialization in the presence of mdevs, and skip setting an iommu device when it is known to be an mdev. Cc: Zhenzhong Duan Fixes: 930589520128 ("vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler") Signed-off-by: Joao Martins Reviewed-by: Eric Auger Reviewed-by: Zhenzhong Duan --- hw/vfio/common.c | 4 ++++ hw/vfio/pci.c | 11 ++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 6d15b36e0b..d7f02be595 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1548,6 +1548,10 @@ bool vfio_attach_device(char *name, VFIODevice *vbasedev, return false; } + if (vbasedev->mdev) { + return true; + } + hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) { object_unref(hiod); diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index b34e91468a..265d3cb82f 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3115,7 +3115,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bars_register(vdev); - if (!pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { + if (!vbasedev->mdev && + !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { error_prepend(errp, "Failed to set iommu_device: "); goto out_teardown; } @@ -3238,7 +3239,9 @@ out_deregister: timer_free(vdev->intx.mmap_timer); } out_unset_idev: - pci_device_unset_iommu_device(pdev); + if (!vbasedev->mdev) { + pci_device_unset_iommu_device(pdev); + } out_teardown: vfio_teardown_msi(vdev); vfio_bars_exit(vdev); @@ -3283,7 +3286,9 @@ static void vfio_exitfn(PCIDevice *pdev) vfio_pci_disable_rp_atomics(vdev); vfio_bars_exit(vdev); vfio_migration_exit(vbasedev); - pci_device_unset_iommu_device(pdev); + if (!vbasedev->mdev) { + pci_device_unset_iommu_device(pdev); + } } static void vfio_pci_reset(DeviceState *dev) From 2d1bf2589736b3714f3940d360404732ac13019c Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 19 Jul 2024 13:04:51 +0100 Subject: [PATCH 04/16] backends/iommufd: Extend iommufd_backend_get_device_info() to fetch HW capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The helper will be able to fetch vendor agnostic IOMMU capabilities supported both by hardware and software. Right now it is only iommu dirty tracking. Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger --- backends/iommufd.c | 4 +++- hw/vfio/iommufd.c | 4 +++- include/sysemu/iommufd.h | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index cabd1b5002..48dfd39624 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -209,7 +209,7 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, - Error **errp) + uint64_t *caps, Error **errp) { struct iommu_hw_info info = { .size = sizeof(info), @@ -225,6 +225,8 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, g_assert(type); *type = info.out_data_type; + g_assert(caps); + *caps = info.out_capabilities; return true; } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 7b5f87a148..7c1b9e0284 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -628,11 +628,13 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, union { struct iommu_hw_info_vtd vtd; } data; + uint64_t hw_caps; hiod->agent = opaque; if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, - &type, &data, sizeof(data), errp)) { + &type, &data, sizeof(data), + &hw_caps, errp)) { return false; } diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 9edfec6045..57d502a1c7 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -49,7 +49,7 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size); bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, - Error **errp); + uint64_t *caps, Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif From b07dcb7d4f116b157de5aaacfa9d58a496a9ed1d Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 19 Jul 2024 13:04:52 +0100 Subject: [PATCH 05/16] vfio/iommufd: Return errno in iommufd_cdev_attach_ioas_hwpt() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to implement auto domains have the attach function return the errno it got during domain attach instead of a bool. -EINVAL is tracked to track domain incompatibilities, and decide whether to create a new IOMMU domain. Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Reviewed-by: Zhenzhong Duan --- hw/vfio/iommufd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 7c1b9e0284..7390621ee9 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -172,7 +172,7 @@ out: return ret; } -static bool iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id, +static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id, Error **errp) { int iommufd = vbasedev->iommufd->fd; @@ -187,12 +187,12 @@ static bool iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id, error_setg_errno(errp, errno, "[iommufd=%d] error attach %s (%d) to id=%d", iommufd, vbasedev->name, vbasedev->fd, id); - return false; + return -errno; } trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name, vbasedev->fd, id); - return true; + return 0; } static bool iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) @@ -216,7 +216,7 @@ static bool iommufd_cdev_attach_container(VFIODevice *vbasedev, VFIOIOMMUFDContainer *container, Error **errp) { - return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); + return !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); } static void iommufd_cdev_detach_container(VFIODevice *vbasedev, From c598d65aef4016b80608c93e2fba2c46d4c5e42e Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 22 Jul 2024 15:07:12 +0800 Subject: [PATCH 06/16] vfio/ap: Don't initialize HOST_IOMMU_DEVICE with mdev mdevs aren't "physical" devices and when asking for backing IOMMU info, it fails the entire provisioning of the guest. Fix that by setting vbasedev->mdev true so skipping HostIOMMUDevice initialization in the presence of mdevs. Fixes: 930589520128 ("vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler") Signed-off-by: Zhenzhong Duan Reviewed-by: Joao Martins Reviewed-by: Eric Auger --- hw/vfio/ap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 0c4354e3e7..71bf32b83c 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -230,6 +230,9 @@ static void vfio_ap_instance_init(Object *obj) */ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops, DEVICE(vapdev), true); + + /* AP device is mdev type device */ + vbasedev->mdev = true; } #ifdef CONFIG_IOMMUFD From 8b8705e7f20afa545862fcae15c1515b8a832d2d Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 22 Jul 2024 15:07:13 +0800 Subject: [PATCH 07/16] vfio/ccw: Don't initialize HOST_IOMMU_DEVICE with mdev mdevs aren't "physical" devices and when asking for backing IOMMU info, it fails the entire provisioning of the guest. Fix that by setting vbasedev->mdev true so skipping HostIOMMUDevice initialization in the presence of mdevs. Fixes: 930589520128 ("vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler") Signed-off-by: Zhenzhong Duan Reviewed-by: Joao Martins Acked-by: Eric Farman Reviewed-by: Eric Auger --- hw/vfio/ccw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 1f8e1272c7..115862f430 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -675,6 +675,9 @@ static void vfio_ccw_instance_init(Object *obj) VFIOCCWDevice *vcdev = VFIO_CCW(obj); VFIODevice *vbasedev = &vcdev->vdev; + /* CCW device is mdev type device */ + vbasedev->mdev = true; + /* * All vfio-ccw devices are believed to operate in a way compatible with * discarding of memory in RAM blocks, ie. pages pinned in the host are From 5b1e96e654036a65d3edb2671281e1a30c6f5ce7 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:18 +0100 Subject: [PATCH 08/16] vfio/iommufd: Introduce auto domain creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's generally two modes of operation for IOMMUFD: 1) The simple user API which intends to perform relatively simple things with IOMMUs e.g. DPDK. The process generally creates an IOAS and attaches to VFIO and mainly performs IOAS_MAP and UNMAP. 2) The native IOMMUFD API where you have fine grained control of the IOMMU domain and model it accordingly. This is where most new feature are being steered to. For dirty tracking 2) is required, as it needs to ensure that the stage-2/parent IOMMU domain will only attach devices that support dirty tracking (so far it is all homogeneous in x86, likely not the case for smmuv3). Such invariant on dirty tracking provides a useful guarantee to VMMs that will refuse incompatible device attachments for IOMMU domains. Dirty tracking insurance is enforced via HWPT_ALLOC, which is responsible for creating an IOMMU domain. This is contrast to the 'simple API' where the IOMMU domain is created by IOMMUFD automatically when it attaches to VFIO (usually referred as autodomains) but it has the needed handling for mdevs. To support dirty tracking with the advanced IOMMUFD API, it needs similar logic, where IOMMU domains are created and devices attached to compatible domains. Essentially mimicking kernel iommufd_device_auto_get_domain(). With mdevs given there's no IOMMU domain it falls back to IOAS attach. The auto domain logic allows different IOMMU domains to be created when DMA dirty tracking is not desired (and VF can provide it), and others where it is. Here it is not used in this way given how VFIODevice migration state is initialized after the device attachment. But such mixed mode of IOMMU dirty tracking + device dirty tracking is an improvement that can be added on. Keep the 'all of nothing' of type1 approach that we have been using so far between container vs device dirty tracking. Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan [ clg: Added ERRP_GUARD() in iommufd_cdev_autodomains_get() ] Signed-off-by: Cédric Le Goater Reviewed-by: Eric Auger --- backends/iommufd.c | 30 +++++++++++++ backends/trace-events | 1 + hw/vfio/iommufd.c | 85 +++++++++++++++++++++++++++++++++++ include/hw/vfio/vfio-common.h | 9 ++++ include/sysemu/iommufd.h | 5 +++ 5 files changed, 130 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index 48dfd39624..60a3d14bfa 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -207,6 +207,36 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, return ret; } +bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t pt_id, uint32_t flags, + uint32_t data_type, uint32_t data_len, + void *data_ptr, uint32_t *out_hwpt, + Error **errp) +{ + int ret, fd = be->fd; + struct iommu_hwpt_alloc alloc_hwpt = { + .size = sizeof(struct iommu_hwpt_alloc), + .flags = flags, + .dev_id = dev_id, + .pt_id = pt_id, + .data_type = data_type, + .data_len = data_len, + .data_uptr = (uintptr_t)data_ptr, + }; + + ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt); + trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type, + data_len, (uintptr_t)data_ptr, + alloc_hwpt.out_hwpt_id, ret); + if (ret) { + error_setg_errno(errp, errno, "Failed to allocate hwpt"); + return false; + } + + *out_hwpt = alloc_hwpt.out_hwpt_id; + return true; +} + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, uint64_t *caps, Error **errp) diff --git a/backends/trace-events b/backends/trace-events index 211e6f374a..4d8ac02fe7 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -14,4 +14,5 @@ iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas) " iommufd=%d ioas=%d" +iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 7390621ee9..58c11c9308 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -212,10 +212,89 @@ static bool iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) return true; } +static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container, + Error **errp) +{ + ERRP_GUARD(); + IOMMUFDBackend *iommufd = vbasedev->iommufd; + uint32_t flags = 0; + VFIOIOASHwpt *hwpt; + uint32_t hwpt_id; + int ret; + + /* Try to find a domain */ + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (ret) { + /* -EINVAL means the domain is incompatible with the device. */ + if (ret == -EINVAL) { + /* + * It is an expected failure and it just means we will try + * another domain, or create one if no existing compatible + * domain is found. Hence why the error is discarded below. + */ + error_free(*errp); + *errp = NULL; + continue; + } + + return false; + } else { + vbasedev->hwpt = hwpt; + QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + return true; + } + } + + if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, + container->ioas_id, flags, + IOMMU_HWPT_DATA_NONE, 0, NULL, + &hwpt_id, errp)) { + return false; + } + + hwpt = g_malloc0(sizeof(*hwpt)); + hwpt->hwpt_id = hwpt_id; + QLIST_INIT(&hwpt->device_list); + + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (ret) { + iommufd_backend_free_id(container->be, hwpt->hwpt_id); + g_free(hwpt); + return false; + } + + vbasedev->hwpt = hwpt; + QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); + return true; +} + +static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container) +{ + VFIOIOASHwpt *hwpt = vbasedev->hwpt; + + QLIST_REMOVE(vbasedev, hwpt_next); + vbasedev->hwpt = NULL; + + if (QLIST_EMPTY(&hwpt->device_list)) { + QLIST_REMOVE(hwpt, next); + iommufd_backend_free_id(container->be, hwpt->hwpt_id); + g_free(hwpt); + } +} + static bool iommufd_cdev_attach_container(VFIODevice *vbasedev, VFIOIOMMUFDContainer *container, Error **errp) { + /* mdevs aren't physical devices and will fail with auto domains */ + if (!vbasedev->mdev) { + return iommufd_cdev_autodomains_get(vbasedev, container, errp); + } + return !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); } @@ -227,6 +306,11 @@ static void iommufd_cdev_detach_container(VFIODevice *vbasedev, if (!iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) { error_report_err(err); } + + if (vbasedev->hwpt) { + iommufd_cdev_autodomains_put(vbasedev, container); + } + } static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) @@ -354,6 +438,7 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, container = VFIO_IOMMU_IOMMUFD(object_new(TYPE_VFIO_IOMMU_IOMMUFD)); container->be = vbasedev->iommufd; container->ioas_id = ioas_id; + QLIST_INIT(&container->hwpt_list); bcontainer = &container->bcontainer; vfio_address_space_insert(space, bcontainer); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 98acae8c1c..1a96678f8c 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -95,10 +95,17 @@ typedef struct VFIOHostDMAWindow { typedef struct IOMMUFDBackend IOMMUFDBackend; +typedef struct VFIOIOASHwpt { + uint32_t hwpt_id; + QLIST_HEAD(, VFIODevice) device_list; + QLIST_ENTRY(VFIOIOASHwpt) next; +} VFIOIOASHwpt; + typedef struct VFIOIOMMUFDContainer { VFIOContainerBase bcontainer; IOMMUFDBackend *be; uint32_t ioas_id; + QLIST_HEAD(, VFIOIOASHwpt) hwpt_list; } VFIOIOMMUFDContainer; OBJECT_DECLARE_SIMPLE_TYPE(VFIOIOMMUFDContainer, VFIO_IOMMU_IOMMUFD); @@ -135,6 +142,8 @@ typedef struct VFIODevice { HostIOMMUDevice *hiod; int devid; IOMMUFDBackend *iommufd; + VFIOIOASHwpt *hwpt; + QLIST_ENTRY(VFIODevice) hwpt_next; } VFIODevice; struct VFIODeviceOps { diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 57d502a1c7..e917e7591d 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -50,6 +50,11 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, uint64_t *caps, Error **errp); +bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t pt_id, uint32_t flags, + uint32_t data_type, uint32_t data_len, + void *data_ptr, uint32_t *out_hwpt, + Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif From 6c635326425091e164b563a7ce96408ef74ff2ec Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:19 +0100 Subject: [PATCH 09/16] vfio/{iommufd,container}: Remove caps::aw_bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove caps::aw_bits which requires the bcontainer::iova_ranges being initialized after device is actually attached. Instead defer that to .get_cap() and call vfio_device_get_aw_bits() directly. This is in preparation for HostIOMMUDevice::realize() being called early during attach_device(). Suggested-by: Zhenzhong Duan Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger --- backends/iommufd.c | 3 ++- hw/vfio/container.c | 5 +---- hw/vfio/iommufd.c | 1 - include/sysemu/host_iommu_device.h | 3 --- 4 files changed, 3 insertions(+), 9 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index 60a3d14bfa..06b135111f 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -18,6 +18,7 @@ #include "qemu/error-report.h" #include "monitor/monitor.h" #include "trace.h" +#include "hw/vfio/vfio-common.h" #include #include @@ -269,7 +270,7 @@ static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) case HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE: return caps->type; case HOST_IOMMU_DEVICE_CAP_AW_BITS: - return caps->aw_bits; + return vfio_device_get_aw_bits(hiod->agent); default: error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); return -EINVAL; diff --git a/hw/vfio/container.c b/hw/vfio/container.c index ce9a858e56..10cb4b4320 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1141,7 +1141,6 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, VFIODevice *vdev = opaque; hiod->name = g_strdup(vdev->name); - hiod->caps.aw_bits = vfio_device_get_aw_bits(vdev); hiod->agent = opaque; return true; @@ -1150,11 +1149,9 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) { - HostIOMMUDeviceCaps *caps = &hiod->caps; - switch (cap) { case HOST_IOMMU_DEVICE_CAP_AW_BITS: - return caps->aw_bits; + return vfio_device_get_aw_bits(hiod->agent); default: error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); return -EINVAL; diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 58c11c9308..f1e7cf3e9c 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -725,7 +725,6 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, hiod->name = g_strdup(vdev->name); caps->type = type; - caps->aw_bits = vfio_device_get_aw_bits(vdev); return true; } diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index c1bf74ae2c..d1c10ff7c2 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -19,12 +19,9 @@ * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. * * @type: host platform IOMMU type. - * - * @aw_bits: host IOMMU address width. 0xff if no limitation. */ typedef struct HostIOMMUDeviceCaps { uint32_t type; - uint8_t aw_bits; } HostIOMMUDeviceCaps; #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" From 21e8d3a3aa1627079088e2430946dea62bf5c703 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:20 +0100 Subject: [PATCH 10/16] vfio/iommufd: Add hw_caps field to HostIOMMUDeviceCaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Store the value of @caps returned by iommufd_backend_get_device_info() in a new field HostIOMMUDeviceCaps::hw_caps. Right now the only value is whether device IOMMU supports dirty tracking (IOMMU_HW_CAP_DIRTY_TRACKING). This is in preparation for HostIOMMUDevice::realize() being called early during attach_device(). Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger --- hw/vfio/iommufd.c | 1 + include/sysemu/host_iommu_device.h | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index f1e7cf3e9c..fb87e64e44 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -725,6 +725,7 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, hiod->name = g_strdup(vdev->name); caps->type = type; + caps->hw_caps = hw_caps; return true; } diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index d1c10ff7c2..809cced4ba 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -19,9 +19,13 @@ * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. * * @type: host platform IOMMU type. + * + * @hw_caps: host platform IOMMU capabilities (e.g. on IOMMUFD this represents + * the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl) */ typedef struct HostIOMMUDeviceCaps { uint32_t type; + uint64_t hw_caps; } HostIOMMUDeviceCaps; #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" From 83a4d596a93bc22ace521789979f5ccdcd91ab96 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:21 +0100 Subject: [PATCH 11/16] vfio/{iommufd, container}: Invoke HostIOMMUDevice::realize() during attach_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the HostIOMMUDevice::realize() to be invoked during the attach of the device before we allocate IOMMUFD hardware pagetable objects (HWPT). This allows the use of the hw_caps obtained by IOMMU_GET_HW_INFO that essentially tell if the IOMMU behind the device supports dirty tracking. Note: The HostIOMMUDevice data from legacy backend is static and doesn't need any information from the (type1-iommu) backend to be initialized. In contrast however, the IOMMUFD HostIOMMUDevice data requires the iommufd FD to be connected and having a devid to be able to successfully GET_HW_INFO. This means vfio_device_hiod_realize() is called in different places within the backend .attach_device() implementation. Suggested-by: Cédric Le Goater Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater [ clg: Fixed error handling in iommufd_cdev_attach() ] Signed-off-by: Cédric Le Goater Reviewed-by: Eric Auger --- hw/vfio/common.c | 20 ++++++++------------ hw/vfio/container.c | 4 ++++ hw/vfio/helpers.c | 11 +++++++++++ hw/vfio/iommufd.c | 11 +++++++++++ include/hw/vfio/vfio-common.h | 1 + 5 files changed, 35 insertions(+), 12 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d7f02be595..26e74fa430 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1536,7 +1536,7 @@ bool vfio_attach_device(char *name, VFIODevice *vbasedev, { const VFIOIOMMUClass *ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); - HostIOMMUDevice *hiod; + HostIOMMUDevice *hiod = NULL; if (vbasedev->iommufd) { ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); @@ -1544,21 +1544,17 @@ bool vfio_attach_device(char *name, VFIODevice *vbasedev, assert(ops); + + if (!vbasedev->mdev) { + hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); + vbasedev->hiod = hiod; + } + if (!ops->attach_device(name, vbasedev, as, errp)) { - return false; - } - - if (vbasedev->mdev) { - return true; - } - - hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); - if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) { object_unref(hiod); - ops->detach_device(vbasedev); + vbasedev->hiod = NULL; return false; } - vbasedev->hiod = hiod; return true; } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 10cb4b4320..9ccdb639ac 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -914,6 +914,10 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, trace_vfio_attach_device(vbasedev->name, groupid); + if (!vfio_device_hiod_realize(vbasedev, errp)) { + return false; + } + group = vfio_get_group(groupid, as, errp); if (!group) { return false; diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 7e23e9080c..ea15c79db0 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -689,3 +689,14 @@ bool vfio_device_is_mdev(VFIODevice *vbasedev) subsys = realpath(tmp, NULL); return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); } + +bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp) +{ + HostIOMMUDevice *hiod = vbasedev->hiod; + + if (!hiod) { + return true; + } + + return HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp); +} diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index fb87e64e44..798c4798a5 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -404,6 +404,17 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, space = vfio_get_address_space(as); + /* + * The HostIOMMUDevice data from legacy backend is static and doesn't need + * any information from the (type1-iommu) backend to be initialized. In + * contrast however, the IOMMUFD HostIOMMUDevice data requires the iommufd + * FD to be connected and having a devid to be able to successfully call + * iommufd_backend_get_device_info(). + */ + if (!vfio_device_hiod_realize(vbasedev, errp)) { + goto err_alloc_ioas; + } + /* try to attach to an existing container in this space */ QLIST_FOREACH(bcontainer, &space->containers, next) { container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 1a96678f8c..4e44b26d3c 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -242,6 +242,7 @@ void vfio_region_finalize(VFIORegion *region); void vfio_reset_handler(void *opaque); struct vfio_device_info *vfio_get_device_info(int fd); bool vfio_device_is_mdev(VFIODevice *vbasedev); +bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp); bool vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); void vfio_detach_device(VFIODevice *vbasedev); From dddfd8d6670d2d63b851b0c976c3ae08592f4339 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:22 +0100 Subject: [PATCH 12/16] vfio/iommufd: Probe and request hwpt dirty tracking capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to using the dirty tracking UAPI, probe whether the IOMMU supports dirty tracking. This is done via the data stored in hiod::caps::hw_caps initialized from GET_HW_INFO. Qemu doesn't know if VF dirty tracking is supported when allocating hardware pagetable in iommufd_cdev_autodomains_get(). This is because VFIODevice migration state hasn't been initialized *yet* hence it can't pick between VF dirty tracking vs IOMMU dirty tracking. So, if IOMMU supports dirty tracking it always creates HWPTs with IOMMU_HWPT_ALLOC_DIRTY_TRACKING even if later on VFIOMigration decides to use VF dirty tracking instead. Signed-off-by: Joao Martins [ clg: - Fixed vbasedev->iommu_dirty_tracking assignment in iommufd_cdev_autodomains_get() - Added warning for heterogeneous dirty page tracking support in iommufd_cdev_autodomains_get() ] Signed-off-by: Cédric Le Goater Reviewed-by: Zhenzhong Duan --- hw/vfio/iommufd.c | 26 ++++++++++++++++++++++++++ include/hw/vfio/vfio-common.h | 2 ++ 2 files changed, 28 insertions(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 798c4798a5..240c476eaf 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -110,6 +110,11 @@ static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev) iommufd_backend_disconnect(vbasedev->iommufd); } +static bool iommufd_hwpt_dirty_tracking(VFIOIOASHwpt *hwpt) +{ + return hwpt && hwpt->hwpt_flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; +} + static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) { ERRP_GUARD(); @@ -243,10 +248,22 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, } else { vbasedev->hwpt = hwpt; QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); return true; } } + /* + * This is quite early and VFIO Migration state isn't yet fully + * initialized, thus rely only on IOMMU hardware capabilities as to + * whether IOMMU dirty tracking is going to be requested. Later + * vfio_migration_realize() may decide to use VF dirty tracking + * instead. + */ + if (vbasedev->hiod->caps.hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) { + flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + } + if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, container->ioas_id, flags, IOMMU_HWPT_DATA_NONE, 0, NULL, @@ -256,6 +273,7 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, hwpt = g_malloc0(sizeof(*hwpt)); hwpt->hwpt_id = hwpt_id; + hwpt->hwpt_flags = flags; QLIST_INIT(&hwpt->device_list); ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); @@ -266,8 +284,16 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, } vbasedev->hwpt = hwpt; + vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); + container->bcontainer.dirty_pages_supported |= + vbasedev->iommu_dirty_tracking; + if (container->bcontainer.dirty_pages_supported && + !vbasedev->iommu_dirty_tracking) { + warn_report("IOMMU instance for device %s doesn't support dirty tracking", + vbasedev->name); + } return true; } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 4e44b26d3c..1e02c98b09 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -97,6 +97,7 @@ typedef struct IOMMUFDBackend IOMMUFDBackend; typedef struct VFIOIOASHwpt { uint32_t hwpt_id; + uint32_t hwpt_flags; QLIST_HEAD(, VFIODevice) device_list; QLIST_ENTRY(VFIOIOASHwpt) next; } VFIOIOASHwpt; @@ -139,6 +140,7 @@ typedef struct VFIODevice { OnOffAuto pre_copy_dirty_page_tracking; bool dirty_pages_supported; bool dirty_tracking; + bool iommu_dirty_tracking; HostIOMMUDevice *hiod; int devid; IOMMUFDBackend *iommufd; From 52ce88229c2d63a223f4c822240e84c3daeb7f6e Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:23 +0100 Subject: [PATCH 13/16] vfio/iommufd: Implement VFIOIOMMUClass::set_dirty_tracking support ioctl(iommufd, IOMMU_HWPT_SET_DIRTY_TRACKING, arg) is the UAPI that enables or disables dirty page tracking. The ioctl is used if the hwpt has been created with dirty tracking supported domain (stored in hwpt::flags) and it is called on the whole list of iommu domains. Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger --- backends/iommufd.c | 23 +++++++++++++++++++++++ backends/trace-events | 1 + hw/vfio/iommufd.c | 32 ++++++++++++++++++++++++++++++++ include/sysemu/iommufd.h | 2 ++ 4 files changed, 58 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index 06b135111f..b978835038 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -238,6 +238,29 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, return true; } +bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, + uint32_t hwpt_id, bool start, + Error **errp) +{ + int ret; + struct iommu_hwpt_set_dirty_tracking set_dirty = { + .size = sizeof(set_dirty), + .hwpt_id = hwpt_id, + .flags = start ? IOMMU_HWPT_DIRTY_TRACKING_ENABLE : 0, + }; + + ret = ioctl(be->fd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set_dirty); + trace_iommufd_backend_set_dirty(be->fd, hwpt_id, start, ret ? errno : 0); + if (ret) { + error_setg_errno(errp, errno, + "IOMMU_HWPT_SET_DIRTY_TRACKING(hwpt_id %u) failed", + hwpt_id); + return false; + } + + return true; +} + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, uint64_t *caps, Error **errp) diff --git a/backends/trace-events b/backends/trace-events index 4d8ac02fe7..28aca3b859 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -16,3 +16,4 @@ iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t si iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas) " iommufd=%d ioas=%d" iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" +iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 240c476eaf..e39fbf4dd6 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -115,6 +115,37 @@ static bool iommufd_hwpt_dirty_tracking(VFIOIOASHwpt *hwpt) return hwpt && hwpt->hwpt_flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; } +static int iommufd_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, + bool start, Error **errp) +{ + const VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + VFIOIOASHwpt *hwpt; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + + if (!iommufd_backend_set_dirty_tracking(container->be, + hwpt->hwpt_id, start, errp)) { + goto err; + } + } + + return 0; + +err: + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + iommufd_backend_set_dirty_tracking(container->be, + hwpt->hwpt_id, !start, NULL); + } + return -EINVAL; +} + static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) { ERRP_GUARD(); @@ -739,6 +770,7 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->attach_device = iommufd_cdev_attach; vioc->detach_device = iommufd_cdev_detach; vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; + vioc->set_dirty_page_tracking = iommufd_set_dirty_page_tracking; }; static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index e917e7591d..6fb412f611 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -55,6 +55,8 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, uint32_t data_type, uint32_t data_len, void *data_ptr, uint32_t *out_hwpt, Error **errp); +bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, + bool start, Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif From 7c30710bd99510b1ce91a1b287118c759c55a495 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:24 +0100 Subject: [PATCH 14/16] vfio/iommufd: Implement VFIOIOMMUClass::query_dirty_bitmap support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ioctl(iommufd, IOMMU_HWPT_GET_DIRTY_BITMAP, arg) is the UAPI that fetches the bitmap that tells what was dirty in an IOVA range. A single bitmap is allocated and used across all the hwpts sharing an IOAS which is then used in log_sync() to set Qemu global bitmaps. Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Reviewed-by: Zhenzhong Duan --- backends/iommufd.c | 29 +++++++++++++++++++++++++++++ backends/trace-events | 1 + hw/vfio/iommufd.c | 28 ++++++++++++++++++++++++++++ include/sysemu/iommufd.h | 4 ++++ 4 files changed, 62 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index b978835038..9bc466a89c 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -261,6 +261,35 @@ bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, return true; } +bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, + uint32_t hwpt_id, + uint64_t iova, ram_addr_t size, + uint64_t page_size, uint64_t *data, + Error **errp) +{ + int ret; + struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap = { + .size = sizeof(get_dirty_bitmap), + .hwpt_id = hwpt_id, + .iova = iova, + .length = size, + .page_size = page_size, + .data = (uintptr_t)data, + }; + + ret = ioctl(be->fd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get_dirty_bitmap); + trace_iommufd_backend_get_dirty_bitmap(be->fd, hwpt_id, iova, size, + page_size, ret ? errno : 0); + if (ret) { + error_setg_errno(errp, errno, + "IOMMU_HWPT_GET_DIRTY_BITMAP (iova: 0x%"HWADDR_PRIx + " size: 0x"RAM_ADDR_FMT") failed", iova, size); + return false; + } + + return true; +} + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, uint64_t *caps, Error **errp) diff --git a/backends/trace-events b/backends/trace-events index 28aca3b859..40811a3162 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -17,3 +17,4 @@ iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas) " iommufd=%d ioas=%d" iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" +iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index e39fbf4dd6..e7bece4ea1 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -25,6 +25,7 @@ #include "qemu/cutils.h" #include "qemu/chardev_open.h" #include "pci.h" +#include "exec/ram_addr.h" static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly) @@ -146,6 +147,32 @@ err: return -EINVAL; } +static int iommufd_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size, Error **errp) +{ + VFIOIOMMUFDContainer *container = container_of(bcontainer, + VFIOIOMMUFDContainer, + bcontainer); + unsigned long page_size = qemu_real_host_page_size(); + VFIOIOASHwpt *hwpt; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + + if (!iommufd_backend_get_dirty_bitmap(container->be, hwpt->hwpt_id, + iova, size, page_size, + (uint64_t *)vbmap->bitmap, + errp)) { + return -EINVAL; + } + } + + return 0; +} + static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) { ERRP_GUARD(); @@ -771,6 +798,7 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->detach_device = iommufd_cdev_detach; vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; vioc->set_dirty_page_tracking = iommufd_set_dirty_page_tracking; + vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; }; static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 6fb412f611..4c4886c778 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -57,6 +57,10 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, Error **errp); bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, bool start, Error **errp); +bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, + uint64_t iova, ram_addr_t size, + uint64_t page_size, uint64_t *data, + Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif From f48b472450efd086dec2e32dc93c882a62a53649 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:25 +0100 Subject: [PATCH 15/16] vfio/migration: Don't block migration device dirty tracking is unsupported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By default VFIO migration is set to auto, which will support live migration if the migration capability is set *and* also dirty page tracking is supported. For testing purposes one can force enable without dirty page tracking via enable-migration=on, but that option is generally left for testing purposes. So starting with IOMMU dirty tracking it can use to accommodate the lack of VF dirty page tracking allowing us to minimize the VF requirements for migration and thus enabling migration by default for those too. While at it change the error messages to mention IOMMU dirty tracking as well. Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger [ clg: - spelling in commit log ] Signed-off-by: Cédric Le Goater --- hw/vfio/migration.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 34d4be2ce1..cbfaef7aff 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -1036,16 +1036,16 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp) return !vfio_block_migration(vbasedev, err, errp); } - if (!vbasedev->dirty_pages_supported) { + if (!vbasedev->dirty_pages_supported && !vbasedev->iommu_dirty_tracking) { if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) { error_setg(&err, - "%s: VFIO device doesn't support device dirty tracking", - vbasedev->name); + "%s: VFIO device doesn't support device and " + "IOMMU dirty tracking", vbasedev->name); goto add_blocker; } - warn_report("%s: VFIO device doesn't support device dirty tracking", - vbasedev->name); + warn_report("%s: VFIO device doesn't support device and " + "IOMMU dirty tracking", vbasedev->name); } ret = vfio_block_multiple_devices_migration(vbasedev, errp); From 30b9167785177ac43d11b881fe321918124aeb88 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:26 +0100 Subject: [PATCH 16/16] vfio/common: Allow disabling device dirty page tracking The property 'x-pre-copy-dirty-page-tracking' allows disabling the whole tracking of VF pre-copy phase of dirty page tracking, though it means that it will only be used at the start of the switchover phase. Add an option that disables the VF dirty page tracking, and fall back into container-based dirty page tracking. This also allows to use IOMMU dirty tracking even on VFs with their own dirty tracker scheme. Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan --- hw/vfio/common.c | 3 +++ hw/vfio/migration.c | 4 +++- hw/vfio/pci.c | 3 +++ include/hw/vfio/vfio-common.h | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 26e74fa430..17a5865476 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -199,6 +199,9 @@ bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) VFIODevice *vbasedev; QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) { + return false; + } if (!vbasedev->dirty_pages_supported) { return false; } diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index cbfaef7aff..262d42a46e 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -1036,7 +1036,9 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp) return !vfio_block_migration(vbasedev, err, errp); } - if (!vbasedev->dirty_pages_supported && !vbasedev->iommu_dirty_tracking) { + if ((!vbasedev->dirty_pages_supported || + vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) && + !vbasedev->iommu_dirty_tracking) { if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) { error_setg(&err, "%s: VFIO device doesn't support device and " diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 265d3cb82f..2407720c35 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3361,6 +3361,9 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, vbasedev.pre_copy_dirty_page_tracking, ON_OFF_AUTO_ON), + DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice, + vbasedev.device_dirty_page_tracking, + ON_OFF_AUTO_ON), DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, display, ON_OFF_AUTO_OFF), DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 1e02c98b09..fed499b199 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -138,6 +138,7 @@ typedef struct VFIODevice { VFIOMigration *migration; Error *migration_blocker; OnOffAuto pre_copy_dirty_page_tracking; + OnOffAuto device_dirty_page_tracking; bool dirty_pages_supported; bool dirty_tracking; bool iommu_dirty_tracking;