From 912b625b4dcf23f6c16a950227715c75ef027e7b Mon Sep 17 00:00:00 2001 From: Oleksandr Natalenko Date: Thu, 4 May 2023 15:16:54 +0200 Subject: [PATCH 01/19] vfio/pci: demote hiding ecap messages to debug level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seeing a burst of messages like this: vfio-pci 0000:98:00.0: vfio_ecap_init: hiding ecap 0x19@0x1d0 vfio-pci 0000:98:00.0: vfio_ecap_init: hiding ecap 0x25@0x200 vfio-pci 0000:98:00.0: vfio_ecap_init: hiding ecap 0x26@0x210 vfio-pci 0000:98:00.0: vfio_ecap_init: hiding ecap 0x27@0x250 vfio-pci 0000:98:00.1: vfio_ecap_init: hiding ecap 0x25@0x200 vfio-pci 0000:b1:00.0: vfio_ecap_init: hiding ecap 0x19@0x1d0 vfio-pci 0000:b1:00.0: vfio_ecap_init: hiding ecap 0x25@0x200 vfio-pci 0000:b1:00.0: vfio_ecap_init: hiding ecap 0x26@0x210 vfio-pci 0000:b1:00.0: vfio_ecap_init: hiding ecap 0x27@0x250 vfio-pci 0000:b1:00.1: vfio_ecap_init: hiding ecap 0x25@0x200 is of little to no value for an ordinary user. Hence, use pci_dbg() instead of pci_info(). Signed-off-by: Oleksandr Natalenko Acked-by: Cédric Le Goater Tested-by: YangHang Liu Link: https://lore.kernel.org/r/20230504131654.24922-1-oleksandr@natalenko.name Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_config.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 948cdd464f4e..1d95fe435f0e 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -1643,8 +1643,8 @@ static int vfio_ecap_init(struct vfio_pci_core_device *vdev) } if (!len) { - pci_info(pdev, "%s: hiding ecap %#x@%#x\n", - __func__, ecap, epos); + pci_dbg(pdev, "%s: hiding ecap %#x@%#x\n", + __func__, ecap, epos); /* If not the first in the chain, we can skip over it */ if (prev) { From a65f35cfd504e5135540939cffd4323083190b36 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 11 May 2023 08:44:28 -0700 Subject: [PATCH 02/19] vfio/pci: Consolidate irq cleanup on MSI/MSI-X disable vfio_msi_disable() releases all previously allocated state associated with each interrupt before disabling MSI/MSI-X. vfio_msi_disable() iterates twice over the interrupt state: first directly with a for loop to do virqfd cleanup, followed by another for loop within vfio_msi_set_block() that removes the interrupt handler and its associated state using vfio_msi_set_vector_signal(). Simplify interrupt cleanup by iterating over allocated interrupts once. Signed-off-by: Reinette Chatre Reviewed-by: Kevin Tian Acked-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/837acb8cbe86a258a50da05e56a1f17c1a19abbe.1683740667.git.reinette.chatre@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_intrs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index bffb0741518b..6a9c6a143cc3 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -426,10 +426,9 @@ static void vfio_msi_disable(struct vfio_pci_core_device *vdev, bool msix) for (i = 0; i < vdev->num_ctx; i++) { vfio_virqfd_disable(&vdev->ctx[i].unmask); vfio_virqfd_disable(&vdev->ctx[i].mask); + vfio_msi_set_vector_signal(vdev, i, -1, msix); } - vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix); - cmd = vfio_pci_memory_lock_and_enable(vdev); pci_free_irq_vectors(pdev); vfio_pci_memory_unlock_and_restore(vdev, cmd); From 6578ed85c7d63693669bfede01e0237d0e24211a Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 11 May 2023 08:44:29 -0700 Subject: [PATCH 03/19] vfio/pci: Remove negative check on unsigned vector User space provides the vector as an unsigned int that is checked early for validity (vfio_set_irqs_validate_and_prepare()). A later negative check of the provided vector is not necessary. Remove the negative check and ensure the type used for the vector is consistent as an unsigned int. Signed-off-by: Reinette Chatre Reviewed-by: Kevin Tian Acked-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/28521e1b0b091849952b0ecb8c118729fc8cdc4f.1683740667.git.reinette.chatre@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_intrs.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 6a9c6a143cc3..258de57ef956 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -317,14 +317,14 @@ static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msi } static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, - int vector, int fd, bool msix) + unsigned int vector, int fd, bool msix) { struct pci_dev *pdev = vdev->pdev; struct eventfd_ctx *trigger; int irq, ret; u16 cmd; - if (vector < 0 || vector >= vdev->num_ctx) + if (vector >= vdev->num_ctx) return -EINVAL; irq = pci_irq_vector(pdev, vector); @@ -399,7 +399,8 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, static int vfio_msi_set_block(struct vfio_pci_core_device *vdev, unsigned start, unsigned count, int32_t *fds, bool msix) { - int i, j, ret = 0; + unsigned int i, j; + int ret = 0; if (start >= vdev->num_ctx || start + count > vdev->num_ctx) return -EINVAL; @@ -410,8 +411,8 @@ static int vfio_msi_set_block(struct vfio_pci_core_device *vdev, unsigned start, } if (ret) { - for (--j; j >= (int)start; j--) - vfio_msi_set_vector_signal(vdev, j, -1, msix); + for (i = start; i < j; i++) + vfio_msi_set_vector_signal(vdev, i, -1, msix); } return ret; @@ -420,7 +421,7 @@ static int vfio_msi_set_block(struct vfio_pci_core_device *vdev, unsigned start, static void vfio_msi_disable(struct vfio_pci_core_device *vdev, bool msix) { struct pci_dev *pdev = vdev->pdev; - int i; + unsigned int i; u16 cmd; for (i = 0; i < vdev->num_ctx; i++) { @@ -542,7 +543,7 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { - int i; + unsigned int i; bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX) ? true : false; if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) { From d977e0f7663961368f6442589e52d27484c2f5c2 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 11 May 2023 08:44:30 -0700 Subject: [PATCH 04/19] vfio/pci: Prepare for dynamic interrupt context storage Interrupt context storage is statically allocated at the time interrupts are allocated. Following allocation, the interrupt context is managed by directly accessing the elements of the array using the vector as index. It is possible to allocate additional MSI-X vectors after MSI-X has been enabled. Dynamic storage of interrupt context is needed to support adding new MSI-X vectors after initial allocation. Replace direct access of array elements with pointers to the array elements. Doing so reduces impact of moving to a new data structure. Move interactions with the array to helpers to mostly contain changes needed to transition to a dynamic data structure. No functional change intended. Signed-off-by: Reinette Chatre Reviewed-by: Kevin Tian Acked-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/eab289693c8325ede9aba99380f8b8d5143980a4.1683740667.git.reinette.chatre@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_intrs.c | 215 +++++++++++++++++++++--------- 1 file changed, 149 insertions(+), 66 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 258de57ef956..6094679349d9 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -48,6 +48,31 @@ static bool is_irq_none(struct vfio_pci_core_device *vdev) vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX); } +static +struct vfio_pci_irq_ctx *vfio_irq_ctx_get(struct vfio_pci_core_device *vdev, + unsigned long index) +{ + if (index >= vdev->num_ctx) + return NULL; + return &vdev->ctx[index]; +} + +static void vfio_irq_ctx_free_all(struct vfio_pci_core_device *vdev) +{ + kfree(vdev->ctx); +} + +static int vfio_irq_ctx_alloc_num(struct vfio_pci_core_device *vdev, + unsigned long num) +{ + vdev->ctx = kcalloc(num, sizeof(struct vfio_pci_irq_ctx), + GFP_KERNEL_ACCOUNT); + if (!vdev->ctx) + return -ENOMEM; + + return 0; +} + /* * INTx */ @@ -55,14 +80,21 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused) { struct vfio_pci_core_device *vdev = opaque; - if (likely(is_intx(vdev) && !vdev->virq_disabled)) - eventfd_signal(vdev->ctx[0].trigger, 1); + if (likely(is_intx(vdev) && !vdev->virq_disabled)) { + struct vfio_pci_irq_ctx *ctx; + + ctx = vfio_irq_ctx_get(vdev, 0); + if (WARN_ON_ONCE(!ctx)) + return; + eventfd_signal(ctx->trigger, 1); + } } /* Returns true if the INTx vfio_pci_irq_ctx.masked value is changed. */ bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; + struct vfio_pci_irq_ctx *ctx; unsigned long flags; bool masked_changed = false; @@ -77,7 +109,14 @@ bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) if (unlikely(!is_intx(vdev))) { if (vdev->pci_2_3) pci_intx(pdev, 0); - } else if (!vdev->ctx[0].masked) { + goto out_unlock; + } + + ctx = vfio_irq_ctx_get(vdev, 0); + if (WARN_ON_ONCE(!ctx)) + goto out_unlock; + + if (!ctx->masked) { /* * Can't use check_and_mask here because we always want to * mask, not just when something is pending. @@ -87,10 +126,11 @@ bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) else disable_irq_nosync(pdev->irq); - vdev->ctx[0].masked = true; + ctx->masked = true; masked_changed = true; } +out_unlock: spin_unlock_irqrestore(&vdev->irqlock, flags); return masked_changed; } @@ -105,6 +145,7 @@ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused) { struct vfio_pci_core_device *vdev = opaque; struct pci_dev *pdev = vdev->pdev; + struct vfio_pci_irq_ctx *ctx; unsigned long flags; int ret = 0; @@ -117,7 +158,14 @@ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused) if (unlikely(!is_intx(vdev))) { if (vdev->pci_2_3) pci_intx(pdev, 1); - } else if (vdev->ctx[0].masked && !vdev->virq_disabled) { + goto out_unlock; + } + + ctx = vfio_irq_ctx_get(vdev, 0); + if (WARN_ON_ONCE(!ctx)) + goto out_unlock; + + if (ctx->masked && !vdev->virq_disabled) { /* * A pending interrupt here would immediately trigger, * but we can avoid that overhead by just re-sending @@ -129,9 +177,10 @@ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused) } else enable_irq(pdev->irq); - vdev->ctx[0].masked = (ret > 0); + ctx->masked = (ret > 0); } +out_unlock: spin_unlock_irqrestore(&vdev->irqlock, flags); return ret; @@ -146,18 +195,23 @@ void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) static irqreturn_t vfio_intx_handler(int irq, void *dev_id) { struct vfio_pci_core_device *vdev = dev_id; + struct vfio_pci_irq_ctx *ctx; unsigned long flags; int ret = IRQ_NONE; + ctx = vfio_irq_ctx_get(vdev, 0); + if (WARN_ON_ONCE(!ctx)) + return ret; + spin_lock_irqsave(&vdev->irqlock, flags); if (!vdev->pci_2_3) { disable_irq_nosync(vdev->pdev->irq); - vdev->ctx[0].masked = true; + ctx->masked = true; ret = IRQ_HANDLED; - } else if (!vdev->ctx[0].masked && /* may be shared */ + } else if (!ctx->masked && /* may be shared */ pci_check_and_mask_intx(vdev->pdev)) { - vdev->ctx[0].masked = true; + ctx->masked = true; ret = IRQ_HANDLED; } @@ -171,15 +225,24 @@ static irqreturn_t vfio_intx_handler(int irq, void *dev_id) static int vfio_intx_enable(struct vfio_pci_core_device *vdev) { + struct vfio_pci_irq_ctx *ctx; + int ret; + if (!is_irq_none(vdev)) return -EINVAL; if (!vdev->pdev->irq) return -ENODEV; - vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL_ACCOUNT); - if (!vdev->ctx) - return -ENOMEM; + ret = vfio_irq_ctx_alloc_num(vdev, 1); + if (ret) + return ret; + + ctx = vfio_irq_ctx_get(vdev, 0); + if (!ctx) { + vfio_irq_ctx_free_all(vdev); + return -EINVAL; + } vdev->num_ctx = 1; @@ -189,9 +252,9 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev) * here, non-PCI-2.3 devices will have to wait until the * interrupt is enabled. */ - vdev->ctx[0].masked = vdev->virq_disabled; + ctx->masked = vdev->virq_disabled; if (vdev->pci_2_3) - pci_intx(vdev->pdev, !vdev->ctx[0].masked); + pci_intx(vdev->pdev, !ctx->masked); vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX; @@ -202,41 +265,46 @@ static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd) { struct pci_dev *pdev = vdev->pdev; unsigned long irqflags = IRQF_SHARED; + struct vfio_pci_irq_ctx *ctx; struct eventfd_ctx *trigger; unsigned long flags; int ret; - if (vdev->ctx[0].trigger) { + ctx = vfio_irq_ctx_get(vdev, 0); + if (WARN_ON_ONCE(!ctx)) + return -EINVAL; + + if (ctx->trigger) { free_irq(pdev->irq, vdev); - kfree(vdev->ctx[0].name); - eventfd_ctx_put(vdev->ctx[0].trigger); - vdev->ctx[0].trigger = NULL; + kfree(ctx->name); + eventfd_ctx_put(ctx->trigger); + ctx->trigger = NULL; } if (fd < 0) /* Disable only */ return 0; - vdev->ctx[0].name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", - pci_name(pdev)); - if (!vdev->ctx[0].name) + ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", + pci_name(pdev)); + if (!ctx->name) return -ENOMEM; trigger = eventfd_ctx_fdget(fd); if (IS_ERR(trigger)) { - kfree(vdev->ctx[0].name); + kfree(ctx->name); return PTR_ERR(trigger); } - vdev->ctx[0].trigger = trigger; + ctx->trigger = trigger; if (!vdev->pci_2_3) irqflags = 0; ret = request_irq(pdev->irq, vfio_intx_handler, - irqflags, vdev->ctx[0].name, vdev); + irqflags, ctx->name, vdev); if (ret) { - vdev->ctx[0].trigger = NULL; - kfree(vdev->ctx[0].name); + ctx->trigger = NULL; + kfree(ctx->name); eventfd_ctx_put(trigger); return ret; } @@ -246,7 +314,7 @@ static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd) * disable_irq won't. */ spin_lock_irqsave(&vdev->irqlock, flags); - if (!vdev->pci_2_3 && vdev->ctx[0].masked) + if (!vdev->pci_2_3 && ctx->masked) disable_irq_nosync(pdev->irq); spin_unlock_irqrestore(&vdev->irqlock, flags); @@ -255,12 +323,18 @@ static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd) static void vfio_intx_disable(struct vfio_pci_core_device *vdev) { - vfio_virqfd_disable(&vdev->ctx[0].unmask); - vfio_virqfd_disable(&vdev->ctx[0].mask); + struct vfio_pci_irq_ctx *ctx; + + ctx = vfio_irq_ctx_get(vdev, 0); + WARN_ON_ONCE(!ctx); + if (ctx) { + vfio_virqfd_disable(&ctx->unmask); + vfio_virqfd_disable(&ctx->mask); + } vfio_intx_set_signal(vdev, -1); vdev->irq_type = VFIO_PCI_NUM_IRQS; vdev->num_ctx = 0; - kfree(vdev->ctx); + vfio_irq_ctx_free_all(vdev); } /* @@ -284,10 +358,9 @@ static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msi if (!is_irq_none(vdev)) return -EINVAL; - vdev->ctx = kcalloc(nvec, sizeof(struct vfio_pci_irq_ctx), - GFP_KERNEL_ACCOUNT); - if (!vdev->ctx) - return -ENOMEM; + ret = vfio_irq_ctx_alloc_num(vdev, nvec); + if (ret) + return ret; /* return the number of supported vectors if we can't get all: */ cmd = vfio_pci_memory_lock_and_enable(vdev); @@ -296,7 +369,7 @@ static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msi if (ret > 0) pci_free_irq_vectors(pdev); vfio_pci_memory_unlock_and_restore(vdev, cmd); - kfree(vdev->ctx); + vfio_irq_ctx_free_all(vdev); return ret; } vfio_pci_memory_unlock_and_restore(vdev, cmd); @@ -320,6 +393,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, unsigned int vector, int fd, bool msix) { struct pci_dev *pdev = vdev->pdev; + struct vfio_pci_irq_ctx *ctx; struct eventfd_ctx *trigger; int irq, ret; u16 cmd; @@ -327,33 +401,33 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, if (vector >= vdev->num_ctx) return -EINVAL; + ctx = vfio_irq_ctx_get(vdev, vector); + if (!ctx) + return -EINVAL; irq = pci_irq_vector(pdev, vector); - if (vdev->ctx[vector].trigger) { - irq_bypass_unregister_producer(&vdev->ctx[vector].producer); + if (ctx->trigger) { + irq_bypass_unregister_producer(&ctx->producer); cmd = vfio_pci_memory_lock_and_enable(vdev); - free_irq(irq, vdev->ctx[vector].trigger); + free_irq(irq, ctx->trigger); vfio_pci_memory_unlock_and_restore(vdev, cmd); - - kfree(vdev->ctx[vector].name); - eventfd_ctx_put(vdev->ctx[vector].trigger); - vdev->ctx[vector].trigger = NULL; + kfree(ctx->name); + eventfd_ctx_put(ctx->trigger); + ctx->trigger = NULL; } if (fd < 0) return 0; - vdev->ctx[vector].name = kasprintf(GFP_KERNEL_ACCOUNT, - "vfio-msi%s[%d](%s)", - msix ? "x" : "", vector, - pci_name(pdev)); - if (!vdev->ctx[vector].name) + ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-msi%s[%d](%s)", + msix ? "x" : "", vector, pci_name(pdev)); + if (!ctx->name) return -ENOMEM; trigger = eventfd_ctx_fdget(fd); if (IS_ERR(trigger)) { - kfree(vdev->ctx[vector].name); + kfree(ctx->name); return PTR_ERR(trigger); } @@ -372,26 +446,25 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, pci_write_msi_msg(irq, &msg); } - ret = request_irq(irq, vfio_msihandler, 0, - vdev->ctx[vector].name, trigger); + ret = request_irq(irq, vfio_msihandler, 0, ctx->name, trigger); vfio_pci_memory_unlock_and_restore(vdev, cmd); if (ret) { - kfree(vdev->ctx[vector].name); + kfree(ctx->name); eventfd_ctx_put(trigger); return ret; } - vdev->ctx[vector].producer.token = trigger; - vdev->ctx[vector].producer.irq = irq; - ret = irq_bypass_register_producer(&vdev->ctx[vector].producer); + ctx->producer.token = trigger; + ctx->producer.irq = irq; + ret = irq_bypass_register_producer(&ctx->producer); if (unlikely(ret)) { dev_info(&pdev->dev, "irq bypass producer (token %p) registration fails: %d\n", - vdev->ctx[vector].producer.token, ret); + ctx->producer.token, ret); - vdev->ctx[vector].producer.token = NULL; + ctx->producer.token = NULL; } - vdev->ctx[vector].trigger = trigger; + ctx->trigger = trigger; return 0; } @@ -421,13 +494,17 @@ static int vfio_msi_set_block(struct vfio_pci_core_device *vdev, unsigned start, static void vfio_msi_disable(struct vfio_pci_core_device *vdev, bool msix) { struct pci_dev *pdev = vdev->pdev; + struct vfio_pci_irq_ctx *ctx; unsigned int i; u16 cmd; for (i = 0; i < vdev->num_ctx; i++) { - vfio_virqfd_disable(&vdev->ctx[i].unmask); - vfio_virqfd_disable(&vdev->ctx[i].mask); - vfio_msi_set_vector_signal(vdev, i, -1, msix); + ctx = vfio_irq_ctx_get(vdev, i); + if (ctx) { + vfio_virqfd_disable(&ctx->unmask); + vfio_virqfd_disable(&ctx->mask); + vfio_msi_set_vector_signal(vdev, i, -1, msix); + } } cmd = vfio_pci_memory_lock_and_enable(vdev); @@ -443,7 +520,7 @@ static void vfio_msi_disable(struct vfio_pci_core_device *vdev, bool msix) vdev->irq_type = VFIO_PCI_NUM_IRQS; vdev->num_ctx = 0; - kfree(vdev->ctx); + vfio_irq_ctx_free_all(vdev); } /* @@ -463,14 +540,18 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev, if (unmask) vfio_pci_intx_unmask(vdev); } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + struct vfio_pci_irq_ctx *ctx = vfio_irq_ctx_get(vdev, 0); int32_t fd = *(int32_t *)data; + + if (WARN_ON_ONCE(!ctx)) + return -EINVAL; if (fd >= 0) return vfio_virqfd_enable((void *) vdev, vfio_pci_intx_unmask_handler, vfio_send_intx_eventfd, NULL, - &vdev->ctx[0].unmask, fd); + &ctx->unmask, fd); - vfio_virqfd_disable(&vdev->ctx[0].unmask); + vfio_virqfd_disable(&ctx->unmask); } return 0; @@ -543,6 +624,7 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { + struct vfio_pci_irq_ctx *ctx; unsigned int i; bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX) ? true : false; @@ -577,14 +659,15 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, return -EINVAL; for (i = start; i < start + count; i++) { - if (!vdev->ctx[i].trigger) + ctx = vfio_irq_ctx_get(vdev, i); + if (!ctx || !ctx->trigger) continue; if (flags & VFIO_IRQ_SET_DATA_NONE) { - eventfd_signal(vdev->ctx[i].trigger, 1); + eventfd_signal(ctx->trigger, 1); } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { uint8_t *bools = data; if (bools[i - start]) - eventfd_signal(vdev->ctx[i].trigger, 1); + eventfd_signal(ctx->trigger, 1); } } return 0; From 8850336588fbcccdca484b91631819eabaafd915 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 11 May 2023 08:44:31 -0700 Subject: [PATCH 05/19] vfio/pci: Move to single error path Enabling and disabling of an interrupt involves several steps that can fail. Cleanup after failure is done when the error is encountered, resulting in some repetitive code. Support for dynamic contexts will introduce more steps during interrupt enabling and disabling. Transition to centralized exit path in preparation for dynamic contexts to eliminate duplicate error handling code. Signed-off-by: Reinette Chatre Reviewed-by: Kevin Tian Acked-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/72dddae8aa710ce522a74130120733af61cffe4d.1683740667.git.reinette.chatre@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_intrs.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 6094679349d9..96396e1ad085 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -427,8 +427,8 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, trigger = eventfd_ctx_fdget(fd); if (IS_ERR(trigger)) { - kfree(ctx->name); - return PTR_ERR(trigger); + ret = PTR_ERR(trigger); + goto out_free_name; } /* @@ -448,11 +448,8 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, ret = request_irq(irq, vfio_msihandler, 0, ctx->name, trigger); vfio_pci_memory_unlock_and_restore(vdev, cmd); - if (ret) { - kfree(ctx->name); - eventfd_ctx_put(trigger); - return ret; - } + if (ret) + goto out_put_eventfd_ctx; ctx->producer.token = trigger; ctx->producer.irq = irq; @@ -467,6 +464,12 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, ctx->trigger = trigger; return 0; + +out_put_eventfd_ctx: + eventfd_ctx_put(trigger); +out_free_name: + kfree(ctx->name); + return ret; } static int vfio_msi_set_block(struct vfio_pci_core_device *vdev, unsigned start, From b156e48fffa9f1caea490e4812a1451adb5c0ef4 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 11 May 2023 08:44:32 -0700 Subject: [PATCH 06/19] vfio/pci: Use xarray for interrupt context storage Interrupt context is statically allocated at the time interrupts are allocated. Following allocation, the context is managed by directly accessing the elements of the array using the vector as index. The storage is released when interrupts are disabled. It is possible to dynamically allocate a single MSI-X interrupt after MSI-X is enabled. A dynamic storage for interrupt context is needed to support this. Replace the interrupt context array with an xarray (similar to what the core uses as store for MSI descriptors) that can support the dynamic expansion while maintaining the custom that uses the vector as index. With a dynamic storage it is no longer required to pre-allocate interrupt contexts at the time the interrupts are allocated. MSI and MSI-X interrupt contexts are only used when interrupts are enabled. Their allocation can thus be delayed until interrupt enabling. Only enabled interrupts will have associated interrupt contexts. Whether an interrupt has been allocated (a Linux irq number exists for it) becomes the criteria for whether an interrupt can be enabled. Signed-off-by: Reinette Chatre Link: https://lore.kernel.org/lkml/20230404122444.59e36a99.alex.williamson@redhat.com/ Reviewed-by: Kevin Tian Acked-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/40e235f38d427aff79ae35eda0ced42502aa0937.1683740667.git.reinette.chatre@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 1 + drivers/vfio/pci/vfio_pci_intrs.c | 99 ++++++++++++++++--------------- include/linux/vfio_pci_core.h | 2 +- 3 files changed, 52 insertions(+), 50 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index a5ab416cf476..ae0e161c7fc9 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2102,6 +2102,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) INIT_LIST_HEAD(&vdev->vma_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); init_rwsem(&vdev->memory_lock); + xa_init(&vdev->ctx); return 0; } diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 96396e1ad085..77957274027c 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -52,25 +52,33 @@ static struct vfio_pci_irq_ctx *vfio_irq_ctx_get(struct vfio_pci_core_device *vdev, unsigned long index) { - if (index >= vdev->num_ctx) + return xa_load(&vdev->ctx, index); +} + +static void vfio_irq_ctx_free(struct vfio_pci_core_device *vdev, + struct vfio_pci_irq_ctx *ctx, unsigned long index) +{ + xa_erase(&vdev->ctx, index); + kfree(ctx); +} + +static struct vfio_pci_irq_ctx * +vfio_irq_ctx_alloc(struct vfio_pci_core_device *vdev, unsigned long index) +{ + struct vfio_pci_irq_ctx *ctx; + int ret; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); + if (!ctx) return NULL; - return &vdev->ctx[index]; -} -static void vfio_irq_ctx_free_all(struct vfio_pci_core_device *vdev) -{ - kfree(vdev->ctx); -} + ret = xa_insert(&vdev->ctx, index, ctx, GFP_KERNEL_ACCOUNT); + if (ret) { + kfree(ctx); + return NULL; + } -static int vfio_irq_ctx_alloc_num(struct vfio_pci_core_device *vdev, - unsigned long num) -{ - vdev->ctx = kcalloc(num, sizeof(struct vfio_pci_irq_ctx), - GFP_KERNEL_ACCOUNT); - if (!vdev->ctx) - return -ENOMEM; - - return 0; + return ctx; } /* @@ -226,7 +234,6 @@ static irqreturn_t vfio_intx_handler(int irq, void *dev_id) static int vfio_intx_enable(struct vfio_pci_core_device *vdev) { struct vfio_pci_irq_ctx *ctx; - int ret; if (!is_irq_none(vdev)) return -EINVAL; @@ -234,15 +241,9 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev) if (!vdev->pdev->irq) return -ENODEV; - ret = vfio_irq_ctx_alloc_num(vdev, 1); - if (ret) - return ret; - - ctx = vfio_irq_ctx_get(vdev, 0); - if (!ctx) { - vfio_irq_ctx_free_all(vdev); - return -EINVAL; - } + ctx = vfio_irq_ctx_alloc(vdev, 0); + if (!ctx) + return -ENOMEM; vdev->num_ctx = 1; @@ -334,7 +335,7 @@ static void vfio_intx_disable(struct vfio_pci_core_device *vdev) vfio_intx_set_signal(vdev, -1); vdev->irq_type = VFIO_PCI_NUM_IRQS; vdev->num_ctx = 0; - vfio_irq_ctx_free_all(vdev); + vfio_irq_ctx_free(vdev, ctx, 0); } /* @@ -358,10 +359,6 @@ static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msi if (!is_irq_none(vdev)) return -EINVAL; - ret = vfio_irq_ctx_alloc_num(vdev, nvec); - if (ret) - return ret; - /* return the number of supported vectors if we can't get all: */ cmd = vfio_pci_memory_lock_and_enable(vdev); ret = pci_alloc_irq_vectors(pdev, 1, nvec, flag); @@ -369,7 +366,6 @@ static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msi if (ret > 0) pci_free_irq_vectors(pdev); vfio_pci_memory_unlock_and_restore(vdev, cmd); - vfio_irq_ctx_free_all(vdev); return ret; } vfio_pci_memory_unlock_and_restore(vdev, cmd); @@ -401,12 +397,13 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, if (vector >= vdev->num_ctx) return -EINVAL; - ctx = vfio_irq_ctx_get(vdev, vector); - if (!ctx) - return -EINVAL; irq = pci_irq_vector(pdev, vector); + if (irq < 0) + return -EINVAL; - if (ctx->trigger) { + ctx = vfio_irq_ctx_get(vdev, vector); + + if (ctx) { irq_bypass_unregister_producer(&ctx->producer); cmd = vfio_pci_memory_lock_and_enable(vdev); @@ -414,16 +411,22 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, vfio_pci_memory_unlock_and_restore(vdev, cmd); kfree(ctx->name); eventfd_ctx_put(ctx->trigger); - ctx->trigger = NULL; + vfio_irq_ctx_free(vdev, ctx, vector); } if (fd < 0) return 0; + ctx = vfio_irq_ctx_alloc(vdev, vector); + if (!ctx) + return -ENOMEM; + ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-msi%s[%d](%s)", msix ? "x" : "", vector, pci_name(pdev)); - if (!ctx->name) - return -ENOMEM; + if (!ctx->name) { + ret = -ENOMEM; + goto out_free_ctx; + } trigger = eventfd_ctx_fdget(fd); if (IS_ERR(trigger)) { @@ -469,6 +472,8 @@ out_put_eventfd_ctx: eventfd_ctx_put(trigger); out_free_name: kfree(ctx->name); +out_free_ctx: + vfio_irq_ctx_free(vdev, ctx, vector); return ret; } @@ -498,16 +503,13 @@ static void vfio_msi_disable(struct vfio_pci_core_device *vdev, bool msix) { struct pci_dev *pdev = vdev->pdev; struct vfio_pci_irq_ctx *ctx; - unsigned int i; + unsigned long i; u16 cmd; - for (i = 0; i < vdev->num_ctx; i++) { - ctx = vfio_irq_ctx_get(vdev, i); - if (ctx) { - vfio_virqfd_disable(&ctx->unmask); - vfio_virqfd_disable(&ctx->mask); - vfio_msi_set_vector_signal(vdev, i, -1, msix); - } + xa_for_each(&vdev->ctx, i, ctx) { + vfio_virqfd_disable(&ctx->unmask); + vfio_virqfd_disable(&ctx->mask); + vfio_msi_set_vector_signal(vdev, i, -1, msix); } cmd = vfio_pci_memory_lock_and_enable(vdev); @@ -523,7 +525,6 @@ static void vfio_msi_disable(struct vfio_pci_core_device *vdev, bool msix) vdev->irq_type = VFIO_PCI_NUM_IRQS; vdev->num_ctx = 0; - vfio_irq_ctx_free_all(vdev); } /* @@ -663,7 +664,7 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, for (i = start; i < start + count; i++) { ctx = vfio_irq_ctx_get(vdev, i); - if (!ctx || !ctx->trigger) + if (!ctx) continue; if (flags & VFIO_IRQ_SET_DATA_NONE) { eventfd_signal(ctx->trigger, 1); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 367fd79226a3..61d7873a3973 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -59,7 +59,7 @@ struct vfio_pci_core_device { struct perm_bits *msi_perm; spinlock_t irqlock; struct mutex igate; - struct vfio_pci_irq_ctx *ctx; + struct xarray ctx; int num_ctx; int irq_type; int num_regions; From 63972f63a63f9c3b113cac34dc8692a7c9ae671d Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 11 May 2023 08:44:33 -0700 Subject: [PATCH 07/19] vfio/pci: Remove interrupt context counter struct vfio_pci_core_device::num_ctx counts how many interrupt contexts have been allocated. When all interrupt contexts are allocated simultaneously num_ctx provides the upper bound of all vectors that can be used as indices into the interrupt context array. With the upcoming support for dynamic MSI-X the number of interrupt contexts does not necessarily span the range of allocated interrupts. Consequently, num_ctx is no longer a trusted upper bound for valid indices. Stop using num_ctx to determine if a provided vector is valid. Use the existence of allocated interrupt. This changes behavior on the error path when user space provides an invalid vector range. Behavior changes from early exit without any modifications to possible modifications to valid vectors within the invalid range. This is acceptable considering that an invalid range is not a valid scenario, see link to discussion. The checks that ensure that user space provides a range of vectors that is valid for the device are untouched. Signed-off-by: Reinette Chatre Link: https://lore.kernel.org/lkml/20230316155646.07ae266f.alex.williamson@redhat.com/ Reviewed-by: Kevin Tian Acked-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/e27d350f02a65b8cbacd409b4321f5ce35b3186d.1683740667.git.reinette.chatre@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_intrs.c | 13 +------------ include/linux/vfio_pci_core.h | 1 - 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 77957274027c..e40eca69a293 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -245,8 +245,6 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev) if (!ctx) return -ENOMEM; - vdev->num_ctx = 1; - /* * If the virtual interrupt is masked, restore it. Devices * supporting DisINTx can be masked at the hardware level @@ -334,7 +332,6 @@ static void vfio_intx_disable(struct vfio_pci_core_device *vdev) } vfio_intx_set_signal(vdev, -1); vdev->irq_type = VFIO_PCI_NUM_IRQS; - vdev->num_ctx = 0; vfio_irq_ctx_free(vdev, ctx, 0); } @@ -370,7 +367,6 @@ static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msi } vfio_pci_memory_unlock_and_restore(vdev, cmd); - vdev->num_ctx = nvec; vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX; @@ -394,9 +390,6 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, int irq, ret; u16 cmd; - if (vector >= vdev->num_ctx) - return -EINVAL; - irq = pci_irq_vector(pdev, vector); if (irq < 0) return -EINVAL; @@ -483,9 +476,6 @@ static int vfio_msi_set_block(struct vfio_pci_core_device *vdev, unsigned start, unsigned int i, j; int ret = 0; - if (start >= vdev->num_ctx || start + count > vdev->num_ctx) - return -EINVAL; - for (i = 0, j = start; i < count && !ret; i++, j++) { int fd = fds ? fds[i] : -1; ret = vfio_msi_set_vector_signal(vdev, j, fd, msix); @@ -524,7 +514,6 @@ static void vfio_msi_disable(struct vfio_pci_core_device *vdev, bool msix) pci_intx(pdev, 0); vdev->irq_type = VFIO_PCI_NUM_IRQS; - vdev->num_ctx = 0; } /* @@ -659,7 +648,7 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, return ret; } - if (!irq_is(vdev, index) || start + count > vdev->num_ctx) + if (!irq_is(vdev, index)) return -EINVAL; for (i = start; i < start + count; i++) { diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 61d7873a3973..148fd1ae6c1c 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -60,7 +60,6 @@ struct vfio_pci_core_device { spinlock_t irqlock; struct mutex igate; struct xarray ctx; - int num_ctx; int irq_type; int num_regions; struct vfio_pci_region *region; From 9387cf59dc6f987db875148dd596c45bc60813f8 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 11 May 2023 08:44:34 -0700 Subject: [PATCH 08/19] vfio/pci: Update stale comment In preparation for surrounding code change it is helpful to ensure that existing comments are accurate. Remove inaccurate comment about direct access and update the rest of the comment to reflect the purpose of writing the cached MSI message to the device. Suggested-by: Alex Williamson Link: https://lore.kernel.org/lkml/20230330164050.0069e2a5.alex.williamson@redhat.com/ Signed-off-by: Reinette Chatre Reviewed-by: Kevin Tian Acked-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5b605ce7dcdab5a5dfef19cec4d73ae2fdad3ae1.1683740667.git.reinette.chatre@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_intrs.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index e40eca69a293..867327e159c1 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -428,11 +428,9 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, } /* - * The MSIx vector table resides in device memory which may be cleared - * via backdoor resets. We don't allow direct access to the vector - * table so even if a userspace driver attempts to save/restore around - * such a reset it would be unsuccessful. To avoid this, restore the - * cached value of the message prior to enabling. + * If the vector was previously allocated, refresh the on-device + * message data before enabling in case it had been cleared or + * corrupted (e.g. due to backdoor resets) since writing. */ cmd = vfio_pci_memory_lock_and_enable(vdev); if (msix) { From 9cd0f6d5cbb6fda09aa83beb8146c287a552017e Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 11 May 2023 08:44:35 -0700 Subject: [PATCH 09/19] vfio/pci: Use bitfield for struct vfio_pci_core_device flags struct vfio_pci_core_device contains eleven boolean flags. Boolean flags clearly indicate their usage but space usage starts to be a concern when there are many. An upcoming change adds another boolean flag to struct vfio_pci_core_device, thereby increasing the concern that the boolean flags are consuming unnecessary space. Transition the boolean flags to use bitfields. On a system that uses one byte per boolean this reduces the space consumed by existing flags from 11 bytes to 2 bytes with room for a few more flags without increasing the structure's size. Suggested-by: Jason Gunthorpe Signed-off-by: Reinette Chatre Reviewed-by: Kevin Tian Acked-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/cf34bf0499c889554a8105eeb18cc0ab673005be.1683740667.git.reinette.chatre@intel.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 148fd1ae6c1c..adb47e2914d7 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -68,17 +68,17 @@ struct vfio_pci_core_device { u16 msix_size; u32 msix_offset; u32 rbar[7]; - bool pci_2_3; - bool virq_disabled; - bool reset_works; - bool extended_caps; - bool bardirty; - bool has_vga; - bool needs_reset; - bool nointx; - bool needs_pm_restore; - bool pm_intx_masked; - bool pm_runtime_engaged; + bool pci_2_3:1; + bool virq_disabled:1; + bool reset_works:1; + bool extended_caps:1; + bool bardirty:1; + bool has_vga:1; + bool needs_reset:1; + bool nointx:1; + bool needs_pm_restore:1; + bool pm_intx_masked:1; + bool pm_runtime_engaged:1; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; From dd27a707003818fc8435d8621527d4b3af7d2ab1 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 11 May 2023 08:44:36 -0700 Subject: [PATCH 10/19] vfio/pci: Probe and store ability to support dynamic MSI-X Not all MSI-X devices support dynamic MSI-X allocation. Whether a device supports dynamic MSI-X should be queried using pci_msix_can_alloc_dyn(). Instead of scattering code with pci_msix_can_alloc_dyn(), probe this ability once and store it as a property of the virtual device. Suggested-by: Alex Williamson Signed-off-by: Reinette Chatre Reviewed-by: Kevin Tian Acked-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/f1ae022c060ecb7e527f4f53c8ccafe80768da47.1683740667.git.reinette.chatre@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 5 ++++- include/linux/vfio_pci_core.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index ae0e161c7fc9..a3635a8e54c8 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -530,8 +530,11 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; - } else + vdev->has_dyn_msix = pci_msix_can_alloc_dyn(pdev); + } else { vdev->msix_bar = 0xFF; + vdev->has_dyn_msix = false; + } if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) vdev->has_vga = true; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index adb47e2914d7..562e8754869d 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -68,6 +68,7 @@ struct vfio_pci_core_device { u16 msix_size; u32 msix_offset; u32 rbar[7]; + bool has_dyn_msix:1; bool pci_2_3:1; bool virq_disabled:1; bool reset_works:1; From e4163438e01583194d043c07adce326b29786f94 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 11 May 2023 08:44:37 -0700 Subject: [PATCH 11/19] vfio/pci: Support dynamic MSI-X pci_msix_alloc_irq_at() enables an individual MSI-X interrupt to be allocated after MSI-X enabling. Use dynamic MSI-X (if supported by the device) to allocate an interrupt after MSI-X is enabled. An MSI-X interrupt is dynamically allocated at the time a valid eventfd is assigned. This is different behavior from a range provided during MSI-X enabling where interrupts are allocated for the entire range whether a valid eventfd is provided for each interrupt or not. The PCI-MSIX API requires that some number of irqs are allocated for an initial set of vectors when enabling MSI-X on the device. When dynamic MSIX allocation is not supported, the vector table, and thus the allocated irq set can only be resized by disabling and re-enabling MSI-X with a different range. In that case the irq allocation is essentially a cache for configuring vectors within the previously allocated vector range. When dynamic MSI-X allocation is supported, the API still requires some initial set of irqs to be allocated, but also supports allocating and freeing specific irq vectors both within and beyond the initially allocated range. For consistency between modes, as well as to reduce latency and improve reliability of allocations, and also simplicity, this implementation only releases irqs via pci_free_irq_vectors() when either the interrupt mode changes or the device is released. Signed-off-by: Reinette Chatre Link: https://lore.kernel.org/lkml/20230403211841.0e206b67.alex.williamson@redhat.com/ Reviewed-by: Kevin Tian Acked-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/956c47057ae9fd45591feaa82e9ae20929889249.1683740667.git.reinette.chatre@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_intrs.c | 47 +++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 867327e159c1..cbb4bcbfbf83 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -381,27 +381,55 @@ static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msi return 0; } +/* + * vfio_msi_alloc_irq() returns the Linux IRQ number of an MSI or MSI-X device + * interrupt vector. If a Linux IRQ number is not available then a new + * interrupt is allocated if dynamic MSI-X is supported. + * + * Where is vfio_msi_free_irq()? Allocated interrupts are maintained, + * essentially forming a cache that subsequent allocations can draw from. + * Interrupts are freed using pci_free_irq_vectors() when MSI/MSI-X is + * disabled. + */ +static int vfio_msi_alloc_irq(struct vfio_pci_core_device *vdev, + unsigned int vector, bool msix) +{ + struct pci_dev *pdev = vdev->pdev; + struct msi_map map; + int irq; + u16 cmd; + + irq = pci_irq_vector(pdev, vector); + if (WARN_ON_ONCE(irq == 0)) + return -EINVAL; + if (irq > 0 || !msix || !vdev->has_dyn_msix) + return irq; + + cmd = vfio_pci_memory_lock_and_enable(vdev); + map = pci_msix_alloc_irq_at(pdev, vector, NULL); + vfio_pci_memory_unlock_and_restore(vdev, cmd); + + return map.index < 0 ? map.index : map.virq; +} + static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, unsigned int vector, int fd, bool msix) { struct pci_dev *pdev = vdev->pdev; struct vfio_pci_irq_ctx *ctx; struct eventfd_ctx *trigger; - int irq, ret; + int irq = -EINVAL, ret; u16 cmd; - irq = pci_irq_vector(pdev, vector); - if (irq < 0) - return -EINVAL; - ctx = vfio_irq_ctx_get(vdev, vector); if (ctx) { irq_bypass_unregister_producer(&ctx->producer); - + irq = pci_irq_vector(pdev, vector); cmd = vfio_pci_memory_lock_and_enable(vdev); free_irq(irq, ctx->trigger); vfio_pci_memory_unlock_and_restore(vdev, cmd); + /* Interrupt stays allocated, will be freed at MSI-X disable. */ kfree(ctx->name); eventfd_ctx_put(ctx->trigger); vfio_irq_ctx_free(vdev, ctx, vector); @@ -410,6 +438,13 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, if (fd < 0) return 0; + if (irq == -EINVAL) { + /* Interrupt stays allocated, will be freed at MSI-X disable. */ + irq = vfio_msi_alloc_irq(vdev, vector, msix); + if (irq < 0) + return irq; + } + ctx = vfio_irq_ctx_alloc(vdev, vector); if (!ctx) return -ENOMEM; From 6c8017c6a58d06c2fcce3b034944ad056ccf02ce Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 11 May 2023 08:44:38 -0700 Subject: [PATCH 12/19] vfio/pci: Clear VFIO_IRQ_INFO_NORESIZE for MSI-X Dynamic MSI-X is supported. Clear VFIO_IRQ_INFO_NORESIZE to provide guidance to user space. Signed-off-by: Reinette Chatre Reviewed-by: Kevin Tian Acked-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/fd1ef2bf6ae972da8e2805bc95d5155af5a8fb0a.1683740667.git.reinette.chatre@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 2 +- include/uapi/linux/vfio.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index a3635a8e54c8..ec7e662de033 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1114,7 +1114,7 @@ static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, if (info.index == VFIO_PCI_INTX_IRQ_INDEX) info.flags |= (VFIO_IRQ_INFO_MASKABLE | VFIO_IRQ_INFO_AUTOMASKED); - else + else if (info.index != VFIO_PCI_MSIX_IRQ_INDEX || !vdev->has_dyn_msix) info.flags |= VFIO_IRQ_INFO_NORESIZE; return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 0552e8dcf0cb..1a36134cae5c 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -511,6 +511,9 @@ struct vfio_region_info_cap_nvlink2_lnkspd { * then add and unmask vectors, it's up to userspace to make the decision * whether to allocate the maximum supported number of vectors or tear * down setup and incrementally increase the vectors as each is enabled. + * Absence of the NORESIZE flag indicates that vectors can be enabled + * and disabled dynamically without impacting other vectors within the + * index. */ struct vfio_irq_info { __u32 argsz; From d9824f70e52c736498c9177688cee5aa789e560c Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 23 May 2023 16:52:50 -0600 Subject: [PATCH 13/19] vfio/pci: Also demote hiding standard cap messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply the same logic as commit 912b625b4dcf ("vfio/pci: demote hiding ecap messages to debug level") for the less common case of hiding standard capabilities. Reviewed-by: Oleksandr Natalenko Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20230523225250.1215911-1-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_config.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 1d95fe435f0e..7e2e62ab0869 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -1566,8 +1566,8 @@ static int vfio_cap_init(struct vfio_pci_core_device *vdev) } if (!len) { - pci_info(pdev, "%s: hiding cap %#x@%#x\n", __func__, - cap, pos); + pci_dbg(pdev, "%s: hiding cap %#x@%#x\n", __func__, + cap, pos); *prev = next; pos = next; continue; From a5bfe22db2a4a1ae467f31cfa1d72043eb9f1877 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 19 May 2023 15:47:48 -0600 Subject: [PATCH 14/19] vfio/pci-core: Add capability for AtomicOp completer support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test and enable PCIe AtomicOp completer support of various widths and report via device-info capability to userspace. Reviewed-by: Cédric Le Goater Reviewed-by: Robin Voetter Tested-by: Robin Voetter Link: https://lore.kernel.org/r/20230519214748.402003-1-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 38 ++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 14 ++++++++++++ 2 files changed, 52 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index ec7e662de033..20d7b69ea6ff 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -885,6 +885,37 @@ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, } EXPORT_SYMBOL_GPL(vfio_pci_core_register_dev_region); +static int vfio_pci_info_atomic_cap(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ + struct vfio_device_info_cap_pci_atomic_comp cap = { + .header.id = VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP, + .header.version = 1 + }; + struct pci_dev *pdev = pci_physfn(vdev->pdev); + u32 devcap2; + + pcie_capability_read_dword(pdev, PCI_EXP_DEVCAP2, &devcap2); + + if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP32) && + !pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32)) + cap.flags |= VFIO_PCI_ATOMIC_COMP32; + + if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP64) && + !pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64)) + cap.flags |= VFIO_PCI_ATOMIC_COMP64; + + if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP128) && + !pci_enable_atomic_ops_to_root(pdev, + PCI_EXP_DEVCAP2_ATOMIC_COMP128)) + cap.flags |= VFIO_PCI_ATOMIC_COMP128; + + if (!cap.flags) + return -ENODEV; + + return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); +} + static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, struct vfio_device_info __user *arg) { @@ -923,6 +954,13 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, return ret; } + ret = vfio_pci_info_atomic_cap(vdev, &caps); + if (ret && ret != -ENODEV) { + pci_warn(vdev->pdev, + "Failed to setup AtomicOps info capability\n"); + return ret; + } + if (caps.size) { info.flags |= VFIO_DEVICE_FLAGS_CAPS; if (info.argsz < sizeof(info) + caps.size) { diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 1a36134cae5c..4f48bad09a37 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -240,6 +240,20 @@ struct vfio_device_info { #define VFIO_DEVICE_INFO_CAP_ZPCI_UTIL 3 #define VFIO_DEVICE_INFO_CAP_ZPCI_PFIP 4 +/* + * The following VFIO_DEVICE_INFO capability reports support for PCIe AtomicOp + * completion to the root bus with supported widths provided via flags. + */ +#define VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP 5 +struct vfio_device_info_cap_pci_atomic_comp { + struct vfio_info_cap_header header; + __u32 flags; +#define VFIO_PCI_ATOMIC_COMP32 (1 << 0) +#define VFIO_PCI_ATOMIC_COMP64 (1 << 1) +#define VFIO_PCI_ATOMIC_COMP128 (1 << 2) + __u32 reserved; +}; + /** * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, * struct vfio_region_info) From 8cc75183b78e91455a03ad3a1a68cd0612f66446 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 14 Jun 2023 13:39:46 -0600 Subject: [PATCH 15/19] vfio/pci: Cleanup Kconfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It should be possible to select vfio-pci variant drivers without building vfio-pci itself, which implies each variant driver should select vfio-pci-core. Fix the top level vfio Makefile to traverse pci based on vfio-pci-core rather than vfio-pci. Mark MMAP and INTX options depending on vfio-pci-core to cleanup resulting config if core is not enabled. Push all PCI related vfio options to a sub-menu and make descriptions consistent. Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Reviewed-by: Shameer Kolothum Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20230614193948.477036-2-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/Makefile | 2 +- drivers/vfio/pci/Kconfig | 8 ++++++-- drivers/vfio/pci/hisilicon/Kconfig | 4 ++-- drivers/vfio/pci/mlx5/Kconfig | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 70e7dcb302ef..151e816b2ff9 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -10,7 +10,7 @@ vfio-$(CONFIG_VFIO_VIRQFD) += virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o -obj-$(CONFIG_VFIO_PCI) += pci/ +obj-$(CONFIG_VFIO_PCI_CORE) += pci/ obj-$(CONFIG_VFIO_PLATFORM) += platform/ obj-$(CONFIG_VFIO_MDEV) += mdev/ obj-$(CONFIG_VFIO_FSL_MC) += fsl-mc/ diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index f9d0c908e738..86bb7835cf3c 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only -if PCI && MMU +menu "VFIO support for PCI devices" + depends on PCI && MMU + config VFIO_PCI_CORE tristate select VFIO_VIRQFD @@ -7,9 +9,11 @@ config VFIO_PCI_CORE config VFIO_PCI_MMAP def_bool y if !S390 + depends on VFIO_PCI_CORE config VFIO_PCI_INTX def_bool y if !S390 + depends on VFIO_PCI_CORE config VFIO_PCI tristate "Generic VFIO support for any PCI device" @@ -59,4 +63,4 @@ source "drivers/vfio/pci/mlx5/Kconfig" source "drivers/vfio/pci/hisilicon/Kconfig" -endif +endmenu diff --git a/drivers/vfio/pci/hisilicon/Kconfig b/drivers/vfio/pci/hisilicon/Kconfig index 5daa0f45d2f9..cbf1c32f6ebf 100644 --- a/drivers/vfio/pci/hisilicon/Kconfig +++ b/drivers/vfio/pci/hisilicon/Kconfig @@ -1,13 +1,13 @@ # SPDX-License-Identifier: GPL-2.0-only config HISI_ACC_VFIO_PCI - tristate "VFIO PCI support for HiSilicon ACC devices" + tristate "VFIO support for HiSilicon ACC PCI devices" depends on ARM64 || (COMPILE_TEST && 64BIT) - depends on VFIO_PCI_CORE depends on PCI_MSI depends on CRYPTO_DEV_HISI_QM depends on CRYPTO_DEV_HISI_HPRE depends on CRYPTO_DEV_HISI_SEC2 depends on CRYPTO_DEV_HISI_ZIP + select VFIO_PCI_CORE help This provides generic PCI support for HiSilicon ACC devices using the VFIO framework. diff --git a/drivers/vfio/pci/mlx5/Kconfig b/drivers/vfio/pci/mlx5/Kconfig index 29ba9c504a75..7088edc4fb28 100644 --- a/drivers/vfio/pci/mlx5/Kconfig +++ b/drivers/vfio/pci/mlx5/Kconfig @@ -2,7 +2,7 @@ config MLX5_VFIO_PCI tristate "VFIO support for MLX5 PCI devices" depends on MLX5_CORE - depends on VFIO_PCI_CORE + select VFIO_PCI_CORE help This provides migration support for MLX5 devices using the VFIO framework. From 8bee6f00fce25e7a0db85cbb52b19d729d28273e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 14 Jun 2023 13:39:47 -0600 Subject: [PATCH 16/19] vfio/platform: Cleanup Kconfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Like vfio-pci, there's also a base module here where vfio-amba depends on vfio-platform, when really it only needs vfio-platform-base. Create a sub-menu for platform drivers and a nested menu for reset drivers. Cleanup Makefile to make use of new CONFIG_VFIO_PLATFORM_BASE for building the shared modules and traversing reset modules. Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20230614193948.477036-3-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/Makefile | 2 +- drivers/vfio/platform/Kconfig | 18 ++++++++++++++---- drivers/vfio/platform/Makefile | 9 +++------ drivers/vfio/platform/reset/Kconfig | 2 ++ 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 151e816b2ff9..8da44aa1ea16 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -11,6 +11,6 @@ vfio-$(CONFIG_VFIO_VIRQFD) += virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o obj-$(CONFIG_VFIO_PCI_CORE) += pci/ -obj-$(CONFIG_VFIO_PLATFORM) += platform/ +obj-$(CONFIG_VFIO_PLATFORM_BASE) += platform/ obj-$(CONFIG_VFIO_MDEV) += mdev/ obj-$(CONFIG_VFIO_FSL_MC) += fsl-mc/ diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig index 331a5920f5ab..88fcde51f024 100644 --- a/drivers/vfio/platform/Kconfig +++ b/drivers/vfio/platform/Kconfig @@ -1,8 +1,14 @@ # SPDX-License-Identifier: GPL-2.0-only -config VFIO_PLATFORM - tristate "VFIO support for platform devices" +menu "VFIO support for platform devices" depends on ARM || ARM64 || COMPILE_TEST + +config VFIO_PLATFORM_BASE + tristate select VFIO_VIRQFD + +config VFIO_PLATFORM + tristate "Generic VFIO support for any platform device" + select VFIO_PLATFORM_BASE help Support for platform devices with VFIO. This is required to make use of platform devices present on the system using the VFIO @@ -10,10 +16,10 @@ config VFIO_PLATFORM If you don't know what to do here, say N. -if VFIO_PLATFORM config VFIO_AMBA tristate "VFIO support for AMBA devices" depends on ARM_AMBA || COMPILE_TEST + select VFIO_PLATFORM_BASE help Support for ARM AMBA devices with VFIO. This is required to make use of ARM AMBA devices present on the system using the VFIO @@ -21,5 +27,9 @@ config VFIO_AMBA If you don't know what to do here, say N. +menu "VFIO platform reset drivers" + depends on VFIO_PLATFORM_BASE + source "drivers/vfio/platform/reset/Kconfig" -endif +endmenu +endmenu diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile index 3f3a24e7c4ef..ee4fb6a82ca8 100644 --- a/drivers/vfio/platform/Makefile +++ b/drivers/vfio/platform/Makefile @@ -1,13 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 vfio-platform-base-y := vfio_platform_common.o vfio_platform_irq.o -vfio-platform-y := vfio_platform.o +obj-$(CONFIG_VFIO_PLATFORM_BASE) += vfio-platform-base.o +obj-$(CONFIG_VFIO_PLATFORM_BASE) += reset/ +vfio-platform-y := vfio_platform.o obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o -obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform-base.o -obj-$(CONFIG_VFIO_PLATFORM) += reset/ vfio-amba-y := vfio_amba.o - obj-$(CONFIG_VFIO_AMBA) += vfio-amba.o -obj-$(CONFIG_VFIO_AMBA) += vfio-platform-base.o -obj-$(CONFIG_VFIO_AMBA) += reset/ diff --git a/drivers/vfio/platform/reset/Kconfig b/drivers/vfio/platform/reset/Kconfig index 12f5f3d80387..dcc08dc145a5 100644 --- a/drivers/vfio/platform/reset/Kconfig +++ b/drivers/vfio/platform/reset/Kconfig @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +if VFIO_PLATFORM config VFIO_PLATFORM_CALXEDAXGMAC_RESET tristate "VFIO support for calxeda xgmac reset" help @@ -21,3 +22,4 @@ config VFIO_PLATFORM_BCMFLEXRM_RESET Enables the VFIO platform driver to handle reset for Broadcom FlexRM If you don't know what to do here, say N. +endif From 1e44c58cc485ce265696422c5a9282677ec45473 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 14 Jun 2023 13:39:48 -0600 Subject: [PATCH 17/19] vfio/fsl: Create Kconfig sub-menu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For consistency with pci and platform, push the vfio-fsl-mc option into a sub-menu. Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20230614193948.477036-4-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/fsl-mc/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/fsl-mc/Kconfig b/drivers/vfio/fsl-mc/Kconfig index 597d338c5c8a..7d1d690348f0 100644 --- a/drivers/vfio/fsl-mc/Kconfig +++ b/drivers/vfio/fsl-mc/Kconfig @@ -1,6 +1,8 @@ +menu "VFIO support for FSL_MC bus devices" + depends on FSL_MC_BUS + config VFIO_FSL_MC tristate "VFIO support for QorIQ DPAA2 fsl-mc bus devices" - depends on FSL_MC_BUS select EVENTFD help Driver to enable support for the VFIO QorIQ DPAA2 fsl-mc @@ -8,3 +10,5 @@ config VFIO_FSL_MC fsl-mc bus devices using the VFIO framework. If you don't know what to do here, say N. + +endmenu From 234489ac561300ceed33e64c3bf3a810b9e2051d Mon Sep 17 00:00:00 2001 From: Nipun Gupta Date: Wed, 31 May 2023 18:15:57 +0530 Subject: [PATCH 18/19] vfio/cdx: add support for CDX bus vfio-cdx driver enables IOCTLs for user space to query MMIO regions for CDX devices and mmap them. This change also adds support for reset of CDX devices. With VFIO enabled on CDX devices, user-space applications can also exercise DMA securely via IOMMU on these devices. This change adds the VFIO CDX driver and enables the following ioctls for CDX devices: - VFIO_DEVICE_GET_INFO: - VFIO_DEVICE_GET_REGION_INFO - VFIO_DEVICE_RESET Signed-off-by: Nipun Gupta Reviewed-by: Pieter Jansen van Vuuren Tested-by: Nikhil Agarwal Link: https://lore.kernel.org/r/20230531124557.11009-1-nipun.gupta@amd.com Signed-off-by: Alex Williamson --- MAINTAINERS | 7 + drivers/vfio/Kconfig | 1 + drivers/vfio/Makefile | 1 + drivers/vfio/cdx/Kconfig | 17 +++ drivers/vfio/cdx/Makefile | 8 + drivers/vfio/cdx/main.c | 234 ++++++++++++++++++++++++++++++ drivers/vfio/cdx/private.h | 28 ++++ include/linux/cdx/cdx_bus.h | 1 - include/linux/mod_devicetable.h | 6 + include/uapi/linux/vfio.h | 1 + scripts/mod/devicetable-offsets.c | 1 + scripts/mod/file2alias.c | 17 ++- 12 files changed, 320 insertions(+), 2 deletions(-) create mode 100644 drivers/vfio/cdx/Kconfig create mode 100644 drivers/vfio/cdx/Makefile create mode 100644 drivers/vfio/cdx/main.c create mode 100644 drivers/vfio/cdx/private.h diff --git a/MAINTAINERS b/MAINTAINERS index 27ef11624748..ce6ac552d8f6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22085,6 +22085,13 @@ F: Documentation/filesystems/vfat.rst F: fs/fat/ F: tools/testing/selftests/filesystems/fat/ +VFIO CDX DRIVER +M: Nipun Gupta +M: Nikhil Agarwal +L: kvm@vger.kernel.org +S: Maintained +F: drivers/vfio/cdx/* + VFIO DRIVER M: Alex Williamson L: kvm@vger.kernel.org diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 89e06c981e43..aba36f5be4ec 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -57,6 +57,7 @@ source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" source "drivers/vfio/mdev/Kconfig" source "drivers/vfio/fsl-mc/Kconfig" +source "drivers/vfio/cdx/Kconfig" endif source "virt/lib/Kconfig" diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 8da44aa1ea16..66f418aef5a9 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -14,3 +14,4 @@ obj-$(CONFIG_VFIO_PCI_CORE) += pci/ obj-$(CONFIG_VFIO_PLATFORM_BASE) += platform/ obj-$(CONFIG_VFIO_MDEV) += mdev/ obj-$(CONFIG_VFIO_FSL_MC) += fsl-mc/ +obj-$(CONFIG_VFIO_CDX) += cdx/ diff --git a/drivers/vfio/cdx/Kconfig b/drivers/vfio/cdx/Kconfig new file mode 100644 index 000000000000..e6de0a0caa32 --- /dev/null +++ b/drivers/vfio/cdx/Kconfig @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# VFIO CDX configuration +# +# Copyright (C) 2022-2023, Advanced Micro Devices, Inc. +# + +config VFIO_CDX + tristate "VFIO support for CDX bus devices" + depends on CDX_BUS + select EVENTFD + help + Driver to enable VFIO support for the devices on CDX bus. + This is required to make use of CDX devices present in + the system using the VFIO framework. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/cdx/Makefile b/drivers/vfio/cdx/Makefile new file mode 100644 index 000000000000..cd4a2e6fe609 --- /dev/null +++ b/drivers/vfio/cdx/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2022-2023, Advanced Micro Devices, Inc. +# + +obj-$(CONFIG_VFIO_CDX) += vfio-cdx.o + +vfio-cdx-objs := main.o diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c new file mode 100644 index 000000000000..c376a69d2db2 --- /dev/null +++ b/drivers/vfio/cdx/main.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#include +#include + +#include "private.h" + +static int vfio_cdx_open_device(struct vfio_device *core_vdev) +{ + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); + struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev); + int count = cdx_dev->res_count; + int i; + + vdev->regions = kcalloc(count, sizeof(struct vfio_cdx_region), + GFP_KERNEL_ACCOUNT); + if (!vdev->regions) + return -ENOMEM; + + for (i = 0; i < count; i++) { + struct resource *res = &cdx_dev->res[i]; + + vdev->regions[i].addr = res->start; + vdev->regions[i].size = resource_size(res); + vdev->regions[i].type = res->flags; + /* + * Only regions addressed with PAGE granularity may be + * MMAP'ed securely. + */ + if (!(vdev->regions[i].addr & ~PAGE_MASK) && + !(vdev->regions[i].size & ~PAGE_MASK)) + vdev->regions[i].flags |= + VFIO_REGION_INFO_FLAG_MMAP; + vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_READ; + if (!(cdx_dev->res[i].flags & IORESOURCE_READONLY)) + vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_WRITE; + } + + return 0; +} + +static void vfio_cdx_close_device(struct vfio_device *core_vdev) +{ + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); + + kfree(vdev->regions); + cdx_dev_reset(core_vdev->dev); +} + +static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, + struct vfio_device_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs); + struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); + struct vfio_device_info info; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + info.flags = VFIO_DEVICE_FLAGS_CDX; + info.flags |= VFIO_DEVICE_FLAGS_RESET; + + info.num_regions = cdx_dev->res_count; + info.num_irqs = 0; + + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; +} + +static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev, + struct vfio_region_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); + struct vfio_region_info info; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (info.index >= cdx_dev->res_count) + return -EINVAL; + + /* map offset to the physical address */ + info.offset = vfio_cdx_index_to_offset(info.index); + info.size = vdev->regions[info.index].size; + info.flags = vdev->regions[info.index].flags; + + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; +} + +static long vfio_cdx_ioctl(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg) +{ + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); + void __user *uarg = (void __user *)arg; + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + return vfio_cdx_ioctl_get_info(vdev, uarg); + case VFIO_DEVICE_GET_REGION_INFO: + return vfio_cdx_ioctl_get_region_info(vdev, uarg); + case VFIO_DEVICE_RESET: + return cdx_dev_reset(core_vdev->dev); + default: + return -ENOTTY; + } +} + +static int vfio_cdx_mmap_mmio(struct vfio_cdx_region region, + struct vm_area_struct *vma) +{ + u64 size = vma->vm_end - vma->vm_start; + u64 pgoff, base; + + pgoff = vma->vm_pgoff & + ((1U << (VFIO_CDX_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + base = pgoff << PAGE_SHIFT; + + if (base + size > region.size) + return -EINVAL; + + vma->vm_pgoff = (region.addr >> PAGE_SHIFT) + pgoff; + vma->vm_page_prot = pgprot_device(vma->vm_page_prot); + + return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + size, vma->vm_page_prot); +} + +static int vfio_cdx_mmap(struct vfio_device *core_vdev, + struct vm_area_struct *vma) +{ + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); + struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev); + unsigned int index; + + index = vma->vm_pgoff >> (VFIO_CDX_OFFSET_SHIFT - PAGE_SHIFT); + + if (index >= cdx_dev->res_count) + return -EINVAL; + + if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_MMAP)) + return -EINVAL; + + if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_READ) && + (vma->vm_flags & VM_READ)) + return -EPERM; + + if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_WRITE) && + (vma->vm_flags & VM_WRITE)) + return -EPERM; + + return vfio_cdx_mmap_mmio(vdev->regions[index], vma); +} + +static const struct vfio_device_ops vfio_cdx_ops = { + .name = "vfio-cdx", + .open_device = vfio_cdx_open_device, + .close_device = vfio_cdx_close_device, + .ioctl = vfio_cdx_ioctl, + .mmap = vfio_cdx_mmap, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, +}; + +static int vfio_cdx_probe(struct cdx_device *cdx_dev) +{ + struct vfio_cdx_device *vdev; + struct device *dev = &cdx_dev->dev; + int ret; + + vdev = vfio_alloc_device(vfio_cdx_device, vdev, dev, + &vfio_cdx_ops); + if (IS_ERR(vdev)) + return PTR_ERR(vdev); + + ret = vfio_register_group_dev(&vdev->vdev); + if (ret) + goto out_uninit; + + dev_set_drvdata(dev, vdev); + return 0; + +out_uninit: + vfio_put_device(&vdev->vdev); + return ret; +} + +static int vfio_cdx_remove(struct cdx_device *cdx_dev) +{ + struct device *dev = &cdx_dev->dev; + struct vfio_cdx_device *vdev = dev_get_drvdata(dev); + + vfio_unregister_group_dev(&vdev->vdev); + vfio_put_device(&vdev->vdev); + + return 0; +} + +static const struct cdx_device_id vfio_cdx_table[] = { + { CDX_DEVICE_DRIVER_OVERRIDE(CDX_ANY_ID, CDX_ANY_ID, + CDX_ID_F_VFIO_DRIVER_OVERRIDE) }, /* match all by default */ + {} +}; + +MODULE_DEVICE_TABLE(cdx, vfio_cdx_table); + +static struct cdx_driver vfio_cdx_driver = { + .probe = vfio_cdx_probe, + .remove = vfio_cdx_remove, + .match_id_table = vfio_cdx_table, + .driver = { + .name = "vfio-cdx", + .owner = THIS_MODULE, + }, + .driver_managed_dma = true, +}; + +module_driver(vfio_cdx_driver, cdx_driver_register, cdx_driver_unregister); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("VFIO for CDX devices - User Level meta-driver"); diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h new file mode 100644 index 000000000000..8bdc117ea88e --- /dev/null +++ b/drivers/vfio/cdx/private.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#ifndef VFIO_CDX_PRIVATE_H +#define VFIO_CDX_PRIVATE_H + +#define VFIO_CDX_OFFSET_SHIFT 40 + +static inline u64 vfio_cdx_index_to_offset(u32 index) +{ + return ((u64)(index) << VFIO_CDX_OFFSET_SHIFT); +} + +struct vfio_cdx_region { + u32 flags; + u32 type; + u64 addr; + resource_size_t size; +}; + +struct vfio_cdx_device { + struct vfio_device vdev; + struct vfio_cdx_region *regions; +}; + +#endif /* VFIO_CDX_PRIVATE_H */ diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h index 35ef41d8a61a..bead71b7bc73 100644 --- a/include/linux/cdx/cdx_bus.h +++ b/include/linux/cdx/cdx_bus.h @@ -14,7 +14,6 @@ #include #define MAX_CDX_DEV_RESOURCES 4 -#define CDX_ANY_ID (0xFFFF) #define CDX_CONTROLLER_ID_SHIFT 4 #define CDX_BUS_NUM_MASK 0xF diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index ccaaeda792c0..ccf017353bb6 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -912,6 +912,12 @@ struct ishtp_device_id { kernel_ulong_t driver_data; }; +#define CDX_ANY_ID (0xFFFF) + +enum { + CDX_ID_F_VFIO_DRIVER_OVERRIDE = 1, +}; + /** * struct cdx_device_id - CDX device identifier * @vendor: Vendor ID diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 4f48bad09a37..9ab864c6f1ff 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -213,6 +213,7 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */ #define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */ #define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ +#define VFIO_DEVICE_FLAGS_CDX (1 << 8) /* vfio-cdx device */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ __u32 cap_offset; /* Offset within info struct of first cap */ diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c index 62dc988df84d..abe65f8968dd 100644 --- a/scripts/mod/devicetable-offsets.c +++ b/scripts/mod/devicetable-offsets.c @@ -265,6 +265,7 @@ int main(void) DEVID(cdx_device_id); DEVID_FIELD(cdx_device_id, vendor); DEVID_FIELD(cdx_device_id, device); + DEVID_FIELD(cdx_device_id, override_only); return 0; } diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 28da34ba4359..38120f932b0d 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1458,8 +1458,23 @@ static int do_cdx_entry(const char *filename, void *symval, { DEF_FIELD(symval, cdx_device_id, vendor); DEF_FIELD(symval, cdx_device_id, device); + DEF_FIELD(symval, cdx_device_id, override_only); - sprintf(alias, "cdx:v%08Xd%08Xd", vendor, device); + switch (override_only) { + case 0: + strcpy(alias, "cdx:"); + break; + case CDX_ID_F_VFIO_DRIVER_OVERRIDE: + strcpy(alias, "vfio_cdx:"); + break; + default: + warn("Unknown CDX driver_override alias %08X\n", + override_only); + return 0; + } + + ADD(alias, "v", vendor != CDX_ANY_ID, vendor); + ADD(alias, "d", device != CDX_ANY_ID, device); return 1; } From ff598081e5b9d0bdd6874bfe340811bbb75b35e4 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Mon, 26 Jun 2023 15:36:42 +0200 Subject: [PATCH 19/19] vfio/mdev: Move the compat_class initialization to module init The pointer to mdev_bus_compat_class is statically defined at the top of mdev_core, and was originally (commit 7b96953bc640 ("vfio: Mediated device Core driver") serialized by the parent_list_lock. The blamed commit removed this mutex, leaving the pointer initialization unserialized. As a result, the creation of multiple MDEVs in parallel (such as during boot) can encounter errors during the creation of the sysfs entries, such as: [ 8.337509] sysfs: cannot create duplicate filename '/class/mdev_bus' [ 8.337514] vfio_ccw 0.0.01d8: MDEV: Registered [ 8.337516] CPU: 13 PID: 946 Comm: driverctl Not tainted 6.4.0-rc7 #20 [ 8.337522] Hardware name: IBM 3906 M05 780 (LPAR) [ 8.337525] Call Trace: [ 8.337528] [<0000000162b0145a>] dump_stack_lvl+0x62/0x80 [ 8.337540] [<00000001622aeb30>] sysfs_warn_dup+0x78/0x88 [ 8.337549] [<00000001622aeca6>] sysfs_create_dir_ns+0xe6/0xf8 [ 8.337552] [<0000000162b04504>] kobject_add_internal+0xf4/0x340 [ 8.337557] [<0000000162b04d48>] kobject_add+0x78/0xd0 [ 8.337561] [<0000000162b04e0a>] kobject_create_and_add+0x6a/0xb8 [ 8.337565] [<00000001627a110e>] class_compat_register+0x5e/0x90 [ 8.337572] [<000003ff7fd815da>] mdev_register_parent+0x102/0x130 [mdev] [ 8.337581] [<000003ff7fdc7f2c>] vfio_ccw_sch_probe+0xe4/0x178 [vfio_ccw] [ 8.337588] [<0000000162a7833c>] css_probe+0x44/0x80 [ 8.337599] [<000000016279f4da>] really_probe+0xd2/0x460 [ 8.337603] [<000000016279fa08>] driver_probe_device+0x40/0xf0 [ 8.337606] [<000000016279fb78>] __device_attach_driver+0xc0/0x140 [ 8.337610] [<000000016279cbe0>] bus_for_each_drv+0x90/0xd8 [ 8.337618] [<00000001627a00b0>] __device_attach+0x110/0x190 [ 8.337621] [<000000016279c7c8>] bus_rescan_devices_helper+0x60/0xb0 [ 8.337626] [<000000016279cd48>] drivers_probe_store+0x48/0x80 [ 8.337632] [<00000001622ac9b0>] kernfs_fop_write_iter+0x138/0x1f0 [ 8.337635] [<00000001621e5e14>] vfs_write+0x1ac/0x2f8 [ 8.337645] [<00000001621e61d8>] ksys_write+0x70/0x100 [ 8.337650] [<0000000162b2bdc4>] __do_syscall+0x1d4/0x200 [ 8.337656] [<0000000162b3c828>] system_call+0x70/0x98 [ 8.337664] kobject: kobject_add_internal failed for mdev_bus with -EEXIST, don't try to register things with the same name in the same directory. [ 8.337668] kobject: kobject_create_and_add: kobject_add error: -17 [ 8.337674] vfio_ccw: probe of 0.0.01d9 failed with error -12 [ 8.342941] vfio_ccw_mdev aeb9ca91-10c6-42bc-a168-320023570aea: Adding to iommu group 2 Move the initialization of the mdev_bus_compat_class pointer to the init path, to match the cleanup in module exit. This way the code in mdev_register_parent() can simply link the new parent to it, rather than determining whether initialization is required first. Fixes: 89345d5177aa ("vfio/mdev: embedd struct mdev_parent in the parent data structure") Reported-by: Alexander Egorenkov Signed-off-by: Eric Farman Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Reviewed-by: Tony Krowiak Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230626133642.2939168-1-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/vfio/mdev/mdev_core.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 58f91b3bd670..ed4737de4528 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -72,12 +72,6 @@ int mdev_register_parent(struct mdev_parent *parent, struct device *dev, parent->nr_types = nr_types; atomic_set(&parent->available_instances, mdev_driver->max_instances); - if (!mdev_bus_compat_class) { - mdev_bus_compat_class = class_compat_register("mdev_bus"); - if (!mdev_bus_compat_class) - return -ENOMEM; - } - ret = parent_create_sysfs_files(parent); if (ret) return ret; @@ -251,13 +245,24 @@ int mdev_device_remove(struct mdev_device *mdev) static int __init mdev_init(void) { - return bus_register(&mdev_bus_type); + int ret; + + ret = bus_register(&mdev_bus_type); + if (ret) + return ret; + + mdev_bus_compat_class = class_compat_register("mdev_bus"); + if (!mdev_bus_compat_class) { + bus_unregister(&mdev_bus_type); + return -ENOMEM; + } + + return 0; } static void __exit mdev_exit(void) { - if (mdev_bus_compat_class) - class_compat_unregister(mdev_bus_compat_class); + class_compat_unregister(mdev_bus_compat_class); bus_unregister(&mdev_bus_type); }