From 6aca480b76eacb5ae938131c37d06523962904c9 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Tue, 1 Aug 2023 22:54:13 +0800 Subject: [PATCH 1/9] xen/evtchn: Remove unused function declaration xen_set_affinity_evtchn() Commit 67473b8194bc ("xen/events: Remove disfunct affinity spreading") leave this unused declaration. Signed-off-by: Yue Haibing Reviewed-by: Juergen Gross Reviewed-by: Rahul Singh Link: https://lore.kernel.org/r/20230801145413.40684-1-yuehaibing@huawei.com Signed-off-by: Juergen Gross --- include/xen/events.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/xen/events.h b/include/xen/events.h index 95970a2f7695..95d5e28de324 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -75,7 +75,6 @@ void evtchn_put(evtchn_port_t evtchn); void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector); void rebind_evtchn_irq(evtchn_port_t evtchn, int irq); -int xen_set_affinity_evtchn(struct irq_desc *desc, unsigned int tcpu); static inline void notify_remote_via_evtchn(evtchn_port_t port) { From d826c9e61c99120f8996f8fed6417167e32eb922 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Wed, 2 Aug 2023 18:31:51 +0200 Subject: [PATCH 2/9] xen: remove a confusing comment on auto-translated guest I/O After removing the conditional return from xen_create_contiguous_region(), the accompanying comment was left in place, but it now precedes an unrelated conditional and confuses readers. Fixes: 989513a735f5 ("xen: cleanup pvh leftovers from pv-only sources") Signed-off-by: Petr Tesarik Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20230802163151.1486-1-petrtesarik@huaweicloud.com Signed-off-by: Juergen Gross --- arch/x86/xen/mmu_pv.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index e0a975165de7..804a5441324c 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2310,12 +2310,6 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, int success; unsigned long vstart = (unsigned long)phys_to_virt(pstart); - /* - * Currently an auto-translated guest will not perform I/O, nor will - * it require PAE page directories below 4GB. Therefore any calls to - * this function are redundant and can be ignored. - */ - if (unlikely(order > MAX_CONTIG_ORDER)) return -ENOMEM; From 067e4f174f6d4ecf8872ad93d859d57bab44f7b6 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 10 Aug 2023 09:27:04 +0200 Subject: [PATCH 3/9] x86/xen: Make virt_to_pfn() a static inline Making virt_to_pfn() a static inline taking a strongly typed (const void *) makes the contract of a passing a pointer of that type to the function explicit and exposes any misuse of the macro virt_to_pfn() acting polymorphic and accepting many types such as (void *), (unitptr_t) or (unsigned long) as arguments without warnings. Also fix all offending call sites to pass a (void *) rather than an unsigned long. Since virt_to_mfn() is wrapping virt_to_pfn() this function has become polymorphic as well so the usage need to be fixed up. Signed-off-by: Linus Walleij Acked-by: Juergen Gross Link: https://lore.kernel.org/r/20230810-virt-to-phys-x86-xen-v1-1-9e966d333e7a@linaro.org Signed-off-by: Juergen Gross --- arch/x86/include/asm/xen/page.h | 5 ++++- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/mmu_pv.c | 12 ++++++------ arch/x86/xen/setup.c | 4 ++-- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index fa9ec20783fa..85e63d58c074 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -295,7 +295,10 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn) /* VIRT <-> MACHINE conversion */ #define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) -#define virt_to_pfn(v) (PFN_DOWN(__pa(v))) +static inline unsigned long virt_to_pfn(const void *v) +{ + return PFN_DOWN(__pa(v)); +} #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 93b658248d01..7178976c5dcf 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -523,7 +523,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) BUG_ON(size > PAGE_SIZE); BUG_ON(va & ~PAGE_MASK); - pfn = virt_to_pfn(va); + pfn = virt_to_pfn((void *)va); mfn = pfn_to_mfn(pfn); pte = pfn_pte(pfn, PAGE_KERNEL_RO); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 804a5441324c..eb3bac0b22a1 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2202,13 +2202,13 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, mcs = __xen_mc_entry(0); if (in_frames) - in_frames[i] = virt_to_mfn(vaddr); + in_frames[i] = virt_to_mfn((void *)vaddr); MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); - __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); + __set_phys_to_machine(virt_to_pfn((void *)vaddr), INVALID_P2M_ENTRY); if (out_frames) - out_frames[i] = virt_to_pfn(vaddr); + out_frames[i] = virt_to_pfn((void *)vaddr); } xen_mc_issue(0); } @@ -2250,7 +2250,7 @@ static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, MULTI_update_va_mapping(mcs.mc, vaddr, mfn_pte(mfn, PAGE_KERNEL), flags); - set_phys_to_machine(virt_to_pfn(vaddr), mfn); + set_phys_to_machine(virt_to_pfn((void *)vaddr), mfn); } xen_mc_issue(0); @@ -2321,7 +2321,7 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, xen_zap_pfn_range(vstart, order, in_frames, NULL); /* 2. Get a new contiguous memory extent. */ - out_frame = virt_to_pfn(vstart); + out_frame = virt_to_pfn((void *)vstart); success = xen_exchange_memory(1UL << order, 0, in_frames, 1, order, &out_frame, address_bits); @@ -2354,7 +2354,7 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) spin_lock_irqsave(&xen_reservation_lock, flags); /* 1. Find start MFN of contiguous extent. */ - in_frame = virt_to_mfn(vstart); + in_frame = virt_to_mfn((void *)vstart); /* 2. Zap current PTEs. */ xen_zap_pfn_range(vstart, order, NULL, out_frames); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 8b5cf7bb1f47..50c998b844fb 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -340,7 +340,7 @@ static void __init xen_do_set_identity_and_remap_chunk( WARN_ON(size == 0); - mfn_save = virt_to_mfn(buf); + mfn_save = virt_to_mfn((void *)buf); for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn; ident_pfn_iter < ident_end_pfn; @@ -503,7 +503,7 @@ void __init xen_remap_memory(void) unsigned long pfn_s = ~0UL; unsigned long len = 0; - mfn_save = virt_to_mfn(buf); + mfn_save = virt_to_mfn((void *)buf); while (xen_remap_mfn != INVALID_P2M_ENTRY) { /* Map the remap information */ From 3e0d473dcb688905dc443f3cfe986fa44b5fe8c4 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Tue, 8 Aug 2023 23:09:12 +0800 Subject: [PATCH 4/9] xen-pciback: Remove unused function declarations Commit a92336a1176b ("xen/pciback: Drop two backends, squash and cleanup some code.") declared but never implemented these functions. Signed-off-by: Yue Haibing Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20230808150912.43416-1-yuehaibing@huawei.com Signed-off-by: Juergen Gross --- drivers/xen/xen-pciback/conf_space_quirks.h | 2 -- drivers/xen/xen-pciback/pciback.h | 3 --- 2 files changed, 5 deletions(-) diff --git a/drivers/xen/xen-pciback/conf_space_quirks.h b/drivers/xen/xen-pciback/conf_space_quirks.h index d873abe35bf6..fc1557dfef49 100644 --- a/drivers/xen/xen-pciback/conf_space_quirks.h +++ b/drivers/xen/xen-pciback/conf_space_quirks.h @@ -21,8 +21,6 @@ struct xen_pcibk_config_quirk { int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field *field); -int xen_pcibk_config_quirks_remove_field(struct pci_dev *dev, int reg); - int xen_pcibk_config_quirks_init(struct pci_dev *dev); void xen_pcibk_config_field_free(struct config_field *field); diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h index 9a64196e831d..f9599ed2f2e2 100644 --- a/drivers/xen/xen-pciback/pciback.h +++ b/drivers/xen/xen-pciback/pciback.h @@ -201,6 +201,3 @@ static inline void xen_pcibk_lateeoi(struct xen_pcibk_device *pdev, int xen_pcibk_xenbus_register(void); void xen_pcibk_xenbus_unregister(void); #endif - -/* Handles shared IRQs that can to device domain and control domain. */ -void xen_pcibk_irq_handler(struct pci_dev *dev, int reset); From 71281ec9c82608c42d6841a75ced97aecd4274be Mon Sep 17 00:00:00 2001 From: Ruan Jinjie Date: Tue, 15 Aug 2023 17:24:34 +0800 Subject: [PATCH 5/9] xen: Switch to use kmemdup() helper Use kmemdup() helper instead of open-coding to simplify the code. Signed-off-by: Ruan Jinjie Reviewed-by: Juergen Gross Reviewed-by: Chen Jiahao Link: https://lore.kernel.org/r/20230815092434.1206386-1-ruanjinjie@huawei.com Signed-off-by: Juergen Gross --- drivers/xen/xen-acpi-processor.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 9cb61db67efd..296703939846 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -473,11 +473,8 @@ static int xen_upload_processor_pm_data(void) if (!_pr) continue; - if (!pr_backup) { - pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); - if (pr_backup) - memcpy(pr_backup, _pr, sizeof(struct acpi_processor)); - } + if (!pr_backup) + pr_backup = kmemdup(_pr, sizeof(*_pr), GFP_KERNEL); (void)upload_pm_data(_pr); } From 035a69586f329e3e73750299695401e0c2b76ee2 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 17 Aug 2023 09:47:36 +0800 Subject: [PATCH 6/9] xen: xenbus: Use helper function IS_ERR_OR_NULL() Use IS_ERR_OR_NULL() to detect an error pointer or a null pointer open-coding to simplify the code. Signed-off-by: Li Zetao Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20230817014736.3094289-1-lizetao1@huawei.com Signed-off-by: Juergen Gross --- drivers/xen/xenbus/xenbus_probe_frontend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index 3f3836cb7279..fcb335bb7b18 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -429,7 +429,7 @@ static void xenbus_check_frontend(char *class, char *dev) printk(KERN_DEBUG "XENBUS: frontend %s %s\n", frontend, xenbus_strstate(fe_state)); backend = xenbus_read(XBT_NIL, frontend, "backend", NULL); - if (!backend || IS_ERR(backend)) + if (IS_ERR_OR_NULL(backend)) goto out; err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &be_state); if (err == 1) From 187b4c0d34e34cc4a275fd98393a3bc21a460a66 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 31 Jul 2023 11:00:37 +0800 Subject: [PATCH 7/9] xen: Fix one kernel-doc comment Use colon to separate parameter name from their specific meaning. silence the warning: drivers/xen/grant-table.c:1051: warning: Function parameter or member 'nr_pages' not described in 'gnttab_free_pages' Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=6030 Signed-off-by: Yang Li Acked-by: Juergen Gross Link: https://lore.kernel.org/r/20230731030037.123946-1-yang.lee@linux.alibaba.com Signed-off-by: Juergen Gross --- drivers/xen/grant-table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index f13c3b76ad1e..35659bf70746 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -1044,7 +1044,7 @@ EXPORT_SYMBOL_GPL(gnttab_pages_clear_private); /** * gnttab_free_pages - free pages allocated by gnttab_alloc_pages() - * @nr_pages; number of pages to free + * @nr_pages: number of pages to free * @pages: the pages */ void gnttab_free_pages(int nr_pages, struct page **pages) From 442466e04f5f1b4616d3f023ff19166b82e19989 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Wed, 7 Jun 2023 14:36:24 +0200 Subject: [PATCH 8/9] xen/xenbus: Avoid a lockdep warning when adding a watch The following lockdep warning appears during boot on a Xen dom0 system: [ 96.388794] ====================================================== [ 96.388797] WARNING: possible circular locking dependency detected [ 96.388799] 6.4.0-rc5-default+ #8 Tainted: G EL [ 96.388803] ------------------------------------------------------ [ 96.388804] xenconsoled/1330 is trying to acquire lock: [ 96.388808] ffffffff82acdd10 (xs_watch_rwsem){++++}-{3:3}, at: register_xenbus_watch+0x45/0x140 [ 96.388847] but task is already holding lock: [ 96.388849] ffff888100c92068 (&u->msgbuffer_mutex){+.+.}-{3:3}, at: xenbus_file_write+0x2c/0x600 [ 96.388862] which lock already depends on the new lock. [ 96.388864] the existing dependency chain (in reverse order) is: [ 96.388866] -> #2 (&u->msgbuffer_mutex){+.+.}-{3:3}: [ 96.388874] __mutex_lock+0x85/0xb30 [ 96.388885] xenbus_dev_queue_reply+0x48/0x2b0 [ 96.388890] xenbus_thread+0x1d7/0x950 [ 96.388897] kthread+0xe7/0x120 [ 96.388905] ret_from_fork+0x2c/0x50 [ 96.388914] -> #1 (xs_response_mutex){+.+.}-{3:3}: [ 96.388923] __mutex_lock+0x85/0xb30 [ 96.388930] xenbus_backend_ioctl+0x56/0x1c0 [ 96.388935] __x64_sys_ioctl+0x90/0xd0 [ 96.388942] do_syscall_64+0x5c/0x90 [ 96.388950] entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 96.388957] -> #0 (xs_watch_rwsem){++++}-{3:3}: [ 96.388965] __lock_acquire+0x1538/0x2260 [ 96.388972] lock_acquire+0xc6/0x2b0 [ 96.388976] down_read+0x2d/0x160 [ 96.388983] register_xenbus_watch+0x45/0x140 [ 96.388990] xenbus_file_write+0x53d/0x600 [ 96.388994] vfs_write+0xe4/0x490 [ 96.389003] ksys_write+0xb8/0xf0 [ 96.389011] do_syscall_64+0x5c/0x90 [ 96.389017] entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 96.389023] other info that might help us debug this: [ 96.389025] Chain exists of: xs_watch_rwsem --> xs_response_mutex --> &u->msgbuffer_mutex [ 96.413429] Possible unsafe locking scenario: [ 96.413430] CPU0 CPU1 [ 96.413430] ---- ---- [ 96.413431] lock(&u->msgbuffer_mutex); [ 96.413432] lock(xs_response_mutex); [ 96.413433] lock(&u->msgbuffer_mutex); [ 96.413434] rlock(xs_watch_rwsem); [ 96.413436] *** DEADLOCK *** [ 96.413436] 1 lock held by xenconsoled/1330: [ 96.413438] #0: ffff888100c92068 (&u->msgbuffer_mutex){+.+.}-{3:3}, at: xenbus_file_write+0x2c/0x600 [ 96.413446] An ioctl call IOCTL_XENBUS_BACKEND_SETUP (record #1 in the report) results in calling xenbus_alloc() -> xs_suspend() which introduces ordering xs_watch_rwsem --> xs_response_mutex. The xenbus_thread() operation (record #2) creates xs_response_mutex --> &u->msgbuffer_mutex. An XS_WATCH write to the xenbus file then results in a complain about the opposite lock order &u->msgbuffer_mutex --> xs_watch_rwsem. The dependency xs_watch_rwsem --> xs_response_mutex is spurious. Avoid it and the warning by changing the ordering in xs_suspend(), first acquire xs_response_mutex and then xs_watch_rwsem. Reverse also the unlocking order in xs_suspend_cancel() for consistency, but keep xs_resume() as is because it needs to have xs_watch_rwsem unlocked only after exiting xs suspend and re-adding all watches. Signed-off-by: Petr Pavlu Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20230607123624.15739-1-petr.pavlu@suse.com Signed-off-by: Juergen Gross --- drivers/xen/xenbus/xenbus_xs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 12e02eb01f59..028a182bcc9e 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -840,8 +840,8 @@ void xs_suspend(void) { xs_suspend_enter(); - down_write(&xs_watch_rwsem); mutex_lock(&xs_response_mutex); + down_write(&xs_watch_rwsem); } void xs_resume(void) @@ -866,8 +866,8 @@ void xs_resume(void) void xs_suspend_cancel(void) { - mutex_unlock(&xs_response_mutex); up_write(&xs_watch_rwsem); + mutex_unlock(&xs_response_mutex); xs_suspend_exit(); } From f8941e6c4c712948663ec5d7bbb546f1a0f4e3f6 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 22 Aug 2023 15:15:07 +0530 Subject: [PATCH 9/9] xen: privcmd: Add support for irqfd Xen provides support for injecting interrupts to the guests via the HYPERVISOR_dm_op() hypercall. The same is used by the Virtio based device backend implementations, in an inefficient manner currently. Generally, the Virtio backends are implemented to work with the Eventfd based mechanism. In order to make such backends work with Xen, another software layer needs to poll the Eventfds and raise an interrupt to the guest using the Xen based mechanism. This results in an extra context switch. This is not a new problem in Linux though. It is present with other hypervisors like KVM, etc. as well. The generic solution implemented in the kernel for them is to provide an IOCTL call to pass the interrupt details and eventfd, which lets the kernel take care of polling the eventfd and raising of the interrupt, instead of handling this in user space (which involves an extra context switch). This patch adds support to inject a specific interrupt to guest using the eventfd mechanism, by preventing the extra context switch. Inspired by existing implementations for KVM, etc.. Signed-off-by: Viresh Kumar Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/8e724ac1f50c2bc1eb8da9b3ff6166f1372570aa.1692697321.git.viresh.kumar@linaro.org Signed-off-by: Juergen Gross --- drivers/xen/Kconfig | 7 + drivers/xen/privcmd.c | 282 ++++++++++++++++++++++++++++++++++++- include/uapi/xen/privcmd.h | 14 ++ 3 files changed, 301 insertions(+), 2 deletions(-) diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index d5d7c402b651..d43153fec18e 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -269,6 +269,13 @@ config XEN_PRIVCMD disaggregated Xen setups this driver might be needed for other domains, too. +config XEN_PRIVCMD_IRQFD + bool "Xen irqfd support" + depends on XEN_PRIVCMD && XEN_VIRTIO && EVENTFD + help + Using the irqfd mechanism a virtio backend running in a daemon can + speed up interrupt injection into a guest. + config XEN_ACPI_PROCESSOR tristate "Xen ACPI processor" depends on XEN && XEN_PV_DOM0 && X86 && ACPI_PROCESSOR && CPU_FREQ diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index f447cd37cc4c..f00ad5f5f1d4 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -9,11 +9,16 @@ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +#include +#include #include #include +#include +#include #include #include #include +#include #include #include #include @@ -833,6 +838,263 @@ out: return rc; } +#ifdef CONFIG_XEN_PRIVCMD_IRQFD +/* Irqfd support */ +static struct workqueue_struct *irqfd_cleanup_wq; +static DEFINE_MUTEX(irqfds_lock); +static LIST_HEAD(irqfds_list); + +struct privcmd_kernel_irqfd { + struct xen_dm_op_buf xbufs; + domid_t dom; + bool error; + struct eventfd_ctx *eventfd; + struct work_struct shutdown; + wait_queue_entry_t wait; + struct list_head list; + poll_table pt; +}; + +static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd) +{ + lockdep_assert_held(&irqfds_lock); + + list_del_init(&kirqfd->list); + queue_work(irqfd_cleanup_wq, &kirqfd->shutdown); +} + +static void irqfd_shutdown(struct work_struct *work) +{ + struct privcmd_kernel_irqfd *kirqfd = + container_of(work, struct privcmd_kernel_irqfd, shutdown); + u64 cnt; + + eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt); + eventfd_ctx_put(kirqfd->eventfd); + kfree(kirqfd); +} + +static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd) +{ + u64 cnt; + long rc; + + eventfd_ctx_do_read(kirqfd->eventfd, &cnt); + + xen_preemptible_hcall_begin(); + rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs); + xen_preemptible_hcall_end(); + + /* Don't repeat the error message for consecutive failures */ + if (rc && !kirqfd->error) { + pr_err("Failed to configure irq for guest domain: %d\n", + kirqfd->dom); + } + + kirqfd->error = rc; +} + +static int +irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) +{ + struct privcmd_kernel_irqfd *kirqfd = + container_of(wait, struct privcmd_kernel_irqfd, wait); + __poll_t flags = key_to_poll(key); + + if (flags & EPOLLIN) + irqfd_inject(kirqfd); + + if (flags & EPOLLHUP) { + mutex_lock(&irqfds_lock); + irqfd_deactivate(kirqfd); + mutex_unlock(&irqfds_lock); + } + + return 0; +} + +static void +irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) +{ + struct privcmd_kernel_irqfd *kirqfd = + container_of(pt, struct privcmd_kernel_irqfd, pt); + + add_wait_queue_priority(wqh, &kirqfd->wait); +} + +static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd) +{ + struct privcmd_kernel_irqfd *kirqfd, *tmp; + __poll_t events; + struct fd f; + void *dm_op; + int ret; + + kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL); + if (!kirqfd) + return -ENOMEM; + dm_op = kirqfd + 1; + + if (copy_from_user(dm_op, irqfd->dm_op, irqfd->size)) { + ret = -EFAULT; + goto error_kfree; + } + + kirqfd->xbufs.size = irqfd->size; + set_xen_guest_handle(kirqfd->xbufs.h, dm_op); + kirqfd->dom = irqfd->dom; + INIT_WORK(&kirqfd->shutdown, irqfd_shutdown); + + f = fdget(irqfd->fd); + if (!f.file) { + ret = -EBADF; + goto error_kfree; + } + + kirqfd->eventfd = eventfd_ctx_fileget(f.file); + if (IS_ERR(kirqfd->eventfd)) { + ret = PTR_ERR(kirqfd->eventfd); + goto error_fd_put; + } + + /* + * Install our own custom wake-up handling so we are notified via a + * callback whenever someone signals the underlying eventfd. + */ + init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup); + init_poll_funcptr(&kirqfd->pt, irqfd_poll_func); + + mutex_lock(&irqfds_lock); + + list_for_each_entry(tmp, &irqfds_list, list) { + if (kirqfd->eventfd == tmp->eventfd) { + ret = -EBUSY; + mutex_unlock(&irqfds_lock); + goto error_eventfd; + } + } + + list_add_tail(&kirqfd->list, &irqfds_list); + mutex_unlock(&irqfds_lock); + + /* + * Check if there was an event already pending on the eventfd before we + * registered, and trigger it as if we didn't miss it. + */ + events = vfs_poll(f.file, &kirqfd->pt); + if (events & EPOLLIN) + irqfd_inject(kirqfd); + + /* + * Do not drop the file until the kirqfd is fully initialized, otherwise + * we might race against the EPOLLHUP. + */ + fdput(f); + return 0; + +error_eventfd: + eventfd_ctx_put(kirqfd->eventfd); + +error_fd_put: + fdput(f); + +error_kfree: + kfree(kirqfd); + return ret; +} + +static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd) +{ + struct privcmd_kernel_irqfd *kirqfd; + struct eventfd_ctx *eventfd; + + eventfd = eventfd_ctx_fdget(irqfd->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + mutex_lock(&irqfds_lock); + + list_for_each_entry(kirqfd, &irqfds_list, list) { + if (kirqfd->eventfd == eventfd) { + irqfd_deactivate(kirqfd); + break; + } + } + + mutex_unlock(&irqfds_lock); + + eventfd_ctx_put(eventfd); + + /* + * Block until we know all outstanding shutdown jobs have completed so + * that we guarantee there will not be any more interrupts once this + * deassign function returns. + */ + flush_workqueue(irqfd_cleanup_wq); + + return 0; +} + +static long privcmd_ioctl_irqfd(struct file *file, void __user *udata) +{ + struct privcmd_data *data = file->private_data; + struct privcmd_irqfd irqfd; + + if (copy_from_user(&irqfd, udata, sizeof(irqfd))) + return -EFAULT; + + /* No other flags should be set */ + if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN) + return -EINVAL; + + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != irqfd.dom) + return -EPERM; + + if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN) + return privcmd_irqfd_deassign(&irqfd); + + return privcmd_irqfd_assign(&irqfd); +} + +static int privcmd_irqfd_init(void) +{ + irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0); + if (!irqfd_cleanup_wq) + return -ENOMEM; + + return 0; +} + +static void privcmd_irqfd_exit(void) +{ + struct privcmd_kernel_irqfd *kirqfd, *tmp; + + mutex_lock(&irqfds_lock); + + list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list) + irqfd_deactivate(kirqfd); + + mutex_unlock(&irqfds_lock); + + destroy_workqueue(irqfd_cleanup_wq); +} +#else +static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata) +{ + return -EOPNOTSUPP; +} + +static inline int privcmd_irqfd_init(void) +{ + return 0; +} + +static inline void privcmd_irqfd_exit(void) +{ +} +#endif /* CONFIG_XEN_PRIVCMD_IRQFD */ + static long privcmd_ioctl(struct file *file, unsigned int cmd, unsigned long data) { @@ -868,6 +1130,10 @@ static long privcmd_ioctl(struct file *file, ret = privcmd_ioctl_mmap_resource(file, udata); break; + case IOCTL_PRIVCMD_IRQFD: + ret = privcmd_ioctl_irqfd(file, udata); + break; + default: break; } @@ -992,15 +1258,27 @@ static int __init privcmd_init(void) err = misc_register(&xen_privcmdbuf_dev); if (err != 0) { pr_err("Could not register Xen hypercall-buf device\n"); - misc_deregister(&privcmd_dev); - return err; + goto err_privcmdbuf; + } + + err = privcmd_irqfd_init(); + if (err != 0) { + pr_err("irqfd init failed\n"); + goto err_irqfd; } return 0; + +err_irqfd: + misc_deregister(&xen_privcmdbuf_dev); +err_privcmdbuf: + misc_deregister(&privcmd_dev); + return err; } static void __exit privcmd_exit(void) { + privcmd_irqfd_exit(); misc_deregister(&privcmd_dev); misc_deregister(&xen_privcmdbuf_dev); } diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h index d2029556083e..375718ba4ab6 100644 --- a/include/uapi/xen/privcmd.h +++ b/include/uapi/xen/privcmd.h @@ -98,6 +98,18 @@ struct privcmd_mmap_resource { __u64 addr; }; +/* For privcmd_irqfd::flags */ +#define PRIVCMD_IRQFD_FLAG_DEASSIGN (1 << 0) + +struct privcmd_irqfd { + void __user *dm_op; + __u32 size; /* Size of structure pointed by dm_op */ + __u32 fd; + __u32 flags; + domid_t dom; + __u8 pad[2]; +}; + /* * @cmd: IOCTL_PRIVCMD_HYPERCALL * @arg: &privcmd_hypercall_t @@ -125,5 +137,7 @@ struct privcmd_mmap_resource { _IOC(_IOC_NONE, 'P', 6, sizeof(domid_t)) #define IOCTL_PRIVCMD_MMAP_RESOURCE \ _IOC(_IOC_NONE, 'P', 7, sizeof(struct privcmd_mmap_resource)) +#define IOCTL_PRIVCMD_IRQFD \ + _IOC(_IOC_NONE, 'P', 8, sizeof(struct privcmd_irqfd)) #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */