From 4d8df0f5f79f747d75a7d356d9b9ea40a4e4c8a9 Mon Sep 17 00:00:00 2001 From: Prathu Baronia Date: Mon, 22 May 2023 14:20:19 +0530 Subject: [PATCH 01/12] vhost: use kzalloc() instead of kmalloc() followed by memset() Use kzalloc() to allocate new zeroed out msg node instead of memsetting a node allocated with kmalloc(). Signed-off-by: Prathu Baronia Message-Id: <20230522085019.42914-1-prathubaronia2011@gmail.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella --- drivers/vhost/vhost.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 074273020849..ecb3b397bb38 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2563,12 +2563,11 @@ EXPORT_SYMBOL_GPL(vhost_disable_notify); /* Create a new message. */ struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type) { - struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL); + /* Make sure all padding within the structure is initialized. */ + struct vhost_msg_node *node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return NULL; - /* Make sure all padding within the structure is initialized. */ - memset(&node->msg, 0, sizeof node->msg); node->vq = vq; node->msg.type = type; return node; From a90e8608eb0ed93d31ac0feb055f77ce59512542 Mon Sep 17 00:00:00 2001 From: Sheng Zhao Date: Tue, 30 May 2023 11:36:26 +0800 Subject: [PATCH 02/12] vduse: avoid empty string for dev name Syzkaller hits a kernel WARN when the first character of the dev name provided is NULL. Solution is to add a NULL check before calling cdev_device_add() in vduse_create_dev(). kobject: (0000000072042169): attempted to be registered with empty name! 
WARNING: CPU: 0 PID: 112695 at lib/kobject.c:236 Call Trace: kobject_add_varg linux/src/lib/kobject.c:390 [inline] kobject_add+0xf6/0x150 linux/src/lib/kobject.c:442 device_add+0x28f/0xc20 linux/src/drivers/base/core.c:2167 cdev_device_add+0x83/0xc0 linux/src/fs/char_dev.c:546 vduse_create_dev linux/src/drivers/vdpa/vdpa_user/vduse_dev.c:2254 [inline] vduse_ioctl+0x7b5/0xf30 linux/src/drivers/vdpa/vdpa_user/vduse_dev.c:2316 vfs_ioctl linux/src/fs/ioctl.c:47 [inline] file_ioctl linux/src/fs/ioctl.c:510 [inline] do_vfs_ioctl+0x14b/0xa80 linux/src/fs/ioctl.c:697 ksys_ioctl+0x7c/0xa0 linux/src/fs/ioctl.c:714 __do_sys_ioctl linux/src/fs/ioctl.c:721 [inline] __se_sys_ioctl linux/src/fs/ioctl.c:719 [inline] __x64_sys_ioctl+0x42/0x50 linux/src/fs/ioctl.c:719 do_syscall_64+0x94/0x330 linux/src/arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: c8a6153b6c59 ("vduse: Introduce VDUSE - vDPA Device in Userspace") Cc: "Xie Yongji" Reported-by: Xianjun Zeng Signed-off-by: Sheng Zhao Message-Id: <20230530033626.1266794-1-sheng.zhao@bytedance.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Reviewed-by: Xie Yongji Cc: "Michael S. 
Tsirkin", "Jason Wang", Reviewed-by: Xie Yongji --- drivers/vdpa/vdpa_user/vduse_dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index de97e38c3b82..5f5c21674fdc 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1685,6 +1685,9 @@ static bool vduse_validate_config(struct vduse_dev_config *config) if (config->vq_num > 0xffff) return false; + if (!config->name[0]) + return false; + if (!device_is_allowed(config->device_id)) return false; From 57380fd1249b20ef772549af2c58ef57b21faba7 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Wed, 24 May 2023 20:31:24 +0800 Subject: [PATCH 03/12] tools/virtio: Fix arm64 ringtest compilation error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define cpu_relax() for arm64 instead of falling back to assert(0), and include the assert.h header file. Also, add smp_wmb() and smp_mb() for arm64, and define __always_inline when it is not already defined. The compilation errors being fixed are as follows: $ make cc -Wall -pthread -O2 -ggdb -flto -fwhole-program -c -o ring.o ring.c In file included from ring.c:10: main.h: In function ‘busy_wait’: main.h:99:21: warning: implicit declaration of function ‘assert’ [-Wimplicit-function-declaration] 99 | #define cpu_relax() assert(0) | ^~~~~~ main.h:107:17: note: in expansion of macro ‘cpu_relax’ 107 | cpu_relax(); | ^~~~~~~~~ main.h:12:1: note: ‘assert’ is defined in header ‘<assert.h>’; did you forget to ‘#include <assert.h>’? 11 | #include <stdbool.h> +++ |+#include <assert.h> 12 | main.h: At top level: main.h:143:23: error: expected ‘;’ before ‘void’ 143 | static __always_inline | ^ | ; 144 | void __read_once_size(const volatile void *p, void *res, int size) | ~~~~ main.h:158:23: error: expected ‘;’ before ‘void’ 158 | static __always_inline void __write_once_size(volatile void *p, void *res, int size) | ^~~~~ | ; make: *** [<builtin>: ring.o] Error 1 Signed-off-by: Rong Tao Message-Id: Signed-off-by: Michael S. 
Tsirkin --- tools/virtio/ringtest/main.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/virtio/ringtest/main.h b/tools/virtio/ringtest/main.h index b68920d52750..d18dd317e27f 100644 --- a/tools/virtio/ringtest/main.h +++ b/tools/virtio/ringtest/main.h @@ -8,6 +8,7 @@ #ifndef MAIN_H #define MAIN_H +#include <assert.h> #include <stdbool.h> extern int param; @@ -95,6 +96,8 @@ extern unsigned ring_size; #define cpu_relax() asm ("rep; nop" ::: "memory") #elif defined(__s390x__) #define cpu_relax() barrier() +#elif defined(__aarch64__) +#define cpu_relax() asm ("yield" ::: "memory") #else #define cpu_relax() assert(0) #endif @@ -112,6 +115,8 @@ static inline void busy_wait(void) #if defined(__x86_64__) || defined(__i386__) #define smp_mb() asm volatile("lock; addl $0,-132(%%rsp)" ::: "memory", "cc") +#elif defined(__aarch64__) +#define smp_mb() asm volatile("dmb ish" ::: "memory") #else /* * Not using __ATOMIC_SEQ_CST since gcc docs say they are only synchronized @@ -136,10 +141,16 @@ static inline void busy_wait(void) #if defined(__i386__) || defined(__x86_64__) || defined(__s390x__) #define smp_wmb() barrier() +#elif defined(__aarch64__) +#define smp_wmb() asm volatile("dmb ishst" ::: "memory") #else #define smp_wmb() smp_release() #endif +#ifndef __always_inline +#define __always_inline inline __attribute__((always_inline)) +#endif + static __always_inline void __read_once_size(const volatile void *p, void *res, int size) { From c66dbc39a7104c5a9f033c0450dfa6f697a71f94 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Thu, 25 May 2023 16:37:28 +0800 Subject: [PATCH 04/12] tools/virtio: Add .gitignore for ringtest Ignore executables for ringtest. Signed-off-by: Rong Tao Message-Id: Signed-off-by: Michael S. 
Tsirkin --- tools/virtio/ringtest/.gitignore | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tools/virtio/ringtest/.gitignore diff --git a/tools/virtio/ringtest/.gitignore b/tools/virtio/ringtest/.gitignore new file mode 100644 index 000000000000..100b9e30c0f4 --- /dev/null +++ b/tools/virtio/ringtest/.gitignore @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +/noring +/ptr_ring +/ring +/virtio_ring_0_9 +/virtio_ring_inorder +/virtio_ring_poll From 73790bdfba076c0886f0f14fd46ff2c70ee31ce9 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Tue, 16 May 2023 12:58:01 +0300 Subject: [PATCH 05/12] vdpa/mlx5: Fix hang when cvq commands are triggered during device unregister Currently the vdpa device is unregistered after the workqueue that processes vq commands is disabled. However, the device unregister process can still send commands to the cvq (a vlan delete for example) which leads to a hang because the handling workqueue has been disabled and the command never finishes: [ 2263.095764] rcu: INFO: rcu_sched self-detected stall on CPU [ 2263.096307] rcu: 9-....: (5250 ticks this GP) idle=dac4/1/0x4000000000000000 softirq=111009/111009 fqs=2544 [ 2263.097154] rcu: (t=5251 jiffies g=393549 q=347 ncpus=10) [ 2263.097648] CPU: 9 PID: 94300 Comm: kworker/u20:2 Not tainted 6.3.0-rc6_for_upstream_min_debug_2023_04_14_00_02 #1 [ 2263.098535] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 2263.099481] Workqueue: mlx5_events mlx5_vhca_state_work_handler [mlx5_core] [ 2263.100143] RIP: 0010:virtnet_send_command+0x109/0x170 [ 2263.100621] Code: 1d df f5 ff 85 c0 78 5c 48 8b 7b 08 e8 d0 c5 f5 ff 84 c0 75 11 eb 22 48 8b 7b 08 e8 01 b7 f5 ff 84 c0 75 15 f3 90 48 8b 7b 08 <48> 8d 74 24 04 e8 8d c5 f5 ff 48 85 c0 74 de 48 8b 83 f8 00 00 00 [ 2263.102148] RSP: 0018:ffff888139cf36e8 EFLAGS: 00000246 [ 2263.102624] RAX: 0000000000000000 RBX: ffff888166bea940 RCX: 0000000000000001 [ 2263.103244] 
RDX: 0000000000000000 RSI: ffff888139cf36ec RDI: ffff888146763800 [ 2263.103864] RBP: ffff888139cf3710 R08: ffff88810d201000 R09: 0000000000000000 [ 2263.104473] R10: 0000000000000002 R11: 0000000000000003 R12: 0000000000000002 [ 2263.105082] R13: 0000000000000002 R14: ffff888114528400 R15: ffff888166bea000 [ 2263.105689] FS: 0000000000000000(0000) GS:ffff88852cc80000(0000) knlGS:0000000000000000 [ 2263.106404] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2263.106925] CR2: 00007f31f394b000 CR3: 000000010615b006 CR4: 0000000000370ea0 [ 2263.107542] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 2263.108163] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 2263.108769] Call Trace: [ 2263.109059] [ 2263.109320] ? check_preempt_wakeup+0x11f/0x230 [ 2263.109750] virtnet_vlan_rx_kill_vid+0x5a/0xa0 [ 2263.110180] vlan_vid_del+0x9c/0x170 [ 2263.110546] vlan_device_event+0x351/0x760 [8021q] [ 2263.111004] raw_notifier_call_chain+0x41/0x60 [ 2263.111426] dev_close_many+0xcb/0x120 [ 2263.111808] unregister_netdevice_many_notify+0x130/0x770 [ 2263.112297] ? wq_worker_running+0xa/0x30 [ 2263.112688] unregister_netdevice_queue+0x89/0xc0 [ 2263.113128] unregister_netdev+0x18/0x20 [ 2263.113512] virtnet_remove+0x4f/0x230 [ 2263.113885] virtio_dev_remove+0x31/0x70 [ 2263.114273] device_release_driver_internal+0x18f/0x1f0 [ 2263.114746] bus_remove_device+0xc6/0x130 [ 2263.115146] device_del+0x173/0x3c0 [ 2263.115502] ? kernfs_find_ns+0x35/0xd0 [ 2263.115895] device_unregister+0x1a/0x60 [ 2263.116279] unregister_virtio_device+0x11/0x20 [ 2263.116706] device_release_driver_internal+0x18f/0x1f0 [ 2263.117182] bus_remove_device+0xc6/0x130 [ 2263.117576] device_del+0x173/0x3c0 [ 2263.117929] ? 
vdpa_dev_remove+0x20/0x20 [vdpa] [ 2263.118364] device_unregister+0x1a/0x60 [ 2263.118752] mlx5_vdpa_dev_del+0x4c/0x80 [mlx5_vdpa] [ 2263.119232] vdpa_match_remove+0x21/0x30 [vdpa] [ 2263.119663] bus_for_each_dev+0x71/0xc0 [ 2263.120054] vdpa_mgmtdev_unregister+0x57/0x70 [vdpa] [ 2263.120520] mlx5v_remove+0x12/0x20 [mlx5_vdpa] [ 2263.120953] auxiliary_bus_remove+0x18/0x30 [ 2263.121356] device_release_driver_internal+0x18f/0x1f0 [ 2263.121830] bus_remove_device+0xc6/0x130 [ 2263.122223] device_del+0x173/0x3c0 [ 2263.122581] ? devl_param_driverinit_value_get+0x29/0x90 [ 2263.123070] mlx5_rescan_drivers_locked+0xc4/0x2d0 [mlx5_core] [ 2263.123633] mlx5_unregister_device+0x54/0x80 [mlx5_core] [ 2263.124169] mlx5_uninit_one+0x54/0x150 [mlx5_core] [ 2263.124656] mlx5_sf_dev_remove+0x45/0x90 [mlx5_core] [ 2263.125153] auxiliary_bus_remove+0x18/0x30 [ 2263.125560] device_release_driver_internal+0x18f/0x1f0 [ 2263.126052] bus_remove_device+0xc6/0x130 [ 2263.126451] device_del+0x173/0x3c0 [ 2263.126815] mlx5_sf_dev_remove+0x39/0xf0 [mlx5_core] [ 2263.127318] mlx5_sf_dev_state_change_handler+0x178/0x270 [mlx5_core] [ 2263.127920] blocking_notifier_call_chain+0x5a/0x80 [ 2263.128379] mlx5_vhca_state_work_handler+0x151/0x200 [mlx5_core] [ 2263.128951] process_one_work+0x1bb/0x3c0 [ 2263.129355] ? process_one_work+0x3c0/0x3c0 [ 2263.129766] worker_thread+0x4d/0x3c0 [ 2263.130140] ? process_one_work+0x3c0/0x3c0 [ 2263.130548] kthread+0xb9/0xe0 [ 2263.130895] ? kthread_complete_and_exit+0x20/0x20 [ 2263.131349] ret_from_fork+0x1f/0x30 [ 2263.131717] The fix is to disable and destroy the workqueue after the device unregister. It is expected that vhost will not trigger kicks after the unregister. But even if it would, the wq is disabled already by setting the pointer to NULL (done so in the referenced commit). 
Fixes: ad6dc1daaf29 ("vdpa/mlx5: Avoid processing works if workqueue was destroyed") Signed-off-by: Dragos Tatulea Message-Id: <20230516095800.3549932-1-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Tariq Toukan Acked-by: Jason Wang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index e29e32b306ad..279ac6a558d2 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -3349,10 +3349,10 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device * mlx5_vdpa_remove_debugfs(ndev->debugfs); ndev->debugfs = NULL; unregister_link_notifier(ndev); + _vdpa_unregister_device(dev); wq = mvdev->wq; mvdev->wq = NULL; destroy_workqueue(wq); - _vdpa_unregister_device(dev); mgtdev->ndev = NULL; } From 376daf317753ccb6b1ecbdece66018f7f6313a7f Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 24 Apr 2023 15:50:29 -0700 Subject: [PATCH 06/12] vhost_vdpa: tell vqs about the negotiated As is done in the net, iscsi, and vsock vhost support, let the vdpa vqs know about the features that have been negotiated. This allows vhost to more safely make decisions based on the features, such as when using PACKED vs split queues. Signed-off-by: Shannon Nelson Acked-by: Jason Wang Message-Id: <20230424225031.18947-2-shannon.nelson@amd.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/vdpa.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 8c1aefc865f0..89d5ecfe7d2e 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -407,7 +407,10 @@ static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; + struct vhost_dev *d = &v->vdev; + u64 actual_features; u64 features; + int i; /* * It's not allowed to change the features after they have @@ -422,6 +425,16 @@ static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep) if (vdpa_set_features(vdpa, features)) return -EINVAL; + /* let the vqs know what has been configured */ + actual_features = ops->get_driver_features(vdpa); + for (i = 0; i < d->nvqs; ++i) { + struct vhost_virtqueue *vq = d->vqs[i]; + + mutex_lock(&vq->mutex); + vq->acked_features = actual_features; + mutex_unlock(&vq->mutex); + } + return 0; } From 1f5d2e3bab16369d5d4b4020a25db4ab1f4f082c Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Mon, 24 Apr 2023 23:44:11 +0300 Subject: [PATCH 07/12] vhost_net: revert upend_idx only on retriable error Fix a possible leak of virtqueue used buffers, and the resulting stall, in case of a temporary -EIO from sendmsg(), which is produced by the tun driver while the backend device is not up. In case of a non-retriable error with zcopy, do not revert upend_idx, so that the packet data is passed on (that is, used_idx is updated in the corresponding vhost_zerocopy_signal_used()) as if the packet data had been transferred successfully. v2: set vq->heads[ubuf->desc].len equal to VHOST_DMA_DONE_LEN in case of a fake successful transmit. Signed-off-by: Andrey Smetanin Message-Id: <20230424204411.24888-1-asmetanin@yandex-team.ru> Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Andrey Smetanin Acked-by: Jason Wang --- drivers/vhost/net.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 07181cd8d52e..ae2273196b0c 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -935,13 +935,18 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { + bool retry = err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS; + if (zcopy_used) { if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS) vhost_net_ubuf_put(ubufs); - nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) - % UIO_MAXIOV; + if (retry) + nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) + % UIO_MAXIOV; + else + vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; } - if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) { + if (retry) { vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; From a284f09effea1908db7985dbfb5458c9100038d8 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 7 Jun 2023 14:23:37 -0500 Subject: [PATCH 08/12] vhost: Fix crash during early vhost_transport_send_pkt calls If userspace does VHOST_VSOCK_SET_GUEST_CID before VHOST_SET_OWNER we can race where: 1. thread0 calls vhost_transport_send_pkt -> vhost_work_queue 2. thread1 does VHOST_SET_OWNER which calls vhost_worker_create. 3. vhost_worker_create will set the dev->worker pointer before setting the worker->vtsk pointer. 4. thread0's vhost_work_queue will see the dev->worker pointer is set and try to call vhost_task_wake using not yet set worker->vtsk pointer. 5. We then crash since vtsk is NULL. Before commit 6e890c5d5021 ("vhost: use vhost_tasks for worker threads"), we only had the worker pointer so we could just check it to see if VHOST_SET_OWNER has been done. After that commit we have the vhost_worker and vhost_task pointer, so we can now hit the bug above. 
This patch embeds the vhost_worker in the vhost_dev and moves the work list initialization back to vhost_dev_init, so we can just check the worker.vtsk pointer to check if VHOST_SET_OWNER has been done like before. Fixes: 6e890c5d5021 ("vhost: use vhost_tasks for worker threads") Signed-off-by: Mike Christie Message-Id: <20230607192338.6041-2-michael.christie@oracle.com> Signed-off-by: Michael S. Tsirkin Reported-by: syzbot+d0d442c22fa8db45ff0e@syzkaller.appspotmail.com Reviewed-by: Stefano Garzarella --- drivers/vhost/vhost.c | 50 +++++++++++++++---------------------------- drivers/vhost/vhost.h | 2 +- 2 files changed, 18 insertions(+), 34 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index ecb3b397bb38..ca1041c88c68 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -235,7 +235,7 @@ void vhost_dev_flush(struct vhost_dev *dev) { struct vhost_flush_struct flush; - if (dev->worker) { + if (dev->worker.vtsk) { init_completion(&flush.wait_event); vhost_work_init(&flush.work, vhost_flush_work); @@ -247,7 +247,7 @@ EXPORT_SYMBOL_GPL(vhost_dev_flush); void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { - if (!dev->worker) + if (!dev->worker.vtsk) return; if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) { @@ -255,8 +255,8 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) * sure it was not in the list. * test_and_set_bit() implies a memory barrier. 
*/ - llist_add(&work->node, &dev->worker->work_list); - vhost_task_wake(dev->worker->vtsk); + llist_add(&work->node, &dev->worker.work_list); + vhost_task_wake(dev->worker.vtsk); } } EXPORT_SYMBOL_GPL(vhost_work_queue); @@ -264,7 +264,7 @@ EXPORT_SYMBOL_GPL(vhost_work_queue); /* A lockless hint for busy polling code to exit the loop */ bool vhost_has_work(struct vhost_dev *dev) { - return dev->worker && !llist_empty(&dev->worker->work_list); + return !llist_empty(&dev->worker.work_list); } EXPORT_SYMBOL_GPL(vhost_has_work); @@ -456,7 +456,8 @@ void vhost_dev_init(struct vhost_dev *dev, dev->umem = NULL; dev->iotlb = NULL; dev->mm = NULL; - dev->worker = NULL; + memset(&dev->worker, 0, sizeof(dev->worker)); + init_llist_head(&dev->worker.work_list); dev->iov_limit = iov_limit; dev->weight = weight; dev->byte_weight = byte_weight; @@ -530,47 +531,30 @@ static void vhost_detach_mm(struct vhost_dev *dev) static void vhost_worker_free(struct vhost_dev *dev) { - struct vhost_worker *worker = dev->worker; - - if (!worker) + if (!dev->worker.vtsk) return; - dev->worker = NULL; - WARN_ON(!llist_empty(&worker->work_list)); - vhost_task_stop(worker->vtsk); - kfree(worker); + WARN_ON(!llist_empty(&dev->worker.work_list)); + vhost_task_stop(dev->worker.vtsk); + dev->worker.kcov_handle = 0; + dev->worker.vtsk = NULL; } static int vhost_worker_create(struct vhost_dev *dev) { - struct vhost_worker *worker; struct vhost_task *vtsk; char name[TASK_COMM_LEN]; - int ret; - worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT); - if (!worker) - return -ENOMEM; - - dev->worker = worker; - worker->kcov_handle = kcov_common_handle(); - init_llist_head(&worker->work_list); snprintf(name, sizeof(name), "vhost-%d", current->pid); - vtsk = vhost_task_create(vhost_worker, worker, name); - if (!vtsk) { - ret = -ENOMEM; - goto free_worker; - } + vtsk = vhost_task_create(vhost_worker, &dev->worker, name); + if (!vtsk) + return -ENOMEM; - worker->vtsk = vtsk; + dev->worker.kcov_handle = 
kcov_common_handle(); + dev->worker.vtsk = vtsk; vhost_task_start(vtsk); return 0; - -free_worker: - kfree(worker); - dev->worker = NULL; - return ret; } /* Caller should have device mutex */ diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 0308638cdeee..305ec8593d46 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -154,7 +154,7 @@ struct vhost_dev { struct vhost_virtqueue **vqs; int nvqs; struct eventfd_ctx *log_ctx; - struct vhost_worker *worker; + struct vhost_worker worker; struct vhost_iotlb *umem; struct vhost_iotlb *iotlb; spinlock_t iotlb_lock; From 4b13cbef797048fbb525f8c635a5279e9d209d93 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 7 Jun 2023 14:23:38 -0500 Subject: [PATCH 09/12] vhost: Fix worker hangs due to missed wake up calls We can race where we have added work to the work_list, but vhost_task_fn has passed that check but not yet set us into TASK_INTERRUPTIBLE. wake_up_process will see us in TASK_RUNNING and just return. This bug was introduced in commit f9010dbdce91 ("fork, vhost: Use CLONE_THREAD to fix freezer/ps regression") when I moved the setting of TASK_INTERRUPTIBLE to simplify the code and avoid get_signal from logging warnings about being in the wrong state. This moves the setting of TASK_INTERRUPTIBLE back to before we test if we need to stop the task to avoid a possible race there as well. We then have vhost_worker set TASK_RUNNING if it finds work similar to before. Fixes: f9010dbdce91 ("fork, vhost: Use CLONE_THREAD to fix freezer/ps regression") Signed-off-by: Mike Christie Message-Id: <20230607192338.6041-3-michael.christie@oracle.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/vhost.c | 2 ++ kernel/vhost_task.c | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index ca1041c88c68..1f80eac5d6ae 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -341,6 +341,8 @@ static bool vhost_worker(void *data) node = llist_del_all(&worker->work_list); if (node) { + __set_current_state(TASK_RUNNING); + node = llist_reverse_order(node); /* make sure flag is seen after deletion */ smp_wmb(); diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index f80d5c51ae67..da35e5b7f047 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -28,10 +28,6 @@ static int vhost_task_fn(void *data) for (;;) { bool did_work; - /* mb paired w/ vhost_task_stop */ - if (test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) - break; - if (!dead && signal_pending(current)) { struct ksignal ksig; /* @@ -48,11 +44,17 @@ static int vhost_task_fn(void *data) clear_thread_flag(TIF_SIGPENDING); } - did_work = vtsk->fn(vtsk->data); - if (!did_work) { - set_current_state(TASK_INTERRUPTIBLE); - schedule(); + /* mb paired w/ vhost_task_stop */ + set_current_state(TASK_INTERRUPTIBLE); + + if (test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) { + __set_current_state(TASK_RUNNING); + break; } + + did_work = vtsk->fn(vtsk->data); + if (!did_work) + schedule(); } complete(&vtsk->exited); From 55d8122f5cd62d5aaa225d7167dcd14a44c850b9 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 24 Apr 2023 15:50:30 -0700 Subject: [PATCH 10/12] vhost: support PACKED when setting-getting vring_base Use the right structs for PACKED or split vqs when setting and getting the vring base. Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Signed-off-by: Shannon Nelson Message-Id: <20230424225031.18947-3-shannon.nelson@amd.com> Signed-off-by: Michael S. 
Tsirkin Acked-by: Jason Wang --- drivers/vhost/vhost.c | 18 +++++++++++++----- drivers/vhost/vhost.h | 8 ++++++-- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 1f80eac5d6ae..60c9ebd629dd 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1600,17 +1600,25 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg r = -EFAULT; break; } - if (s.num > 0xffff) { - r = -EINVAL; - break; + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) { + vq->last_avail_idx = s.num & 0xffff; + vq->last_used_idx = (s.num >> 16) & 0xffff; + } else { + if (s.num > 0xffff) { + r = -EINVAL; + break; + } + vq->last_avail_idx = s.num; } - vq->last_avail_idx = s.num; /* Forget the cached index value. */ vq->avail_idx = vq->last_avail_idx; break; case VHOST_GET_VRING_BASE: s.index = idx; - s.num = vq->last_avail_idx; + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) + s.num = (u32)vq->last_avail_idx | ((u32)vq->last_used_idx << 16); + else + s.num = vq->last_avail_idx; if (copy_to_user(argp, &s, sizeof s)) r = -EFAULT; break; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 305ec8593d46..fc900be504b3 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -92,13 +92,17 @@ struct vhost_virtqueue { /* The routine to call when the Guest pings us, or timeout. */ vhost_work_fn_t handle_kick; - /* Last available index we saw. */ + /* Last available index we saw. + * Values are limited to 0x7fff, and the high bit is used as + * a wrap counter when using VIRTIO_F_RING_PACKED. */ u16 last_avail_idx; /* Caches available index value from user. */ u16 avail_idx; - /* Last index we used. */ + /* Last index we used. + * Values are limited to 0x7fff, and the high bit is used as + * a wrap counter when using VIRTIO_F_RING_PACKED. 
*/ u16 last_used_idx; /* Used flags */ From beee7fdb5b56a46415a4992d28dd4c2d06eb52df Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 24 Apr 2023 15:50:31 -0700 Subject: [PATCH 11/12] vhost_vdpa: support PACKED when setting-getting vring_base Use the right structs for PACKED or split vqs when setting and getting the vring base. Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Signed-off-by: Shannon Nelson Message-Id: <20230424225031.18947-4-shannon.nelson@amd.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/vdpa.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 89d5ecfe7d2e..bf77924d5b60 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -607,7 +607,14 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, if (r) return r; - vq->last_avail_idx = vq_state.split.avail_index; + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) { + vq->last_avail_idx = vq_state.packed.last_avail_idx | + (vq_state.packed.last_avail_counter << 15); + vq->last_used_idx = vq_state.packed.last_used_idx | + (vq_state.packed.last_used_counter << 15); + } else { + vq->last_avail_idx = vq_state.split.avail_index; + } break; } @@ -625,9 +632,15 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, break; case VHOST_SET_VRING_BASE: - vq_state.split.avail_index = vq->last_avail_idx; - if (ops->set_vq_state(vdpa, idx, &vq_state)) - r = -EINVAL; + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) { + vq_state.packed.last_avail_idx = vq->last_avail_idx & 0x7fff; + vq_state.packed.last_avail_counter = !!(vq->last_avail_idx & 0x8000); + vq_state.packed.last_used_idx = vq->last_used_idx & 0x7fff; + vq_state.packed.last_used_counter = !!(vq->last_used_idx & 0x8000); + } else { + vq_state.split.avail_index = vq->last_avail_idx; + } + r = ops->set_vq_state(vdpa, idx, &vq_state); break; case VHOST_SET_VRING_CALL: 
From 07496eeab577eef1d4912b3e1b502a2b52002ac3 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 15 Feb 2023 15:33:49 -0700 Subject: [PATCH 12/12] tools/virtio: use canonical ftrace path The canonical location for the tracefs filesystem is at /sys/kernel/tracing. But, from Documentation/trace/ftrace.rst: Before 4.1, all ftrace tracing control files were within the debugfs file system, which is typically located at /sys/kernel/debug/tracing. For backward compatibility, when mounting the debugfs file system, the tracefs file system will be automatically mounted at: /sys/kernel/debug/tracing A few spots in tools/virtio still refer to this older debugfs path, so let's update them to avoid confusion. Signed-off-by: Ross Zwisler Message-Id: <20230215223350.2658616-6-zwisler@google.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Mukesh Ojha --- tools/virtio/virtio-trace/README | 2 +- tools/virtio/virtio-trace/trace-agent.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/virtio/virtio-trace/README b/tools/virtio/virtio-trace/README index 4fb9368bf751..0127ff0c54b0 100644 --- a/tools/virtio/virtio-trace/README +++ b/tools/virtio/virtio-trace/README @@ -95,7 +95,7 @@ Run 1) Enable ftrace in the guest - # echo 1 > /sys/kernel/debug/tracing/events/sched/enable + # echo 1 > /sys/kernel/tracing/events/sched/enable 2) Run trace agent in the guest This agent must be operated as root. 
diff --git a/tools/virtio/virtio-trace/trace-agent.c b/tools/virtio/virtio-trace/trace-agent.c index cdfe77c2b4c8..7e2d9bbf0b84 100644 --- a/tools/virtio/virtio-trace/trace-agent.c +++ b/tools/virtio/virtio-trace/trace-agent.c @@ -18,8 +18,9 @@ #define PIPE_DEF_BUFS 16 #define PIPE_MIN_SIZE (PAGE_SIZE*PIPE_DEF_BUFS) #define PIPE_MAX_SIZE (1024*1024) -#define READ_PATH_FMT \ - "/sys/kernel/debug/tracing/per_cpu/cpu%d/trace_pipe_raw" +#define TRACEFS "/sys/kernel/tracing" +#define DEBUGFS "/sys/kernel/debug/tracing" +#define READ_PATH_FMT "%s/per_cpu/cpu%d/trace_pipe_raw" #define WRITE_PATH_FMT "/dev/virtio-ports/trace-path-cpu%d" #define CTL_PATH "/dev/virtio-ports/agent-ctl-path" @@ -120,9 +121,12 @@ static const char *make_path(int cpu_num, bool this_is_write_path) if (this_is_write_path) /* write(output) path */ ret = snprintf(buf, PATH_MAX, WRITE_PATH_FMT, cpu_num); - else + else { /* read(input) path */ - ret = snprintf(buf, PATH_MAX, READ_PATH_FMT, cpu_num); + ret = snprintf(buf, PATH_MAX, READ_PATH_FMT, TRACEFS, cpu_num); + if (ret > 0 && access(buf, F_OK) != 0) + ret = snprintf(buf, PATH_MAX, READ_PATH_FMT, DEBUGFS, cpu_num); + } if (ret <= 0) { pr_err("Failed to generate %s path(CPU#%d):%d\n",