From f8024f1f36a30a082b0457d5779c8847cea57f57 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 2 Oct 2023 18:14:08 -0600
Subject: [PATCH 1/4] io_uring/kbuf: don't allow registered buffer rings on
 highmem pages

syzbot reports that registering a mapped buffer ring on arm32 can
trigger an OOPS. Registered buffer rings have two modes, one of them
is the application passing in the memory that the buffer ring should
reside in. Once those pages are mapped, we use page_address() to get
a virtual address. This will obviously fail on highmem pages, which
aren't mapped.

Add a check if we have any highmem pages after mapping, and fail the
attempt to register a provided buffer ring if we do. This will return
the same error as kernels that don't support provided buffer rings to
begin with.

Link: https://lore.kernel.org/io-uring/000000000000af635c0606bcb889@google.com/
Fixes: c56e022c0a27 ("io_uring: add support for user mapped provided buffer ring")
Cc: stable@vger.kernel.org
Reported-by: syzbot+2113e61b8848fa7951d8@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/kbuf.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 556f4df25b0f..9123138aa9f4 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -477,7 +477,7 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 {
 	struct io_uring_buf_ring *br;
 	struct page **pages;
-	int nr_pages;
+	int i, nr_pages;
 
 	pages = io_pin_pages(reg->ring_addr,
 			     flex_array_size(br, bufs, reg->ring_entries),
@@ -485,6 +485,17 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
+	/*
+	 * Apparently some 32-bit boxes (ARM) will return highmem pages,
+	 * which then need to be mapped. We could support that, but it'd
+	 * complicate the code and slowdown the common cases quite a bit.
+	 * So just error out, returning -EINVAL just like we did on kernels
+	 * that didn't support mapped buffer rings.
+	 */
+	for (i = 0; i < nr_pages; i++)
+		if (PageHighMem(pages[i]))
+			goto error_unpin;
+
 	br = page_address(pages[0]);
 #ifdef SHM_COLOUR
 	/*
@@ -496,13 +507,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
 	 * this transparently.
 	 */
-	if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
-		int i;
-
-		for (i = 0; i < nr_pages; i++)
-			unpin_user_page(pages[i]);
-		return -EINVAL;
-	}
+	if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
+		goto error_unpin;
 #endif
 	bl->buf_pages = pages;
 	bl->buf_nr_pages = nr_pages;
@@ -510,6 +516,11 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 	bl->is_mapped = 1;
 	bl->is_mmap = 0;
 	return 0;
+error_unpin:
+	for (i = 0; i < nr_pages; i++)
+		unpin_user_page(pages[i]);
+	kvfree(pages);
+	return -EINVAL;
 }
 
 static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
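
As an illustration of the registration mode this patch rejects, here is a
minimal userspace sketch (an assumption-laden illustration, not part of the
patch: raw syscalls are used instead of liburing, error handling is omitted,
and entries must be a power of two):

#include <linux/io_uring.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Register a provided buffer ring backed by application-owned memory. */
static int register_user_pbuf_ring(int ring_fd, unsigned int entries)
{
	struct io_uring_buf_reg reg;
	void *mem;

	/* Page-aligned backing memory, supplied by the application. */
	if (posix_memalign(&mem, 4096, entries * sizeof(struct io_uring_buf)))
		return -1;

	memset(&reg, 0, sizeof(reg));
	reg.ring_addr = (unsigned long) mem;
	reg.ring_entries = entries;
	reg.bgid = 0;

	/* With this fix, fails with -EINVAL if any pinned page is highmem. */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_PBUF_RING, &reg, 1);
}

Applications that cannot guarantee lowmem backing (or SHM_COLOUR alignment)
should use IOU_PBUF_RING_MMAP and let the kernel allocate the memory instead.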

From 1658633c04653578429ff5dfc62fdc159203a8f2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 2 Oct 2023 19:51:38 -0600
Subject: [PATCH 2/4] io_uring: ensure io_lockdep_assert_cq_locked() handles
 disabled rings

io_lockdep_assert_cq_locked() checks that locking is correctly done
when a CQE is posted. If the ring is set up in a disabled state with
IORING_SETUP_R_DISABLED, then ctx->submitter_task isn't assigned until
the ring is later enabled. We generally don't post CQEs in this state,
as no SQEs can be submitted. However, it is possible to generate a CQE
if tagged resources are being updated. If this happens and
PROVE_LOCKING is enabled, then the locking check helper will
dereference ctx->submitter_task, which hasn't been set yet.

Fix up io_lockdep_assert_cq_locked() to handle this case correctly.
While at it, convert it to a static inline as well, so that generated
line offsets will actually reflect which condition failed, rather than
just the line offset for io_lockdep_assert_cq_locked() itself.

Reported-and-tested-by: syzbot+efc45d4e7ba6ab4ef1eb@syzkaller.appspotmail.com
Fixes: f26cc9593581 ("io_uring: lockdep annotate CQ locking")
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.h | 41 +++++++++++++++++++++++++++--------------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 547c30582fb8..0bc145614a6e 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -86,20 +86,33 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
			bool cancel_all);
 
-#define io_lockdep_assert_cq_locked(ctx)				\
-	do {								\
-		lockdep_assert(in_task());				\
-									\
-		if (ctx->flags & IORING_SETUP_IOPOLL) {			\
-			lockdep_assert_held(&ctx->uring_lock);		\
-		} else if (!ctx->task_complete) {			\
-			lockdep_assert_held(&ctx->completion_lock);	\
-		} else if (ctx->submitter_task->flags & PF_EXITING) {	\
-			lockdep_assert(current_work());			\
-		} else {						\
-			lockdep_assert(current == ctx->submitter_task);	\
-		}							\
-	} while (0)
+#if defined(CONFIG_PROVE_LOCKING)
+static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
+{
+	lockdep_assert(in_task());
+
+	if (ctx->flags & IORING_SETUP_IOPOLL) {
+		lockdep_assert_held(&ctx->uring_lock);
+	} else if (!ctx->task_complete) {
+		lockdep_assert_held(&ctx->completion_lock);
+	} else if (ctx->submitter_task) {
+		/*
+		 * ->submitter_task may be NULL and we can still post a CQE,
+		 * if the ring has been setup with IORING_SETUP_R_DISABLED.
+		 * Not from an SQE, as those cannot be submitted, but via
+		 * updating tagged resources.
+		 */
+		if (ctx->submitter_task->flags & PF_EXITING)
+			lockdep_assert(current_work());
+		else
+			lockdep_assert(current == ctx->submitter_task);
+	}
+}
+#else
+static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
+{
+}
+#endif
 
 static inline void io_req_task_work_add(struct io_kiocb *req)
 {
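
The trigger described above is subtle, so a sketch may help. A ring created
with IORING_SETUP_R_DISABLED has no submitter task yet, but replacing a
registered file slot that carries a non-zero tag still posts a CQE for the
old tag. A hedged reproducer-style sketch (error handling omitted; stdin and
stdout serve as arbitrary descriptors):

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct io_uring_params p;
	struct io_uring_rsrc_register rr;
	struct io_uring_rsrc_update2 up;
	__u64 tags[1] = { 0xdead };	/* non-zero: CQE posted on replace */
	__s32 fds[1] = { 0 };
	__s32 new_fds[1] = { 1 };
	int ring_fd;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_R_DISABLED;	/* ->submitter_task stays NULL */
	ring_fd = syscall(__NR_io_uring_setup, 4, &p);

	memset(&rr, 0, sizeof(rr));
	rr.nr = 1;
	rr.data = (unsigned long) fds;
	rr.tags = (unsigned long) tags;
	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_FILES2,
		&rr, sizeof(rr));

	/* Replacing the tagged slot generates a CQE on a disabled ring. */
	memset(&up, 0, sizeof(up));
	up.nr = 1;
	up.data = (unsigned long) new_fds;
	syscall(__NR_io_uring_register, ring_fd,
		IORING_REGISTER_FILES_UPDATE2, &up, sizeof(up));
	return 0;
}

With CONFIG_PROVE_LOCKING set, the old macro dereferenced the NULL
->submitter_task on that CQE; the static inline above checks for NULL first.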

From 223ef474316466e9f61f6e0064f3a6fe4923a2c5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 3 Oct 2023 09:59:58 -0600
Subject: [PATCH 3/4] io_uring: don't allow IORING_SETUP_NO_MMAP rings on
 highmem pages

On at least arm32, but presumably any arch with highmem, if the
application passes in memory that resides in highmem for the rings,
then we should fail that ring creation. We fail it with -EINVAL,
which is what kernels that don't support IORING_SETUP_NO_MMAP will
do as well.

Cc: stable@vger.kernel.org
Fixes: 03d89a2de25b ("io_uring: support for user allocated memory for rings/sqes")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 783ed0fff71b..d839a80a6751 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2686,7 +2686,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
 {
 	struct page **page_array;
 	unsigned int nr_pages;
-	int ret;
+	int ret, i;
 
 	*npages = 0;
 
@@ -2716,6 +2716,20 @@ err:
 	 */
 	if (page_array[0] != page_array[ret - 1])
 		goto err;
+
+	/*
+	 * Can't support mapping user allocated ring memory on 32-bit archs
+	 * where it could potentially reside in highmem. Just fail those with
+	 * -EINVAL, just like we did on kernels that didn't support this
+	 * feature.
+	 */
+	for (i = 0; i < nr_pages; i++) {
+		if (PageHighMem(page_array[i])) {
+			ret = -EINVAL;
+			goto err;
+		}
+	}
+
 	*pages = page_array;
 	*npages = nr_pages;
 	return page_to_virt(page_array[0]);
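
For reference, the IORING_SETUP_NO_MMAP mode being restricted here looks
roughly like the following from userspace (a sketch under assumptions: one
page per region is plenty for a single-entry ring, error handling is
omitted; the rings go in cq_off.user_addr and the SQE array in
sq_off.user_addr):

#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct io_uring_params p;
	void *rings, *sqes;

	/* Page-aligned, physically contiguous (single page) regions. */
	rings = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	sqes = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_NO_MMAP;
	p.cq_off.user_addr = (unsigned long) rings;	/* SQ/CQ rings */
	p.sq_off.user_addr = (unsigned long) sqes;	/* SQE array */

	/* After this patch: -EINVAL if a backing page sits in highmem. */
	return syscall(__NR_io_uring_setup, 1, &p) < 0;
}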

From 0f8baa3c9802fbfe313c901e1598397b61b91ada Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Thu, 5 Oct 2023 13:55:31 -0400
Subject: [PATCH 4/4] io-wq: fully initialize wqe before calling
 cpuhp_state_add_instance_nocalls()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I received a bug report with the following signature:

[ 1759.937637] BUG: unable to handle page fault for address: ffffffffffffffe8
[ 1759.944564] #PF: supervisor read access in kernel mode
[ 1759.949732] #PF: error_code(0x0000) - not-present page
[ 1759.954901] PGD 7ab615067 P4D 7ab615067 PUD 7ab617067 PMD 0
[ 1759.960596] Oops: 0000 [#1] PREEMPT SMP PTI
[ 1759.964804] CPU: 15 PID: 109 Comm: cpuhp/15 Kdump: loaded Tainted: G X ------- ---  5.14.0-362.3.1.el9_3.x86_64 #1
[ 1759.976609] Hardware name: HPE ProLiant DL380 Gen10/ProLiant DL380 Gen10, BIOS U30 06/20/2018
[ 1759.985181] RIP: 0010:io_wq_for_each_worker.isra.0+0x24/0xa0
[ 1759.990877] Code: 90 90 90 90 90 90 0f 1f 44 00 00 41 56 41 55 41 54 55 48 8d 6f 78 53 48 8b 47 78 48 39 c5 74 4f 49 89 f5 49 89 d4 48 8d 58 e8 <8b> 13 85 d2 74 32 8d 4a 01 89 d0 f0 0f b1 0b 75 5c 09 ca 78 3d 48
[ 1760.009758] RSP: 0000:ffffb6f403603e20 EFLAGS: 00010286
[ 1760.015013] RAX: 0000000000000000 RBX: ffffffffffffffe8 RCX: 0000000000000000
[ 1760.022188] RDX: ffffb6f403603e50 RSI: ffffffffb11e95b0 RDI: ffff9f73b09e9400
[ 1760.029362] RBP: ffff9f73b09e9478 R08: 000000000000000f R09: 0000000000000000
[ 1760.036536] R10: ffffffffffffff00 R11: ffffb6f403603d80 R12: ffffb6f403603e50
[ 1760.043712] R13: ffffffffb11e95b0 R14: ffffffffb28531e8 R15: ffff9f7a6fbdf548
[ 1760.050887] FS:  0000000000000000(0000) GS:ffff9f7a6fbc0000(0000) knlGS:0000000000000000
[ 1760.059025] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1760.064801] CR2: ffffffffffffffe8 CR3: 00000007ab610002 CR4: 00000000007706e0
[ 1760.071976] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1760.079150] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1760.086325] PKRU: 55555554
[ 1760.089044] Call Trace:
[ 1760.091501]  <TASK>
[ 1760.093612]  ? show_trace_log_lvl+0x1c4/0x2df
[ 1760.097995]  ? show_trace_log_lvl+0x1c4/0x2df
[ 1760.102377]  ? __io_wq_cpu_online+0x54/0xb0
[ 1760.106584]  ? __die_body.cold+0x8/0xd
[ 1760.110356]  ? page_fault_oops+0x134/0x170
[ 1760.114479]  ? kernelmode_fixup_or_oops+0x84/0x110
[ 1760.119298]  ? exc_page_fault+0xa8/0x150
[ 1760.123247]  ? asm_exc_page_fault+0x22/0x30
[ 1760.127458]  ? __pfx_io_wq_worker_affinity+0x10/0x10
[ 1760.132453]  ? __pfx_io_wq_worker_affinity+0x10/0x10
[ 1760.137446]  ? io_wq_for_each_worker.isra.0+0x24/0xa0
[ 1760.142527]  __io_wq_cpu_online+0x54/0xb0
[ 1760.146558]  cpuhp_invoke_callback+0x109/0x460
[ 1760.151029]  ? __pfx_io_wq_cpu_offline+0x10/0x10
[ 1760.155673]  ? __pfx_smpboot_thread_fn+0x10/0x10
[ 1760.160320]  cpuhp_thread_fun+0x8d/0x140
[ 1760.164266]  smpboot_thread_fn+0xd3/0x1a0
[ 1760.168297]  kthread+0xdd/0x100
[ 1760.171457]  ? __pfx_kthread+0x10/0x10
[ 1760.175225]  ret_from_fork+0x29/0x50
[ 1760.178826]  </TASK>
[ 1760.181022] Modules linked in: rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache netfs rfkill sunrpc vfat fat dm_multipath intel_rapl_msr intel_rapl_common isst_if_common ipmi_ssif nfit libnvdimm mgag200 i2c_algo_bit ioatdma drm_shmem_helper drm_kms_helper acpi_ipmi syscopyarea x86_pkg_temp_thermal sysfillrect ipmi_si intel_powerclamp sysimgblt ipmi_devintf coretemp acpi_power_meter ipmi_msghandler rapl pcspkr dca intel_pch_thermal intel_cstate ses lpc_ich intel_uncore enclosure hpilo mei_me mei acpi_tad fuse drm xfs sd_mod sg bnx2x nvme nvme_core crct10dif_pclmul crc32_pclmul nvme_common ghash_clmulni_intel smartpqi tg3 t10_pi mdio uas libcrc32c crc32c_intel scsi_transport_sas usb_storage hpwdt wmi dm_mirror dm_region_hash dm_log dm_mod
[ 1760.248623] CR2: ffffffffffffffe8

A CPU hotplug callback was issued before wq->all_list was initialized,
resulting in a NULL pointer dereference. The fix is to fully set up
the io_wq before calling cpuhp_state_add_instance_nocalls().

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Link: https://lore.kernel.org/r/x49y1ghnecs.fsf@segfault.boston.devel.redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io-wq.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 1ecc8c748768..522196dfb0ff 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -1151,9 +1151,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	wq = kzalloc(sizeof(struct io_wq), GFP_KERNEL);
 	if (!wq)
 		return ERR_PTR(-ENOMEM);
-	ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
-	if (ret)
-		goto err_wq;
 
 	refcount_inc(&data->hash->refs);
 	wq->hash = data->hash;
@@ -1186,13 +1183,14 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	wq->task = get_task_struct(data->task);
 	atomic_set(&wq->worker_refs, 1);
 	init_completion(&wq->worker_done);
+	ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
+	if (ret)
+		goto err;
+
 	return wq;
 
 err:
 	io_wq_put_hash(data->hash);
-	cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
-	free_cpumask_var(wq->cpu_mask);
-err_wq:
 	kfree(wq);
 	return ERR_PTR(ret);
 }
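
Beyond io-wq, the rule the fix restores is general: with the _nocalls
variants the callback is not invoked at add time, but a concurrent hotplug
event can run it as soon as the instance is on the list. A sketch of the
safe ordering, using hypothetical names (struct foo, and foo_online set up
elsewhere via cpuhp_setup_state_multi()):

#include <linux/cpuhotplug.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/slab.h>

static enum cpuhp_state foo_online;	/* hypothetical multi-state handle */

struct foo {
	struct hlist_node cpuhp_node;
	struct list_head all_list;	/* walked by the hotplug callback */
};

static struct foo *foo_create(void)
{
	struct foo *f;
	int ret;

	f = kzalloc(sizeof(*f), GFP_KERNEL);
	if (!f)
		return ERR_PTR(-ENOMEM);

	/* Initialize everything the callback may touch first... */
	INIT_LIST_HEAD(&f->all_list);

	/* ...and only then publish the instance to the hotplug core. */
	ret = cpuhp_state_add_instance_nocalls(foo_online, &f->cpuhp_node);
	if (ret) {
		kfree(f);
		return ERR_PTR(ret);
	}
	return f;
}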