08bdcc35f0
If someone removes a node from a list, and then later adds it back to
a list, we can have invalid data in ->next. This can cause all sorts
of issues. One such use case is the IORING_OP_POLL_ADD command, which
will do just that if we race and get woken twice without any pending
events. This is a pretty rare case, but can happen under extreme loads.
Dan reports that he saw the following crash:
BUG: kernel NULL pointer dereference, address: 0000000000000000
PGD d283ce067 P4D d283ce067 PUD e5ca04067 PMD 0
Oops: 0002 [#1] SMP
CPU: 17 PID: 10726 Comm: tao:fast-fiber Kdump: loaded Not tainted 5.2.9-02851-gac7bc042d2d1 #116
Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019
RIP: 0010:io_wqe_enqueue+0x3e/0xd0
Code: 34 24 74 55 8b 47 58 48 8d 6f 50 85 c0 74 50 48 89 df e8 35 7c 75 00 48 83 7b 08 00 48 8b 14 24 0f 84 84 00 00 00 48 8b 4b 10 <48> 89 11 48 89 53 10 83 63 20 fe 48 89 c6 48 89 df e8 0c 7a 75 00
RSP: 0000:ffffc90006858a08 EFLAGS: 00010082
RAX: 0000000000000002 RBX: ffff889037492fc0 RCX: 0000000000000000
RDX: ffff888e40cc11a8 RSI: ffff888e40cc11a8 RDI: ffff889037492fc0
RBP: ffff889037493010 R08: 00000000000000c3 R09: ffffc90006858ab8
R10: 0000000000000000 R11: 0000000000000000 R12: ffff888e40cc11a8
R13: 0000000000000000 R14: 00000000000000c3 R15: ffff888e40cc1100
FS: 00007fcddc9db700(0000) GS:ffff88903fa40000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000e479f5003 CR4: 00000000007606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
<IRQ>
io_poll_wake+0x12f/0x2a0
__wake_up_common+0x86/0x120
__wake_up_common_lock+0x7a/0xc0
sock_def_readable+0x3c/0x70
tcp_rcv_established+0x557/0x630
tcp_v6_do_rcv+0x118/0x3c0
tcp_v6_rcv+0x97e/0x9d0
ip6_protocol_deliver_rcu+0xe3/0x440
ip6_input+0x3d/0xc0
? ip6_protocol_deliver_rcu+0x440/0x440
ipv6_rcv+0x56/0xd0
? ip6_rcv_finish_core.isra.18+0x80/0x80
__netif_receive_skb_one_core+0x50/0x70
netif_receive_skb_internal+0x2f/0xa0
napi_gro_receive+0x125/0x150
mlx5e_handle_rx_cqe+0x1d9/0x5a0
? mlx5e_poll_tx_cq+0x305/0x560
mlx5e_poll_rx_cq+0x49f/0x9c5
mlx5e_napi_poll+0xee/0x640
? smp_reschedule_interrupt+0x16/0xd0
? reschedule_interrupt+0xf/0x20
net_rx_action+0x286/0x3d0
__do_softirq+0xca/0x297
irq_exit+0x96/0xa0
do_IRQ+0x54/0xe0
common_interrupt+0xf/0xf
</IRQ>
RIP: 0033:0x7fdc627a2e3a
Code: 31 c0 85 d2 0f 88 f6 00 00 00 55 48 89 e5 41 57 41 56 4c 63 f2 41 55 41 54 53 48 83 ec 18 48 85 ff 0f 84 c7 00 00 00 48 8b 07 <41> 89 d4 49 89 f5 48 89 fb 48 85 c0 0f 84 64 01 00 00 48 83 78 10
when running a networked workload with about 5000 sockets being polled
for. Fix this by clearing node->next when the node is being removed from
the list.
Fixes: 6206f0e180 ("io-wq: shrink io_wq_work a bit")
Reported-by: Dan Melnic <dmm@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
125 lines
2.9 KiB
C
125 lines
2.9 KiB
C
#ifndef INTERNAL_IO_WQ_H
|
|
#define INTERNAL_IO_WQ_H
|
|
|
|
/* Forward declaration only; no member of struct io_wq is defined in this header. */
struct io_wq;
|
|
|
|
/*
 * Work flag constants. The eight IO_WQ_WORK_* values are each a distinct
 * single bit (bits 0 through 7).
 * NOTE(review): presumably these are kept in io_wq_work.flags - confirm
 * against the users of this header.
 */
enum {
	IO_WQ_WORK_CANCEL	= 1 << 0,
	IO_WQ_WORK_HAS_MM	= 1 << 1,
	IO_WQ_WORK_HASHED	= 1 << 2,
	IO_WQ_WORK_NEEDS_USER	= 1 << 3,
	IO_WQ_WORK_NEEDS_FILES	= 1 << 4,
	IO_WQ_WORK_UNBOUND	= 1 << 5,
	IO_WQ_WORK_INTERNAL	= 1 << 6,
	IO_WQ_WORK_CB		= 1 << 7,

	/* upper 8 bits are used for hash key */
	IO_WQ_HASH_SHIFT	= 24,
};
|
|
|
|
/* Outcome reported by the io_wq cancellation calls declared in this header. */
enum io_wq_cancel {
	IO_WQ_CANCEL_OK = 0,	/* cancelled before started */
	IO_WQ_CANCEL_RUNNING,	/* found, running, and attempted cancelled */
	IO_WQ_CANCEL_NOTFOUND,	/* work not found */
};
|
|
|
|
/* Link that lets an object sit on a singly linked io_wq_work_list. */
struct io_wq_work_node {
	/* following node on the list; wq_node_del() resets it to NULL */
	struct io_wq_work_node	*next;
};
|
|
|
|
/*
 * Head of a singly linked list of io_wq_work_node entries. INIT_WQ_LIST()
 * sets both pointers to NULL; wq_list_empty() tests only @first.
 */
struct io_wq_work_list {
	struct io_wq_work_node	*first;	/* head of the list, NULL when empty */
	struct io_wq_work_node	*last;	/* tail of the list, where appends go */
};
|
|
|
|
static inline void wq_list_add_tail(struct io_wq_work_node *node,
|
|
struct io_wq_work_list *list)
|
|
{
|
|
if (!list->first) {
|
|
list->first = list->last = node;
|
|
} else {
|
|
list->last->next = node;
|
|
list->last = node;
|
|
}
|
|
}
|
|
|
|
static inline void wq_node_del(struct io_wq_work_list *list,
|
|
struct io_wq_work_node *node,
|
|
struct io_wq_work_node *prev)
|
|
{
|
|
if (node == list->first)
|
|
list->first = node->next;
|
|
if (node == list->last)
|
|
list->last = prev;
|
|
if (prev)
|
|
prev->next = node->next;
|
|
node->next = NULL;
|
|
}
|
|
|
|
/*
 * Iterate over @head starting at its first node. On each pass @pos is the
 * current node and @prv is the node visited on the previous pass (NULL on
 * the first one). The loop ends when @pos becomes NULL.
 */
#define wq_list_for_each(pos, prv, head)			\
	for ((pos) = (head)->first, (prv) = NULL;		\
	     (pos) != NULL;					\
	     (prv) = (pos), (pos) = (pos)->next)
|
|
|
|
/* Non-zero when @list has no nodes, i.e. its first pointer is NULL. */
#define wq_list_empty(list)	(!(list)->first)
|
|
/*
 * Put @list into the empty state by clearing both end pointers.
 * @list is evaluated twice, so pass an expression without side effects.
 */
#define INIT_WQ_LIST(list)			\
	do {					\
		(list)->first = NULL;		\
		(list)->last = NULL;		\
	} while (0)
|
|
|
|
struct io_wq_work {
|
|
union {
|
|
struct io_wq_work_node list;
|
|
void *data;
|
|
};
|
|
void (*func)(struct io_wq_work **);
|
|
struct files_struct *files;
|
|
unsigned flags;
|
|
};
|
|
|
|
/*
 * Initialise @work: clear the list linkage, install @_func as the callback,
 * and zero the flags and the files pointer.
 *
 * @work:   pointer to the work item; evaluated several times, so it must
 *          not have side effects.
 * @_func:  callback stored in @work->func.
 *
 * The last line deliberately has no trailing backslash, otherwise the line
 * following the macro would be spliced into its body.
 */
#define INIT_IO_WORK(work, _func)			\
	do {						\
		(work)->list.next = NULL;		\
		(work)->func = (_func);			\
		(work)->flags = 0;			\
		(work)->files = NULL;			\
	} while (0)
|
|
|
|
/*
 * Function types for the get_work/put_work members of struct io_wq_data.
 * Each takes one work item and returns nothing.
 * NOTE(review): the names suggest a reference get/put pair - confirm with
 * the code that supplies these callbacks.
 */
typedef void get_work_fn(struct io_wq_work *work);
typedef void put_work_fn(struct io_wq_work *work);
|
|
|
|
struct io_wq_data {
|
|
struct mm_struct *mm;
|
|
struct user_struct *user;
|
|
const struct cred *creds;
|
|
|
|
get_work_fn *get_work;
|
|
put_work_fn *put_work;
|
|
};
|
|
|
|
/*
 * Queue lifetime. Both functions are defined outside this header.
 * NOTE(review): @bounded is presumably a limit on bounded workers - confirm
 * against the implementation.
 */
struct io_wq *io_wq_create(unsigned int bounded, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
|
|
|
|
/*
 * Work submission and flushing. All three are defined outside this header.
 * NOTE(review): @val of io_wq_enqueue_hashed() is presumably the value the
 * hash key is derived from - confirm against the implementation.
 */
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work,
			  void *val);
void io_wq_flush(struct io_wq *wq);
|
|
|
|
/*
 * Cancellation entry points, defined outside this header.
 * io_wq_cancel_work() reports its outcome as an enum io_wq_cancel value.
 */
void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq,
				    struct io_wq_work *cwork);
|
|
|
|
/*
 * Callback type accepted by io_wq_cancel_cb(): takes a work item and an
 * opaque pointer and returns a bool.
 * NOTE(review): presumably returns true when the work item matches - confirm.
 */
typedef bool work_cancel_fn(struct io_wq_work *work, void *data);
|
|
|
|
/*
 * Cancellation driven by a caller-supplied callback; defined outside this
 * header. The outcome is reported as an enum io_wq_cancel value.
 * NOTE(review): @data is presumably forwarded to @cancel - confirm.
 */
enum io_wq_cancel
io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, void *data);
|
|
|
|
/*
 * Worker hooks that take a task pointer. When CONFIG_IO_WQ is defined they
 * are only declared here; otherwise they become empty inline stubs, so
 * callers need no #ifdef of their own.
 */
#ifdef CONFIG_IO_WQ
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
#else /* !CONFIG_IO_WQ */
static inline void io_wq_worker_sleeping(struct task_struct *tsk) { }
static inline void io_wq_worker_running(struct task_struct *tsk) { }
#endif /* CONFIG_IO_WQ */
|
|
|
|
#endif /* INTERNAL_IO_WQ_H */
|