linux/fs/io-wq.h

125 lines
3.0 KiB
C
Raw Normal View History

2019-10-22 19:25:58 +03:00
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H
struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HASHED = 4,
IO_WQ_WORK_UNBOUND = 32,
IO_WQ_WORK_NO_CANCEL = 256,
IO_WQ_WORK_CONCURRENT = 512,
2019-10-22 19:25:58 +03:00
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
enum io_wq_cancel {
IO_WQ_CANCEL_OK, /* cancelled before started */
IO_WQ_CANCEL_RUNNING, /* found, running, and attempted cancelled */
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
struct io_wq_work_node {
struct io_wq_work_node *next;
};
struct io_wq_work_list {
struct io_wq_work_node *first;
struct io_wq_work_node *last;
};
static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list)
{
if (!list->first) {
list->last = node;
WRITE_ONCE(list->first, node);
} else {
list->last->next = node;
list->last = node;
}
}
static inline void wq_node_del(struct io_wq_work_list *list,
struct io_wq_work_node *node,
struct io_wq_work_node *prev)
{
if (node == list->first)
WRITE_ONCE(list->first, node->next);
if (node == list->last)
list->last = prev;
if (prev)
prev->next = node->next;
io-wq: clear node->next on list deletion If someone removes a node from a list, and then later adds it back to a list, we can have invalid data in ->next. This can cause all sorts of issues. One such use case is the IORING_OP_POLL_ADD command, which will do just that if we race and get woken twice without any pending events. This is a pretty rare case, but can happen under extreme loads. Dan reports that he saw the following crash: BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD d283ce067 P4D d283ce067 PUD e5ca04067 PMD 0 Oops: 0002 [#1] SMP CPU: 17 PID: 10726 Comm: tao:fast-fiber Kdump: loaded Not tainted 5.2.9-02851-gac7bc042d2d1 #116 Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019 RIP: 0010:io_wqe_enqueue+0x3e/0xd0 Code: 34 24 74 55 8b 47 58 48 8d 6f 50 85 c0 74 50 48 89 df e8 35 7c 75 00 48 83 7b 08 00 48 8b 14 24 0f 84 84 00 00 00 48 8b 4b 10 <48> 89 11 48 89 53 10 83 63 20 fe 48 89 c6 48 89 df e8 0c 7a 75 00 RSP: 0000:ffffc90006858a08 EFLAGS: 00010082 RAX: 0000000000000002 RBX: ffff889037492fc0 RCX: 0000000000000000 RDX: ffff888e40cc11a8 RSI: ffff888e40cc11a8 RDI: ffff889037492fc0 RBP: ffff889037493010 R08: 00000000000000c3 R09: ffffc90006858ab8 R10: 0000000000000000 R11: 0000000000000000 R12: ffff888e40cc11a8 R13: 0000000000000000 R14: 00000000000000c3 R15: ffff888e40cc1100 FS: 00007fcddc9db700(0000) GS:ffff88903fa40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000e479f5003 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: <IRQ> io_poll_wake+0x12f/0x2a0 __wake_up_common+0x86/0x120 __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_rcv_established+0x557/0x630 tcp_v6_do_rcv+0x118/0x3c0 tcp_v6_rcv+0x97e/0x9d0 ip6_protocol_deliver_rcu+0xe3/0x440 ip6_input+0x3d/0xc0 ? ip6_protocol_deliver_rcu+0x440/0x440 ipv6_rcv+0x56/0xd0 ? ip6_rcv_finish_core.isra.18+0x80/0x80 __netif_receive_skb_one_core+0x50/0x70 netif_receive_skb_internal+0x2f/0xa0 napi_gro_receive+0x125/0x150 mlx5e_handle_rx_cqe+0x1d9/0x5a0 ? mlx5e_poll_tx_cq+0x305/0x560 mlx5e_poll_rx_cq+0x49f/0x9c5 mlx5e_napi_poll+0xee/0x640 ? smp_reschedule_interrupt+0x16/0xd0 ? reschedule_interrupt+0xf/0x20 net_rx_action+0x286/0x3d0 __do_softirq+0xca/0x297 irq_exit+0x96/0xa0 do_IRQ+0x54/0xe0 common_interrupt+0xf/0xf </IRQ> RIP: 0033:0x7fdc627a2e3a Code: 31 c0 85 d2 0f 88 f6 00 00 00 55 48 89 e5 41 57 41 56 4c 63 f2 41 55 41 54 53 48 83 ec 18 48 85 ff 0f 84 c7 00 00 00 48 8b 07 <41> 89 d4 49 89 f5 48 89 fb 48 85 c0 0f 84 64 01 00 00 48 83 78 10 when running a networked workload with about 5000 sockets being polled for. Fix this by clearing node->next when the node is being removed from the list. Fixes: 6206f0e180d4 ("io-wq: shrink io_wq_work a bit") Reported-by: Dan Melnic <dmm@fb.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-12-05 03:19:44 +03:00
node->next = NULL;
}
#define wq_list_for_each(pos, prv, head) \
for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
#define INIT_WQ_LIST(list) do { \
(list)->first = NULL; \
(list)->last = NULL; \
} while (0)
2019-10-22 19:25:58 +03:00
struct io_wq_work {
union {
struct io_wq_work_node list;
void *data;
};
2019-10-22 19:25:58 +03:00
void (*func)(struct io_wq_work **);
struct files_struct *files;
struct mm_struct *mm;
const struct cred *creds;
struct fs_struct *fs;
unsigned flags;
pid_t task_pid;
2019-10-22 19:25:58 +03:00
};
#define INIT_IO_WORK(work, _func) \
do { \
*(work) = (struct io_wq_work){ .func = _func }; \
} while (0) \
2019-10-22 19:25:58 +03:00
typedef void (free_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct user_struct *user;
free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
2019-10-22 19:25:58 +03:00
void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid);
2019-10-22 19:25:58 +03:00
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data);
2019-10-22 19:25:58 +03:00
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
#else
static inline void io_wq_worker_sleeping(struct task_struct *tsk)
{
}
static inline void io_wq_worker_running(struct task_struct *tsk)
{
}
#endif
2019-10-22 19:25:58 +03:00
static inline bool io_wq_current_is_worker(void)
{
return in_task() && (current->flags & PF_IO_WORKER);
}
#endif