epoll: take epitem list out of struct file
Move the head of epitem list out of struct file; for epoll ones it's moved into struct eventpoll (->refs there), for non-epoll - into the new object (struct epitem_head). In place of ->f_ep_links we leave a pointer to the list head (->f_ep). ->f_ep is protected by ->f_lock and it's zeroed as soon as the list of epitems becomes empty (that can happen only in ep_remove() by now). The list of files for reverse path check is *not* going through struct file now - it's a single-linked list going through epitem_head instances. It's terminated by ERR_PTR(-1) (== EP_UNACTIVE_POINTER), so the elements of list can be distinguished by head->next != NULL. epitem_head instances are allocated at ep_insert() time (by attach_epitem()) and freed either by ep_remove() (if it empties the set of epitems *and* epitem_head does not belong to the reverse path check list) or by clear_tfile_check_list() when the list is emptied (if the set of epitems is empty by that point). Allocations are done from a separate slab - minimal kmalloc() size is too large on some architectures. As the result, we trim struct file _and_ get rid of the games with temporary file references. Locking and barriers are interesting (aren't they always); see unlist_file() and ep_remove() for details. The non-obvious part is that ep_remove() needs to decide if it will be the one to free the damn thing *before* actually storing NULL to head->epitems.first - that's what smp_load_acquire is for in there. unlist_file() lockless path is safe, since we hit it only if we observe NULL in head->epitems.first and whoever had done that store is guaranteed to have observed non-NULL in head->next. IOW, their last access had been the store of NULL into ->epitems.first and we can safely free the sucker. OTOH, we are under rcu_read_lock() and both epitem and epitem->file have their freeing RCU-delayed. So if we see non-NULL ->epitems.first, we can grab ->f_lock (all epitems in there share the same struct file) and safely recheck the emptiness of ->epitems; again, ->next is still non-NULL, so ep_remove() couldn't have freed head yet. ->f_lock serializes us wrt ep_remove(); the rest is trivial. Note that once head->epitems becomes NULL, nothing can get inserted into it - the only remaining reference to head after that point is from the reverse path check list. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
This commit is contained in:
parent
d9f41e3c95
commit
319c151747
168
fs/eventpoll.c
168
fs/eventpoll.c
@ -215,6 +215,7 @@ struct eventpoll {
|
||||
|
||||
/* used to optimize loop detection check */
|
||||
u64 gen;
|
||||
struct hlist_head refs;
|
||||
|
||||
#ifdef CONFIG_NET_RX_BUSY_POLL
|
||||
/* used to track busy poll napi_id */
|
||||
@ -259,7 +260,45 @@ static struct kmem_cache *pwq_cache __read_mostly;
|
||||
* List of files with newly added links, where we may need to limit the number
|
||||
* of emanating paths. Protected by the epmutex.
|
||||
*/
|
||||
static LIST_HEAD(tfile_check_list);
|
||||
struct epitems_head {
|
||||
struct hlist_head epitems;
|
||||
struct epitems_head *next;
|
||||
};
|
||||
static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
|
||||
|
||||
static struct kmem_cache *ephead_cache __read_mostly;
|
||||
|
||||
static inline void free_ephead(struct epitems_head *head)
|
||||
{
|
||||
if (head)
|
||||
kmem_cache_free(ephead_cache, head);
|
||||
}
|
||||
|
||||
static void list_file(struct file *file)
|
||||
{
|
||||
struct epitems_head *head;
|
||||
|
||||
head = container_of(file->f_ep, struct epitems_head, epitems);
|
||||
if (!head->next) {
|
||||
head->next = tfile_check_list;
|
||||
tfile_check_list = head;
|
||||
}
|
||||
}
|
||||
|
||||
static void unlist_file(struct epitems_head *head)
|
||||
{
|
||||
struct epitems_head *to_free = head;
|
||||
struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
|
||||
if (p) {
|
||||
struct epitem *epi= container_of(p, struct epitem, fllink);
|
||||
spin_lock(&epi->ffd.file->f_lock);
|
||||
if (!hlist_empty(&head->epitems))
|
||||
to_free = NULL;
|
||||
head->next = NULL;
|
||||
spin_unlock(&epi->ffd.file->f_lock);
|
||||
}
|
||||
free_ephead(to_free);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SYSCTL
|
||||
|
||||
@ -632,6 +671,8 @@ static void epi_rcu_free(struct rcu_head *head)
|
||||
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
|
||||
{
|
||||
struct file *file = epi->ffd.file;
|
||||
struct epitems_head *to_free;
|
||||
struct hlist_head *head;
|
||||
|
||||
lockdep_assert_irqs_enabled();
|
||||
|
||||
@ -642,8 +683,20 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
|
||||
|
||||
/* Remove the current item from the list of epoll hooks */
|
||||
spin_lock(&file->f_lock);
|
||||
to_free = NULL;
|
||||
head = file->f_ep;
|
||||
if (head->first == &epi->fllink && !epi->fllink.next) {
|
||||
file->f_ep = NULL;
|
||||
if (!is_file_epoll(file)) {
|
||||
struct epitems_head *v;
|
||||
v = container_of(head, struct epitems_head, epitems);
|
||||
if (!smp_load_acquire(&v->next))
|
||||
to_free = v;
|
||||
}
|
||||
}
|
||||
hlist_del_rcu(&epi->fllink);
|
||||
spin_unlock(&file->f_lock);
|
||||
free_ephead(to_free);
|
||||
|
||||
rb_erase_cached(&epi->rbn, &ep->rbr);
|
||||
|
||||
@ -852,7 +905,11 @@ void eventpoll_release_file(struct file *file)
|
||||
* Besides, ep_remove() acquires the lock, so we can't hold it here.
|
||||
*/
|
||||
mutex_lock(&epmutex);
|
||||
hlist_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
|
||||
if (unlikely(!file->f_ep)) {
|
||||
mutex_unlock(&epmutex);
|
||||
return;
|
||||
}
|
||||
hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
|
||||
ep = epi->ep;
|
||||
mutex_lock_nested(&ep->mtx, 0);
|
||||
ep_remove(ep, epi);
|
||||
@ -1248,7 +1305,7 @@ static void path_count_init(void)
|
||||
path_count[i] = 0;
|
||||
}
|
||||
|
||||
static int reverse_path_check_proc(struct file *file, int depth)
|
||||
static int reverse_path_check_proc(struct hlist_head *refs, int depth)
|
||||
{
|
||||
int error = 0;
|
||||
struct epitem *epi;
|
||||
@ -1257,14 +1314,12 @@ static int reverse_path_check_proc(struct file *file, int depth)
|
||||
return -1;
|
||||
|
||||
/* CTL_DEL can remove links here, but that can't increase our count */
|
||||
hlist_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
|
||||
struct file *recepient = epi->ep->file;
|
||||
if (WARN_ON(!is_file_epoll(recepient)))
|
||||
continue;
|
||||
if (hlist_empty(&recepient->f_ep_links))
|
||||
hlist_for_each_entry_rcu(epi, refs, fllink) {
|
||||
struct hlist_head *refs = &epi->ep->refs;
|
||||
if (hlist_empty(refs))
|
||||
error = path_count_inc(depth);
|
||||
else
|
||||
error = reverse_path_check_proc(recepient, depth + 1);
|
||||
error = reverse_path_check_proc(refs, depth + 1);
|
||||
if (error != 0)
|
||||
break;
|
||||
}
|
||||
@ -1272,7 +1327,7 @@ static int reverse_path_check_proc(struct file *file, int depth)
|
||||
}
|
||||
|
||||
/**
|
||||
* reverse_path_check - The tfile_check_list is list of file *, which have
|
||||
* reverse_path_check - The tfile_check_list is list of epitem_head, which have
|
||||
* links that are proposed to be newly added. We need to
|
||||
* make sure that those added links don't add too many
|
||||
* paths such that we will spend all our time waking up
|
||||
@ -1283,19 +1338,18 @@ static int reverse_path_check_proc(struct file *file, int depth)
|
||||
*/
|
||||
static int reverse_path_check(void)
|
||||
{
|
||||
int error = 0;
|
||||
struct file *current_file;
|
||||
struct epitems_head *p;
|
||||
|
||||
/* let's call this for all tfiles */
|
||||
list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
|
||||
for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
|
||||
int error;
|
||||
path_count_init();
|
||||
rcu_read_lock();
|
||||
error = reverse_path_check_proc(current_file, 0);
|
||||
error = reverse_path_check_proc(&p->epitems, 0);
|
||||
rcu_read_unlock();
|
||||
if (error)
|
||||
break;
|
||||
return error;
|
||||
}
|
||||
return error;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ep_create_wakeup_source(struct epitem *epi)
|
||||
@ -1336,6 +1390,39 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
|
||||
wakeup_source_unregister(ws);
|
||||
}
|
||||
|
||||
static int attach_epitem(struct file *file, struct epitem *epi)
|
||||
{
|
||||
struct epitems_head *to_free = NULL;
|
||||
struct hlist_head *head = NULL;
|
||||
struct eventpoll *ep = NULL;
|
||||
|
||||
if (is_file_epoll(file))
|
||||
ep = file->private_data;
|
||||
|
||||
if (ep) {
|
||||
head = &ep->refs;
|
||||
} else if (!READ_ONCE(file->f_ep)) {
|
||||
allocate:
|
||||
to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
|
||||
if (!to_free)
|
||||
return -ENOMEM;
|
||||
head = &to_free->epitems;
|
||||
}
|
||||
spin_lock(&file->f_lock);
|
||||
if (!file->f_ep) {
|
||||
if (unlikely(!head)) {
|
||||
spin_unlock(&file->f_lock);
|
||||
goto allocate;
|
||||
}
|
||||
file->f_ep = head;
|
||||
to_free = NULL;
|
||||
}
|
||||
hlist_add_head_rcu(&epi->fllink, file->f_ep);
|
||||
spin_unlock(&file->f_lock);
|
||||
free_ephead(to_free);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Must be called with "mtx" held.
|
||||
*/
|
||||
@ -1367,19 +1454,21 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
|
||||
epi->event = *event;
|
||||
epi->next = EP_UNACTIVE_PTR;
|
||||
|
||||
atomic_long_inc(&ep->user->epoll_watches);
|
||||
|
||||
if (tep)
|
||||
mutex_lock_nested(&tep->mtx, 1);
|
||||
/* Add the current item to the list of active epoll hook for this file */
|
||||
spin_lock(&tfile->f_lock);
|
||||
hlist_add_head_rcu(&epi->fllink, &tfile->f_ep_links);
|
||||
spin_unlock(&tfile->f_lock);
|
||||
if (full_check && !tep) {
|
||||
get_file(tfile);
|
||||
list_add(&tfile->f_tfile_llink, &tfile_check_list);
|
||||
if (unlikely(attach_epitem(tfile, epi) < 0)) {
|
||||
kmem_cache_free(epi_cache, epi);
|
||||
if (tep)
|
||||
mutex_unlock(&tep->mtx);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
if (full_check && !tep)
|
||||
list_file(tfile);
|
||||
|
||||
atomic_long_inc(&ep->user->epoll_watches);
|
||||
|
||||
/*
|
||||
* Add the current item to the RB tree. All RB tree operations are
|
||||
* protected by "mtx", and ep_insert() is called with "mtx" held.
|
||||
@ -1813,11 +1902,7 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
|
||||
* not already there, and calling reverse_path_check()
|
||||
* during ep_insert().
|
||||
*/
|
||||
if (list_empty(&epi->ffd.file->f_tfile_llink)) {
|
||||
if (get_file_rcu(epi->ffd.file))
|
||||
list_add(&epi->ffd.file->f_tfile_llink,
|
||||
&tfile_check_list);
|
||||
}
|
||||
list_file(epi->ffd.file);
|
||||
}
|
||||
}
|
||||
mutex_unlock(&ep->mtx);
|
||||
@ -1844,16 +1929,13 @@ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
|
||||
|
||||
static void clear_tfile_check_list(void)
|
||||
{
|
||||
struct file *file;
|
||||
|
||||
/* first clear the tfile_check_list */
|
||||
while (!list_empty(&tfile_check_list)) {
|
||||
file = list_first_entry(&tfile_check_list, struct file,
|
||||
f_tfile_llink);
|
||||
list_del_init(&file->f_tfile_llink);
|
||||
fput(file);
|
||||
rcu_read_lock();
|
||||
while (tfile_check_list != EP_UNACTIVE_PTR) {
|
||||
struct epitems_head *head = tfile_check_list;
|
||||
tfile_check_list = head->next;
|
||||
unlist_file(head);
|
||||
}
|
||||
INIT_LIST_HEAD(&tfile_check_list);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2003,9 +2085,8 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
|
||||
if (error)
|
||||
goto error_tgt_fput;
|
||||
if (op == EPOLL_CTL_ADD) {
|
||||
if (!hlist_empty(&f.file->f_ep_links) ||
|
||||
ep->gen == loop_check_gen ||
|
||||
is_file_epoll(tf.file)) {
|
||||
if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
|
||||
is_file_epoll(tf.file)) {
|
||||
mutex_unlock(&ep->mtx);
|
||||
error = epoll_mutex_lock(&epmutex, 0, nonblock);
|
||||
if (error)
|
||||
@ -2216,6 +2297,9 @@ static int __init eventpoll_init(void)
|
||||
pwq_cache = kmem_cache_create("eventpoll_pwq",
|
||||
sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
|
||||
|
||||
ephead_cache = kmem_cache_create("ep_head",
|
||||
sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
|
||||
|
||||
return 0;
|
||||
}
|
||||
fs_initcall(eventpoll_init);
|
||||
|
@ -113,7 +113,6 @@ static struct file *__alloc_file(int flags, const struct cred *cred)
|
||||
rwlock_init(&f->f_owner.lock);
|
||||
spin_lock_init(&f->f_lock);
|
||||
mutex_init(&f->f_pos_lock);
|
||||
eventpoll_init_file(f);
|
||||
f->f_flags = flags;
|
||||
f->f_mode = OPEN_FMODE(flags);
|
||||
/* f->f_version: 0 */
|
||||
|
@ -22,14 +22,6 @@ struct file;
|
||||
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
|
||||
#endif
|
||||
|
||||
/* Used to initialize the epoll bits inside the "struct file" */
|
||||
static inline void eventpoll_init_file(struct file *file)
|
||||
{
|
||||
INIT_HLIST_HEAD(&file->f_ep_links);
|
||||
INIT_LIST_HEAD(&file->f_tfile_llink);
|
||||
}
|
||||
|
||||
|
||||
/* Used to release the epoll bits inside the "struct file" */
|
||||
void eventpoll_release_file(struct file *file);
|
||||
|
||||
@ -50,7 +42,7 @@ static inline void eventpoll_release(struct file *file)
|
||||
* because the file in on the way to be removed and nobody ( but
|
||||
* eventpoll ) has still a reference to this file.
|
||||
*/
|
||||
if (likely(hlist_empty(&file->f_ep_links)))
|
||||
if (likely(!file->f_ep))
|
||||
return;
|
||||
|
||||
/*
|
||||
@ -72,7 +64,6 @@ static inline int ep_op_has_event(int op)
|
||||
|
||||
#else
|
||||
|
||||
static inline void eventpoll_init_file(struct file *file) {}
|
||||
static inline void eventpoll_release(struct file *file) {}
|
||||
|
||||
#endif
|
||||
|
@ -923,7 +923,7 @@ struct file {
|
||||
const struct file_operations *f_op;
|
||||
|
||||
/*
|
||||
* Protects f_ep_links, f_flags.
|
||||
* Protects f_ep, f_flags.
|
||||
* Must not be taken from IRQ context.
|
||||
*/
|
||||
spinlock_t f_lock;
|
||||
@ -946,8 +946,7 @@ struct file {
|
||||
|
||||
#ifdef CONFIG_EPOLL
|
||||
/* Used by fs/eventpoll.c to link all the hooks to this file */
|
||||
struct hlist_head f_ep_links;
|
||||
struct list_head f_tfile_llink;
|
||||
struct hlist_head *f_ep;
|
||||
#endif /* #ifdef CONFIG_EPOLL */
|
||||
struct address_space *f_mapping;
|
||||
errseq_t f_wb_err;
|
||||
|
Loading…
x
Reference in New Issue
Block a user