mm: swapoff: shmem_unuse() stop eviction without igrab()
The igrab() in shmem_unuse() looks good, but we forgot that it gives no protection against concurrent unmounting: a point made by Konstantin Khlebnikov eight years ago, and then fixed in 2.6.39 by778dd893ae
("tmpfs: fix race between umount and swapoff"). The current 5.1-rc swapoff is liable to hit "VFS: Busy inodes after unmount of tmpfs. Self-destruct in 5 seconds. Have a nice day..." followed by GPF. Once again, give up on using igrab(); but don't go back to making such heavy-handed use of shmem_swaplist_mutex as last time: that would spoil the new design, and I expect could deadlock inside shmem_swapin_page(). Instead, shmem_unuse() just raise a "stop_eviction" count in the shmem- specific inode, and shmem_evict_inode() wait for that to go down to 0. Call it "stop_eviction" rather than "swapoff_busy" because it can be put to use for others later (huge tmpfs patches expect to use it). That simplifies shmem_unuse(), protecting it from both unlink and unmount; and in practice lets it locate all the swap in its first try. But do not rely on that: there's still a theoretical case, when shmem_writepage() might have been preempted after its get_swap_page(), before making the swap entry visible to swapoff. [hughd@google.com: remove incorrect list_del()] Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1904091133570.1898@eggly.anvils Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1904081259400.1523@eggly.anvils Fixes:b56a2d8af9
("mm: rid swapoff of quadratic complexity") Signed-off-by: Hugh Dickins <hughd@google.com> Cc: "Alex Xu (Hello71)" <alex_y_xu@yahoo.ca> Cc: Huang Ying <ying.huang@intel.com> Cc: Kelley Nielsen <kelleynnn@gmail.com> Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> Cc: Rik van Riel <riel@surriel.com> Cc: Vineeth Pillai <vpillai@digitalocean.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
64165b1aff
commit
af53d3e9e0
@ -21,6 +21,7 @@ struct shmem_inode_info {
|
||||
struct list_head swaplist; /* chain of maybes on swap */
|
||||
struct shared_policy policy; /* NUMA memory alloc policy */
|
||||
struct simple_xattrs xattrs; /* list of xattrs */
|
||||
atomic_t stop_eviction; /* hold when working on inode */
|
||||
struct inode vfs_inode;
|
||||
};
|
||||
|
||||
|
40
mm/shmem.c
40
mm/shmem.c
@ -1081,9 +1081,14 @@ static void shmem_evict_inode(struct inode *inode)
|
||||
}
|
||||
spin_unlock(&sbinfo->shrinklist_lock);
|
||||
}
|
||||
if (!list_empty(&info->swaplist)) {
|
||||
while (!list_empty(&info->swaplist)) {
|
||||
/* Wait while shmem_unuse() is scanning this inode... */
|
||||
wait_var_event(&info->stop_eviction,
|
||||
!atomic_read(&info->stop_eviction));
|
||||
mutex_lock(&shmem_swaplist_mutex);
|
||||
list_del_init(&info->swaplist);
|
||||
/* ...but beware of the race if we peeked too early */
|
||||
if (!atomic_read(&info->stop_eviction))
|
||||
list_del_init(&info->swaplist);
|
||||
mutex_unlock(&shmem_swaplist_mutex);
|
||||
}
|
||||
}
|
||||
@ -1227,36 +1232,27 @@ int shmem_unuse(unsigned int type, bool frontswap,
|
||||
unsigned long *fs_pages_to_unuse)
|
||||
{
|
||||
struct shmem_inode_info *info, *next;
|
||||
struct inode *inode;
|
||||
struct inode *prev_inode = NULL;
|
||||
int error = 0;
|
||||
|
||||
if (list_empty(&shmem_swaplist))
|
||||
return 0;
|
||||
|
||||
mutex_lock(&shmem_swaplist_mutex);
|
||||
|
||||
/*
|
||||
* The extra refcount on the inode is necessary to safely dereference
|
||||
* p->next after re-acquiring the lock. New shmem inodes with swap
|
||||
* get added to the end of the list and we will scan them all.
|
||||
*/
|
||||
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
|
||||
if (!info->swapped) {
|
||||
list_del_init(&info->swaplist);
|
||||
continue;
|
||||
}
|
||||
|
||||
inode = igrab(&info->vfs_inode);
|
||||
if (!inode)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Drop the swaplist mutex while searching the inode for swap;
|
||||
* but before doing so, make sure shmem_evict_inode() will not
|
||||
* remove placeholder inode from swaplist, nor let it be freed
|
||||
* (igrab() would protect from unlink, but not from unmount).
|
||||
*/
|
||||
atomic_inc(&info->stop_eviction);
|
||||
mutex_unlock(&shmem_swaplist_mutex);
|
||||
if (prev_inode)
|
||||
iput(prev_inode);
|
||||
prev_inode = inode;
|
||||
|
||||
error = shmem_unuse_inode(inode, type, frontswap,
|
||||
error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
|
||||
fs_pages_to_unuse);
|
||||
cond_resched();
|
||||
|
||||
@ -1264,14 +1260,13 @@ int shmem_unuse(unsigned int type, bool frontswap,
|
||||
next = list_next_entry(info, swaplist);
|
||||
if (!info->swapped)
|
||||
list_del_init(&info->swaplist);
|
||||
if (atomic_dec_and_test(&info->stop_eviction))
|
||||
wake_up_var(&info->stop_eviction);
|
||||
if (error)
|
||||
break;
|
||||
}
|
||||
mutex_unlock(&shmem_swaplist_mutex);
|
||||
|
||||
if (prev_inode)
|
||||
iput(prev_inode);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
@ -2238,6 +2233,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
|
||||
info = SHMEM_I(inode);
|
||||
memset(info, 0, (char *)inode - (char *)info);
|
||||
spin_lock_init(&info->lock);
|
||||
atomic_set(&info->stop_eviction, 0);
|
||||
info->seals = F_SEAL_SEAL;
|
||||
info->flags = flags & VM_NORESERVE;
|
||||
INIT_LIST_HEAD(&info->shrinklist);
|
||||
|
@ -2116,12 +2116,11 @@ retry:
|
||||
* Under global memory pressure, swap entries can be reinserted back
|
||||
* into process space after the mmlist loop above passes over them.
|
||||
*
|
||||
* Limit the number of retries? No: when shmem_unuse()'s igrab() fails,
|
||||
* a shmem inode using swap is being evicted; and when mmget_not_zero()
|
||||
* above fails, that mm is likely to be freeing swap from exit_mmap().
|
||||
* Both proceed at their own independent pace: we could move them to
|
||||
* separate lists, and wait for those lists to be emptied; but it's
|
||||
* easier and more robust (though cpu-intensive) just to keep retrying.
|
||||
* Limit the number of retries? No: when mmget_not_zero() above fails,
|
||||
* that mm is likely to be freeing swap from exit_mmap(), which proceeds
|
||||
* at its own independent pace; and even shmem_writepage() could have
|
||||
* been preempted after get_swap_page(), temporarily hiding that swap.
|
||||
* It's easy and robust (though cpu-intensive) just to keep retrying.
|
||||
*/
|
||||
if (si->inuse_pages) {
|
||||
if (!signal_pending(current))
|
||||
|
Loading…
Reference in New Issue
Block a user