diff --git a/mm/filemap.c b/mm/filemap.c
index 64d014f940e9..2815cb79a246 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2390,28 +2390,92 @@ EXPORT_SYMBOL(generic_file_read_iter);
 
 #ifdef CONFIG_MMU
 #define MMAP_LOTSAMISS  (100)
+static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+					     struct file *fpin)
+{
+	int flags = vmf->flags;
+
+	if (fpin)
+		return fpin;
+
+	/*
+	 * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+	 * anything, so we only pin the file and drop the mmap_sem if only
+	 * FAULT_FLAG_ALLOW_RETRY is set.
+	 */
+	if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+	    FAULT_FLAG_ALLOW_RETRY) {
+		fpin = get_file(vmf->vma->vm_file);
+		up_read(&vmf->vma->vm_mm->mmap_sem);
+	}
+	return fpin;
+}
 
 /*
- * Synchronous readahead happens when we don't even find
- * a page in the page cache at all.
+ * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
+ * @vmf - the vm_fault for this fault.
+ * @page - the page to lock.
+ * @fpin - the pointer to the file we may pin (or is already pinned).
+ *
+ * This works similar to lock_page_or_retry in that it can drop the mmap_sem.
+ * It differs in that it actually returns the page locked if it returns 1 and 0
+ * if it couldn't lock the page.  If we did have to drop the mmap_sem then fpin
+ * will point to the pinned file and needs to be fput()'ed at a later point.
  */
-static void do_sync_mmap_readahead(struct vm_fault *vmf)
+static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
+				     struct file **fpin)
+{
+	if (trylock_page(page))
+		return 1;
+
+	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+		return 0;
+
+	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
+	if (vmf->flags & FAULT_FLAG_KILLABLE) {
+		if (__lock_page_killable(page)) {
+			/*
+			 * We didn't have the right flags to drop the mmap_sem,
+			 * but all fault_handlers only check for fatal signals
+			 * if we return VM_FAULT_RETRY, so we need to drop the
+			 * mmap_sem here and return 0 if we don't have a fpin.
+			 */
+			if (*fpin == NULL)
+				up_read(&vmf->vma->vm_mm->mmap_sem);
+			return 0;
+		}
+	} else
+		__lock_page(page);
+	return 1;
+}
+
+
+/*
+ * Synchronous readahead happens when we don't even find a page in the page
+ * cache at all.  We don't want to perform IO under the mmap sem, so if we have
+ * to drop the mmap sem we return the file that was pinned in order for us to do
+ * that.  If we didn't pin a file then we return NULL.  The file that is
+ * returned needs to be fput()'ed when we're done with it.
+ */
+static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 {
 	struct file *file = vmf->vma->vm_file;
 	struct file_ra_state *ra = &file->f_ra;
 	struct address_space *mapping = file->f_mapping;
+	struct file *fpin = NULL;
 	pgoff_t offset = vmf->pgoff;
 
 	/* If we don't want any read-ahead, don't bother */
 	if (vmf->vma->vm_flags & VM_RAND_READ)
-		return;
+		return fpin;
 	if (!ra->ra_pages)
-		return;
+		return fpin;
 
 	if (vmf->vma->vm_flags & VM_SEQ_READ) {
+		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
 		page_cache_sync_readahead(mapping, ra, file, offset,
 					  ra->ra_pages);
-		return;
+		return fpin;
 	}
 
 	/* Avoid banging the cache line if not needed */
@@ -2423,37 +2487,44 @@ static void do_sync_mmap_readahead(struct vm_fault *vmf)
 	 * stop bothering with read-ahead. It will only hurt.
 	 */
 	if (ra->mmap_miss > MMAP_LOTSAMISS)
-		return;
+		return fpin;
 
 	/*
 	 * mmap read-around
 	 */
+	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
 	ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
 	ra->size = ra->ra_pages;
 	ra->async_size = ra->ra_pages / 4;
 	ra_submit(ra, mapping, file);
+	return fpin;
 }
 
 /*
  * Asynchronous readahead happens when we find the page and PG_readahead,
- * so we want to possibly extend the readahead further..
+ * so we want to possibly extend the readahead further.  We return the file that
+ * was pinned if we have to drop the mmap_sem in order to do IO.
  */
-static void do_async_mmap_readahead(struct vm_fault *vmf,
-				    struct page *page)
+static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
+					    struct page *page)
 {
 	struct file *file = vmf->vma->vm_file;
 	struct file_ra_state *ra = &file->f_ra;
 	struct address_space *mapping = file->f_mapping;
+	struct file *fpin = NULL;
 	pgoff_t offset = vmf->pgoff;
 
 	/* If we don't want any read-ahead, don't bother */
 	if (vmf->vma->vm_flags & VM_RAND_READ)
-		return;
+		return fpin;
 	if (ra->mmap_miss > 0)
 		ra->mmap_miss--;
-	if (PageReadahead(page))
+	if (PageReadahead(page)) {
+		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
 		page_cache_async_readahead(mapping, ra, file,
 					   page, offset, ra->ra_pages);
+	}
+	return fpin;
 }
 
 /**
@@ -2485,6 +2556,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 {
 	int error;
 	struct file *file = vmf->vma->vm_file;
+	struct file *fpin = NULL;
 	struct address_space *mapping = file->f_mapping;
 	struct file_ra_state *ra = &file->f_ra;
 	struct inode *inode = mapping->host;
@@ -2506,25 +2578,26 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
-		do_async_mmap_readahead(vmf, page);
+		fpin = do_async_mmap_readahead(vmf, page);
 	} else if (!page) {
 		/* No page in the page cache at all */
-		do_sync_mmap_readahead(vmf);
 		count_vm_event(PGMAJFAULT);
 		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;
+		fpin = do_sync_mmap_readahead(vmf);
 retry_find:
 		page = pagecache_get_page(mapping, offset,
 					  FGP_CREAT|FGP_FOR_MMAP,
 					  vmf->gfp_mask);
-		if (!page)
+		if (!page) {
+			if (fpin)
+				goto out_retry;
 			return vmf_error(-ENOMEM);
+		}
 	}
 
-	if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) {
-		put_page(page);
-		return ret | VM_FAULT_RETRY;
-	}
+	if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
+		goto out_retry;
 
 	/* Did it get truncated? */
 	if (unlikely(page->mapping != mapping)) {
@@ -2541,6 +2614,16 @@ retry_find:
 	if (unlikely(!PageUptodate(page)))
 		goto page_not_uptodate;
 
+	/*
+	 * We've made it this far and we had to drop our mmap_sem, now is the
+	 * time to return to the upper layer and have it re-find the vma and
+	 * redo the fault.
+	 */
+	if (fpin) {
+		unlock_page(page);
+		goto out_retry;
+	}
+
 	/*
 	 * Found the page and have a reference on it.
 	 * We must recheck i_size under page lock.
@@ -2563,12 +2646,15 @@ page_not_uptodate:
 	 * and we need to check for errors.
 	 */
 	ClearPageError(page);
+	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
 	error = mapping->a_ops->readpage(file, page);
 	if (!error) {
 		wait_on_page_locked(page);
 		if (!PageUptodate(page))
 			error = -EIO;
 	}
+	if (fpin)
+		goto out_retry;
 	put_page(page);
 
 	if (!error || error == AOP_TRUNCATED_PAGE)
@@ -2577,6 +2663,18 @@ page_not_uptodate:
 	/* Things didn't work out. Return zero to tell the mm layer so. */
 	shrink_readahead_size_eio(file, ra);
 	return VM_FAULT_SIGBUS;
+
+out_retry:
+	/*
+	 * We dropped the mmap_sem, we need to return to the fault handler to
+	 * re-find the vma and come back and find our hopefully still populated
+	 * page.
+	 */
+	if (page)
+		put_page(page);
+	if (fpin)
+		fput(fpin);
+	return ret | VM_FAULT_RETRY;
 }
 EXPORT_SYMBOL(filemap_fault);
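
A quick illustration of the locking rule the patch relies on: maybe_unlock_mmap_for_io() only pins the file and drops the mmap_sem when FAULT_FLAG_ALLOW_RETRY is set and FAULT_FLAG_RETRY_NOWAIT is not, because a NOWAIT caller cannot tolerate the retry. The standalone userspace sketch below models just that flag test; the FAULT_FLAG_* values are illustrative stand-ins rather than the definitions from include/linux/mm.h, and the helper name may_drop_mmap_sem() is invented for the example.

	/* Standalone sketch (not kernel code): models the flag check used by
	 * maybe_unlock_mmap_for_io() in the patch above. */
	#include <stdio.h>

	#define FAULT_FLAG_ALLOW_RETRY	0x04	/* illustrative stand-in value */
	#define FAULT_FLAG_RETRY_NOWAIT	0x08	/* illustrative stand-in value */

	/* Return 1 if the fault path may pin the file and drop mmap_sem, else 0. */
	static int may_drop_mmap_sem(unsigned int flags)
	{
		return (flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
			FAULT_FLAG_ALLOW_RETRY;
	}

	int main(void)
	{
		printf("no retry flags:         %d\n", may_drop_mmap_sem(0));
		printf("ALLOW_RETRY only:       %d\n",
		       may_drop_mmap_sem(FAULT_FLAG_ALLOW_RETRY));
		printf("ALLOW_RETRY and NOWAIT: %d\n",
		       may_drop_mmap_sem(FAULT_FLAG_ALLOW_RETRY |
					 FAULT_FLAG_RETRY_NOWAIT));
		return 0;
	}

Only the middle case permits dropping the lock; in that case filemap_fault() returns VM_FAULT_RETRY with the mmap_sem already released, and the caller re-takes it and repeats the fault, which is why out_retry only fput()s the pinned file instead of unlocking anything.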