diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 3c2326568193..55631cd73939 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1326,6 +1326,14 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE)) return; + /* + * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings + * that got written to. Without this, we'd have to also lock the + * anon_vma if one exists. + */ + if (vma->anon_vma) + return; + hpage = find_lock_page(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); if (!hpage) @@ -1338,6 +1346,19 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (!pmd) goto drop_hpage; + /* + * We need to lock the mapping so that from here on, only GUP-fast and + * hardware page walks can access the parts of the page tables that + * we're operating on. + */ + i_mmap_lock_write(vma->vm_file->f_mapping); + + /* + * This spinlock should be unnecessary: Nobody else should be accessing + * the page tables under spinlock protection here, only + * lockless_pages_from_mm() and the hardware page walker can access page + * tables while all the high-level locks are held in write mode. + */ start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); /* step 1: check all mapped PTEs are to the right huge page */ @@ -1384,12 +1405,12 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) } /* step 4: collapse pmd */ - ptl = pmd_lock(vma->vm_mm, pmd); _pmd = pmdp_collapse_flush(vma, haddr, pmd); - spin_unlock(ptl); mm_dec_nr_ptes(mm); pte_free(mm, pmd_pgtable(_pmd)); + i_mmap_unlock_write(vma->vm_file->f_mapping); + drop_hpage: unlock_page(hpage); put_page(hpage); @@ -1397,6 +1418,7 @@ drop_hpage: abort: pte_unmap_unlock(start_pte, ptl); + i_mmap_unlock_write(vma->vm_file->f_mapping); goto drop_hpage; } @@ -1446,7 +1468,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * An alternative would be drop the check, but check that page * table is clear before calling pmdp_collapse_flush() under * ptl. It has higher chance to recover THP for the VMA, but - * has higher cost too. + * has higher cost too. It would also probably require locking + * the anon_vma. */ if (vma->anon_vma) continue; @@ -1468,10 +1491,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) */ if (down_write_trylock(&mm->mmap_sem)) { if (!khugepaged_test_exit(mm)) { - spinlock_t *ptl = pmd_lock(mm, pmd); /* assume page table is clear */ _pmd = pmdp_collapse_flush(vma, addr, pmd); - spin_unlock(ptl); mm_dec_nr_ptes(mm); pte_free(mm, pmd_pgtable(_pmd)); }