powerpc/64s/radix: Fix MADV_[FREE|DONTNEED] TLB flush miss problem with THP
The patch 99baac21e4 ("mm: fix MADV_[FREE|DONTNEED] TLB flush miss problem") added a force flush mode to the mmu_gather flush, which unconditionally flushes the entire address range being invalidated (even if the actual ptes only covered a smaller range), to solve a problem with concurrent threads invalidating the same PTEs causing them to miss TLBs that need flushing.

This does not work with powerpc, which invalidates mmu_gather batches according to page size. Have powerpc flush all possible page sizes in the range if it encounters this concurrency condition.

Patch 4647706ebe ("mm: always flush VMA ranges affected by zap_page_range") does add a TLB flush for all page sizes on powerpc for the zap_page_range case, but that is to be removed and replaced with the mmu_gather flush to avoid redundant flushing. It is also thought not to cover other obscure race conditions:

https://lkml.kernel.org/r/BD3A0EBE-ECF4-41D4-87FA-C755EA9AB6BD@gmail.com

Hash does not have a problem because it invalidates TLBs inside the page table locks.

Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
commit 02390f66bd
parent 69a8405999
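Before the diff, a minimal userspace sketch of the arithmetic the patch relies on may help: when the concurrency condition is hit, the range is flushed at every page size that could be present, with each larger size rounded inward to its own alignment. The SZ_2M/SZ_1G macros, the aligned_subrange() helper, and the addresses in main() below are illustrative stand-ins, not kernel code (the kernel uses PMD_SIZE/PMD_MASK and PUD_SIZE/PUD_MASK for the same computation):

/*
 * Illustrative userspace sketch, not kernel code: given an invalidation
 * range [start, end), compute the 2M- and 1G-aligned sub-ranges that the
 * patched __radix__flush_tlb_range() would also flush when it cannot know
 * which page sizes are actually mapped in the range.
 */
#include <stdio.h>

#define SZ_2M	(1UL << 21)	/* assumed radix PMD (huge page) size */
#define SZ_1G	(1UL << 30)	/* assumed radix PUD (giant page) size */

/* Round the range inward to 'size' boundaries; returns 0 if it becomes empty. */
static int aligned_subrange(unsigned long start, unsigned long end,
			    unsigned long size,
			    unsigned long *sub_start, unsigned long *sub_end)
{
	*sub_start = (start + size - 1) & ~(size - 1);	/* round start up */
	*sub_end = end & ~(size - 1);			/* round end down */
	return *sub_start < *sub_end;			/* empty -> skip that flush */
}

int main(void)
{
	unsigned long start = 0x7f0000100000UL;	/* arbitrary example range */
	unsigned long end   = 0x7f00c0300000UL;
	unsigned long s, e;

	printf("base pages: flush [%#lx, %#lx)\n", start, end);
	if (aligned_subrange(start, end, SZ_2M, &s, &e))
		printf("2M pages  : flush [%#lx, %#lx)\n", s, e);
	if (aligned_subrange(start, end, SZ_1G, &s, &e))
		printf("1G pages  : flush [%#lx, %#lx)\n", s, e);
	return 0;
}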
@@ -689,22 +689,17 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
 
-void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
-		     unsigned long end)
-
+static inline void __radix__flush_tlb_range(struct mm_struct *mm,
+					unsigned long start, unsigned long end,
+					bool flush_all_sizes)
 {
-	struct mm_struct *mm = vma->vm_mm;
 	unsigned long pid;
 	unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
 	unsigned long page_size = 1UL << page_shift;
 	unsigned long nr_pages = (end - start) >> page_shift;
 	bool local, full;
 
-#ifdef CONFIG_HUGETLB_PAGE
-	if (is_vm_hugetlb_page(vma))
-		return radix__flush_hugetlb_tlb_range(vma, start, end);
-#endif
-
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		return;
@@ -738,37 +733,64 @@ is_local:
 			_tlbie_pid(pid, RIC_FLUSH_TLB);
 		}
 	} else {
-		bool hflush = false;
+		bool hflush = flush_all_sizes;
+		bool gflush = flush_all_sizes;
 		unsigned long hstart, hend;
+		unsigned long gstart, gend;
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT;
-		hend = end >> HPAGE_PMD_SHIFT;
-		if (hstart < hend) {
-			hstart <<= HPAGE_PMD_SHIFT;
-			hend <<= HPAGE_PMD_SHIFT;
+		if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
 			hflush = true;
+
+		if (hflush) {
+			hstart = (start + PMD_SIZE - 1) & PMD_MASK;
+			hend = end & PMD_MASK;
+			if (hstart == hend)
+				hflush = false;
+		}
+
+		if (gflush) {
+			gstart = (start + PUD_SIZE - 1) & PUD_MASK;
+			gend = end & PUD_MASK;
+			if (gstart == gend)
+				gflush = false;
 		}
-#endif
 
 		asm volatile("ptesync": : :"memory");
 		if (local) {
 			__tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
 			if (hflush)
 				__tlbiel_va_range(hstart, hend, pid,
-						HPAGE_PMD_SIZE, MMU_PAGE_2M);
+						PMD_SIZE, MMU_PAGE_2M);
+			if (gflush)
+				__tlbiel_va_range(gstart, gend, pid,
+						PUD_SIZE, MMU_PAGE_1G);
 			asm volatile("ptesync": : :"memory");
 		} else {
 			__tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
 			if (hflush)
 				__tlbie_va_range(hstart, hend, pid,
-						HPAGE_PMD_SIZE, MMU_PAGE_2M);
+						PMD_SIZE, MMU_PAGE_2M);
+			if (gflush)
+				__tlbie_va_range(gstart, gend, pid,
+						PUD_SIZE, MMU_PAGE_1G);
 			fixup_tlbie();
 			asm volatile("eieio; tlbsync; ptesync": : :"memory");
 		}
 	}
 	preempt_enable();
 }
+
+void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end)
+
+{
+#ifdef CONFIG_HUGETLB_PAGE
+	if (is_vm_hugetlb_page(vma))
+		return radix__flush_hugetlb_tlb_range(vma, start, end);
+#endif
+
+	__radix__flush_tlb_range(vma->vm_mm, start, end, false);
+}
 EXPORT_SYMBOL(radix__flush_tlb_range);
 
 static int radix_get_mmu_psize(int page_size)
@@ -837,6 +859,8 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 	int psize = 0;
 	struct mm_struct *mm = tlb->mm;
 	int page_size = tlb->page_size;
+	unsigned long start = tlb->start;
+	unsigned long end = tlb->end;
 
 	/*
 	 * if page size is not something we understand, do a full mm flush
|
|||||||
*/
|
*/
|
||||||
if (tlb->fullmm) {
|
if (tlb->fullmm) {
|
||||||
__flush_all_mm(mm, true);
|
__flush_all_mm(mm, true);
|
||||||
|
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
|
||||||
|
} else if (mm_tlb_flush_nested(mm)) {
|
||||||
|
/*
|
||||||
|
* If there is a concurrent invalidation that is clearing ptes,
|
||||||
|
* then it's possible this invalidation will miss one of those
|
||||||
|
* cleared ptes and miss flushing the TLB. If this invalidate
|
||||||
|
* returns before the other one flushes TLBs, that can result
|
||||||
|
* in it returning while there are still valid TLBs inside the
|
||||||
|
* range to be invalidated.
|
||||||
|
*
|
||||||
|
* See mm/memory.c:tlb_finish_mmu() for more details.
|
||||||
|
*
|
||||||
|
* The solution to this is ensure the entire range is always
|
||||||
|
* flushed here. The problem for powerpc is that the flushes
|
||||||
|
* are page size specific, so this "forced flush" would not
|
||||||
|
* do the right thing if there are a mix of page sizes in
|
||||||
|
* the range to be invalidated. So use __flush_tlb_range
|
||||||
|
* which invalidates all possible page sizes in the range.
|
||||||
|
*
|
||||||
|
* PWC flush probably is not be required because the core code
|
||||||
|
* shouldn't free page tables in this path, but accounting
|
||||||
|
* for the possibility makes us a bit more robust.
|
||||||
|
*
|
||||||
|
* need_flush_all is an uncommon case because page table
|
||||||
|
* teardown should be done with exclusive locks held (but
|
||||||
|
* after locks are dropped another invalidate could come
|
||||||
|
* in), it could be optimized further if necessary.
|
||||||
|
*/
|
||||||
|
if (!tlb->need_flush_all)
|
||||||
|
__radix__flush_tlb_range(mm, start, end, true);
|
||||||
|
else
|
||||||
|
radix__flush_all_mm(mm);
|
||||||
|
#endif
|
||||||
} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
|
} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
|
||||||
if (!tlb->need_flush_all)
|
if (!tlb->need_flush_all)
|
||||||
radix__flush_tlb_mm(mm);
|
radix__flush_tlb_mm(mm);
|
||||||
else
|
else
|
||||||
radix__flush_all_mm(mm);
|
radix__flush_all_mm(mm);
|
||||||
} else {
|
} else {
|
||||||
unsigned long start = tlb->start;
|
|
||||||
unsigned long end = tlb->end;
|
|
||||||
|
|
||||||
if (!tlb->need_flush_all)
|
if (!tlb->need_flush_all)
|
||||||
radix__flush_tlb_range_psize(mm, start, end, psize);
|
radix__flush_tlb_range_psize(mm, start, end, psize);
|
||||||
else
|
else
|
||||||
|
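Taken together, the patched radix__tlb_flush() picks one of four flush strategies depending on the mmu_gather state. The standalone C sketch below condenses that selection; the struct gather type, its field names, and the main() driver are stand-ins for illustration (the real logic lives in arch/powerpc/mm/tlb-radix.c, and the nested-flush branch is compiled in only when THP or hugetlb support is enabled):

/*
 * Simplified, illustrative sketch of the flush selection performed by
 * radix__tlb_flush() after this patch. Not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

struct gather {			/* stand-in for struct mmu_gather */
	bool fullmm;		/* the whole address space is going away */
	bool need_flush_all;	/* page tables were freed, PWC flush needed */
	bool nested;		/* stand-in for mm_tlb_flush_nested(mm) */
	int psize;		/* -1 if the batched page size is unknown */
};

static void flush(const struct gather *tlb)
{
	if (tlb->fullmm) {
		puts("__flush_all_mm(): drop everything for the mm");
	} else if (tlb->nested) {
		/*
		 * A concurrent invalidation is in flight: flush the whole
		 * requested range at every possible page size, as
		 * __radix__flush_tlb_range() does with flush_all_sizes == true.
		 */
		puts(tlb->need_flush_all ? "radix__flush_all_mm()"
					 : "__radix__flush_tlb_range(..., true)");
	} else if (tlb->psize == -1) {
		puts(tlb->need_flush_all ? "radix__flush_all_mm()"
					 : "radix__flush_tlb_mm()");
	} else {
		puts("radix__flush_tlb_range_psize() for the batched size");
	}
}

int main(void)
{
	struct gather concurrent = { .nested = true, .psize = 0 };

	flush(&concurrent);	/* prints the new all-page-sizes path */
	return 0;
}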