2021-06-30 18:47:13 -07:00
// SPDX-License-Identifier: GPL-2.0
/*
2022-06-28 17:22:30 +08:00
* HugeTLB Vmemmap Optimization ( HVO )
2021-06-30 18:47:13 -07:00
*
2022-06-28 17:22:30 +08:00
* Copyright ( c ) 2020 , ByteDance . All rights reserved .
2021-06-30 18:47:13 -07:00
*
* Author : Muchun Song < songmuchun @ bytedance . com >
*
2022-06-27 09:00:26 +03:00
* See Documentation / mm / vmemmap_dedup . rst
2021-06-30 18:47:13 -07:00
*/
2021-06-30 18:47:25 -07:00
# define pr_fmt(fmt) "HugeTLB: " fmt
2022-06-28 17:22:31 +08:00
# include <linux/pgtable.h>
# include <linux/bootmem_info.h>
# include <asm/pgalloc.h>
# include <asm/tlbflush.h>
2021-06-30 18:47:13 -07:00
# include "hugetlb_vmemmap.h"
2022-06-28 17:22:31 +08:00
/**
 * struct vmemmap_remap_walk - state carried through a vmemmap page table walk
 *
 * @remap_pte:		callback invoked by vmemmap_pte_range() for every PTE
 *			of the walked range except the first (reuse) one.
 * @nr_walked:		number of PTEs visited so far, the reuse PTE included;
 *			vmemmap_remap_free() uses it to bound its rollback walk.
 * @reuse_page:		the page the remaining PTEs are remapped to. When it
 *			is NULL, vmemmap_pte_range() takes it from the first
 *			PTE of the walk.
 * @reuse_addr:		the virtual address of @reuse_page; the restore
 *			callback copies page contents from this address.
 * @vmemmap_pages:	list head of vmemmap pages: the remap callback appends
 *			the pages it unmaps to it, the restore callback takes
 *			the pages it maps back in from its front.
 */
struct vmemmap_remap_walk {
void ( * remap_pte ) ( pte_t * pte , unsigned long addr ,
struct vmemmap_remap_walk * walk ) ;
unsigned long nr_walked ;
struct page * reuse_page ;
unsigned long reuse_addr ;
struct list_head * vmemmap_pages ;
} ;
/*
 * Replace the huge (leaf) PMD mapping covering [@start, @start + PMD_SIZE)
 * with a newly allocated PTE table that maps the same pages one by one.
 *
 * The PTE table is filled in through a temporary on-stack PMD and is only
 * installed into @pmd under init_mm.page_table_lock, and only if @pmd is
 * still a leaf at that point. Otherwise the unused table is freed again.
 *
 * Return: %0 on success (including the case where @pmd was no longer a leaf),
 * -ENOMEM if the PTE table could not be allocated.
 */
static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
	pmd_t scratch_pmd;
	int idx;
	unsigned long va;
	struct page *first_page = pmd_page(*pmd);
	pte_t *pte_table = pte_alloc_one_kernel(&init_mm);

	if (!pte_table)
		return -ENOMEM;

	/* Populate the new table behind a private PMD before publishing it. */
	pmd_populate_kernel(&init_mm, &scratch_pmd, pte_table);

	for (idx = 0, va = start; idx < PTRS_PER_PTE; idx++, va += PAGE_SIZE) {
		pte_t pteval = mk_pte(first_page + idx, PAGE_KERNEL);
		pte_t *ptep = pte_offset_kernel(&scratch_pmd, va);

		set_pte_at(&init_mm, va, ptep, pteval);
	}

	spin_lock(&init_mm.page_table_lock);
	if (unlikely(!pmd_leaf(*pmd))) {
		/* @pmd is not a leaf any more: our table is not needed. */
		pte_free_kernel(&init_mm, pte_table);
	} else {
		/*
		 * Higher order allocations from buddy allocator must be able
		 * to be treated as independent small pages (as they can be
		 * freed individually).
		 */
		if (!PageReserved(first_page))
			split_page(first_page, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pte_table);
		flush_tlb_kernel_range(start, start + PMD_SIZE);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}
/*
 * Split @pmd into a PTE table if it currently is a huge (leaf) mapping.
 *
 * The leaf state is sampled under init_mm.page_table_lock; the actual split
 * is done by __split_vmemmap_huge_pmd(), which re-checks it under the lock.
 *
 * Return: %0 if @pmd was not a leaf, otherwise the result of
 * __split_vmemmap_huge_pmd().
 */
static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
	int is_leaf;

	spin_lock(&init_mm.page_table_lock);
	is_leaf = pmd_leaf(*pmd);
	spin_unlock(&init_mm.page_table_lock);

	return is_leaf ? __split_vmemmap_huge_pmd(pmd, start) : 0;
}
/*
 * Walk the PTEs mapping [@addr, @end) below @pmd and invoke @walk->remap_pte
 * on each of them, counting every visited PTE in @walk->nr_walked.
 *
 * If @walk->reuse_page has not been set yet, the page mapped by the first PTE
 * becomes the reuse page; that PTE is counted but not passed to the callback.
 */
static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
			      unsigned long end,
			      struct vmemmap_remap_walk *walk)
{
	pte_t *ptep = pte_offset_kernel(pmd, addr);

	if (!walk->reuse_page) {
		/*
		 * The reuse page is looked up first, before any remapping
		 * (i.e. any call to @walk->remap_pte) takes place.
		 */
		walk->reuse_page = pte_page(*ptep);

		/*
		 * The reuse address is part of the walked range, so step
		 * over it instead of remapping it.
		 */
		walk->nr_walked++;
		ptep++;
		addr += PAGE_SIZE;
	}

	while (addr != end) {
		walk->remap_pte(ptep, addr, walk);
		walk->nr_walked++;

		ptep++;
		addr += PAGE_SIZE;
	}
}
/*
 * Walk the PMD entries covering [@addr, @end) below @pud. Each PMD is first
 * split into a PTE table if needed (the split is given the PMD-aligned
 * address), then its PTEs are handed to vmemmap_pte_range().
 *
 * Return: %0 on success, or the error returned by split_vmemmap_huge_pmd().
 */
static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t *pmdp = pmd_offset(pud, addr);

	for (;;) {
		unsigned long next;
		int err = split_vmemmap_huge_pmd(pmdp, addr & PMD_MASK);

		if (err)
			return err;

		next = pmd_addr_end(addr, end);
		vmemmap_pte_range(pmdp, addr, next, walk);

		if (next == end)
			break;

		pmdp++;
		addr = next;
	}

	return 0;
}
/*
 * Walk the PUD entries covering [@addr, @end) below @p4d and descend into
 * vmemmap_pmd_range() for each of them.
 *
 * Return: %0 on success, or the first error reported by the lower level.
 */
static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	pud_t *pudp = pud_offset(p4d, addr);

	for (;;) {
		unsigned long next = pud_addr_end(addr, end);
		int err = vmemmap_pmd_range(pudp, addr, next, walk);

		if (err)
			return err;

		if (next == end)
			break;

		pudp++;
		addr = next;
	}

	return 0;
}
/*
 * Walk the P4D entries covering [@addr, @end) below @pgd and descend into
 * vmemmap_pud_range() for each of them.
 *
 * Return: %0 on success, or the first error reported by the lower level.
 */
static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	p4d_t *p4dp = p4d_offset(pgd, addr);

	for (;;) {
		unsigned long next = p4d_addr_end(addr, end);
		int err = vmemmap_pud_range(p4dp, addr, next, walk);

		if (err)
			return err;

		if (next == end)
			break;

		p4dp++;
		addr = next;
	}

	return 0;
}
/*
 * Walk the kernel page tables for the page-aligned range [@start, @end) and
 * apply @walk to it, then flush the TLB for the part that was remapped.
 *
 * Return: %0 on success, or the first error reported while walking. No TLB
 * flush is issued here when the walk fails.
 */
static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	unsigned long addr = start;
	pgd_t *pgdp;

	VM_BUG_ON(!PAGE_ALIGNED(start));
	VM_BUG_ON(!PAGE_ALIGNED(end));

	pgdp = pgd_offset_k(addr);
	for (;;) {
		unsigned long next = pgd_addr_end(addr, end);
		int err = vmemmap_p4d_range(pgdp, addr, next, walk);

		if (err)
			return err;

		if (next == end)
			break;

		pgdp++;
		addr = next;
	}

	/*
	 * Only the mapping of [@start + PAGE_SIZE, @end) is changed by the
	 * walk, so that is the only range whose TLB entries need flushing.
	 */
	flush_tlb_kernel_range(start + PAGE_SIZE, end);

	return 0;
}
/*
 * Release a single vmemmap page.
 *
 * A vmemmap page comes either from the memblock allocator or from the buddy
 * allocator. PG_reserved marks the memblock case, which has to go through
 * free_bootmem_page(); everything else is returned with __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (!PageReserved(page)) {
		__free_page(page);
		return;
	}

	free_bootmem_page(page);
}
/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list ( struct list_head * list )
{
struct page * page , * next ;
list_for_each_entry_safe ( page , next , list , lru ) {
list_del ( & page - > lru ) ;
free_vmemmap_page ( page ) ;
}
}
/*
 * remap_pte callback used when optimizing: queue the page currently mapped
 * by @pte on @walk->vmemmap_pages and point @pte at @walk->reuse_page instead.
 */
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	struct page *old_page = pte_page(*pte);
	/*
	 * The tail pages are remapped read-only so that an illegal write to
	 * them is caught.
	 */
	pte_t shared = mk_pte(walk->reuse_page, PAGE_KERNEL_RO);

	list_add_tail(&old_page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, shared);
}
/*
* How many struct page structs need to be reset . When we reuse the head
* struct page , the special metadata ( e . g . page - > flags or page - > mapping )
* cannot copy to the tail struct page structs . The invalid value will be
* checked in the free_tail_pages_check ( ) . In order to avoid the message
* of " corrupted mapping in tail page " . We need to reset at least 3 ( one
* head struct page struct and two tail struct page structs ) struct page
* structs .
*/
# define NR_RESET_STRUCT_PAGE 3
static inline void reset_struct_pages ( struct page * start )
{
int i ;
struct page * from = start + NR_RESET_STRUCT_PAGE ;
for ( i = 0 ; i < NR_RESET_STRUCT_PAGE ; i + + )
memcpy ( start + i , from , sizeof ( * from ) ) ;
}
/*
 * remap_pte callback used when restoring: take the next page from
 * @walk->vmemmap_pages, fill it with a copy of the reuse page, reset its
 * leading struct pages and map it read-write at @addr.
 *
 * @pte must currently map @walk->reuse_page; anything else is a BUG.
 */
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	struct page *fresh;
	void *dst;

	BUG_ON(pte_page(*pte) != walk->reuse_page);

	fresh = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&fresh->lru);

	dst = page_to_virt(fresh);
	copy_page(dst, (void *)walk->reuse_addr);
	reset_struct_pages(dst);

	set_pte_at(&init_mm, addr, pte, mk_pte(fresh, PAGE_KERNEL));
}
/**
* vmemmap_remap_free - remap the vmemmap virtual address range [ @ start , @ end )
* to the page which @ reuse is mapped to , then free vmemmap
* which the range are mapped to .
* @ start : start address of the vmemmap virtual address range that we want
* to remap .
* @ end : end address of the vmemmap virtual address range that we want to
* remap .
* @ reuse : reuse address .
*
* Return : % 0 on success , negative error code otherwise .
*/
static int vmemmap_remap_free ( unsigned long start , unsigned long end ,
unsigned long reuse )
{
int ret ;
LIST_HEAD ( vmemmap_pages ) ;
struct vmemmap_remap_walk walk = {
. remap_pte = vmemmap_remap_pte ,
. reuse_addr = reuse ,
. vmemmap_pages = & vmemmap_pages ,
} ;
/*
* In order to make remapping routine most efficient for the huge pages ,
* the routine of vmemmap page table walking has the following rules
* ( see more details from the vmemmap_pte_range ( ) ) :
*
* - The range [ @ start , @ end ) and the range [ @ reuse , @ reuse + PAGE_SIZE )
* should be continuous .
* - The @ reuse address is part of the range [ @ reuse , @ end ) that we are
* walking which is passed to vmemmap_remap_range ( ) .
* - The @ reuse address is the first in the complete range .
*
* So we need to make sure that @ start and @ reuse meet the above rules .
*/
BUG_ON ( start - reuse ! = PAGE_SIZE ) ;
mmap_read_lock ( & init_mm ) ;
ret = vmemmap_remap_range ( reuse , end , & walk ) ;
if ( ret & & walk . nr_walked ) {
end = reuse + walk . nr_walked * PAGE_SIZE ;
/*
* vmemmap_pages contains pages from the previous
* vmemmap_remap_range call which failed . These
* are pages which were removed from the vmemmap .
* They will be restored in the following call .
*/
walk = ( struct vmemmap_remap_walk ) {
. remap_pte = vmemmap_restore_pte ,
. reuse_addr = reuse ,
. vmemmap_pages = & vmemmap_pages ,
} ;
vmemmap_remap_range ( reuse , end , & walk ) ;
}
mmap_read_unlock ( & init_mm ) ;
free_vmemmap_page_list ( & vmemmap_pages ) ;
return ret ;
}
/*
 * Allocate one order-0 page for every page of the vmemmap virtual address
 * range [@start, @end) and append them to @list.
 *
 * The pages are requested from the node that page_to_nid() reports for
 * @start (interpreted as a struct page pointer) using @gfp_mask.
 *
 * Return: %0 on success, -ENOMEM if an allocation fails. On failure every
 * page on @list is unlinked and freed, so @list is empty on return; callers
 * are therefore expected to pass in an empty list.
 */
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   gfp_t gfp_mask, struct list_head *list)
{
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;

	while (nr_pages--) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add_tail(&page->lru, list);
	}

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru) {
		/* Unlink first so @list never refers to a freed page. */
		list_del(&page->lru);
		__free_pages(page, 0);
	}
	return -ENOMEM;
}
/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *			 to newly allocated pages, one page per PTE.
 * @start:	start address of the vmemmap virtual address range to remap.
 * @end:	end address of the vmemmap virtual address range to remap.
 * @reuse:	reuse address; must be exactly one page below @start.
 * @gfp_mask:	GFP flag for allocating vmemmap pages.
 *
 * All pages are allocated up front, before the page tables are touched. If
 * the page table walk itself fails, its error is returned and the allocated
 * pages that were not mapped in are freed again.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, gfp_t gfp_mask)
{
	int ret;
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
	};

	/* See the comment in the vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
		return -ENOMEM;

	mmap_read_lock(&init_mm);
	ret = vmemmap_remap_range(reuse, end, &walk);
	mmap_read_unlock(&init_mm);

	/*
	 * A failed walk must not be reported as success: the caller would
	 * treat the vmemmap as fully restored. Pages still on the list were
	 * never mapped in, so give them back.
	 */
	if (ret)
		free_vmemmap_page_list(&vmemmap_pages);

	return ret;
}
2022-06-28 17:22:29 +08:00
/*
 * Static key, initially false. hugetlb_vmemmap_optimize() increments it
 * before attempting an optimization and decrements it again if the attempt
 * fails; hugetlb_vmemmap_restore() decrements it after a successful restore.
 */
DEFINE_STATIC_KEY_FALSE ( hugetlb_optimize_vmemmap_key ) ;
2022-04-28 23:16:15 -07:00
EXPORT_SYMBOL ( hugetlb_optimize_vmemmap_key ) ;
2021-06-30 18:47:25 -07:00
2022-06-28 17:22:32 +08:00
/*
 * Whether new optimizations are attempted at all; read with READ_ONCE() in
 * vmemmap_should_optimize(). The default comes from
 * CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON. The flag is exposed as the
 * "hugetlb_free_vmemmap" core parameter and is also the backing variable of
 * the sysctl table defined in this file.
 */
static bool vmemmap_optimize_enabled = IS_ENABLED ( CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON ) ;
core_param ( hugetlb_free_vmemmap , vmemmap_optimize_enabled , bool , 0 ) ;
2021-06-30 18:47:13 -07:00
2022-06-28 17:22:33 +08:00
/**
* hugetlb_vmemmap_restore - restore previously optimized ( by
* hugetlb_vmemmap_optimize ( ) ) vmemmap pages which
* will be reallocated and remapped .
* @ h : struct hstate .
* @ head : the head page whose vmemmap pages will be restored .
*
* Return : % 0 if @ head ' s vmemmap pages have been reallocated and remapped ,
* negative error code otherwise .
2021-06-30 18:47:21 -07:00
*/
2022-06-28 17:22:33 +08:00
int hugetlb_vmemmap_restore ( const struct hstate * h , struct page * head )
2021-06-30 18:47:21 -07:00
{
int ret ;
2022-06-28 17:22:33 +08:00
unsigned long vmemmap_start = ( unsigned long ) head , vmemmap_end ;
unsigned long vmemmap_reuse ;
2021-06-30 18:47:21 -07:00
if ( ! HPageVmemmapOptimized ( head ) )
return 0 ;
2022-06-28 17:22:33 +08:00
vmemmap_end = vmemmap_start + hugetlb_vmemmap_size ( h ) ;
vmemmap_reuse = vmemmap_start ;
vmemmap_start + = HUGETLB_VMEMMAP_RESERVE_SIZE ;
2022-04-28 23:16:14 -07:00
2021-06-30 18:47:21 -07:00
/*
2022-06-28 17:22:33 +08:00
* The pages which the vmemmap virtual address range [ @ vmemmap_start ,
2021-06-30 18:47:21 -07:00
* @ vmemmap_end ) are mapped to are freed to the buddy allocator , and
* the range is mapped to the page which @ vmemmap_reuse is mapped to .
* When a HugeTLB page is freed to the buddy allocator , previously
* discarded vmemmap pages must be allocated and remapping .
*/
2022-06-28 17:22:33 +08:00
ret = vmemmap_remap_alloc ( vmemmap_start , vmemmap_end , vmemmap_reuse ,
2021-06-30 18:47:21 -07:00
GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE ) ;
2022-05-13 16:48:56 -07:00
if ( ! ret ) {
2021-06-30 18:47:21 -07:00
ClearHPageVmemmapOptimized ( head ) ;
2022-05-13 16:48:56 -07:00
static_branch_dec ( & hugetlb_optimize_vmemmap_key ) ;
}
2021-06-30 18:47:21 -07:00
return ret ;
}
2022-06-28 17:22:33 +08:00
/* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
static bool vmemmap_should_optimize ( const struct hstate * h , const struct page * head )
2022-06-17 21:56:50 +08:00
{
2022-06-28 17:22:29 +08:00
if ( ! READ_ONCE ( vmemmap_optimize_enabled ) )
2022-06-28 17:22:33 +08:00
return false ;
if ( ! hugetlb_vmemmap_optimizable ( h ) )
return false ;
2022-06-17 21:56:50 +08:00
if ( IS_ENABLED ( CONFIG_MEMORY_HOTPLUG ) ) {
pmd_t * pmdp , pmd ;
struct page * vmemmap_page ;
unsigned long vaddr = ( unsigned long ) head ;
/*
* Only the vmemmap page ' s vmemmap page can be self - hosted .
* Walking the page tables to find the backing page of the
* vmemmap page .
*/
pmdp = pmd_off_k ( vaddr ) ;
/*
* The READ_ONCE ( ) is used to stabilize * pmdp in a register or
* on the stack so that it will stop changing under the code .
* The only concurrent operation where it can be changed is
* split_vmemmap_huge_pmd ( ) ( * pmdp will be stable after this
* operation ) .
*/
pmd = READ_ONCE ( * pmdp ) ;
if ( pmd_leaf ( pmd ) )
vmemmap_page = pmd_page ( pmd ) + pte_index ( vaddr ) ;
else
vmemmap_page = pte_page ( * pte_offset_kernel ( pmdp , vaddr ) ) ;
/*
* Due to HugeTLB alignment requirements and the vmemmap pages
* being at the start of the hotplugged memory region in
* memory_hotplug . memmap_on_memory case . Checking any vmemmap
* page ' s vmemmap page if it is marked as VmemmapSelfHosted is
* sufficient .
*
* [ hotplugged memory ]
* [ section ] [ . . . ] [ section ]
* [ vmemmap ] [ usable memory ]
* ^ | | |
* + - - - + | |
* ^ | |
* + - - - - - - - + |
* ^ |
* + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +
*/
if ( PageVmemmapSelfHosted ( vmemmap_page ) )
2022-06-28 17:22:33 +08:00
return false ;
2022-06-17 21:56:50 +08:00
}
2022-06-28 17:22:33 +08:00
return true ;
2022-06-17 21:56:50 +08:00
}
2022-06-28 17:22:33 +08:00
/**
* hugetlb_vmemmap_optimize - optimize @ head page ' s vmemmap pages .
* @ h : struct hstate .
* @ head : the head page whose vmemmap pages will be optimized .
*
* This function only tries to optimize @ head ' s vmemmap pages and does not
* guarantee that the optimization will succeed after it returns . The caller
* can use HPageVmemmapOptimized ( @ head ) to detect if @ head ' s vmemmap pages
* have been optimized .
*/
void hugetlb_vmemmap_optimize ( const struct hstate * h , struct page * head )
2021-06-30 18:47:13 -07:00
{
2022-06-28 17:22:33 +08:00
unsigned long vmemmap_start = ( unsigned long ) head , vmemmap_end ;
unsigned long vmemmap_reuse ;
2021-06-30 18:47:13 -07:00
2022-06-28 17:22:33 +08:00
if ( ! vmemmap_should_optimize ( h , head ) )
2021-06-30 18:47:13 -07:00
return ;
2022-05-13 16:48:56 -07:00
static_branch_inc ( & hugetlb_optimize_vmemmap_key ) ;
2022-06-28 17:22:33 +08:00
vmemmap_end = vmemmap_start + hugetlb_vmemmap_size ( h ) ;
vmemmap_reuse = vmemmap_start ;
vmemmap_start + = HUGETLB_VMEMMAP_RESERVE_SIZE ;
2021-06-30 18:47:13 -07:00
/*
2022-06-28 17:22:33 +08:00
* Remap the vmemmap virtual address range [ @ vmemmap_start , @ vmemmap_end )
2021-06-30 18:47:13 -07:00
* to the page which @ vmemmap_reuse is mapped to , then free the pages
2022-06-28 17:22:33 +08:00
* which the range [ @ vmemmap_start , @ vmemmap_end ] is mapped to .
2021-06-30 18:47:13 -07:00
*/
2022-06-28 17:22:33 +08:00
if ( vmemmap_remap_free ( vmemmap_start , vmemmap_end , vmemmap_reuse ) )
2022-05-13 16:48:56 -07:00
static_branch_dec ( & hugetlb_optimize_vmemmap_key ) ;
else
2021-06-30 18:48:22 -07:00
SetHPageVmemmapOptimized ( head ) ;
2021-06-30 18:47:13 -07:00
}
2021-06-30 18:47:33 -07:00
2022-05-13 16:48:56 -07:00
/*
 * Sysctl table with a single entry (plus the empty terminator) that exposes
 * vmemmap_optimize_enabled through proc_dobool with mode 0644. It is
 * registered from hugetlb_vmemmap_init().
 */
static struct ctl_table hugetlb_vmemmap_sysctls [ ] = {
{
. procname = " hugetlb_optimize_vmemmap " ,
2022-06-28 17:22:29 +08:00
. data = & vmemmap_optimize_enabled ,
/*
 * NOTE(review): the backing variable is a bool while maxlen is sizeof(int);
 * confirm this is what proc_dobool expects in the targeted kernel version
 * before changing either side.
 */
. maxlen = sizeof ( int ) ,
2022-05-13 16:48:56 -07:00
. mode = 0644 ,
2022-06-28 17:22:29 +08:00
. proc_handler = proc_dobool ,
2022-05-13 16:48:56 -07:00
} ,
{ }
} ;
2022-06-28 17:22:33 +08:00
/*
 * Late initcall: check at build time that the reserved vmemmap size covers
 * the struct pages in use and, when CONFIG_PROC_SYSCTL is enabled, register
 * hugetlb_vmemmap_sysctls if at least one hstate is vmemmap-optimizable.
 *
 * Return: always %0.
 */
static int __init hugetlb_vmemmap_init ( void )
2022-05-13 16:48:56 -07:00
{
2022-06-28 17:22:33 +08:00
/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
BUILD_BUG_ON ( __NR_USED_SUBPAGE * sizeof ( struct page ) > HUGETLB_VMEMMAP_RESERVE_SIZE ) ;
if ( IS_ENABLED ( CONFIG_PROC_SYSCTL ) ) {
const struct hstate * h ;
/* One optimizable hstate is enough; register once and stop looking. */
for_each_hstate ( h ) {
if ( hugetlb_vmemmap_optimizable ( h ) ) {
register_sysctl_init ( " vm " , hugetlb_vmemmap_sysctls ) ;
break ;
}
}
}
2022-05-13 16:48:56 -07:00
return 0 ;
}
2022-06-28 17:22:33 +08:00
late_initcall ( hugetlb_vmemmap_init ) ;