/*
 * linux/mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009		SUSE Linux Products GmbH
 * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 *
 * This is the percpu allocator which can handle both static and
 * dynamic areas.  Percpu areas are allocated in chunks in vmalloc
 * area.  Each chunk consists of num_possible_cpus() units and the
 * first chunk is used for static percpu variables in the kernel image
 * (special boot time alloc/init handling necessary as these areas
 * need to be brought up before allocation services are running).
 * Units grow as necessary and all units grow or shrink in unison.
 * When a chunk is filled up, another chunk is allocated, i.e. in the
 * vmalloc area
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done in offset-size areas of single unit space.  I.e.,
 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
 * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
 * percpu base registers pcpu_unit_size apart.
 *
 * There are usually many small percpu allocations, many of them as
 * small as 4 bytes.  The allocator organizes chunks into lists
 * according to free size and tries to allocate from the fullest one.
 * Each chunk keeps the maximum contiguous area size hint which is
 * guaranteed to be equal to or larger than the maximum contiguous
 * area in the chunk.  This helps the allocator not to iterate the
 * chunk maps unnecessarily.
 *
 * Allocation state in each chunk is kept using an array of integers
 * on chunk->map.  A positive value in the map represents a free
 * region and negative allocated.  Allocation inside a chunk is done
 * by scanning this map sequentially and serving the first matching
 * entry.  This is mostly copied from the percpu_modalloc() allocator.
 * Chunks can be determined from the address using the index field
 * in the page struct.  The index field contains a pointer to the chunk.
 *
 * To use this allocator, arch code should do the following.
 *
 * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */
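/*
 * Illustrative map example (made-up numbers, not from the original
 * source): with a 64KB unit, a chunk->map of { -2048, 512, -128, 62848 }
 * describes, in order, a 2048 byte allocated area, a 512 byte free
 * area, a 128 byte allocated area and 62848 free bytes at the end.
 * For such a chunk, free_size would be 512 + 62848 = 63360 and
 * contig_hint at least 62848.
 */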
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>

#define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */

/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr	\
		 + (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr	\
		 - (unsigned long)__per_cpu_start)
#endif
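/*
 * A hedged illustration of the default translation (the caller-side
 * helpers named below live in the percpu headers, not in this file):
 * an area at offset @off of a chunk mapped at vmalloc address @addr
 * is handed out as
 *
 *	ptr = __addr_to_pcpu_ptr(addr + off);
 *
 * and cpu N's copy is typically reached via per_cpu_ptr(ptr, N),
 * which applies that cpu's percpu offset (see setup_per_cpu_areas()
 * at the bottom of this file for the generic case).
 */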
struct pcpu_chunk {
	struct list_head	list;		/* linked to pcpu_slot lists */
	int			free_size;	/* free bytes in the chunk */
	int			contig_hint;	/* max contiguous size hint */
	struct vm_struct	*vm;		/* mapped vmalloc region */
	int			map_used;	/* # of map entries used */
	int			map_alloc;	/* # of map entries allocated */
	int			*map;		/* allocation map */
	bool			immutable;	/* no [de]population allowed */
	struct page		**page;		/* points to page array */
	struct page		*page_ar[];	/* #cpus * UNIT_PAGES */
};

static int pcpu_unit_pages __read_mostly;
static int pcpu_unit_size __read_mostly;
static int pcpu_chunk_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
static struct pcpu_chunk *pcpu_first_chunk;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  The amount of
 * reserved offset is in pcpu_reserved_chunk_limit.  When the reserved
 * area doesn't exist, the following variables contain NULL and 0
 * respectively.
 */
static struct pcpu_chunk *pcpu_reserved_chunk;
static int pcpu_reserved_chunk_limit;
/*
 * Synchronization rules.
 *
 * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
 * protects allocation/reclaim paths, chunks and chunk->page arrays.
 * The latter is a spinlock and protects the index data structures -
 * chunk slots, chunks and area maps in chunks.
 *
 * During allocation, pcpu_alloc_mutex is kept locked all the time and
 * pcpu_lock is grabbed and released as necessary.  All actual memory
 * allocations are done using GFP_KERNEL with pcpu_lock released.
 *
 * The free path accesses and alters only the index data structures,
 * so it can be safely called from atomic context.  When memory needs
 * to be returned to the system, the free path schedules reclaim_work
 * which grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to
 * be reclaimed, releases both locks and frees the chunks.  Note that
 * it's necessary to grab both locks to remove a chunk from
 * circulation as the allocation path might be referencing the chunk
 * with only pcpu_alloc_mutex locked.
 */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* protects whole alloc and reclaim */
static DEFINE_SPINLOCK(pcpu_lock);	/* protects index data structures */
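/*
 * A minimal sketch of the resulting locking order as used by the
 * allocation path below (illustrative restatement, not an additional
 * rule):
 *
 *	mutex_lock(&pcpu_alloc_mutex);
 *	spin_lock_irq(&pcpu_lock);
 *	... update chunk slots / area maps ...
 *	spin_unlock_irq(&pcpu_lock);
 *	... GFP_KERNEL allocations, mapping ...
 *	mutex_unlock(&pcpu_alloc_mutex);
 */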
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */

/* reclaim work to release fully free chunks, scheduled from free path */
static void pcpu_reclaim(struct work_struct *work);
static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);

static int __pcpu_size_to_slot(int size)
{
	int highbit = fls(size);	/* size is in bytes */
	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}

static int pcpu_size_to_slot(int size)
{
	if (size == pcpu_unit_size)
		return pcpu_nr_slots - 1;
	return __pcpu_size_to_slot(size);
}
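/*
 * Worked example (illustrative): with PCPU_SLOT_BASE_SHIFT == 5, a
 * chunk with 12 free bytes lands in slot max(fls(12) - 5 + 2, 1) == 1,
 * one with 512 free bytes in slot fls(512) - 5 + 2 == 7, and a fully
 * free chunk (free_size == pcpu_unit_size) always goes to the last
 * slot, pcpu_nr_slots - 1.
 */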
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
	if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
		return 0;

	return pcpu_size_to_slot(chunk->free_size);
}

static int pcpu_page_idx(unsigned int cpu, int page_idx)
{
	return cpu * pcpu_unit_pages + page_idx;
}

static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
				      unsigned int cpu, int page_idx)
{
	return &chunk->page[pcpu_page_idx(cpu, page_idx)];
}

static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
				     unsigned int cpu, int page_idx)
{
	return (unsigned long)chunk->vm->addr +
		(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
}
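/*
 * Worked example (illustrative numbers): with 4KB pages and
 * pcpu_unit_pages == 16, page 3 of cpu 2's unit lives at
 * vm->addr + ((2 * 16 + 3) << PAGE_SHIFT) == vm->addr + 143360,
 * i.e. the units are laid out back to back, pcpu_unit_size apart.
 */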
static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
				     int page_idx)
{
	return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
}

/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	page->index = (unsigned long)pcpu;
}

/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	return (struct pcpu_chunk *)page->index;
}
/**
 * pcpu_mem_alloc - allocate memory
 * @size: bytes to allocate
 *
 * Allocate @size bytes.  If @size is no larger than PAGE_SIZE,
 * kzalloc() is used; otherwise, vmalloc() is used.  The returned
 * memory is always zeroed.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_alloc(size_t size)
{
	if (size <= PAGE_SIZE)
		return kzalloc(size, GFP_KERNEL);
	else {
		void *ptr = vmalloc(size);
		if (ptr)
			memset(ptr, 0, size);
		return ptr;
	}
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 * @size: size of the area
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_alloc().
 */
static void pcpu_mem_free(void *ptr, size_t size)
{
	if (size <= PAGE_SIZE)
		kfree(ptr);
	else
		vfree(ptr);
}
/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
	int nslot = pcpu_chunk_slot(chunk);

	if (chunk != pcpu_reserved_chunk && oslot != nslot) {
		if (oslot < nslot)
			list_move(&chunk->list, &pcpu_slot[nslot]);
		else
			list_move_tail(&chunk->list, &pcpu_slot[nslot]);
	}
}
/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
	void *first_start = pcpu_first_chunk->vm->addr;

	/* is it in the first chunk? */
	if (addr >= first_start && addr < first_start + pcpu_unit_size) {
		/* is it in the reserved area? */
		if (addr < first_start + pcpu_reserved_chunk_limit)
			return pcpu_reserved_chunk;
		return pcpu_first_chunk;
	}

	return pcpu_get_page_chunk(vmalloc_to_page(addr));
}
/**
 * pcpu_extend_area_map - extend area map for allocation
 * @chunk: target chunk
 *
 * Extend the area map of @chunk so that it can accommodate an
 * allocation.  A single allocation can split an area into three
 * areas, so this function makes sure that @chunk->map has at least
 * two extra slots.
 *
 * CONTEXT:
 * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
 * if the area map is extended.
 *
 * RETURNS:
 * 0 if noop, 1 if successfully extended, -errno on failure.
 */
static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
{
	int new_alloc;
	int *new;
	size_t size;

	/* has enough? */
	if (chunk->map_alloc >= chunk->map_used + 2)
		return 0;

	spin_unlock_irq(&pcpu_lock);

	new_alloc = PCPU_DFL_MAP_ALLOC;
	while (new_alloc < chunk->map_used + 2)
		new_alloc *= 2;

	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
	if (!new) {
		spin_lock_irq(&pcpu_lock);
		return -ENOMEM;
	}

	/*
	 * Acquire pcpu_lock and switch to the new area map.  Only a
	 * free could have happened in between, so map_used couldn't
	 * have grown.
	 */
	spin_lock_irq(&pcpu_lock);
	BUG_ON(new_alloc < chunk->map_used + 2);

	size = chunk->map_alloc * sizeof(chunk->map[0]);
	memcpy(new, chunk->map, size);

	/*
	 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
	 * one of the first chunks and still using static map.
	 */
	if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
		pcpu_mem_free(chunk->map, size);

	chunk->map_alloc = new_alloc;
	chunk->map = new;
	return 0;
}
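/*
 * Illustrative growth example: a chunk whose map already holds 31
 * entries (map_alloc == 32) needs room for 33, so new_alloc doubles
 * from PCPU_DFL_MAP_ALLOC (16) up to 64 before the copy above is
 * made.
 */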
/**
 * pcpu_split_block - split a map block
 * @chunk: chunk of interest
 * @i: index of map block to split
 * @head: head size in bytes (can be 0)
 * @tail: tail size in bytes (can be 0)
 *
 * Split the @i'th map block into two or three blocks.  If @head is
 * non-zero, a @head bytes block is inserted before block @i moving it
 * to @i+1 and reducing its size by @head bytes.
 *
 * If @tail is non-zero, the target block, which can be @i or @i+1
 * depending on @head, is reduced by @tail bytes and a @tail byte
 * block is inserted after the target block.
 *
 * @chunk->map must have enough free slots to accommodate the split.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
			     int head, int tail)
{
	int nr_extra = !!head + !!tail;

	BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);

	/* insert new subblocks */
	memmove(&chunk->map[i + nr_extra], &chunk->map[i],
		sizeof(chunk->map[0]) * (chunk->map_used - i));
	chunk->map_used += nr_extra;

	if (head) {
		chunk->map[i + 1] = chunk->map[i] - head;
		chunk->map[i++] = head;
	}
	if (tail) {
		chunk->map[i++] -= tail;
		chunk->map[i] = tail;
	}
}
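/*
 * Worked split example (illustrative): splitting a free 1024 byte
 * block with head == 64 and tail == 128 turns the single entry
 * { 1024 } into { 64, 832, 128 }; the middle 832 bytes are what the
 * caller then marks allocated.
 */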
/**
 * pcpu_alloc_area - allocate area from a pcpu_chunk
 * @chunk: chunk of interest
 * @size: wanted size in bytes
 * @align: wanted align
 *
 * Try to allocate @size bytes area aligned at @align from @chunk.
 * Note that this function only allocates the offset.  It doesn't
 * populate or map the area.
 *
 * @chunk->map must have at least two free slots.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Allocated offset in @chunk on success, -1 if no matching area is
 * found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
{
	int oslot = pcpu_chunk_slot(chunk);
	int max_contig = 0;
	int i, off;

	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
		bool is_last = i + 1 == chunk->map_used;
		int head, tail;

		/* extra for alignment requirement */
		head = ALIGN(off, align) - off;
		BUG_ON(i == 0 && head != 0);

		if (chunk->map[i] < 0)
			continue;
		if (chunk->map[i] < head + size) {
			max_contig = max(chunk->map[i], max_contig);
			continue;
		}

		/*
		 * If head is small or the previous block is free,
		 * merge 'em.  Note that 'small' is defined as smaller
		 * than sizeof(int), which is very small but isn't too
		 * uncommon for percpu allocations.
		 */
		if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
			if (chunk->map[i - 1] > 0)
				chunk->map[i - 1] += head;
			else {
				chunk->map[i - 1] -= head;
				chunk->free_size -= head;
			}
			chunk->map[i] -= head;
			off += head;
			head = 0;
		}

		/* if tail is small, just keep it around */
		tail = chunk->map[i] - head - size;
		if (tail < sizeof(int))
			tail = 0;

		/* split if warranted */
		if (head || tail) {
			pcpu_split_block(chunk, i, head, tail);
			if (head) {
				i++;
				off += head;
				max_contig = max(chunk->map[i - 1], max_contig);
			}
			if (tail)
				max_contig = max(chunk->map[i + 1], max_contig);
		}

		/* update hint and mark allocated */
		if (is_last)
			chunk->contig_hint = max_contig; /* fully scanned */
		else
			chunk->contig_hint = max(chunk->contig_hint,
						 max_contig);

		chunk->free_size -= chunk->map[i];
		chunk->map[i] = -chunk->map[i];

		pcpu_chunk_relocate(chunk, oslot);
		return off;
	}

	chunk->contig_hint = max_contig;	/* fully scanned */
	pcpu_chunk_relocate(chunk, oslot);

	/* tell the upper layer that this chunk has no matching area */
	return -1;
}
/**
 * pcpu_free_area - free area to a pcpu_chunk
 * @chunk: chunk of interest
 * @freeme: offset of area to free
 *
 * Free the area starting at offset @freeme in @chunk.  Note that this
 * function only modifies the allocation map.  It doesn't depopulate
 * or unmap the area.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
{
	int oslot = pcpu_chunk_slot(chunk);
	int i, off;

	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
		if (off == freeme)
			break;
	BUG_ON(off != freeme);
	BUG_ON(chunk->map[i] > 0);

	chunk->map[i] = -chunk->map[i];
	chunk->free_size += chunk->map[i];

	/* merge with previous? */
	if (i > 0 && chunk->map[i - 1] >= 0) {
		chunk->map[i - 1] += chunk->map[i];
		chunk->map_used--;
		memmove(&chunk->map[i], &chunk->map[i + 1],
			(chunk->map_used - i) * sizeof(chunk->map[0]));
		i--;
	}
	/* merge with next? */
	if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
		chunk->map[i] += chunk->map[i + 1];
		chunk->map_used--;
		memmove(&chunk->map[i + 1], &chunk->map[i + 2],
			(chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
	}

	chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
	pcpu_chunk_relocate(chunk, oslot);
}
/**
 * pcpu_unmap - unmap pages out of a pcpu_chunk
 * @chunk: chunk of interest
 * @page_start: page index of the first page to unmap
 * @page_end: page index of the last page to unmap + 1
 * @flush_tlb: whether to flush tlb or not
 *
 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 * The vcache is flushed before unmapping; the tlb is flushed
 * afterwards only if @flush_tlb is true.
 */
static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
		       bool flush_tlb)
{
	unsigned int last = num_possible_cpus() - 1;
	unsigned int cpu;

	/* unmap must not be done on immutable chunk */
	WARN_ON(chunk->immutable);

	/*
	 * Each flushing trial can be very expensive, issue flush on
	 * the whole region at once rather than doing it for each cpu.
	 * This could be an overkill but is more scalable.
	 */
	flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
			   pcpu_chunk_addr(chunk, last, page_end));

	for_each_possible_cpu(cpu)
		unmap_kernel_range_noflush(
				pcpu_chunk_addr(chunk, cpu, page_start),
				(page_end - page_start) << PAGE_SHIFT);

	/* ditto as flush_cache_vunmap() */
	if (flush_tlb)
		flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
				       pcpu_chunk_addr(chunk, last, page_end));
}
/**
 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 * @chunk: chunk to depopulate
 * @off: offset to the area to depopulate
 * @size: size of the area to depopulate in bytes
 * @flush: whether to flush cache and tlb or not
 *
 * For each cpu, depopulate and unmap the pages covering
 * [@off, @off + @size) from @chunk.  If @flush is true, the tlb is
 * flushed after unmapping.
 *
 * CONTEXT:
 * pcpu_alloc_mutex.
 */
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
				  bool flush)
{
	int page_start = PFN_DOWN(off);
	int page_end = PFN_UP(off + size);
	int unmap_start = -1;
	int uninitialized_var(unmap_end);
	unsigned int cpu;
	int i;

	for (i = page_start; i < page_end; i++) {
		for_each_possible_cpu(cpu) {
			struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);

			if (!*pagep)
				continue;

			__free_page(*pagep);

			/*
			 * If it's partial depopulation, it might get
			 * populated or depopulated again.  Mark the
			 * page gone.
			 */
			*pagep = NULL;

			unmap_start = unmap_start < 0 ? i : unmap_start;
			unmap_end = i + 1;
		}
	}

	if (unmap_start >= 0)
		pcpu_unmap(chunk, unmap_start, unmap_end, flush);
}

static int __pcpu_map_pages(unsigned long addr, struct page **pages,
			    int nr_pages)
{
	return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
					PAGE_KERNEL, pages);
}
/**
 * pcpu_map - map pages into a pcpu_chunk
 * @chunk: chunk of interest
 * @page_start: page index of the first page to map
 * @page_end: page index of the last page to map + 1
 *
 * For each cpu, map pages [@page_start,@page_end) into @chunk.
 * vcache is flushed afterwards.
 */
static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
{
	unsigned int last = num_possible_cpus() - 1;
	unsigned int cpu;
	int err;

	/* map must not be done on immutable chunk */
	WARN_ON(chunk->immutable);

	for_each_possible_cpu(cpu) {
		err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
				       pcpu_chunk_pagep(chunk, cpu, page_start),
				       page_end - page_start);
		if (err < 0)
			return err;
	}

	/* flush at once, please read comments in pcpu_unmap() */
	flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
			 pcpu_chunk_addr(chunk, last, page_end));
	return 0;
}
/**
 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
 * @chunk: chunk of interest
 * @off: offset to the area to populate
 * @size: size of the area to populate in bytes
 *
 * For each cpu, populate and map the pages covering
 * [@off, @off + @size) into @chunk.  The area is cleared on return.
 *
 * CONTEXT:
 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
	const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
	int page_start = PFN_DOWN(off);
	int page_end = PFN_UP(off + size);
	int map_start = -1;
	int uninitialized_var(map_end);
	unsigned int cpu;
	int i;

	for (i = page_start; i < page_end; i++) {
		if (pcpu_chunk_page_occupied(chunk, i)) {
			if (map_start >= 0) {
				if (pcpu_map(chunk, map_start, map_end))
					goto err;
				map_start = -1;
			}
			continue;
		}

		map_start = map_start < 0 ? i : map_start;
		map_end = i + 1;

		for_each_possible_cpu(cpu) {
			struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);

			*pagep = alloc_pages_node(cpu_to_node(cpu),
						  alloc_mask, 0);
			if (!*pagep)
				goto err;
			pcpu_set_page_chunk(*pagep, chunk);
		}
	}

	if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
		goto err;

	for_each_possible_cpu(cpu)
		memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
		       size);

	return 0;
err:
	/* likely under heavy memory pressure, give memory back */
	pcpu_depopulate_chunk(chunk, off, size, true);
	return -ENOMEM;
}
static void free_pcpu_chunk(struct pcpu_chunk *chunk)
{
	if (!chunk)
		return;
	if (chunk->vm)
		free_vm_area(chunk->vm);
	pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
	kfree(chunk);
}

static struct pcpu_chunk *alloc_pcpu_chunk(void)
{
	struct pcpu_chunk *chunk;

	chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
	if (!chunk)
		return NULL;

	chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
	if (!chunk->map) {
		kfree(chunk);
		return NULL;
	}
	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
	chunk->map[chunk->map_used++] = pcpu_unit_size;
	chunk->page = chunk->page_ar;

	chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
	if (!chunk->vm) {
		free_pcpu_chunk(chunk);
		return NULL;
	}

	INIT_LIST_HEAD(&chunk->list);
	chunk->free_size = pcpu_unit_size;
	chunk->contig_hint = pcpu_unit_size;

	return chunk;
}
/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 *
 * Allocate percpu area of @size bytes aligned at @align.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_alloc(size_t size, size_t align, bool reserved)
{
	struct pcpu_chunk *chunk;
	int slot, off;

	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
		WARN(true, "illegal size (%zu) or align (%zu) for "
		     "percpu allocation\n", size, align);
		return NULL;
	}

	mutex_lock(&pcpu_alloc_mutex);
	spin_lock_irq(&pcpu_lock);

	/* serve reserved allocations from the reserved chunk if available */
	if (reserved && pcpu_reserved_chunk) {
		chunk = pcpu_reserved_chunk;
		if (size > chunk->contig_hint ||
		    pcpu_extend_area_map(chunk) < 0)
			goto fail_unlock;
		off = pcpu_alloc_area(chunk, size, align);
		if (off >= 0)
			goto area_found;
		goto fail_unlock;
	}

restart:
	/* search through normal chunks */
	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
			if (size > chunk->contig_hint)
				continue;

			switch (pcpu_extend_area_map(chunk)) {
			case 0:
				break;
			case 1:
				goto restart;	/* pcpu_lock dropped, restart */
			default:
				goto fail_unlock;
			}

			off = pcpu_alloc_area(chunk, size, align);
			if (off >= 0)
				goto area_found;
		}
	}

	/* hmmm... no space left, create a new chunk */
	spin_unlock_irq(&pcpu_lock);

	chunk = alloc_pcpu_chunk();
	if (!chunk)
		goto fail_unlock_mutex;

	spin_lock_irq(&pcpu_lock);
	pcpu_chunk_relocate(chunk, -1);
	goto restart;

area_found:
	spin_unlock_irq(&pcpu_lock);

	/* populate, map and clear the area */
	if (pcpu_populate_chunk(chunk, off, size)) {
		spin_lock_irq(&pcpu_lock);
		pcpu_free_area(chunk, off);
		goto fail_unlock;
	}

	mutex_unlock(&pcpu_alloc_mutex);

	return __addr_to_pcpu_ptr(chunk->vm->addr + off);

fail_unlock:
	spin_unlock_irq(&pcpu_lock);
fail_unlock_mutex:
	mutex_unlock(&pcpu_alloc_mutex);
	return NULL;
}
/**
 * __alloc_percpu - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate percpu area of @size bytes aligned at @align.  Might
 * sleep.  Might trigger writeouts.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void *__alloc_percpu(size_t size, size_t align)
{
	return pcpu_alloc(size, align, false);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);
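/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 *
 *	struct foo_stats { unsigned long hits; };
 *	struct foo_stats *stats;
 *	unsigned int cpu;
 *
 *	stats = __alloc_percpu(sizeof(*stats), __alignof__(*stats));
 *	if (stats) {
 *		for_each_possible_cpu(cpu)
 *			per_cpu_ptr(stats, cpu)->hits = 0;
 *		free_percpu(stats);
 *	}
 */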
/**
 * __alloc_reserved_percpu - allocate reserved percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate percpu area of @size bytes aligned at @align from reserved
 * percpu area if arch has set it up; otherwise, allocation is served
 * from the same dynamic area.  Might sleep.  Might trigger writeouts.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void *__alloc_reserved_percpu(size_t size, size_t align)
{
	return pcpu_alloc(size, align, true);
}
/**
 * pcpu_reclaim - reclaim fully free chunks, workqueue function
 * @work: unused
 *
 * Reclaim all fully free chunks except for the first one.
 *
 * CONTEXT:
 * workqueue context.
 */
static void pcpu_reclaim(struct work_struct *work)
{
	LIST_HEAD(todo);
	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
	struct pcpu_chunk *chunk, *next;

	mutex_lock(&pcpu_alloc_mutex);
	spin_lock_irq(&pcpu_lock);

	list_for_each_entry_safe(chunk, next, head, list) {
		WARN_ON(chunk->immutable);

		/* spare the first one */
		if (chunk == list_first_entry(head, struct pcpu_chunk, list))
			continue;

		list_move(&chunk->list, &todo);
	}

	spin_unlock_irq(&pcpu_lock);
	mutex_unlock(&pcpu_alloc_mutex);

	list_for_each_entry_safe(chunk, next, &todo, list) {
		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
		free_pcpu_chunk(chunk);
	}
}
/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Free percpu area @ptr.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
void free_percpu(void *ptr)
{
	void *addr = __pcpu_ptr_to_addr(ptr);
	struct pcpu_chunk *chunk;
	unsigned long flags;
	int off;

	if (!ptr)
		return;

	spin_lock_irqsave(&pcpu_lock, flags);

	chunk = pcpu_chunk_addr_search(addr);
	off = addr - chunk->vm->addr;

	pcpu_free_area(chunk, off);

	/* if there is more than one fully free chunk, wake up grim reaper */
	if (chunk->free_size == pcpu_unit_size) {
		struct pcpu_chunk *pos;

		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
			if (pos != chunk) {
				schedule_work(&pcpu_reclaim_work);
				break;
			}
	}

	spin_unlock_irqrestore(&pcpu_lock, flags);
}
EXPORT_SYMBOL_GPL(free_percpu);
/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @get_page_fn: callback to fetch page pointer
 * @static_size: the size of static percpu area in bytes
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
 * @base_addr: mapped address, NULL for auto
 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from arch percpu area
 * setup path.  The first two parameters are mandatory.  The rest are
 * optional.
 *
 * @get_page_fn() should return the pointer to the percpu page given
 * the cpu number and page number.  It should at least return enough
 * pages to cover the static area.  The returned pages for the static
 * area should have been initialized with valid data.  If @unit_size
 * is specified, it can also return pages after the static area.  A
 * NULL return indicates end of pages for the cpu.  Note that
 * @get_page_fn() must return the same number of pages for all cpus.
 *
 * @reserved_size, if non-zero, specifies the amount of bytes to
 * reserve after the static area in the first chunk.  This reserves
 * the first chunk such that it's available only through reserved
 * percpu allocation.  This is primarily used to serve module percpu
 * static areas on architectures where the addressing model has
 * limited offset range for symbol relocations to guarantee module
 * percpu symbols fall inside the relocatable range.
 *
 * @dyn_size, if non-negative, determines the number of bytes
 * available for dynamic allocation in the first chunk.  Specifying a
 * non-negative value makes percpu leave alone the area beyond
 * @static_size + @reserved_size + @dyn_size.
 *
 * @unit_size, if non-negative, specifies the unit size and must be
 * aligned to PAGE_SIZE and equal to or larger than @static_size +
 * @reserved_size + @dyn_size (when @dyn_size is non-negative).
 *
 * Non-null @base_addr means that the caller already allocated virtual
 * region for the first chunk and mapped it.  percpu must not mess
 * with the chunk.  Note that @base_addr with 0 @unit_size or non-NULL
 * @populate_pte_fn doesn't make any sense.
 *
 * @populate_pte_fn is used to populate the pagetable.  NULL means the
 * caller already populated the pagetable.
 *
 * If the first chunk ends up with both reserved and dynamic areas, it
 * is served by two chunks - one to serve the core static and reserved
 * areas and the other for the dynamic area.  They share the same vm
 * and page map but use different area allocation maps to stay away
 * from each other.  The latter chunk is circulated in the chunk slots
 * and available for dynamic allocation like any other chunk.
 *
 * RETURNS:
 * The determined pcpu_unit_size which can be used to initialize
 * percpu access.
 */
size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
				     size_t static_size, size_t reserved_size,
				     ssize_t dyn_size, ssize_t unit_size,
				     void *base_addr,
				     pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
	static struct vm_struct first_vm;
	static int smap[2], dmap[2];
	size_t size_sum = static_size + reserved_size +
			  (dyn_size >= 0 ? dyn_size : 0);
	struct pcpu_chunk *schunk, *dchunk = NULL;
	unsigned int cpu;
	int nr_pages;
	int err, i;

	/* sanity checks */
	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
	BUG_ON(!static_size);
	if (unit_size >= 0) {
		BUG_ON(unit_size < size_sum);
		BUG_ON(unit_size & ~PAGE_MASK);
		BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
	} else
		BUG_ON(base_addr);
	BUG_ON(base_addr && populate_pte_fn);

	if (unit_size >= 0)
		pcpu_unit_pages = unit_size >> PAGE_SHIFT;
	else
		pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
					PFN_UP(size_sum));

	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);

	if (dyn_size < 0)
		dyn_size = pcpu_unit_size - static_size - reserved_size;

	/*
	 * Allocate chunk slots.  The additional last slot is for
	 * empty chunks.
	 */
	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
	pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
	for (i = 0; i < pcpu_nr_slots; i++)
		INIT_LIST_HEAD(&pcpu_slot[i]);

	/*
	 * Initialize static chunk.  If reserved_size is zero, the
	 * static chunk covers static area + dynamic allocation area
	 * in the first chunk.  If reserved_size is not zero, it
	 * covers static area + reserved area (mostly used for module
	 * static percpu allocation).
	 */
	schunk = alloc_bootmem(pcpu_chunk_struct_size);
	INIT_LIST_HEAD(&schunk->list);
	schunk->vm = &first_vm;
	schunk->map = smap;
	schunk->map_alloc = ARRAY_SIZE(smap);
	schunk->page = schunk->page_ar;

	if (reserved_size) {
		schunk->free_size = reserved_size;
		pcpu_reserved_chunk = schunk;
		pcpu_reserved_chunk_limit = static_size + reserved_size;
	} else {
		schunk->free_size = dyn_size;
		dyn_size = 0;			/* dynamic area covered */
	}
	schunk->contig_hint = schunk->free_size;

	schunk->map[schunk->map_used++] = -static_size;
	if (schunk->free_size)
		schunk->map[schunk->map_used++] = schunk->free_size;

	/* init dynamic chunk if necessary */
	if (dyn_size) {
		dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
		INIT_LIST_HEAD(&dchunk->list);
		dchunk->vm = &first_vm;
		dchunk->map = dmap;
		dchunk->map_alloc = ARRAY_SIZE(dmap);
		dchunk->page = schunk->page_ar;	/* share page map with schunk */

		dchunk->contig_hint = dchunk->free_size = dyn_size;
		dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
		dchunk->map[dchunk->map_used++] = dchunk->free_size;
	}

	/* allocate vm address */
	first_vm.flags = VM_ALLOC;
	first_vm.size = pcpu_chunk_size;

	if (!base_addr)
		vm_area_register_early(&first_vm, PAGE_SIZE);
	else {
		/*
		 * Pages already mapped.  No need to remap into
		 * vmalloc area.  In this case the first chunks can't
		 * be mapped or unmapped by percpu and are marked
		 * immutable.
		 */
		first_vm.addr = base_addr;
		schunk->immutable = true;
		if (dchunk)
			dchunk->immutable = true;
	}

	/* assign pages */
	nr_pages = -1;
	for_each_possible_cpu(cpu) {
		for (i = 0; i < pcpu_unit_pages; i++) {
			struct page *page = get_page_fn(cpu, i);

			if (!page)
				break;
			*pcpu_chunk_pagep(schunk, cpu, i) = page;
		}

		BUG_ON(i < PFN_UP(static_size));

		if (nr_pages < 0)
			nr_pages = i;
		else
			BUG_ON(nr_pages != i);
	}

	/* map them */
	if (populate_pte_fn) {
		for_each_possible_cpu(cpu)
			for (i = 0; i < nr_pages; i++)
				populate_pte_fn(pcpu_chunk_addr(schunk,
								cpu, i));

		err = pcpu_map(schunk, 0, nr_pages);
		if (err)
			panic("failed to setup static percpu area, err=%d\n",
			      err);
	}

	/* link the first chunk in */
	pcpu_first_chunk = dchunk ?: schunk;
	pcpu_chunk_relocate(pcpu_first_chunk, -1);

	/* we're done */
	pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
	return pcpu_unit_size;
}
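/*
 * Illustrative first chunk layout (made-up sizes, not from the
 * original source): with a 128KB static area, 8KB reserved area and
 * 20KB dynamic area, schunk->map ends up as { -131072, 8192 } (static
 * area allocated, reserved space free and served by the reserved
 * allocator) and dchunk->map as { -139264, 20480 } (everything up to
 * pcpu_reserved_chunk_limit marked allocated, dynamic space free).
 */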
/*
 * Embedding first chunk setup helper.
 */
static void *pcpue_ptr __initdata;
static size_t pcpue_size __initdata;
static size_t pcpue_unit_size __initdata;

static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
{
	size_t off = (size_t)pageno << PAGE_SHIFT;

	if (off >= pcpue_size)
		return NULL;

	return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
}

/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @static_size: the size of static percpu area in bytes
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to setup the first chunk, it is allocated
 * as a contiguous area using the bootmem allocator and used as-is
 * without being mapped into vmalloc area.  This enables the first
 * chunk to piggy back on the linear physical mapping which often uses
 * larger page size.
 *
 * When @dyn_size is positive, the dynamic area might be larger than
 * specified to fill page alignment.  When @dyn_size is auto,
 * @dyn_size is just big enough to fill page alignment after static
 * and reserved areas.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned to the bootmem allocator.
 *
 * RETURNS:
 * The determined pcpu_unit_size which can be used to initialize
 * percpu access on success, -errno on failure.
 */
ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
				      ssize_t dyn_size)
{
	size_t chunk_size;
	unsigned int cpu;

	/* determine parameters and allocate */
	pcpue_size = PFN_ALIGN(static_size + reserved_size +
			       (dyn_size >= 0 ? dyn_size : 0));
	if (dyn_size != 0)
		dyn_size = pcpue_size - static_size - reserved_size;

	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
	chunk_size = pcpue_unit_size * num_possible_cpus();

	pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
					    __pa(MAX_DMA_ADDRESS));
	if (!pcpue_ptr) {
		pr_warning("PERCPU: failed to allocate %zu bytes for "
			   "embedding\n", chunk_size);
		return -ENOMEM;
	}

	/* return the leftover and copy */
	for_each_possible_cpu(cpu) {
		void *ptr = pcpue_ptr + cpu * pcpue_unit_size;

		free_bootmem(__pa(ptr + pcpue_size),
			     pcpue_unit_size - pcpue_size);
		memcpy(ptr, __per_cpu_load, static_size);
	}

	/* we're ready, commit */
	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);

	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
				      reserved_size, dyn_size,
				      pcpue_unit_size, pcpue_ptr, NULL);
}
/*
 * 4k page first chunk setup helper.
 */
static struct page **pcpu4k_pages __initdata;
static int pcpu4k_unit_pages __initdata;

static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
{
	if (pageno < pcpu4k_unit_pages)
		return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno];
	return NULL;
}

/**
 * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
 * @static_size: the size of static percpu area in bytes
 * @reserved_size: the size of reserved percpu area in bytes
 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 * @free_fn: function to free percpu page, always called with PAGE_SIZE
 * @populate_pte_fn: function to populate pte
 *
 * This is a helper to ease setting up a page-mapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page into vmalloc area.
 *
 * RETURNS:
 * The determined pcpu_unit_size which can be used to initialize
 * percpu access on success, -errno on failure.
 */
ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
				   pcpu_fc_alloc_fn_t alloc_fn,
				   pcpu_fc_free_fn_t free_fn,
				   pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
	static struct vm_struct vm;
	size_t pages_size;
	unsigned int cpu;
	int i, j;
	ssize_t ret;

	pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
					 PCPU_MIN_UNIT_SIZE));

	/* unaligned allocations can't be freed, round up to page size */
	pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() *
			       sizeof(pcpu4k_pages[0]));
	pcpu4k_pages = alloc_bootmem(pages_size);

	/* allocate pages */
	j = 0;
	for_each_possible_cpu(cpu)
		for (i = 0; i < pcpu4k_unit_pages; i++) {
			void *ptr;

			ptr = alloc_fn(cpu, PAGE_SIZE);
			if (!ptr) {
				pr_warning("PERCPU: failed to allocate "
					   "4k page for cpu%u\n", cpu);
				goto enomem;
			}
			pcpu4k_pages[j++] = virt_to_page(ptr);
		}

	/* allocate vm area, map the pages and copy static data */
	vm.flags = VM_ALLOC;
	vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT;
	vm_area_register_early(&vm, PAGE_SIZE);

	for_each_possible_cpu(cpu) {
		unsigned long unit_addr = (unsigned long)vm.addr +
			(cpu * pcpu4k_unit_pages << PAGE_SHIFT);

		for (i = 0; i < pcpu4k_unit_pages; i++)
			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

		/* pte already populated, the following shouldn't fail */
		ret = __pcpu_map_pages(unit_addr,
				       &pcpu4k_pages[cpu * pcpu4k_unit_pages],
				       pcpu4k_unit_pages);
		if (ret < 0)
			panic("failed to map percpu area, err=%zd\n", ret);

		/*
		 * FIXME: Archs with virtual cache should flush local
		 * cache for the linear mapping here - something
		 * equivalent to flush_cache_vmap() on the local cpu.
		 * flush_cache_vmap() can't be used as most supporting
		 * data structures are not set up yet.
		 */

		/* copy static data */
		memcpy((void *)unit_addr, __per_cpu_load, static_size);
	}

	/* we're ready, commit */
	pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
		pcpu4k_unit_pages, static_size);

	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
				     reserved_size, -1,
				     pcpu4k_unit_pages << PAGE_SHIFT, vm.addr,
				     NULL);
	goto out_free_ar;

enomem:
	while (--j >= 0)
		free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE);
	ret = -ENOMEM;
out_free_ar:
	free_bootmem(__pa(pcpu4k_pages), pages_size);
	return ret;
}
/*
 * Generic percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because the percpu area can piggy
 * back on the physical linear memory mapping which uses large page
 * mappings on applicable archs.
 */
#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

void __init setup_per_cpu_areas(void)
{
	size_t static_size = __per_cpu_end - __per_cpu_start;
	ssize_t unit_size;
	unsigned long delta;
	unsigned int cpu;

	/*
	 * Always reserve area for module percpu variables.  That's
	 * what the legacy allocator did.
	 */
	unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE,
					   PERCPU_DYNAMIC_RESERVE);
	if (unit_size < 0)
		panic("Failed to initialize percpu areas.");

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + cpu * unit_size;
}
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */