2012-01-10 02:51:56 +04:00
/*
* zsmalloc memory allocator
*
* Copyright ( C ) 2011 Nitin Gupta
2014-01-31 03:45:55 +04:00
* Copyright ( C ) 2012 , 2013 Minchan Kim
2012-01-10 02:51:56 +04:00
*
* This code is released using a dual license strategy : BSD / GPL
* You can choose the license that better fits your requirements .
*
* Released under the terms of 3 - clause BSD License
* Released under the terms of GNU General Public License Version 2.0
*/
2012-06-10 04:41:14 +04:00
/*
2013-12-11 06:04:37 +04:00
* This allocator is designed for use with zram . Thus , the allocator is
* supposed to work well under low memory conditions . In particular , it
* never attempts higher order page allocation which is very likely to
* fail under memory pressure . On the other hand , if we just use single
* ( 0 - order ) pages , it would suffer from very high fragmentation - -
* any object of size PAGE_SIZE / 2 or larger would occupy an entire page .
* This was one of the major issues with its predecessor ( xvmalloc ) .
2012-06-10 04:41:14 +04:00
*
* To overcome these issues , zsmalloc allocates a bunch of 0 - order pages
* and links them together using various ' struct page ' fields . These linked
* pages act as a single higher - order page i . e . an object can span 0 - order
* page boundaries . The code refers to these linked pages as a single entity
* called zspage .
*
2013-12-11 06:04:37 +04:00
* For simplicity , zsmalloc can only allocate objects of size up to PAGE_SIZE
* since this satisfies the requirements of all its current users ( in the
* worst case , page is incompressible and is thus stored " as-is " i . e . in
* uncompressed form ) . For allocation requests larger than this size , failure
* is returned ( see zs_malloc ) .
*
* Additionally , zs_malloc ( ) does not return a dereferenceable pointer .
* Instead , it returns an opaque handle ( unsigned long ) which encodes actual
* location of the allocated object . The reason for this indirection is that
* zsmalloc does not keep zspages permanently mapped since that would cause
* issues on 32 - bit systems where the VA region for kernel space mappings
 * is very small. So, before using the allocated memory, the object has to
* be mapped using zs_map_object ( ) to get a usable pointer and subsequently
* unmapped using zs_unmap_object ( ) .
*
2012-06-10 04:41:14 +04:00
* Following is how we use various fields and flags of underlying
* struct page ( s ) to form a zspage .
*
* Usage of struct page fields :
* page - > first_page : points to the first component ( 0 - order ) page
* page - > index ( union with page - > freelist ) : offset of the first object
* starting in this page . For the first page , this is
* always 0 , so we use this field ( aka freelist ) to point
* to the first free object in zspage .
* page - > lru : links together all component pages ( except the first page )
* of a zspage
*
* For _first_ page only :
*
* page - > private ( union with page - > first_page ) : refers to the
* component page after the first page
* page - > freelist : points to the first free object in zspage .
* Free objects are linked together using in - place
* metadata .
* page - > objects : maximum number of objects we can store in this
 *	zspage (class->pages_per_zspage * PAGE_SIZE / class->size)
* page - > lru : links together first pages of various zspages .
* Basically forming list of zspages in a fullness group .
* page - > mapping : class index and fullness group of the zspage
*
* Usage of struct page flags :
* PG_private : identifies the first component page
 *	PG_private_2: identifies the last component page
*
*/
2012-01-10 02:51:56 +04:00
# ifdef CONFIG_ZSMALLOC_DEBUG
# define DEBUG
# endif
# include <linux/module.h>
# include <linux/kernel.h>
# include <linux/bitops.h>
# include <linux/errno.h>
# include <linux/highmem.h>
# include <linux/string.h>
# include <linux/slab.h>
# include <asm/tlbflush.h>
# include <asm/pgtable.h>
# include <linux/cpumask.h>
# include <linux/cpu.h>
2012-02-13 18:47:49 +04:00
# include <linux/vmalloc.h>
2012-07-18 20:55:55 +04:00
# include <linux/hardirq.h>
2012-08-08 10:12:17 +04:00
# include <linux/spinlock.h>
# include <linux/types.h>
2014-01-31 03:45:50 +04:00
# include <linux/zsmalloc.h>
2014-08-07 03:08:38 +04:00
# include <linux/zpool.h>
2012-08-08 10:12:17 +04:00
/*
 * This must be a power of 2 and greater than or equal to sizeof(link_free).
 * These two conditions ensure that any 'struct link_free' itself doesn't
 * span more than 1 page, which avoids the complex case of mapping 2 pages
 * simply to restore link_free pointer values.
 */
# define ZS_ALIGN 8
/*
 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
 * pages. ZS_MAX_ZSPAGE_ORDER defines the upper limit on N, and
 * ZS_MAX_PAGES_PER_ZSPAGE is the resulting page count (1 << N).
 */
# define ZS_MAX_ZSPAGE_ORDER 2
# define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
/*
 * Object location (<PFN>, <obj_idx>) is encoded as
 * a single (unsigned long) handle value.
 *
 * Note that object index <obj_idx> is relative to system
 * page <PFN> it is stored in, so for each sub-page belonging
 * to a zspage, obj_idx starts with 0.
 *
 * This is made more complicated by various memory models and PAE.
 */

#ifndef MAX_PHYSMEM_BITS
#ifdef CONFIG_HIGHMEM64G
#define MAX_PHYSMEM_BITS 36
#else /* !CONFIG_HIGHMEM64G */
/*
 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
 * be PAGE_SHIFT
 */
#define MAX_PHYSMEM_BITS BITS_PER_LONG
#endif
#endif

/* Bits of a handle holding the PFN; the remaining low bits hold obj_idx */
#define _PFN_BITS		(MAX_PHYSMEM_BITS - PAGE_SHIFT)
#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS)
#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

/* Usable in constant expressions; note that both arguments are evaluated twice */
#define MAX(a, b) ((a) >= (b) ? (a) : (b))

/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
/*
 * On systems with 4K page size, this gives 255 size classes! There is a
 * trade-off here:
 *  - Large number of size classes is potentially wasteful as free pages are
 *    spread across these classes
 *  - Small number of size classes causes large internal fragmentation
 *  - Probably it's better to use specific size classes (empirically
 *    determined). NOTE: all those class sizes must be set as multiple of
 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
 *
 *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
 *  (reason above)
 */
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> 8)
#define ZS_SIZE_CLASSES	((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
					ZS_SIZE_CLASS_DELTA + 1)
/*
 * Fullness states of a zspage. Only the states listed before
 * _ZS_NR_FULLNESS_GROUPS are backed by a per-class list: we do not
 * maintain any list for completely empty or full pages.
 */
enum fullness_group {
	ZS_ALMOST_FULL,
	ZS_ALMOST_EMPTY,
	/* number of list-backed groups; sizes size_class->fullness_list[] */
	_ZS_NR_FULLNESS_GROUPS,

	/* states without a list */
	ZS_EMPTY,
	ZS_FULL
};
/*
 * We assign a page to ZS_ALMOST_EMPTY fullness group when:
 *	n <= N / f, where
 * n = number of allocated objects
 * N = total number of objects zspage can store
 * f = fullness_threshold_frac
 *
 * Similarly, we assign zspage to:
 *	ZS_ALMOST_FULL	when n > N / f
 *	ZS_EMPTY	when n == 0
 *	ZS_FULL		when n == N
 *
 * (see: get_fullness_group(), fix_fullness_group())
 */
static const int fullness_threshold_frac = 4 ;
/* Per-size-class state: one entry of zs_pool->size_class[]. */
struct size_class {
	/*
	 * Size of objects stored in this class. Must be multiple
	 * of ZS_ALIGN.
	 */
	int size;
	/* Position of this class within zs_pool->size_class[] */
	unsigned int index;

	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
	int pages_per_zspage;

	/* Held while the class lists and its zspages' freelists are modified */
	spinlock_t lock;

	/* stats */
	u64 pages_allocated;

	/* List heads (first pages) of the zspages in each fullness group */
	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
};
/*
 * In-place metadata stored inside every free object, chaining the free
 * objects of a zspage into a singly linked list.
 * For every zspage, first_page->freelist gives the head of this list.
 *
 * The size of this struct must be a power of 2 and less than or equal
 * to ZS_ALIGN.
 */
struct link_free {
	/* Handle of next free chunk (encodes <PFN, obj_idx>) */
	void *next;
};
/* An allocation pool: one size_class per supported object size. */
struct zs_pool {
	struct size_class size_class[ZS_SIZE_CLASSES];

	/* allocation flags used when growing pool */
	gfp_t flags;
};
2012-01-10 02:51:56 +04:00
/*
 * A zspage's class index and fullness group
 * are encoded in its (first) page->mapping.
 *
 * The fullness group occupies the low FULLNESS_BITS bits and the class
 * index the CLASS_IDX_BITS bits above it (see set_zspage_mapping() and
 * get_zspage_mapping()).
 */
# define CLASS_IDX_BITS 28
# define FULLNESS_BITS 4
# define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1)
# define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1)
2012-07-18 20:55:56 +04:00
struct mapping_area {
2013-12-11 06:04:36 +04:00
# ifdef CONFIG_PGTABLE_MAPPING
2012-07-18 20:55:56 +04:00
struct vm_struct * vm ; /* vm area for mapping object that span pages */
# else
char * vm_buf ; /* copy buffer for objects that span pages */
# endif
char * vm_addr ; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm ; /* mapping mode */
} ;
2014-08-07 03:08:38 +04:00
/* zpool driver */
# ifdef CONFIG_ZPOOL
/*
 * zpool ->create hook: build a zsmalloc pool with the given gfp flags.
 * @zpool_ops is not used. Returns the new pool, or NULL on failure.
 */
static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
{
	struct zs_pool *new_pool = zs_create_pool(gfp);

	return new_pool;
}
/* zpool ->destroy hook: tear down a pool made by zs_zpool_create(). */
static void zs_zpool_destroy(void *pool)
{
	struct zs_pool *zs = pool;

	zs_destroy_pool(zs);
}
/*
 * zpool ->malloc hook: allocate @size bytes from @pool and store the
 * resulting handle in *@handle. @gfp is not used here.
 *
 * Returns 0 on success, or -1 if zs_malloc() returned a zero handle
 * (in which case *@handle is 0).
 */
static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	unsigned long obj = zs_malloc(pool, size);

	*handle = obj;
	if (!obj)
		return -1;

	return 0;
}
/* zpool ->free hook: release the object identified by @handle. */
static void zs_zpool_free(void *pool, unsigned long handle)
{
	struct zs_pool *zs = pool;

	zs_free(zs, handle);
}
/*
 * zpool ->shrink hook. Shrinking is not implemented: no argument is
 * examined, *@reclaimed is left untouched and -EINVAL is always returned.
 */
static int zs_zpool_shrink(void *pool, unsigned int pages,
			unsigned int *reclaimed)
{
	const int err = -EINVAL;

	return err;
}
static void * zs_zpool_map ( void * pool , unsigned long handle ,
enum zpool_mapmode mm )
{
enum zs_mapmode zs_mm ;
switch ( mm ) {
case ZPOOL_MM_RO :
zs_mm = ZS_MM_RO ;
break ;
case ZPOOL_MM_WO :
zs_mm = ZS_MM_WO ;
break ;
case ZPOOL_MM_RW : /* fallthru */
default :
zs_mm = ZS_MM_RW ;
break ;
}
return zs_map_object ( pool , handle , zs_mm ) ;
}
/* zpool ->unmap hook: undo a previous zs_zpool_map() of @handle. */
static void zs_zpool_unmap(void *pool, unsigned long handle)
{
	struct zs_pool *zs = pool;

	zs_unmap_object(zs, handle);
}
/* zpool ->total_size hook: report zs_get_total_size_bytes() for @pool. */
static u64 zs_zpool_total_size(void *pool)
{
	u64 bytes = zs_get_total_size_bytes(pool);

	return bytes;
}
/*
 * zpool driver descriptor exposing zsmalloc through the zpool API.
 *
 * The type string must be exactly "zsmalloc" with no surrounding
 * whitespace.
 * NOTE(review): zpool presumably matches this string literally when a
 * driver is looked up by name - confirm against mm/zpool.c.
 */
static struct zpool_driver zs_zpool_driver = {
	.type =		"zsmalloc",
	.owner =	THIS_MODULE,
	.create =	zs_zpool_create,
	.destroy =	zs_zpool_destroy,
	.malloc =	zs_zpool_malloc,
	.free =		zs_zpool_free,
	.shrink =	zs_zpool_shrink,
	.map =		zs_zpool_map,
	.unmap =	zs_zpool_unmap,
	.total_size =	zs_zpool_total_size,
};
# endif /* CONFIG_ZPOOL */
2012-01-10 02:51:56 +04:00
/*
 * per-cpu VM mapping areas for zspage accesses that cross page boundaries.
 * Each cpu's area is set up and torn down from zs_cpu_notifier() through
 * __zs_cpu_up() and __zs_cpu_down().
 */
static DEFINE_PER_CPU ( struct mapping_area , zs_map_area ) ;
/* Non-zero iff @page carries PG_private, which marks the first page of a zspage. */
static int is_first_page(struct page *page)
{
	int first = PagePrivate(page);

	return first;
}
/* Non-zero iff @page carries PG_private_2, which marks the last page of a zspage. */
static int is_last_page(struct page *page)
{
	int last = PagePrivate2(page);

	return last;
}
static void get_zspage_mapping ( struct page * page , unsigned int * class_idx ,
enum fullness_group * fullness )
{
unsigned long m ;
BUG_ON ( ! is_first_page ( page ) ) ;
m = ( unsigned long ) page - > mapping ;
* fullness = m & FULLNESS_MASK ;
* class_idx = ( m > > FULLNESS_BITS ) & CLASS_IDX_MASK ;
}
static void set_zspage_mapping ( struct page * page , unsigned int class_idx ,
enum fullness_group fullness )
{
unsigned long m ;
BUG_ON ( ! is_first_page ( page ) ) ;
m = ( ( class_idx & CLASS_IDX_MASK ) < < FULLNESS_BITS ) |
( fullness & FULLNESS_MASK ) ;
page - > mapping = ( struct address_space * ) m ;
}
2013-12-11 06:04:37 +04:00
/*
* zsmalloc divides the pool into various size classes where each
* class maintains a list of zspages where each zspage is divided
* into equal sized chunks . Each allocation falls into one of these
* classes depending on its size . This function returns index of the
* size class which has chunk size big enough to hold the give size .
*/
2012-01-10 02:51:56 +04:00
static int get_size_class_index ( int size )
{
int idx = 0 ;
if ( likely ( size > ZS_MIN_ALLOC_SIZE ) )
idx = DIV_ROUND_UP ( size - ZS_MIN_ALLOC_SIZE ,
ZS_SIZE_CLASS_DELTA ) ;
return idx ;
}
2013-12-11 06:04:37 +04:00
/*
* For each size class , zspages are divided into different groups
* depending on how " full " they are . This was done so that we could
* easily find empty or nearly empty zspages when we try to shrink
* the pool ( not yet implemented ) . This function returns fullness
* status of the given page .
*/
2012-01-10 02:51:56 +04:00
static enum fullness_group get_fullness_group ( struct page * page )
{
int inuse , max_objects ;
enum fullness_group fg ;
BUG_ON ( ! is_first_page ( page ) ) ;
inuse = page - > inuse ;
max_objects = page - > objects ;
if ( inuse = = 0 )
fg = ZS_EMPTY ;
else if ( inuse = = max_objects )
fg = ZS_FULL ;
else if ( inuse < = max_objects / fullness_threshold_frac )
fg = ZS_ALMOST_EMPTY ;
else
fg = ZS_ALMOST_FULL ;
return fg ;
}
2013-12-11 06:04:37 +04:00
/*
* Each size class maintains various freelists and zspages are assigned
* to one of these freelists based on the number of live objects they
* have . This functions inserts the given zspage into the freelist
* identified by < class , fullness_group > .
*/
2012-01-10 02:51:56 +04:00
static void insert_zspage ( struct page * page , struct size_class * class ,
enum fullness_group fullness )
{
struct page * * head ;
BUG_ON ( ! is_first_page ( page ) ) ;
if ( fullness > = _ZS_NR_FULLNESS_GROUPS )
return ;
head = & class - > fullness_list [ fullness ] ;
if ( * head )
list_add_tail ( & page - > lru , & ( * head ) - > lru ) ;
* head = page ;
}
2013-12-11 06:04:37 +04:00
/*
* This function removes the given zspage from the freelist identified
* by < class , fullness_group > .
*/
2012-01-10 02:51:56 +04:00
static void remove_zspage ( struct page * page , struct size_class * class ,
enum fullness_group fullness )
{
struct page * * head ;
BUG_ON ( ! is_first_page ( page ) ) ;
if ( fullness > = _ZS_NR_FULLNESS_GROUPS )
return ;
head = & class - > fullness_list [ fullness ] ;
BUG_ON ( ! * head ) ;
if ( list_empty ( & ( * head ) - > lru ) )
* head = NULL ;
else if ( * head = = page )
* head = ( struct page * ) list_entry ( ( * head ) - > lru . next ,
struct page , lru ) ;
list_del_init ( & page - > lru ) ;
}
2013-12-11 06:04:37 +04:00
/*
* Each size class maintains zspages in different fullness groups depending
* on the number of live objects they contain . When allocating or freeing
* objects , the fullness status of the page can change , say , from ALMOST_FULL
* to ALMOST_EMPTY when freeing an object . This function checks if such
* a status change has occurred for the given page and accordingly moves the
* page from the freelist of the old fullness group to that of the new
* fullness group .
*/
2012-01-10 02:51:56 +04:00
static enum fullness_group fix_fullness_group ( struct zs_pool * pool ,
struct page * page )
{
int class_idx ;
struct size_class * class ;
enum fullness_group currfg , newfg ;
BUG_ON ( ! is_first_page ( page ) ) ;
get_zspage_mapping ( page , & class_idx , & currfg ) ;
newfg = get_fullness_group ( page ) ;
if ( newfg = = currfg )
goto out ;
class = & pool - > size_class [ class_idx ] ;
remove_zspage ( page , class , currfg ) ;
insert_zspage ( page , class , newfg ) ;
set_zspage_mapping ( page , class_idx , newfg ) ;
out :
return newfg ;
}
/*
* We have to decide on how many pages to link together
* to form a zspage for each size class . This is important
* to reduce wastage due to unusable space left at end of
* each zspage which is given as :
* wastage = Zp - Zp % size_class
* where Zp = zspage size = k * PAGE_SIZE where k = 1 , 2 , . . .
*
* For example , for size class of 3 / 8 * PAGE_SIZE , we should
* link together 3 PAGE_SIZE sized pages to form a zspage
* since then we can perfectly fit in 8 such objects .
*/
2012-05-03 10:40:39 +04:00
static int get_pages_per_zspage ( int class_size )
2012-01-10 02:51:56 +04:00
{
int i , max_usedpc = 0 ;
/* zspage order which gives maximum used size per KB */
int max_usedpc_order = 1 ;
2012-03-05 21:33:21 +04:00
for ( i = 1 ; i < = ZS_MAX_PAGES_PER_ZSPAGE ; i + + ) {
2012-01-10 02:51:56 +04:00
int zspage_size ;
int waste , usedpc ;
zspage_size = i * PAGE_SIZE ;
waste = zspage_size % class_size ;
usedpc = ( zspage_size - waste ) * 100 / zspage_size ;
if ( usedpc > max_usedpc ) {
max_usedpc = usedpc ;
max_usedpc_order = i ;
}
}
return max_usedpc_order ;
}
/*
* A single ' zspage ' is composed of many system pages which are
* linked together using fields in struct page . This function finds
* the first / head page , given any component page of a zspage .
*/
static struct page * get_first_page ( struct page * page )
{
if ( is_first_page ( page ) )
return page ;
else
return page - > first_page ;
}
static struct page * get_next_page ( struct page * page )
{
struct page * next ;
if ( is_last_page ( page ) )
next = NULL ;
else if ( is_first_page ( page ) )
2013-07-12 11:08:13 +04:00
next = ( struct page * ) page_private ( page ) ;
2012-01-10 02:51:56 +04:00
else
next = list_entry ( page - > lru . next , struct page , lru ) ;
return next ;
}
2013-11-22 21:30:41 +04:00
/*
* Encode < page , obj_idx > as a single handle value .
* On hardware platforms with physical memory starting at 0x0 the pfn
* could be 0 so we ensure that the handle will never be 0 by adjusting the
* encoded obj_idx value before encoding .
*/
2012-01-10 02:51:56 +04:00
static void * obj_location_to_handle ( struct page * page , unsigned long obj_idx )
{
unsigned long handle ;
if ( ! page ) {
BUG_ON ( obj_idx ) ;
return NULL ;
}
handle = page_to_pfn ( page ) < < OBJ_INDEX_BITS ;
2013-11-22 21:30:41 +04:00
handle | = ( ( obj_idx + 1 ) & OBJ_INDEX_MASK ) ;
2012-01-10 02:51:56 +04:00
return ( void * ) handle ;
}
2013-11-22 21:30:41 +04:00
/*
* Decode < page , obj_idx > pair from the given object handle . We adjust the
* decoded obj_idx back to its original value since it was adjusted in
* obj_location_to_handle ( ) .
*/
2012-06-08 10:39:25 +04:00
static void obj_handle_to_location ( unsigned long handle , struct page * * page ,
2012-01-10 02:51:56 +04:00
unsigned long * obj_idx )
{
2012-06-08 10:39:25 +04:00
* page = pfn_to_page ( handle > > OBJ_INDEX_BITS ) ;
2013-11-22 21:30:41 +04:00
* obj_idx = ( handle & OBJ_INDEX_MASK ) - 1 ;
2012-01-10 02:51:56 +04:00
}
static unsigned long obj_idx_to_offset ( struct page * page ,
unsigned long obj_idx , int class_size )
{
unsigned long off = 0 ;
if ( ! is_first_page ( page ) )
off = page - > index ;
return off + obj_idx * class_size ;
}
2012-04-02 18:13:56 +04:00
static void reset_page ( struct page * page )
{
clear_bit ( PG_private , & page - > flags ) ;
clear_bit ( PG_private_2 , & page - > flags ) ;
set_page_private ( page , 0 ) ;
page - > mapping = NULL ;
page - > freelist = NULL ;
2013-02-23 04:34:59 +04:00
page_mapcount_reset ( page ) ;
2012-04-02 18:13:56 +04:00
}
2012-01-10 02:51:56 +04:00
static void free_zspage ( struct page * first_page )
{
2012-04-02 18:13:56 +04:00
struct page * nextp , * tmp , * head_extra ;
2012-01-10 02:51:56 +04:00
BUG_ON ( ! is_first_page ( first_page ) ) ;
BUG_ON ( first_page - > inuse ) ;
2012-04-02 18:13:56 +04:00
head_extra = ( struct page * ) page_private ( first_page ) ;
2012-01-10 02:51:56 +04:00
2012-04-02 18:13:56 +04:00
reset_page ( first_page ) ;
2012-01-10 02:51:56 +04:00
__free_page ( first_page ) ;
/* zspage with only 1 system page */
2012-04-02 18:13:56 +04:00
if ( ! head_extra )
2012-01-10 02:51:56 +04:00
return ;
2012-04-02 18:13:56 +04:00
list_for_each_entry_safe ( nextp , tmp , & head_extra - > lru , lru ) {
2012-01-10 02:51:56 +04:00
list_del ( & nextp - > lru ) ;
2012-04-02 18:13:56 +04:00
reset_page ( nextp ) ;
2012-01-10 02:51:56 +04:00
__free_page ( nextp ) ;
}
2012-04-02 18:13:56 +04:00
reset_page ( head_extra ) ;
__free_page ( head_extra ) ;
2012-01-10 02:51:56 +04:00
}
/* Initialize a newly allocated zspage */
static void init_zspage ( struct page * first_page , struct size_class * class )
{
unsigned long off = 0 ;
struct page * page = first_page ;
BUG_ON ( ! is_first_page ( first_page ) ) ;
while ( page ) {
struct page * next_page ;
struct link_free * link ;
unsigned int i , objs_on_page ;
/*
* page - > index stores offset of first object starting
* in the page . For the first page , this is always 0 ,
* so we use first_page - > index ( aka - > freelist ) to store
* head of corresponding zspage ' s freelist .
*/
if ( page ! = first_page )
page - > index = off ;
link = ( struct link_free * ) kmap_atomic ( page ) +
off / sizeof ( * link ) ;
objs_on_page = ( PAGE_SIZE - off ) / class - > size ;
for ( i = 1 ; i < = objs_on_page ; i + + ) {
off + = class - > size ;
if ( off < PAGE_SIZE ) {
link - > next = obj_location_to_handle ( page , i ) ;
link + = class - > size / sizeof ( * link ) ;
}
}
/*
* We now come to the last ( full or partial ) object on this
* page , which must point to the first object on the next
* page ( if present )
*/
next_page = get_next_page ( page ) ;
link - > next = obj_location_to_handle ( next_page , 0 ) ;
kunmap_atomic ( link ) ;
page = next_page ;
off = ( off + class - > size ) % PAGE_SIZE ;
}
}
/*
 * Allocate a zspage for the given size class.
 *
 * @class: size class the zspage will serve
 * @flags: gfp flags passed to alloc_page() for every component page
 *
 * Returns the first page of the new, initialized zspage, or NULL if any
 * page allocation failed (pages obtained so far are released).
 */
static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
{
	int i;
	struct page *first_page = NULL, *uninitialized_var(prev_page);

	/*
	 * Allocate individual pages and link them together as:
	 * 1. first page->private = first sub-page
	 * 2. all sub-pages are linked together using page->lru
	 * 3. each sub-page is linked to the first page using page->first_page
	 *
	 * For each size class, First/Head pages are linked together using
	 * page->lru. Also, we set PG_private to identify the first page
	 * (i.e. no other sub-page has this flag set) and PG_private_2 to
	 * identify the last page.
	 */
	for (i = 0; i < class->pages_per_zspage; i++) {
		struct page *page = alloc_page(flags);

		if (!page) {
			/* undo the partially built zspage, if any */
			if (first_page)
				free_zspage(first_page);
			return NULL;
		}

		INIT_LIST_HEAD(&page->lru);
		if (i == 0) {	/* first page */
			SetPagePrivate(page);
			set_page_private(page, 0);
			first_page = page;
			first_page->inuse = 0;
		} else {
			if (i == 1)
				set_page_private(first_page,
						 (unsigned long)page);
			page->first_page = first_page;
			if (i >= 2)
				list_add(&page->lru, &prev_page->lru);
		}
		if (i == class->pages_per_zspage - 1)	/* last page */
			SetPagePrivate2(page);
		prev_page = page;
	}

	init_zspage(first_page, class);

	first_page->freelist = obj_location_to_handle(first_page, 0);
	/* Maximum number of objects we can store in this zspage */
	first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;

	return first_page;
}
static struct page * find_get_zspage ( struct size_class * class )
{
int i ;
struct page * page ;
for ( i = 0 ; i < _ZS_NR_FULLNESS_GROUPS ; i + + ) {
page = class - > fullness_list [ i ] ;
if ( page )
break ;
}
return page ;
}
2013-12-11 06:04:36 +04:00
# ifdef CONFIG_PGTABLE_MAPPING
2012-07-18 20:55:56 +04:00
static inline int __zs_cpu_up ( struct mapping_area * area )
{
/*
* Make sure we don ' t leak memory if a cpu UP notification
* and zs_init ( ) race and both call zs_cpu_up ( ) on the same cpu
*/
if ( area - > vm )
return 0 ;
area - > vm = alloc_vm_area ( PAGE_SIZE * 2 , NULL ) ;
if ( ! area - > vm )
return - ENOMEM ;
return 0 ;
}
static inline void __zs_cpu_down ( struct mapping_area * area )
{
if ( area - > vm )
free_vm_area ( area - > vm ) ;
area - > vm = NULL ;
}
static inline void * __zs_map_object ( struct mapping_area * area ,
struct page * pages [ 2 ] , int off , int size )
{
2014-08-07 03:06:58 +04:00
BUG_ON ( map_vm_area ( area - > vm , PAGE_KERNEL , pages ) ) ;
2012-07-18 20:55:56 +04:00
area - > vm_addr = area - > vm - > addr ;
return area - > vm_addr + off ;
}
static inline void __zs_unmap_object ( struct mapping_area * area ,
struct page * pages [ 2 ] , int off , int size )
{
unsigned long addr = ( unsigned long ) area - > vm_addr ;
2013-03-27 04:43:14 +04:00
unmap_kernel_range ( addr , PAGE_SIZE * 2 ) ;
2012-07-18 20:55:56 +04:00
}
2013-12-11 06:04:36 +04:00
# else /* CONFIG_PGTABLE_MAPPING */
2012-07-18 20:55:56 +04:00
static inline int __zs_cpu_up ( struct mapping_area * area )
{
/*
* Make sure we don ' t leak memory if a cpu UP notification
* and zs_init ( ) race and both call zs_cpu_up ( ) on the same cpu
*/
if ( area - > vm_buf )
return 0 ;
area - > vm_buf = ( char * ) __get_free_page ( GFP_KERNEL ) ;
if ( ! area - > vm_buf )
return - ENOMEM ;
return 0 ;
}
static inline void __zs_cpu_down ( struct mapping_area * area )
{
if ( area - > vm_buf )
free_page ( ( unsigned long ) area - > vm_buf ) ;
area - > vm_buf = NULL ;
}
static void * __zs_map_object ( struct mapping_area * area ,
struct page * pages [ 2 ] , int off , int size )
2012-07-03 01:15:49 +04:00
{
int sizes [ 2 ] ;
void * addr ;
2012-07-18 20:55:56 +04:00
char * buf = area - > vm_buf ;
2012-07-03 01:15:49 +04:00
2012-07-18 20:55:56 +04:00
/* disable page faults to match kmap_atomic() return conditions */
pagefault_disable ( ) ;
/* no read fastpath */
if ( area - > vm_mm = = ZS_MM_WO )
goto out ;
2012-07-03 01:15:49 +04:00
sizes [ 0 ] = PAGE_SIZE - off ;
sizes [ 1 ] = size - sizes [ 0 ] ;
/* copy object to per-cpu buffer */
addr = kmap_atomic ( pages [ 0 ] ) ;
memcpy ( buf , addr + off , sizes [ 0 ] ) ;
kunmap_atomic ( addr ) ;
addr = kmap_atomic ( pages [ 1 ] ) ;
memcpy ( buf + sizes [ 0 ] , addr , sizes [ 1 ] ) ;
kunmap_atomic ( addr ) ;
2012-07-18 20:55:56 +04:00
out :
return area - > vm_buf ;
2012-07-03 01:15:49 +04:00
}
2012-07-18 20:55:56 +04:00
static void __zs_unmap_object ( struct mapping_area * area ,
struct page * pages [ 2 ] , int off , int size )
2012-07-03 01:15:49 +04:00
{
int sizes [ 2 ] ;
void * addr ;
2012-07-18 20:55:56 +04:00
char * buf = area - > vm_buf ;
2012-07-03 01:15:49 +04:00
2012-07-18 20:55:56 +04:00
/* no write fastpath */
if ( area - > vm_mm = = ZS_MM_RO )
goto out ;
2012-07-03 01:15:49 +04:00
sizes [ 0 ] = PAGE_SIZE - off ;
sizes [ 1 ] = size - sizes [ 0 ] ;
/* copy per-cpu buffer to object */
addr = kmap_atomic ( pages [ 0 ] ) ;
memcpy ( addr + off , buf , sizes [ 0 ] ) ;
kunmap_atomic ( addr ) ;
addr = kmap_atomic ( pages [ 1 ] ) ;
memcpy ( addr , buf + sizes [ 0 ] , sizes [ 1 ] ) ;
kunmap_atomic ( addr ) ;
2012-07-18 20:55:56 +04:00
out :
/* enable page faults to match kunmap_atomic() return conditions */
pagefault_enable ( ) ;
2012-07-03 01:15:49 +04:00
}
2012-01-10 02:51:56 +04:00
2013-12-11 06:04:36 +04:00
# endif /* CONFIG_PGTABLE_MAPPING */
2012-07-18 20:55:56 +04:00
2012-01-10 02:51:56 +04:00
/*
 * CPU hotplug callback: set up the per-cpu mapping area on
 * CPU_UP_PREPARE and tear it down on CPU_DEAD / CPU_UP_CANCELED.
 * Other actions are ignored.
 *
 * @nb:     unused
 * @action: hotplug event
 * @pcpu:   cpu number, passed as a pointer-sized integer
 *
 * Returns NOTIFY_OK, or the notifier encoding of the error if setting up
 * the area failed.
 */
static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
				void *pcpu)
{
	int cpu = (long)pcpu;
	int err;

	if (action == CPU_UP_PREPARE) {
		err = __zs_cpu_up(&per_cpu(zs_map_area, cpu));
		if (err)
			return notifier_from_errno(err);
	} else if (action == CPU_DEAD || action == CPU_UP_CANCELED) {
		__zs_cpu_down(&per_cpu(zs_map_area, cpu));
	}

	return NOTIFY_OK;
}
/* Hotplug notifier block, registered in zs_init() and removed in zs_exit(). */
static struct notifier_block zs_cpu_nb = {
	.notifier_call = zs_cpu_notifier,
};
static void zs_exit ( void )
{
int cpu ;
2014-08-07 03:08:38 +04:00
# ifdef CONFIG_ZPOOL
zpool_unregister_driver ( & zs_zpool_driver ) ;
# endif
2014-03-11 00:39:59 +04:00
cpu_notifier_register_begin ( ) ;
2012-01-10 02:51:56 +04:00
for_each_online_cpu ( cpu )
zs_cpu_notifier ( NULL , CPU_DEAD , ( void * ) ( long ) cpu ) ;
2014-03-11 00:39:59 +04:00
__unregister_cpu_notifier ( & zs_cpu_nb ) ;
cpu_notifier_register_done ( ) ;
2012-01-10 02:51:56 +04:00
}
static int zs_init ( void )
{
int cpu , ret ;
2014-03-11 00:39:59 +04:00
cpu_notifier_register_begin ( ) ;
__register_cpu_notifier ( & zs_cpu_nb ) ;
2012-01-10 02:51:56 +04:00
for_each_online_cpu ( cpu ) {
ret = zs_cpu_notifier ( NULL , CPU_UP_PREPARE , ( void * ) ( long ) cpu ) ;
2014-03-11 00:39:59 +04:00
if ( notifier_to_errno ( ret ) ) {
cpu_notifier_register_done ( ) ;
2012-01-10 02:51:56 +04:00
goto fail ;
2014-03-11 00:39:59 +04:00
}
2012-01-10 02:51:56 +04:00
}
2014-03-11 00:39:59 +04:00
cpu_notifier_register_done ( ) ;
2014-08-07 03:08:38 +04:00
# ifdef CONFIG_ZPOOL
zpool_register_driver ( & zs_zpool_driver ) ;
# endif
2012-01-10 02:51:56 +04:00
return 0 ;
fail :
zs_exit ( ) ;
return notifier_to_errno ( ret ) ;
}
2013-01-05 00:14:00 +04:00
/**
* zs_create_pool - Creates an allocation pool to work from .
2013-01-30 19:36:52 +04:00
* @ flags : allocation flags used to allocate pool metadata
2013-01-05 00:14:00 +04:00
*
* This function must be called before anything when using
* the zsmalloc allocator .
*
* On success , a pointer to the newly created pool is returned ,
* otherwise NULL .
*/
2013-01-30 19:36:52 +04:00
struct zs_pool * zs_create_pool ( gfp_t flags )
2012-01-10 02:51:56 +04:00
{
2012-06-20 05:31:11 +04:00
int i , ovhd_size ;
2012-01-10 02:51:56 +04:00
struct zs_pool * pool ;
ovhd_size = roundup ( sizeof ( * pool ) , PAGE_SIZE ) ;
pool = kzalloc ( ovhd_size , GFP_KERNEL ) ;
if ( ! pool )
return NULL ;
for ( i = 0 ; i < ZS_SIZE_CLASSES ; i + + ) {
int size ;
struct size_class * class ;
size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA ;
if ( size > ZS_MAX_ALLOC_SIZE )
size = ZS_MAX_ALLOC_SIZE ;
class = & pool - > size_class [ i ] ;
class - > size = size ;
class - > index = i ;
spin_lock_init ( & class - > lock ) ;
2012-05-03 10:40:39 +04:00
class - > pages_per_zspage = get_pages_per_zspage ( size ) ;
2012-01-10 02:51:56 +04:00
}
pool - > flags = flags ;
return pool ;
}
EXPORT_SYMBOL_GPL ( zs_create_pool ) ;
void zs_destroy_pool ( struct zs_pool * pool )
{
int i ;
for ( i = 0 ; i < ZS_SIZE_CLASSES ; i + + ) {
int fg ;
struct size_class * class = & pool - > size_class [ i ] ;
for ( fg = 0 ; fg < _ZS_NR_FULLNESS_GROUPS ; fg + + ) {
if ( class - > fullness_list [ fg ] ) {
2013-05-16 00:56:49 +04:00
pr_info ( " Freeing non-empty class with size %db, fullness group %d \n " ,
2012-01-10 02:51:56 +04:00
class - > size , fg ) ;
}
}
}
kfree ( pool ) ;
}
EXPORT_SYMBOL_GPL ( zs_destroy_pool ) ;
/**
* zs_malloc - Allocate block of given size from pool .
* @ pool : pool to allocate from
* @ size : size of block to allocate
*
2012-05-03 10:40:40 +04:00
* On success , handle to the allocated object is returned ,
2012-06-08 10:39:25 +04:00
* otherwise 0.
2012-01-10 02:51:56 +04:00
* Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail .
*/
2012-06-08 10:39:25 +04:00
unsigned long zs_malloc ( struct zs_pool * pool , size_t size )
2012-01-10 02:51:56 +04:00
{
2012-06-08 10:39:25 +04:00
unsigned long obj ;
2012-01-10 02:51:56 +04:00
struct link_free * link ;
int class_idx ;
struct size_class * class ;
struct page * first_page , * m_page ;
unsigned long m_objidx , m_offset ;
if ( unlikely ( ! size | | size > ZS_MAX_ALLOC_SIZE ) )
2012-06-08 10:39:25 +04:00
return 0 ;
2012-01-10 02:51:56 +04:00
class_idx = get_size_class_index ( size ) ;
class = & pool - > size_class [ class_idx ] ;
BUG_ON ( class_idx ! = class - > index ) ;
spin_lock ( & class - > lock ) ;
first_page = find_get_zspage ( class ) ;
if ( ! first_page ) {
spin_unlock ( & class - > lock ) ;
first_page = alloc_zspage ( class , pool - > flags ) ;
if ( unlikely ( ! first_page ) )
2012-06-08 10:39:25 +04:00
return 0 ;
2012-01-10 02:51:56 +04:00
set_zspage_mapping ( first_page , class - > index , ZS_EMPTY ) ;
spin_lock ( & class - > lock ) ;
2012-05-03 10:40:39 +04:00
class - > pages_allocated + = class - > pages_per_zspage ;
2012-01-10 02:51:56 +04:00
}
2012-06-08 10:39:25 +04:00
obj = ( unsigned long ) first_page - > freelist ;
2012-01-10 02:51:56 +04:00
obj_handle_to_location ( obj , & m_page , & m_objidx ) ;
m_offset = obj_idx_to_offset ( m_page , m_objidx , class - > size ) ;
link = ( struct link_free * ) kmap_atomic ( m_page ) +
m_offset / sizeof ( * link ) ;
first_page - > freelist = link - > next ;
memset ( link , POISON_INUSE , sizeof ( * link ) ) ;
kunmap_atomic ( link ) ;
first_page - > inuse + + ;
/* Now move the zspage to another fullness group, if required */
fix_fullness_group ( pool , first_page ) ;
spin_unlock ( & class - > lock ) ;
return obj ;
}
EXPORT_SYMBOL_GPL ( zs_malloc ) ;
2012-06-08 10:39:25 +04:00
void zs_free ( struct zs_pool * pool , unsigned long obj )
2012-01-10 02:51:56 +04:00
{
struct link_free * link ;
struct page * first_page , * f_page ;
unsigned long f_objidx , f_offset ;
int class_idx ;
struct size_class * class ;
enum fullness_group fullness ;
if ( unlikely ( ! obj ) )
return ;
obj_handle_to_location ( obj , & f_page , & f_objidx ) ;
first_page = get_first_page ( f_page ) ;
get_zspage_mapping ( first_page , & class_idx , & fullness ) ;
class = & pool - > size_class [ class_idx ] ;
f_offset = obj_idx_to_offset ( f_page , f_objidx , class - > size ) ;
spin_lock ( & class - > lock ) ;
/* Insert this object in containing zspage's freelist */
link = ( struct link_free * ) ( ( unsigned char * ) kmap_atomic ( f_page )
+ f_offset ) ;
link - > next = first_page - > freelist ;
kunmap_atomic ( link ) ;
2012-06-08 10:39:25 +04:00
first_page - > freelist = ( void * ) obj ;
2012-01-10 02:51:56 +04:00
first_page - > inuse - - ;
fullness = fix_fullness_group ( pool , first_page ) ;
if ( fullness = = ZS_EMPTY )
2012-05-03 10:40:39 +04:00
class - > pages_allocated - = class - > pages_per_zspage ;
2012-01-10 02:51:56 +04:00
spin_unlock ( & class - > lock ) ;
if ( fullness = = ZS_EMPTY )
free_zspage ( first_page ) ;
}
EXPORT_SYMBOL_GPL ( zs_free ) ;
2012-05-03 10:40:40 +04:00
/**
* zs_map_object - get address of allocated object from handle .
* @ pool : pool from which the object was allocated
* @ handle : handle returned from zs_malloc
*
* Before using an object allocated from zs_malloc , it must be mapped using
* this function . When done with the object , it must be unmapped using
2012-07-03 01:15:51 +04:00
* zs_unmap_object .
*
* Only one object can be mapped per cpu at a time . There is no protection
* against nested mappings .
*
* This function returns with preemption and page faults disabled .
2013-05-20 23:18:14 +04:00
*/
2012-07-03 01:15:52 +04:00
void * zs_map_object ( struct zs_pool * pool , unsigned long handle ,
enum zs_mapmode mm )
2012-01-10 02:51:56 +04:00
{
struct page * page ;
unsigned long obj_idx , off ;
unsigned int class_idx ;
enum fullness_group fg ;
struct size_class * class ;
struct mapping_area * area ;
2012-07-18 20:55:56 +04:00
struct page * pages [ 2 ] ;
2012-01-10 02:51:56 +04:00
BUG_ON ( ! handle ) ;
2012-07-18 20:55:55 +04:00
/*
* Because we use per - cpu mapping areas shared among the
* pools / users , we can ' t allow mapping in interrupt context
* because it can corrupt another users mappings .
*/
BUG_ON ( in_interrupt ( ) ) ;
2012-01-10 02:51:56 +04:00
obj_handle_to_location ( handle , & page , & obj_idx ) ;
get_zspage_mapping ( get_first_page ( page ) , & class_idx , & fg ) ;
class = & pool - > size_class [ class_idx ] ;
off = obj_idx_to_offset ( page , obj_idx , class - > size ) ;
area = & get_cpu_var ( zs_map_area ) ;
2012-07-18 20:55:56 +04:00
area - > vm_mm = mm ;
2012-01-10 02:51:56 +04:00
if ( off + class - > size < = PAGE_SIZE ) {
/* this object is contained entirely within a page */
area - > vm_addr = kmap_atomic ( page ) ;
2012-07-03 01:15:49 +04:00
return area - > vm_addr + off ;
2012-01-10 02:51:56 +04:00
}
2012-07-18 20:55:56 +04:00
/* this object spans two pages */
pages [ 0 ] = page ;
pages [ 1 ] = get_next_page ( page ) ;
BUG_ON ( ! pages [ 1 ] ) ;
2012-07-03 01:15:52 +04:00
2012-07-18 20:55:56 +04:00
return __zs_map_object ( area , pages , off , class - > size ) ;
2012-01-10 02:51:56 +04:00
}
EXPORT_SYMBOL_GPL ( zs_map_object ) ;
2012-06-08 10:39:25 +04:00
void zs_unmap_object ( struct zs_pool * pool , unsigned long handle )
2012-01-10 02:51:56 +04:00
{
struct page * page ;
unsigned long obj_idx , off ;
unsigned int class_idx ;
enum fullness_group fg ;
struct size_class * class ;
struct mapping_area * area ;
BUG_ON ( ! handle ) ;
obj_handle_to_location ( handle , & page , & obj_idx ) ;
get_zspage_mapping ( get_first_page ( page ) , & class_idx , & fg ) ;
class = & pool - > size_class [ class_idx ] ;
off = obj_idx_to_offset ( page , obj_idx , class - > size ) ;
2014-06-05 03:07:56 +04:00
area = this_cpu_ptr ( & zs_map_area ) ;
2012-07-18 20:55:56 +04:00
if ( off + class - > size < = PAGE_SIZE )
kunmap_atomic ( area - > vm_addr ) ;
else {
struct page * pages [ 2 ] ;
pages [ 0 ] = page ;
pages [ 1 ] = get_next_page ( page ) ;
BUG_ON ( ! pages [ 1 ] ) ;
2012-07-03 01:15:52 +04:00
2012-07-18 20:55:56 +04:00
__zs_unmap_object ( area , pages , off , class - > size ) ;
}
2012-01-10 02:51:56 +04:00
put_cpu_var ( zs_map_area ) ;
}
EXPORT_SYMBOL_GPL ( zs_unmap_object ) ;
u64 zs_get_total_size_bytes ( struct zs_pool * pool )
{
int i ;
u64 npages = 0 ;
for ( i = 0 ; i < ZS_SIZE_CLASSES ; i + + )
npages + = pool - > size_class [ i ] . pages_allocated ;
return npages < < PAGE_SHIFT ;
}
EXPORT_SYMBOL_GPL ( zs_get_total_size_bytes ) ;
2012-06-20 05:31:11 +04:00
/*
 * Module boilerplate: zs_init and zs_exit are registered as the module's
 * init and exit handlers, followed by the license and author tags.
 */
module_init ( zs_init ) ;
module_exit ( zs_exit ) ;
MODULE_LICENSE ( " Dual BSD/GPL " ) ;
MODULE_AUTHOR ( " Nitin Gupta <ngupta@vflare.org> " ) ;