/*
 *  fs/ext4/extents_status.c
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *	Allison Henderson <achender@linux.vnet.ibm.com>
 *	Hugh Dickins <hughd@google.com>
 *	Zheng Liu <wenqing.lz@taobao.com>
 *
 * Ext4 extents status tree core functions.
 */
#include <linux/rbtree.h>
#include "ext4.h"
#include "extents_status.h"
#include "ext4_extents.h"

#include <trace/events/ext4.h>

/*
 * According to previous discussions at the Ext4 Developer Workshop, we
 * will introduce a new structure called an io tree to track all extent
 * status in order to solve some problems that we have met
 * (e.g. reserved space warnings), and to provide extent-level locking.
 * The delayed extent tree is the first step towards this goal.  It was
 * originally built by Yongqiang Yang.  At that time it was called the
 * delay extent tree, and its only goal was to track delayed extents in
 * memory in order to simplify the implementation of fiemap and bigalloc,
 * and to introduce lseek SEEK_DATA/SEEK_HOLE support.  That is why it is
 * still called the delay extent tree in the comments below.  But to
 * better reflect what it does, it has been renamed to the extent status
 * tree.
 *
 * Currently the first step has been done.  All delayed extents are
 * tracked in the tree.  The tree is updated when a delayed allocation is
 * issued and when the delayed extent is written out or invalidated.
 * Therefore the implementation of fiemap and bigalloc is simplified, and
 * SEEK_DATA/SEEK_HOLE are introduced.
 *
 * The following comment describes the implementation of the extent
 * status tree and future work.
 */
/*
 * extents status tree implementation for ext4.
 *
 *
 * ==========================================================================
 * Extents status encompasses delayed extents and extent locks
 *
 * 1. Why delayed extent implementation?
 *
 * Without delayed extents, ext4 identifies a delayed extent by looking
 * up the page cache, which results in complicated, buggy, and
 * inefficient code.
 *
 * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
 * to know whether a block or a range of blocks belongs to a delayed
 * extent.
 *
 * Let us have a look at how they work without the delayed extents
 * implementation.
 *   -- FIEMAP
 *      FIEMAP looks up the page cache to identify delayed allocations
 *      from holes.
 *
 *   -- SEEK_HOLE/DATA
 *      SEEK_HOLE/DATA has the same problem as FIEMAP.
 *
 *   -- bigalloc
 *      bigalloc looks up the page cache to figure out whether a block is
 *      already under delayed allocation, in order to determine whether
 *      quota reservation is needed for the cluster.
 *
 *   -- punch hole
 *      punch hole looks up the page cache to identify a delayed extent.
 *
 *   -- writeout
 *      Writeout looks up the whole page cache to see whether a buffer is
 *      mapped.  If there are not very many delayed buffers, this is
 *      time consuming.
 *
 * With the delayed extents implementation, FIEMAP, SEEK_HOLE/DATA,
 * bigalloc and writeout can figure out whether a block or a range of
 * blocks is under delayed allocation (i.e. belongs to a delayed extent)
 * simply by searching the delayed extent tree.
 *
 *
 * ==========================================================================
 * 2. ext4 delayed extents implementation
 *
 *   -- delayed extent
 *      A delayed extent is a range of blocks which are logically
 *      contiguous and under delayed allocation.  Unlike an extent in
 *      ext4, a delayed extent is an in-memory structure; there is no
 *      corresponding on-disk data.  There is no limit on the length of
 *      a delayed extent, so a delayed extent can contain as many blocks
 *      as are logically contiguous.
 *
 *   -- delayed extent tree
 *      Every inode has a delayed extent tree and all blocks under
 *      delayed allocation are added to the tree as delayed extents.
 *      Delayed extents in the tree are ordered by logical block number.
 *
 *   -- operations on a delayed extent tree
 *      There are three operations on a delayed extent tree: finding the
 *      next delayed extent, adding a space (a range of blocks) and
 *      removing a space (see the usage sketch after this comment).
 *
 *   -- race on a delayed extent tree
 *      The delayed extent tree is protected by inode->i_es_lock.
 *
 *
 * ==========================================================================
 * 3. Performance analysis
 *
 *   -- overhead
 *      1. There is a cached extent for write access, so if writes are
 *         not very random, the adding-space operations run in O(1) time.
 *
 *   -- gain
 *      2. Code is much simpler, more readable, more maintainable and
 *         more efficient.
 *
 *
 * ==========================================================================
 * 4. TODO list
 *   -- Track all extent status
 *
 *   -- Improve the get block process
 *
 *   -- Extent-level locking
 */
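
/*
 * Illustrative usage sketch (not part of the build): how a hypothetical
 * caller might exercise the three tree operations described above.  The
 * inode, lblk and len variables below are assumptions for the example.
 *
 *	struct extent_status es;
 *
 *	// a delayed allocation is issued: record it in the tree
 *	ext4_es_insert_extent(inode, lblk, len);
 *
 *	// later, a lookup (e.g. SEEK_DATA) asks which delayed extent,
 *	// if any, covers or follows lblk
 *	es.start = lblk;
 *	ext4_es_find_extent(inode, &es);
 *
 *	// the blocks are written out (or invalidated): drop them
 *	ext4_es_remove_extent(inode, lblk, len);
 */
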
static struct kmem_cache *ext4_es_cachep;

int __init ext4_init_es(void)
{
        ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
        if (ext4_es_cachep == NULL)
                return -ENOMEM;
        return 0;
}

void ext4_exit_es(void)
{
        if (ext4_es_cachep)
                kmem_cache_destroy(ext4_es_cachep);
}

void ext4_es_init_tree(struct ext4_es_tree *tree)
{
        tree->root = RB_ROOT;
        tree->cache_es = NULL;
}

#ifdef ES_DEBUG__
static void ext4_es_print_tree(struct inode *inode)
{
        struct ext4_es_tree *tree;
        struct rb_node *node;

        printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
        tree = &EXT4_I(inode)->i_es_tree;
        node = rb_first(&tree->root);
        while (node) {
                struct extent_status *es;
                es = rb_entry(node, struct extent_status, rb_node);
                printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
                node = rb_next(node);
        }
        printk(KERN_DEBUG "\n");
}
#else
#define ext4_es_print_tree(inode)
#endif

static inline ext4_lblk_t extent_status_end(struct extent_status *es)
{
        BUG_ON(es->start + es->len < es->start);
        return es->start + es->len - 1;
}

/*
 * Search through the tree for a delayed extent with a given offset.  If
 * it can't be found, try to find the next extent.
 */
static struct extent_status *__es_tree_search(struct rb_root *root,
                                              ext4_lblk_t offset)
{
        struct rb_node *node = root->rb_node;
        struct extent_status *es = NULL;

        while (node) {
                es = rb_entry(node, struct extent_status, rb_node);
                if (offset < es->start)
                        node = node->rb_left;
                else if (offset > extent_status_end(es))
                        node = node->rb_right;
                else
                        return es;
        }

        if (es && offset < es->start)
                return es;

        if (es && offset > extent_status_end(es)) {
                node = rb_next(&es->rb_node);
                return node ? rb_entry(node, struct extent_status, rb_node) :
                              NULL;
        }

        return NULL;
}
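
/*
 * Worked example (assumed tree contents, for illustration only): if the
 * tree holds the delayed extents [5/3) (blocks 5..7) and [20/4) (blocks
 * 20..23), then __es_tree_search(root, 6) returns the [5/3) extent,
 * __es_tree_search(root, 10) falls between the two extents and returns
 * the next one, [20/4), and __es_tree_search(root, 30), which lies past
 * the last extent, returns NULL.
 */
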
/*
 * ext4_es_find_extent: find the 1st delayed extent covering @es->start
 * if it exists, otherwise the next extent after @es->start.
 *
 * @inode: the inode which owns delayed extents
 * @es: delayed extent that we found
 *
 * Returns the first block of the next extent after the found extent,
 * or EXT_MAX_BLOCKS if no delayed extent is found.
 * The found delayed extent is returned via @es.
 */
ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
{
        struct ext4_es_tree *tree = NULL;
        struct extent_status *es1 = NULL;
        struct rb_node *node;
        ext4_lblk_t ret = EXT_MAX_BLOCKS;

        trace_ext4_es_find_extent_enter(inode, es->start);

        read_lock(&EXT4_I(inode)->i_es_lock);
        tree = &EXT4_I(inode)->i_es_tree;

        /* find delayed extent in cache first */
        if (tree->cache_es) {
                es1 = tree->cache_es;
                if (in_range(es->start, es1->start, es1->len)) {
                        es_debug("%u cached by [%u/%u)\n",
                                 es->start, es1->start, es1->len);
                        goto out;
                }
        }

        es->len = 0;
        es1 = __es_tree_search(&tree->root, es->start);

out:
        if (es1) {
                tree->cache_es = es1;
                es->start = es1->start;
                es->len = es1->len;
                node = rb_next(&es1->rb_node);
                if (node) {
                        es1 = rb_entry(node, struct extent_status, rb_node);
                        ret = es1->start;
                }
        }

        read_unlock(&EXT4_I(inode)->i_es_lock);

        trace_ext4_es_find_extent_exit(inode, es, ret);

        return ret;
}
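
/*
 * Usage sketch (illustrative only; the variable names are assumptions):
 * a SEEK_DATA-style lookup can use the found extent and the return value
 * to classify a logical block.
 *
 *	struct extent_status es;
 *	ext4_lblk_t next;
 *
 *	es.start = lblk;
 *	next = ext4_es_find_extent(inode, &es);
 *
 * If es.len is non-zero and in_range(lblk, es.start, es.len) holds, lblk
 * itself is under delayed allocation.  If es.len is non-zero but lblk is
 * not in range, the first delayed extent after lblk starts at es.start.
 * If es.len is zero, no delayed extent exists at or after lblk and next
 * is EXT_MAX_BLOCKS.
 */
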
static struct extent_status *
ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
{
        struct extent_status *es;

        es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
        if (es == NULL)
                return NULL;
        es->start = start;
        es->len = len;
        return es;
}

static void ext4_es_free_extent(struct extent_status *es)
{
        kmem_cache_free(ext4_es_cachep, es);
}

static struct extent_status *
ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
{
        struct extent_status *es1;
        struct rb_node *node;

        node = rb_prev(&es->rb_node);
        if (!node)
                return es;

        es1 = rb_entry(node, struct extent_status, rb_node);
        if (es->start == extent_status_end(es1) + 1) {
                es1->len += es->len;
                rb_erase(&es->rb_node, &tree->root);
                ext4_es_free_extent(es);
                es = es1;
        }

        return es;
}

static struct extent_status *
ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
{
        struct extent_status *es1;
        struct rb_node *node;

        node = rb_next(&es->rb_node);
        if (!node)
                return es;

        es1 = rb_entry(node, struct extent_status, rb_node);
        if (es1->start == extent_status_end(es) + 1) {
                es->len += es1->len;
                rb_erase(node, &tree->root);
                ext4_es_free_extent(es1);
        }

        return es;
}
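
/*
 * Worked example (assumed tree contents, for illustration): if the tree
 * holds [10/10) (blocks 10..19) and [20/4) (blocks 20..23), the two
 * extents are adjacent, so ext4_es_try_to_merge_right() called on the
 * [10/10) node folds [20/4) into it, leaving a single [10/14) extent
 * covering blocks 10..23.  ext4_es_try_to_merge_left() handles the
 * mirror case with the previous node in the tree.
 */
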
static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
                              ext4_lblk_t len)
{
        struct rb_node **p = &tree->root.rb_node;
        struct rb_node *parent = NULL;
        struct extent_status *es;
        ext4_lblk_t end = offset + len - 1;

        BUG_ON(end < offset);

        es = tree->cache_es;
        if (es && offset == (extent_status_end(es) + 1)) {
                es_debug("cached by [%u/%u)\n", es->start, es->len);
                es->len += len;
                es = ext4_es_try_to_merge_right(tree, es);
                goto out;
        } else if (es && es->start == end + 1) {
                es_debug("cached by [%u/%u)\n", es->start, es->len);
                es->start = offset;
                es->len += len;
                es = ext4_es_try_to_merge_left(tree, es);
                goto out;
        } else if (es && es->start <= offset &&
                   end <= extent_status_end(es)) {
                es_debug("cached by [%u/%u)\n", es->start, es->len);
                goto out;
        }

        while (*p) {
                parent = *p;
                es = rb_entry(parent, struct extent_status, rb_node);

                if (offset < es->start) {
                        if (es->start == end + 1) {
                                es->start = offset;
                                es->len += len;
                                es = ext4_es_try_to_merge_left(tree, es);
                                goto out;
                        }
                        p = &(*p)->rb_left;
                } else if (offset > extent_status_end(es)) {
                        if (offset == extent_status_end(es) + 1) {
                                es->len += len;
                                es = ext4_es_try_to_merge_right(tree, es);
                                goto out;
                        }
                        p = &(*p)->rb_right;
                } else {
                        if (extent_status_end(es) <= end)
                                es->len = offset - es->start + len;
                        goto out;
                }
        }

        es = ext4_es_alloc_extent(offset, len);
        if (!es)
                return -ENOMEM;
        rb_link_node(&es->rb_node, parent, p);
        rb_insert_color(&es->rb_node, &tree->root);

out:
        tree->cache_es = es;
        return 0;
}
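
/*
 * Worked example (assumed cache state, for illustration): if cache_es
 * points at [10/5) (blocks 10..14) and a new space [15/5) is added, the
 * first fast-path branch in __es_insert_extent() matches (offset equals
 * the end of the cached extent plus one), so the cached extent is simply
 * grown to [10/10) and handed to ext4_es_try_to_merge_right(), without
 * walking the tree.
 */
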
/*
 * ext4_es_insert_extent() adds a space to a delayed extent tree.
 * It takes inode->i_es_lock itself, so the caller must not hold it.
 *
 * ext4_es_insert_extent() is called by ext4_da_write_begin; the
 * underlying __es_insert_extent() is also used by ext4_es_remove_extent()
 * when a removal splits an existing extent.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
                          ext4_lblk_t len)
{
        struct ext4_es_tree *tree;
        int err = 0;

        trace_ext4_es_insert_extent(inode, offset, len);

        es_debug("add [%u/%u) to extent status tree of inode %lu\n",
                 offset, len, inode->i_ino);

        write_lock(&EXT4_I(inode)->i_es_lock);
        tree = &EXT4_I(inode)->i_es_tree;
        err = __es_insert_extent(tree, offset, len);
        write_unlock(&EXT4_I(inode)->i_es_lock);

        ext4_es_print_tree(inode);

        return err;
}
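
/*
 * Usage sketch (illustrative, not taken from a real call site): when the
 * delalloc write path reserves a single block at logical block lblk of
 * an inode, recording the reservation would look roughly like this; the
 * surrounding error handling is an assumption.
 *
 *	int err = ext4_es_insert_extent(inode, lblk, 1);
 *	if (err)
 *		return err;	// -ENOMEM if a tree node could not be allocated
 */
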
/*
 * ext4_es_remove_extent() removes a space from a delayed extent tree.
 * It takes inode->i_es_lock itself, so the caller must not hold it.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
                          ext4_lblk_t len)
{
        struct rb_node *node;
        struct ext4_es_tree *tree;
        struct extent_status *es;
        struct extent_status orig_es;
        ext4_lblk_t len1, len2, end;
        int err = 0;

        trace_ext4_es_remove_extent(inode, offset, len);

        es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
                 offset, len, inode->i_ino);

        end = offset + len - 1;
        BUG_ON(end < offset);

        write_lock(&EXT4_I(inode)->i_es_lock);
        tree = &EXT4_I(inode)->i_es_tree;
        es = __es_tree_search(&tree->root, offset);
        if (!es)
                goto out;
        if (es->start > end)
                goto out;

        /* Simply invalidate cache_es. */
        tree->cache_es = NULL;

        orig_es.start = es->start;
        orig_es.len = es->len;
        len1 = offset > es->start ? offset - es->start : 0;
        len2 = extent_status_end(es) > end ?
               extent_status_end(es) - end : 0;
        if (len1 > 0)
                es->len = len1;
        if (len2 > 0) {
                if (len1 > 0) {
                        err = __es_insert_extent(tree, end + 1, len2);
                        if (err) {
                                es->start = orig_es.start;
                                es->len = orig_es.len;
                                goto out;
                        }
                } else {
                        es->start = end + 1;
                        es->len = len2;
                }
                goto out;
        }

        if (len1 > 0) {
                node = rb_next(&es->rb_node);
                if (node)
                        es = rb_entry(node, struct extent_status, rb_node);
                else
                        es = NULL;
        }

        while (es && extent_status_end(es) <= end) {
                node = rb_next(&es->rb_node);
                rb_erase(&es->rb_node, &tree->root);
                ext4_es_free_extent(es);
                if (!node) {
                        es = NULL;
                        break;
                }
                es = rb_entry(node, struct extent_status, rb_node);
        }

        if (es && es->start < end + 1) {
                len1 = extent_status_end(es) - end;
                es->start = end + 1;
                es->len = len1;
        }

out:
        write_unlock(&EXT4_I(inode)->i_es_lock);
        ext4_es_print_tree(inode);
        return err;
}
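
/*
 * Worked example (assumed tree contents, for illustration): if the tree
 * holds a single delayed extent [0/100) (blocks 0..99) and a hole is
 * punched over blocks 10..19, ext4_es_remove_extent(inode, 10, 10)
 * shrinks the existing extent to [0/10) and inserts a new extent
 * [20/80) for the remaining tail, so the removed range is no longer
 * tracked as delayed.
 */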