/*
 *  linux/fs/ext4/indirect.c
 *
 *  from
 *
 *  linux/fs/ext4/inode.c
 *
 *  Copyright (C) 1992, 1993, 1994, 1995
 *  Remy Card (card@masi.ibp.fr)
 *  Laboratoire MASI - Institut Blaise Pascal
 *  Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie
 *	(sct@redhat.com), 1993, 1998
 */
# include "ext4_jbd2.h"
# include "truncate.h"
# include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */
# include <trace/events/ext4.h>
typedef struct {
	__le32	*p;
	__le32	key;
	struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
	p->key = *(p->p = v);
	p->bh = bh;
}

/**
 * ext4_block_to_path - parse the block number into array of offsets
 * @inode: inode in question (we are only interested in its superblock)
 * @i_block: block number to be parsed
 * @offsets: array to store the offsets in
 * @boundary: set this non-zero if the referred-to block is likely to be
 *	followed (on disk) by an indirect block.
 *
 * To store the locations of file's data ext4 uses a data structure common
 * for UNIX filesystems - tree of pointers anchored in the inode, with
 * data blocks at leaves and indirect blocks in intermediate nodes.
 * This function translates the block number into path in that tree -
 * return value is the path length and @offsets[n] is the offset of
 * pointer to (n+1)th node in the nth one. If @i_block is out of range
 * (negative or too large) a warning is printed and zero returned.
 *
 * Note: function doesn't find node addresses, so no IO is needed. All
 * we need to know is the capacity of indirect blocks (taken from the
 * inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks. We might use long long, but that would
 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */
static int ext4_block_to_path(struct inode *inode,
			      ext4_lblk_t i_block,
			      ext4_lblk_t offsets[4], int *boundary)
{
	int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT4_NDIR_BLOCKS,
		indirect_blocks = ptrs,
		double_blocks = (1 << (ptrs_bits * 2));
	int n = 0;
	int final = 0;

	if (i_block < direct_blocks) {
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ((i_block -= direct_blocks) < indirect_blocks) {
		offsets[n++] = EXT4_IND_BLOCK;
		offsets[n++] = i_block;
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		offsets[n++] = EXT4_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = EXT4_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
			     i_block + direct_blocks +
			     indirect_blocks + double_blocks, inode->i_ino);
	}
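
	/*
	 * final is the fan-out of the last level we filled in; the
	 * expression below yields how many pointer slots remain in that
	 * level after this block, so zero means the very next logical
	 * block starts a new indirect block on disk.
	 */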
	if (boundary)
		*boundary = final - 1 - (i_block & (ptrs - 1));
	return n;
}

/**
 * ext4_get_branch - read the chain of indirect blocks leading to data
 * @inode: inode in question
 * @depth: depth of the chain (1 - direct pointer, etc.)
 * @offsets: offsets of pointers in inode/indirect blocks
 * @chain: place to store the result
 * @err: here we store the error value
 *
 * Function fills the array of triples <key, p, bh> and returns %NULL
 * if everything went OK or the pointer to the last filled triple
 * (incomplete one) otherwise. Upon the return chain[i].key contains
 * the number of (i+1)-th block in the chain (as it is stored in memory,
 * i.e. little-endian 32-bit), chain[i].p contains the address of that
 * number (it points into struct inode for i==0 and into the bh->b_data
 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 * block for i>0 and NULL for i==0. In other words, it holds the block
 * numbers of the chain, addresses they were taken from (and where we can
 * verify that chain did not change) and buffer_heads hosting these
 * numbers.
 *
 * Function stops when it stumbles upon zero pointer (absent block)
 *	(pointer to last triple returned, *@err == 0)
 * or when it gets an IO error reading an indirect block
 *	(ditto, *@err == -EIO)
 * or when it reads all @depth-1 indirect blocks successfully and finds
 * the whole chain, all way to the data (returns %NULL, *err == 0).
 *
 * Need to be called with
 * down_read(&EXT4_I(inode)->i_data_sem)
 */
static Indirect *ext4_get_branch(struct inode *inode, int depth,
				 ext4_lblk_t *offsets,
				 Indirect chain[4], int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;
	int ret = -EIO;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
	if (!p->key)
		goto no_block;
	while (--depth) {
		bh = sb_getblk(sb, le32_to_cpu(p->key));
		if (unlikely(!bh)) {
			ret = -ENOMEM;
			goto failure;
		}

		if (!bh_uptodate_or_lock(bh)) {
			if (bh_submit_read(bh) < 0) {
				put_bh(bh);
				goto failure;
			}
			/* validate block references */
			if (ext4_check_indirect_blockref(inode, bh)) {
				put_bh(bh);
				goto failure;
			}
		}

		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
		/* Reader: end */
		if (!p->key)
			goto no_block;
	}
	return NULL;

failure:
	*err = ret;
no_block:
	return p;
}

/**
 * ext4_find_near - find a place for allocation with sufficient locality
 * @inode: owner
 * @ind: descriptor of indirect block.
 *
 * This function returns the preferred place for block allocation.
 * It is used when heuristic for sequential allocation fails.
 * Rules are:
 *	+ if there is a block to the left of our position - allocate near it.
 *	+ if pointer will live in indirect block - allocate near that block.
 *	+ if pointer will live in inode - allocate in the same
 *	  cylinder group.
 *
 * In the latter case we colour the starting block by the caller's PID to
 * prevent it from clashing with concurrent allocations for a different inode
 * in the same block group.  The PID is used here so that functionally related
 * files will be close-by on-disk.
 *
 * Caller must make sure that @ind is valid and will stay that way.
 */
static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
	__le32 *p;

	/* Try to find previous block */
	for (p = ind->p - 1; p >= start; p--) {
		if (*p)
			return le32_to_cpu(*p);
	}

	/* No such thing, so let's try location of indirect block */
	if (ind->bh)
		return ind->bh->b_blocknr;

	/*
	 * It is going to be referred to from the inode itself? OK, just put it
	 * into the same cylinder group then.
	 */
	return ext4_inode_to_goal_block(inode);
}

/**
 * ext4_find_goal - find a preferred place for allocation.
 * @inode: owner
 * @block: block we want
 * @partial: pointer to the last triple within a chain
 *
 * Normally this function finds the preferred place for block allocation
 * and returns it.
 * Because this is only used for non-extent files, we limit the block nr
 * to 32 bits.
 */
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
				   Indirect *partial)
{
	ext4_fsblk_t goal;

	/*
	 * XXX need to get goal block from mballoc's data structures
	 */

	goal = ext4_find_near(inode, partial);
	goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
	return goal;
}

/**
 * ext4_blks_to_allocate - Look up the block map and count the number
 * of direct blocks that need to be allocated for the given branch.
 *
 * @branch: chain of indirect blocks
 * @k: number of blocks needed for indirect blocks
 * @blks: number of data blocks to be mapped.
 * @blocks_to_boundary: the offset in the indirect block
 *
 * return the total number of blocks to be allocated, including the
 * direct and indirect blocks.
 */
static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
				 int blocks_to_boundary)
{
	unsigned int count = 0;

	/*
	 * Simple case: the [t,d]indirect block(s) have not been allocated
	 * yet, so it is clear that the blocks on that path have not been
	 * allocated either.
	 */
	if (k > 0) {
		/* right now we don't handle cross boundary allocation */
		if (blks < blocks_to_boundary + 1)
			count += blks;
		else
			count += blocks_to_boundary + 1;
		return count;
	}
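
	/*
	 * Otherwise the branch exists down to the direct level: count how
	 * many adjacent pointer slots after branch[0].p are still
	 * unallocated, up to the requested length and the indirect-block
	 * boundary.
	 */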
	count++;
	while (count < blks && count <= blocks_to_boundary &&
	       le32_to_cpu(*(branch[0].p + count)) == 0) {
		count++;
	}
	return count;
}

/**
 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
 * @handle: handle for this transaction
 * @inode: inode which needs allocated blocks
 * @iblock: the logical block to start allocating at
 * @goal: preferred physical block of allocation
 * @indirect_blks: the number of blocks that need to be allocated for
 *	indirect blocks
 * @blks: number of desired blocks
 * @new_blocks: on return it will store the new block numbers for
 *	the indirect blocks (if needed) and the first direct block,
 * @err: on return it will store the error code
 *
 * This function will return the number of blocks allocated as
 * requested by the passed-in parameters.
 */
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
			     ext4_lblk_t iblock, ext4_fsblk_t goal,
			     int indirect_blks, int blks,
			     ext4_fsblk_t new_blocks[4], int *err)
{
	struct ext4_allocation_request ar;
	int target, i;
	unsigned long count = 0, blk_allocated = 0;
	int index = 0;
	ext4_fsblk_t current_block = 0;
	int ret = 0;

	/*
	 * Here we try to allocate the requested multiple blocks at once,
	 * on a best-effort basis.
	 * To build a branch, we should allocate blocks for
	 * the indirect blocks (if not allocated yet), and at least
	 * the first direct block of this branch.  That's the
	 * minimum number of blocks need to allocate (required)
	 */
	/* first we try to allocate the indirect blocks */
	target = indirect_blks;
	while (target > 0) {
		count = target;
		/* allocating blocks for indirect blocks and direct blocks */
		current_block = ext4_new_meta_blocks(handle, inode, goal,
						     0, &count, err);
		if (*err)
			goto failed_out;

		if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
			EXT4_ERROR_INODE(inode,
					 "current_block %llu + count %lu > %d!",
					 current_block, count,
					 EXT4_MAX_BLOCK_FILE_PHYS);
			*err = -EIO;
			goto failed_out;
		}

		target -= count;
		/* allocate blocks for indirect blocks */
		while (index < indirect_blks && count) {
			new_blocks[index++] = current_block++;
			count--;
		}
		if (count > 0) {
			/*
			 * save the new block number
			 * for the first direct block
			 */
			new_blocks[index] = current_block;
			printk(KERN_INFO "%s returned more blocks than "
					 "requested\n", __func__);
			WARN_ON(1);
			break;
		}
	}
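
	/*
	 * The metadata loop above may already have handed us the first
	 * direct block(s); only the remainder still needs a data
	 * allocation.
	 */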
	target = blks - count;
	blk_allocated = count;
	if (!target)
		goto allocated;
	/* Now allocate data blocks */
	memset(&ar, 0, sizeof(ar));
	ar.inode = inode;
	ar.goal = goal;
	ar.len = target;
	ar.logical = iblock;
	if (S_ISREG(inode->i_mode))
		/* enable in-core preallocation only for regular files */
		ar.flags = EXT4_MB_HINT_DATA;

	current_block = ext4_mb_new_blocks(handle, &ar, err);
	if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
		EXT4_ERROR_INODE(inode,
				 "current_block %llu + ar.len %d > %d!",
				 current_block, ar.len,
				 EXT4_MAX_BLOCK_FILE_PHYS);
		*err = -EIO;
		goto failed_out;
	}

	if (*err && (target == blks)) {
		/*
		 * if the allocation failed and we didn't allocate
		 * any blocks before
		 */
		goto failed_out;
	}
	if (!*err) {
		if (target == blks) {
			/*
			 * save the new block number
			 * for the first direct block
			 */
			new_blocks[index] = current_block;
		}
		blk_allocated += ar.len;
	}
allocated:
	/* total number of blocks allocated for direct blocks */
	ret = blk_allocated;
	*err = 0;
	return ret;
failed_out:
	for (i = 0; i < index; i++)
		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
	return ret;
}

/**
 * ext4_alloc_branch - allocate and set up a chain of blocks.
 * @handle: handle for this transaction
 * @inode: owner
 * @indirect_blks: number of allocated indirect blocks
 * @blks: number of allocated direct blocks
 * @goal: preferred place for allocation
 * @offsets: offsets (in the blocks) to store the pointers to next.
 * @branch: place to store the chain in.
 *
 * This function allocates blocks, zeroes out all but the last one,
 * links them into chain and (if we are synchronous) writes them to disk.
 * In other words, it prepares a branch that can be spliced onto the
 * inode. It stores the information about that chain in the branch[], in
 * the same format as ext4_get_branch() would do. We are calling it after
 * we had read the existing part of chain and partial points to the last
 * triple of that (one with zero ->key). Upon the exit we have the same
 * picture as after the successful ext4_get_block(), except that in one
 * place chain is disconnected - *branch->p is still zero (we did not
 * set the last link), but branch->key contains the number that should
 * be placed into *branch->p to fill that gap.
 *
 * If allocation fails we free all blocks we've allocated (and forget
 * their buffer_heads) and return the error value from the failed
 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
 * as described above and return 0.
 */
static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
			     ext4_lblk_t iblock, int indirect_blks,
			     int *blks, ext4_fsblk_t goal,
			     ext4_lblk_t *offsets, Indirect *branch)
{
	int blocksize = inode->i_sb->s_blocksize;
	int i, n = 0;
	int err = 0;
	struct buffer_head *bh;
	int num;
	ext4_fsblk_t new_blocks[4];
	ext4_fsblk_t current_block;

	num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
				*blks, new_blocks, &err);
	if (err)
		return err;

	branch[0].key = cpu_to_le32(new_blocks[0]);
	/*
	 * metadata blocks and data blocks are allocated.
	 */
	for (n = 1; n <= indirect_blks; n++) {
		/*
		 * Get buffer_head for parent block, zero it out
		 * and set the pointer to new one, then send
		 * parent to disk.
		 */
		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
		if (unlikely(!bh)) {
			err = -ENOMEM;
			goto failed;
		}

		branch[n].bh = bh;
		lock_buffer(bh);
		BUFFER_TRACE(bh, "call get_create_access");
		err = ext4_journal_get_create_access(handle, bh);
		if (err) {
			/* Don't brelse(bh) here; it's done in
			 * ext4_journal_forget() below */
			unlock_buffer(bh);
			goto failed;
		}

		memset(bh->b_data, 0, blocksize);
		branch[n].p = (__le32 *) bh->b_data + offsets[n];
		branch[n].key = cpu_to_le32(new_blocks[n]);
		*branch[n].p = branch[n].key;
		if (n == indirect_blks) {
			current_block = new_blocks[n];
			/*
			 * End of chain, update the last new metablock of
			 * the chain to point to the new allocated
			 * data blocks numbers
			 */
			for (i = 1; i < num; i++)
				*(branch[n].p + i) = cpu_to_le32(++current_block);
		}
		BUFFER_TRACE(bh, "marking uptodate");
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
		err = ext4_handle_dirty_metadata(handle, inode, bh);
		if (err)
			goto failed;
	}
	*blks = num;
	return err;
failed:
	/* Allocation failed, free what we already allocated */
	ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
	for (i = 1; i <= n; i++) {
		/*
		 * branch[i].bh is newly allocated, so there is no
		 * need to revoke the block, which is why we don't
		 * need to set EXT4_FREE_BLOCKS_METADATA.
		 */
		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
				 EXT4_FREE_BLOCKS_FORGET);
	}
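	/*
	 * Free the indirect blocks we allocated but never linked, and then
	 * the contiguous run of num direct blocks starting at
	 * new_blocks[indirect_blks].
	 */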
	for (i = n + 1; i < indirect_blks; i++)
		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);

	ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);

	return err;
}

/**
 * ext4_splice_branch - splice the allocated branch onto inode.
 * @handle: handle for this transaction
 * @inode: owner
 * @block: (logical) number of block we are adding
 * @chain: chain of indirect blocks (with a missing link - see
 *	ext4_alloc_branch)
 * @where: location of missing link
 * @num: number of indirect blocks we are adding
 * @blks: number of direct blocks we are adding
 *
 * This function fills the missing link and does all housekeeping needed in
 * inode (->i_blocks, etc.). In case of success we end up with the full
 * chain to new block and return 0.
 */
static int ext4_splice_branch(handle_t *handle, struct inode *inode,
			      ext4_lblk_t block, Indirect *where, int num,
			      int blks)
{
	int i;
	int err = 0;
	ext4_fsblk_t current_block;

	/*
	 * If we're splicing into a [td]indirect block (as opposed to the
	 * inode) then we need to get write access to the [td]indirect block
	 * before the splice.
	 */
	if (where->bh) {
		BUFFER_TRACE(where->bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, where->bh);
		if (err)
			goto err_out;
	}
	/* That's it */

	*where->p = where->key;
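	/*
	 * The missing link is now in place, so the whole new branch is
	 * reachable from the inode.
	 */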

	/*
	 * Update the host buffer_head or inode to point to the just
	 * allocated direct blocks
	 */
	if (num == 0 && blks > 1) {
		current_block = le32_to_cpu(where->key) + 1;
		for (i = 1; i < blks; i++)
			*(where->p + i) = cpu_to_le32(current_block++);
	}

	/* We are done with atomic stuff, now do the rest of housekeeping */
	/* had we spliced it onto indirect block? */
	if (where->bh) {
		/*
		 * If we spliced it onto an indirect block, we haven't
		 * altered the inode.  Note however that if it is being spliced
		 * onto an indirect block at the very end of the file (the
		 * file is growing) then we *will* alter the inode to reflect
		 * the new i_size.  But that is not done here - it is done in
		 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
		 */
		jbd_debug(5, "splicing indirect only\n");
		BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
		err = ext4_handle_dirty_metadata(handle, inode, where->bh);
		if (err)
			goto err_out;
	} else {
		/*
		 * OK, we spliced it into the inode itself on a direct block.
		 */
		ext4_mark_inode_dirty(handle, inode);
		jbd_debug(5, "splicing direct\n");
	}
	return err;

err_out:
	for (i = 1; i <= num; i++) {
		/*
		 * branch[i].bh is newly allocated, so there is no
		 * need to revoke the block, which is why we don't
		 * need to set EXT4_FREE_BLOCKS_METADATA.
		 */
		ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
				 EXT4_FREE_BLOCKS_FORGET);
	}
	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
			 blks, 0);

	return err;
}

/*
 * The ext4_ind_map_blocks() function handles non-extents inodes
 * (i.e., using the traditional indirect/double-indirect i_blocks
 * scheme) for ext4_map_blocks().
 *
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * `handle' can be NULL if create == 0.
 *
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case.
 *
 * The ext4_ind_get_blocks() function should be called with
 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
 * blocks.
 */
int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
			struct ext4_map_blocks *map,
			int flags)
{
	int err = -EIO;
	ext4_lblk_t offsets[4];
	Indirect chain[4];
	Indirect *partial;
	ext4_fsblk_t goal;
	int indirect_blks;
	int blocks_to_boundary = 0;
	int depth;
	int count = 0;
	ext4_fsblk_t first_block = 0;

	trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
				   &blocks_to_boundary);

	if (depth == 0)
		goto out;

	partial = ext4_get_branch(inode, depth, offsets, chain, &err);

	/* Simplest case - block found, no allocation needed */
	if (!partial) {
		first_block = le32_to_cpu(chain[depth - 1].key);
		count++;
		/* map more blocks */
		while (count < map->m_len && count <= blocks_to_boundary) {
			ext4_fsblk_t blk;

			blk = le32_to_cpu(*(chain[depth-1].p + count));

			if (blk == first_block + count)
				count++;
			else
				break;
		}
		goto got_it;
	}

	/* Next simple case - plain lookup or failed read of indirect block */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
		goto cleanup;

	/*
	 * Okay, we need to do block allocation.
	 */
	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
				       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
		EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
				 "non-extent mapped inodes with bigalloc");
		return -ENOSPC;
	}

	goal = ext4_find_goal(inode, map->m_lblk, partial);

	/* the number of blocks that need to be allocated for [d,t]indirect blocks */
	indirect_blks = (chain + depth) - partial - 1;

	/*
	 * Next look up the indirect map to count the total number of
	 * direct blocks to allocate for this branch.
	 */
	count = ext4_blks_to_allocate(partial, indirect_blks,
				      map->m_len, blocks_to_boundary);
	/*
	 * Block out ext4_truncate while we alter the tree
	 */
	err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
				&count, goal,
				offsets + (partial - chain), partial);

	/*
	 * The ext4_splice_branch call will free and forget any buffers
	 * on the new chain if there is a failure, but that risks using
	 * up transaction credits, especially for bitmaps where the
	 * credits cannot be returned.  Can we handle this somehow?  We
	 * may need to return -EAGAIN upwards in the worst case.  --sct
	 */
	if (!err)
		err = ext4_splice_branch(handle, inode, map->m_lblk,
					 partial, indirect_blks, count);
	if (err)
		goto cleanup;

	map->m_flags |= EXT4_MAP_NEW;

	ext4_update_inode_fsync_trans(handle, inode, 1);
got_it:
	map->m_flags |= EXT4_MAP_MAPPED;
	map->m_pblk = le32_to_cpu(chain[depth-1].key);
	map->m_len = count;
	if (count > blocks_to_boundary)
		map->m_flags |= EXT4_MAP_BOUNDARY;
	err = count;
	/* Clean up and exit */
	partial = chain + depth - 1;	/* the whole chain */
cleanup:
	while (partial > chain) {
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse(partial->bh);
		partial--;
	}
out:
	trace_ext4_ind_map_blocks_exit(inode, map, err);
	return err;
}

/*
 * O_DIRECT for ext3 (or indirect map) based files
 *
 * If the O_DIRECT write will extend the file then add this inode to the
 * orphan list.  So recovery will truncate it back to the original size
 * if the machine crashes during the write.
 *
 * If the O_DIRECT write is instantiating holes inside i_size and the machine
 * crashes then stale disk data _may_ be exposed inside the file. But current
 * VFS code falls back into buffered path in that case so we are safe.
 */
ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
			   const struct iovec *iov, loff_t offset,
			   unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct ext4_inode_info *ei = EXT4_I(inode);
	handle_t *handle;
	ssize_t ret;
	int orphan = 0;
	size_t count = iov_length(iov, nr_segs);
	int retries = 0;

	if (rw == WRITE) {
		loff_t final_size = offset + count;

		if (final_size > inode->i_size) {
			/* Credits for sb + inode write */
			handle = ext4_journal_start(inode, 2);
			if (IS_ERR(handle)) {
				ret = PTR_ERR(handle);
				goto out;
			}
			ret = ext4_orphan_add(handle, inode);
			if (ret) {
				ext4_journal_stop(handle);
				goto out;
			}
			orphan = 1;
			ei->i_disksize = inode->i_size;
			ext4_journal_stop(handle);
		}
	}
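
	/*
	 * On -ENOSPC the code after the I/O below jumps back here once
	 * ext4_should_retry_alloc() has tried to free space by forcing a
	 * journal commit.
	 */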
retry:
	if (rw == READ && ext4_should_dioread_nolock(inode)) {
		if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
			mutex_lock(&inode->i_mutex);
			ext4_flush_unwritten_io(inode);
			mutex_unlock(&inode->i_mutex);
		}
		/*
		 * Nolock dioread optimization may be dynamically disabled
		 * via ext4_inode_block_unlocked_dio(). Check inode's state
		 * while holding extra i_dio_count ref.
		 */
		atomic_inc(&inode->i_dio_count);
		smp_mb();
		if (unlikely(ext4_test_inode_state(inode,
						   EXT4_STATE_DIOREAD_LOCK))) {
			inode_dio_done(inode);
			goto locked;
		}
		ret = __blockdev_direct_IO(rw, iocb, inode,
					   inode->i_sb->s_bdev, iov,
					   offset, nr_segs,
					   ext4_get_block, NULL, NULL, 0);
		inode_dio_done(inode);
	} else {
locked:
		ret = blockdev_direct_IO(rw, iocb, inode, iov,
					 offset, nr_segs, ext4_get_block);

		if (unlikely((rw & WRITE) && ret < 0)) {
			loff_t isize = i_size_read(inode);
			loff_t end = offset + iov_length(iov, nr_segs);

			if (end > isize)
				ext4_truncate_failed_write(inode);
		}
	}
	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;

	if (orphan) {
		int err;

		/* Credits for sb + inode write */
		handle = ext4_journal_start(inode, 2);
		if (IS_ERR(handle)) {
			/* This is really bad luck. We've written the data
			 * but cannot extend i_size. Bail out and pretend
			 * the write failed... */
			ret = PTR_ERR(handle);
			if (inode->i_nlink)
				ext4_orphan_del(NULL, inode);

			goto out;
		}
		if (inode->i_nlink)
			ext4_orphan_del(handle, inode);
		if (ret > 0) {
			loff_t end = offset + ret;
			if (end > inode->i_size) {
				ei->i_disksize = end;
				i_size_write(inode, end);
				/*
				 * We're going to return a positive `ret'
				 * here due to non-zero-length I/O, so there's
				 * no way of reporting error returns from
				 * ext4_mark_inode_dirty() to userspace.  So
				 * ignore it.
				 */
				ext4_mark_inode_dirty(handle, inode);
			}
		}
		err = ext4_journal_stop(handle);
		if (ret == 0)
			ret = err;
	}
out:
	return ret;
}

/*
 * Calculate the number of metadata blocks that need to be reserved to
 * allocate a new block at @lblock for a non-extent-based file.
 */
int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
	int blk_bits;

	if (lblock < EXT4_NDIR_BLOCKS)
		return 0;

	lblock -= EXT4_NDIR_BLOCKS;
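
	/*
	 * Logical blocks that share the same indirect block also share its
	 * metadata; if the last reservation already covered this indirect
	 * block, just extend its length instead of reserving again.
	 */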
	if (ei->i_da_metadata_calc_len &&
	    (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
		ei->i_da_metadata_calc_len++;
		return 0;
	}
	ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
	ei->i_da_metadata_calc_len = 1;
	blk_bits = order_base_2(lblock);
	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
}

int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
{
	int indirects;

	/* if nrblocks are contiguous */
	if (chunk) {
		/*
		 * With N contiguous data blocks, we need at most
		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
		 * 2 dindirect blocks, and 1 tindirect block
		 */
		return DIV_ROUND_UP(nrblocks,
				    EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
	}
	/*
	 * if nrblocks are not contiguous, worst case, each block touches
	 * an indirect block, and each indirect block touches a double
	 * indirect block, plus a triple indirect block
	 */
	indirects = nrblocks * 2 + 1;
	return indirects;
}

/*
 * Truncate transactions can be complex and absolutely huge.  So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * start_transaction gets us a new handle for a truncate transaction,
 * and extend_transaction tries to extend the existing one a bit.  If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 */
static handle_t *start_transaction(struct inode *inode)
{
	handle_t *result;

	result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode));
	if (!IS_ERR(result))
		return result;

	ext4_std_error(inode->i_sb, PTR_ERR(result));
	return result;
}

/*
 * Try to extend this transaction for the purposes of truncation.
 *
 * Returns 0 if we managed to create more room.  If we can't create more
 * room, and the transaction must be restarted we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
	if (!ext4_handle_valid(handle))
		return 0;
	if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
		return 0;
	if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode)))
		return 0;
	return 1;
}

/*
 * Probably it should be a library function... search for first non-zero word
 * or memcmp with zero_page, whatever is better for particular architecture.
 * Linus?
 */
static inline int all_zeroes(__le32 *p, __le32 *q)
{
	while (p < q)
		if (*p++)
			return 0;
	return 1;
}

/**
 * ext4_find_shared - find the indirect blocks for partial truncation.
 * @inode: inode in question
 * @depth: depth of the affected branch
 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
 * @chain: place to store the pointers to partial indirect blocks
 * @top: place to the (detached) top of branch
 *
 * This is a helper function used by ext4_truncate().
 *
 * When we do truncate() we may have to clean the ends of several
 * indirect blocks but leave the blocks themselves alive. Block is
 * partially truncated if some data below the new i_size is referred
 * from it (and it is on the path to the first completely truncated
 * data block, indeed).  We have to free the top of that path along
 * with everything to the right of the path. Since no allocation
 * past the truncation point is possible until ext4_truncate()
 * finishes, we may safely do the latter, but top of branch may
 * require special attention - pageout below the truncation point
 * might try to populate it.
 *
 * We atomically detach the top of branch from the tree, store the
 * block number of its root in *@top, pointers to buffer_heads of
 * partially truncated blocks - in @chain[].bh and pointers to
 * their last elements that should not be removed - in
 * @chain[].p. Return value is the pointer to last filled element
 * of @chain.
 *
 * The work left to caller to do the actual freeing of subtrees:
 *	a) free the subtree starting from *@top
 *	b) free the subtrees whose roots are stored in
 *		(@chain[i].p+1 .. end of @chain[i].bh->b_data)
 *	c) free the subtrees growing from the inode past the @chain[0].
 *		(no partially truncated stuff there).
 */
static Indirect *ext4_find_shared(struct inode *inode, int depth,
				  ext4_lblk_t offsets[4], Indirect chain[4],
				  __le32 *top)
{
	Indirect *partial, *p;
	int k, err;

	*top = 0;
	/* Make k index the deepest non-null offset + 1 */
	for (k = depth; k > 1 && !offsets[k-1]; k--)
		;
	partial = ext4_get_branch(inode, k, offsets, chain, &err);
	/* Writer: pointers */
	if (!partial)
		partial = chain + k-1;
	/*
	 * If the branch acquired continuation since we've looked at it -
	 * fine, it should all survive and (new) top doesn't belong to us.
	 */
	if (!partial->key && *partial->p)
		/* Writer: end */
		goto no_top;
	for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
		;
	/*
	 * OK, we've found the last block that must survive. The rest of our
	 * branch should be detached before unlocking. However, if that rest
	 * of branch is all ours and does not grow immediately from the inode
	 * it's easier to cheat and just decrement partial->p.
	 */
	if (p == chain + k - 1 && p > chain) {
		p->p--;
	} else {
		*top = *p->p;
		/* Nope, don't do this in ext4.  Must leave the tree intact */
#if 0
		*p->p = 0;
#endif
	}
	/* Writer: end */

	while (partial > p) {
		brelse(partial->bh);
		partial--;
	}
no_top:
	return partial;
}

/*
 * Zero a number of block pointers in either an inode or an indirect block.
 * If we restart the transaction we must again get write access to the
 * indirect block for further modification.
 *
 * We release `count' blocks on disk, but (last - first) may be greater
 * than `count' because there can be holes in there.
 *
 * Return 0 on success, 1 on invalid block range
 * and < 0 on fatal error.
 */
static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
			     struct buffer_head *bh,
			     ext4_fsblk_t block_to_free,
			     unsigned long count, __le32 *first,
			     __le32 *last)
{
	__le32 *p;
	int	flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
	int	err;

	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
		flags |= EXT4_FREE_BLOCKS_METADATA;

	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
				   count)) {
		EXT4_ERROR_INODE(inode, "attempt to clear invalid "
				 "blocks %llu len %lu",
				 (unsigned long long) block_to_free, count);
		return 1;
	}
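
	/*
	 * The transaction may be nearly full: push out the metadata dirtied
	 * so far, restart the handle, and retake write access to the
	 * indirect block before clearing any pointers.
	 */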
	if (try_to_extend_transaction(handle, inode)) {
		if (bh) {
			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
			err = ext4_handle_dirty_metadata(handle, inode, bh);
			if (unlikely(err))
				goto out_err;
		}
		err = ext4_mark_inode_dirty(handle, inode);
		if (unlikely(err))
			goto out_err;
		err = ext4_truncate_restart_trans(handle, inode,
					ext4_blocks_for_truncate(inode));
		if (unlikely(err))
			goto out_err;
		if (bh) {
			BUFFER_TRACE(bh, "retaking write access");
			err = ext4_journal_get_write_access(handle, bh);
			if (unlikely(err))
				goto out_err;
		}
	}

	for (p = first; p < last; p++)
		*p = 0;

	ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
	return 0;
out_err:
	ext4_std_error(inode->i_sb, err);
	return err;
}

/**
 * ext4_free_data - free a list of data blocks
 * @handle: handle for this transaction
 * @inode: inode we are dealing with
 * @this_bh: indirect buffer_head which contains *@first and *@last
 * @first: array of block numbers
 * @last: points immediately past the end of array
 *
 * We are freeing all blocks referred from that array (numbers are stored as
 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
 *
 * We accumulate contiguous runs of blocks to free.  Conveniently, if these
 * blocks are contiguous then releasing them at one time will only affect one
 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
 * actually use a lot of journal space.
 *
 * @this_bh will be %NULL if @first and @last point into the inode's direct
 * block pointers.
 */
static void ext4_free_data(handle_t *handle, struct inode *inode,
			   struct buffer_head *this_bh,
			   __le32 *first, __le32 *last)
{
	ext4_fsblk_t block_to_free = 0;    /* Starting block # of a run */
	unsigned long count = 0;	   /* Number of blocks in the run */
	__le32 *block_to_free_p = NULL;	   /* Pointer into inode/ind
					      corresponding to
					      block_to_free */
	ext4_fsblk_t nr;		   /* Current block # */
	__le32 *p;			   /* Pointer into inode/ind
					      for current block */
	int err = 0;

	if (this_bh) {				/* For indirect block */
		BUFFER_TRACE(this_bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, this_bh);
		/* Important: if we can't update the indirect pointers
		 * to the blocks, we can't free them. */
		if (err)
			return;
	}

	for (p = first; p < last; p++) {
		nr = le32_to_cpu(*p);
		if (nr) {
			/* accumulate blocks to free if they're contiguous */
			if (count == 0) {
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			} else if (nr == block_to_free + count) {
				count++;
			} else {
				err = ext4_clear_blocks(handle, inode, this_bh,
							block_to_free, count,
							block_to_free_p, p);
				if (err)
					break;
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			}
		}
	}

	if (!err && count > 0)
		err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
					count, block_to_free_p, p);
	if (err < 0)
		/* fatal error */
		return;

	if (this_bh) {
		BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");

		/*
		 * The buffer head should have an attached journal head at this
		 * point. However, if the data is corrupted and an indirect
		 * block pointed to itself, it would have been detached when
		 * the block was cleared. Check for this instead of OOPSing.
		 */
		if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
			ext4_handle_dirty_metadata(handle, inode, this_bh);
		else
			EXT4_ERROR_INODE(inode,
					 "circular indirect block detected at "
					 "block %llu",
				(unsigned long long) this_bh->b_blocknr);
	}
}

/**
 * ext4_free_branches - free an array of branches
 * @handle: JBD handle for this transaction
 * @inode: inode we are dealing with
 * @parent_bh: the buffer_head which contains *@first and *@last
 * @first: array of block numbers
 * @last: pointer immediately past the end of array
 * @depth: depth of the branches to free
 *
 * We are freeing all blocks referred from these branches (numbers are
 * stored as little-endian 32-bit) and updating @inode->i_blocks
 * appropriately.
 */
static void ext4_free_branches(handle_t *handle, struct inode *inode,
			       struct buffer_head *parent_bh,
			       __le32 *first, __le32 *last, int depth)
{
	ext4_fsblk_t nr;
	__le32 *p;

	if (ext4_handle_is_aborted(handle))
		return;

	if (depth--) {
		struct buffer_head *bh;
		int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
		p = last;
		while (--p >= first) {
			nr = le32_to_cpu(*p);
			if (!nr)
				continue;		/* A hole */

			if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
						   nr, 1)) {
				EXT4_ERROR_INODE(inode,
						 "invalid indirect mapped "
						 "block %lu (level %d)",
						 (unsigned long) nr, depth);
				break;
			}

			/* Go read the buffer for the next level down */
			bh = sb_bread(inode->i_sb, nr);

			/*
			 * A read failure? Report error and clear slot
			 * (should be rare).
			 */
			if (!bh) {
				EXT4_ERROR_INODE_BLOCK(inode, nr,
						       "Read failure");
				continue;
			}

			/* This zaps the entire block.  Bottom up. */
			BUFFER_TRACE(bh, "free child branches");
			ext4_free_branches(handle, inode, bh,
					   (__le32 *) bh->b_data,
					   (__le32 *) bh->b_data + addr_per_block,
					   depth);
			brelse(bh);

			/*
			 * Everything below this pointer has been
			 * released.  Now let this top-of-subtree go.
			 *
			 * We want the freeing of this indirect block to be
			 * atomic in the journal with the updating of the
			 * bitmap block which owns it.  So make some room in
			 * the journal.
			 *
			 * We zero the parent pointer *after* freeing its
			 * pointee in the bitmaps, so if extend_transaction()
			 * for some reason fails to put the bitmap changes and
			 * the release into the same transaction, recovery
			 * will merely complain about releasing a free block,
			 * rather than leaking blocks.
			 */
			if (ext4_handle_is_aborted(handle))
				return;
			if (try_to_extend_transaction(handle, inode)) {
				ext4_mark_inode_dirty(handle, inode);
				ext4_truncate_restart_trans(handle, inode,
					    ext4_blocks_for_truncate(inode));
			}

			/*
			 * The forget flag here is critical because if
			 * we are journaling (and not doing data
			 * journaling), we have to make sure a revoke
			 * record is written to prevent the journal
			 * replay from overwriting the (former)
			 * indirect block if it gets reallocated as a
			 * data block.  This must happen in the same
			 * transaction where the data blocks are
			 * actually freed.
			 */
			ext4_free_blocks(handle, inode, NULL, nr, 1,
					 EXT4_FREE_BLOCKS_METADATA|
					 EXT4_FREE_BLOCKS_FORGET);

			if (parent_bh) {
				/*
				 * The block which we have just freed is
				 * pointed to by an indirect block: journal it
				 */
				BUFFER_TRACE(parent_bh, "get_write_access");
				if (!ext4_journal_get_write_access(handle,
								   parent_bh)) {
					*p = 0;
					BUFFER_TRACE(parent_bh,
						"call ext4_handle_dirty_metadata");
					ext4_handle_dirty_metadata(handle,
								   inode,
								   parent_bh);
				}
			}
		}
	} else {
		/* We have reached the bottom of the tree. */
		BUFFER_TRACE(parent_bh, "free data blocks");
		ext4_free_data(handle, inode, parent_bh, first, last);
	}
}

void ext4_ind_truncate(struct inode *inode)
{
	handle_t *handle;
	struct ext4_inode_info *ei = EXT4_I(inode);
	__le32 *i_data = ei->i_data;
	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
	struct address_space *mapping = inode->i_mapping;
	ext4_lblk_t offsets[4];
	Indirect chain[4];
	Indirect *partial;
	__le32 nr = 0;
	int n = 0;
	ext4_lblk_t last_block, max_block;
	loff_t page_len;
	unsigned blocksize = inode->i_sb->s_blocksize;
	int err;

	handle = start_transaction(inode);
	if (IS_ERR(handle))
		return;		/* AKPM: return what? */

	last_block = (inode->i_size + blocksize-1)
					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
	max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);

	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
		page_len = PAGE_CACHE_SIZE -
			(inode->i_size & (PAGE_CACHE_SIZE - 1));

		err = ext4_discard_partial_page_buffers(handle,
			mapping, inode->i_size, page_len, 0);

		if (err)
			goto out_stop;
	}

	if (last_block != max_block) {
		n = ext4_block_to_path(inode, last_block, offsets, NULL);
		if (n == 0)
			goto out_stop;	/* error */
	}

	/*
	 * OK.  This truncate is going to happen.  We add the inode to the
	 * orphan list, so that if this truncate spans multiple transactions,
	 * and we crash, we will resume the truncate when the filesystem
	 * recovers.  It also marks the inode dirty, to catch the new size.
	 *
	 * Implication: the file must always be in a sane, consistent
	 * truncatable state while each transaction commits.
	 */
	if (ext4_orphan_add(handle, inode))
		goto out_stop;

	/*
	 * From here we block out all ext4_get_block() callers who want to
	 * modify the block allocation tree.
	 */
	down_write(&ei->i_data_sem);

	ext4_discard_preallocations(inode);
	ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);

	/*
	 * The orphan list entry will now protect us from any crash which
	 * occurs before the truncate completes, so it is now safe to propagate
	 * the new, shorter inode size (held for now in i_size) into the
	 * on-disk inode. We do this via i_disksize, which is the value which
	 * ext4 *really* writes onto the disk inode.
	 */
	ei->i_disksize = inode->i_size;

	if (last_block == max_block) {
		/*
		 * It is unnecessary to free any data blocks if last_block is
		 * equal to the indirect block limit.
		 */
		goto out_unlock;
	} else if (n == 1) {		/* direct blocks */
		ext4_free_data(handle, inode, NULL, i_data+offsets[0],
			       i_data + EXT4_NDIR_BLOCKS);
		goto do_indirects;
	}

	partial = ext4_find_shared(inode, n, offsets, chain, &nr);
	/* Kill the top of shared branch (not detached) */
	if (nr) {
		if (partial == chain) {
			/* Shared branch grows from the inode */
			ext4_free_branches(handle, inode, NULL,
					   &nr, &nr+1, (chain+n-1) - partial);
			*partial->p = 0;
			/*
			 * We mark the inode dirty prior to restart,
			 * and prior to stop.  No need for it here.
			 */
		} else {
			/* Shared branch grows from an indirect block */
			BUFFER_TRACE(partial->bh, "get_write_access");
			ext4_free_branches(handle, inode, partial->bh,
					   partial->p,
					   partial->p+1, (chain+n-1) - partial);
		}
	}
	/* Clear the ends of indirect blocks on the shared branch */
	while (partial > chain) {
		ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
				   (__le32 *)partial->bh->b_data+addr_per_block,
				   (chain+n-1) - partial);
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse(partial->bh);
		partial--;
	}
do_indirects:
	/* Kill the remaining (whole) subtrees */
	switch (offsets[0]) {
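	/* Note: no break statements below - each case falls through so that
	 * all deeper levels past the truncation point are freed as well. */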
	default:
		nr = i_data[EXT4_IND_BLOCK];
		if (nr) {
			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
			i_data[EXT4_IND_BLOCK] = 0;
		}
	case EXT4_IND_BLOCK:
		nr = i_data[EXT4_DIND_BLOCK];
		if (nr) {
			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
			i_data[EXT4_DIND_BLOCK] = 0;
		}
	case EXT4_DIND_BLOCK:
		nr = i_data[EXT4_TIND_BLOCK];
		if (nr) {
			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
			i_data[EXT4_TIND_BLOCK] = 0;
		}
	case EXT4_TIND_BLOCK:
		;
	}

out_unlock:
	up_write(&ei->i_data_sem);
	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
	ext4_mark_inode_dirty(handle, inode);

	/*
	 * In a multi-transaction truncate, we only make the final transaction
	 * synchronous
	 */
	if (IS_SYNC(inode))
		ext4_handle_sync(handle);
out_stop:
	/*
	 * If this was a simple ftruncate(), and the file will remain alive
	 * then we need to clear up the orphan record which we created above.
	 * However, if this was a real unlink then we were called by
	 * ext4_delete_inode(), and we allow that function to clean up the
	 * orphan info for us.
	 */
	if (inode->i_nlink)
		ext4_orphan_del(handle, inode);

	ext4_journal_stop(handle);
	trace_ext4_truncate_exit(inode);
}

static int free_hole_blocks(handle_t *handle, struct inode *inode,
			    struct buffer_head *parent_bh, __le32 *i_data,
			    int level, ext4_lblk_t first,
			    ext4_lblk_t count, int max)
{
	struct buffer_head *bh = NULL;
	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
	int ret = 0;
	int i, inc;
	ext4_lblk_t offset;
	__le32 blk;
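
	/*
	 * Each pointer at this level covers addr_per_block^level logical
	 * blocks; EXT4_BLOCK_SIZE_BITS - 2 is log2 of the number of 4-byte
	 * pointers per block, so inc below is exactly that coverage.
	 */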
inc = 1 < < ( ( EXT4_BLOCK_SIZE_BITS ( inode - > i_sb ) - 2 ) * level ) ;
for ( i = 0 , offset = 0 ; i < max ; i + + , i_data + + , offset + = inc ) {
if ( offset > = count + first )
break ;
if ( * i_data = = 0 | | ( offset + inc ) < = first )
continue ;
blk = * i_data ;
if ( level > 0 ) {
ext4_lblk_t first2 ;
bh = sb_bread ( inode - > i_sb , blk ) ;
if ( ! bh ) {
EXT4_ERROR_INODE_BLOCK ( inode , blk ,
" Read failure " ) ;
return - EIO ;
}
first2 = ( first > offset ) ? first - offset : 0 ;
ret = free_hole_blocks ( handle , inode , bh ,
( __le32 * ) bh - > b_data , level - 1 ,
first2 , count - offset ,
inode - > i_sb - > s_blocksize > > 2 ) ;
if ( ret ) {
brelse ( bh ) ;
goto err ;
}
}
if ( level = = 0 | |
( bh & & all_zeroes ( ( __le32 * ) bh - > b_data ,
( __le32 * ) bh - > b_data + addr_per_block ) ) ) {
ext4_free_data ( handle , inode , parent_bh , & blk , & blk + 1 ) ;
* i_data = 0 ;
}
brelse ( bh ) ;
bh = NULL ;
}
err :
return ret ;
}

static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
				 ext4_lblk_t first, ext4_lblk_t stop)
{
	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
	int level, ret = 0;
	int num = EXT4_NDIR_BLOCKS;
	ext4_lblk_t count, max = EXT4_NDIR_BLOCKS;
	__le32 *i_data = EXT4_I(inode)->i_data;

	count = stop - first;
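
	/*
	 * Walk the direct area first (level 0, EXT4_NDIR_BLOCKS slots), then
	 * the single/double/triple indirect pointers; at each level max grows
	 * by a factor of addr_per_block and covers that many more logical
	 * blocks, while first and count are adjusted to stay relative to the
	 * current level.
	 */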
	for (level = 0; level < 4; level++, max *= addr_per_block) {
		if (first < max) {
			ret = free_hole_blocks(handle, inode, NULL, i_data,
					       level, first, count, num);
			if (ret)
				goto err;
			if (count > max - first)
				count -= max - first;
			else
				break;
			first = 0;
		} else {
			first -= max;
		}
		i_data += num;
		if (level == 0) {
			num = 1;
			max = 1;
		}
	}

err:
	return ret;
}

int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct super_block *sb = inode->i_sb;
	ext4_lblk_t first_block, stop_block;
	struct address_space *mapping = inode->i_mapping;
	handle_t *handle = NULL;
	loff_t first_page, last_page, page_len;
	loff_t first_page_offset, last_page_offset;
	int err = 0;

	/*
	 * Write out all dirty pages to avoid race conditions
	 * Then release them.
	 */
	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
		err = filemap_write_and_wait_range(mapping,
			offset, offset + length - 1);
		if (err)
			return err;
	}
	mutex_lock(&inode->i_mutex);
	/* It is not possible to punch a hole in an append-only file */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
		err = -EPERM;
		goto out_mutex;
	}
	if (IS_SWAPFILE(inode)) {
		err = -ETXTBSY;
		goto out_mutex;
	}

	/* No need to punch hole beyond i_size */
	if (offset >= inode->i_size)
		goto out_mutex;

	/*
	 * If the hole extends beyond i_size, set the hole
	 * to end after the page that contains i_size
	 */
	if (offset + length > inode->i_size) {
		length = inode->i_size +
			PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
			offset;
	}
	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
	first_page_offset = first_page << PAGE_CACHE_SHIFT;
	last_page_offset = last_page << PAGE_CACHE_SHIFT;

	/* Now release the pages */
	if (last_page_offset > first_page_offset) {
		truncate_pagecache_range(inode, first_page_offset,
					 last_page_offset - 1);
	}

	/* Wait for all existing dio workers; newcomers will block on i_mutex */
	inode_dio_wait(inode);

	handle = start_transaction(inode);
	if (IS_ERR(handle))
		goto out_mutex;

	/*
	 * Now we need to zero out the non-page-aligned data in the
	 * pages at the start and tail of the hole, and unmap the buffer
	 * heads for the block aligned regions of the page that were
	 * completely zeroed.
	 */
	if (first_page > last_page) {
		/*
		 * If the file space being truncated is contained within a page
		 * just zero out and unmap the middle of that page
		 */
		err = ext4_discard_partial_page_buffers(handle,
			mapping, offset, length, 0);
		if (err)
			goto out;
	} else {
		/*
		 * Zero out and unmap the partial page that contains
		 * the start of the hole
		 */
		page_len = first_page_offset - offset;
		if (page_len > 0) {
			err = ext4_discard_partial_page_buffers(handle, mapping,
							offset, page_len, 0);
			if (err)
				goto out;
		}

		/*
		 * Zero out and unmap the partial page that contains
		 * the end of the hole
		 */
		page_len = offset + length - last_page_offset;
		if (page_len > 0) {
			err = ext4_discard_partial_page_buffers(handle, mapping,
						last_page_offset, page_len, 0);
			if (err)
				goto out;
		}
	}

	/*
	 * If i_size is contained in the last page, we need to
	 * unmap and zero the partial page after i_size
	 */
	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
	    inode->i_size % PAGE_CACHE_SIZE != 0) {
		page_len = PAGE_CACHE_SIZE -
			(inode->i_size & (PAGE_CACHE_SIZE - 1));
		if (page_len > 0) {
			err = ext4_discard_partial_page_buffers(handle,
				mapping, inode->i_size, page_len, 0);
			if (err)
				goto out;
		}
	}

	first_block = (offset + sb->s_blocksize - 1) >>
		EXT4_BLOCK_SIZE_BITS(sb);
	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);

	if (first_block >= stop_block)
		goto out;

	down_write(&EXT4_I(inode)->i_data_sem);
	ext4_discard_preallocations(inode);

	err = ext4_es_remove_extent(inode, first_block,
				    stop_block - first_block);
	err = ext4_free_hole_blocks(handle, inode, first_block, stop_block);

	ext4_discard_preallocations(inode);

	if (IS_SYNC(inode))
		ext4_handle_sync(handle);

	up_write(&EXT4_I(inode)->i_data_sem);

out:
	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
	ext4_mark_inode_dirty(handle, inode);
	ext4_journal_stop(handle);

out_mutex:
	mutex_unlock(&inode->i_mutex);

	return err;
}