2005-04-16 15:20:36 -07:00
/*
* linux / fs / ext2 / inode . c
*
* Copyright ( C ) 1992 , 1993 , 1994 , 1995
* Remy Card ( card @ masi . ibp . fr )
* Laboratoire MASI - Institut Blaise Pascal
* Universite Pierre et Marie Curie ( Paris VI )
*
* from
*
* linux / fs / minix / inode . c
*
* Copyright ( C ) 1991 , 1992 Linus Torvalds
*
* Goal - directed block allocation by Stephen Tweedie
* ( sct @ dcs . ed . ac . uk ) , 1993 , 1998
* Big - endian to little - endian byte - swapping / bitmaps by
* David S . Miller ( davem @ caip . rutgers . edu ) , 1995
* 64 - bit file support on 64 - bit platforms by Jakub Jelinek
* ( jj @ sunsite . ms . mff . cuni . cz )
*
* Assorted race fixes , rewrite of ext2_get_block ( ) by Al Viro , 2000
*/
# include <linux/smp_lock.h>
# include <linux/time.h>
# include <linux/highuid.h>
# include <linux/pagemap.h>
# include <linux/quotaops.h>
# include <linux/module.h>
# include <linux/writeback.h>
# include <linux/buffer_head.h>
# include <linux/mpage.h>
2008-10-03 17:32:43 -04:00
# include <linux/fiemap.h>
2005-04-16 15:20:36 -07:00
# include "ext2.h"
# include "acl.h"
2005-06-23 22:05:26 -07:00
# include "xip.h"
2005-04-16 15:20:36 -07:00
MODULE_AUTHOR ( " Remy Card and others " ) ;
MODULE_DESCRIPTION ( " Second Extended Filesystem " ) ;
MODULE_LICENSE ( " GPL " ) ;
static int ext2_update_inode ( struct inode * inode , int do_sync ) ;
/*
* Test whether an inode is a fast symlink .
*/
static inline int ext2_inode_is_fast_symlink ( struct inode * inode )
{
int ea_blocks = EXT2_I ( inode ) - > i_file_acl ?
( inode - > i_sb - > s_blocksize > > 9 ) : 0 ;
return ( S_ISLNK ( inode - > i_mode ) & &
inode - > i_blocks - ea_blocks = = 0 ) ;
}
/*
* Called at the last iput ( ) if i_nlink is zero .
*/
void ext2_delete_inode ( struct inode * inode )
{
2005-09-09 13:01:31 -07:00
truncate_inode_pages ( & inode - > i_data , 0 ) ;
2005-04-16 15:20:36 -07:00
if ( is_bad_inode ( inode ) )
goto no_delete ;
EXT2_I ( inode ) - > i_dtime = get_seconds ( ) ;
mark_inode_dirty ( inode ) ;
ext2_update_inode ( inode , inode_needs_sync ( inode ) ) ;
inode - > i_size = 0 ;
if ( inode - > i_blocks )
ext2_truncate ( inode ) ;
ext2_free_inode ( inode ) ;
return ;
no_delete :
clear_inode ( inode ) ; /* We must guarantee clearing of inode... */
}
/*
 * One step of a block-pointer chain: where a block number was read from,
 * the value that was read, and the buffer hosting that location.
 */
typedef struct {
	__le32	*p;			/* address the block number was taken from */
	__le32	key;			/* block number as stored (little-endian) */
	struct buffer_head *bh;		/* buffer containing *p, NULL for the inode */
} Indirect;
/* Fill one chain element: remember the slot @v, its current value and @bh. */
static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
	p->p = v;
	p->key = *v;
	p->bh = bh;
}
/*
 * Check that every element in [from, to] still holds the value that was
 * recorded in ->key. Returns non-zero when the whole range matches.
 */
static inline int verify_chain(Indirect *from, Indirect *to)
{
	for (; from <= to; from++) {
		if (from->key != *from->p)
			break;
	}
	return from > to;
}
/**
* ext2_block_to_path - parse the block number into array of offsets
* @ inode : inode in question ( we are only interested in its superblock )
* @ i_block : block number to be parsed
* @ offsets : array to store the offsets in
* @ boundary : set this non - zero if the referred - to block is likely to be
* followed ( on disk ) by an indirect block .
* To store the locations of file ' s data ext2 uses a data structure common
* for UNIX filesystems - tree of pointers anchored in the inode , with
* data blocks at leaves and indirect blocks in intermediate nodes .
* This function translates the block number into path in that tree -
* return value is the path length and @ offsets [ n ] is the offset of
* pointer to ( n + 1 ) th node in the nth one . If @ i_block is out of range
* ( negative or too large ) warning is printed and zero returned .
*
* Note : function doesn ' t find node addresses , so no IO is needed . All
* we need to know is the capacity of indirect blocks ( taken from the
* inode - > i_sb ) .
*/
/*
* Portability note : the last comparison ( check that we fit into triple
* indirect block ) is spelled differently , because otherwise on an
* architecture with 32 - bit longs and 8 Kb pages we might get into trouble
* if our filesystem had 8 Kb blocks . We might use long long , but that would
* kill us on x86 . Oh , well , at least the sign propagation does not matter -
* i_block would have to be negative in the very beginning , so we would not
* get there at all .
*/
static int ext2_block_to_path ( struct inode * inode ,
long i_block , int offsets [ 4 ] , int * boundary )
{
int ptrs = EXT2_ADDR_PER_BLOCK ( inode - > i_sb ) ;
int ptrs_bits = EXT2_ADDR_PER_BLOCK_BITS ( inode - > i_sb ) ;
const long direct_blocks = EXT2_NDIR_BLOCKS ,
indirect_blocks = ptrs ,
double_blocks = ( 1 < < ( ptrs_bits * 2 ) ) ;
int n = 0 ;
int final = 0 ;
if ( i_block < 0 ) {
ext2_warning ( inode - > i_sb , " ext2_block_to_path " , " block < 0 " ) ;
} else if ( i_block < direct_blocks ) {
offsets [ n + + ] = i_block ;
final = direct_blocks ;
} else if ( ( i_block - = direct_blocks ) < indirect_blocks ) {
offsets [ n + + ] = EXT2_IND_BLOCK ;
offsets [ n + + ] = i_block ;
final = ptrs ;
} else if ( ( i_block - = indirect_blocks ) < double_blocks ) {
offsets [ n + + ] = EXT2_DIND_BLOCK ;
offsets [ n + + ] = i_block > > ptrs_bits ;
offsets [ n + + ] = i_block & ( ptrs - 1 ) ;
final = ptrs ;
} else if ( ( ( i_block - = double_blocks ) > > ( ptrs_bits * 2 ) ) < ptrs ) {
offsets [ n + + ] = EXT2_TIND_BLOCK ;
offsets [ n + + ] = i_block > > ( ptrs_bits * 2 ) ;
offsets [ n + + ] = ( i_block > > ptrs_bits ) & ( ptrs - 1 ) ;
offsets [ n + + ] = i_block & ( ptrs - 1 ) ;
final = ptrs ;
} else {
ext2_warning ( inode - > i_sb , " ext2_block_to_path " , " block > big " ) ;
}
if ( boundary )
2007-10-16 23:30:46 -07:00
* boundary = final - 1 - ( i_block & ( ptrs - 1 ) ) ;
2005-04-16 15:20:36 -07:00
return n ;
}
/**
* ext2_get_branch - read the chain of indirect blocks leading to data
* @ inode : inode in question
* @ depth : depth of the chain ( 1 - direct pointer , etc . )
* @ offsets : offsets of pointers in inode / indirect blocks
* @ chain : place to store the result
* @ err : here we store the error value
*
* Function fills the array of triples < key , p , bh > and returns % NULL
* if everything went OK or the pointer to the last filled triple
* ( incomplete one ) otherwise . Upon the return chain [ i ] . key contains
* the number of ( i + 1 ) - th block in the chain ( as it is stored in memory ,
* i . e . little - endian 32 - bit ) , chain [ i ] . p contains the address of that
* number ( it points into struct inode for i = = 0 and into the bh - > b_data
* for i > 0 ) and chain [ i ] . bh points to the buffer_head of i - th indirect
* block for i > 0 and NULL for i = = 0. In other words , it holds the block
* numbers of the chain , addresses they were taken from ( and where we can
* verify that chain did not change ) and buffer_heads hosting these
* numbers .
*
* Function stops when it stumbles upon zero pointer ( absent block )
* ( pointer to last triple returned , * @ err = = 0 )
* or when it gets an IO error reading an indirect block
* ( ditto , * @ err = = - EIO )
* or when it notices that chain had been changed while it was reading
* ( ditto , * @ err = = - EAGAIN )
* or when it reads all @ depth - 1 indirect blocks successfully and finds
* the whole chain , all way to the data ( returns % NULL , * err = = 0 ) .
*/
static Indirect *ext2_get_branch(struct inode *inode,
				 int depth,
				 int *offsets,
				 Indirect chain[4],
				 int *err)
{
	struct ext2_inode_info *ei = EXT2_I(inode);
	Indirect *last = chain;		/* last triple filled in so far */

	*err = 0;

	/* i_data is not going away, no lock needed */
	add_chain(chain, NULL, ei->i_data + *offsets);
	if (!last->key)
		return last;

	while (--depth) {
		struct buffer_head *bh;

		bh = sb_bread(inode->i_sb, le32_to_cpu(last->key));
		if (!bh) {
			/* Could not read the indirect block. */
			*err = -EIO;
			return last;
		}

		read_lock(&ei->i_meta_lock);
		if (!verify_chain(chain, last)) {
			/* The path changed under us; drop what we just read. */
			read_unlock(&ei->i_meta_lock);
			brelse(bh);
			*err = -EAGAIN;
			return last;
		}
		add_chain(++last, bh, (__le32 *)bh->b_data + *++offsets);
		read_unlock(&ei->i_meta_lock);

		/* Hole: hand back the incomplete triple with *err == 0. */
		if (!last->key)
			return last;
	}

	/* Whole chain present, all the way down to the data block. */
	return NULL;
}
/**
* ext2_find_near - find a place for allocation with sufficient locality
* @ inode : owner
* @ ind : descriptor of indirect block .
*
2008-04-21 22:45:55 +00:00
* This function returns the preferred place for block allocation .
2005-04-16 15:20:36 -07:00
* It is used when heuristic for sequential allocation fails .
* Rules are :
* + if there is a block to the left of our position - allocate near it .
* + if pointer will live in indirect block - allocate near that block .
* + if pointer will live in inode - allocate in the same cylinder group .
*
* In the latter case we colour the starting block by the callers PID to
* prevent it from clashing with concurrent allocations for a different inode
* in the same block group . The PID is used here so that functionally related
* files will be close - by on - disk .
*
* Caller must make sure that @ ind is valid and will stay that way .
*/
2008-04-28 02:16:02 -07:00
static ext2_fsblk_t ext2_find_near(struct inode *inode, Indirect *ind)
{
	struct ext2_inode_info *ei = EXT2_I(inode);
	__le32 *first;		/* start of the pointer array hosting ind->p */
	__le32 *slot;
	ext2_fsblk_t bg_start;
	ext2_fsblk_t colour;

	if (ind->bh)
		first = (__le32 *)ind->bh->b_data;
	else
		first = ei->i_data;

	/* Try to find previous block: nearest non-zero slot to the left. */
	slot = ind->p;
	while (slot > first) {
		slot--;
		if (*slot)
			return le32_to_cpu(*slot);
	}

	/* No such thing, so let's try location of indirect block */
	if (ind->bh)
		return ind->bh->b_blocknr;

	/*
	 * It is going to be referred from the inode itself? OK, just put it
	 * into the same cylinder group then, coloured by the caller's PID.
	 */
	bg_start = ext2_group_first_block_no(inode->i_sb, ei->i_block_group);
	colour = (current->pid % 16) *
			(EXT2_BLOCKS_PER_GROUP(inode->i_sb) / 16);

	return bg_start + colour;
}
/**
2008-04-21 22:45:55 +00:00
* ext2_find_goal - find a preferred place for allocation .
2005-04-16 15:20:36 -07:00
* @ inode : owner
* @ block : block we want
* @ partial : pointer to the last triple within a chain
*
2007-10-16 23:30:46 -07:00
* Returns preferred place for a block ( the goal ) .
2005-04-16 15:20:36 -07:00
*/
2008-04-28 02:16:02 -07:00
static inline ext2_fsblk_t ext2_find_goal(struct inode *inode, long block,
					  Indirect *partial)
{
	struct ext2_block_alloc_info *info = EXT2_I(inode)->i_block_alloc_info;

	/*
	 * The sequential heuristic applies only when @block directly follows
	 * the last logical block we allocated and we know where that one
	 * landed on disk. Otherwise settle for decent locality.
	 */
	if (!info ||
	    block != info->last_alloc_logical_block + 1 ||
	    info->last_alloc_physical_block == 0)
		return ext2_find_near(inode, partial);

	return info->last_alloc_physical_block + 1;
}
/**
* ext2_blks_to_allocate : Look up the block map and count the number
* of direct blocks need to be allocated for the given branch .
*
* @ branch : chain of indirect blocks
* @ k : number of blocks need for indirect blocks
* @ blks : number of data blocks to be mapped .
* @ blocks_to_boundary : the offset in the indirect block
*
* return the total number of blocks to be allocate , including the
* direct and indirect blocks .
*/
static int
ext2_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
		int blocks_to_boundary)
{
	unsigned long total;

	/*
	 * Simple case: indirect block(s) on the path are not allocated yet,
	 * so none of the data blocks behind them can be either. Cap the
	 * request at the boundary; cross-boundary allocation is not handled.
	 */
	if (k > 0)
		return (blks < blocks_to_boundary + 1) ?
				blks : blocks_to_boundary + 1;

	/*
	 * The first block is always needed; extend over the following slots
	 * for as long as they are still unallocated (zero).
	 */
	for (total = 1; total < blks && total <= blocks_to_boundary; total++) {
		if (le32_to_cpu(branch[0].p[total]) != 0)
			break;
	}

	return total;
}
/**
* ext2_alloc_blocks : multiple allocate blocks needed for a branch
* @ indirect_blks : the number of blocks need to allocate for indirect
* blocks
*
* @ new_blocks : on return it will store the new block numbers for
* the indirect blocks ( if needed ) and the first direct block ,
* @ blks : on return it will store the total number of allocated
* direct blocks
*/
static int ext2_alloc_blocks(struct inode *inode,
			ext2_fsblk_t goal, int indirect_blks, int blks,
			ext2_fsblk_t new_blocks[4], int *err)
{
	int remaining = blks + indirect_blks;	/* blocks still wanted */
	int filled = 0;				/* entries stored in new_blocks[] */
	int i;
	unsigned long got;
	ext2_fsblk_t next_block;

	/*
	 * Best effort: keep asking the allocator until every indirect block
	 * has a number and at least one block is left over for data. That is
	 * the minimum needed to build the branch.
	 */
	do {
		got = remaining;
		next_block = ext2_new_blocks(inode, goal, &got, err);
		if (*err) {
			/* Give back the indirect blocks collected so far. */
			for (i = 0; i < filled; i++)
				ext2_free_blocks(inode, new_blocks[i], 1);
			return 0;
		}
		remaining -= got;

		/* Hand the first blocks of this run to the indirect levels. */
		for (; filled < indirect_blks && got; got--)
			new_blocks[filled++] = next_block++;
	} while (got == 0);

	/* First data block; the rest of the run follows it contiguously. */
	new_blocks[filled] = next_block;
	*err = 0;

	/* Number of blocks allocated for data. */
	return got;
}
/**
 *	ext2_alloc_branch - allocate and set up a chain of blocks.
 *	@inode: owner
 *	@indirect_blks: number of indirect blocks that must be allocated
 *	@blks: in: number of data blocks wanted; out: number actually allocated
 *	@goal: preferred place for the allocation
 *	@offsets: offsets (in the blocks) to store the pointers to next.
 *	@branch: place to store the chain in.
 *
 *	This function allocates the indirect blocks and the data blocks,
 *	zeroes out the new indirect blocks, links them into a chain and marks
 *	them dirty (writing them out immediately only for DIRSYNC
 *	directories). In other words, it prepares a branch that can be
 *	spliced onto the inode. It stores the information about that chain in
 *	branch[], in the same format as ext2_get_branch() would do. Upon exit
 *	the chain is disconnected in one place: *branch->p is still zero (we
 *	did not set the last link), but branch->key contains the number that
 *	should be placed into *branch->p to fill that gap.
 *
 *	If allocation fails we free all blocks we've allocated (and forget
 *	their buffer_heads) and return the error value. Otherwise we set the
 *	chain as described above and return 0.
 */
static int ext2_alloc_branch(struct inode *inode,
			int indirect_blks, int *blks, ext2_fsblk_t goal,
			int *offsets, Indirect *branch)
{
	int blocksize = inode->i_sb->s_blocksize;
	int i, n = 0;
	int err = 0;
	struct buffer_head *bh;
	int num;
	ext2_fsblk_t new_blocks[4];
	ext2_fsblk_t current_block;

	num = ext2_alloc_blocks(inode, goal, indirect_blks,
				*blks, new_blocks, &err);
	if (err)
		return err;

	branch[0].key = cpu_to_le32(new_blocks[0]);
	/*
	 * metadata blocks and data blocks are allocated.
	 */
	for (n = 1; n <= indirect_blks; n++) {
		/*
		 * Get buffer_head for parent block, zero it out
		 * and set the pointer to new one, then send
		 * parent to disk.
		 */
		bh = sb_getblk(inode->i_sb, new_blocks[n - 1]);
		if (unlikely(!bh)) {
			/* No buffer for the new block: undo the allocation. */
			err = -ENOMEM;
			goto failed;
		}
		branch[n].bh = bh;
		lock_buffer(bh);
		memset(bh->b_data, 0, blocksize);
		branch[n].p = (__le32 *)bh->b_data + offsets[n];
		branch[n].key = cpu_to_le32(new_blocks[n]);
		*branch[n].p = branch[n].key;
		if (n == indirect_blks) {
			current_block = new_blocks[n];
			/*
			 * End of chain, update the last new metablock of
			 * the chain to point to the new allocated
			 * data blocks numbers
			 */
			for (i = 1; i < num; i++)
				*(branch[n].p + i) = cpu_to_le32(++current_block);
		}
		set_buffer_uptodate(bh);
		unlock_buffer(bh);
		mark_buffer_dirty_inode(bh, inode);
		/* We used to sync bh here if IS_SYNC(inode).
		 * But we now rely upon generic_osync_inode()
		 * and b_inode_buffers.  But not for directories.
		 */
		if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
			sync_dirty_buffer(bh);
	}
	*blks = num;
	return err;

failed:
	/* Forget the buffers already set up for levels 1 .. n-1. */
	for (i = 1; i < n; i++)
		bforget(branch[i].bh);
	/* Release the indirect blocks one by one ... */
	for (i = 0; i < indirect_blks; i++)
		ext2_free_blocks(inode, new_blocks[i], 1);
	/* ... and the contiguous run of data blocks in one go. */
	ext2_free_blocks(inode, new_blocks[i], num);
	return err;
}
/**
2007-10-16 23:30:46 -07:00
* ext2_splice_branch - splice the allocated branch onto inode .
* @ inode : owner
* @ block : ( logical ) number of block we are adding
* @ chain : chain of indirect blocks ( with a missing link - see
* ext2_alloc_branch )
* @ where : location of missing link
* @ num : number of indirect blocks we are adding
* @ blks : number of direct blocks we are adding
2005-04-16 15:20:36 -07:00
*
2007-10-16 23:30:46 -07:00
* This function fills the missing link and does all housekeeping needed in
* inode ( - > i_blocks , etc . ) . In case of success we end up with the full
* chain to new block and return 0.
2005-04-16 15:20:36 -07:00
*/
2007-10-16 23:30:46 -07:00
static void ext2_splice_branch ( struct inode * inode ,
long block , Indirect * where , int num , int blks )
2005-04-16 15:20:36 -07:00
{
int i ;
2007-10-16 23:30:46 -07:00
struct ext2_block_alloc_info * block_i ;
ext2_fsblk_t current_block ;
2005-04-16 15:20:36 -07:00
2007-10-16 23:30:46 -07:00
block_i = EXT2_I ( inode ) - > i_block_alloc_info ;
2005-04-16 15:20:36 -07:00
2007-10-16 23:30:46 -07:00
/* XXX LOCKING probably should have i_meta_lock ?*/
2005-04-16 15:20:36 -07:00
/* That's it */
* where - > p = where - > key ;
2007-10-16 23:30:46 -07:00
/*
* Update the host buffer_head or inode to point to more just allocated
* direct blocks blocks
*/
if ( num = = 0 & & blks > 1 ) {
current_block = le32_to_cpu ( where - > key ) + 1 ;
for ( i = 1 ; i < blks ; i + + )
* ( where - > p + i ) = cpu_to_le32 ( current_block + + ) ;
}
2005-04-16 15:20:36 -07:00
2007-10-16 23:30:46 -07:00
/*
* update the most recently allocated logical & physical block
* in i_block_alloc_info , to assist find the proper goal block for next
* allocation
*/
if ( block_i ) {
block_i - > last_alloc_logical_block = block + blks - 1 ;
block_i - > last_alloc_physical_block =
le32_to_cpu ( where [ num ] . key ) + blks - 1 ;
}
2005-04-16 15:20:36 -07:00
2007-10-16 23:30:46 -07:00
/* We are done with atomic stuff, now do the rest of housekeeping */
2005-04-16 15:20:36 -07:00
/* had we spliced it onto indirect block? */
if ( where - > bh )
mark_buffer_dirty_inode ( where - > bh , inode ) ;
2007-10-16 23:30:46 -07:00
inode - > i_ctime = CURRENT_TIME_SEC ;
2005-04-16 15:20:36 -07:00
mark_inode_dirty ( inode ) ;
}
/*
* Allocation strategy is simple : if we have to allocate something , we will
* have to go the whole way to leaf . So let ' s do it before attaching anything
* to tree , set linkage between the newborn blocks , write them if sync is
* required , recheck the path , free and repeat if check fails , otherwise
* set the last missing link ( that will protect us from any truncate - generated
* removals - all blocks on the path are immune now ) and possibly force the
* write on the parent block .
* That has a nice additional property : no special recovery from the failed
* allocations is needed - we simply release blocks and do not touch anything
* reachable from inode .
2007-10-16 23:30:46 -07:00
*
* ` handle ' can be NULL if create = = 0.
*
* return > 0 , # of blocks mapped or allocated .
* return = 0 , if plain lookup failed .
* return < 0 , error case .
2005-04-16 15:20:36 -07:00
*/
2007-10-16 23:30:46 -07:00
static int ext2_get_blocks ( struct inode * inode ,
sector_t iblock , unsigned long maxblocks ,
struct buffer_head * bh_result ,
int create )
2005-04-16 15:20:36 -07:00
{
int err = - EIO ;
int offsets [ 4 ] ;
Indirect chain [ 4 ] ;
Indirect * partial ;
2007-10-16 23:30:46 -07:00
ext2_fsblk_t goal ;
int indirect_blks ;
int blocks_to_boundary = 0 ;
int depth ;
struct ext2_inode_info * ei = EXT2_I ( inode ) ;
int count = 0 ;
ext2_fsblk_t first_block = 0 ;
2005-04-16 15:20:36 -07:00
2007-10-16 23:30:46 -07:00
depth = ext2_block_to_path ( inode , iblock , offsets , & blocks_to_boundary ) ;
2005-04-16 15:20:36 -07:00
2007-10-16 23:30:46 -07:00
if ( depth = = 0 )
return ( err ) ;
2005-04-16 15:20:36 -07:00
reread :
partial = ext2_get_branch ( inode , depth , offsets , chain , & err ) ;
/* Simplest case - block found, no allocation needed */
if ( ! partial ) {
2007-10-16 23:30:46 -07:00
first_block = le32_to_cpu ( chain [ depth - 1 ] . key ) ;
clear_buffer_new ( bh_result ) ; /* What's this do? */
count + + ;
/*map more blocks*/
while ( count < maxblocks & & count < = blocks_to_boundary ) {
ext2_fsblk_t blk ;
if ( ! verify_chain ( chain , partial ) ) {
/*
* Indirect block might be removed by
* truncate while we were reading it .
* Handling of that case : forget what we ' ve
* got now , go to reread .
*/
count = 0 ;
goto changed ;
}
blk = le32_to_cpu ( * ( chain [ depth - 1 ] . p + count ) ) ;
if ( blk = = first_block + count )
count + + ;
else
break ;
}
goto got_it ;
2005-04-16 15:20:36 -07:00
}
/* Next simple case - plain lookup or failed read of indirect block */
2007-10-16 23:30:46 -07:00
if ( ! create | | err = = - EIO )
goto cleanup ;
mutex_lock ( & ei - > truncate_mutex ) ;
2005-04-16 15:20:36 -07:00
/*
2007-10-16 23:30:46 -07:00
* Okay , we need to do block allocation . Lazily initialize the block
* allocation info here if necessary
*/
if ( S_ISREG ( inode - > i_mode ) & & ( ! ei - > i_block_alloc_info ) )
ext2_init_block_alloc_info ( inode ) ;
2005-04-16 15:20:36 -07:00
2008-02-06 01:40:16 -08:00
goal = ext2_find_goal ( inode , iblock , partial ) ;
2005-04-16 15:20:36 -07:00
2007-10-16 23:30:46 -07:00
/* the number of blocks need to allocate for [d,t]indirect blocks */
indirect_blks = ( chain + depth ) - partial - 1 ;
/*
* Next look up the indirect map to count the totoal number of
* direct blocks to allocate for this branch .
*/
count = ext2_blks_to_allocate ( partial , indirect_blks ,
maxblocks , blocks_to_boundary ) ;
/*
* XXX ? ? ? ? Block out ext2_truncate while we alter the tree
*/
err = ext2_alloc_branch ( inode , indirect_blks , & count , goal ,
offsets + ( partial - chain ) , partial ) ;
if ( err ) {
mutex_unlock ( & ei - > truncate_mutex ) ;
2005-04-16 15:20:36 -07:00
goto cleanup ;
2007-10-16 23:30:46 -07:00
}
2005-04-16 15:20:36 -07:00
2005-06-23 22:05:26 -07:00
if ( ext2_use_xip ( inode - > i_sb ) ) {
/*
* we need to clear the block
*/
err = ext2_clear_xip_target ( inode ,
le32_to_cpu ( chain [ depth - 1 ] . key ) ) ;
2007-10-16 23:30:46 -07:00
if ( err ) {
mutex_unlock ( & ei - > truncate_mutex ) ;
2005-06-23 22:05:26 -07:00
goto cleanup ;
2007-10-16 23:30:46 -07:00
}
2005-06-23 22:05:26 -07:00
}
2007-10-16 23:30:46 -07:00
ext2_splice_branch ( inode , iblock , partial , indirect_blks , count ) ;
mutex_unlock ( & ei - > truncate_mutex ) ;
2005-04-16 15:20:36 -07:00
set_buffer_new ( bh_result ) ;
2007-10-16 23:30:46 -07:00
got_it :
map_bh ( bh_result , inode - > i_sb , le32_to_cpu ( chain [ depth - 1 ] . key ) ) ;
if ( count > blocks_to_boundary )
set_buffer_boundary ( bh_result ) ;
err = count ;
/* Clean up and exit */
partial = chain + depth - 1 ; /* the whole chain */
cleanup :
while ( partial > chain ) {
brelse ( partial - > bh ) ;
partial - - ;
}
return err ;
2005-04-16 15:20:36 -07:00
changed :
while ( partial > chain ) {
brelse ( partial - > bh ) ;
partial - - ;
}
goto reread ;
}
2007-10-16 23:30:46 -07:00
/*
 * get_block callback: map (and optionally allocate) as many blocks as fit in
 * bh_result->b_size, starting at @iblock. On success b_size is updated to
 * the number of bytes actually mapped and 0 is returned; otherwise the
 * zero-or-negative result of ext2_get_blocks() is passed through.
 */
int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
{
	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
	int mapped;

	mapped = ext2_get_blocks(inode, iblock, max_blocks, bh_result, create);
	if (mapped <= 0)
		return mapped;

	bh_result->b_size = (mapped << inode->i_blkbits);
	return 0;
}
2008-10-03 17:32:43 -04:00
/* FIEMAP: delegate to the generic block-based helper using ext2_get_block. */
int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		u64 start, u64 len)
{
	int status;

	status = generic_block_fiemap(inode, fieinfo, start, len,
				      ext2_get_block);
	return status;
}
2005-04-16 15:20:36 -07:00
/* ->writepage: delegate to block_write_full_page() with ext2_get_block. */
static int ext2_writepage(struct page *page, struct writeback_control *wbc)
{
	int status = block_write_full_page(page, ext2_get_block, wbc);

	return status;
}
/* ->readpage: delegate to mpage_readpage(); @file is not used. */
static int ext2_readpage(struct file *file, struct page *page)
{
	int status = mpage_readpage(page, ext2_get_block);

	return status;
}
/* ->readpages: delegate to mpage_readpages(); @file is not used. */
static int
ext2_readpages(struct file *file, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
	int status;

	status = mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
	return status;
}
2007-10-16 01:25:04 -07:00
/*
 * Non-static write_begin helper: delegates to block_write_begin() with
 * ext2_get_block. Unlike ext2_write_begin() it does not reset *pagep.
 */
int __ext2_write_begin(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, void **fsdata)
{
	int status;

	status = block_write_begin(file, mapping, pos, len, flags, pagep,
				   fsdata, ext2_get_block);
	return status;
}
static int
2007-10-16 01:25:04 -07:00
ext2_write_begin ( struct file * file , struct address_space * mapping ,
loff_t pos , unsigned len , unsigned flags ,
struct page * * pagep , void * * fsdata )
2005-04-16 15:20:36 -07:00
{
2007-10-16 01:25:04 -07:00
* pagep = NULL ;
return __ext2_write_begin ( file , mapping , pos , len , flags , pagep , fsdata ) ;
2005-04-16 15:20:36 -07:00
}
2007-10-16 01:25:25 -07:00
/* ->write_begin for the nobh variant: delegates to nobh_write_begin(). */
static int
ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, void **fsdata)
{
	int status;

	/*
	 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework
	 * directory handling code to pass around offsets rather than struct
	 * pages in order to make this work easily.
	 */
	status = nobh_write_begin(file, mapping, pos, len, flags, pagep,
				  fsdata, ext2_get_block);
	return status;
}
2005-04-16 15:20:36 -07:00
/* ->writepage for the nobh variant: delegates to nobh_writepage(). */
static int ext2_nobh_writepage(struct page *page,
			struct writeback_control *wbc)
{
	int status = nobh_writepage(page, ext2_get_block, wbc);

	return status;
}
/* ->bmap: delegate to generic_block_bmap() with ext2_get_block. */
static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
{
	sector_t result = generic_block_bmap(mapping, block, ext2_get_block);

	return result;
}
/*
 * ->direct_IO: hand the request to blockdev_direct_IO() for the inode behind
 * the kiocb's file, on that inode's superblock device, using ext2_get_block
 * for mapping and no end_io callback.
 */
static ssize_t
ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
			loff_t offset, unsigned long nr_segs)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct block_device *bdev = inode->i_sb->s_bdev;

	return blockdev_direct_IO(rw, iocb, inode, bdev, iov,
				offset, nr_segs, ext2_get_block, NULL);
}
/* ->writepages: delegate to mpage_writepages() with ext2_get_block. */
static int
ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	int status = mpage_writepages(mapping, wbc, ext2_get_block);

	return status;
}
2006-06-28 04:26:44 -07:00
const struct address_space_operations ext2_aops = {
2005-04-16 15:20:36 -07:00
. readpage = ext2_readpage ,
. readpages = ext2_readpages ,
. writepage = ext2_writepage ,
. sync_page = block_sync_page ,
2007-10-16 01:25:04 -07:00
. write_begin = ext2_write_begin ,
. write_end = generic_write_end ,
2005-04-16 15:20:36 -07:00
. bmap = ext2_bmap ,
. direct_IO = ext2_direct_IO ,
. writepages = ext2_writepages ,
2006-02-01 03:05:41 -08:00
. migratepage = buffer_migrate_page ,
vfs: pagecache usage optimization for pagesize!=blocksize
When we read some part of a file through pagecache, if there is a
pagecache of corresponding index but this page is not uptodate, read IO
is issued and this page will be uptodate.
I think this is good for pagesize == blocksize environment but there is
room for improvement on pagesize != blocksize environment. Because in
this case a page can have multiple buffers and even if a page is not
uptodate, some buffers can be uptodate.
So I suggest that when all buffers which correspond to a part of a file
that we want to read are uptodate, use this pagecache and copy data from
this pagecache to user buffer even if a page is not uptodate. This can
reduce read IO and improve system throughput.
I wrote a benchmark program and got result number with this program.
This benchmark do:
1: mount and open a test file.
2: create a 512MB file.
3: close a file and umount.
4: mount and again open a test file.
5: pwrite randomly 300000 times on a test file. offset is aligned
by IO size(1024bytes).
6: measure time of preading randomly 100000 times on a test file.
The result was:
2.6.26
330 sec
2.6.26-patched
226 sec
Arch:i386
Filesystem:ext3
Blocksize:1024 bytes
Memory: 1GB
On ext3/4, a file is written through buffer/block. So random read/write
mixed workloads or random read after random write workloads are optimized
with this patch under pagesize != blocksize environment. This test result
showed this.
The benchmark program is as follows:
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#define LEN 1024
#define LOOP 1024*512 /* 512MB */
main(void)
{
unsigned long i, offset, filesize;
int fd;
char buf[LEN];
time_t t1, t2;
if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) {
perror("cannot mount\n");
exit(1);
}
memset(buf, 0, LEN);
fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC);
if (fd < 0) {
perror("cannot open file\n");
exit(1);
}
for (i = 0; i < LOOP; i++)
write(fd, buf, LEN);
close(fd);
if (umount("/root/test1/") < 0) {
perror("cannot umount\n");
exit(1);
}
if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) {
perror("cannot mount\n");
exit(1);
}
fd = open("/root/test1/testfile", O_RDWR);
if (fd < 0) {
perror("cannot open file\n");
exit(1);
}
filesize = LEN * LOOP;
for (i = 0; i < 300000; i++){
offset = (random() % filesize) & (~(LEN - 1));
pwrite(fd, buf, LEN, offset);
}
printf("start test\n");
time(&t1);
for (i = 0; i < 100000; i++){
offset = (random() % filesize) & (~(LEN - 1));
pread(fd, buf, LEN, offset);
}
time(&t2);
printf("%ld sec\n", t2-t1);
close(fd);
if (umount("/root/test1/") < 0) {
perror("cannot umount\n");
exit(1);
}
}
Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jan Kara <jack@ucw.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-28 15:46:36 -07:00
. is_partially_uptodate = block_is_partially_uptodate ,
2005-04-16 15:20:36 -07:00
} ;
2006-06-28 04:26:44 -07:00
const struct address_space_operations ext2_aops_xip = {
2005-06-23 22:05:26 -07:00
. bmap = ext2_bmap ,
2008-04-28 02:13:02 -07:00
. get_xip_mem = ext2_get_xip_mem ,
2005-06-23 22:05:26 -07:00
} ;
2006-06-28 04:26:44 -07:00
const struct address_space_operations ext2_nobh_aops = {
2005-04-16 15:20:36 -07:00
. readpage = ext2_readpage ,
. readpages = ext2_readpages ,
. writepage = ext2_nobh_writepage ,
. sync_page = block_sync_page ,
2007-10-16 01:25:25 -07:00
. write_begin = ext2_nobh_write_begin ,
. write_end = nobh_write_end ,
2005-04-16 15:20:36 -07:00
. bmap = ext2_bmap ,
. direct_IO = ext2_direct_IO ,
. writepages = ext2_writepages ,
2006-02-01 03:05:41 -08:00
. migratepage = buffer_migrate_page ,
2005-04-16 15:20:36 -07:00
} ;
/*
 * Report whether every 32-bit word in the half-open range [p, q) is zero.
 * Returns 1 if the range is empty or contains only zero words, and 0 as
 * soon as a non-zero word is found.
 *
 * This probably belongs in a library: a search for the first non-zero word
 * or a memcmp against the zero page, whichever suits the architecture.
 */
static inline int all_zeroes(__le32 *p, __le32 *q)
{
	__le32 *cur;

	for (cur = p; cur < q; cur++) {
		if (*cur)
			return 0;
	}
	return 1;
}
/**
 * ext2_find_shared - find the indirect blocks for partial truncation.
 * @inode: inode in question
 * @depth: depth of the affected branch
 * @offsets: offsets of pointers in that branch (see ext2_block_to_path)
 * @chain: place to store the pointers to partial indirect blocks
 * @top: place to store the (detached) top of branch
 *
 * This is a helper function used by ext2_truncate().
 *
 * When we do truncate() we may have to clean the ends of several indirect
 * blocks but leave the blocks themselves alive.  A block is partially
 * truncated if some data below the new i_size is referred to from it (and
 * it is on the path to the first completely truncated data block, indeed).
 * We have to free the top of that path along with everything to the right
 * of the path.  Since no allocation past the truncation point is possible
 * until ext2_truncate() finishes, we may safely do the latter, but the top
 * of the branch may require special attention - pageout below the
 * truncation point might try to populate it.
 *
 * We atomically detach the top of the branch from the tree, store the block
 * number of its root in *@top, pointers to buffer_heads of partially
 * truncated blocks - in @chain[].bh and pointers to their last elements
 * that should not be removed - in @chain[].p.  The return value is the
 * pointer to the last filled element of @chain.
 *
 * The work left to the caller is the actual freeing of subtrees:
 *	a) free the subtree starting from *@top
 *	b) free the subtrees whose roots are stored in
 *		(@chain[i].p + 1 .. end of @chain[i].bh->b_data)
 *	c) free the subtrees growing from the inode past @chain[0].p
 *		(no partially truncated stuff there).
 */
static Indirect * ext2_find_shared ( struct inode * inode ,
int depth ,
int offsets [ 4 ] ,
Indirect chain [ 4 ] ,
__le32 * top )
{
Indirect * partial , * p ;
int k , err ;
/* Default result: nothing has been detached. */
* top = 0 ;
/*
 * Shorten the path while its last offset is zero, but never below one
 * step; the loop body is intentionally empty.
 */
for ( k = depth ; k > 1 & & ! offsets [ k - 1 ] ; k - - )
;
partial = ext2_get_branch ( inode , k , offsets , chain , & err ) ;
/*
 * A NULL result is replaced by the last element of the k-deep chain.
 * NOTE(review): presumably NULL means the whole branch was read - confirm
 * against ext2_get_branch().
 */
if ( ! partial )
partial = chain + k - 1 ;
/*
 * If the branch acquired a continuation since we last looked at it -
 * fine, it should all survive and the (new) top doesn't belong to us.
 */
write_lock ( & EXT2_I ( inode ) - > i_meta_lock ) ;
if ( ! partial - > key & & * partial - > p ) {
write_unlock ( & EXT2_I ( inode ) - > i_meta_lock ) ;
goto no_top ;
}
/*
 * Step back towards the inode while every word that precedes p->p in
 * the block at p is zero; the loop body is intentionally empty.
 */
for ( p = partial ; p > chain & & all_zeroes ( ( __le32 * ) p - > bh - > b_data , p - > p ) ; p - - )
;
/*
 * OK, we've found the last block that must survive.  The rest of our
 * branch should be detached before unlocking.  However, if that rest
 * of the branch is all ours and does not grow immediately from the inode
 * it's easier to cheat and just decrement partial->p.
 */
if ( p = = chain + k - 1 & & p > chain ) {
p - > p - - ;
} else {
/* Hand the subtree root to the caller and clear its slot in the tree. */
* top = * p - > p ;
* p - > p = 0 ;
}
write_unlock ( & EXT2_I ( inode ) - > i_meta_lock ) ;
/* Release the buffers of chain entries beyond p; partial ends up equal to p. */
while ( partial > p )
{
brelse ( partial - > bh ) ;
partial - - ;
}
no_top :
return partial ;
}
/**
 * ext2_free_data - free a list of data blocks
 * @inode: inode we are dealing with
 * @p: array of block numbers
 * @q: points immediately past the end of array
 *
 * Every non-zero block number in [@p, @q) (stored as little-endian 32-bit)
 * is cleared in the array and the block is released.  Consecutive block
 * numbers are gathered into a run so that each run is released with a
 * single ext2_free_blocks() call; the inode is marked dirty before each
 * release.
 */
static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
{
	unsigned long run_start = 0;
	unsigned long run_len = 0;

	while (p < q) {
		unsigned long blk = le32_to_cpu(*p);

		if (blk) {
			*p = 0;
			if (run_len != 0 && run_start == blk - run_len) {
				/* blk directly follows the current run */
				run_len++;
			} else {
				/* flush the pending run, if any, then start anew */
				if (run_len != 0) {
					mark_inode_dirty(inode);
					ext2_free_blocks(inode, run_start, run_len);
				}
				run_start = blk;
				run_len = 1;
			}
		}
		p++;
	}

	if (run_len > 0) {
		mark_inode_dirty(inode);
		ext2_free_blocks(inode, run_start, run_len);
	}
}
/**
 * ext2_free_branches - free an array of branches
 * @inode: inode we are dealing with
 * @p: array of block numbers
 * @q: pointer immediately past the end of array
 * @depth: depth of the branches to free
 *
 * With @depth == 0 the array holds data block numbers and is handed to
 * ext2_free_data().  Otherwise each non-zero entry (little-endian 32-bit)
 * names an indirect block: the slot is cleared, the block is read, its
 * contents are freed recursively one level shallower, and finally the
 * indirect block itself is released.
 */
static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int depth)
{
	int per_block;
	int child_depth;
	__le32 *slot;

	if (depth == 0) {
		ext2_free_data(inode, p, q);
		return;
	}

	child_depth = depth - 1;
	per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb);

	for (slot = p; slot < q; slot++) {
		struct buffer_head *bh;
		unsigned long nr = le32_to_cpu(*slot);
		__le32 *first;

		if (nr == 0)
			continue;

		*slot = 0;
		bh = sb_bread(inode->i_sb, nr);
		if (bh == NULL) {
			/*
			 * Read failure (should be rare): report it; the slot
			 * has already been cleared above.
			 */
			ext2_error(inode->i_sb, " ext2_free_branches ",
				   " Read failure, inode=%ld, block=%ld ",
				   inode->i_ino, nr);
			continue;
		}

		first = (__le32 *)bh->b_data;
		ext2_free_branches(inode, first, first + per_block, child_depth);
		bforget(bh);
		ext2_free_blocks(inode, nr, 1);
		mark_inode_dirty(inode);
	}
}
2007-10-16 23:30:46 -07:00
/*
 * ext2_truncate - release the blocks of @inode that lie at or beyond the
 * block containing the end of the file (i_size rounded up to a block).
 *
 * Returns silently without doing anything for inodes that are not regular
 * files, directories or symlinks, for fast symlinks, and for append-only
 * or immutable inodes.  The block tree is modified under
 * ei->truncate_mutex.  Afterwards mtime and ctime are updated and the inode
 * is either synced (when inode_needs_sync() says so) or marked dirty.
 */
void ext2_truncate ( struct inode * inode )
2005-04-16 15:20:36 -07:00
{
__le32 * i_data = EXT2_I ( inode ) - > i_data ;
2007-10-16 23:30:46 -07:00
struct ext2_inode_info * ei = EXT2_I ( inode ) ;
2005-04-16 15:20:36 -07:00
int addr_per_block = EXT2_ADDR_PER_BLOCK ( inode - > i_sb ) ;
int offsets [ 4 ] ;
Indirect chain [ 4 ] ;
Indirect * partial ;
__le32 nr = 0 ;
int n ;
long iblock ;
unsigned blocksize ;
/* Only regular files, directories and symlinks are handled here. */
if ( ! ( S_ISREG ( inode - > i_mode ) | | S_ISDIR ( inode - > i_mode ) | |
S_ISLNK ( inode - > i_mode ) ) )
return ;
/* Fast symlinks are left untouched. */
if ( ext2_inode_is_fast_symlink ( inode ) )
return ;
/* Append-only and immutable inodes are never truncated. */
if ( IS_APPEND ( inode ) | | IS_IMMUTABLE ( inode ) )
return ;
blocksize = inode - > i_sb - > s_blocksize ;
/* Index of the first block lying entirely at or beyond i_size. */
iblock = ( inode - > i_size + blocksize - 1 )
> > EXT2_BLOCK_SIZE_BITS ( inode - > i_sb ) ;
2005-06-23 22:05:26 -07:00
/*
 * Process the page containing i_size with the helper that matches the
 * mapping: XIP, NOBH, or the default block_truncate_page().
 */
if ( mapping_is_xip ( inode - > i_mapping ) )
xip_truncate_page ( inode - > i_mapping , inode - > i_size ) ;
else if ( test_opt ( inode - > i_sb , NOBH ) )
2007-10-16 01:25:25 -07:00
nobh_truncate_page ( inode - > i_mapping ,
inode - > i_size , ext2_get_block ) ;
2005-04-16 15:20:36 -07:00
else
block_truncate_page ( inode - > i_mapping ,
inode - > i_size , ext2_get_block ) ;
/* Translate iblock into a path of offsets; n is the number of steps. */
n = ext2_block_to_path ( inode , iblock , offsets , NULL ) ;
/*
 * No path: nothing to free.
 * NOTE(review): presumably iblock is out of the mappable range - confirm
 * against ext2_block_to_path().
 */
if ( n = = 0 )
return ;
2007-10-16 23:30:46 -07:00
/*
 * From here we block out all ext2_get_block() callers who want to
 * modify the block allocation tree.
 */
mutex_lock ( & ei - > truncate_mutex ) ;
2005-04-16 15:20:36 -07:00
/*
 * One-step path: the cut falls among the direct pointers, so free the
 * direct slots from offsets[0] up to EXT2_NDIR_BLOCKS and then the whole
 * indirect trees.
 */
if ( n = = 1 ) {
ext2_free_data ( inode , i_data + offsets [ 0 ] ,
i_data + EXT2_NDIR_BLOCKS ) ;
goto do_indirects ;
}
partial = ext2_find_shared ( inode , n , offsets , chain , & nr ) ;
/* Kill the top of shared branch (already detached) */
if ( nr ) {
/*
 * The cleared slot lives in the inode itself when partial is the
 * first chain element, otherwise in partial's buffer.
 */
if ( partial = = chain )
mark_inode_dirty ( inode ) ;
else
mark_buffer_dirty_inode ( partial - > bh , inode ) ;
ext2_free_branches ( inode , & nr , & nr + 1 , ( chain + n - 1 ) - partial ) ;
}
/* Clear the ends of indirect blocks on the shared branch */
while ( partial > chain ) {
ext2_free_branches ( inode ,
partial - > p + 1 ,
( __le32 * ) partial - > bh - > b_data + addr_per_block ,
( chain + n - 1 ) - partial ) ;
mark_buffer_dirty_inode ( partial - > bh , inode ) ;
brelse ( partial - > bh ) ;
partial - - ;
}
do_indirects :
/* Kill the remaining (whole) subtrees */
/*
 * offsets[0] selects the entry point: every indirect tree after the one
 * containing the cut is freed whole.  The cases fall through on purpose.
 */
switch ( offsets [ 0 ] ) {
default :
nr = i_data [ EXT2_IND_BLOCK ] ;
if ( nr ) {
i_data [ EXT2_IND_BLOCK ] = 0 ;
mark_inode_dirty ( inode ) ;
ext2_free_branches ( inode , & nr , & nr + 1 , 1 ) ;
}
/* fall through */
case EXT2_IND_BLOCK :
nr = i_data [ EXT2_DIND_BLOCK ] ;
if ( nr ) {
i_data [ EXT2_DIND_BLOCK ] = 0 ;
mark_inode_dirty ( inode ) ;
ext2_free_branches ( inode , & nr , & nr + 1 , 2 ) ;
}
/* fall through */
case EXT2_DIND_BLOCK :
nr = i_data [ EXT2_TIND_BLOCK ] ;
if ( nr ) {
i_data [ EXT2_TIND_BLOCK ] = 0 ;
mark_inode_dirty ( inode ) ;
ext2_free_branches ( inode , & nr , & nr + 1 , 3 ) ;
}
/* fall through */
case EXT2_TIND_BLOCK :
;
}
2007-10-16 23:30:46 -07:00
ext2_discard_reservation ( inode ) ;
mutex_unlock ( & ei - > truncate_mutex ) ;
2005-04-16 15:20:36 -07:00
/* Record the change time, then sync now or leave it to writeback. */
inode - > i_mtime = inode - > i_ctime = CURRENT_TIME_SEC ;
if ( inode_needs_sync ( inode ) ) {
sync_mapping_buffers ( inode - > i_mapping ) ;
ext2_sync_inode ( inode ) ;
} else {
mark_inode_dirty ( inode ) ;
}
}
/*
 * ext2_get_inode - locate the on-disk copy of inode @ino.
 * @sb: superblock of the filesystem
 * @ino: inode number to look up
 * @p: receives the buffer_head holding the inode; NULL on failure
 *
 * On success returns a pointer into the data of the buffer stored in *@p;
 * the callers in this file release that buffer with brelse().
 * On failure returns ERR_PTR(-EINVAL) for an inode number outside the valid
 * range, or ERR_PTR(-EIO) when the group descriptor cannot be obtained or
 * the inode table block cannot be read.
 */
static struct ext2_inode * ext2_get_inode ( struct super_block * sb , ino_t ino ,
struct buffer_head * * p )
{
struct buffer_head * bh ;
unsigned long block_group ;
unsigned long block ;
unsigned long offset ;
struct ext2_group_desc * gdp ;
* p = NULL ;
/*
 * Valid numbers are the root inode, or anything from EXT2_FIRST_INO(sb)
 * up to and including the superblock's s_inodes_count.
 */
if ( ( ino ! = EXT2_ROOT_INO & & ino < EXT2_FIRST_INO ( sb ) ) | |
ino > le32_to_cpu ( EXT2_SB ( sb ) - > s_es - > s_inodes_count ) )
goto Einval ;
/* Inode numbers start at 1, hence the "ino - 1". */
block_group = ( ino - 1 ) / EXT2_INODES_PER_GROUP ( sb ) ;
2007-10-16 23:26:30 -07:00
gdp = ext2_get_group_desc ( sb , block_group , NULL ) ;
2005-04-16 15:20:36 -07:00
if ( ! gdp )
goto Egdp ;
/*
 * Figure out the byte offset within the block group's inode table
 */
offset = ( ( ino - 1 ) % EXT2_INODES_PER_GROUP ( sb ) ) * EXT2_INODE_SIZE ( sb ) ;
/* Table start block plus the number of whole blocks covered by offset. */
block = le32_to_cpu ( gdp - > bg_inode_table ) +
( offset > > EXT2_BLOCK_SIZE_BITS ( sb ) ) ;
if ( ! ( bh = sb_bread ( sb , block ) ) )
goto Eio ;
* p = bh ;
/* Keep only the offset within that block. */
offset & = ( EXT2_BLOCK_SIZE ( sb ) - 1 ) ;
return ( struct ext2_inode * ) ( bh - > b_data + offset ) ;
Einval :
ext2_error ( sb , " ext2_get_inode " , " bad inode number: %lu " ,
( unsigned long ) ino ) ;
return ERR_PTR ( - EINVAL ) ;
Eio :
ext2_error ( sb , " ext2_get_inode " ,
" unable to read inode block - inode=%lu, block=%lu " ,
( unsigned long ) ino , block ) ;
/* Eio falls through: both failures are reported to the caller as -EIO. */
Egdp :
return ERR_PTR ( - EIO ) ;
}
void ext2_set_inode_flags ( struct inode * inode )
{
unsigned int flags = EXT2_I ( inode ) - > i_flags ;
inode - > i_flags & = ~ ( S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC ) ;
if ( flags & EXT2_SYNC_FL )
inode - > i_flags | = S_SYNC ;
if ( flags & EXT2_APPEND_FL )
inode - > i_flags | = S_APPEND ;
if ( flags & EXT2_IMMUTABLE_FL )
inode - > i_flags | = S_IMMUTABLE ;
if ( flags & EXT2_NOATIME_FL )
inode - > i_flags | = S_NOATIME ;
if ( flags & EXT2_DIRSYNC_FL )
inode - > i_flags | = S_DIRSYNC ;
}
2007-05-08 00:31:04 -07:00
/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
void ext2_get_inode_flags ( struct ext2_inode_info * ei )
{
unsigned int flags = ei - > vfs_inode . i_flags ;
ei - > i_flags & = ~ ( EXT2_SYNC_FL | EXT2_APPEND_FL |
EXT2_IMMUTABLE_FL | EXT2_NOATIME_FL | EXT2_DIRSYNC_FL ) ;
if ( flags & S_SYNC )
ei - > i_flags | = EXT2_SYNC_FL ;
if ( flags & S_APPEND )
ei - > i_flags | = EXT2_APPEND_FL ;
if ( flags & S_IMMUTABLE )
ei - > i_flags | = EXT2_IMMUTABLE_FL ;
if ( flags & S_NOATIME )
ei - > i_flags | = EXT2_NOATIME_FL ;
if ( flags & S_DIRSYNC )
ei - > i_flags | = EXT2_DIRSYNC_FL ;
}
2008-02-07 00:15:35 -08:00
/*
 * ext2_iget - obtain the in-memory inode for inode number @ino on @sb.
 *
 * An inode that iget_locked() returns without I_NEW set is handed back
 * unchanged.  A new one is filled in from the on-disk inode, given its
 * operation tables according to file type and mount options, and unlocked.
 *
 * Returns the inode on success.  On failure returns ERR_PTR(-ENOMEM) when
 * iget_locked() fails, the error from ext2_get_inode() when the on-disk
 * inode cannot be located, or ERR_PTR(-ESTALE) when the on-disk inode is
 * deleted.
 */
struct inode * ext2_iget ( struct super_block * sb , unsigned long ino )
2005-04-16 15:20:36 -07:00
{
2008-02-07 00:15:35 -08:00
struct ext2_inode_info * ei ;
2005-04-16 15:20:36 -07:00
struct buffer_head * bh ;
2008-02-07 00:15:35 -08:00
struct ext2_inode * raw_inode ;
struct inode * inode ;
long ret = - EIO ;
2005-04-16 15:20:36 -07:00
int n ;
2008-02-07 00:15:35 -08:00
inode = iget_locked ( sb , ino ) ;
if ( ! inode )
return ERR_PTR ( - ENOMEM ) ;
/* Not flagged I_NEW: return it without touching it. */
if ( ! ( inode - > i_state & I_NEW ) )
return inode ;
ei = EXT2_I ( inode ) ;
2005-04-16 15:20:36 -07:00
# ifdef CONFIG_EXT2_FS_POSIX_ACL
ei - > i_acl = EXT2_ACL_NOT_CACHED ;
ei - > i_default_acl = EXT2_ACL_NOT_CACHED ;
# endif
2007-10-16 23:30:46 -07:00
ei - > i_block_alloc_info = NULL ;
2008-02-07 00:15:35 -08:00
raw_inode = ext2_get_inode ( inode - > i_sb , ino , & bh ) ;
if ( IS_ERR ( raw_inode ) ) {
ret = PTR_ERR ( raw_inode ) ;
2005-04-16 15:20:36 -07:00
goto bad_inode ;
2008-02-07 00:15:35 -08:00
}
2005-04-16 15:20:36 -07:00
inode - > i_mode = le16_to_cpu ( raw_inode - > i_mode ) ;
inode - > i_uid = ( uid_t ) le16_to_cpu ( raw_inode - > i_uid_low ) ;
inode - > i_gid = ( gid_t ) le16_to_cpu ( raw_inode - > i_gid_low ) ;
/* Unless NO_UID32 is set, the upper 16 bits come from the *_high fields. */
if ( ! ( test_opt ( inode - > i_sb , NO_UID32 ) ) ) {
inode - > i_uid | = le16_to_cpu ( raw_inode - > i_uid_high ) < < 16 ;
inode - > i_gid | = le16_to_cpu ( raw_inode - > i_gid_high ) < < 16 ;
}
inode - > i_nlink = le16_to_cpu ( raw_inode - > i_links_count ) ;
inode - > i_size = le32_to_cpu ( raw_inode - > i_size ) ;
ext2/3/4: fix file date underflow on ext2 3 filesystems on 64 bit systems
Taken from http://bugzilla.kernel.org/show_bug.cgi?id=5079
signed long ranges from -2.147.483.648 to 2.147.483.647 on x86 32bit
10000011110110100100111110111101 .. -2,082,844,739
10000011110110100100111110111101 .. 2,212,122,557 <- this currently gets
stored on the disk but when converting it to a 64bit signed long value it loses
its sign and becomes positive.
Cc: Andreas Dilger <adilger@dilger.ca>
Cc: <linux-ext4@vger.kernel.org>
Andreas says:
This patch is now treating timestamps with the high bit set as negative
times (before Jan 1, 1970). This means we lose 1/2 of the possible range
of timestamps (lopping off 68 years before unix timestamp overflow -
now only 30 years away :-) to handle the extremely rare case of setting
timestamps into the distant past.
If we are only interested in fixing the underflow case, we could just
limit the values to 0 instead of storing negative values. At worst this
will skew the timestamp by a few hours for timezones in the far east
(files would still show Jan 1, 1970 in "ls -l" output).
That said, it seems 32-bit systems (mine at least) allow files to be set
into the past (01/01/1907 works fine) so it seems this patch is bringing
the x86_64 behaviour into sync with other kernels.
On the plus side, we have a patch that is ready to add nanosecond timestamps
to ext3 and as an added bonus adds 2 high bits to the on-disk timestamp so
this extends the maximum date to 2242.
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-08 00:23:39 -07:00
/* The 32-bit on-disk timestamps are cast to signed before being stored. */
inode - > i_atime . tv_sec = ( signed ) le32_to_cpu ( raw_inode - > i_atime ) ;
inode - > i_ctime . tv_sec = ( signed ) le32_to_cpu ( raw_inode - > i_ctime ) ;
inode - > i_mtime . tv_sec = ( signed ) le32_to_cpu ( raw_inode - > i_mtime ) ;
2005-04-16 15:20:36 -07:00
inode - > i_atime . tv_nsec = inode - > i_mtime . tv_nsec = inode - > i_ctime . tv_nsec = 0 ;
ei - > i_dtime = le32_to_cpu ( raw_inode - > i_dtime ) ;
/*
 * We now have enough fields to check if the inode was active or not.
 * This is needed because nfsd might try to access dead inodes;
 * the test is the same one that e2fsck uses.
 * NeilBrown 1999 oct15
 */
if ( inode - > i_nlink = = 0 & & ( inode - > i_mode = = 0 | | ei - > i_dtime ) ) {
/* this inode is deleted */
brelse ( bh ) ;
2008-02-07 00:15:35 -08:00
ret = - ESTALE ;
2005-04-16 15:20:36 -07:00
goto bad_inode ;
}
inode - > i_blocks = le32_to_cpu ( raw_inode - > i_blocks ) ;
ei - > i_flags = le32_to_cpu ( raw_inode - > i_flags ) ;
ei - > i_faddr = le32_to_cpu ( raw_inode - > i_faddr ) ;
ei - > i_frag_no = raw_inode - > i_frag ;
ei - > i_frag_size = raw_inode - > i_fsize ;
ei - > i_file_acl = le32_to_cpu ( raw_inode - > i_file_acl ) ;
ei - > i_dir_acl = 0 ;
/*
 * Regular files take the upper 32 bits of i_size from i_size_high;
 * every other type reads i_dir_acl instead.
 */
if ( S_ISREG ( inode - > i_mode ) )
inode - > i_size | = ( ( __u64 ) le32_to_cpu ( raw_inode - > i_size_high ) ) < < 32 ;
else
ei - > i_dir_acl = le32_to_cpu ( raw_inode - > i_dir_acl ) ;
ei - > i_dtime = 0 ;
inode - > i_generation = le32_to_cpu ( raw_inode - > i_generation ) ;
ei - > i_state = 0 ;
ei - > i_block_group = ( ino - 1 ) / EXT2_INODES_PER_GROUP ( inode - > i_sb ) ;
ei - > i_dir_start_lookup = 0 ;
/*
 * NOTE! The in-memory inode i_data array is in little-endian order
 * even on big-endian machines: we do NOT byteswap the block numbers!
 */
for ( n = 0 ; n < EXT2_N_BLOCKS ; n + + )
ei - > i_data [ n ] = raw_inode - > i_block [ n ] ;
/* Pick inode, file and address-space operations by file type. */
if ( S_ISREG ( inode - > i_mode ) ) {
inode - > i_op = & ext2_file_inode_operations ;
2005-06-23 22:05:26 -07:00
if ( ext2_use_xip ( inode - > i_sb ) ) {
inode - > i_mapping - > a_ops = & ext2_aops_xip ;
inode - > i_fop = & ext2_xip_file_operations ;
} else if ( test_opt ( inode - > i_sb , NOBH ) ) {
2005-04-16 15:20:36 -07:00
inode - > i_mapping - > a_ops = & ext2_nobh_aops ;
2005-06-23 22:05:26 -07:00
inode - > i_fop = & ext2_file_operations ;
} else {
2005-04-16 15:20:36 -07:00
inode - > i_mapping - > a_ops = & ext2_aops ;
2005-06-23 22:05:26 -07:00
inode - > i_fop = & ext2_file_operations ;
}
2005-04-16 15:20:36 -07:00
} else if ( S_ISDIR ( inode - > i_mode ) ) {
inode - > i_op = & ext2_dir_inode_operations ;
inode - > i_fop = & ext2_dir_operations ;
if ( test_opt ( inode - > i_sb , NOBH ) )
inode - > i_mapping - > a_ops = & ext2_nobh_aops ;
else
inode - > i_mapping - > a_ops = & ext2_aops ;
} else if ( S_ISLNK ( inode - > i_mode ) ) {
/* Fast symlinks get no address-space operations assigned here. */
if ( ext2_inode_is_fast_symlink ( inode ) )
inode - > i_op = & ext2_fast_symlink_inode_operations ;
else {
inode - > i_op = & ext2_symlink_inode_operations ;
if ( test_opt ( inode - > i_sb , NOBH ) )
inode - > i_mapping - > a_ops = & ext2_nobh_aops ;
else
inode - > i_mapping - > a_ops = & ext2_aops ;
}
} else {
inode - > i_op = & ext2_special_inode_operations ;
/*
 * Device number: a non-zero i_block[0] holds the old encoding,
 * otherwise i_block[1] holds the new encoding.
 */
if ( raw_inode - > i_block [ 0 ] )
init_special_inode ( inode , inode - > i_mode ,
old_decode_dev ( le32_to_cpu ( raw_inode - > i_block [ 0 ] ) ) ) ;
else
init_special_inode ( inode , inode - > i_mode ,
new_decode_dev ( le32_to_cpu ( raw_inode - > i_block [ 1 ] ) ) ) ;
}
brelse ( bh ) ;
ext2_set_inode_flags ( inode ) ;
2008-02-07 00:15:35 -08:00
unlock_new_inode ( inode ) ;
return inode ;
2005-04-16 15:20:36 -07:00
bad_inode :
2008-02-07 00:15:35 -08:00
iget_failed ( inode ) ;
return ERR_PTR ( ret ) ;
2005-04-16 15:20:36 -07:00
}
/*
 * ext2_update_inode - copy the in-memory inode into its on-disk buffer.
 * @inode: inode to write out
 * @do_sync: when non-zero, also write the buffer synchronously
 *
 * Returns 0 on success, or -EIO when the on-disk inode cannot be located
 * or the synchronous write of the buffer fails.
 */
static int ext2_update_inode ( struct inode * inode , int do_sync )
{
struct ext2_inode_info * ei = EXT2_I ( inode ) ;
struct super_block * sb = inode - > i_sb ;
ino_t ino = inode - > i_ino ;
uid_t uid = inode - > i_uid ;
gid_t gid = inode - > i_gid ;
struct buffer_head * bh ;
struct ext2_inode * raw_inode = ext2_get_inode ( sb , ino , & bh ) ;
int n ;
int err = 0 ;
/* Any lookup failure is reported to the caller as -EIO. */
if ( IS_ERR ( raw_inode ) )
return - EIO ;
/*
 * For fields not tracked in the in-memory inode,
 * initialise them to zero for new inodes.
 */
if ( ei - > i_state & EXT2_STATE_NEW )
memset ( raw_inode , 0 , EXT2_SB ( sb ) - > s_inode_size ) ;
2007-05-08 00:31:04 -07:00
/* Refresh ei->i_flags from the VFS inode flags before storing them. */
ext2_get_inode_flags ( ei ) ;
2005-04-16 15:20:36 -07:00
raw_inode - > i_mode = cpu_to_le16 ( inode - > i_mode ) ;
if ( ! ( test_opt ( sb , NO_UID32 ) ) ) {
raw_inode - > i_uid_low = cpu_to_le16 ( low_16_bits ( uid ) ) ;
raw_inode - > i_gid_low = cpu_to_le16 ( low_16_bits ( gid ) ) ;
/*
 * Fix up interoperability with old kernels.  Otherwise, old inodes get
 * re-used with the upper 16 bits of the uid/gid intact.
 */
if ( ! ei - > i_dtime ) {
raw_inode - > i_uid_high = cpu_to_le16 ( high_16_bits ( uid ) ) ;
raw_inode - > i_gid_high = cpu_to_le16 ( high_16_bits ( gid ) ) ;
} else {
raw_inode - > i_uid_high = 0 ;
raw_inode - > i_gid_high = 0 ;
}
} else {
/* NO_UID32: only the low fields are used; the high halves are zeroed. */
raw_inode - > i_uid_low = cpu_to_le16 ( fs_high2lowuid ( uid ) ) ;
raw_inode - > i_gid_low = cpu_to_le16 ( fs_high2lowgid ( gid ) ) ;
raw_inode - > i_uid_high = 0 ;
raw_inode - > i_gid_high = 0 ;
}
raw_inode - > i_links_count = cpu_to_le16 ( inode - > i_nlink ) ;
raw_inode - > i_size = cpu_to_le32 ( inode - > i_size ) ;
raw_inode - > i_atime = cpu_to_le32 ( inode - > i_atime . tv_sec ) ;
raw_inode - > i_ctime = cpu_to_le32 ( inode - > i_ctime . tv_sec ) ;
raw_inode - > i_mtime = cpu_to_le32 ( inode - > i_mtime . tv_sec ) ;
raw_inode - > i_blocks = cpu_to_le32 ( inode - > i_blocks ) ;
raw_inode - > i_dtime = cpu_to_le32 ( ei - > i_dtime ) ;
raw_inode - > i_flags = cpu_to_le32 ( ei - > i_flags ) ;
raw_inode - > i_faddr = cpu_to_le32 ( ei - > i_faddr ) ;
raw_inode - > i_frag = ei - > i_frag_no ;
raw_inode - > i_fsize = ei - > i_frag_size ;
raw_inode - > i_file_acl = cpu_to_le32 ( ei - > i_file_acl ) ;
/*
 * Regular files store the upper 32 bits of i_size in i_size_high;
 * every other type stores i_dir_acl instead.
 */
if ( ! S_ISREG ( inode - > i_mode ) )
raw_inode - > i_dir_acl = cpu_to_le32 ( ei - > i_dir_acl ) ;
else {
raw_inode - > i_size_high = cpu_to_le32 ( inode - > i_size > > 32 ) ;
if ( inode - > i_size > 0x7fffffffULL ) {
if ( ! EXT2_HAS_RO_COMPAT_FEATURE ( sb ,
EXT2_FEATURE_RO_COMPAT_LARGE_FILE ) | |
EXT2_SB ( sb ) - > s_es - > s_rev_level = =
cpu_to_le32 ( EXT2_GOOD_OLD_REV ) ) {
/*
 * If this is the first large file
 * created, add a flag to the superblock.
 */
lock_kernel ( ) ;
ext2_update_dynamic_rev ( sb ) ;
EXT2_SET_RO_COMPAT_FEATURE ( sb ,
EXT2_FEATURE_RO_COMPAT_LARGE_FILE ) ;
unlock_kernel ( ) ;
ext2_write_super ( sb ) ;
}
}
}
raw_inode - > i_generation = cpu_to_le32 ( inode - > i_generation ) ;
/*
 * Character and block devices keep their device number in i_block[]:
 * old encoding in slot 0 when it fits, otherwise new encoding in slot 1.
 * All other types copy the block pointer array unchanged.
 */
if ( S_ISCHR ( inode - > i_mode ) | | S_ISBLK ( inode - > i_mode ) ) {
if ( old_valid_dev ( inode - > i_rdev ) ) {
raw_inode - > i_block [ 0 ] =
cpu_to_le32 ( old_encode_dev ( inode - > i_rdev ) ) ;
raw_inode - > i_block [ 1 ] = 0 ;
} else {
raw_inode - > i_block [ 0 ] = 0 ;
raw_inode - > i_block [ 1 ] =
cpu_to_le32 ( new_encode_dev ( inode - > i_rdev ) ) ;
raw_inode - > i_block [ 2 ] = 0 ;
}
} else for ( n = 0 ; n < EXT2_N_BLOCKS ; n + + )
raw_inode - > i_block [ n ] = ei - > i_data [ n ] ;
mark_buffer_dirty ( bh ) ;
if ( do_sync ) {
sync_dirty_buffer ( bh ) ;
/* A buffer that was submitted but is not up to date means the write failed. */
if ( buffer_req ( bh ) & & ! buffer_uptodate ( bh ) ) {
printk ( " IO error syncing ext2 inode [%s:%08lx] \n " ,
sb - > s_id , ( unsigned long ) ino ) ;
err = - EIO ;
}
}
/* The inode has now been written at least once; it is no longer "new". */
ei - > i_state & = ~ EXT2_STATE_NEW ;
brelse ( bh ) ;
return err ;
}
/*
 * Write @inode to its on-disk buffer; @wait is passed through as the
 * do_sync argument of ext2_update_inode(), whose result is returned.
 */
int ext2_write_inode(struct inode *inode, int wait)
{
	int err = ext2_update_inode(inode, wait);

	return err;
}
int ext2_sync_inode ( struct inode * inode )
{
struct writeback_control wbc = {
. sync_mode = WB_SYNC_ALL ,
. nr_to_write = 0 , /* sys_fsync did this */
} ;
return sync_inode ( inode , & wbc ) ;
}
int ext2_setattr ( struct dentry * dentry , struct iattr * iattr )
{
struct inode * inode = dentry - > d_inode ;
int error ;
error = inode_change_ok ( inode , iattr ) ;
if ( error )
return error ;
if ( ( iattr - > ia_valid & ATTR_UID & & iattr - > ia_uid ! = inode - > i_uid ) | |
( iattr - > ia_valid & ATTR_GID & & iattr - > ia_gid ! = inode - > i_gid ) ) {
error = DQUOT_TRANSFER ( inode , iattr ) ? - EDQUOT : 0 ;
if ( error )
return error ;
}
error = inode_setattr ( inode , iattr ) ;
if ( ! error & & ( iattr - > ia_valid & ATTR_MODE ) )
error = ext2_acl_chmod ( inode ) ;
return error ;
}