/*
 *  linux/fs/nfs/blocklayout/extents.c
 *
 *  Module for the NFSv4.1 pNFS block layout driver.
 *
 *  Copyright (c) 2006 The Regents of the University of Michigan.
 *  All rights reserved.
 *
 *  Andy Adamson <andros@citi.umich.edu>
 *  Fred Isaman <iisaman@umich.edu>
 *
 * permission is granted to use, copy, create derivative works and
 * redistribute this software and such derivative works for any purpose,
 * so long as the name of the university of michigan is not used in
 * any advertising or publicity pertaining to the use or distribution
 * of this software without specific, written prior authorization.  if
 * the above copyright notice or any other identification of the
 * university of michigan is included in any copy of any portion of
 * this software, then the disclaimer below must also be included.
 *
 * this software is provided as is, without representation from the
 * university of michigan as to its fitness for any purpose, and without
 * warranty by the university of michigan of any kind, either express
 * or implied, including without limitation the implied warranties of
 * merchantability and fitness for a particular purpose.  the regents
 * of the university of michigan shall not be liable for any damages,
 * including special, indirect, incidental, or consequential damages,
 * with respect to any claim arising out or in connection with the use
 * of the software, even if it has been or is hereafter advised of the
 * possibility of such damages.
 */
#include "blocklayout.h"

#define NFSDBG_FACILITY NFSDBG_PNFS_LD
/* Bit numbers */
#define EXTENT_INITIALIZED 0
#define EXTENT_WRITTEN     1
#define EXTENT_IN_COMMIT   2
#define INTERNAL_EXISTS    MY_MAX_TAGS
#define INTERNAL_MASK      ((1 << INTERNAL_EXISTS) - 1)

/* Returns largest t <= s such that t % base == 0 */
static inline sector_t normalize(sector_t s, int base)
{
	sector_t tmp = s; /* Since do_div modifies its argument */

	return s - do_div(tmp, base);
}

static inline sector_t normalize_up(sector_t s, int base)
{
	return normalize(s + base - 1, base);
}
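
/*
 * For instance (illustrative values): with base 8, normalize(13, 8)
 * returns 8 (13 minus the remainder 5), while normalize_up(13, 8)
 * evaluates normalize(20, 8) and returns 16, i.e. round down and round
 * up to the nearest multiple of base, respectively.
 */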
/* Complete stub using list while determining the API wanted */

/* Returns tags, or negative */
static int32_t _find_entry(struct my_tree *tree, u64 s)
{
	struct pnfs_inval_tracking *pos;

	dprintk("%s(%llu) enter\n", __func__, s);
	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
		if (pos->it_sector > s)
			continue;
		else if (pos->it_sector == s)
			return pos->it_tags & INTERNAL_MASK;
		else
			break;
	}
	return -ENOENT;
}
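
/*
 * Note: the stub list is kept sorted by it_sector in ascending order
 * (see how _add_entry() below picks its insertion point), so the
 * reverse walk in _find_entry() can stop at the first entry at or
 * below the sector it is looking for.
 */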
static inline
int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
{
	int32_t tags;

	dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
	s = normalize(s, tree->mtt_step_size);
	tags = _find_entry(tree, s);
	if ((tags < 0) || !(tags & (1 << tag)))
		return 0;
	else
		return 1;
}
/* Creates entry with tag, or if entry already exists, unions tag to it.
 * If storage is not NULL, newly created entry will use it.
 * Returns number of entries added, or negative on error.
 */
static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
		      struct pnfs_inval_tracking *storage)
{
	int found = 0;
	struct pnfs_inval_tracking *pos;

	dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
		if (pos->it_sector > s)
			continue;
		else if (pos->it_sector == s) {
			found = 1;
			break;
		} else
			break;
	}
	if (found) {
		pos->it_tags |= (1 << tag);
		return 0;
	} else {
		struct pnfs_inval_tracking *new;

		if (storage)
			new = storage;
		else {
			new = kmalloc(sizeof(*new), GFP_NOFS);
			if (!new)
				return -ENOMEM;
		}
		new->it_sector = s;
		new->it_tags = (1 << tag);
		list_add(&new->it_link, &pos->it_link);
		return 1;
	}
}
/* XXXX Really want option to not create */
/* Over range, unions tag with existing entries, else creates entry with tag */
static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
{
	u64 i;

	dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
	for (i = normalize(s, tree->mtt_step_size); i < s + length;
	     i += tree->mtt_step_size)
		if (_add_entry(tree, i, tag, NULL))
			return -ENOMEM;
	return 0;
}
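
/*
 * Note that _set_range() treats any nonzero return from _add_entry(),
 * including the "one entry added" case, as -ENOMEM.  This appears to
 * assume callers have already populated the range (e.g. via
 * _preload_range() below), so that _add_entry() only ever unions tags
 * into existing entries and returns 0.
 */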
/* Ensure that future operations on given range of tree will not malloc */
static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
{
	u64 start, end, s;
	int count, i, used = 0, status = -ENOMEM;
	struct pnfs_inval_tracking **storage;

	dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
	start = normalize(offset, tree->mtt_step_size);
	end = normalize_up(offset + length, tree->mtt_step_size);
	count = (int)(end - start) / (int)tree->mtt_step_size;

	/* Pre-malloc what memory we might need */
	storage = kmalloc(sizeof(*storage) * count, GFP_NOFS);
	if (!storage)
		return -ENOMEM;
	for (i = 0; i < count; i++) {
		storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
				     GFP_NOFS);
		if (!storage[i])
			goto out_cleanup;
	}

	/* Now need lock - HOW??? */
	for (s = start; s < end; s += tree->mtt_step_size)
		used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
	/* Unlock - HOW??? */
	status = 0;

out_cleanup:
	for (i = used; i < count; i++) {
		if (!storage[i])
			break;
		kfree(storage[i]);
	}
	kfree(storage);
	return status;
}
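
/*
 * In _preload_range() above, _add_entry() returns 1 when it consumes a
 * preallocated entry and 0 when the sector already had one, so "used"
 * counts how many of the "count" preallocations were actually linked
 * into the tree; out_cleanup then frees only the leftovers.
 */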
static void set_needs_init(sector_t *array, sector_t offset)
{
	sector_t *p = array;

	dprintk("%s enter\n", __func__);
	if (!p)
		return;
	while (*p < offset)
		p++;
	if (*p == offset)
		return;
	else if (*p == ~0) {
		*p++ = offset;
		*p = ~0;
		return;
	} else {
		sector_t *save = p;

		dprintk("%s Adding %llu\n", __func__, (u64)offset);
		while (*p != ~0)
			p++;
		p++;
		memmove(save + 1, save, (char *)p - (char *)save);
		*save = offset;
		return;
	}
}
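
/*
 * set_needs_init() maintains "array" as an ascending list of sector
 * offsets terminated by a ~0 sentinel.  For example (illustrative
 * values), inserting 5 into {2, 9, ~0} shifts the tail and produces
 * {2, 5, 9, ~0}, while inserting 9 again is a no-op.
 */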
/* We are relying on page lock to serialize this */
int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
{
	int rv;

	spin_lock(&marks->im_lock);
	rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
	spin_unlock(&marks->im_lock);
	return rv;
}
/* Assume start, end already sector aligned */
static int
_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
{
	struct pnfs_inval_tracking *pos;
	u64 expect = 0;

	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
		if (pos->it_sector >= end)
			continue;
		if (!expect) {
			if ((pos->it_sector == end - tree->mtt_step_size) &&
			    (pos->it_tags & (1 << tag))) {
				expect = pos->it_sector - tree->mtt_step_size;
				if (pos->it_sector < tree->mtt_step_size ||
				    expect < start)
					return 1;
				continue;
			} else {
				return 0;
			}
		}
		if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
			return 0;
		expect -= tree->mtt_step_size;
		if (expect < start)
			return 1;
	}
	return 0;
}
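
/*
 * _range_has_tag() walks the stub list backwards from "end", requiring
 * a tagged entry at every step-size interval down to "start"; any gap
 * or untagged entry means the range as a whole does not carry the tag.
 */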
static int is_range_written(struct pnfs_inval_markings *marks,
			    sector_t start, sector_t end)
{
	int rv;

	spin_lock(&marks->im_lock);
	rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
	spin_unlock(&marks->im_lock);
	return rv;
}
/* Marks sectors in [offset, offset+length) as having been initialized.
 * All lengths are step-aligned, where step is min(pagesize, blocksize).
 * Notes where partial block is initialized, and helps prepare it for
 * complete initialization later.
 */
/* Currently assumes offset is page-aligned */
int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
			 sector_t offset, sector_t length,
			 sector_t **pages)
{
	sector_t s, start, end;
	sector_t *array = NULL; /* Pages to mark */

	dprintk("%s(offset=%llu,len=%llu) enter\n",
		__func__, (u64)offset, (u64)length);
	s = max((sector_t) 3,
		2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
	dprintk("%s set max=%llu\n", __func__, (u64)s);
	if (pages) {
		array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
		if (!array)
			goto outerr;
		array[0] = ~0;
	}

	start = normalize(offset, marks->im_block_size);
	end = normalize_up(offset + length, marks->im_block_size);
	if (_preload_range(&marks->im_tree, start, end - start))
		goto outerr;

	spin_lock(&marks->im_lock);
	for (s = normalize_up(start, PAGE_CACHE_SECTORS);
	     s < offset; s += PAGE_CACHE_SECTORS) {
		dprintk("%s pre-area pages\n", __func__);
		/* Portion of used block is not initialized */
		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
			set_needs_init(array, s);
	}
	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
		goto out_unlock;
	for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
	     s < end; s += PAGE_CACHE_SECTORS) {
		dprintk("%s post-area pages\n", __func__);
		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
			set_needs_init(array, s);
	}
	spin_unlock(&marks->im_lock);

	if (pages) {
		if (array[0] == ~0) {
			kfree(array);
			*pages = NULL;
		} else
			*pages = array;
	}
	return 0;

out_unlock:
	spin_unlock(&marks->im_lock);
outerr:
	if (pages) {
		kfree(array);
		*pages = NULL;
	}
	return -ENOMEM;
}
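
/*
 * The array allocated above looks sized to cover at most the pages of
 * the two partially-initialized blocks at either end of the range
 * (hence twice the pages-per-block count, with a floor of 3 entries to
 * leave room for the ~0 terminator).
 */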
/* Marks sectors in [offset, offset+length) as having been written to disk.
 * All lengths should be block aligned.
 */
static int mark_written_sectors(struct pnfs_inval_markings *marks,
				sector_t offset, sector_t length)
{
	int status;

	dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
		(u64)offset, (u64)length);
	spin_lock(&marks->im_lock);
	status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
	spin_unlock(&marks->im_lock);
	return status;
}
static void print_short_extent(struct pnfs_block_short_extent *be)
{
	dprintk("PRINT SHORT EXTENT extent %p\n", be);
	if (be) {
		dprintk("be_f_offset %llu\n", (u64)be->bse_f_offset);
		dprintk("be_length   %llu\n", (u64)be->bse_length);
	}
}

static void print_clist(struct list_head *list, unsigned int count)
{
	struct pnfs_block_short_extent *be;
	unsigned int i = 0;

	ifdebug(FACILITY) {
		printk(KERN_DEBUG "****************\n");
		printk(KERN_DEBUG "Extent list looks like:\n");
		list_for_each_entry(be, list, bse_node) {
			i++;
			print_short_extent(be);
		}
		if (i != count)
			printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n",
			       count);
		printk(KERN_DEBUG "****************\n");
	}
}
/* Note: In theory, we should do more checking that devids match between
 * old and new, but if they don't, the lists are too corrupt to salvage anyway.
 */
/* Note this is very similar to bl_add_merge_extent */
static void add_to_commitlist(struct pnfs_block_layout *bl,
			      struct pnfs_block_short_extent *new)
{
	struct list_head *clist = &bl->bl_commit;
	struct pnfs_block_short_extent *old, *save;
	sector_t end = new->bse_f_offset + new->bse_length;

	dprintk("%s enter\n", __func__);
	print_short_extent(new);
	print_clist(clist, bl->bl_count);
	bl->bl_count++;
	/* Scan for proper place to insert, extending new to the left
	 * as much as possible.
	 */
	list_for_each_entry_safe(old, save, clist, bse_node) {
		if (new->bse_f_offset < old->bse_f_offset)
			break;
		if (end <= old->bse_f_offset + old->bse_length) {
			/* Range is already in list */
			bl->bl_count--;
			kfree(new);
			return;
		} else if (new->bse_f_offset <=
			   old->bse_f_offset + old->bse_length) {
			/* new overlaps or abuts existing be */
			if (new->bse_mdev == old->bse_mdev) {
				/* extend new to fully replace old */
				new->bse_length += new->bse_f_offset -
					old->bse_f_offset;
				new->bse_f_offset = old->bse_f_offset;
				list_del(&old->bse_node);
				bl->bl_count--;
				kfree(old);
			}
		}
	}
	/* Note that if we never hit the above break, old will not point to a
	 * valid extent.  However, in that case &old->bse_node == list.
	 */
	list_add_tail(&new->bse_node, &old->bse_node);
	/* Scan forward for overlaps.  If we find any, extend new and
	 * remove the overlapped extent.
	 */
	old = list_prepare_entry(new, clist, bse_node);
	list_for_each_entry_safe_continue(old, save, clist, bse_node) {
		if (end < old->bse_f_offset)
			break;
		/* new overlaps or abuts old */
		if (new->bse_mdev == old->bse_mdev) {
			if (end < old->bse_f_offset + old->bse_length) {
				/* extend new to fully cover old */
				end = old->bse_f_offset + old->bse_length;
				new->bse_length = end - new->bse_f_offset;
			}
			list_del(&old->bse_node);
			bl->bl_count--;
			kfree(old);
		}
	}
	dprintk("%s: after merging\n", __func__);
	print_clist(clist, bl->bl_count);
}
/* Note the range described by offset, length is guaranteed to be contained
 * within be.
 */
int bl_mark_for_commit(struct pnfs_block_extent *be,
		       sector_t offset, sector_t length)
{
	sector_t new_end, end = offset + length;
	struct pnfs_block_short_extent *new;
	struct pnfs_block_layout *bl = container_of(be->be_inval,
						    struct pnfs_block_layout,
						    bl_inval);

	new = kmalloc(sizeof(*new), GFP_NOFS);
	if (!new)
		return -ENOMEM;

	mark_written_sectors(be->be_inval, offset, length);
	/* We want to add the range to commit list, but it must be
	 * block-normalized, and verified that the normalized range has
	 * been entirely written to disk.
	 */
	new->bse_f_offset = offset;
	offset = normalize(offset, bl->bl_blocksize);
	if (offset < new->bse_f_offset) {
		if (is_range_written(be->be_inval, offset, new->bse_f_offset))
			new->bse_f_offset = offset;
		else
			new->bse_f_offset = offset + bl->bl_blocksize;
	}
	new_end = normalize_up(end, bl->bl_blocksize);
	if (end < new_end) {
		if (is_range_written(be->be_inval, end, new_end))
			end = new_end;
		else
			end = new_end - bl->bl_blocksize;
	}
	if (end <= new->bse_f_offset) {
		kfree(new);
		return 0;
	}
	new->bse_length = end - new->bse_f_offset;
	new->bse_devid = be->be_devid;
	new->bse_mdev = be->be_mdev;

	spin_lock(&bl->bl_ext_lock);
	/* new will be freed, either by add_to_commitlist if it decides not
	 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
	 */
	add_to_commitlist(bl, new);
	spin_unlock(&bl->bl_ext_lock);
	return 0;
}
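
/*
 * As an illustration of the normalization above (values hypothetical):
 * with an 8-sector block size, marking offset 10, length 12 gives a raw
 * range [10, 22).  The left edge rounds down to 8 if sectors [8, 10)
 * were already written, else up to 16; the right edge rounds up to 24
 * if [22, 24) were written, else down to 16.  An empty result is
 * simply dropped.
 */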
static void print_bl_extent(struct pnfs_block_extent *be)
{
	dprintk("PRINT EXTENT extent %p\n", be);
	if (be) {
		dprintk("be_f_offset %llu\n", (u64)be->be_f_offset);
		dprintk("be_length   %llu\n", (u64)be->be_length);
		dprintk("be_v_offset %llu\n", (u64)be->be_v_offset);
		dprintk("be_state    %d\n", be->be_state);
	}
}

static void
destroy_extent(struct kref *kref)
{
	struct pnfs_block_extent *be;

	be = container_of(kref, struct pnfs_block_extent, be_refcnt);
	dprintk("%s be=%p\n", __func__, be);
	kfree(be);
}

void
bl_put_extent(struct pnfs_block_extent *be)
{
	if (be) {
		dprintk("%s enter %p (%i)\n", __func__, be,
			atomic_read(&be->be_refcnt.refcount));
		kref_put(&be->be_refcnt, destroy_extent);
	}
}

struct pnfs_block_extent *bl_alloc_extent(void)
{
	struct pnfs_block_extent *be;

	be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
	if (!be)
		return NULL;
	INIT_LIST_HEAD(&be->be_node);
	kref_init(&be->be_refcnt);
	be->be_inval = NULL;
	return be;
}

static void print_elist(struct list_head *list)
{
	struct pnfs_block_extent *be;

	dprintk("****************\n");
	dprintk("Extent list looks like:\n");
	list_for_each_entry(be, list, be_node) {
		print_bl_extent(be);
	}
	dprintk("****************\n");
}
static inline int
extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
{
	/* Note this assumes new->be_f_offset >= old->be_f_offset */
	return (new->be_state == old->be_state) &&
		((new->be_state == PNFS_BLOCK_NONE_DATA) ||
		 ((new->be_v_offset - old->be_v_offset ==
		   new->be_f_offset - old->be_f_offset) &&
		  new->be_mdev == old->be_mdev));
}
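
/*
 * In other words, two extents are consistent when they share a state
 * and, unless that state is NONE_DATA, their file-to-volume mappings
 * are collinear on the same device: the difference between their
 * volume offsets equals the difference between their file offsets.
 */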
/* Adds new to appropriate list in bl, modifying new and removing existing
 * extents as appropriate to deal with overlaps.
 *
 * See bl_find_get_extent for list constraints.
 *
 * Refcount on new is already set.  If we end up not using it, or error
 * out, we need to put the reference.
 *
 * bl->bl_ext_lock is held by caller.
 */
int
bl_add_merge_extent(struct pnfs_block_layout *bl,
		    struct pnfs_block_extent *new)
{
	struct pnfs_block_extent *be, *tmp;
	sector_t end = new->be_f_offset + new->be_length;
	struct list_head *list;

	dprintk("%s enter with be=%p\n", __func__, new);
	print_bl_extent(new);
	list = &bl->bl_extents[bl_choose_list(new->be_state)];
	print_elist(list);

	/* Scan for proper place to insert, extending new to the left
	 * as much as possible.
	 */
	list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
		if (new->be_f_offset >= be->be_f_offset + be->be_length)
			break;
		if (new->be_f_offset >= be->be_f_offset) {
			if (end <= be->be_f_offset + be->be_length) {
				/* new is a subset of existing be */
				if (extents_consistent(be, new)) {
					dprintk("%s: new is subset, ignoring\n",
						__func__);
					bl_put_extent(new);
					return 0;
				} else {
					goto out_err;
				}
			} else {
				/* |<--      be      -->|
				 *      |<--      new      -->| */
				if (extents_consistent(be, new)) {
					/* extend new to fully replace be */
					new->be_length += new->be_f_offset -
						be->be_f_offset;
					new->be_f_offset = be->be_f_offset;
					new->be_v_offset = be->be_v_offset;
					dprintk("%s: removing %p\n", __func__,
						be);
					list_del(&be->be_node);
					bl_put_extent(be);
				} else {
					goto out_err;
				}
			}
		} else if (end >= be->be_f_offset + be->be_length) {
			/* new extent overlaps existing be */
			if (extents_consistent(be, new)) {
				/* extend new to fully replace be */
				dprintk("%s: removing %p\n", __func__, be);
				list_del(&be->be_node);
				bl_put_extent(be);
			} else {
				goto out_err;
			}
		} else if (end > be->be_f_offset) {
			/*      |<--      be      -->|
			 * |<--      new      -->| */
			if (extents_consistent(new, be)) {
				/* extend new to fully replace be */
				new->be_length += be->be_f_offset +
					be->be_length -
					new->be_f_offset - new->be_length;
				dprintk("%s: removing %p\n", __func__, be);
				list_del(&be->be_node);
				bl_put_extent(be);
			} else {
				goto out_err;
			}
		}
	}
	/* Note that if we never hit the above break, be will not point to a
	 * valid extent.  However, in that case &be->be_node == list.
	 */
	list_add(&new->be_node, &be->be_node);
dprintk ( " %s: inserting new \n " , __func__ ) ;
print_elist ( list ) ;
/* FIXME - The per-list consistency checks have all been done,
* should now check cross - list consistency .
*/
return 0 ;
out_err :
bl_put_extent ( new ) ;
return - EIO ;
}
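
/*
 * Note that bl_add_merge_extent() returns -EIO whenever an overlapping
 * extent disagrees with the new one per extents_consistent(),
 * presumably because the server has handed out conflicting mappings
 * and the list can no longer be trusted.
 */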
/* Returns extent, or NULL.  If a second READ extent exists, it is returned
 * in cow_read, if given.
 *
 * The extents are kept in two separate ordered lists, one for READ and NONE,
 * one for READWRITE and INVALID.  Within each list, we assume:
 * 1. Extents are ordered by file offset.
 * 2. For any given isect, there is at most one extent that matches.
 */
struct pnfs_block_extent *
bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
		   struct pnfs_block_extent **cow_read)
{
	struct pnfs_block_extent *be, *cow, *ret;
	int i;

	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
	cow = ret = NULL;
	spin_lock(&bl->bl_ext_lock);
	for (i = 0; i < EXTENT_LISTS; i++) {
		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
			if (isect >= be->be_f_offset + be->be_length)
				break;
			if (isect >= be->be_f_offset) {
				/* We have found an extent */
				dprintk("%s Get %p (%i)\n", __func__, be,
					atomic_read(&be->be_refcnt.refcount));
				kref_get(&be->be_refcnt);
				if (!ret)
					ret = be;
				else if (be->be_state != PNFS_BLOCK_READ_DATA)
					bl_put_extent(be);
				else
					cow = be;
				break;
			}
		}
		if (ret &&
		    (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
			break;
	}
	spin_unlock(&bl->bl_ext_lock);
	if (cow_read)
		*cow_read = cow;
	print_bl_extent(ret);
	return ret;
}
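
/*
 * The loop above only searches the second list when the first match is
 * INVALID_DATA and the caller asked for cow_read: an INVALID extent
 * may be shadowed by a READ extent in the other list that supplies the
 * data for copy-on-write.
 */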
/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
static struct pnfs_block_extent *
bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
{
	struct pnfs_block_extent *be, *ret = NULL;
	int i;

	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
	for (i = 0; i < EXTENT_LISTS; i++) {
		if (ret)
			break;
		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
			if (isect >= be->be_f_offset + be->be_length)
				break;
			if (isect >= be->be_f_offset) {
				/* We have found an extent */
				dprintk("%s Get %p (%i)\n", __func__, be,
					atomic_read(&be->be_refcnt.refcount));
				kref_get(&be->be_refcnt);
				ret = be;
				break;
			}
		}
	}
	print_bl_extent(ret);
	return ret;
}
int
encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
			       struct xdr_stream *xdr,
			       const struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_short_extent *lce, *save;
	unsigned int count = 0;
	__be32 *p, *xdr_start;

	dprintk("%s enter\n", __func__);
	/* BUG - creation of bl_commit is buggy - need to wait for
	 * entire block to be marked WRITTEN before it can be added.
	 */
	spin_lock(&bl->bl_ext_lock);
	/* Want to adjust for possible truncate */
	/* We now want to adjust argument range */

	/* XDR encode the ranges found */
	xdr_start = xdr_reserve_space(xdr, 8);
	if (!xdr_start)
		goto out;
	list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
		p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
		if (!p)
			break;
		p = xdr_encode_opaque_fixed(p, lce->bse_devid.data,
					    NFS4_DEVICEID4_SIZE);
		p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
		p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
		p = xdr_encode_hyper(p, 0LL);
		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
		list_del(&lce->bse_node);
		list_add_tail(&lce->bse_node, &bl->bl_committing);
		bl->bl_count--;
		count++;
	}
	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
	xdr_start[1] = cpu_to_be32(count);
out:
	spin_unlock(&bl->bl_ext_lock);
	dprintk("%s found %i ranges\n", __func__, count);
	return 0;
}
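
/*
 * The 8 bytes reserved at xdr_start hold two 32-bit words: the byte
 * length of the opaque layoutupdate body (filled in retroactively from
 * the stream pointer, excluding the length word itself) and the extent
 * count.  Each encoded extent is a deviceid, file offset, length, zero
 * storage offset, and READWRITE_DATA state, which appears to match the
 * pnfs_block_layoutupdate4 commit-list entries of the block layout
 * spec (RFC 5663).
 */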
/* Helper function to set_to_rw that initializes a new extent */
static void
_prep_new_extent(struct pnfs_block_extent *new,
		 struct pnfs_block_extent *orig,
		 sector_t offset, sector_t length, int state)
{
	kref_init(&new->be_refcnt);
	/* don't need to INIT_LIST_HEAD(&new->be_node) */
	memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
	new->be_mdev = orig->be_mdev;
	new->be_f_offset = offset;
	new->be_length = length;
	new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
	new->be_state = state;
	new->be_inval = orig->be_inval;
}
/* Tries to merge be with extent in front of it in list.
 * Frees storage if not used.
 */
static struct pnfs_block_extent *
_front_merge(struct pnfs_block_extent *be, struct list_head *head,
	     struct pnfs_block_extent *storage)
{
	struct pnfs_block_extent *prev;

	if (!storage)
		goto no_merge;
	if (&be->be_node == head || be->be_node.prev == head)
		goto no_merge;
	prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
	if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
	    !extents_consistent(prev, be))
		goto no_merge;
	_prep_new_extent(storage, prev, prev->be_f_offset,
			 prev->be_length + be->be_length, prev->be_state);
	list_replace(&prev->be_node, &storage->be_node);
	bl_put_extent(prev);
	list_del(&be->be_node);
	bl_put_extent(be);
	return storage;

no_merge:
	kfree(storage);
	return be;
}
static u64
set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
{
	u64 rv = offset + length;
	struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
	struct pnfs_block_extent *children[3];
	struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
	int i = 0, j;

	dprintk("%s(%llu, %llu)\n", __func__, offset, length);
	/* Create storage for up to three new extents e1, e2, e3 */
	e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
	e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
	e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
	/* BUG - we are ignoring any failure */
	if (!e1 || !e2 || !e3)
		goto out_nosplit;

	spin_lock(&bl->bl_ext_lock);
	be = bl_find_get_extent_locked(bl, offset);
	rv = be->be_f_offset + be->be_length;
	if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
		spin_unlock(&bl->bl_ext_lock);
		goto out_nosplit;
	}

	/* Add e* to children, bumping e*'s krefs */
	if (be->be_f_offset != offset) {
		_prep_new_extent(e1, be, be->be_f_offset,
				 offset - be->be_f_offset,
				 PNFS_BLOCK_INVALID_DATA);
		children[i++] = e1;
		print_bl_extent(e1);
	} else
		merge1 = e1;
	_prep_new_extent(e2, be, offset,
			 min(length, be->be_f_offset + be->be_length - offset),
			 PNFS_BLOCK_READWRITE_DATA);
	children[i++] = e2;
	print_bl_extent(e2);
	if (offset + length < be->be_f_offset + be->be_length) {
		_prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
				 be->be_f_offset + be->be_length -
				 offset - length,
				 PNFS_BLOCK_INVALID_DATA);
		children[i++] = e3;
		print_bl_extent(e3);
	} else
		merge2 = e3;

	/* Remove be from list, and insert the e* */
	/* We don't get refs on e*, since this list is the base reference
	 * set when init'ed.
	 */
	if (i < 3)
		children[i] = NULL;
	new = children[0];
	list_replace(&be->be_node, &new->be_node);
	bl_put_extent(be);
	new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
	for (j = 1; j < i; j++) {
		old = new;
		new = children[j];
		list_add(&new->be_node, &old->be_node);
	}
	if (merge2) {
		/* This is a HACK, should just create a _back_merge function */
		new = list_entry(new->be_node.next,
				 struct pnfs_block_extent, be_node);
		new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
	}
	spin_unlock(&bl->bl_ext_lock);

	/* Since we removed the base reference above, be is now scheduled for
	 * destruction.
	 */
	bl_put_extent(be);
	dprintk("%s returns %llu after split\n", __func__, rv);
	return rv;

out_nosplit:
	kfree(e1);
	kfree(e2);
	kfree(e3);
	dprintk("%s returns %llu without splitting\n", __func__, rv);
	return rv;
}
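
/*
 * set_to_rw() thus splits the INVALID extent containing "offset" into
 * up to three pieces: an INVALID head (e1) before the range, a
 * READWRITE middle (e2) covering as much of [offset, offset + length)
 * as the extent holds, and an INVALID tail (e3) after it.  When the
 * range touches either edge of the extent, the corresponding spare
 * allocation is instead reused as merge storage (merge1/merge2) so e2
 * can coalesce with a consistent neighbor.
 */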
void
clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
			      const struct nfs4_layoutcommit_args *arg,
			      int status)
{
	struct pnfs_block_short_extent *lce, *save;

	dprintk("%s status %d\n", __func__, status);
	list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
		if (likely(!status)) {
			u64 offset = lce->bse_f_offset;
			u64 end = offset + lce->bse_length;

			do {
				offset = set_to_rw(bl, offset, end - offset);
			} while (offset < end);
			list_del(&lce->bse_node);
			kfree(lce);
		} else {
			list_del(&lce->bse_node);
			spin_lock(&bl->bl_ext_lock);
			add_to_commitlist(bl, lce);
			spin_unlock(&bl->bl_ext_lock);
		}
	}
}