2005-12-16 01:31:24 +03:00
/* -*- mode: c; c-basic-offset: 8; -*-
* vim : noexpandtab sw = 8 ts = 8 sts = 0 :
*
* Copyright ( C ) 2002 , 2004 Oracle . All rights reserved .
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation ; either
* version 2 of the License , or ( at your option ) any later version .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* General Public License for more details .
*
* You should have received a copy of the GNU General Public
* License along with this program ; if not , write to the
* Free Software Foundation , Inc . , 59 Temple Place - Suite 330 ,
* Boston , MA 02111 - 1307 , USA .
*/
# include <linux/fs.h>
# include <linux/slab.h>
# include <linux/highmem.h>
# include <linux/pagemap.h>
# include <asm/byteorder.h>
2007-02-10 07:24:12 +03:00
# include <linux/swap.h>
2007-03-07 04:24:46 +03:00
# include <linux/pipe_fs_i.h>
2005-12-16 01:31:24 +03:00
# define MLOG_MASK_PREFIX ML_FILE_IO
# include <cluster/masklog.h>
# include "ocfs2.h"
# include "alloc.h"
# include "aops.h"
# include "dlmglue.h"
# include "extent_map.h"
# include "file.h"
# include "inode.h"
# include "journal.h"
2007-02-10 07:24:12 +03:00
# include "suballoc.h"
2005-12-16 01:31:24 +03:00
# include "super.h"
# include "symlink.h"
# include "buffer_head_io.h"
/*
 * get_block callback used for symlink inodes (called from
 * ocfs2_get_block()).  Must not be called for fast symlinks; that is
 * enforced with BUG_ON().
 *
 * The inode's dinode block is read and validated, @iblock is range-checked
 * against PATH_MAX and against the inode's allocated clusters, and
 * @bh_result is mapped to the first extent record's e_blkno plus @iblock.
 * When @bh_result is not uptodate and the inode is new, the block contents
 * are copied out of the buffer cache into the page behind @bh_result.
 *
 * @create is only logged here.
 *
 * Returns 0 on success and -EIO on every failure path.
 */
static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
{
	int err = -EIO;
	int status;
	struct ocfs2_dinode *fe = NULL;
	struct buffer_head *bh = NULL;
	struct buffer_head *buffer_cache_bh = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	void *kaddr;

	mlog_entry(" (0x%p, %llu, 0x%p, %d) \n ", inode,
		   (unsigned long long)iblock, bh_result, create);

	BUG_ON(ocfs2_inode_is_fast_symlink(inode));

	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
		mlog(ML_ERROR, " block offset > PATH_MAX: %llu ",
		     (unsigned long long)iblock);
		goto bail;
	}

	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
				  &bh, OCFS2_BH_CACHED, inode);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	fe = (struct ocfs2_dinode *)bh->b_data;

	if (!OCFS2_IS_VALID_DINODE(fe)) {
		mlog(ML_ERROR, " Invalid dinode #%llu: signature = %.*s \n ",
		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
		     fe->i_signature);
		goto bail;
	}

	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
						    le32_to_cpu(fe->i_clusters))) {
		mlog(ML_ERROR, " block offset is outside the allocated size: "
		     " %llu \n ", (unsigned long long)iblock);
		goto bail;
	}

	/* We don't use the page cache to create symlink data, so if
	 * need be, copy it over from the buffer cache. */
	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
			    iblock;

		buffer_cache_bh = sb_getblk(osb->sb, blkno);
		if (!buffer_cache_bh) {
			mlog(ML_ERROR, " couldn't getblock for symlink! \n ");
			goto bail;
		}

		/* we haven't locked out transactions, so a commit
		 * could've happened. Since we've got a reference on
		 * the bh, even if it commits while we're doing the
		 * copy, the data is still good. */
		if (buffer_jbd(buffer_cache_bh)
		    && ocfs2_inode_is_new(inode)) {
			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
			if (!kaddr) {
				mlog(ML_ERROR, " couldn't kmap! \n ");
				/*
				 * Drop the buffer-cache reference taken by
				 * sb_getblk() above; the bail path only
				 * releases the dinode buffer.
				 */
				brelse(buffer_cache_bh);
				goto bail;
			}
			memcpy(kaddr + (bh_result->b_size * iblock),
			       buffer_cache_bh->b_data,
			       bh_result->b_size);
			kunmap_atomic(kaddr, KM_USER0);
			set_buffer_uptodate(bh_result);
		}
		brelse(buffer_cache_bh);
	}

	map_bh(bh_result, inode->i_sb,
	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);

	err = 0;

bail:
	if (bh)
		brelse(bh);

	mlog_exit(err);
	return err;
}
/*
 * get_block callback handed to block_read_full_page(),
 * block_write_full_page() and block_prepare_write() in this file.
 *
 * Symlinks are delegated to ocfs2_symlink_get_block().  For everything
 * else the logical block @iblock is looked up through
 * ocfs2_extent_map_get_blocks() and, when it resolves to a written extent,
 * @bh_result is mapped to the physical block.  Holes and unwritten extents
 * leave @bh_result unmapped.
 *
 * Nothing is allocated here.  On volumes without sparse allocation a hole
 * is reported as an error, and BH_New is set for a @create request at or
 * beyond the current end of file.
 *
 * Returns 0 on success; any negative error is reported to the caller as
 * -EIO.
 */
static int ocfs2_get_block(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh_result, int create)
{
	int err = 0;
	/* Initialised so the error log below never prints stack garbage. */
	unsigned int ext_flags = 0;
	u64 p_blkno = 0, past_eof;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry(" (0x%p, %llu, 0x%p, %d) \n ", inode,
		   (unsigned long long)iblock, bh_result, create);

	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
		mlog(ML_NOTICE, " get_block on system inode 0x%p (%lu) \n ",
		     inode, inode->i_ino);

	if (S_ISLNK(inode->i_mode)) {
		/* this always does I/O for some reason. */
		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
		goto bail;
	}

	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
					  &ext_flags);
	if (err) {
		mlog(ML_ERROR, " Error %d from get_blocks(0x%p, %llu, 1, "
		     " %llu, NULL) \n ", err, inode, (unsigned long long)iblock,
		     (unsigned long long)p_blkno);
		goto bail;
	}

	/*
	 * ocfs2 never allocates in this function - the only time we
	 * need to use BH_New is when we're extending i_size on a file
	 * system which doesn't support holes, in which case BH_New
	 * allows block_prepare_write() to zero.
	 */
	mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
			" ino %lu, iblock %llu \n ", inode->i_ino,
			(unsigned long long)iblock);

	/* Treat the unwritten extent as a hole for zeroing purposes. */
	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
		map_bh(bh_result, inode->i_sb, p_blkno);

	if (!ocfs2_sparse_alloc(osb)) {
		if (p_blkno == 0) {
			err = -EIO;
			mlog(ML_ERROR,
			     " iblock = %llu p_blkno = %llu blkno=(%llu) \n ",
			     (unsigned long long)iblock,
			     (unsigned long long)p_blkno,
			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
			mlog(ML_ERROR, " Size %llu, clusters %u \n ",
			     (unsigned long long)i_size_read(inode),
			     OCFS2_I(inode)->ip_clusters);
			dump_stack();
			/*
			 * The lookup failed for this block: do not go on
			 * to flag an unmapped buffer as new.
			 */
			goto bail;
		}

		past_eof = ocfs2_blocks_for_bytes(inode->i_sb,
						  i_size_read(inode));
		mlog(0, " Inode %lu, past_eof = %llu \n ", inode->i_ino,
		     (unsigned long long)past_eof);

		if (create && (iblock >= past_eof))
			set_buffer_new(bh_result);
	}

bail:
	if (err < 0)
		err = -EIO;

	mlog_exit(err);
	return err;
}
static int ocfs2_readpage ( struct file * file , struct page * page )
{
struct inode * inode = page - > mapping - > host ;
loff_t start = ( loff_t ) page - > index < < PAGE_CACHE_SHIFT ;
int ret , unlock = 1 ;
mlog_entry ( " (0x%p, %lu) \n " , file , ( page ? page - > index : 0 ) ) ;
2006-10-10 03:02:40 +04:00
ret = ocfs2_meta_lock_with_page ( inode , NULL , 0 , page ) ;
2005-12-16 01:31:24 +03:00
if ( ret ! = 0 ) {
if ( ret = = AOP_TRUNCATED_PAGE )
unlock = 0 ;
mlog_errno ( ret ) ;
goto out ;
}
2007-05-14 22:38:51 +04:00
if ( down_read_trylock ( & OCFS2_I ( inode ) - > ip_alloc_sem ) = = 0 ) {
ret = AOP_TRUNCATED_PAGE ;
goto out_meta_unlock ;
}
2005-12-16 01:31:24 +03:00
/*
* i_size might have just been updated as we grabed the meta lock . We
* might now be discovering a truncate that hit on another node .
* block_read_full_page - > get_block freaks out if it is asked to read
* beyond the end of a file , so we check here . Callers
* ( generic_file_read , fault - > nopage ) are clever enough to check i_size
* and notice that the page they just read isn ' t needed .
*
* XXX sys_readahead ( ) seems to get that wrong ?
*/
if ( start > = i_size_read ( inode ) ) {
2007-05-11 09:56:01 +04:00
zero_user_page ( page , 0 , PAGE_SIZE , KM_USER0 ) ;
2005-12-16 01:31:24 +03:00
SetPageUptodate ( page ) ;
ret = 0 ;
goto out_alloc ;
}
ret = ocfs2_data_lock_with_page ( inode , 0 , page ) ;
if ( ret ! = 0 ) {
if ( ret = = AOP_TRUNCATED_PAGE )
unlock = 0 ;
mlog_errno ( ret ) ;
goto out_alloc ;
}
ret = block_read_full_page ( page , ocfs2_get_block ) ;
unlock = 0 ;
ocfs2_data_unlock ( inode , 0 ) ;
out_alloc :
up_read ( & OCFS2_I ( inode ) - > ip_alloc_sem ) ;
2007-05-14 22:38:51 +04:00
out_meta_unlock :
2005-12-16 01:31:24 +03:00
ocfs2_meta_unlock ( inode , 0 ) ;
out :
if ( unlock )
unlock_page ( page ) ;
mlog_exit ( ret ) ;
return ret ;
}
/*
 * Note: Because we don't support holes, our allocation has already
 * happened (allocation writes zeros to the file data), so we don't have
 * to worry about ordered writes in ocfs2_writepage.
 *
 * ->writepage is called during the process of invalidating the page cache
 * during blocked lock processing.  It can't block on any cluster locks
 * during block mapping.  It's relying on the fact that the block mapping
 * can't have disappeared under the dirty pages that it is being asked to
 * write back.
 *
 * Returns whatever block_write_full_page() returns.
 */
static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
{
	int status;

	mlog_entry(" (0x%p) \n ", page);

	status = block_write_full_page(page, ocfs2_get_block, wbc);

	mlog_exit(status);
	return status;
}
2007-02-10 07:52:53 +03:00
/*
* This is called from ocfs2_write_zero_page ( ) which has handled it ' s
* own cluster locking and has ensured allocation exists for those
* blocks to be written .
*/
2006-05-06 06:04:03 +04:00
int ocfs2_prepare_write_nolock ( struct inode * inode , struct page * page ,
unsigned from , unsigned to )
{
int ret ;
down_read ( & OCFS2_I ( inode ) - > ip_alloc_sem ) ;
ret = block_prepare_write ( page , from , to , ocfs2_get_block ) ;
up_read ( & OCFS2_I ( inode ) - > ip_alloc_sem ) ;
return ret ;
}
2005-12-16 01:31:24 +03:00
/* Taken from ext3. We don't necessarily need the full blown
* functionality yet , but IMHO it ' s better to cut and paste the whole
* thing so we can avoid introducing our own bugs ( and easily pick up
* their fixes when they happen ) - - Mark */
2007-02-16 22:46:50 +03:00
int walk_page_buffers ( handle_t * handle ,
struct buffer_head * head ,
unsigned from ,
unsigned to ,
int * partial ,
int ( * fn ) ( handle_t * handle ,
struct buffer_head * bh ) )
2005-12-16 01:31:24 +03:00
{
struct buffer_head * bh ;
unsigned block_start , block_end ;
unsigned blocksize = head - > b_size ;
int err , ret = 0 ;
struct buffer_head * next ;
for ( bh = head , block_start = 0 ;
ret = = 0 & & ( bh ! = head | | ! block_start ) ;
block_start = block_end , bh = next )
{
next = bh - > b_this_page ;
block_end = block_start + blocksize ;
if ( block_end < = from | | block_start > = to ) {
if ( partial & & ! buffer_uptodate ( bh ) )
* partial = 1 ;
continue ;
}
err = ( * fn ) ( handle , bh ) ;
if ( ! ret )
ret = err ;
}
return ret ;
}
2006-10-10 05:11:45 +04:00
handle_t * ocfs2_start_walk_page_trans ( struct inode * inode ,
2005-12-16 01:31:24 +03:00
struct page * page ,
unsigned from ,
unsigned to )
{
struct ocfs2_super * osb = OCFS2_SB ( inode - > i_sb ) ;
2006-10-10 05:11:45 +04:00
handle_t * handle = NULL ;
2005-12-16 01:31:24 +03:00
int ret = 0 ;
2006-10-10 04:26:22 +04:00
handle = ocfs2_start_trans ( osb , OCFS2_INODE_UPDATE_CREDITS ) ;
2005-12-16 01:31:24 +03:00
if ( ! handle ) {
ret = - ENOMEM ;
mlog_errno ( ret ) ;
goto out ;
}
if ( ocfs2_should_order_data ( inode ) ) {
2006-10-10 05:11:45 +04:00
ret = walk_page_buffers ( handle ,
2005-12-16 01:31:24 +03:00
page_buffers ( page ) ,
from , to , NULL ,
ocfs2_journal_dirty_data ) ;
if ( ret < 0 )
mlog_errno ( ret ) ;
}
out :
if ( ret ) {
if ( handle )
2006-10-10 03:48:10 +04:00
ocfs2_commit_trans ( osb , handle ) ;
2005-12-16 01:31:24 +03:00
handle = ERR_PTR ( ret ) ;
}
return handle ;
}
/*
 * ->bmap: translate logical block @block of the mapping's inode into a
 * physical block number through ocfs2_extent_map_get_blocks().
 *
 * For every inode except the journal, the lookup is done under the
 * cluster meta lock and with ip_alloc_sem held for reading.
 *
 * Returns the physical block number, or 0 if locking or the lookup
 * failed.
 */
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	u64 phys = 0;
	int err = 0;
	sector_t result;

	mlog_entry(" (block = %llu) \n ", (unsigned long long)block);

	/* We don't need to lock journal system files, since they aren't
	 * accessed concurrently from multiple nodes.
	 */
	if (!INODE_JOURNAL(inode)) {
		err = ocfs2_meta_lock(inode, NULL, 0);
		if (err) {
			if (err != -ENOENT)
				mlog_errno(err);
			goto bail;
		}
		down_read(&OCFS2_I(inode)->ip_alloc_sem);
	}

	err = ocfs2_extent_map_get_blocks(inode, block, &phys, NULL, NULL);

	if (!INODE_JOURNAL(inode)) {
		up_read(&OCFS2_I(inode)->ip_alloc_sem);
		ocfs2_meta_unlock(inode, 0);
	}

	if (err) {
		mlog(ML_ERROR, " get_blocks() failed, block = %llu \n ",
		     (unsigned long long)block);
		mlog_errno(err);
	}

bail:
	result = err ? 0 : phys;

	mlog_exit((int)result);
	return result;
}
/*
 * TODO: Make this into a generic get_blocks function.
 *
 * From do_direct_io in direct-io.c:
 *  "So what we do is to permit the ->get_blocks function to populate
 *   bh.b_size with the size of IO which is permitted at this offset and
 *   this i_blkbits."
 *
 * This function is called directly from get_more_blocks in direct-io.c.
 *
 * called like this: dio->get_blocks(dio->inode, fs_startblk,
 *					fs_count, map_bh, dio->rw == WRITE);
 *
 * On success @bh_result is mapped (or explicitly unmapped for a hole or
 * unwritten extent on a read) and bh_result->b_size is set to the
 * contiguous length, capped at the size the caller asked for.
 *
 * Returns 0 on success; -EIO for a write past EOF, a failed extent lookup,
 * or a write into a hole; -EROFS for a hole on a non-sparse volume.
 */
static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
				      struct buffer_head *bh_result, int create)
{
	int ret;
	u64 phys_blk, eof_blocks, run_blocks;
	unsigned int ext_flags;
	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
	unsigned long want_blocks = bh_result->b_size >> inode->i_blkbits;

	/* This function won't even be called if the request isn't all
	 * nicely aligned and of the right size, so there's no need
	 * for us to check any of that. */

	eof_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));

	/*
	 * Any write past EOF is not allowed because we'd be extending.
	 */
	if (create && (iblock + want_blocks) > eof_blocks)
		return -EIO;

	/* This figures out the size of the next contiguous block, and
	 * our logical offset */
	ret = ocfs2_extent_map_get_blocks(inode, iblock, &phys_blk,
					  &run_blocks, &ext_flags);
	if (ret) {
		mlog(ML_ERROR, " get_blocks() failed iblock=%llu \n ",
		     (unsigned long long)iblock);
		return -EIO;
	}

	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !phys_blk) {
		ocfs2_error(inode->i_sb,
			    " Inode %llu has a hole at block %llu \n ",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    (unsigned long long)iblock);
		return -EROFS;
	}

	/*
	 * get_more_blocks() expects us to describe a hole by clearing
	 * the mapped bit on bh_result().
	 *
	 * Consider an unwritten extent as a hole.
	 */
	if (phys_blk && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
		map_bh(bh_result, inode->i_sb, phys_blk);
	} else if (create) {
		/*
		 * ocfs2_prepare_inode_for_write() should have caught
		 * the case where we'd be filling a hole and triggered
		 * a buffered write instead.
		 */
		ret = -EIO;
		mlog_errno(ret);
		return ret;
	} else {
		clear_buffer_mapped(bh_result);
	}

	/* make sure we don't map more than want_blocks blocks here as
	   that's all the kernel will handle at this point. */
	if (want_blocks < run_blocks)
		run_blocks = want_blocks;
	bh_result->b_size = run_blocks << blocksize_bits;

	return ret;
}
/*
 * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
 * particularly interested in the aio/dio case.  Like the core uses
 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
 * truncation on another.
 *
 * Clears the iocb's rw-locked marker, drops i_alloc_sem when the recorded
 * lock level is 0, and releases the rw lock at that level.  @offset,
 * @bytes and @private are unused.
 */
static void ocfs2_dio_end_io(struct kiocb *iocb,
			     loff_t offset,
			     ssize_t bytes,
			     void *private)
{
	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
	int lock_level;

	/* this io's submitter should not have unlocked this before we could */
	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));

	/* Order preserved from the original: clear first, then read level. */
	ocfs2_iocb_clear_rw_locked(iocb);
	lock_level = ocfs2_iocb_rw_locked_level(iocb);

	if (lock_level == 0)
		up_read(&inode->i_alloc_sem);

	ocfs2_rw_unlock(inode, lock_level);
}
2007-01-05 01:54:41 +03:00
/*
* ocfs2_invalidatepage ( ) and ocfs2_releasepage ( ) are shamelessly stolen
* from ext3 . PageChecked ( ) bits have been removed as OCFS2 does not
* do journalled data .
*/
static void ocfs2_invalidatepage ( struct page * page , unsigned long offset )
{
journal_t * journal = OCFS2_SB ( page - > mapping - > host - > i_sb ) - > journal - > j_journal ;
journal_invalidatepage ( journal , page , offset ) ;
}
/*
 * ->releasepage: a page without buffers yields 0; otherwise the decision
 * is left to journal_try_to_free_buffers() on the owning super block's
 * journal, and its result is returned.
 */
static int ocfs2_releasepage(struct page *page, gfp_t wait)
{
	struct inode *host = page->mapping->host;
	journal_t *jrnl = OCFS2_SB(host->i_sb)->journal->j_journal;

	if (page_has_buffers(page))
		return journal_try_to_free_buffers(jrnl, page, wait);

	return 0;
}
2005-12-16 01:31:24 +03:00
/*
 * ->direct_IO: run the request through blockdev_direct_IO_no_locking()
 * with ocfs2_direct_IO_get_blocks() as the mapper and ocfs2_dio_end_io()
 * as the completion callback.
 *
 * On volumes without sparse allocation a data lock is taken and dropped
 * first (see the comment in the body).
 *
 * Returns the result of the direct I/O call, or the negative error from
 * ocfs2_data_lock().
 */
static ssize_t ocfs2_direct_IO(int rw,
			       struct kiocb *iocb,
			       const struct iovec *iov,
			       loff_t offset,
			       unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
	struct super_block *sb = inode->i_sb;
	int status;

	mlog_entry_void();

	if (!ocfs2_sparse_alloc(OCFS2_SB(sb))) {
		/*
		 * We get PR data locks even for O_DIRECT.  This
		 * allows concurrent O_DIRECT I/O but doesn't let
		 * O_DIRECT with extending and buffered zeroing writes
		 * race.  If they did race then the buffered zeroing
		 * could be written back after the O_DIRECT I/O.  It's
		 * one thing to tell people not to mix buffered and
		 * O_DIRECT writes, but expecting them to understand
		 * that file extension is also an implicit buffered
		 * write is too much.  By getting the PR we force
		 * writeback of the buffered zeroing before
		 * proceeding.
		 */
		status = ocfs2_data_lock(inode, 0);
		if (status < 0) {
			mlog_errno(status);
			goto out;
		}
		ocfs2_data_unlock(inode, 0);
	}

	status = blockdev_direct_IO_no_locking(rw, iocb, inode,
					       sb->s_bdev, iov, offset,
					       nr_segs,
					       ocfs2_direct_IO_get_blocks,
					       ocfs2_dio_end_io);
out:
	mlog_exit(status);
	return status;
}
2007-02-10 07:24:12 +03:00
/*
 * Compute the byte range that cluster @cpos occupies inside its page.
 *
 * When a page is no larger than a cluster the range is the whole page,
 * [0, PAGE_CACHE_SIZE).  When a page holds several clusters the range is
 * the slot of @cpos within the page.
 *
 * @start and @end are optional outputs; either may be NULL.
 */
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
					    u32 cpos,
					    unsigned int *start,
					    unsigned int *end)
{
	unsigned int first = 0;
	unsigned int last = PAGE_CACHE_SIZE;

	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
		/* Number of clusters that fit in one page. */
		unsigned int per_page =
			1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);

		first = (cpos % per_page) << osb->s_clustersize_bits;
		last = first + osb->s_clustersize;
	}

	BUG_ON(first > PAGE_SIZE);
	BUG_ON(last > PAGE_SIZE);

	if (start)
		*start = first;
	if (end)
		*end = last;
}
/*
 * 'from' and 'to' are the region in the page to avoid zeroing.
 *
 * If pagesize > clustersize, this function will avoid zeroing outside
 * of the cluster boundary.
 *
 * from == to == 0 is code for "zero the entire cluster region"
 */
static void ocfs2_clear_page_regions(struct page *page,
				     struct ocfs2_super *osb, u32 cpos,
				     unsigned from, unsigned to)
{
	unsigned int c_start, c_end;
	void *addr;

	ocfs2_figure_cluster_boundaries(osb, cpos, &c_start, &c_end);

	addr = kmap_atomic(page, KM_USER0);

	if (!from && !to) {
		/* Nothing to preserve: wipe the cluster's whole slot. */
		memset(addr + c_start, 0, c_end - c_start);
	} else {
		/* Zero the part before and the part after [from, to). */
		if (from > c_start)
			memset(addr + c_start, 0, from - c_start);
		if (to < c_end)
			memset(addr + to, 0, c_end - to);
	}

	kunmap_atomic(addr, KM_USER0);
}
/*
* Some of this taken from block_prepare_write ( ) . We already have our
* mapping by now though , and the entire write will be allocating or
* it won ' t , so not much need to use BH_New .
*
* This will also skip zeroing , which is handled externally .
*/
2007-02-16 22:46:50 +03:00
int ocfs2_map_page_blocks ( struct page * page , u64 * p_blkno ,
struct inode * inode , unsigned int from ,
unsigned int to , int new )
2007-02-10 07:24:12 +03:00
{
int ret = 0 ;
struct buffer_head * head , * bh , * wait [ 2 ] , * * wait_bh = wait ;
unsigned int block_end , block_start ;
unsigned int bsize = 1 < < inode - > i_blkbits ;
if ( ! page_has_buffers ( page ) )
create_empty_buffers ( page , bsize , 0 ) ;
head = page_buffers ( page ) ;
for ( bh = head , block_start = 0 ; bh ! = head | | ! block_start ;
bh = bh - > b_this_page , block_start + = bsize ) {
block_end = block_start + bsize ;
2007-05-09 04:47:32 +04:00
clear_buffer_new ( bh ) ;
2007-02-10 07:24:12 +03:00
/*
* Ignore blocks outside of our i / o range -
* they may belong to unallocated clusters .
*/
2007-02-16 22:46:50 +03:00
if ( block_start > = to | | block_end < = from ) {
2007-02-10 07:24:12 +03:00
if ( PageUptodate ( page ) )
set_buffer_uptodate ( bh ) ;
continue ;
}
/*
* For an allocating write with cluster size > = page
* size , we always write the entire page .
*/
2007-05-09 04:47:32 +04:00
if ( new )
set_buffer_new ( bh ) ;
2007-02-10 07:24:12 +03:00
if ( ! buffer_mapped ( bh ) ) {
map_bh ( bh , inode - > i_sb , * p_blkno ) ;
unmap_underlying_metadata ( bh - > b_bdev , bh - > b_blocknr ) ;
}
if ( PageUptodate ( page ) ) {
if ( ! buffer_uptodate ( bh ) )
set_buffer_uptodate ( bh ) ;
} else if ( ! buffer_uptodate ( bh ) & & ! buffer_delay ( bh ) & &
2007-06-18 22:12:36 +04:00
! buffer_new ( bh ) & &
( block_start < from | | block_end > to ) ) {
2007-02-10 07:24:12 +03:00
ll_rw_block ( READ , 1 , & bh ) ;
* wait_bh + + = bh ;
}
* p_blkno = * p_blkno + 1 ;
}
/*
* If we issued read requests - let them complete .
*/
while ( wait_bh > wait ) {
wait_on_buffer ( * - - wait_bh ) ;
if ( ! buffer_uptodate ( * wait_bh ) )
ret = - EIO ;
}
if ( ret = = 0 | | ! new )
return ret ;
/*
* If we get - EIO above , zero out any newly allocated blocks
* to avoid exposing stale data .
*/
bh = head ;
block_start = 0 ;
do {
void * kaddr ;
block_end = block_start + bsize ;
if ( block_end < = from )
goto next_bh ;
if ( block_start > = to )
break ;
kaddr = kmap_atomic ( page , KM_USER0 ) ;
memset ( kaddr + block_start , 0 , bh - > b_size ) ;
flush_dcache_page ( page ) ;
kunmap_atomic ( kaddr , KM_USER0 ) ;
set_buffer_uptodate ( bh ) ;
mark_buffer_dirty ( bh ) ;
next_bh :
block_start = block_end ;
bh = bh - > b_this_page ;
} while ( bh ! = head ) ;
return ret ;
}
2007-05-09 04:47:32 +04:00
/*
 * OCFS2_MAX_CTXT_PAGES bounds the w_pages[] array of struct
 * ocfs2_write_ctxt: 1 when a page is at least as large as the largest
 * cluster size, otherwise the number of pages in the largest cluster.
 */
# if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
# define OCFS2_MAX_CTXT_PAGES 1
# else
# define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
# endif
/*
 * Number of minimum-sized clusters in one page; bounds the w_desc[] array
 * of struct ocfs2_write_ctxt.
 */
# define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
2007-03-07 04:24:46 +03:00
/*
2007-05-09 04:47:32 +04:00
* Describe the state of a single cluster to be written to .
2007-03-07 04:24:46 +03:00
*/
2007-05-09 04:47:32 +04:00
/* Per-cluster state for a write; stored in ocfs2_write_ctxt.w_desc[]. */
struct ocfs2_write_cluster_desc {
/* NOTE(review): presumably the logical cluster offset - confirm at the users. */
u32 c_cpos ;
/* NOTE(review): presumably the physical cluster; filled in later per the note below. */
u32 c_phys ;
/*
* Give this a unique field because c_phys eventually gets
* filled, so c_phys alone cannot say whether the cluster is new.
*/
unsigned c_new ;
} ;
2007-03-07 04:24:46 +03:00
2007-05-09 04:47:32 +04:00
struct ocfs2_write_ctxt {
/* Logical cluster position / len of write */
u32 w_cpos ;
u32 w_clen ;
2007-03-07 04:24:46 +03:00
2007-05-09 04:47:32 +04:00
struct ocfs2_write_cluster_desc w_desc [ OCFS2_MAX_CLUSTERS_PER_PAGE ] ;
2007-03-07 04:24:46 +03:00
2007-05-09 04:47:32 +04:00
/*
* This is true if page_size > cluster_size .
*
* It triggers a set of special cases during write which might
* have to deal with allocating writes to partial pages .
*/
unsigned int w_large_pages ;
2007-03-07 04:24:46 +03:00
2007-05-09 04:47:32 +04:00
/*
* Pages involved in this write .
*
* w_target_page is the page being written to by the user .
*
* w_pages is an array of pages which always contains
* w_target_page , and in the case of an allocating write with
* page_size < cluster size , it will contain zero ' d and mapped
* pages adjacent to w_target_page which need to be written
* out in so that future reads from that region will get
* zero ' s .
*/
struct page * w_pages [ OCFS2_MAX_CTXT_PAGES ] ;
unsigned int w_num_pages ;
struct page * w_target_page ;
2007-06-07 03:15:24 +04:00
2007-05-09 04:47:32 +04:00
/*
* ocfs2_write_end ( ) uses this to know what the real range to
* write in the target should be .
*/
unsigned int w_target_from ;
unsigned int w_target_to ;
/*
* We could use journal_current_handle ( ) but this is cleaner ,
* IMHO - Mark
*/
handle_t * w_handle ;
struct buffer_head * w_di_bh ;
} ;
static void ocfs2_free_write_ctxt ( struct ocfs2_write_ctxt * wc )
{
int i ;
for ( i = 0 ; i < wc - > w_num_pages ; i + + ) {
if ( wc - > w_pages [ i ] = = NULL )
continue ;
unlock_page ( wc - > w_pages [ i ] ) ;
mark_page_accessed ( wc - > w_pages [ i ] ) ;
page_cache_release ( wc - > w_pages [ i ] ) ;
2007-03-07 04:24:46 +03:00
}
2007-05-09 04:47:32 +04:00
brelse ( wc - > w_di_bh ) ;
kfree ( wc ) ;
}
/*
 * Allocate and initialise a write context for a write of @len bytes at
 * file offset @pos.
 *
 * w_cpos is the cluster containing @pos.  w_clen counts every cluster
 * touched by [@pos, @pos + @len), so a write that starts in the middle of
 * one cluster and ends in the next is counted as two clusters.  Deriving
 * the count from @len alone would miss the second one.
 *
 * A reference on @di_bh is taken and stored in the context; it is dropped
 * again by ocfs2_free_write_ctxt().
 *
 * Returns 0 and stores the context in *@wcp, or -ENOMEM (leaving *@wcp
 * untouched) if the allocation fails.
 */
static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
				  struct ocfs2_super *osb, loff_t pos,
				  unsigned len, struct buffer_head *di_bh)
{
	u32 cend;
	struct ocfs2_write_ctxt *wc;

	wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
	if (!wc)
		return -ENOMEM;

	wc->w_cpos = pos >> osb->s_clustersize_bits;
	if (len) {
		/* Cluster holding the last byte of the write. */
		cend = (pos + len - 1) >> osb->s_clustersize_bits;
		wc->w_clen = cend - wc->w_cpos + 1;
	} else {
		/* A zero-length write covers no clusters. */
		wc->w_clen = 0;
	}

	get_bh(di_bh);
	wc->w_di_bh = di_bh;

	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
		wc->w_large_pages = 1;
	else
		wc->w_large_pages = 0;

	*wcp = wc;

	return 0;
}
2007-02-10 07:24:12 +03:00
/*
2007-05-09 04:47:32 +04:00
* If a page has any new buffers , zero them out here , and mark them uptodate
* and dirty so they ' ll be written out ( in order to prevent uninitialised
* block data from leaking ) . And clear the new bit .
2007-02-10 07:24:12 +03:00
*/
2007-05-09 04:47:32 +04:00
static void ocfs2_zero_new_buffers ( struct page * page , unsigned from , unsigned to )
2007-02-10 07:24:12 +03:00
{
2007-05-09 04:47:32 +04:00
unsigned int block_start , block_end ;
struct buffer_head * head , * bh ;
2007-02-10 07:24:12 +03:00
2007-05-09 04:47:32 +04:00
BUG_ON ( ! PageLocked ( page ) ) ;
if ( ! page_has_buffers ( page ) )
return ;
2007-02-10 07:24:12 +03:00
2007-05-09 04:47:32 +04:00
bh = head = page_buffers ( page ) ;
block_start = 0 ;
do {
block_end = block_start + bh - > b_size ;
if ( buffer_new ( bh ) ) {
if ( block_end > from & & block_start < to ) {
if ( ! PageUptodate ( page ) ) {
unsigned start , end ;
void * kaddr ;
start = max ( from , block_start ) ;
end = min ( to , block_end ) ;
kaddr = kmap_atomic ( page , KM_USER0 ) ;
memset ( kaddr + start , 0 , end - start ) ;
flush_dcache_page ( page ) ;
kunmap_atomic ( kaddr , KM_USER0 ) ;
set_buffer_uptodate ( bh ) ;
}
clear_buffer_new ( bh ) ;
mark_buffer_dirty ( bh ) ;
}
}
2007-02-10 07:24:12 +03:00
2007-05-09 04:47:32 +04:00
block_start = block_end ;
bh = bh - > b_this_page ;
} while ( bh ! = head ) ;
}
/*
 * Only called when we have a failure during allocating write to write
 * zeros to the newly allocated region.
 *
 * @user_pos is a file offset and @user_len a byte count; they are turned
 * into offsets inside the target page before being handed to
 * ocfs2_zero_new_buffers(), which works on in-page [from, to) ranges (the
 * same conversion ocfs2_prepare_page_for_write() performs).
 *
 * Every page of the context that actually has buffers is then committed
 * with block_commit_write(), after its buffers have been passed to
 * ocfs2_journal_dirty_data() when the inode orders data.  Pages that
 * were never grabbed, or never got buffers attached, are skipped.
 */
static void ocfs2_write_failure(struct inode *inode,
				struct ocfs2_write_ctxt *wc,
				loff_t user_pos, unsigned user_len)
{
	int i;
	unsigned from, to;
	unsigned user_from = user_pos & (PAGE_CACHE_SIZE - 1);
	unsigned user_to = user_from + user_len;
	struct page *tmppage;

	/* The target page may not have been grabbed before the failure. */
	if (wc->w_target_page)
		ocfs2_zero_new_buffers(wc->w_target_page, user_from, user_to);

	if (wc->w_large_pages) {
		from = wc->w_target_from;
		to = wc->w_target_to;
	} else {
		from = 0;
		to = PAGE_CACHE_SIZE;
	}

	for (i = 0; i < wc->w_num_pages; i++) {
		tmppage = wc->w_pages[i];

		/*
		 * ocfs2_free_write_ctxt() tolerates empty slots, so do the
		 * same here; and a page without buffers has nothing to
		 * journal or commit.
		 */
		if (!tmppage || !page_has_buffers(tmppage))
			continue;

		if (ocfs2_should_order_data(inode))
			walk_page_buffers(wc->w_handle, page_buffers(tmppage),
					  from, to, NULL,
					  ocfs2_journal_dirty_data);

		block_commit_write(tmppage, from, to);
	}
}
2007-05-09 04:47:32 +04:00
/*
 * Map the blocks of @page for a write and zero whatever part of a newly
 * allocated cluster will not be overwritten with user data.
 *
 * For the target page of the write context, the user range is derived
 * from @user_pos / @user_len.  An allocating write (@new) maps the whole
 * cluster range of the page; otherwise only the user range is mapped.
 * The mapped range is recorded in wc->w_target_from / w_target_to.
 *
 * Any other page must belong to an allocating write and is mapped over
 * the cluster range.
 *
 * Returns 0 or the error from ocfs2_map_page_blocks().
 */
static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
					struct ocfs2_write_ctxt *wc,
					struct page *page, u32 cpos,
					loff_t user_pos, unsigned user_len,
					int new)
{
	int ret;
	unsigned int range_from, range_to;
	unsigned int cluster_start, cluster_end;
	unsigned int keep_from = 0, keep_to = 0;

	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
					&cluster_start, &cluster_end);

	if (page == wc->w_target_page) {
		unsigned int user_from = user_pos & (PAGE_CACHE_SIZE - 1);
		unsigned int user_to = user_from + user_len;

		/* Allocating writes cover the cluster, others the user range. */
		range_from = new ? cluster_start : user_from;
		range_to = new ? cluster_end : user_to;

		ret = ocfs2_map_page_blocks(page, p_blkno, inode,
					    range_from, range_to, new);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* The user's bytes must survive the zeroing below. */
		keep_from = user_from;
		keep_to = user_to;

		wc->w_target_from = range_from;
		wc->w_target_to = range_to;
	} else {
		/*
		 * If we haven't allocated the new page yet, we
		 * shouldn't be writing it out without copying user
		 * data. This is likely a math error from the caller.
		 */
		BUG_ON(!new);

		ret = ocfs2_map_page_blocks(page, p_blkno, inode,
					    cluster_start, cluster_end, new);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * Parts of newly allocated pages need to be zero'd.
	 *
	 * Above, the mapped range was widened to the cluster for an
	 * allocating write - as far as the rest of the function is
	 * concerned, the entire cluster range inside of a page needs to
	 * be written.
	 *
	 * We can skip this if the page is up to date - it's already
	 * been zero'd from being read in as a hole.
	 */
	if (new && !PageUptodate(page))
		ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
					 cpos, keep_from, keep_to);

	flush_dcache_page(page);

out:
	return ret;
}
/*
2007-05-09 04:47:32 +04:00
* This function will only grab one cluster's worth of pages .
2007-02-10 07:24:12 +03:00
*/
2007-05-09 04:47:32 +04:00
static int ocfs2_grab_pages_for_write ( struct address_space * mapping ,
struct ocfs2_write_ctxt * wc ,
2007-05-10 02:16:19 +04:00
u32 cpos , loff_t user_pos , int new ,
struct page * mmap_page )
2007-02-10 07:24:12 +03:00
{
2007-05-09 04:47:32 +04:00
int ret = 0 , i ;
unsigned long start , target_index , index ;
2007-02-10 07:24:12 +03:00
struct inode * inode = mapping - > host ;
2007-05-09 04:47:32 +04:00
target_index = user_pos > > PAGE_CACHE_SHIFT ;
2007-02-10 07:24:12 +03:00
/*
* Figure out how many pages we ' ll be manipulating here . For
2007-02-16 22:46:50 +03:00
* non allocating write , we just change the one
* page . Otherwise , we ' ll need a whole clusters worth .
2007-02-10 07:24:12 +03:00
*/
if ( new ) {
2007-05-09 04:47:32 +04:00
wc - > w_num_pages = ocfs2_pages_per_cluster ( inode - > i_sb ) ;
start = ocfs2_align_clusters_to_page_index ( inode - > i_sb , cpos ) ;
2007-02-10 07:24:12 +03:00
} else {
2007-05-09 04:47:32 +04:00
wc - > w_num_pages = 1 ;
start = target_index ;
2007-02-10 07:24:12 +03:00
}
2007-05-09 04:47:32 +04:00
for ( i = 0 ; i < wc - > w_num_pages ; i + + ) {
2007-02-10 07:24:12 +03:00
index = start + i ;
2007-05-10 02:16:19 +04:00
if ( index = = target_index & & mmap_page ) {
/*
* ocfs2_pagemkwrite ( ) is a little different
* and wants us to directly use the page
* passed in .
*/
lock_page ( mmap_page ) ;
if ( mmap_page - > mapping ! = mapping ) {
unlock_page ( mmap_page ) ;
/*
* Sanity check - the locking in
* ocfs2_pagemkwrite ( ) should ensure
* that this code doesn ' t trigger .
*/
ret = - EINVAL ;
mlog_errno ( ret ) ;
goto out ;
}
page_cache_get ( mmap_page ) ;
wc - > w_pages [ i ] = mmap_page ;
} else {
wc - > w_pages [ i ] = find_or_create_page ( mapping , index ,
GFP_NOFS ) ;
if ( ! wc - > w_pages [ i ] ) {
ret = - ENOMEM ;
mlog_errno ( ret ) ;
goto out ;
}
2007-02-10 07:24:12 +03:00
}
2007-05-09 04:47:32 +04:00
if ( index = = target_index )
wc - > w_target_page = wc - > w_pages [ i ] ;
2007-02-10 07:24:12 +03:00
}
2007-05-09 04:47:32 +04:00
out :
return ret ;
}
/*
 * Prepare a single cluster of the file for a write.
 *
 * @mapping:  address space of the file being written
 * @phys:     physical cluster backing @cpos; 0 means the cluster is a
 *            hole and must be allocated here
 * @data_ac:  data allocation context, passed to the allocator
 * @meta_ac:  metadata allocation context, passed to the allocator
 * @wc:       write context holding the pages, dinode buffer and handle
 * @cpos:     logical cluster offset being written
 * @user_pos: byte offset of the user's write
 * @user_len: length in bytes of the user's write
 *
 * For a hole, one cluster is added at @cpos via
 * ocfs2_do_extend_allocation(). The starting block is then looked up
 * and every page in wc->w_pages is run through
 * ocfs2_prepare_page_for_write().
 *
 * All pages are prepared even if an earlier one fails; the first error
 * seen is the one returned. On an allocating write that failed during
 * page preparation, ocfs2_write_failure() is called for cleanup.
 *
 * Returns 0 on success or a negative error code.
 */
static int ocfs2_write_cluster(struct address_space *mapping,
			       u32 phys, struct ocfs2_alloc_context *data_ac,
			       struct ocfs2_alloc_context *meta_ac,
			       struct ocfs2_write_ctxt *wc, u32 cpos,
			       loff_t user_pos, unsigned user_len)
{
	int ret, i, new;
	u64 v_blkno, p_blkno;
	struct inode *inode = mapping->host;

	new = phys == 0 ? 1 : 0;

	if (new) {
		u32 tmp_pos;

		/*
		 * This is safe to call with the page locks - it won't take
		 * any additional semaphores or cluster locks.
		 */
		tmp_pos = cpos;
		ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
						 &tmp_pos, 1, wc->w_di_bh,
						 wc->w_handle, data_ac,
						 meta_ac, NULL);
		/*
		 * This shouldn't happen because we must have already
		 * calculated the correct meta data allocation required. The
		 * internal tree allocation code should know how to increase
		 * transaction credits itself.
		 *
		 * If need be, we could handle -EAGAIN for a
		 * RESTART_TRANS here.
		 */
		mlog_bug_on_msg(ret == -EAGAIN,
				" Inode %llu: EAGAIN return during allocation. \n ",
				(unsigned long long)OCFS2_I(inode)->ip_blkno);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		/* A new cluster is written in full, from its first block. */
		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
	} else {
		v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
	}

	/*
	 * The only reason this should fail is due to an inability to
	 * find the extent added.
	 */
	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
					  NULL);
	if (ret < 0) {
		ocfs2_error(inode->i_sb, " Corrupting extend for inode %llu, "
			    " at logical block %llu ",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    (unsigned long long)v_blkno);
		goto out;
	}

	BUG_ON(p_blkno == 0);

	for (i = 0; i < wc->w_num_pages; i++) {
		int tmpret;

		tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
						      wc->w_pages[i], cpos,
						      user_pos, user_len, new);
		if (tmpret) {
			mlog_errno(tmpret);
			/*
			 * Remember the first failure. The assignment used
			 * to be reversed (tmpret = ret), which dropped the
			 * error: the function returned 0 and skipped the
			 * cleanup below.
			 */
			if (ret == 0)
				ret = tmpret;
		}
	}

	/*
	 * We only have cleanup to do in case of allocating write.
	 */
	if (ret && new)
		ocfs2_write_failure(inode, wc, user_pos, user_len);

out:
	return ret;
}
2007-05-15 05:09:54 +04:00
/*
 * Call ocfs2_write_cluster() once for each of the wc->w_clen cluster
 * descriptors in wc->w_desc, in order, passing the descriptor's
 * physical and logical cluster values.
 *
 * Stops at the first descriptor that fails and returns its error.
 * Returns 0 if every descriptor succeeded or there were none.
 */
static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
				       struct ocfs2_alloc_context *data_ac,
				       struct ocfs2_alloc_context *meta_ac,
				       struct ocfs2_write_ctxt *wc,
				       loff_t pos, unsigned len)
{
	int idx;
	int status = 0;
	struct ocfs2_write_cluster_desc *cur;

	for (idx = 0; idx < wc->w_clen; idx++) {
		cur = &wc->w_desc[idx];

		status = ocfs2_write_cluster(mapping, cur->c_phys, data_ac,
					     meta_ac, wc, cur->c_cpos, pos,
					     len);
		if (status) {
			mlog_errno(status);
			break;
		}
	}

	return status;
}
2007-05-09 04:47:32 +04:00
/*
 * ocfs2_write_end() wants to know which parts of the target page it
 * should complete the write on. It is easiest to compute them here,
 * ahead of time, while a more complete view of the write is available.
 *
 * The result is stored in wc->w_target_from / wc->w_target_to:
 *
 *  - Non allocating write (alloc == 0): exactly the user's byte range
 *    within the page.
 *  - Allocating write, wc->w_large_pages clear: the whole page.
 *  - Allocating write, wc->w_large_pages set: the user's range, with
 *    either end handed to ocfs2_figure_cluster_boundaries() when the
 *    first / last cluster descriptor is marked c_new.
 */
static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
					struct ocfs2_write_ctxt *wc,
					loff_t pos, unsigned len, int alloc)
{
	struct ocfs2_write_cluster_desc *first;
	struct ocfs2_write_cluster_desc *last;

	wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
	wc->w_target_to = wc->w_target_from + len;

	if (!alloc)
		return;

	/*
	 * Allocating write - we may have different boundaries based on
	 * page size and cluster size.
	 *
	 * NOTE: We can no longer compute one value from the other as
	 * the actual write length and user provided length may be
	 * different.
	 */
	if (!wc->w_large_pages) {
		wc->w_target_from = 0;
		wc->w_target_to = PAGE_CACHE_SIZE;
		return;
	}

	/*
	 * We only care about the 1st and last cluster within our range
	 * and whether they are holes or not. Either value may be
	 * extended out to the start / end of a newly allocated cluster.
	 */
	first = &wc->w_desc[0];
	if (first->c_new)
		ocfs2_figure_cluster_boundaries(osb, first->c_cpos,
						&wc->w_target_from, NULL);

	last = &wc->w_desc[wc->w_clen - 1];
	if (last->c_new)
		ocfs2_figure_cluster_boundaries(osb, last->c_cpos, NULL,
						&wc->w_target_to);
}
2007-05-15 05:09:54 +04:00
/*
* Populate each single - cluster write descriptor in the write context
* with information about the i / o to be done .
*/
static int ocfs2_populate_write_desc ( struct inode * inode ,
struct ocfs2_write_ctxt * wc ,
unsigned int * clusters_to_alloc )
2007-02-10 07:24:12 +03:00
{
2007-05-15 05:09:54 +04:00
int ret ;
2007-05-09 04:47:32 +04:00
struct ocfs2_write_cluster_desc * desc ;
2007-05-15 05:09:54 +04:00
unsigned int num_clusters = 0 ;
u32 phys = 0 ;
int i ;
2007-02-10 07:24:12 +03:00
2007-05-09 04:47:32 +04:00
for ( i = 0 ; i < wc - > w_clen ; i + + ) {
desc = & wc - > w_desc [ i ] ;
desc - > c_cpos = wc - > w_cpos + i ;
if ( num_clusters = = 0 ) {
ret = ocfs2_get_clusters ( inode , desc - > c_cpos , & phys ,
& num_clusters , NULL ) ;
if ( ret ) {
mlog_errno ( ret ) ;
2007-05-10 02:14:45 +04:00
goto out ;
2007-05-09 04:47:32 +04:00
}
} else if ( phys ) {
/*
* Only increment phys if it doesn ' t describe
* a hole .
*/
phys + + ;
}
desc - > c_phys = phys ;
if ( phys = = 0 ) {
desc - > c_new = 1 ;
2007-05-15 05:09:54 +04:00
* clusters_to_alloc = * clusters_to_alloc + 1 ;
2007-05-09 04:47:32 +04:00
}
num_clusters - - ;
2007-02-10 07:24:12 +03:00
}
2007-05-15 05:09:54 +04:00
ret = 0 ;
out :
return ret ;
}
/*
 * Set up a buffered write of 'len' bytes at 'pos'. No locks are taken
 * in this function itself.
 *
 * Steps, in order: allocate a write context, describe every cluster in
 * the range, lock allocators if any cluster is a hole, start a
 * transaction, get journal access to the dinode buffer, grab the pages
 * and finally prepare each cluster.
 *
 * On success returns 0, stores the target page in *pagep and the write
 * context in *fsdata. The transaction is left open in wc->w_handle;
 * ocfs2_write_end_nolock() commits it and frees the context.
 *
 * On failure returns a negative error code, commits the transaction if
 * one was started and frees the write context. The allocation contexts
 * are freed on both paths.
 *
 * 'flags' is not used here. 'mmap_page' is passed through to
 * ocfs2_grab_pages_for_write() and may be NULL.
 */
int ocfs2_write_begin_nolock(struct address_space *mapping,
			     loff_t pos, unsigned len, unsigned flags,
			     struct page **pagep, void **fsdata,
			     struct buffer_head *di_bh, struct page *mmap_page)
{
	int ret;
	int credits = OCFS2_INODE_UPDATE_CREDITS;
	unsigned int clusters_to_alloc = 0;
	struct inode *inode = mapping->host;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_write_ctxt *wc;
	struct ocfs2_dinode *di;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	handle_t *handle;

	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc);
	if (ret) {
		mlog_errno(ret);
		goto out_free_ctxt;
	}

	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;

	if (clusters_to_alloc > 0) {
		/*
		 * XXX: We are stretching the limits of
		 * ocfs2_lock_allocators(). It greatly over-estimates
		 * the work to be done.
		 */
		ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
					    &data_ac, &meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out_free_ctxt;
		}

		credits = ocfs2_calc_extend_credits(inode->i_sb, di,
						    clusters_to_alloc);
	}

	/*
	 * We set w_target_from, w_target_to here so that
	 * ocfs2_write_end() knows which range in the target page to
	 * write out. An allocation requires that we write the entire
	 * cluster range.
	 */
	ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc);

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_free_ctxt;
	}

	wc->w_handle = handle;

	/*
	 * We don't want this to fail in ocfs2_write_end(), so do it
	 * here.
	 */
	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit_trans;
	}

	/*
	 * Fill our page array first. That way we've grabbed enough so
	 * that we can zero and flush if we error after adding the
	 * extent.
	 */
	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
					 clusters_to_alloc, mmap_page);
	if (ret) {
		mlog_errno(ret);
		goto out_commit_trans;
	}

	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
					  len);
	if (ret) {
		mlog_errno(ret);
		goto out_commit_trans;
	}

	/* Success: ret is 0 here. Hand the page and context to the caller. */
	*pagep = wc->w_target_page;
	*fsdata = wc;
	goto out_free_alloc;

out_commit_trans:
	ocfs2_commit_trans(osb, handle);
out_free_ctxt:
	ocfs2_free_write_ctxt(wc);
out_free_alloc:
	/* The allocation contexts are dropped on success and failure. */
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);
	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);

	return ret;
}
2007-05-10 02:14:45 +04:00
/*
 * Locking wrapper around ocfs2_write_begin_nolock().
 *
 * Takes, in order: the meta lock (ocfs2_meta_lock), ip_alloc_sem for
 * write, and the data lock (ocfs2_data_lock). On success all three are
 * still held when this returns 0; ocfs2_write_end() drops them. On
 * failure everything taken so far is released and the error returned.
 *
 * The dinode buffer head obtained from ocfs2_meta_lock() is released
 * with brelse() on every path once the meta lock has been taken.
 */
int ocfs2_write_begin(struct file *file, struct address_space *mapping,
		      loff_t pos, unsigned len, unsigned flags,
		      struct page **pagep, void **fsdata)
{
	int status;
	struct inode *inode = mapping->host;
	struct buffer_head *di_bh = NULL;

	status = ocfs2_meta_lock(inode, &di_bh, 1);
	if (status) {
		mlog_errno(status);
		return status;
	}

	/*
	 * Take alloc sem here to prevent concurrent lookups. That way
	 * the mapping, zeroing and tree manipulation within
	 * ocfs2_write() will be safe against ->readpage(). This
	 * should also serve to lock out allocation from a shared
	 * writeable region.
	 */
	down_write(&OCFS2_I(inode)->ip_alloc_sem);

	status = ocfs2_data_lock(inode, 1);
	if (status) {
		mlog_errno(status);
		goto unlock_alloc_sem;
	}

	status = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
					  fsdata, di_bh, NULL);
	if (!status) {
		/* Locks stay held until ocfs2_write_end(). */
		brelse(di_bh);
		return 0;
	}

	mlog_errno(status);
	ocfs2_data_unlock(inode, 1);

unlock_alloc_sem:
	up_write(&OCFS2_I(inode)->ip_alloc_sem);
	brelse(di_bh);
	ocfs2_meta_unlock(inode, 1);

	return status;
}
2007-05-10 02:16:19 +04:00
/*
 * Finish a write set up by ocfs2_write_begin_nolock(). No locks are
 * released in this function.
 *
 * 'fsdata' is the write context. 'copied' is how many of the 'len'
 * requested bytes were actually copied into the target page. The
 * 'page' argument is not used; the target page is taken from the
 * write context.
 *
 * Commits the buffers of every page in the context, updates i_size if
 * the write went past it, refreshes i_blocks and the m/ctime in both
 * the inode and the on-disk dinode, dirties the dinode buffer in the
 * journal, commits the transaction and frees the write context.
 *
 * Returns the number of bytes accounted as copied, which is forced to
 * 0 for a short copy into a page that is not up to date.
 */
int ocfs2_write_end_nolock(struct address_space *mapping,
			   loff_t pos, unsigned len, unsigned copied,
			   struct page *page, void *fsdata)
{
	int idx;
	unsigned from, to;
	unsigned page_off = pos & (PAGE_CACHE_SIZE - 1);
	struct inode *inode = mapping->host;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_write_ctxt *wc = fsdata;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
	handle_t *handle = wc->w_handle;
	struct page *cur_page;

	if (unlikely(copied < len)) {
		/* Short copy into a stale page: count nothing as copied. */
		if (!PageUptodate(wc->w_target_page))
			copied = 0;

		ocfs2_zero_new_buffers(wc->w_target_page, page_off + copied,
				       page_off + len);
	}
	flush_dcache_page(wc->w_target_page);

	for (idx = 0; idx < wc->w_num_pages; idx++) {
		cur_page = wc->w_pages[idx];

		if (cur_page != wc->w_target_page) {
			/*
			 * Pages adjacent to the target (if any) imply
			 * a hole-filling write in which case we want
			 * to flush their entire range.
			 */
			from = 0;
			to = PAGE_CACHE_SIZE;
		} else {
			/* Range precomputed by ocfs2_write_begin_nolock(). */
			from = wc->w_target_from;
			to = wc->w_target_to;

			BUG_ON(from > PAGE_CACHE_SIZE ||
			       to > PAGE_CACHE_SIZE ||
			       to < from);
		}

		if (ocfs2_should_order_data(inode))
			walk_page_buffers(wc->w_handle,
					  page_buffers(cur_page),
					  from, to, NULL,
					  ocfs2_journal_dirty_data);

		block_commit_write(cur_page, from, to);
	}

	/* Extend i_size if the copied data reaches past it. */
	pos += copied;
	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		mark_inode_dirty(inode);
	}
	inode->i_blocks = ocfs2_inode_sector_count(inode);
	di->i_size = cpu_to_le64((u64)i_size_read(inode));

	/* Same timestamp for mtime and ctime, mirrored into the dinode. */
	inode->i_ctime = CURRENT_TIME;
	inode->i_mtime = inode->i_ctime;
	di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
	di->i_mtime = di->i_ctime;
	di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
	di->i_mtime_nsec = di->i_ctime_nsec;

	ocfs2_journal_dirty(handle, wc->w_di_bh);
	ocfs2_commit_trans(osb, handle);

	ocfs2_free_write_ctxt(wc);

	return copied;
}
/*
 * Locking wrapper around ocfs2_write_end_nolock().
 *
 * Completes the write, then releases what ocfs2_write_begin() left
 * held: the data lock, ip_alloc_sem and the meta lock, in that order.
 *
 * Returns whatever ocfs2_write_end_nolock() returned.
 */
int ocfs2_write_end(struct file *file, struct address_space *mapping,
		    loff_t pos, unsigned len, unsigned copied,
		    struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	int written;

	written = ocfs2_write_end_nolock(mapping, pos, len, copied, page,
					 fsdata);

	ocfs2_data_unlock(inode, 1);
	up_write(&OCFS2_I(inode)->ip_alloc_sem);
	ocfs2_meta_unlock(inode, 1);

	return written;
}
2006-06-28 15:26:44 +04:00
const struct address_space_operations ocfs2_aops = {
2005-12-16 01:31:24 +03:00
. readpage = ocfs2_readpage ,
. writepage = ocfs2_writepage ,
. bmap = ocfs2_bmap ,
. sync_page = block_sync_page ,
2007-01-05 01:54:41 +03:00
. direct_IO = ocfs2_direct_IO ,
. invalidatepage = ocfs2_invalidatepage ,
. releasepage = ocfs2_releasepage ,
. migratepage = buffer_migrate_page ,
2005-12-16 01:31:24 +03:00
} ;