/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * journal.c
 *
 * Defines functions of journalling api
 *
 * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/kthread.h>

#define MLOG_MASK_PREFIX ML_JOURNAL
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "slot_map.h"
#include "super.h"
#include "sysfile.h"

#include "buffer_head_io.h"

DEFINE_SPINLOCK(trans_inc_lock);

static int ocfs2_force_read_journal(struct inode *inode);
static int ocfs2_recover_node(struct ocfs2_super *osb,
                              int node_num);
static int __ocfs2_recovery_thread(void *arg);
static int ocfs2_commit_cache(struct ocfs2_super *osb);
static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
                                      int dirty);
static int ocfs2_trylock_journal(struct ocfs2_super *osb,
                                 int slot_num);
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                                 int slot);
static int ocfs2_commit_thread(void *arg);

/*
 * The recovery map is a simple array of node numbers to recover.
 * It is protected by osb->osb_lock.
 */
struct ocfs2_recovery_map {
        unsigned int rm_used;
        unsigned int *rm_entries;
};
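
/* Set up per-mount recovery state: the recovery mutex, the wait queue
 * used to signal recovery completion, and a recovery map with room for
 * one entry per slot. */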
int ocfs2_recovery_init(struct ocfs2_super *osb)
{
        struct ocfs2_recovery_map *rm;

        mutex_init(&osb->recovery_lock);
        osb->disable_recovery = 0;
        osb->recovery_thread_task = NULL;
        init_waitqueue_head(&osb->recovery_event);

        rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
                     osb->max_slots * sizeof(unsigned int),
                     GFP_KERNEL);
        if (!rm) {
                mlog_errno(-ENOMEM);
                return -ENOMEM;
        }

        rm->rm_entries = (unsigned int *)((char *)rm +
                                          sizeof(struct ocfs2_recovery_map));
        osb->recovery_map = rm;

        return 0;
}

/* we can't grab the goofy sem lock from inside wait_event, so we use
 * memory barriers to make sure that we'll see the null task before
 * being woken up */
static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
{
        mb();
        return osb->recovery_thread_task != NULL;
}
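
/* Shut recovery down for unmount: block new recovery threads, wait for
 * any running one to exit, flush queued recovery completion work, and
 * free the recovery map. */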
void ocfs2_recovery_exit(struct ocfs2_super *osb)
{
        struct ocfs2_recovery_map *rm;

        /* disable any new recovery threads and wait for any currently
         * running ones to exit. Do this before setting the vol_state. */
        mutex_lock(&osb->recovery_lock);
        osb->disable_recovery = 1;
        mutex_unlock(&osb->recovery_lock);
        wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));

        /* At this point, we know that no more recovery threads can be
         * launched, so wait for any recovery completion work to
         * complete. */
        flush_workqueue(ocfs2_wq);

        /*
         * Now that recovery is shut down, and the osb is about to be
         * freed, the osb_lock is not taken here.
         */
        rm = osb->recovery_map;
        /* XXX: Should we bug if there are dirty entries? */

        kfree(rm);
}
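
/* Returns nonzero if node_num is already present in the recovery map.
 * The caller must hold osb->osb_lock. */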
static int __ocfs2_recovery_map_test(struct ocfs2_super *osb,
                                     unsigned int node_num)
{
        int i;
        struct ocfs2_recovery_map *rm = osb->recovery_map;

        assert_spin_locked(&osb->osb_lock);

        for (i = 0; i < rm->rm_used; i++) {
                if (rm->rm_entries[i] == node_num)
                        return 1;
        }

        return 0;
}

/* Behaves like test-and-set.  Returns the previous value */
static int ocfs2_recovery_map_set(struct ocfs2_super *osb,
                                  unsigned int node_num)
{
        struct ocfs2_recovery_map *rm = osb->recovery_map;

        spin_lock(&osb->osb_lock);
        if (__ocfs2_recovery_map_test(osb, node_num)) {
                spin_unlock(&osb->osb_lock);
                return 1;
        }

        /* XXX: Can this be exploited? Not from o2dlm... */
        BUG_ON(rm->rm_used >= osb->max_slots);

        rm->rm_entries[rm->rm_used] = node_num;
        rm->rm_used++;
        spin_unlock(&osb->osb_lock);

        return 0;
}
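
/* Remove node_num from the recovery map, if present, shifting the
 * remaining entries down to keep the array dense. */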
static void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
                                     unsigned int node_num)
{
        int i;
        struct ocfs2_recovery_map *rm = osb->recovery_map;

        spin_lock(&osb->osb_lock);

        for (i = 0; i < rm->rm_used; i++) {
                if (rm->rm_entries[i] == node_num)
                        break;
        }

        if (i < rm->rm_used) {
                /* XXX: be careful with the pointer math */
                memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]),
                        (rm->rm_used - i - 1) * sizeof(unsigned int));
                rm->rm_used--;
        }

        spin_unlock(&osb->osb_lock);
}
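
/* Flush and checkpoint the journal under j_trans_barrier, bump the
 * transaction id and wake anyone waiting on a checkpoint. */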
static int ocfs2_commit_cache(struct ocfs2_super *osb)
{
        int status = 0;
        unsigned int flushed;
        unsigned long old_id;
        struct ocfs2_journal *journal = NULL;

        mlog_entry_void();

        journal = osb->journal;

        /* Flush all pending commits and checkpoint the journal. */
        down_write(&journal->j_trans_barrier);

        if (atomic_read(&journal->j_num_trans) == 0) {
                up_write(&journal->j_trans_barrier);
                mlog(0, "No transactions for me to flush!\n");
                goto finally;
        }

        journal_lock_updates(journal->j_journal);
        status = journal_flush(journal->j_journal);
        journal_unlock_updates(journal->j_journal);
        if (status < 0) {
                up_write(&journal->j_trans_barrier);
                mlog_errno(status);
                goto finally;
        }

        old_id = ocfs2_inc_trans_id(journal);

        flushed = atomic_read(&journal->j_num_trans);
        atomic_set(&journal->j_num_trans, 0);
        up_write(&journal->j_trans_barrier);

        mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
             journal->j_trans_id, flushed);

        ocfs2_wake_downconvert_thread(osb);
        wake_up(&journal->j_checkpointed);
finally:
        mlog_exit(status);
        return status;
}

/* Start a new transaction with room for max_buffs buffer credits.
 * Returns a handle on success and an ERR_PTR() on failure. */
handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
{
        journal_t *journal = osb->journal->j_journal;
        handle_t *handle;

        BUG_ON(!osb || !osb->journal->j_journal);

        if (ocfs2_is_hard_readonly(osb))
                return ERR_PTR(-EROFS);

        BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
        BUG_ON(max_buffs <= 0);

        /* JBD might support this, but our journalling code doesn't yet. */
        if (journal_current_handle()) {
                mlog(ML_ERROR, "Recursive transaction attempted!\n");
                BUG();
        }

        down_read(&osb->journal->j_trans_barrier);

        handle = journal_start(journal, max_buffs);
        if (IS_ERR(handle)) {
                up_read(&osb->journal->j_trans_barrier);

                mlog_errno(PTR_ERR(handle));

                if (is_journal_aborted(journal)) {
                        ocfs2_abort(osb->sb, "Detected aborted journal");
                        handle = ERR_PTR(-EROFS);
                }
        } else {
                if (!ocfs2_mount_local(osb))
                        atomic_inc(&(osb->journal->j_num_trans));
        }

        return handle;
}

int ocfs2_commit_trans(struct ocfs2_super *osb,
                       handle_t *handle)
{
        int ret;
        struct ocfs2_journal *journal = osb->journal;

        BUG_ON(!handle);

        ret = journal_stop(handle);
        if (ret < 0)
                mlog_errno(ret);

        up_read(&journal->j_trans_barrier);

        return ret;
}

/*
 * 'nblocks' is what you want to add to the current
 * transaction. extend_trans will either extend the current handle by
 * nblocks, or commit it and start a new one with nblocks credits.
 *
 * This might call journal_restart() which will commit dirty buffers
 * and then restart the transaction. Before calling
 * ocfs2_extend_trans(), any changed blocks should have been
 * dirtied. After calling it, all blocks which need to be changed must
 * go through another set of journal_access/journal_dirty calls.
 *
 * WARNING: This will not release any semaphores or disk locks taken
 * during the transaction, so make sure they were taken *before*
 * start_trans or we'll have ordering deadlocks.
 *
 * WARNING2: Note that we do *not* drop j_trans_barrier here. This is
 * good because transaction ids haven't yet been recorded on the
 * cluster locks associated with this handle.
 */
int ocfs2_extend_trans(handle_t *handle, int nblocks)
{
        int status;

        BUG_ON(!handle);
        BUG_ON(!nblocks);

        mlog_entry_void();

        mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);

#ifdef OCFS2_DEBUG_FS
        status = 1;
#else
        status = journal_extend(handle, nblocks);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
#endif

        if (status > 0) {
                mlog(0, "journal_extend failed, trying journal_restart\n");
                status = journal_restart(handle, nblocks);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
        }

        status = 0;
bail:
        mlog_exit(status);
        return status;
}

int ocfs2_journal_access(handle_t *handle,
                         struct inode *inode,
                         struct buffer_head *bh,
                         int type)
{
        int status;

        BUG_ON(!inode);
        BUG_ON(!handle);
        BUG_ON(!bh);

        mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n",
                   (unsigned long long)bh->b_blocknr, type,
                   (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
                   "OCFS2_JOURNAL_ACCESS_CREATE" :
                   "OCFS2_JOURNAL_ACCESS_WRITE",
                   bh->b_size);

        /* we can safely remove this assertion after testing. */
        if (!buffer_uptodate(bh)) {
                mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
                mlog(ML_ERROR, "b_blocknr=%llu\n",
                     (unsigned long long)bh->b_blocknr);
                BUG();
        }

        /* Set the current transaction information on the inode so
         * that the locking code knows whether it can drop its locks
         * on this inode or not. We're protected from the commit
         * thread updating the current transaction id until
         * ocfs2_commit_trans() because ocfs2_start_trans() took
         * j_trans_barrier for us. */
        ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);

        mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
        switch (type) {
        case OCFS2_JOURNAL_ACCESS_CREATE:
        case OCFS2_JOURNAL_ACCESS_WRITE:
                status = journal_get_write_access(handle, bh);
                break;

        case OCFS2_JOURNAL_ACCESS_UNDO:
                status = journal_get_undo_access(handle, bh);
                break;

        default:
                status = -EINVAL;
                mlog(ML_ERROR, "Unknown access type!\n");
        }
        mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);

        if (status < 0)
                mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
                     status, type);

        mlog_exit(status);
        return status;
}

int ocfs2_journal_dirty(handle_t *handle,
                        struct buffer_head *bh)
{
        int status;

        mlog_entry("(bh->b_blocknr=%llu)\n",
                   (unsigned long long)bh->b_blocknr);

        status = journal_dirty_metadata(handle, bh);
        if (status < 0)
                mlog(ML_ERROR, "Could not dirty metadata buffer. "
                     "(bh->b_blocknr=%llu)\n",
                     (unsigned long long)bh->b_blocknr);

        mlog_exit(status);
        return status;
}

int ocfs2_journal_dirty_data(handle_t *handle,
                             struct buffer_head *bh)
{
        int err = journal_dirty_data(handle, bh);
        if (err)
                mlog_errno(err);
        /* TODO: When we can handle it, abort the handle and go RO on
         * error here. */

        return err;
}
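
/* The default commit interval mirrors jbd's; ocfs2_set_journal_params()
 * pushes the configured commit interval and barrier mount option into
 * the jbd journal. */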
#define OCFS2_DEFAULT_COMMIT_INTERVAL   (HZ * JBD_DEFAULT_MAX_COMMIT_AGE)

void ocfs2_set_journal_params(struct ocfs2_super *osb)
{
        journal_t *journal = osb->journal->j_journal;
        unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;

        if (osb->osb_commit_interval)
                commit_interval = osb->osb_commit_interval;

        spin_lock(&journal->j_state_lock);
        journal->j_commit_interval = commit_interval;
        if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
                journal->j_flags |= JFS_BARRIER;
        else
                journal->j_flags &= ~JFS_BARRIER;
        spin_unlock(&journal->j_state_lock);
}
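
/* Look up our slot's journal system inode, take the cluster lock on it,
 * hand it to jbd via journal_init_inode() and report whether the
 * journal was left dirty by a previous mount. */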
int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
{
        int status = -1;
        struct inode *inode = NULL; /* the journal inode */
        journal_t *j_journal = NULL;
        struct ocfs2_dinode *di = NULL;
        struct buffer_head *bh = NULL;
        struct ocfs2_super *osb;
        int inode_lock = 0;

        mlog_entry_void();

        BUG_ON(!journal);

        osb = journal->j_osb;

        /* already have the inode for our journal */
        inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
                                            osb->slot_num);
        if (inode == NULL) {
                status = -EACCES;
                mlog_errno(status);
                goto done;
        }
        if (is_bad_inode(inode)) {
                mlog(ML_ERROR, "access error (bad inode)\n");
                iput(inode);
                inode = NULL;
                status = -EACCES;
                goto done;
        }

        SET_INODE_JOURNAL(inode);
        OCFS2_I(inode)->ip_open_count++;

        /* Skip recovery waits here - journal inode metadata never
         * changes in a live cluster so it can be considered an
         * exception to the rule. */
        status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
        if (status < 0) {
                if (status != -ERESTARTSYS)
                        mlog(ML_ERROR, "Could not get lock on journal!\n");
                goto done;
        }

        inode_lock = 1;
        di = (struct ocfs2_dinode *)bh->b_data;

        if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
                mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
                     inode->i_size);
                status = -EINVAL;
                goto done;
        }

        mlog(0, "inode->i_size = %lld\n", inode->i_size);
        mlog(0, "inode->i_blocks = %llu\n",
             (unsigned long long)inode->i_blocks);
        mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);

        /* call the kernel's journal init function now */
        j_journal = journal_init_inode(inode);
        if (j_journal == NULL) {
                mlog(ML_ERROR, "Linux journal layer error\n");
                status = -EINVAL;
                goto done;
        }

        mlog(0, "Returned from journal_init_inode\n");
        mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);

        *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
                  OCFS2_JOURNAL_DIRTY_FL);

        journal->j_journal = j_journal;
        journal->j_inode = inode;
        journal->j_bh = bh;

        ocfs2_set_journal_params(osb);

        journal->j_state = OCFS2_JOURNAL_LOADED;

        status = 0;
done:
        if (status < 0) {
                if (inode_lock)
                        ocfs2_inode_unlock(inode, 1);
                if (bh != NULL)
                        brelse(bh);
                if (inode) {
                        OCFS2_I(inode)->ip_open_count--;
                        iput(inode);
                }
        }

        mlog_exit(status);
        return status;
}
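
/* Set or clear OCFS2_JOURNAL_DIRTY_FL in the journal dinode and write
 * the block back to disk. */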
static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
                                      int dirty)
{
        int status;
        unsigned int flags;
        struct ocfs2_journal *journal = osb->journal;
        struct buffer_head *bh = journal->j_bh;
        struct ocfs2_dinode *fe;

        mlog_entry_void();

        fe = (struct ocfs2_dinode *)bh->b_data;
        if (!OCFS2_IS_VALID_DINODE(fe)) {
                /* This is called from startup/shutdown which will
                 * handle the errors in a specific manner, so no need
                 * to call ocfs2_error() here. */
                mlog(ML_ERROR, "Journal dinode %llu has invalid "
                     "signature: %.*s",
                     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
                     fe->i_signature);
                status = -EIO;
                goto out;
        }

        flags = le32_to_cpu(fe->id1.journal1.ij_flags);
        if (dirty)
                flags |= OCFS2_JOURNAL_DIRTY_FL;
        else
                flags &= ~OCFS2_JOURNAL_DIRTY_FL;
        fe->id1.journal1.ij_flags = cpu_to_le32(flags);

        status = ocfs2_write_block(osb, bh, journal->j_inode);
        if (status < 0)
                mlog_errno(status);

out:
        mlog_exit(status);
        return status;
}

/*
 * If the journal has been kmalloc'd it needs to be freed after this
 * call.
 */
void ocfs2_journal_shutdown(struct ocfs2_super *osb)
{
        struct ocfs2_journal *journal = NULL;
        int status = 0;
        struct inode *inode = NULL;
        int num_running_trans = 0;

        mlog_entry_void();

        BUG_ON(!osb);

        journal = osb->journal;
        if (!journal)
                goto done;

        inode = journal->j_inode;

        if (journal->j_state != OCFS2_JOURNAL_LOADED)
                goto done;

        /* need to inc inode use count as journal_destroy will iput. */
        if (!igrab(inode))
                BUG();

        num_running_trans = atomic_read(&(osb->journal->j_num_trans));
        if (num_running_trans > 0)
                mlog(0, "Shutting down journal: must wait on %d "
                     "running transactions!\n",
                     num_running_trans);

        /* Do a commit_cache here. It will flush our journal, *and*
         * release any locks that are still held.
         * set the SHUTDOWN flag and release the trans lock.
         * the commit thread will take the trans lock for us below. */
        journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN;

        /* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not
         * drop the trans_lock (which we want to hold until we
         * completely destroy the journal). */
        if (osb->commit_task) {
                /* Wait for the commit thread */
                mlog(0, "Waiting for ocfs2commit to exit....\n");
                kthread_stop(osb->commit_task);
                osb->commit_task = NULL;
        }

        BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);

        if (ocfs2_mount_local(osb)) {
                journal_lock_updates(journal->j_journal);
                status = journal_flush(journal->j_journal);
                journal_unlock_updates(journal->j_journal);
                if (status < 0)
                        mlog_errno(status);
        }

        if (status == 0) {
                /*
                 * Do not toggle if flush was unsuccessful otherwise
                 * will leave dirty metadata in a "clean" journal
                 */
                status = ocfs2_journal_toggle_dirty(osb, 0);
                if (status < 0)
                        mlog_errno(status);
        }

        /* Shutdown the kernel journal system */
        journal_destroy(journal->j_journal);

        OCFS2_I(inode)->ip_open_count--;

        /* unlock our journal */
        ocfs2_inode_unlock(inode, 1);

        brelse(journal->j_bh);
        journal->j_bh = NULL;

        journal->j_state = OCFS2_JOURNAL_FREE;

//      up_write(&journal->j_trans_barrier);
done:
        if (inode)
                iput(inode);
        mlog_exit_void();
}
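
/* If jbd recorded an error in this journal, log it and ack/clear it so
 * that replay can proceed. */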
static void ocfs2_clear_journal_error(struct super_block *sb,
                                      journal_t *journal,
                                      int slot)
{
        int olderr;

        olderr = journal_errno(journal);
        if (olderr) {
                mlog(ML_ERROR, "File system error %d recorded in "
                     "journal %u.\n", olderr, slot);
                mlog(ML_ERROR, "File system on device %s needs checking.\n",
                     sb->s_id);

                journal_ack_err(journal);
                journal_clear_err(journal);
        }
}
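
/* Load our own journal, mark it dirty on disk and, for cluster mounts,
 * start the ocfs2cmt commit thread. */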
int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
{
        int status = 0;
        struct ocfs2_super *osb;

        mlog_entry_void();

        BUG_ON(!journal);

        osb = journal->j_osb;

        status = journal_load(journal->j_journal);
        if (status < 0) {
                mlog(ML_ERROR, "Failed to load journal!\n");
                goto done;
        }

        ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);

        status = ocfs2_journal_toggle_dirty(osb, 1);
        if (status < 0) {
                mlog_errno(status);
                goto done;
        }

        /* Launch the commit thread */
        if (!local) {
                osb->commit_task = kthread_run(ocfs2_commit_thread, osb,
                                               "ocfs2cmt");
                if (IS_ERR(osb->commit_task)) {
                        status = PTR_ERR(osb->commit_task);
                        osb->commit_task = NULL;
                        mlog(ML_ERROR, "unable to launch ocfs2commit thread, "
                             "error=%d", status);
                        goto done;
                }
        } else
                osb->commit_task = NULL;

done:
        mlog_exit(status);
        return status;
}

/* 'full' flag tells us whether we clear out all blocks or if we just
 * mark the journal clean */
int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
{
        int status;

        mlog_entry_void();

        BUG_ON(!journal);

        status = journal_wipe(journal->j_journal, full);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
        if (status < 0)
                mlog_errno(status);

bail:
        mlog_exit(status);
        return status;
}
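
/* Recovery is finished once the recovery map has been emptied; callers
 * of ocfs2_wait_for_recovery() block on recovery_event until then. */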
static int ocfs2_recovery_completed(struct ocfs2_super *osb)
{
        int empty;
        struct ocfs2_recovery_map *rm = osb->recovery_map;

        spin_lock(&osb->osb_lock);
        empty = (rm->rm_used == 0);
        spin_unlock(&osb->osb_lock);

        return empty;
}

void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
{
        wait_event(osb->recovery_event, ocfs2_recovery_completed(osb));
}

/*
 * JBD might read a cached version of another node's journal file. We
 * don't want this as this file changes often and we get no
 * notification on those changes. The only way to be sure that we've
 * got the most up to date version of those blocks then is to force
 * read them off disk. Just searching through the buffer cache won't
 * work as there may be pages backing this file which are still marked
 * up to date. We know things can't change on this file underneath us
 * as we have the lock by now :)
 */
static int ocfs2_force_read_journal(struct inode *inode)
{
        int status = 0;
        int i;
        u64 v_blkno, p_blkno, p_blocks, num_blocks;
#define CONCURRENT_JOURNAL_FILL 32ULL
        struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];

        mlog_entry_void();

        memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);

        num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
        v_blkno = 0;
        while (v_blkno < num_blocks) {
                status = ocfs2_extent_map_get_blocks(inode, v_blkno,
                                                     &p_blkno, &p_blocks, NULL);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }

                if (p_blocks > CONCURRENT_JOURNAL_FILL)
                        p_blocks = CONCURRENT_JOURNAL_FILL;

                /* We are reading journal data which should not
                 * be put in the uptodate cache */
                status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
                                           p_blkno, p_blocks, bhs, 0,
                                           NULL);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }

                for (i = 0; i < p_blocks; i++) {
                        brelse(bhs[i]);
                        bhs[i] = NULL;
                }

                v_blkno += p_blocks;
        }

bail:
        for (i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
                if (bhs[i])
                        brelse(bhs[i]);
        mlog_exit(status);
        return status;
}
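
/* Work item queued on journal->j_la_cleanups; carries the local alloc
 * and truncate log copies for one recovered slot so that
 * ocfs2_complete_recovery() can finish cleanup from keventd. */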
struct ocfs2_la_recovery_item {
        struct list_head        lri_list;
        int                     lri_slot;
        struct ocfs2_dinode     *lri_la_dinode;
        struct ocfs2_dinode     *lri_tl_dinode;
};

/* Does the second half of the recovery process. By this point, the
 * node is marked clean and can actually be considered recovered,
 * hence it's no longer in the recovery map, but there's still some
 * cleanup we can do which shouldn't happen within the recovery thread
 * as locking in that context becomes very difficult if we are to take
 * recovering nodes into account.
 *
 * NOTE: This function can and will sleep on recovery of other nodes
 * during cluster locking, just like any other ocfs2 process.
 */
void ocfs2_complete_recovery(struct work_struct *work)
{
        int ret;
        struct ocfs2_journal *journal =
                container_of(work, struct ocfs2_journal, j_recovery_work);
        struct ocfs2_super *osb = journal->j_osb;
        struct ocfs2_dinode *la_dinode, *tl_dinode;
        struct ocfs2_la_recovery_item *item, *n;
        LIST_HEAD(tmp_la_list);

        mlog_entry_void();

        mlog(0, "completing recovery from keventd\n");

        spin_lock(&journal->j_lock);
        list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
        spin_unlock(&journal->j_lock);

        list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
                list_del_init(&item->lri_list);

                mlog(0, "Complete recovery for slot %d\n", item->lri_slot);

                la_dinode = item->lri_la_dinode;
                if (la_dinode) {
                        mlog(0, "Clean up local alloc %llu\n",
                             (unsigned long long)le64_to_cpu(la_dinode->i_blkno));

                        ret = ocfs2_complete_local_alloc_recovery(osb,
                                                                  la_dinode);
                        if (ret < 0)
                                mlog_errno(ret);

                        kfree(la_dinode);
                }

                tl_dinode = item->lri_tl_dinode;
                if (tl_dinode) {
                        mlog(0, "Clean up truncate log %llu\n",
                             (unsigned long long)le64_to_cpu(tl_dinode->i_blkno));

                        ret = ocfs2_complete_truncate_log_recovery(osb,
                                                                   tl_dinode);
                        if (ret < 0)
                                mlog_errno(ret);

                        kfree(tl_dinode);
                }

                ret = ocfs2_recover_orphans(osb, item->lri_slot);
                if (ret < 0)
                        mlog_errno(ret);

                kfree(item);
        }

        mlog(0, "Recovery completion\n");
        mlog_exit_void();
}

/* NOTE: This function always eats your references to la_dinode and
 * tl_dinode, either manually on error, or by passing them to
 * ocfs2_complete_recovery */
static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
                                            int slot_num,
                                            struct ocfs2_dinode *la_dinode,
                                            struct ocfs2_dinode *tl_dinode)
{
        struct ocfs2_la_recovery_item *item;

        item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_NOFS);
        if (!item) {
                /* Though we wish to avoid it, we are in fact safe in
                 * skipping local alloc cleanup as fsck.ocfs2 is more
                 * than capable of reclaiming unused space. */
                if (la_dinode)
                        kfree(la_dinode);

                if (tl_dinode)
                        kfree(tl_dinode);

                mlog_errno(-ENOMEM);
                return;
        }

        INIT_LIST_HEAD(&item->lri_list);
        item->lri_la_dinode = la_dinode;
        item->lri_slot = slot_num;
        item->lri_tl_dinode = tl_dinode;

        spin_lock(&journal->j_lock);
        list_add_tail(&item->lri_list, &journal->j_la_cleanups);
        queue_work(ocfs2_wq, &journal->j_recovery_work);
        spin_unlock(&journal->j_lock);
}

/* Called by the mount code to queue the last part of recovery for its
 * own slot. */
void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
{
        struct ocfs2_journal *journal = osb->journal;

        if (osb->dirty) {
                /* No need to queue up our truncate_log as regular
                 * cleanup will catch that. */
                ocfs2_queue_recovery_completion(journal,
                                                osb->slot_num,
                                                osb->local_alloc_copy,
                                                NULL);
                ocfs2_schedule_truncate_log_flush(osb, 0);

                osb->local_alloc_copy = NULL;
                osb->dirty = 0;
        }
}
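
/* Main loop of the recovery thread: under the superblock lock, pull
 * node numbers off the recovery map and replay each dead node's
 * journal, then queue completion work for our own orphan dir. */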
static int __ocfs2_recovery_thread(void *arg)
{
        int status, node_num;
        struct ocfs2_super *osb = arg;
        struct ocfs2_recovery_map *rm = osb->recovery_map;

        mlog_entry_void();

        status = ocfs2_wait_on_mount(osb);
        if (status < 0) {
                goto bail;
        }

restart:
        status = ocfs2_super_lock(osb, 1);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        spin_lock(&osb->osb_lock);
        while (rm->rm_used) {
                /* It's always safe to remove entry zero, as we won't
                 * clear it until ocfs2_recover_node() has succeeded. */
                node_num = rm->rm_entries[0];
                spin_unlock(&osb->osb_lock);

                status = ocfs2_recover_node(osb, node_num);
                if (!status) {
                        ocfs2_recovery_map_clear(osb, node_num);
                } else {
                        mlog(ML_ERROR,
                             "Error %d recovering node %d on device (%u,%u)!\n",
                             status, node_num,
                             MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
                        mlog(ML_ERROR, "Volume requires unmount.\n");
                }

                spin_lock(&osb->osb_lock);
        }
        spin_unlock(&osb->osb_lock);
        mlog(0, "All nodes recovered\n");

        ocfs2_super_unlock(osb, 1);

        /* We always run recovery on our own orphan dir - the dead
         * node(s) may have disallowed a previous inode delete. Re-processing
         * is therefore required. */
        ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
                                        NULL);

bail:
        mutex_lock(&osb->recovery_lock);
        if (!status && !ocfs2_recovery_completed(osb)) {
                mutex_unlock(&osb->recovery_lock);
                goto restart;
        }

        osb->recovery_thread_task = NULL;
        mb(); /* sync with ocfs2_recovery_thread_running */
        wake_up(&osb->recovery_event);

        mutex_unlock(&osb->recovery_lock);

        mlog_exit(status);
        /* no one is calling kthread_stop() for us so the kthread() api
         * requires that we call do_exit().  And it isn't exported, but
         * complete_and_exit() seems to be a minimal wrapper around it. */
        complete_and_exit(NULL, status);
        return status;
}
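
/* Add node_num to the recovery map and, unless recovery has been
 * disabled for unmount, make sure a recovery thread is running. */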
void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
{
        mlog_entry("(node_num=%d, osb->node_num = %d)\n",
                   node_num, osb->node_num);

        mutex_lock(&osb->recovery_lock);
        if (osb->disable_recovery)
                goto out;

        /* People waiting on recovery will wait on
         * the recovery map to empty. */
        if (ocfs2_recovery_map_set(osb, node_num))
                mlog(0, "node %d already in recovery map.\n", node_num);

        mlog(0, "starting recovery thread...\n");

        if (osb->recovery_thread_task)
                goto out;

        osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
                                                "ocfs2rec");
        if (IS_ERR(osb->recovery_thread_task)) {
                mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
                osb->recovery_thread_task = NULL;
        }

out:
        mutex_unlock(&osb->recovery_lock);
        wake_up(&osb->recovery_event);

        mlog_exit_void();
}

/* Does the actual journal replay and marks the journal inode as
 * clean. Will only replay if the journal inode is marked dirty. */
static int ocfs2_replay_journal(struct ocfs2_super *osb,
                                int node_num,
                                int slot_num)
{
        int status;
        int got_lock = 0;
        unsigned int flags;
        struct inode *inode = NULL;
        struct ocfs2_dinode *fe;
        journal_t *journal = NULL;
        struct buffer_head *bh = NULL;

        inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
                                            slot_num);
        if (inode == NULL) {
                status = -EACCES;
                mlog_errno(status);
                goto done;
        }
        if (is_bad_inode(inode)) {
                status = -EACCES;
                iput(inode);
                inode = NULL;
                mlog_errno(status);
                goto done;
        }
        SET_INODE_JOURNAL(inode);

        status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
        if (status < 0) {
                mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
                if (status != -ERESTARTSYS)
                        mlog(ML_ERROR, "Could not lock journal!\n");
                goto done;
        }
        got_lock = 1;

        fe = (struct ocfs2_dinode *)bh->b_data;

        flags = le32_to_cpu(fe->id1.journal1.ij_flags);

        if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
                mlog(0, "No recovery required for node %d\n", node_num);
                goto done;
        }

        mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
             node_num, slot_num,
             MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));

        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);

        status = ocfs2_force_read_journal(inode);
        if (status < 0) {
                mlog_errno(status);
                goto done;
        }

        mlog(0, "calling journal_init_inode\n");
        journal = journal_init_inode(inode);
        if (journal == NULL) {
                mlog(ML_ERROR, "Linux journal layer error\n");
                status = -EIO;
                goto done;
        }

        status = journal_load(journal);
        if (status < 0) {
                mlog_errno(status);
                if (!igrab(inode))
                        BUG();
                journal_destroy(journal);
                goto done;
        }

        ocfs2_clear_journal_error(osb->sb, journal, slot_num);

        /* wipe the journal */
        mlog(0, "flushing the journal.\n");
        journal_lock_updates(journal);
        status = journal_flush(journal);
        journal_unlock_updates(journal);
        if (status < 0)
                mlog_errno(status);

        /* This will mark the node clean */
        flags = le32_to_cpu(fe->id1.journal1.ij_flags);
        flags &= ~OCFS2_JOURNAL_DIRTY_FL;
        fe->id1.journal1.ij_flags = cpu_to_le32(flags);

        status = ocfs2_write_block(osb, bh, inode);
        if (status < 0)
                mlog_errno(status);

        if (!igrab(inode))
                BUG();

        journal_destroy(journal);

done:
        /* drop the lock on this node's journal */
        if (got_lock)
                ocfs2_inode_unlock(inode, 1);

        if (inode)
                iput(inode);

        if (bh)
                brelse(bh);

        mlog_exit(status);
        return status;
}

/*
 * Do the most important parts of node recovery:
 *  - Replay its journal
 *  - Stamp a clean local allocator file
 *  - Stamp a clean truncate log
 *  - Mark the node clean
 *
 * If this function completes without error, a node in OCFS2 can be
 * said to have been safely recovered. As a result, failure during the
 * second part of a node's recovery process (local alloc recovery) is
 * far less concerning.
 */
static int ocfs2_recover_node(struct ocfs2_super *osb,
                              int node_num)
{
        int status = 0;
        int slot_num;
        struct ocfs2_dinode *la_copy = NULL;
        struct ocfs2_dinode *tl_copy = NULL;

        mlog_entry("(node_num=%d, osb->node_num = %d)\n",
                   node_num, osb->node_num);

        mlog(0, "checking node %d\n", node_num);

        /* Should not ever be called to recover ourselves -- in that
         * case we should've called ocfs2_journal_load instead. */
        BUG_ON(osb->node_num == node_num);

        slot_num = ocfs2_node_num_to_slot(osb, node_num);
        if (slot_num == -ENOENT) {
                status = 0;
                mlog(0, "no slot for this node, so no recovery required.\n");
                goto done;
        }

        mlog(0, "node %d was using slot %d\n", node_num, slot_num);

        status = ocfs2_replay_journal(osb, node_num, slot_num);
        if (status < 0) {
                mlog_errno(status);
                goto done;
        }

        /* Stamp a clean local alloc file AFTER recovering the journal... */
        status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy);
        if (status < 0) {
                mlog_errno(status);
                goto done;
        }

        /* An error from begin_truncate_log_recovery is not
         * serious enough to warrant halting the rest of
         * recovery. */
        status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
        if (status < 0)
                mlog_errno(status);

        /* Likewise, this would be a strange but ultimately not so
         * harmful place to get an error... */
        status = ocfs2_clear_slot(osb, slot_num);
        if (status < 0)
                mlog_errno(status);

        /* This will kfree the memory pointed to by la_copy and tl_copy */
        ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
                                        tl_copy);

        status = 0;
done:

        mlog_exit(status);
        return status;
}

/* Test node liveness by trylocking its journal. If we get the lock,
 * we drop it here. Return 0 if we got the lock, -EAGAIN if node is
 * still alive (we couldn't get the lock) and < 0 on error. */
static int ocfs2_trylock_journal(struct ocfs2_super *osb,
                                 int slot_num)
{
        int status, flags;
        struct inode *inode = NULL;

        inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
                                            slot_num);
        if (inode == NULL) {
                mlog(ML_ERROR, "access error\n");
                status = -EACCES;
                goto bail;
        }
        if (is_bad_inode(inode)) {
                mlog(ML_ERROR, "access error (bad inode)\n");
                iput(inode);
                inode = NULL;
                status = -EACCES;
                goto bail;
        }
        SET_INODE_JOURNAL(inode);

        flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
        status = ocfs2_inode_lock_full(inode, NULL, 1, flags);
        if (status < 0) {
                if (status != -EAGAIN)
                        mlog_errno(status);
                goto bail;
        }

        ocfs2_inode_unlock(inode, 1);
bail:
        if (inode)
                iput(inode);

        return status;
}

/* Call this underneath ocfs2_super_lock. It also assumes that the
 * slot info struct has been updated from disk. */
int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
{
        unsigned int node_num;
        int status, i;

        /* This is called with the super block cluster lock, so we
         * know that the slot map can't change underneath us. */

        spin_lock(&osb->osb_lock);
        for (i = 0; i < osb->max_slots; i++) {
                if (i == osb->slot_num)
                        continue;

                status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
                if (status == -ENOENT)
                        continue;

                if (__ocfs2_recovery_map_test(osb, node_num))
                        continue;
                spin_unlock(&osb->osb_lock);

                /* Ok, we have a slot occupied by another node which
                 * is not in the recovery map. We trylock its journal
                 * file here to test if it's alive. */
                status = ocfs2_trylock_journal(osb, i);
                if (!status) {
                        /* Since we're called from mount, we know that
                         * the recovery thread can't race us on
                         * setting / checking the recovery bits. */
                        ocfs2_recovery_thread(osb, node_num);
                } else if ((status < 0) && (status != -EAGAIN)) {
                        mlog_errno(status);
                        goto bail;
                }

                spin_lock(&osb->osb_lock);
        }
        spin_unlock(&osb->osb_lock);

        status = 0;
bail:
        mlog_exit(status);
        return status;
}

struct ocfs2_orphan_filldir_priv {
        struct inode            *head;
        struct ocfs2_super      *osb;
};

static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
                                loff_t pos, u64 ino, unsigned type)
{
        struct ocfs2_orphan_filldir_priv *p = priv;
        struct inode *iter;

        if (name_len == 1 && !strncmp(".", name, 1))
                return 0;
        if (name_len == 2 && !strncmp("..", name, 2))
                return 0;

        /* Skip bad inodes so that recovery can continue */
        iter = ocfs2_iget(p->osb, ino,
                          OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
        if (IS_ERR(iter))
                return 0;

        mlog(0, "queue orphan %llu\n",
             (unsigned long long)OCFS2_I(iter)->ip_blkno);
        /* No locking is required for the next_orphan queue as there
         * is only ever a single process doing orphan recovery. */
        OCFS2_I(iter)->ip_next_orphan = p->head;
        p->head = iter;

        return 0;
}
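
/* Walk the orphan dir for 'slot' under its cluster lock, building a
 * singly linked list of orphaned inodes through ip_next_orphan via the
 * filldir callback above. */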
static int ocfs2_queue_orphans(struct ocfs2_super *osb,
                               int slot,
                               struct inode **head)
{
        int status;
        struct inode *orphan_dir_inode = NULL;
        struct ocfs2_orphan_filldir_priv priv;
        loff_t pos = 0;

        priv.osb = osb;
        priv.head = *head;

        orphan_dir_inode = ocfs2_get_system_file_inode(osb,
                                                       ORPHAN_DIR_SYSTEM_INODE,
                                                       slot);
        if (!orphan_dir_inode) {
                status = -ENOENT;
                mlog_errno(status);
                return status;
        }

        mutex_lock(&orphan_dir_inode->i_mutex);
        status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
        if (status < 0) {
                mlog_errno(status);
                goto out;
        }

        status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv,
                                   ocfs2_orphan_filldir);
        if (status) {
                mlog_errno(status);
                goto out_cluster;
        }

        *head = priv.head;

out_cluster:
        ocfs2_inode_unlock(orphan_dir_inode, 0);
out:
        mutex_unlock(&orphan_dir_inode->i_mutex);
        iput(orphan_dir_inode);
        return status;
}

static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb,
                                              int slot)
{
        int ret;

        spin_lock(&osb->osb_lock);
        ret = !osb->osb_orphan_wipes[slot];
        spin_unlock(&osb->osb_lock);
        return ret;
}

static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb,
                                             int slot)
{
        spin_lock(&osb->osb_lock);
        /* Mark ourselves such that new processes in delete_inode()
         * know to quit early. */
        ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
        while (osb->osb_orphan_wipes[slot]) {
                /* If any processes are already in the middle of an
                 * orphan wipe on this dir, then we need to wait for
                 * them. */
                spin_unlock(&osb->osb_lock);
                wait_event_interruptible(osb->osb_wipe_event,
                                         ocfs2_orphan_recovery_can_continue(osb, slot));
                spin_lock(&osb->osb_lock);
        }
        spin_unlock(&osb->osb_lock);
}

static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
                                              int slot)
{
        ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
}

/*
 * Orphan recovery. Each mounted node has its own orphan dir which we
 * must run during recovery. Our strategy here is to build a list of
 * the inodes in the orphan dir and iget/iput them. The VFS does
 * (most) of the rest of the work.
 *
 * Orphan recovery can happen at any time, not just mount so we have a
 * couple of extra considerations.
 *
 * - We grab as many inodes as we can under the orphan dir lock -
 *   doing iget() outside the orphan dir risks getting a reference on
 *   an invalid inode.
 * - We must be sure not to deadlock with other processes on the
 *   system wanting to run delete_inode(). This can happen when they go
 *   to lock the orphan dir and the orphan recovery process attempts to
 *   iget() inside the orphan dir lock. This can be avoided by
 *   advertising our state to ocfs2_delete_inode().
 */
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                                 int slot)
{
        int ret = 0;
        struct inode *inode = NULL;
        struct inode *iter;
        struct ocfs2_inode_info *oi;

        mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);

        ocfs2_mark_recovering_orphan_dir(osb, slot);
        ret = ocfs2_queue_orphans(osb, slot, &inode);
        ocfs2_clear_recovering_orphan_dir(osb, slot);

        /* Error here should be noted, but we want to continue with as
         * many queued inodes as we've got. */
        if (ret)
                mlog_errno(ret);

        while (inode) {
                oi = OCFS2_I(inode);
                mlog(0, "iput orphan %llu\n", (unsigned long long)oi->ip_blkno);

                iter = oi->ip_next_orphan;

                spin_lock(&oi->ip_lock);
                /* The remote delete code may have set these on the
                 * assumption that the other node would wipe them
                 * successfully.  If they are still in the node's
                 * orphan dir, we need to reset that state. */
                oi->ip_flags &= ~(OCFS2_INODE_DELETED | OCFS2_INODE_SKIP_DELETE);

                /* Set the proper information to get us going into
                 * ocfs2_delete_inode. */
                oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
                spin_unlock(&oi->ip_lock);

                iput(inode);

                inode = iter;
        }

        return ret;
}

static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
{
        /* This check is good because ocfs2 will wait on our recovery
         * thread before changing it to something other than MOUNTED
         * or DISABLED. */
        wait_event(osb->osb_mount_event,
                   atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
                   atomic_read(&osb->vol_state) == VOLUME_DISABLED);

        /* If there's an error on mount, then we may never get to the
         * MOUNTED flag, but this is set right before
         * dismount_volume() so we can trust it. */
        if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
                mlog(0, "mount error, exiting!\n");
                return -EBUSY;
        }

        return 0;
}
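
/* Body of the ocfs2cmt kthread: sleep until transactions are pending
 * or a stop is requested, then flush them via ocfs2_commit_cache(). */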
static int ocfs2_commit_thread(void *arg)
{
        int status;
        struct ocfs2_super *osb = arg;
        struct ocfs2_journal *journal = osb->journal;

        /* we can trust j_num_trans here because _should_stop() is only set in
         * shutdown and nobody other than ourselves should be able to start
         * transactions.  committing on shutdown might take a few iterations
         * as final transactions put deleted inodes on the list */
        while (!(kthread_should_stop() &&
                 atomic_read(&journal->j_num_trans) == 0)) {

                wait_event_interruptible(osb->checkpoint_event,
                                         atomic_read(&journal->j_num_trans)
                                         || kthread_should_stop());

                status = ocfs2_commit_cache(osb);
                if (status < 0)
                        mlog_errno(status);

                if (kthread_should_stop() && atomic_read(&journal->j_num_trans)) {
                        mlog(ML_KTHREAD,
                             "commit_thread: %u transactions pending on "
                             "shutdown\n",
                             atomic_read(&journal->j_num_trans));
                }
        }

        return 0;
}

/* Look for a dirty journal without taking any cluster locks. Used for
 * hard readonly access to determine whether the file system journals
 * require recovery. */
int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
{
        int ret = 0;
        unsigned int slot;
        struct buffer_head *di_bh;
        struct ocfs2_dinode *di;
        struct inode *journal = NULL;

        for (slot = 0; slot < osb->max_slots; slot++) {
                journal = ocfs2_get_system_file_inode(osb,
                                                      JOURNAL_SYSTEM_INODE,
                                                      slot);
                if (!journal || is_bad_inode(journal)) {
                        ret = -EACCES;
                        mlog_errno(ret);
                        goto out;
                }

                di_bh = NULL;
                ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
                                       0, journal);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
                }

                di = (struct ocfs2_dinode *)di_bh->b_data;

                if (le32_to_cpu(di->id1.journal1.ij_flags) &
                    OCFS2_JOURNAL_DIRTY_FL)
                        ret = -EROFS;

                brelse(di_bh);
                if (ret)
                        break;
        }

out:
        if (journal)
                iput(journal);

        return ret;
}