2019-05-31 11:09:56 +03:00
// SPDX-License-Identifier: GPL-2.0-only
2006-01-16 19:50:04 +03:00
/*
* Copyright ( C ) Sistina Software , Inc . 1997 - 2003 All rights reserved .
2006-05-18 23:09:15 +04:00
* Copyright ( C ) 2004 - 2006 Red Hat , Inc . All rights reserved .
2006-01-16 19:50:04 +03:00
*/
2009-11-21 00:50:40 +03:00
# include <linux/module.h>
2006-01-16 19:50:04 +03:00
# include <linux/slab.h>
# include <linux/spinlock.h>
# include <linux/completion.h>
# include <linux/buffer_head.h>
2006-02-28 01:23:27 +03:00
# include <linux/gfs2_ondisk.h>
2006-03-28 23:14:04 +04:00
# include <linux/crc32.h>
2018-01-17 02:01:33 +03:00
# include <linux/crc32c.h>
2018-03-29 20:41:27 +03:00
# include <linux/ktime.h>
2006-01-16 19:50:04 +03:00
# include "gfs2.h"
2006-02-28 01:23:27 +03:00
# include "incore.h"
2006-01-16 19:50:04 +03:00
# include "bmap.h"
# include "glock.h"
# include "glops.h"
2017-12-18 21:48:29 +03:00
# include "log.h"
2006-01-16 19:50:04 +03:00
# include "lops.h"
# include "meta_io.h"
# include "recovery.h"
# include "super.h"
2006-02-28 01:23:27 +03:00
# include "util.h"
2006-03-28 23:14:04 +04:00
# include "dir.h"
2006-01-16 19:50:04 +03:00
2010-07-21 00:09:02 +04:00
/*
 * Workqueue handle used for journal recovery.
 * NOTE(review): presumably the queue on which gfs2_recover_func() work
 * items run -- confirm at the allocation/queueing sites, which are not
 * part of this section of the file.
 */
struct workqueue_struct * gfs_recovery_wq ;
2006-01-16 19:50:04 +03:00
int gfs2_replay_read_block ( struct gfs2_jdesc * jd , unsigned int blk ,
struct buffer_head * * bh )
{
2006-06-14 23:32:57 +04:00
struct gfs2_inode * ip = GFS2_I ( jd - > jd_inode ) ;
2006-02-28 01:23:27 +03:00
struct gfs2_glock * gl = ip - > i_gl ;
2006-09-04 20:49:07 +04:00
u64 dblock ;
u32 extlen ;
2006-01-16 19:50:04 +03:00
int error ;
2021-04-01 00:17:38 +03:00
extlen = 32 ;
error = gfs2_get_extent ( & ip - > i_inode , blk , & dblock , & extlen ) ;
2006-01-16 19:50:04 +03:00
if ( error )
return error ;
if ( ! dblock ) {
2006-02-28 01:23:27 +03:00
gfs2_consist_inode ( ip ) ;
2006-01-16 19:50:04 +03:00
return - EIO ;
}
2006-09-22 01:05:23 +04:00
* bh = gfs2_meta_ra ( gl , dblock , extlen ) ;
2006-01-16 19:50:04 +03:00
return error ;
}
2014-03-07 02:19:15 +04:00
/**
 * gfs2_revoke_add - record a revoke for a block on the journal's revoke list
 * @jd: the journal whose jd_revoke_list is updated
 * @blkno: the revoked block number
 * @where: value stored in the entry's rr_where
 *
 * If @blkno already has an entry, its rr_where is overwritten with @where.
 * Otherwise a new entry is allocated with GFP_NOFS and added to the list
 * with list_add().
 *
 * Returns: 1 if a new entry was added, 0 if an existing entry was updated,
 *          -ENOMEM if the allocation failed
 */
int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
{
	struct list_head *revokes = &jd->jd_revoke_list;
	struct gfs2_revoke_replay *entry;

	list_for_each_entry(entry, revokes, rr_list) {
		if (entry->rr_blkno != blkno)
			continue;
		/* Already revoked: only the position needs refreshing. */
		entry->rr_where = where;
		return 0;
	}

	entry = kmalloc(sizeof(*entry), GFP_NOFS);
	if (entry == NULL)
		return -ENOMEM;

	entry->rr_blkno = blkno;
	entry->rr_where = where;
	list_add(&entry->rr_list, revokes);
	return 1;
}
2014-03-07 02:19:15 +04:00
/**
 * gfs2_revoke_check - test whether a block at a given position is revoked
 * @jd: the journal whose jd_revoke_list and jd_replay_tail are consulted
 * @blkno: the block number to look up
 * @where: the position to test
 *
 * Looks @blkno up on the revoke list.  Without an entry the result is 0.
 * With an entry, the result says whether @where lies strictly between
 * jd_replay_tail and the entry's rr_where; when rr_where is below
 * jd_replay_tail the range is treated as wrapping around.
 *
 * Returns: non-zero if the block is revoked at @where, 0 otherwise
 */
int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
{
	struct gfs2_revoke_replay *entry;
	int after_tail, before_revoke;

	list_for_each_entry(entry, &jd->jd_revoke_list, rr_list) {
		if (entry->rr_blkno != blkno)
			continue;

		after_tail = (jd->jd_replay_tail < where);
		before_revoke = (where < entry->rr_where);

		/* Wrapped range: either side of the wrap point qualifies. */
		if (entry->rr_where < jd->jd_replay_tail)
			return after_tail || before_revoke;

		return after_tail && before_revoke;
	}

	return 0;
}
2014-03-07 02:19:15 +04:00
void gfs2_revoke_clean ( struct gfs2_jdesc * jd )
2006-01-16 19:50:04 +03:00
{
2014-03-07 02:19:15 +04:00
struct list_head * head = & jd - > jd_revoke_list ;
2006-01-16 19:50:04 +03:00
struct gfs2_revoke_replay * rr ;
while ( ! list_empty ( head ) ) {
2020-02-03 21:22:45 +03:00
rr = list_first_entry ( head , struct gfs2_revoke_replay , rr_list ) ;
2006-01-16 19:50:04 +03:00
list_del ( & rr - > rr_list ) ;
kfree ( rr ) ;
}
}
2018-11-09 18:54:18 +03:00
int __get_log_header ( struct gfs2_sbd * sdp , const struct gfs2_log_header * lh ,
unsigned int blkno , struct gfs2_log_header_host * head )
{
u32 hash , crc ;
if ( lh - > lh_header . mh_magic ! = cpu_to_be32 ( GFS2_MAGIC ) | |
lh - > lh_header . mh_type ! = cpu_to_be32 ( GFS2_METATYPE_LH ) | |
( blkno & & be32_to_cpu ( lh - > lh_blkno ) ! = blkno ) )
return 1 ;
hash = crc32 ( ~ 0 , lh , LH_V1_SIZE - 4 ) ;
hash = ~ crc32_le_shift ( hash , 4 ) ; /* assume lh_hash is zero */
if ( be32_to_cpu ( lh - > lh_hash ) ! = hash )
return 1 ;
crc = crc32c ( ~ 0 , ( void * ) lh + LH_V1_SIZE + 4 ,
sdp - > sd_sb . sb_bsize - LH_V1_SIZE - 4 ) ;
if ( ( lh - > lh_crc ! = 0 & & be32_to_cpu ( lh - > lh_crc ) ! = crc ) )
return 1 ;
head - > lh_sequence = be64_to_cpu ( lh - > lh_sequence ) ;
head - > lh_flags = be32_to_cpu ( lh - > lh_flags ) ;
head - > lh_tail = be32_to_cpu ( lh - > lh_tail ) ;
head - > lh_blkno = be32_to_cpu ( lh - > lh_blkno ) ;
2020-10-20 23:58:03 +03:00
head - > lh_local_total = be64_to_cpu ( lh - > lh_local_total ) ;
head - > lh_local_free = be64_to_cpu ( lh - > lh_local_free ) ;
head - > lh_local_dinodes = be64_to_cpu ( lh - > lh_local_dinodes ) ;
2018-11-09 18:54:18 +03:00
return 0 ;
}
2006-01-16 19:50:04 +03:00
/**
* get_log_header - read the log header for a given segment
* @ jd : the journal
* @ blk : the block to look at
2021-03-30 19:44:29 +03:00
* @ head : the log header to return
2006-01-16 19:50:04 +03:00
*
* Read the log header for a given segement in a given journal . Do a few
* sanity checks on it .
*
* Returns : 0 on success ,
* 1 if the header was invalid or incomplete ,
* errno on error
*/
static int get_log_header ( struct gfs2_jdesc * jd , unsigned int blk ,
2006-10-14 05:47:13 +04:00
struct gfs2_log_header_host * head )
2006-01-16 19:50:04 +03:00
{
2018-11-09 18:54:18 +03:00
struct gfs2_sbd * sdp = GFS2_SB ( jd - > jd_inode ) ;
2006-01-16 19:50:04 +03:00
struct buffer_head * bh ;
int error ;
error = gfs2_replay_read_block ( jd , blk , & bh ) ;
if ( error )
return error ;
2018-11-09 18:54:18 +03:00
error = __get_log_header ( sdp , ( const struct gfs2_log_header * ) bh - > b_data ,
blk , head ) ;
2018-01-17 01:07:57 +03:00
brelse ( bh ) ;
2006-01-16 19:50:04 +03:00
2018-01-17 01:07:57 +03:00
return error ;
2006-01-16 19:50:04 +03:00
}
/**
* foreach_descriptor - go through the active part of the log
* @ jd : the journal
* @ start : the first log header in the active region
* @ end : the last log header ( don ' t process the contents of this entry ) )
2021-03-30 19:44:29 +03:00
* @ pass : iteration number ( foreach_descriptor ( ) is called in a for ( ) loop )
2006-01-16 19:50:04 +03:00
*
* Call a given function once for every log descriptor in the active
* portion of the log .
*
* Returns : errno
*/
2019-03-25 18:34:19 +03:00
static int foreach_descriptor ( struct gfs2_jdesc * jd , u32 start ,
2006-01-16 19:50:04 +03:00
unsigned int end , int pass )
{
2006-06-14 23:32:57 +04:00
struct gfs2_sbd * sdp = GFS2_SB ( jd - > jd_inode ) ;
2006-01-16 19:50:04 +03:00
struct buffer_head * bh ;
struct gfs2_log_descriptor * ld ;
int error = 0 ;
u32 length ;
__be64 * ptr ;
unsigned int offset = sizeof ( struct gfs2_log_descriptor ) ;
2006-09-05 22:41:30 +04:00
offset + = sizeof ( __be64 ) - 1 ;
offset & = ~ ( sizeof ( __be64 ) - 1 ) ;
2006-01-16 19:50:04 +03:00
while ( start ! = end ) {
error = gfs2_replay_read_block ( jd , start , & bh ) ;
if ( error )
return error ;
if ( gfs2_meta_check ( sdp , bh ) ) {
brelse ( bh ) ;
return - EIO ;
}
ld = ( struct gfs2_log_descriptor * ) bh - > b_data ;
length = be32_to_cpu ( ld - > ld_length ) ;
2006-03-31 00:46:23 +04:00
if ( be32_to_cpu ( ld - > ld_header . mh_type ) = = GFS2_METATYPE_LH ) {
2006-10-14 05:47:13 +04:00
struct gfs2_log_header_host lh ;
2006-01-16 19:50:04 +03:00
error = get_log_header ( jd , start , & lh ) ;
if ( ! error ) {
2016-07-21 21:02:44 +03:00
gfs2_replay_incr_blk ( jd , & start ) ;
2006-08-10 20:08:40 +04:00
brelse ( bh ) ;
2006-01-16 19:50:04 +03:00
continue ;
}
if ( error = = 1 ) {
2006-06-14 23:32:57 +04:00
gfs2_consist_inode ( GFS2_I ( jd - > jd_inode ) ) ;
2006-01-16 19:50:04 +03:00
error = - EIO ;
}
brelse ( bh ) ;
return error ;
} else if ( gfs2_metatype_check ( sdp , bh , GFS2_METATYPE_LD ) ) {
brelse ( bh ) ;
return - EIO ;
}
ptr = ( __be64 * ) ( bh - > b_data + offset ) ;
error = lops_scan_elements ( jd , start , ld , ptr , pass ) ;
if ( error ) {
brelse ( bh ) ;
return error ;
}
while ( length - - )
2016-07-21 21:02:44 +03:00
gfs2_replay_incr_blk ( jd , & start ) ;
2006-01-16 19:50:04 +03:00
brelse ( bh ) ;
}
return 0 ;
}
/**
* clean_journal - mark a dirty journal as being clean
* @ jd : the journal
* @ head : the head journal to start from
*
* Returns : errno
*/
2017-12-18 21:48:29 +03:00
static void clean_journal ( struct gfs2_jdesc * jd ,
struct gfs2_log_header_host * head )
2006-01-16 19:50:04 +03:00
{
2006-06-14 23:32:57 +04:00
struct gfs2_sbd * sdp = GFS2_SB ( jd - > jd_inode ) ;
2019-03-25 18:34:19 +03:00
u32 lblock = head - > lh_blkno ;
2006-01-16 19:50:04 +03:00
2019-03-25 18:34:19 +03:00
gfs2_replay_incr_blk ( jd , & lblock ) ;
gfs2_write_log_header ( sdp , jd , head - > lh_sequence + 1 , 0 , lblock ,
2018-01-17 02:01:33 +03:00
GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_RECOVERY ,
REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC ) ;
2019-08-28 23:21:34 +03:00
if ( jd - > jd_jid = = sdp - > sd_lockstruct . ls_jid ) {
sdp - > sd_log_flush_head = lblock ;
gfs2_log_incr_head ( sdp ) ;
}
2006-01-16 19:50:04 +03:00
}
2008-01-30 18:34:04 +03:00
2009-01-12 13:43:39 +03:00
static void gfs2_recovery_done ( struct gfs2_sbd * sdp , unsigned int jid ,
unsigned int message )
2008-01-30 18:34:04 +03:00
{
2009-01-12 13:43:39 +03:00
char env_jid [ 20 ] ;
char env_status [ 20 ] ;
char * envp [ ] = { env_jid , env_status , NULL } ;
struct lm_lockstruct * ls = & sdp - > sd_lockstruct ;
2012-01-10 02:18:05 +04:00
2009-01-12 13:43:39 +03:00
ls - > ls_recover_jid_done = jid ;
ls - > ls_recover_jid_status = message ;
2015-01-12 14:01:03 +03:00
sprintf ( env_jid , " JID=%u " , jid ) ;
2009-01-12 13:43:39 +03:00
sprintf ( env_status , " RECOVERY=%s " ,
message = = LM_RD_SUCCESS ? " Done " : " Failed " ) ;
kobject_uevent_env ( & sdp - > sd_kobj , KOBJ_CHANGE , envp ) ;
2012-01-10 02:18:05 +04:00
if ( sdp - > sd_lockstruct . ls_ops - > lm_recovery_result )
sdp - > sd_lockstruct . ls_ops - > lm_recovery_result ( sdp , jid , message ) ;
2008-01-30 18:34:04 +03:00
}
2020-10-20 23:58:05 +03:00
/**
* update_statfs_inode - Update the master statfs inode or zero out the local
* statfs inode for a given journal .
* @ jd : The journal
* @ head : If NULL , @ inode is the local statfs inode and we need to zero it out .
* Otherwise , it @ head contains the statfs change info that needs to be
* synced to the master statfs inode ( pointed to by @ inode ) .
* @ inode : statfs inode to update .
*/
static int update_statfs_inode ( struct gfs2_jdesc * jd ,
struct gfs2_log_header_host * head ,
struct inode * inode )
{
struct gfs2_sbd * sdp = GFS2_SB ( jd - > jd_inode ) ;
struct gfs2_inode * ip ;
struct buffer_head * bh ;
struct gfs2_statfs_change_host sc ;
int error = 0 ;
BUG_ON ( ! inode ) ;
ip = GFS2_I ( inode ) ;
error = gfs2_meta_inode_buffer ( ip , & bh ) ;
if ( error )
goto out ;
spin_lock ( & sdp - > sd_statfs_spin ) ;
if ( head ) { /* Update the master statfs inode */
gfs2_statfs_change_in ( & sc , bh - > b_data + sizeof ( struct gfs2_dinode ) ) ;
sc . sc_total + = head - > lh_local_total ;
sc . sc_free + = head - > lh_local_free ;
sc . sc_dinodes + = head - > lh_local_dinodes ;
gfs2_statfs_change_out ( & sc , bh - > b_data + sizeof ( struct gfs2_dinode ) ) ;
fs_info ( sdp , " jid=%u: Updated master statfs Total:%lld, "
" Free:%lld, Dinodes:%lld after change "
" [%+lld,%+lld,%+lld] \n " , jd - > jd_jid , sc . sc_total ,
sc . sc_free , sc . sc_dinodes , head - > lh_local_total ,
head - > lh_local_free , head - > lh_local_dinodes ) ;
} else { /* Zero out the local statfs inode */
memset ( bh - > b_data + sizeof ( struct gfs2_dinode ) , 0 ,
sizeof ( struct gfs2_statfs_change ) ) ;
/* If it's our own journal, reset any in-memory changes too */
if ( jd - > jd_jid = = sdp - > sd_lockstruct . ls_jid ) {
memset ( & sdp - > sd_statfs_local , 0 ,
sizeof ( struct gfs2_statfs_change_host ) ) ;
}
}
spin_unlock ( & sdp - > sd_statfs_spin ) ;
mark_buffer_dirty ( bh ) ;
brelse ( bh ) ;
2020-10-27 20:29:37 +03:00
gfs2_inode_metasync ( ip - > i_gl ) ;
2020-10-20 23:58:05 +03:00
out :
return error ;
}
/**
* recover_local_statfs - Update the master and local statfs changes for this
* journal .
*
* Previously , statfs updates would be read in from the local statfs inode and
* synced to the master statfs inode during recovery .
*
* We now use the statfs updates in the journal head to update the master statfs
* inode instead of reading in from the local statfs inode . To preserve backward
* compatibility with kernels that can ' t do this , we still need to keep the
* local statfs inode up to date by writing changes to it . At some point in the
* future , we can do away with the local statfs inodes altogether and keep the
* statfs changes solely in the journal .
*
* @ jd : the journal
* @ head : the journal head
*
* Returns : errno
*/
static void recover_local_statfs ( struct gfs2_jdesc * jd ,
struct gfs2_log_header_host * head )
{
int error ;
struct gfs2_sbd * sdp = GFS2_SB ( jd - > jd_inode ) ;
if ( ! head - > lh_local_total & & ! head - > lh_local_free
& & ! head - > lh_local_dinodes ) /* No change */
goto zero_local ;
/* First update the master statfs inode with the changes we
* found in the journal . */
error = update_statfs_inode ( jd , head , sdp - > sd_statfs_inode ) ;
if ( error )
goto out ;
zero_local :
/* Zero out the local statfs inode so any changes in there
* are not re - recovered . */
error = update_statfs_inode ( jd , NULL ,
find_local_statfs_inode ( sdp , jd - > jd_jid ) ) ;
out :
return ;
}
2010-07-21 00:09:02 +04:00
void gfs2_recover_func ( struct work_struct * work )
2006-01-16 19:50:04 +03:00
{
2009-05-19 13:01:18 +04:00
struct gfs2_jdesc * jd = container_of ( work , struct gfs2_jdesc , jd_work ) ;
2006-06-14 23:32:57 +04:00
struct gfs2_inode * ip = GFS2_I ( jd - > jd_inode ) ;
struct gfs2_sbd * sdp = GFS2_SB ( jd - > jd_inode ) ;
2006-10-14 05:47:13 +04:00
struct gfs2_log_header_host head ;
GFS2: remove transaction glock
GFS2 has a transaction glock, which must be grabbed for every
transaction, whose purpose is to deal with freezing the filesystem.
Aside from this involving a large amount of locking, it is very easy to
make the current fsfreeze code hang on unfreezing.
This patch rewrites how gfs2 handles freezing the filesystem. The
transaction glock is removed. In its place is a freeze glock, which is
cached (but not held) in a shared state by every node in the cluster
when the filesystem is mounted. This lock only needs to be grabbed on
freezing, and actions which need to be safe from freezing, like
recovery.
When a node wants to freeze the filesystem, it grabs this glock
exclusively. When the freeze glock state changes on the nodes (either
from shared to unlocked, or shared to exclusive), the filesystem does a
special log flush. gfs2_log_flush() does all the work for flushing out
and shutting down the incore log, and then it tries to grab the
freeze glock in a shared state again. Since the filesystem is stuck in
gfs2_log_flush, no new transaction can start, and nothing can be written
to disk. Unfreezing the filesystem simply involves dropping the freeze
glock, allowing gfs2_log_flush() to grab and then release the shared
lock, so it is cached for next time.
However, in order for the unfreezing ioctl to occur, gfs2 needs to get a
shared lock on the filesystem root directory inode to check permissions.
If that glock has already been grabbed exclusively, fsfreeze will be
unable to get the shared lock and unfreeze the filesystem.
In order to allow the unfreeze, this patch makes gfs2 grab a shared lock
on the filesystem root directory during the freeze, and hold it until it
unfreezes the filesystem. The functions which need to grab a shared
lock in order to allow the unfreeze ioctl to be issued now use the lock
grabbed by the freeze code instead.
The freeze and unfreeze code take care to make sure that this shared
lock will not be dropped while another process is using it.
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2014-05-02 07:26:55 +04:00
struct gfs2_holder j_gh , ji_gh , thaw_gh ;
2018-03-29 20:41:27 +03:00
ktime_t t_start , t_jlck , t_jhd , t_tlck , t_rep ;
2006-01-16 19:50:04 +03:00
int ro = 0 ;
unsigned int pass ;
GFS2: Fix recovery issues for spectators
This patch fixes a couple problems dealing with spectators who
remain with gfs2 mounts after the last non-spectator node fails.
Before this patch, spectator mounts would try to acquire the dlm's
mounted lock EX as part of its normal recovery sequence.
The mounted lock is only used to determine whether the node is
the first mounter, the first node to mount the file system, for
the purposes of file system recovery and journal replay.
It's not necessary for spectators: they should never do journal
recovery. If they acquire the lock it will prevent another "real"
first-mounter from acquiring the lock in EX mode, which means it
also cannot do journal recovery because it doesn't think it's the
first node to mount the file system.
This patch checks if the mounter is a spectator, and if so, avoids
grabbing the mounted lock. This allows a secondary mounter who is
really the first non-spectator mounter, to do journal recovery:
since the spectator doesn't acquire the lock, it can grab it in
EX mode, and therefore consider itself to be the first mounter
both as a "real" first mount, and as a first-real-after-spectator.
Note that the control lock still needs to be taken in PR mode
in order to fetch the lvb value so it has the current status of
all journal's recovery. This is used as it is today by a first
mounter to replay the journals. For spectators, it's merely
used to fetch the status bits. All recovery is bypassed and the
node waits until recovery is completed by a non-spectator node.
I also improved the cryptic message given by control_mount when
a spectator is waiting for a non-spectator to perform recovery.
It also fixes a problem in gfs2_recover_set whereby spectators
were never queueing recovery work for their own journal.
They cannot do recovery themselves, but they still need to queue
the work so they can check the recovery bits and clear the
DFL_BLOCK_LOCKS bit once the recovery happens on another node.
When the work queue runs on a spectator, it bypasses most of the
work so it won't print a bunch of annoying messages. All it will
print is a bunch of messages that look like this until recovery
completes on the non-spectator node:
GFS2: fsid=mycluster:scratch.s: recover generation 3 jid 0
GFS2: fsid=mycluster:scratch.s: recover jid 0 result busy
These continue every 1.5 seconds until the recovery is done by
the non-spectator, at which time it says:
GFS2: fsid=mycluster:scratch.s: recover generation 4 done
Then it proceeds with its mount.
If the file system is mounted in spectator node and the last
remaining non-spectator is fenced, any IO to the file system is
blocked by dlm and the spectator waits until recovery is
performed by a non-spectator.
If a spectator tries to mount the file system before any
non-spectators, it blocks and repeatedly gives this kernel
message:
GFS2: fsid=mycluster:scratch: Recovery is required. Waiting for a non-spectator to mount.
GFS2: fsid=mycluster:scratch: Recovery is required. Waiting for a non-spectator to mount.
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2018-07-05 22:40:46 +03:00
int error = 0 ;
2010-09-29 17:20:52 +04:00
int jlocked = 0 ;
2006-01-16 19:50:04 +03:00
gfs2: Ignore dlm recovery requests if gfs2 is withdrawn
When a node fails, user space informs dlm of the node failure,
and dlm instructs gfs2 on the surviving nodes to perform journal
recovery. It does this by calling various callback functions in
lock_dlm.c. To mark its progress, it keeps generation numbers
and recover bits in a dlm "control" lock lvb, which is seen by
all nodes to determine which journals need to be replayed.
The gfs2 on all nodes get the same recovery requests from dlm,
so they all try to do the recovery, but only one will be
granted the exclusive lock on the journal. The others fail
with a "Busy" message on their "try lock."
However, when a node is withdrawn, it cannot safely do any
recovery or replay any journals. To make matters worse,
gfs2 might withdraw as a result of attempting recovery. For
example, this might happen if the device goes offline, or if
an hba fails. But in today's gfs2 code, it doesn't check for
being withdrawn at any step in the recovery process. What's
worse is that these callbacks from dlm have no return code,
so there is no way to indicate failure back to dlm. We can
send a "Recovery failed" uevent eventually, but that tells
user space what happened, not dlm's kernel code.
Before this patch, lock_dlm would perform its recovery steps but
ignore the result, and eventually it would still update its
generation number in the lvb, despite the fact that it may have
withdrawn or encountered an error. The other nodes would then
see the newer generation number in the lvb and conclude that
they don't need to do recovery because the generation number
is newer than the last one they saw. They think a different
node has already recovered the journal.
This patch adds checks to several of the callbacks used by dlm
in its recovery state machine so that the functions are ignored
and skipped if an io error has occurred or if the file system
is withdrawn. That prevents the lvb bits from being updated, and
therefore dlm and user space still see the need for recovery to
take place.
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Reviewed-by: Andreas Gruenbacher <agruenba@redhat.com>
2019-02-12 23:58:40 +03:00
if ( gfs2_withdrawn ( sdp ) ) {
fs_err ( sdp , " jid=%u: Recovery not attempted due to withdraw. \n " ,
jd - > jd_jid ) ;
goto fail ;
}
2018-03-29 20:41:27 +03:00
t_start = ktime_get ( ) ;
GFS2: Fix recovery issues for spectators
This patch fixes a couple problems dealing with spectators who
remain with gfs2 mounts after the last non-spectator node fails.
Before this patch, spectator mounts would try to acquire the dlm's
mounted lock EX as part of its normal recovery sequence.
The mounted lock is only used to determine whether the node is
the first mounter, the first node to mount the file system, for
the purposes of file system recovery and journal replay.
It's not necessary for spectators: they should never do journal
recovery. If they acquire the lock it will prevent another "real"
first-mounter from acquiring the lock in EX mode, which means it
also cannot do journal recovery because it doesn't think it's the
first node to mount the file system.
This patch checks if the mounter is a spectator, and if so, avoids
grabbing the mounted lock. This allows a secondary mounter who is
really the first non-spectator mounter, to do journal recovery:
since the spectator doesn't acquire the lock, it can grab it in
EX mode, and therefore consider itself to be the first mounter
both as a "real" first mount, and as a first-real-after-spectator.
Note that the control lock still needs to be taken in PR mode
in order to fetch the lvb value so it has the current status of
all journal's recovery. This is used as it is today by a first
mounter to replay the journals. For spectators, it's merely
used to fetch the status bits. All recovery is bypassed and the
node waits until recovery is completed by a non-spectator node.
I also improved the cryptic message given by control_mount when
a spectator is waiting for a non-spectator to perform recovery.
It also fixes a problem in gfs2_recover_set whereby spectators
were never queueing recovery work for their own journal.
They cannot do recovery themselves, but they still need to queue
the work so they can check the recovery bits and clear the
DFL_BLOCK_LOCKS bit once the recovery happens on another node.
When the work queue runs on a spectator, it bypasses most of the
work so it won't print a bunch of annoying messages. All it will
print is a bunch of messages that look like this until recovery
completes on the non-spectator node:
GFS2: fsid=mycluster:scratch.s: recover generation 3 jid 0
GFS2: fsid=mycluster:scratch.s: recover jid 0 result busy
These continue every 1.5 seconds until the recovery is done by
the non-spectator, at which time it says:
GFS2: fsid=mycluster:scratch.s: recover generation 4 done
Then it proceeds with its mount.
If the file system is mounted in spectator node and the last
remaining non-spectator is fenced, any IO to the file system is
blocked by dlm and the spectator waits until recovery is
performed by a non-spectator.
If a spectator tries to mount the file system before any
non-spectators, it blocks and repeatedly gives this kernel
message:
GFS2: fsid=mycluster:scratch: Recovery is required. Waiting for a non-spectator to mount.
GFS2: fsid=mycluster:scratch: Recovery is required. Waiting for a non-spectator to mount.
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2018-07-05 22:40:46 +03:00
if ( sdp - > sd_args . ar_spectator )
goto fail ;
if ( jd - > jd_jid ! = sdp - > sd_lockstruct . ls_jid ) {
2006-04-26 21:21:55 +04:00
fs_info ( sdp , " jid=%u: Trying to acquire journal lock... \n " ,
jd - > jd_jid ) ;
2010-09-29 17:20:52 +04:00
jlocked = 1 ;
2008-02-03 18:33:42 +03:00
/* Acquire the journal lock so we can do recovery */
2006-01-16 19:50:04 +03:00
2006-04-26 21:21:55 +04:00
error = gfs2_glock_nq_num ( sdp , jd - > jd_jid , & gfs2_journal_glops ,
LM_ST_EXCLUSIVE ,
LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE ,
& j_gh ) ;
switch ( error ) {
case 0 :
break ;
2006-09-25 17:26:04 +04:00
2006-04-26 21:21:55 +04:00
case GLR_TRYFAILED :
fs_info ( sdp , " jid=%u: Busy \n " , jd - > jd_jid ) ;
error = 0 ;
2020-11-20 21:25:03 +03:00
goto fail ;
2006-09-25 17:26:04 +04:00
2006-04-26 21:21:55 +04:00
default :
goto fail ;
2019-10-04 18:55:29 +03:00
}
2006-01-16 19:50:04 +03:00
2006-04-26 21:21:55 +04:00
error = gfs2_glock_nq_init ( ip - > i_gl , LM_ST_SHARED ,
2007-08-09 02:08:14 +04:00
LM_FLAG_NOEXP | GL_NOCACHE , & ji_gh ) ;
2006-04-26 21:21:55 +04:00
if ( error )
goto fail_gunlock_j ;
} else {
fs_info ( sdp , " jid=%u, already locked for use \n " , jd - > jd_jid ) ;
}
2006-01-16 19:50:04 +03:00
2018-03-29 20:41:27 +03:00
t_jlck = ktime_get ( ) ;
2006-01-16 19:50:04 +03:00
fs_info ( sdp , " jid=%u: Looking at journal... \n " , jd - > jd_jid ) ;
error = gfs2_jdesc_check ( jd ) ;
if ( error )
goto fail_gunlock_ji ;
2019-05-02 22:17:40 +03:00
error = gfs2_find_jhead ( jd , & head , true ) ;
2006-01-16 19:50:04 +03:00
if ( error )
goto fail_gunlock_ji ;
2018-03-29 20:41:27 +03:00
t_jhd = ktime_get ( ) ;
2018-11-09 18:35:14 +03:00
fs_info ( sdp , " jid=%u: Journal head lookup took %lldms \n " , jd - > jd_jid ,
ktime_ms_delta ( t_jhd , t_jlck ) ) ;
2006-01-16 19:50:04 +03:00
if ( ! ( head . lh_flags & GFS2_LOG_HEAD_UNMOUNT ) ) {
fs_info ( sdp , " jid=%u: Acquiring the transaction lock... \n " ,
jd - > jd_jid ) ;
GFS2: remove transaction glock
GFS2 has a transaction glock, which must be grabbed for every
transaction, whose purpose is to deal with freezing the filesystem.
Aside from this involving a large amount of locking, it is very easy to
make the current fsfreeze code hang on unfreezing.
This patch rewrites how gfs2 handles freezing the filesystem. The
transaction glock is removed. In its place is a freeze glock, which is
cached (but not held) in a shared state by every node in the cluster
when the filesystem is mounted. This lock only needs to be grabbed on
freezing, and actions which need to be safe from freezing, like
recovery.
When a node wants to freeze the filesystem, it grabs this glock
exclusively. When the freeze glock state changes on the nodes (either
from shared to unlocked, or shared to exclusive), the filesystem does a
special log flush. gfs2_log_flush() does all the work for flushing out
and shutting down the incore log, and then it tries to grab the
freeze glock in a shared state again. Since the filesystem is stuck in
gfs2_log_flush, no new transaction can start, and nothing can be written
to disk. Unfreezing the filesystem simply involves dropping the freeze
glock, allowing gfs2_log_flush() to grab and then release the shared
lock, so it is cached for next time.
However, in order for the unfreezing ioctl to occur, gfs2 needs to get a
shared lock on the filesystem root directory inode to check permissions.
If that glock has already been grabbed exclusively, fsfreeze will be
unable to get the shared lock and unfreeze the filesystem.
In order to allow the unfreeze, this patch makes gfs2 grab a shared lock
on the filesystem root directory during the freeze, and hold it until it
unfreezes the filesystem. The functions which need to grab a shared
lock in order to allow the unfreeze ioctl to be issued now use the lock
grabbed by the freeze code instead.
The freeze and unfreeze code take care to make sure that this shared
lock will not be dropped while another process is using it.
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2014-05-02 07:26:55 +04:00
/* Acquire a shared hold on the freeze lock */
2006-01-16 19:50:04 +03:00
2020-12-22 23:43:27 +03:00
error = gfs2_freeze_lock ( sdp , & thaw_gh , LM_FLAG_PRIORITY ) ;
2006-01-16 19:50:04 +03:00
if ( error )
goto fail_gunlock_ji ;
2012-01-09 23:40:06 +04:00
if ( test_bit ( SDF_RORECOVERY , & sdp - > sd_flags ) ) {
ro = 1 ;
} else if ( test_bit ( SDF_JOURNAL_CHECKED , & sdp - > sd_flags ) ) {
2006-01-16 19:50:04 +03:00
if ( ! test_bit ( SDF_JOURNAL_LIVE , & sdp - > sd_flags ) )
ro = 1 ;
} else {
2017-07-17 10:45:34 +03:00
if ( sb_rdonly ( sdp - > sd_vfs ) ) {
2008-01-18 23:06:37 +03:00
/* check if device itself is read-only */
ro = bdev_read_only ( sdp - > sd_vfs - > s_bdev ) ;
if ( ! ro ) {
fs_info ( sdp , " recovery required on "
" read-only filesystem. \n " ) ;
fs_info ( sdp , " write access will be "
" enabled during recovery. \n " ) ;
}
}
2006-01-16 19:50:04 +03:00
}
if ( ro ) {
2008-01-18 23:06:37 +03:00
fs_warn ( sdp , " jid=%u: Can't replay: read-only block "
" device \n " , jd - > jd_jid ) ;
2006-01-16 19:50:04 +03:00
error = - EROFS ;
GFS2: remove transaction glock
GFS2 has a transaction glock, which must be grabbed for every
transaction, whose purpose is to deal with freezing the filesystem.
Aside from this involving a large amount of locking, it is very easy to
make the current fsfreeze code hang on unfreezing.
This patch rewrites how gfs2 handles freezing the filesystem. The
transaction glock is removed. In its place is a freeze glock, which is
cached (but not held) in a shared state by every node in the cluster
when the filesystem is mounted. This lock only needs to be grabbed on
freezing, and actions which need to be safe from freezing, like
recovery.
When a node wants to freeze the filesystem, it grabs this glock
exclusively. When the freeze glock state changes on the nodes (either
from shared to unlocked, or shared to exclusive), the filesystem does a
special log flush. gfs2_log_flush() does all the work for flushing out
and shutting down the incore log, and then it tries to grab the
freeze glock in a shared state again. Since the filesystem is stuck in
gfs2_log_flush, no new transaction can start, and nothing can be written
to disk. Unfreezing the filesystem simply involves dropping the freeze
glock, allowing gfs2_log_flush() to grab and then release the shared
lock, so it is cached for next time.
However, in order for the unfreezing ioctl to occur, gfs2 needs to get a
shared lock on the filesystem root directory inode to check permissions.
If that glock has already been grabbed exclusively, fsfreeze will be
unable to get the shared lock and unfreeze the filesystem.
In order to allow the unfreeze, this patch makes gfs2 grab a shared lock
on the filesystem root directory during the freeze, and hold it until it
unfreezes the filesystem. The functions which need to grab a shared
lock in order to allow the unfreeze ioctl to be issued now use the lock
grabbed by the freeze code instead.
The freeze and unfreeze code take care to make sure that this shared
lock will not be dropped while another process is using it.
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2014-05-02 07:26:55 +04:00
goto fail_gunlock_thaw ;
2006-01-16 19:50:04 +03:00
}
2018-03-29 20:41:27 +03:00
t_tlck = ktime_get ( ) ;
2019-02-27 23:32:36 +03:00
fs_info ( sdp , " jid=%u: Replaying journal...0x%x to 0x%x \n " ,
jd - > jd_jid , head . lh_tail , head . lh_blkno ) ;
2006-01-16 19:50:04 +03:00
2020-02-17 23:15:05 +03:00
/* We take the sd_log_flush_lock here primarily to prevent log
* flushes and simultaneous journal replays from stomping on
2021-01-21 18:10:26 +03:00
* each other wrt jd_log_bio . */
2020-03-26 20:22:05 +03:00
down_read ( & sdp - > sd_log_flush_lock ) ;
2006-01-16 19:50:04 +03:00
for ( pass = 0 ; pass < 2 ; pass + + ) {
lops_before_scan ( jd , & head , pass ) ;
error = foreach_descriptor ( jd , head . lh_tail ,
head . lh_blkno , pass ) ;
lops_after_scan ( jd , error , pass ) ;
2021-02-05 20:11:28 +03:00
if ( error ) {
up_read ( & sdp - > sd_log_flush_lock ) ;
GFS2: remove transaction glock
GFS2 has a transaction glock, which must be grabbed for every
transaction, whose purpose is to deal with freezing the filesystem.
Aside from this involving a large amount of locking, it is very easy to
make the current fsfreeze code hang on unfreezing.
This patch rewrites how gfs2 handles freezing the filesystem. The
transaction glock is removed. In its place is a freeze glock, which is
cached (but not held) in a shared state by every node in the cluster
when the filesystem is mounted. This lock only needs to be grabbed on
freezing, and actions which need to be safe from freezing, like
recovery.
When a node wants to freeze the filesystem, it grabs this glock
exclusively. When the freeze glock state changes on the nodes (either
from shared to unlocked, or shared to exclusive), the filesystem does a
special log flush. gfs2_log_flush() does all the work for flushing out
and shutting down the incore log, and then it tries to grab the
freeze glock in a shared state again. Since the filesystem is stuck in
gfs2_log_flush, no new transaction can start, and nothing can be written
to disk. Unfreezing the filesystem simply involves dropping the freeze
glock, allowing gfs2_log_flush() to grab and then release the shared
lock, so it is cached for next time.
However, in order for the unfreezing ioctl to occur, gfs2 needs to get a
shared lock on the filesystem root directory inode to check permissions.
If that glock has already been grabbed exclusively, fsfreeze will be
unable to get the shared lock and unfreeze the filesystem.
In order to allow the unfreeze, this patch makes gfs2 grab a shared lock
on the filesystem root directory during the freeze, and hold it until it
unfreezes the filesystem. The functions which need to grab a shared
lock in order to allow the unfreeze ioctl to be issued now use the lock
grabbed by the freeze code instead.
The freeze and unfreeze code take care to make sure that this shared
lock will not be dropped while another process is using it.
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2014-05-02 07:26:55 +04:00
goto fail_gunlock_thaw ;
2021-02-05 20:11:28 +03:00
}
2006-01-16 19:50:04 +03:00
}
2020-10-20 23:58:05 +03:00
recover_local_statfs ( jd , & head ) ;
2017-12-18 21:48:29 +03:00
clean_journal ( jd , & head ) ;
2020-03-26 20:22:05 +03:00
up_read ( & sdp - > sd_log_flush_lock ) ;
2006-01-16 19:50:04 +03:00
2020-12-22 23:43:27 +03:00
gfs2_freeze_unlock ( & thaw_gh ) ;
2018-03-29 20:41:27 +03:00
t_rep = ktime_get ( ) ;
fs_info ( sdp , " jid=%u: Journal replayed in %lldms [jlck:%lldms, "
" jhead:%lldms, tlck:%lldms, replay:%lldms] \n " ,
jd - > jd_jid , ktime_ms_delta ( t_rep , t_start ) ,
ktime_ms_delta ( t_jlck , t_start ) ,
ktime_ms_delta ( t_jhd , t_jlck ) ,
ktime_ms_delta ( t_tlck , t_jhd ) ,
ktime_ms_delta ( t_rep , t_tlck ) ) ;
2006-01-16 19:50:04 +03:00
}
2009-01-12 13:43:39 +03:00
gfs2_recovery_done ( sdp , jd - > jd_jid , LM_RD_SUCCESS ) ;
2006-01-16 19:50:04 +03:00
2010-09-29 17:20:52 +04:00
if ( jlocked ) {
gfs2_glock_dq_uninit ( & ji_gh ) ;
2006-04-26 21:21:55 +04:00
gfs2_glock_dq_uninit ( & j_gh ) ;
2010-09-29 17:20:52 +04:00
}
2006-01-16 19:50:04 +03:00
fs_info ( sdp , " jid=%u: Done \n " , jd - > jd_jid ) ;
2010-07-21 00:09:02 +04:00
goto done ;
2006-01-16 19:50:04 +03:00
GFS2: remove transaction glock
GFS2 has a transaction glock, which must be grabbed for every
transaction, whose purpose is to deal with freezing the filesystem.
Aside from this involving a large amount of locking, it is very easy to
make the current fsfreeze code hang on unfreezing.
This patch rewrites how gfs2 handles freezing the filesystem. The
transaction glock is removed. In its place is a freeze glock, which is
cached (but not held) in a shared state by every node in the cluster
when the filesystem is mounted. This lock only needs to be grabbed on
freezing, and actions which need to be safe from freezing, like
recovery.
When a node wants to freeze the filesystem, it grabs this glock
exclusively. When the freeze glock state changes on the nodes (either
from shared to unlocked, or shared to exclusive), the filesystem does a
special log flush. gfs2_log_flush() does all the work for flushing out
and shutting down the incore log, and then it tries to grab the
freeze glock in a shared state again. Since the filesystem is stuck in
gfs2_log_flush, no new transaction can start, and nothing can be written
to disk. Unfreezing the filesystem simply involves dropping the freeze
glock, allowing gfs2_log_flush() to grab and then release the shared
lock, so it is cached for next time.
However, in order for the unfreezing ioctl to occur, gfs2 needs to get a
shared lock on the filesystem root directory inode to check permissions.
If that glock has already been grabbed exclusively, fsfreeze will be
unable to get the shared lock and unfreeze the filesystem.
In order to allow the unfreeze, this patch makes gfs2 grab a shared lock
on the filesystem root directory during the freeze, and hold it until it
unfreezes the filesystem. The functions which need to grab a shared
lock in order to allow the unfreeze ioctl to be issued now use the lock
grabbed by the freeze code instead.
The freeze and unfreeze code take care to make sure that this shared
lock will not be dropped while another process is using it.
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2014-05-02 07:26:55 +04:00
fail_gunlock_thaw :
2020-12-22 23:43:27 +03:00
gfs2_freeze_unlock ( & thaw_gh ) ;
2006-04-26 21:21:55 +04:00
fail_gunlock_ji :
2010-09-29 17:20:52 +04:00
if ( jlocked ) {
2006-04-26 21:21:55 +04:00
gfs2_glock_dq_uninit ( & ji_gh ) ;
fail_gunlock_j :
gfs2_glock_dq_uninit ( & j_gh ) ;
}
2006-01-16 19:50:04 +03:00
fs_info ( sdp , " jid=%u: %s \n " , jd - > jd_jid , ( error ) ? " Failed " : " Done " ) ;
2006-04-26 21:21:55 +04:00
fail :
2012-01-10 00:29:20 +04:00
jd - > jd_recover_error = error ;
2009-01-12 13:43:39 +03:00
gfs2_recovery_done ( sdp , jd - > jd_jid , LM_RD_GAVEUP ) ;
2010-07-21 00:09:02 +04:00
done :
clear_bit ( JDF_RECOVERY , & jd - > jd_flags ) ;
2014-03-17 21:06:10 +04:00
smp_mb__after_atomic ( ) ;
2010-07-21 00:09:02 +04:00
wake_up_bit ( & jd - > jd_flags , JDF_RECOVERY ) ;
2006-01-16 19:50:04 +03:00
}
2010-07-21 00:09:02 +04:00
/**
 * gfs2_recover_journal - queue recovery work for a journal, optionally waiting
 * @jd: the journal descriptor; its jd_flags, jd_work and jd_recover_error
 *      fields are used here
 * @wait: if true, sleep (TASK_UNINTERRUPTIBLE) until JDF_RECOVERY is cleared
 *        in @jd->jd_flags before returning
 *
 * Sets JDF_RECOVERY in @jd->jd_flags and queues @jd->jd_work on
 * gfs_recovery_wq.
 *
 * Returns: -EBUSY if JDF_RECOVERY was already set; otherwise 0 when @wait
 *          is false, or the value of @jd->jd_recover_error when @wait is true
 */
int gfs2_recover_journal ( struct gfs2_jdesc * jd , bool wait )
2008-11-19 13:08:22 +03:00
{
2009-05-19 13:01:18 +04:00
int rv ;
2010-07-21 00:09:02 +04:00
/* JDF_RECOVERY acts as the "already requested" marker: if it was set
 * before this call, refuse instead of queueing the work a second time. */
if ( test_and_set_bit ( JDF_RECOVERY , & jd - > jd_flags ) )
return - EBUSY ;
/* we have JDF_RECOVERY, queue should always succeed */
rv = queue_work ( gfs_recovery_wq , & jd - > jd_work ) ;
BUG_ON ( ! rv ) ;
/* The end of the preceding function in this file clears JDF_RECOVERY and
 * calls wake_up_bit() on it; presumably that function is the jd_work
 * handler, which is what ends this wait -- TODO confirm. */
if ( wait )
sched: Remove proliferation of wait_on_bit() action functions
The current "wait_on_bit" interface requires an 'action'
function to be provided which does the actual waiting.
There are over 20 such functions, many of them identical.
Most cases can be satisfied by one of just two functions, one
which uses io_schedule() and one which just uses schedule().
So:
Rename wait_on_bit and wait_on_bit_lock to
wait_on_bit_action and wait_on_bit_lock_action
to make it explicit that they need an action function.
Introduce new wait_on_bit{,_lock} and wait_on_bit{,_lock}_io
which are *not* given an action function but implicitly use
a standard one.
The decision to error-out if a signal is pending is now made
based on the 'mode' argument rather than being encoded in the action
function.
All instances of the old wait_on_bit and wait_on_bit_lock which
can use the new version have been changed accordingly and their
action functions have been discarded.
wait_on_bit{_lock} does not return any specific error code in the
event of a signal so the caller must check for non-zero and
interpolate their own error code as appropriate.
The wait_on_bit() call in __fscache_wait_on_invalidate() was
ambiguous as it specified TASK_UNINTERRUPTIBLE but used
fscache_wait_bit_interruptible as an action function.
David Howells confirms this should be uniformly
"uninterruptible"
The main remaining user of wait_on_bit{,_lock}_action is NFS
which needs to use a freezer-aware schedule() call.
A comment in fs/gfs2/glock.c notes that having multiple 'action'
functions is useful as they display differently in the 'wchan'
field of 'ps'. (and /proc/$PID/wchan).
As the new bit_wait{,_io} functions are tagged "__sched", they
will not show up at all, but something higher in the stack. So
the distinction will still be visible, only with different
function names (gds2_glock_wait versus gfs2_glock_dq_wait in the
gfs2/glock.c case).
Since first version of this patch (against 3.15) two new action
functions appeared, on in NFS and one in CIFS. CIFS also now
uses an action function that makes the same freezer aware
schedule call as NFS.
Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: David Howells <dhowells@redhat.com> (fscache, keys)
Acked-by: Steven Whitehouse <swhiteho@redhat.com> (gfs2)
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steve French <sfrench@samba.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20140707051603.28027.72349.stgit@notabene.brown
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-07-07 09:16:04 +04:00
wait_on_bit ( & jd - > jd_flags , JDF_RECOVERY ,
2010-07-21 00:09:02 +04:00
TASK_UNINTERRUPTIBLE ) ;
2012-01-10 00:29:20 +04:00
/* A caller that did not wait gets 0 even though the queued work may
 * still be running or may fail later.
 * NOTE(review): the only store to jd_recover_error visible in this part
 * of the file is on the failure path of the preceding function; confirm
 * that a later successful run cannot leave a stale error to be returned
 * here. */
return wait ? jd - > jd_recover_error : 0 ;
2008-11-19 13:08:22 +03:00
}