2019-05-31 11:09:56 +03:00
// SPDX-License-Identifier: GPL-2.0-only
2006-01-16 19:50:04 +03:00
/*
* Copyright ( C ) Sistina Software , Inc . 1997 - 2003 All rights reserved .
2006-05-18 23:09:15 +04:00
* Copyright ( C ) 2004 - 2006 Red Hat , Inc . All rights reserved .
2006-01-16 19:50:04 +03:00
*/
2009-11-21 00:50:40 +03:00
# include <linux/module.h>
2006-01-16 19:50:04 +03:00
# include <linux/slab.h>
# include <linux/spinlock.h>
# include <linux/completion.h>
# include <linux/buffer_head.h>
2006-02-28 01:23:27 +03:00
# include <linux/gfs2_ondisk.h>
2006-03-28 23:14:04 +04:00
# include <linux/crc32.h>
2018-01-17 02:01:33 +03:00
# include <linux/crc32c.h>
2018-03-29 20:41:27 +03:00
# include <linux/ktime.h>
2006-01-16 19:50:04 +03:00
# include "gfs2.h"
2006-02-28 01:23:27 +03:00
# include "incore.h"
2006-01-16 19:50:04 +03:00
# include "bmap.h"
# include "glock.h"
# include "glops.h"
2017-12-18 21:48:29 +03:00
# include "log.h"
2006-01-16 19:50:04 +03:00
# include "lops.h"
# include "meta_io.h"
# include "recovery.h"
# include "super.h"
2006-02-28 01:23:27 +03:00
# include "util.h"
2006-03-28 23:14:04 +04:00
# include "dir.h"
2006-01-16 19:50:04 +03:00
2023-08-28 19:02:22 +03:00
/*
 * NOTE(review): presumably the workqueue that journal recovery work
 * (gfs2_recover_func) is queued on; it is neither created nor used in
 * this part of the file -- confirm against the callers.
 */
struct workqueue_struct * gfs2_recovery_wq ;
2010-07-21 00:09:02 +04:00
2006-01-16 19:50:04 +03:00
int gfs2_replay_read_block ( struct gfs2_jdesc * jd , unsigned int blk ,
struct buffer_head * * bh )
{
2006-06-14 23:32:57 +04:00
struct gfs2_inode * ip = GFS2_I ( jd - > jd_inode ) ;
2006-02-28 01:23:27 +03:00
struct gfs2_glock * gl = ip - > i_gl ;
2006-09-04 20:49:07 +04:00
u64 dblock ;
u32 extlen ;
2006-01-16 19:50:04 +03:00
int error ;
2021-04-01 00:17:38 +03:00
extlen = 32 ;
error = gfs2_get_extent ( & ip - > i_inode , blk , & dblock , & extlen ) ;
2006-01-16 19:50:04 +03:00
if ( error )
return error ;
if ( ! dblock ) {
2006-02-28 01:23:27 +03:00
gfs2_consist_inode ( ip ) ;
2006-01-16 19:50:04 +03:00
return - EIO ;
}
2006-09-22 01:05:23 +04:00
* bh = gfs2_meta_ra ( gl , dblock , extlen ) ;
2006-01-16 19:50:04 +03:00
return error ;
}
2014-03-07 02:19:15 +04:00
/**
 * gfs2_revoke_add - record a revoke on the journal's revoke list
 * @jd: the journal whose jd_revoke_list is updated
 * @blkno: the revoked block number
 * @where: value to store in the entry's rr_where
 *
 * When the list already holds an entry for @blkno only its rr_where is
 * overwritten; otherwise a new entry is allocated with GFP_NOFS and added
 * to the list.
 *
 * Returns: 0 if an existing entry was updated, 1 if a new entry was added,
 *          -ENOMEM if the allocation failed
 */
int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
{
	struct list_head *revokes = &jd->jd_revoke_list;
	struct gfs2_revoke_replay *entry;

	list_for_each_entry(entry, revokes, rr_list) {
		if (entry->rr_blkno != blkno)
			continue;

		/* Already listed: only the recorded position changes. */
		entry->rr_where = where;
		return 0;
	}

	entry = kmalloc(sizeof(*entry), GFP_NOFS);
	if (entry == NULL)
		return -ENOMEM;

	entry->rr_blkno = blkno;
	entry->rr_where = where;
	list_add(&entry->rr_list, revokes);

	return 1;
}
2014-03-07 02:19:15 +04:00
/**
 * gfs2_revoke_check - test whether a block at a given position is revoked
 * @jd: the journal whose jd_revoke_list is searched
 * @blkno: the block number to look up
 * @where: the position to test against the recorded revoke
 *
 * Returns: 0 if @blkno has no entry on the revoke list.  Otherwise non-zero
 *          when @where is after jd_replay_tail and before the entry's
 *          rr_where; if rr_where is below jd_replay_tail (the wrapped
 *          case) either of the two conditions is sufficient.
 */
int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
{
	struct gfs2_revoke_replay *hit = NULL, *cur;
	int after_tail, before_revoke;

	list_for_each_entry(cur, &jd->jd_revoke_list, rr_list) {
		if (cur->rr_blkno == blkno) {
			hit = cur;
			break;
		}
	}

	if (hit == NULL)
		return 0;

	after_tail = (jd->jd_replay_tail < where);
	before_revoke = (where < hit->rr_where);

	/* Revoke position is below the tail: the range wraps around. */
	if (hit->rr_where < jd->jd_replay_tail)
		return after_tail || before_revoke;

	return after_tail && before_revoke;
}
2014-03-07 02:19:15 +04:00
void gfs2_revoke_clean ( struct gfs2_jdesc * jd )
2006-01-16 19:50:04 +03:00
{
2014-03-07 02:19:15 +04:00
struct list_head * head = & jd - > jd_revoke_list ;
2006-01-16 19:50:04 +03:00
struct gfs2_revoke_replay * rr ;
while ( ! list_empty ( head ) ) {
2020-02-03 21:22:45 +03:00
rr = list_first_entry ( head , struct gfs2_revoke_replay , rr_list ) ;
2006-01-16 19:50:04 +03:00
list_del ( & rr - > rr_list ) ;
kfree ( rr ) ;
}
}
2018-11-09 18:54:18 +03:00
int __get_log_header ( struct gfs2_sbd * sdp , const struct gfs2_log_header * lh ,
unsigned int blkno , struct gfs2_log_header_host * head )
{
u32 hash , crc ;
if ( lh - > lh_header . mh_magic ! = cpu_to_be32 ( GFS2_MAGIC ) | |
lh - > lh_header . mh_type ! = cpu_to_be32 ( GFS2_METATYPE_LH ) | |
( blkno & & be32_to_cpu ( lh - > lh_blkno ) ! = blkno ) )
return 1 ;
hash = crc32 ( ~ 0 , lh , LH_V1_SIZE - 4 ) ;
hash = ~ crc32_le_shift ( hash , 4 ) ; /* assume lh_hash is zero */
if ( be32_to_cpu ( lh - > lh_hash ) ! = hash )
return 1 ;
crc = crc32c ( ~ 0 , ( void * ) lh + LH_V1_SIZE + 4 ,
sdp - > sd_sb . sb_bsize - LH_V1_SIZE - 4 ) ;
if ( ( lh - > lh_crc ! = 0 & & be32_to_cpu ( lh - > lh_crc ) ! = crc ) )
return 1 ;
head - > lh_sequence = be64_to_cpu ( lh - > lh_sequence ) ;
head - > lh_flags = be32_to_cpu ( lh - > lh_flags ) ;
head - > lh_tail = be32_to_cpu ( lh - > lh_tail ) ;
head - > lh_blkno = be32_to_cpu ( lh - > lh_blkno ) ;
2020-10-20 23:58:03 +03:00
head - > lh_local_total = be64_to_cpu ( lh - > lh_local_total ) ;
head - > lh_local_free = be64_to_cpu ( lh - > lh_local_free ) ;
head - > lh_local_dinodes = be64_to_cpu ( lh - > lh_local_dinodes ) ;
2018-11-09 18:54:18 +03:00
return 0 ;
}
2006-01-16 19:50:04 +03:00
/**
* get_log_header - read the log header for a given segment
* @ jd : the journal
* @ blk : the block to look at
2021-03-30 19:44:29 +03:00
* @ head : the log header to return
2006-01-16 19:50:04 +03:00
*
* Read the log header for a given segement in a given journal . Do a few
* sanity checks on it .
*
* Returns : 0 on success ,
* 1 if the header was invalid or incomplete ,
* errno on error
*/
static int get_log_header ( struct gfs2_jdesc * jd , unsigned int blk ,
2006-10-14 05:47:13 +04:00
struct gfs2_log_header_host * head )
2006-01-16 19:50:04 +03:00
{
2018-11-09 18:54:18 +03:00
struct gfs2_sbd * sdp = GFS2_SB ( jd - > jd_inode ) ;
2006-01-16 19:50:04 +03:00
struct buffer_head * bh ;
int error ;
error = gfs2_replay_read_block ( jd , blk , & bh ) ;
if ( error )
return error ;
2018-11-09 18:54:18 +03:00
error = __get_log_header ( sdp , ( const struct gfs2_log_header * ) bh - > b_data ,
blk , head ) ;
2018-01-17 01:07:57 +03:00
brelse ( bh ) ;
2006-01-16 19:50:04 +03:00
2018-01-17 01:07:57 +03:00
return error ;
2006-01-16 19:50:04 +03:00
}
/**
* foreach_descriptor - go through the active part of the log
* @ jd : the journal
* @ start : the first log header in the active region
* @ end : the last log header ( don ' t process the contents of this entry ) )
2021-03-30 19:44:29 +03:00
* @ pass : iteration number ( foreach_descriptor ( ) is called in a for ( ) loop )
2006-01-16 19:50:04 +03:00
*
* Call a given function once for every log descriptor in the active
* portion of the log .
*
* Returns : errno
*/
2019-03-25 18:34:19 +03:00
static int foreach_descriptor ( struct gfs2_jdesc * jd , u32 start ,
2006-01-16 19:50:04 +03:00
unsigned int end , int pass )
{
2006-06-14 23:32:57 +04:00
struct gfs2_sbd * sdp = GFS2_SB ( jd - > jd_inode ) ;
2006-01-16 19:50:04 +03:00
struct buffer_head * bh ;
struct gfs2_log_descriptor * ld ;
int error = 0 ;
u32 length ;
__be64 * ptr ;
unsigned int offset = sizeof ( struct gfs2_log_descriptor ) ;
2006-09-05 22:41:30 +04:00
offset + = sizeof ( __be64 ) - 1 ;
offset & = ~ ( sizeof ( __be64 ) - 1 ) ;
2006-01-16 19:50:04 +03:00
while ( start ! = end ) {
error = gfs2_replay_read_block ( jd , start , & bh ) ;
if ( error )
return error ;
if ( gfs2_meta_check ( sdp , bh ) ) {
brelse ( bh ) ;
return - EIO ;
}
ld = ( struct gfs2_log_descriptor * ) bh - > b_data ;
length = be32_to_cpu ( ld - > ld_length ) ;
2006-03-31 00:46:23 +04:00
if ( be32_to_cpu ( ld - > ld_header . mh_type ) = = GFS2_METATYPE_LH ) {
2006-10-14 05:47:13 +04:00
struct gfs2_log_header_host lh ;
2006-01-16 19:50:04 +03:00
error = get_log_header ( jd , start , & lh ) ;
if ( ! error ) {
2016-07-21 21:02:44 +03:00
gfs2_replay_incr_blk ( jd , & start ) ;
2006-08-10 20:08:40 +04:00
brelse ( bh ) ;
2006-01-16 19:50:04 +03:00
continue ;
}
if ( error = = 1 ) {
2006-06-14 23:32:57 +04:00
gfs2_consist_inode ( GFS2_I ( jd - > jd_inode ) ) ;
2006-01-16 19:50:04 +03:00
error = - EIO ;
}
brelse ( bh ) ;
return error ;
} else if ( gfs2_metatype_check ( sdp , bh , GFS2_METATYPE_LD ) ) {
brelse ( bh ) ;
return - EIO ;
}
ptr = ( __be64 * ) ( bh - > b_data + offset ) ;
error = lops_scan_elements ( jd , start , ld , ptr , pass ) ;
if ( error ) {
brelse ( bh ) ;
return error ;
}
while ( length - - )
2016-07-21 21:02:44 +03:00
gfs2_replay_incr_blk ( jd , & start ) ;
2006-01-16 19:50:04 +03:00
brelse ( bh ) ;
}
return 0 ;
}
/**
* clean_journal - mark a dirty journal as being clean
* @ jd : the journal
* @ head : the head journal to start from
*
* Returns : errno
*/
2017-12-18 21:48:29 +03:00
static void clean_journal ( struct gfs2_jdesc * jd ,
struct gfs2_log_header_host * head )
2006-01-16 19:50:04 +03:00
{
2006-06-14 23:32:57 +04:00
struct gfs2_sbd * sdp = GFS2_SB ( jd - > jd_inode ) ;
2019-03-25 18:34:19 +03:00
u32 lblock = head - > lh_blkno ;
2006-01-16 19:50:04 +03:00
2019-03-25 18:34:19 +03:00
gfs2_replay_incr_blk ( jd , & lblock ) ;
gfs2_write_log_header ( sdp , jd , head - > lh_sequence + 1 , 0 , lblock ,
2018-01-17 02:01:33 +03:00
GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_RECOVERY ,
REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC ) ;
2019-08-28 23:21:34 +03:00
if ( jd - > jd_jid = = sdp - > sd_lockstruct . ls_jid ) {
sdp - > sd_log_flush_head = lblock ;
gfs2_log_incr_head ( sdp ) ;
}
2006-01-16 19:50:04 +03:00
}
2008-01-30 18:34:04 +03:00
2009-01-12 13:43:39 +03:00
static void gfs2_recovery_done ( struct gfs2_sbd * sdp , unsigned int jid ,
unsigned int message )
2008-01-30 18:34:04 +03:00
{
2009-01-12 13:43:39 +03:00
char env_jid [ 20 ] ;
char env_status [ 20 ] ;
char * envp [ ] = { env_jid , env_status , NULL } ;
struct lm_lockstruct * ls = & sdp - > sd_lockstruct ;
2012-01-10 02:18:05 +04:00
2009-01-12 13:43:39 +03:00
ls - > ls_recover_jid_done = jid ;
ls - > ls_recover_jid_status = message ;
2015-01-12 14:01:03 +03:00
sprintf ( env_jid , " JID=%u " , jid ) ;
2009-01-12 13:43:39 +03:00
sprintf ( env_status , " RECOVERY=%s " ,
message = = LM_RD_SUCCESS ? " Done " : " Failed " ) ;
kobject_uevent_env ( & sdp - > sd_kobj , KOBJ_CHANGE , envp ) ;
2012-01-10 02:18:05 +04:00
if ( sdp - > sd_lockstruct . ls_ops - > lm_recovery_result )
sdp - > sd_lockstruct . ls_ops - > lm_recovery_result ( sdp , jid , message ) ;
2008-01-30 18:34:04 +03:00
}
2020-10-20 23:58:05 +03:00
/**
* update_statfs_inode - Update the master statfs inode or zero out the local
* statfs inode for a given journal .
* @ jd : The journal
* @ head : If NULL , @ inode is the local statfs inode and we need to zero it out .
* Otherwise , it @ head contains the statfs change info that needs to be
* synced to the master statfs inode ( pointed to by @ inode ) .
* @ inode : statfs inode to update .
*/
static int update_statfs_inode ( struct gfs2_jdesc * jd ,
struct gfs2_log_header_host * head ,
struct inode * inode )
{
struct gfs2_sbd * sdp = GFS2_SB ( jd - > jd_inode ) ;
struct gfs2_inode * ip ;
struct buffer_head * bh ;
struct gfs2_statfs_change_host sc ;
int error = 0 ;
BUG_ON ( ! inode ) ;
ip = GFS2_I ( inode ) ;
error = gfs2_meta_inode_buffer ( ip , & bh ) ;
if ( error )
goto out ;
spin_lock ( & sdp - > sd_statfs_spin ) ;
if ( head ) { /* Update the master statfs inode */
gfs2_statfs_change_in ( & sc , bh - > b_data + sizeof ( struct gfs2_dinode ) ) ;
sc . sc_total + = head - > lh_local_total ;
sc . sc_free + = head - > lh_local_free ;
sc . sc_dinodes + = head - > lh_local_dinodes ;
gfs2_statfs_change_out ( & sc , bh - > b_data + sizeof ( struct gfs2_dinode ) ) ;
fs_info ( sdp , " jid=%u: Updated master statfs Total:%lld, "
" Free:%lld, Dinodes:%lld after change "
" [%+lld,%+lld,%+lld] \n " , jd - > jd_jid , sc . sc_total ,
sc . sc_free , sc . sc_dinodes , head - > lh_local_total ,
head - > lh_local_free , head - > lh_local_dinodes ) ;
} else { /* Zero out the local statfs inode */
memset ( bh - > b_data + sizeof ( struct gfs2_dinode ) , 0 ,
sizeof ( struct gfs2_statfs_change ) ) ;
/* If it's our own journal, reset any in-memory changes too */
if ( jd - > jd_jid = = sdp - > sd_lockstruct . ls_jid ) {
memset ( & sdp - > sd_statfs_local , 0 ,
sizeof ( struct gfs2_statfs_change_host ) ) ;
}
}
spin_unlock ( & sdp - > sd_statfs_spin ) ;
mark_buffer_dirty ( bh ) ;
brelse ( bh ) ;
2020-10-27 20:29:37 +03:00
gfs2_inode_metasync ( ip - > i_gl ) ;
2020-10-20 23:58:05 +03:00
out :
return error ;
}
/**
* recover_local_statfs - Update the master and local statfs changes for this
* journal .
*
* Previously , statfs updates would be read in from the local statfs inode and
* synced to the master statfs inode during recovery .
*
* We now use the statfs updates in the journal head to update the master statfs
* inode instead of reading in from the local statfs inode . To preserve backward
* compatibility with kernels that can ' t do this , we still need to keep the
* local statfs inode up to date by writing changes to it . At some point in the
* future , we can do away with the local statfs inodes altogether and keep the
* statfs changes solely in the journal .
*
* @ jd : the journal
* @ head : the journal head
*
* Returns : errno
*/
static void recover_local_statfs ( struct gfs2_jdesc * jd ,
struct gfs2_log_header_host * head )
{
int error ;
struct gfs2_sbd * sdp = GFS2_SB ( jd - > jd_inode ) ;
if ( ! head - > lh_local_total & & ! head - > lh_local_free
& & ! head - > lh_local_dinodes ) /* No change */
goto zero_local ;
/* First update the master statfs inode with the changes we
* found in the journal . */
error = update_statfs_inode ( jd , head , sdp - > sd_statfs_inode ) ;
if ( error )
goto out ;
zero_local :
/* Zero out the local statfs inode so any changes in there
* are not re - recovered . */
error = update_statfs_inode ( jd , NULL ,
find_local_statfs_inode ( sdp , jd - > jd_jid ) ) ;
out :
return ;
}
2010-07-21 00:09:02 +04:00
void gfs2_recover_func ( struct work_struct * work )
2006-01-16 19:50:04 +03:00
{
2009-05-19 13:01:18 +04:00
struct gfs2_jdesc * jd = container_of ( work , struct gfs2_jdesc , jd_work ) ;
2006-06-14 23:32:57 +04:00
struct gfs2_inode * ip = GFS2_I ( jd - > jd_inode ) ;
struct gfs2_sbd * sdp = GFS2_SB ( jd - > jd_inode ) ;
2006-10-14 05:47:13 +04:00
struct gfs2_log_header_host head ;
gfs2: Rework freeze / thaw logic
So far, at mount time, gfs2 would take the freeze glock in shared mode
and then immediately drop it again, turning it into a cached glock that
can be reclaimed at any time. To freeze the filesystem cluster-wide,
the node initiating the freeze would take the freeze glock in exclusive
mode, which would cause the freeze glock's freeze_go_sync() callback to
run on each node. There, gfs2 would freeze the filesystem and schedule
gfs2_freeze_func() to run. gfs2_freeze_func() would re-acquire the
freeze glock in shared mode, thaw the filesystem, and drop the freeze
glock again. The initiating node would keep the freeze glock held in
exclusive mode. To thaw the filesystem, the initiating node would drop
the freeze glock again, which would allow gfs2_freeze_func() to resume
on all nodes, leaving the filesystem in the thawed state.
It turns out that in freeze_go_sync(), we cannot reliably and safely
freeze the filesystem. This is primarily because the final unmount of a
filesystem takes a write lock on the s_umount rw semaphore before
calling into gfs2_put_super(), and freeze_go_sync() needs to call
freeze_super() which also takes a write lock on the same semaphore,
causing a deadlock. We could work around this by trying to take an
active reference on the super block first, which would prevent unmount
from running at the same time. But that can fail, and freeze_go_sync()
isn't actually allowed to fail.
To get around this, this patch changes the freeze glock locking scheme
as follows:
At mount time, each node takes the freeze glock in shared mode. To
freeze a filesystem, the initiating node first freezes the filesystem
locally and then drops and re-acquires the freeze glock in exclusive
mode. All other nodes notice that there is contention on the freeze
glock in their go_callback callbacks, and they schedule
gfs2_freeze_func() to run. There, they freeze the filesystem locally
and drop and re-acquire the freeze glock before re-thawing the
filesystem. This is happening outside of the glock state engine, so
there, we are allowed to fail.
From a cluster point of view, taking and immediately dropping a glock is
indistinguishable from taking the glock and only dropping it upon
contention, so this new scheme is compatible with the old one.
Thanks to Li Dong <lidong@vivo.com> for reporting a locking bug in
gfs2_freeze_func() in a previous version of this commit.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2022-11-15 01:34:50 +03:00
struct gfs2_holder j_gh , ji_gh ;
2018-03-29 20:41:27 +03:00
ktime_t t_start , t_jlck , t_jhd , t_tlck , t_rep ;
2006-01-16 19:50:04 +03:00
int ro = 0 ;
unsigned int pass ;
GFS2: Fix recovery issues for spectators
This patch fixes a couple problems dealing with spectators who
remain with gfs2 mounts after the last non-spectator node fails.
Before this patch, spectator mounts would try to acquire the dlm's
mounted lock EX as part of its normal recovery sequence.
The mounted lock is only used to determine whether the node is
the first mounter, the first node to mount the file system, for
the purposes of file system recovery and journal replay.
It's not necessary for spectators: they should never do journal
recovery. If they acquire the lock it will prevent another "real"
first-mounter from acquiring the lock in EX mode, which means it
also cannot do journal recovery because it doesn't think it's the
first node to mount the file system.
This patch checks if the mounter is a spectator, and if so, avoids
grabbing the mounted lock. This allows a secondary mounter who is
really the first non-spectator mounter, to do journal recovery:
since the spectator doesn't acquire the lock, it can grab it in
EX mode, and therefore consider itself to be the first mounter
both as a "real" first mount, and as a first-real-after-spectator.
Note that the control lock still needs to be taken in PR mode
in order to fetch the lvb value so it has the current status of
all journal's recovery. This is used as it is today by a first
mounter to replay the journals. For spectators, it's merely
used to fetch the status bits. All recovery is bypassed and the
node waits until recovery is completed by a non-spectator node.
I also improved the cryptic message given by control_mount when
a spectator is waiting for a non-spectator to perform recovery.
It also fixes a problem in gfs2_recover_set whereby spectators
were never queueing recovery work for their own journal.
They cannot do recovery themselves, but they still need to queue
the work so they can check the recovery bits and clear the
DFL_BLOCK_LOCKS bit once the recovery happens on another node.
When the work queue runs on a spectator, it bypasses most of the
work so it won't print a bunch of annoying messages. All it will
print is a bunch of messages that look like this until recovery
completes on the non-spectator node:
GFS2: fsid=mycluster:scratch.s: recover generation 3 jid 0
GFS2: fsid=mycluster:scratch.s: recover jid 0 result busy
These continue every 1.5 seconds until the recovery is done by
the non-spectator, at which time it says:
GFS2: fsid=mycluster:scratch.s: recover generation 4 done
Then it proceeds with its mount.
If the file system is mounted in spectator node and the last
remaining non-spectator is fenced, any IO to the file system is
blocked by dlm and the spectator waits until recovery is
performed by a non-spectator.
If a spectator tries to mount the file system before any
non-spectators, it blocks and repeatedly gives this kernel
message:
GFS2: fsid=mycluster:scratch: Recovery is required. Waiting for a non-spectator to mount.
GFS2: fsid=mycluster:scratch: Recovery is required. Waiting for a non-spectator to mount.
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2018-07-05 22:40:46 +03:00
int error = 0 ;
2010-09-29 17:20:52 +04:00
int jlocked = 0 ;
2006-01-16 19:50:04 +03:00
2023-12-20 19:16:29 +03:00
if ( gfs2_withdrawing_or_withdrawn ( sdp ) ) {
gfs2: Ignore dlm recovery requests if gfs2 is withdrawn
When a node fails, user space informs dlm of the node failure,
and dlm instructs gfs2 on the surviving nodes to perform journal
recovery. It does this by calling various callback functions in
lock_dlm.c. To mark its progress, it keeps generation numbers
and recover bits in a dlm "control" lock lvb, which is seen by
all nodes to determine which journals need to be replayed.
The gfs2 on all nodes get the same recovery requests from dlm,
so they all try to do the recovery, but only one will be
granted the exclusive lock on the journal. The others fail
with a "Busy" message on their "try lock."
However, when a node is withdrawn, it cannot safely do any
recovery or replay any journals. To make matters worse,
gfs2 might withdraw as a result of attempting recovery. For
example, this might happen if the device goes offline, or if
an hba fails. But in today's gfs2 code, it doesn't check for
being withdrawn at any step in the recovery process. What's
worse is that these callbacks from dlm have no return code,
so there is no way to indicate failure back to dlm. We can
send a "Recovery failed" uevent eventually, but that tells
user space what happened, not dlm's kernel code.
Before this patch, lock_dlm would perform its recovery steps but
ignore the result, and eventually it would still update its
generation number in the lvb, despite the fact that it may have
withdrawn or encountered an error. The other nodes would then
see the newer generation number in the lvb and conclude that
they don't need to do recovery because the generation number
is newer than the last one they saw. They think a different
node has already recovered the journal.
This patch adds checks to several of the callbacks used by dlm
in its recovery state machine so that the functions are ignored
and skipped if an io error has occurred or if the file system
is withdrawn. That prevents the lvb bits from being updated, and
therefore dlm and user space still see the need for recovery to
take place.
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Reviewed-by: Andreas Gruenbacher <agruenba@redhat.com>
2019-02-12 23:58:40 +03:00
fs_err ( sdp , " jid=%u: Recovery not attempted due to withdraw. \n " ,
jd - > jd_jid ) ;
goto fail ;
}
2018-03-29 20:41:27 +03:00
t_start = ktime_get ( ) ;
GFS2: Fix recovery issues for spectators
This patch fixes a couple problems dealing with spectators who
remain with gfs2 mounts after the last non-spectator node fails.
Before this patch, spectator mounts would try to acquire the dlm's
mounted lock EX as part of its normal recovery sequence.
The mounted lock is only used to determine whether the node is
the first mounter, the first node to mount the file system, for
the purposes of file system recovery and journal replay.
It's not necessary for spectators: they should never do journal
recovery. If they acquire the lock it will prevent another "real"
first-mounter from acquiring the lock in EX mode, which means it
also cannot do journal recovery because it doesn't think it's the
first node to mount the file system.
This patch checks if the mounter is a spectator, and if so, avoids
grabbing the mounted lock. This allows a secondary mounter who is
really the first non-spectator mounter, to do journal recovery:
since the spectator doesn't acquire the lock, it can grab it in
EX mode, and therefore consider itself to be the first mounter
both as a "real" first mount, and as a first-real-after-spectator.
Note that the control lock still needs to be taken in PR mode
in order to fetch the lvb value so it has the current status of
all journal's recovery. This is used as it is today by a first
mounter to replay the journals. For spectators, it's merely
used to fetch the status bits. All recovery is bypassed and the
node waits until recovery is completed by a non-spectator node.
I also improved the cryptic message given by control_mount when
a spectator is waiting for a non-spectator to perform recovery.
It also fixes a problem in gfs2_recover_set whereby spectators
were never queueing recovery work for their own journal.
They cannot do recovery themselves, but they still need to queue
the work so they can check the recovery bits and clear the
DFL_BLOCK_LOCKS bit once the recovery happens on another node.
When the work queue runs on a spectator, it bypasses most of the
work so it won't print a bunch of annoying messages. All it will
print is a bunch of messages that look like this until recovery
completes on the non-spectator node:
GFS2: fsid=mycluster:scratch.s: recover generation 3 jid 0
GFS2: fsid=mycluster:scratch.s: recover jid 0 result busy
These continue every 1.5 seconds until the recovery is done by
the non-spectator, at which time it says:
GFS2: fsid=mycluster:scratch.s: recover generation 4 done
Then it proceeds with its mount.
If the file system is mounted in spectator node and the last
remaining non-spectator is fenced, any IO to the file system is
blocked by dlm and the spectator waits until recovery is
performed by a non-spectator.
If a spectator tries to mount the file system before any
non-spectators, it blocks and repeatedly gives this kernel
message:
GFS2: fsid=mycluster:scratch: Recovery is required. Waiting for a non-spectator to mount.
GFS2: fsid=mycluster:scratch: Recovery is required. Waiting for a non-spectator to mount.
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2018-07-05 22:40:46 +03:00
if ( sdp - > sd_args . ar_spectator )
goto fail ;
if ( jd - > jd_jid ! = sdp - > sd_lockstruct . ls_jid ) {
2022-11-16 16:19:06 +03:00
fs_info ( sdp , " jid=%u: Trying to acquire journal glock... \n " ,
2006-04-26 21:21:55 +04:00
jd - > jd_jid ) ;
2010-09-29 17:20:52 +04:00
jlocked = 1 ;
2022-11-16 16:19:06 +03:00
/* Acquire the journal glock so we can do recovery */
2006-01-16 19:50:04 +03:00
2006-04-26 21:21:55 +04:00
error = gfs2_glock_nq_num ( sdp , jd - > jd_jid , & gfs2_journal_glops ,
LM_ST_EXCLUSIVE ,
LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE ,
& j_gh ) ;
switch ( error ) {
case 0 :
break ;
2006-09-25 17:26:04 +04:00
2006-04-26 21:21:55 +04:00
case GLR_TRYFAILED :
fs_info ( sdp , " jid=%u: Busy \n " , jd - > jd_jid ) ;
error = 0 ;
2020-11-20 21:25:03 +03:00
goto fail ;
2006-09-25 17:26:04 +04:00
2006-04-26 21:21:55 +04:00
default :
goto fail ;
2019-10-04 18:55:29 +03:00
}
2006-01-16 19:50:04 +03:00
2006-04-26 21:21:55 +04:00
error = gfs2_glock_nq_init ( ip - > i_gl , LM_ST_SHARED ,
2007-08-09 02:08:14 +04:00
LM_FLAG_NOEXP | GL_NOCACHE , & ji_gh ) ;
2006-04-26 21:21:55 +04:00
if ( error )
goto fail_gunlock_j ;
} else {
fs_info ( sdp , " jid=%u, already locked for use \n " , jd - > jd_jid ) ;
}
2006-01-16 19:50:04 +03:00
2018-03-29 20:41:27 +03:00
t_jlck = ktime_get ( ) ;
2006-01-16 19:50:04 +03:00
fs_info ( sdp , " jid=%u: Looking at journal... \n " , jd - > jd_jid ) ;
error = gfs2_jdesc_check ( jd ) ;
if ( error )
goto fail_gunlock_ji ;
2019-05-02 22:17:40 +03:00
error = gfs2_find_jhead ( jd , & head , true ) ;
2006-01-16 19:50:04 +03:00
if ( error )
goto fail_gunlock_ji ;
2018-03-29 20:41:27 +03:00
t_jhd = ktime_get ( ) ;
2018-11-09 18:35:14 +03:00
fs_info ( sdp , " jid=%u: Journal head lookup took %lldms \n " , jd - > jd_jid ,
ktime_ms_delta ( t_jhd , t_jlck ) ) ;
2006-01-16 19:50:04 +03:00
if ( ! ( head . lh_flags & GFS2_LOG_HEAD_UNMOUNT ) ) {
gfs2: Rework freeze / thaw logic
So far, at mount time, gfs2 would take the freeze glock in shared mode
and then immediately drop it again, turning it into a cached glock that
can be reclaimed at any time. To freeze the filesystem cluster-wide,
the node initiating the freeze would take the freeze glock in exclusive
mode, which would cause the freeze glock's freeze_go_sync() callback to
run on each node. There, gfs2 would freeze the filesystem and schedule
gfs2_freeze_func() to run. gfs2_freeze_func() would re-acquire the
freeze glock in shared mode, thaw the filesystem, and drop the freeze
glock again. The initiating node would keep the freeze glock held in
exclusive mode. To thaw the filesystem, the initiating node would drop
the freeze glock again, which would allow gfs2_freeze_func() to resume
on all nodes, leaving the filesystem in the thawed state.
It turns out that in freeze_go_sync(), we cannot reliably and safely
freeze the filesystem. This is primarily because the final unmount of a
filesystem takes a write lock on the s_umount rw semaphore before
calling into gfs2_put_super(), and freeze_go_sync() needs to call
freeze_super() which also takes a write lock on the same semaphore,
causing a deadlock. We could work around this by trying to take an
active reference on the super block first, which would prevent unmount
from running at the same time. But that can fail, and freeze_go_sync()
isn't actually allowed to fail.
To get around this, this patch changes the freeze glock locking scheme
as follows:
At mount time, each node takes the freeze glock in shared mode. To
freeze a filesystem, the initiating node first freezes the filesystem
locally and then drops and re-acquires the freeze glock in exclusive
mode. All other nodes notice that there is contention on the freeze
glock in their go_callback callbacks, and they schedule
gfs2_freeze_func() to run. There, they freeze the filesystem locally
and drop and re-acquire the freeze glock before re-thawing the
filesystem. This is happening outside of the glock state engine, so
there, we are allowed to fail.
From a cluster point of view, taking and immediately dropping a glock is
indistinguishable from taking the glock and only dropping it upon
contention, so this new scheme is compatible with the old one.
Thanks to Li Dong <lidong@vivo.com> for reporting a locking bug in
gfs2_freeze_func() in a previous version of this commit.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2022-11-15 01:34:50 +03:00
mutex_lock ( & sdp - > sd_freeze_mutex ) ;
2006-01-16 19:50:04 +03:00
2022-08-18 17:12:24 +03:00
if ( test_bit ( SDF_FROZEN , & sdp - > sd_flags ) ) {
gfs2: Rework freeze / thaw logic
So far, at mount time, gfs2 would take the freeze glock in shared mode
and then immediately drop it again, turning it into a cached glock that
can be reclaimed at any time. To freeze the filesystem cluster-wide,
the node initiating the freeze would take the freeze glock in exclusive
mode, which would cause the freeze glock's freeze_go_sync() callback to
run on each node. There, gfs2 would freeze the filesystem and schedule
gfs2_freeze_func() to run. gfs2_freeze_func() would re-acquire the
freeze glock in shared mode, thaw the filesystem, and drop the freeze
glock again. The initiating node would keep the freeze glock held in
exclusive mode. To thaw the filesystem, the initiating node would drop
the freeze glock again, which would allow gfs2_freeze_func() to resume
on all nodes, leaving the filesystem in the thawed state.
It turns out that in freeze_go_sync(), we cannot reliably and safely
freeze the filesystem. This is primarily because the final unmount of a
filesystem takes a write lock on the s_umount rw semaphore before
calling into gfs2_put_super(), and freeze_go_sync() needs to call
freeze_super() which also takes a write lock on the same semaphore,
causing a deadlock. We could work around this by trying to take an
active reference on the super block first, which would prevent unmount
from running at the same time. But that can fail, and freeze_go_sync()
isn't actually allowed to fail.
To get around this, this patch changes the freeze glock locking scheme
as follows:
At mount time, each node takes the freeze glock in shared mode. To
freeze a filesystem, the initiating node first freezes the filesystem
locally and then drops and re-acquires the freeze glock in exclusive
mode. All other nodes notice that there is contention on the freeze
glock in their go_callback callbacks, and they schedule
gfs2_freeze_func() to run. There, they freeze the filesystem locally
and drop and re-acquire the freeze glock before re-thawing the
filesystem. This is happening outside of the glock state engine, so
there, we are allowed to fail.
From a cluster point of view, taking and immediately dropping a glock is
indistinguishable from taking the glock and only dropping it upon
contention, so this new scheme is compatible with the old one.
Thanks to Li Dong <lidong@vivo.com> for reporting a locking bug in
gfs2_freeze_func() in a previous version of this commit.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2022-11-15 01:34:50 +03:00
mutex_unlock ( & sdp - > sd_freeze_mutex ) ;
fs_warn ( sdp , " jid=%u: Can't replay: filesystem "
" is frozen \n " , jd - > jd_jid ) ;
2006-01-16 19:50:04 +03:00
goto fail_gunlock_ji ;
gfs2: Rework freeze / thaw logic
So far, at mount time, gfs2 would take the freeze glock in shared mode
and then immediately drop it again, turning it into a cached glock that
can be reclaimed at any time. To freeze the filesystem cluster-wide,
the node initiating the freeze would take the freeze glock in exclusive
mode, which would cause the freeze glock's freeze_go_sync() callback to
run on each node. There, gfs2 would freeze the filesystem and schedule
gfs2_freeze_func() to run. gfs2_freeze_func() would re-acquire the
freeze glock in shared mode, thaw the filesystem, and drop the freeze
glock again. The initiating node would keep the freeze glock held in
exclusive mode. To thaw the filesystem, the initiating node would drop
the freeze glock again, which would allow gfs2_freeze_func() to resume
on all nodes, leaving the filesystem in the thawed state.
It turns out that in freeze_go_sync(), we cannot reliably and safely
freeze the filesystem. This is primarily because the final unmount of a
filesystem takes a write lock on the s_umount rw semaphore before
calling into gfs2_put_super(), and freeze_go_sync() needs to call
freeze_super() which also takes a write lock on the same semaphore,
causing a deadlock. We could work around this by trying to take an
active reference on the super block first, which would prevent unmount
from running at the same time. But that can fail, and freeze_go_sync()
isn't actually allowed to fail.
To get around this, this patch changes the freeze glock locking scheme
as follows:
At mount time, each node takes the freeze glock in shared mode. To
freeze a filesystem, the initiating node first freezes the filesystem
locally and then drops and re-acquires the freeze glock in exclusive
mode. All other nodes notice that there is contention on the freeze
glock in their go_callback callbacks, and they schedule
gfs2_freeze_func() to run. There, they freeze the filesystem locally
and drop and re-acquire the freeze glock before re-thawing the
filesystem. This is happening outside of the glock state engine, so
there, we are allowed to fail.
From a cluster point of view, taking and immediately dropping a glock is
indistinguishable from taking the glock and only dropping it upon
contention, so this new scheme is compatible with the old one.
Thanks to Li Dong <lidong@vivo.com> for reporting a locking bug in
gfs2_freeze_func() in a previous version of this commit.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2022-11-15 01:34:50 +03:00
}
2006-01-16 19:50:04 +03:00
2012-01-09 23:40:06 +04:00
if ( test_bit ( SDF_RORECOVERY , & sdp - > sd_flags ) ) {
ro = 1 ;
} else if ( test_bit ( SDF_JOURNAL_CHECKED , & sdp - > sd_flags ) ) {
2006-01-16 19:50:04 +03:00
if ( ! test_bit ( SDF_JOURNAL_LIVE , & sdp - > sd_flags ) )
ro = 1 ;
} else {
2017-07-17 10:45:34 +03:00
if ( sb_rdonly ( sdp - > sd_vfs ) ) {
2008-01-18 23:06:37 +03:00
/* check if device itself is read-only */
ro = bdev_read_only ( sdp - > sd_vfs - > s_bdev ) ;
if ( ! ro ) {
fs_info ( sdp , " recovery required on "
" read-only filesystem. \n " ) ;
fs_info ( sdp , " write access will be "
" enabled during recovery. \n " ) ;
}
}
2006-01-16 19:50:04 +03:00
}
if ( ro ) {
2008-01-18 23:06:37 +03:00
fs_warn ( sdp , " jid=%u: Can't replay: read-only block "
" device \n " , jd - > jd_jid ) ;
2006-01-16 19:50:04 +03:00
error = - EROFS ;
gfs2: Rework freeze / thaw logic
So far, at mount time, gfs2 would take the freeze glock in shared mode
and then immediately drop it again, turning it into a cached glock that
can be reclaimed at any time. To freeze the filesystem cluster-wide,
the node initiating the freeze would take the freeze glock in exclusive
mode, which would cause the freeze glock's freeze_go_sync() callback to
run on each node. There, gfs2 would freeze the filesystem and schedule
gfs2_freeze_func() to run. gfs2_freeze_func() would re-acquire the
freeze glock in shared mode, thaw the filesystem, and drop the freeze
glock again. The initiating node would keep the freeze glock held in
exclusive mode. To thaw the filesystem, the initiating node would drop
the freeze glock again, which would allow gfs2_freeze_func() to resume
on all nodes, leaving the filesystem in the thawed state.
It turns out that in freeze_go_sync(), we cannot reliably and safely
freeze the filesystem. This is primarily because the final unmount of a
filesystem takes a write lock on the s_umount rw semaphore before
calling into gfs2_put_super(), and freeze_go_sync() needs to call
freeze_super() which also takes a write lock on the same semaphore,
causing a deadlock. We could work around this by trying to take an
active reference on the super block first, which would prevent unmount
from running at the same time. But that can fail, and freeze_go_sync()
isn't actually allowed to fail.
To get around this, this patch changes the freeze glock locking scheme
as follows:
At mount time, each node takes the freeze glock in shared mode. To
freeze a filesystem, the initiating node first freezes the filesystem
locally and then drops and re-acquires the freeze glock in exclusive
mode. All other nodes notice that there is contention on the freeze
glock in their go_callback callbacks, and they schedule
gfs2_freeze_func() to run. There, they freeze the filesystem locally
and drop and re-acquire the freeze glock before re-thawing the
filesystem. This is happening outside of the glock state engine, so
there, we are allowed to fail.
From a cluster point of view, taking and immediately dropping a glock is
indistinguishable from taking the glock and only dropping it upon
contention, so this new scheme is compatible with the old one.
Thanks to Li Dong <lidong@vivo.com> for reporting a locking bug in
gfs2_freeze_func() in a previous version of this commit.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2022-11-15 01:34:50 +03:00
goto fail_gunlock_nofreeze ;
2006-01-16 19:50:04 +03:00
}
2018-03-29 20:41:27 +03:00
t_tlck = ktime_get ( ) ;
2019-02-27 23:32:36 +03:00
fs_info ( sdp , " jid=%u: Replaying journal...0x%x to 0x%x \n " ,
jd - > jd_jid , head . lh_tail , head . lh_blkno ) ;
2006-01-16 19:50:04 +03:00
2020-02-17 23:15:05 +03:00
/* We take the sd_log_flush_lock here primarily to prevent log
* flushes and simultaneous journal replays from stomping on
2021-01-21 18:10:26 +03:00
* each other wrt jd_log_bio . */
2020-03-26 20:22:05 +03:00
down_read ( & sdp - > sd_log_flush_lock ) ;
2006-01-16 19:50:04 +03:00
for ( pass = 0 ; pass < 2 ; pass + + ) {
lops_before_scan ( jd , & head , pass ) ;
error = foreach_descriptor ( jd , head . lh_tail ,
head . lh_blkno , pass ) ;
lops_after_scan ( jd , error , pass ) ;
2021-02-05 20:11:28 +03:00
if ( error ) {
up_read ( & sdp - > sd_log_flush_lock ) ;
gfs2: Rework freeze / thaw logic
So far, at mount time, gfs2 would take the freeze glock in shared mode
and then immediately drop it again, turning it into a cached glock that
can be reclaimed at any time. To freeze the filesystem cluster-wide,
the node initiating the freeze would take the freeze glock in exclusive
mode, which would cause the freeze glock's freeze_go_sync() callback to
run on each node. There, gfs2 would freeze the filesystem and schedule
gfs2_freeze_func() to run. gfs2_freeze_func() would re-acquire the
freeze glock in shared mode, thaw the filesystem, and drop the freeze
glock again. The initiating node would keep the freeze glock held in
exclusive mode. To thaw the filesystem, the initiating node would drop
the freeze glock again, which would allow gfs2_freeze_func() to resume
on all nodes, leaving the filesystem in the thawed state.
It turns out that in freeze_go_sync(), we cannot reliably and safely
freeze the filesystem. This is primarily because the final unmount of a
filesystem takes a write lock on the s_umount rw semaphore before
calling into gfs2_put_super(), and freeze_go_sync() needs to call
freeze_super() which also takes a write lock on the same semaphore,
causing a deadlock. We could work around this by trying to take an
active reference on the super block first, which would prevent unmount
from running at the same time. But that can fail, and freeze_go_sync()
isn't actually allowed to fail.
To get around this, this patch changes the freeze glock locking scheme
as follows:
At mount time, each node takes the freeze glock in shared mode. To
freeze a filesystem, the initiating node first freezes the filesystem
locally and then drops and re-acquires the freeze glock in exclusive
mode. All other nodes notice that there is contention on the freeze
glock in their go_callback callbacks, and they schedule
gfs2_freeze_func() to run. There, they freeze the filesystem locally
and drop and re-acquire the freeze glock before re-thawing the
filesystem. This is happening outside of the glock state engine, so
there, we are allowed to fail.
From a cluster point of view, taking and immediately dropping a glock is
indistinguishable from taking the glock and only dropping it upon
contention, so this new scheme is compatible with the old one.
Thanks to Li Dong <lidong@vivo.com> for reporting a locking bug in
gfs2_freeze_func() in a previous version of this commit.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2022-11-15 01:34:50 +03:00
goto fail_gunlock_nofreeze ;
2021-02-05 20:11:28 +03:00
}
2006-01-16 19:50:04 +03:00
}
2020-10-20 23:58:05 +03:00
recover_local_statfs ( jd , & head ) ;
2017-12-18 21:48:29 +03:00
clean_journal ( jd , & head ) ;
2020-03-26 20:22:05 +03:00
up_read ( & sdp - > sd_log_flush_lock ) ;
2006-01-16 19:50:04 +03:00
gfs2: Rework freeze / thaw logic
So far, at mount time, gfs2 would take the freeze glock in shared mode
and then immediately drop it again, turning it into a cached glock that
can be reclaimed at any time. To freeze the filesystem cluster-wide,
the node initiating the freeze would take the freeze glock in exclusive
mode, which would cause the freeze glock's freeze_go_sync() callback to
run on each node. There, gfs2 would freeze the filesystem and schedule
gfs2_freeze_func() to run. gfs2_freeze_func() would re-acquire the
freeze glock in shared mode, thaw the filesystem, and drop the freeze
glock again. The initiating node would keep the freeze glock held in
exclusive mode. To thaw the filesystem, the initiating node would drop
the freeze glock again, which would allow gfs2_freeze_func() to resume
on all nodes, leaving the filesystem in the thawed state.
It turns out that in freeze_go_sync(), we cannot reliably and safely
freeze the filesystem. This is primarily because the final unmount of a
filesystem takes a write lock on the s_umount rw semaphore before
calling into gfs2_put_super(), and freeze_go_sync() needs to call
freeze_super() which also takes a write lock on the same semaphore,
causing a deadlock. We could work around this by trying to take an
active reference on the super block first, which would prevent unmount
from running at the same time. But that can fail, and freeze_go_sync()
isn't actually allowed to fail.
To get around this, this patch changes the freeze glock locking scheme
as follows:
At mount time, each node takes the freeze glock in shared mode. To
freeze a filesystem, the initiating node first freezes the filesystem
locally and then drops and re-acquires the freeze glock in exclusive
mode. All other nodes notice that there is contention on the freeze
glock in their go_callback callbacks, and they schedule
gfs2_freeze_func() to run. There, they freeze the filesystem locally
and drop and re-acquire the freeze glock before re-thawing the
filesystem. This is happening outside of the glock state engine, so
there, we are allowed to fail.
From a cluster point of view, taking and immediately dropping a glock is
indistinguishable from taking the glock and only dropping it upon
contention, so this new scheme is compatible with the old one.
Thanks to Li Dong <lidong@vivo.com> for reporting a locking bug in
gfs2_freeze_func() in a previous version of this commit.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2022-11-15 01:34:50 +03:00
mutex_unlock ( & sdp - > sd_freeze_mutex ) ;
2018-03-29 20:41:27 +03:00
t_rep = ktime_get ( ) ;
fs_info ( sdp , " jid=%u: Journal replayed in %lldms [jlck:%lldms, "
" jhead:%lldms, tlck:%lldms, replay:%lldms] \n " ,
jd - > jd_jid , ktime_ms_delta ( t_rep , t_start ) ,
ktime_ms_delta ( t_jlck , t_start ) ,
ktime_ms_delta ( t_jhd , t_jlck ) ,
ktime_ms_delta ( t_tlck , t_jhd ) ,
ktime_ms_delta ( t_rep , t_tlck ) ) ;
2006-01-16 19:50:04 +03:00
}
2009-01-12 13:43:39 +03:00
gfs2_recovery_done ( sdp , jd - > jd_jid , LM_RD_SUCCESS ) ;
2006-01-16 19:50:04 +03:00
2010-09-29 17:20:52 +04:00
if ( jlocked ) {
gfs2_glock_dq_uninit ( & ji_gh ) ;
2006-04-26 21:21:55 +04:00
gfs2_glock_dq_uninit ( & j_gh ) ;
2010-09-29 17:20:52 +04:00
}
2006-01-16 19:50:04 +03:00
fs_info ( sdp , " jid=%u: Done \n " , jd - > jd_jid ) ;
2010-07-21 00:09:02 +04:00
goto done ;
2006-01-16 19:50:04 +03:00
gfs2: Rework freeze / thaw logic
So far, at mount time, gfs2 would take the freeze glock in shared mode
and then immediately drop it again, turning it into a cached glock that
can be reclaimed at any time. To freeze the filesystem cluster-wide,
the node initiating the freeze would take the freeze glock in exclusive
mode, which would cause the freeze glock's freeze_go_sync() callback to
run on each node. There, gfs2 would freeze the filesystem and schedule
gfs2_freeze_func() to run. gfs2_freeze_func() would re-acquire the
freeze glock in shared mode, thaw the filesystem, and drop the freeze
glock again. The initiating node would keep the freeze glock held in
exclusive mode. To thaw the filesystem, the initiating node would drop
the freeze glock again, which would allow gfs2_freeze_func() to resume
on all nodes, leaving the filesystem in the thawed state.
It turns out that in freeze_go_sync(), we cannot reliably and safely
freeze the filesystem. This is primarily because the final unmount of a
filesystem takes a write lock on the s_umount rw semaphore before
calling into gfs2_put_super(), and freeze_go_sync() needs to call
freeze_super() which also takes a write lock on the same semaphore,
causing a deadlock. We could work around this by trying to take an
active reference on the super block first, which would prevent unmount
from running at the same time. But that can fail, and freeze_go_sync()
isn't actually allowed to fail.
To get around this, this patch changes the freeze glock locking scheme
as follows:
At mount time, each node takes the freeze glock in shared mode. To
freeze a filesystem, the initiating node first freezes the filesystem
locally and then drops and re-acquires the freeze glock in exclusive
mode. All other nodes notice that there is contention on the freeze
glock in their go_callback callbacks, and they schedule
gfs2_freeze_func() to run. There, they freeze the filesystem locally
and drop and re-acquire the freeze glock before re-thawing the
filesystem. This is happening outside of the glock state engine, so
there, we are allowed to fail.
From a cluster point of view, taking and immediately dropping a glock is
indistinguishable from taking the glock and only dropping it upon
contention, so this new scheme is compatible with the old one.
Thanks to Li Dong <lidong@vivo.com> for reporting a locking bug in
gfs2_freeze_func() in a previous version of this commit.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2022-11-15 01:34:50 +03:00
fail_gunlock_nofreeze :
mutex_unlock ( & sdp - > sd_freeze_mutex ) ;
2006-04-26 21:21:55 +04:00
fail_gunlock_ji :
2010-09-29 17:20:52 +04:00
if ( jlocked ) {
2006-04-26 21:21:55 +04:00
gfs2_glock_dq_uninit ( & ji_gh ) ;
fail_gunlock_j :
gfs2_glock_dq_uninit ( & j_gh ) ;
}
2006-01-16 19:50:04 +03:00
fs_info ( sdp , " jid=%u: %s \n " , jd - > jd_jid , ( error ) ? " Failed " : " Done " ) ;
2006-04-26 21:21:55 +04:00
fail :
2012-01-10 00:29:20 +04:00
jd - > jd_recover_error = error ;
2009-01-12 13:43:39 +03:00
gfs2_recovery_done ( sdp , jd - > jd_jid , LM_RD_GAVEUP ) ;
2010-07-21 00:09:02 +04:00
done :
clear_bit ( JDF_RECOVERY , & jd - > jd_flags ) ;
2014-03-17 21:06:10 +04:00
smp_mb__after_atomic ( ) ;
2010-07-21 00:09:02 +04:00
wake_up_bit ( & jd - > jd_flags , JDF_RECOVERY ) ;
2006-01-16 19:50:04 +03:00
}
2010-07-21 00:09:02 +04:00
/**
 * gfs2_recover_journal - queue recovery work for a journal
 * @jd: descriptor of the journal to recover
 * @wait: if true, block until the JDF_RECOVERY bit has been cleared again
 *
 * Atomically claims the JDF_RECOVERY bit in @jd->jd_flags and queues
 * @jd->jd_work on gfs2_recovery_wq.  Only the caller that wins the bit
 * queues the work; a failure to queue after winning it is treated as a
 * fatal inconsistency (BUG_ON).
 *
 * Returns: -EBUSY if JDF_RECOVERY was already set; otherwise
 * @jd->jd_recover_error when @wait is true, or 0 when @wait is false.
 */
int gfs2_recover_journal ( struct gfs2_jdesc * jd , bool wait )
2008-11-19 13:08:22 +03:00
{
2009-05-19 13:01:18 +04:00
int rv ;
2010-07-21 00:09:02 +04:00
/* The bit acts as a claim on this journal: a second caller backs off. */
if ( test_and_set_bit ( JDF_RECOVERY , & jd - > jd_flags ) )
return - EBUSY ;
/* we have JDF_RECOVERY, queue should always succeed */
2023-08-28 19:02:22 +03:00
rv = queue_work ( gfs2_recovery_wq , & jd - > jd_work ) ;
2010-07-21 00:09:02 +04:00
BUG_ON ( ! rv ) ;
/*
 * Optionally sleep (uninterruptibly) until JDF_RECOVERY is cleared.
 * The recovery function earlier in this file ends with a clear_bit()
 * and wake_up_bit() pair on this bit; presumably that function is the
 * handler of jd_work (TODO confirm where jd_work is initialised).
 */
if ( wait )
sched: Remove proliferation of wait_on_bit() action functions
The current "wait_on_bit" interface requires an 'action'
function to be provided which does the actual waiting.
There are over 20 such functions, many of them identical.
Most cases can be satisfied by one of just two functions, one
which uses io_schedule() and one which just uses schedule().
So:
Rename wait_on_bit and wait_on_bit_lock to
wait_on_bit_action and wait_on_bit_lock_action
to make it explicit that they need an action function.
Introduce new wait_on_bit{,_lock} and wait_on_bit{,_lock}_io
which are *not* given an action function but implicitly use
a standard one.
The decision to error-out if a signal is pending is now made
based on the 'mode' argument rather than being encoded in the action
function.
All instances of the old wait_on_bit and wait_on_bit_lock which
can use the new version have been changed accordingly and their
action functions have been discarded.
wait_on_bit{_lock} does not return any specific error code in the
event of a signal so the caller must check for non-zero and
interpolate their own error code as appropriate.
The wait_on_bit() call in __fscache_wait_on_invalidate() was
ambiguous as it specified TASK_UNINTERRUPTIBLE but used
fscache_wait_bit_interruptible as an action function.
David Howells confirms this should be uniformly
"uninterruptible"
The main remaining user of wait_on_bit{,_lock}_action is NFS
which needs to use a freezer-aware schedule() call.
A comment in fs/gfs2/glock.c notes that having multiple 'action'
functions is useful as they display differently in the 'wchan'
field of 'ps'. (and /proc/$PID/wchan).
As the new bit_wait{,_io} functions are tagged "__sched", they
will not show up at all, but something higher in the stack. So
the distinction will still be visible, only with different
function names (gds2_glock_wait versus gfs2_glock_dq_wait in the
gfs2/glock.c case).
Since first version of this patch (against 3.15) two new action
functions appeared, on in NFS and one in CIFS. CIFS also now
uses an action function that makes the same freezer aware
schedule call as NFS.
Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: David Howells <dhowells@redhat.com> (fscache, keys)
Acked-by: Steven Whitehouse <swhiteho@redhat.com> (gfs2)
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steve French <sfrench@samba.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20140707051603.28027.72349.stgit@notabene.brown
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-07-07 09:16:04 +04:00
wait_on_bit ( & jd - > jd_flags , JDF_RECOVERY ,
2010-07-21 00:09:02 +04:00
TASK_UNINTERRUPTIBLE ) ;
2012-01-10 00:29:20 +04:00
/*
 * A non-waiting caller only learns that the work was queued, hence 0.
 * NOTE(review): in the part of the recovery function visible in this
 * file, jd_recover_error is assigned only on the failure path; confirm
 * that a value left over from an earlier failed run cannot be returned
 * here after a later successful recovery.
 */
return wait ? jd - > jd_recover_error : 0 ;
2008-11-19 13:08:22 +03:00
}