/*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
 * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License version 2.
 */
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/delay.h>
#include <linux/sort.h>
#include <linux/jhash.h>
#include <linux/kallsyms.h>
#include <linux/gfs2_ondisk.h>
#include <linux/list.h>
#include <linux/lm_interface.h>
#include <linux/wait.h>
#include <linux/module.h>
#include <linux/rwsem.h>
#include <asm/uaccess.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

#include "gfs2.h"
#include "incore.h"
#include "glock.h"
#include "glops.h"
#include "inode.h"
#include "lops.h"
#include "meta_io.h"
#include "quota.h"
#include "super.h"
#include "util.h"

struct gfs2_gl_hash_bucket {
        struct hlist_head hb_list;
};

struct gfs2_glock_iter {
        int hash;                     /* hash bucket index        */
        struct gfs2_sbd *sdp;         /* incore superblock        */
        struct gfs2_glock *gl;        /* current glock struct     */
        char string[512];             /* scratch space            */
};

typedef void (*glock_examiner) (struct gfs2_glock *gl);

static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
static DECLARE_RWSEM(gfs2_umount_flush_sem);
static struct dentry *gfs2_root;
static struct task_struct *scand_process;
static unsigned int scand_secs = 5;
static struct workqueue_struct *glock_workqueue;

#define GFS2_GL_HASH_SHIFT      15
#define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
#define GFS2_GL_HASH_MASK       (GFS2_GL_HASH_SIZE - 1)

static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
/*
 * Despite what you might think, the numbers below are not arbitrary :-)
 * They are taken from the ipv4 routing hash code, which is well tested
 * and thus should be nearly optimal. Later on we might tweak the numbers
 * but for now this should be fine.
 *
 * The reason for putting the locks in a separate array from the list heads
 * is that we can have fewer locks than list heads and save memory. We use
 * the same hash function for both, but with a different hash mask.
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)

#ifdef CONFIG_LOCKDEP
# define GL_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define GL_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define GL_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define GL_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define GL_HASH_LOCK_SZ       512
# else
#  define GL_HASH_LOCK_SZ       256
# endif
#endif

/* We never want more locks than chains */
#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
# undef GL_HASH_LOCK_SZ
# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
#endif

static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];

static inline rwlock_t *gl_lock_addr(unsigned int x)
{
        return &gl_hash_locks[x & (GL_HASH_LOCK_SZ - 1)];
}

#else /* not SMP, so no spinlocks required */
static inline rwlock_t *gl_lock_addr(unsigned int x)
{
        return NULL;
}
#endif
/**
 * gl_hash() - Turn glock number into hash bucket number
 * @lock: The glock number
 *
 * Returns: The number of the corresponding hash bucket
 */

static unsigned int gl_hash(const struct gfs2_sbd *sdp,
                            const struct lm_lockname *name)
{
        unsigned int h;

        h = jhash(&name->ln_number, sizeof(u64), 0);
        h = jhash(&name->ln_type, sizeof(unsigned int), h);
        h = jhash(&sdp, sizeof(struct gfs2_sbd *), h);
        h &= GFS2_GL_HASH_MASK;

        return h;
}

/**
 * glock_free() - Perform a few checks and then release struct gfs2_glock
 * @gl: The glock to release
 *
 * Also calls lock module to release its internal structure for this glock.
 *
 */

static void glock_free(struct gfs2_glock *gl)
{
        struct gfs2_sbd *sdp = gl->gl_sbd;
        struct inode *aspace = gl->gl_aspace;

        if (sdp->sd_lockstruct.ls_ops->lm_put_lock)
                sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);

        if (aspace)
                gfs2_aspace_put(aspace);

        kmem_cache_free(gfs2_glock_cachep, gl);
}

/**
 * gfs2_glock_hold() - increment reference count on glock
 * @gl: The glock to hold
 *
 */

static void gfs2_glock_hold(struct gfs2_glock *gl)
{
        atomic_inc(&gl->gl_ref);
}

/**
 * gfs2_glock_put() - Decrement reference count on glock
 * @gl: The glock to put
 *
 */

int gfs2_glock_put(struct gfs2_glock *gl)
{
        int rv = 0;

        write_lock(gl_lock_addr(gl->gl_hash));
        if (atomic_dec_and_test(&gl->gl_ref)) {
                hlist_del(&gl->gl_list);
                write_unlock(gl_lock_addr(gl->gl_hash));
                GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
                GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim));
                GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
                glock_free(gl);
                rv = 1;
                goto out;
        }
        write_unlock(gl_lock_addr(gl->gl_hash));
out:
        return rv;
}
/**
 * search_bucket() - Find struct gfs2_glock by lock number
 * @bucket: the bucket to search
 * @name: The lock name
 *
 * Returns: NULL, or the struct gfs2_glock with the requested number
 */

static struct gfs2_glock *search_bucket(unsigned int hash,
                                        const struct gfs2_sbd *sdp,
                                        const struct lm_lockname *name)
{
        struct gfs2_glock *gl;
        struct hlist_node *h;

        hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) {
                if (!lm_name_equal(&gl->gl_name, name))
                        continue;
                if (gl->gl_sbd != sdp)
                        continue;

                atomic_inc(&gl->gl_ref);

                return gl;
        }

        return NULL;
}

/**
 * gfs2_glock_find() - Find glock by lock number
 * @sdp: The GFS2 superblock
 * @name: The lock name
 *
 * Returns: NULL, or the struct gfs2_glock with the requested number
 */

static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
                                          const struct lm_lockname *name)
{
        unsigned int hash = gl_hash(sdp, name);
        struct gfs2_glock *gl;

        read_lock(gl_lock_addr(hash));
        gl = search_bucket(hash, sdp, name);
        read_unlock(gl_lock_addr(hash));

        return gl;
}
/**
 * may_grant - check if it's ok to grant a new lock
 * @gl: The glock
 * @gh: The lock request which we wish to grant
 *
 * Returns: true if it's ok to grant the lock
 */

static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
{
        const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list);
        if ((gh->gh_state == LM_ST_EXCLUSIVE ||
             gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head)
                return 0;
        if (gl->gl_state == gh->gh_state)
                return 1;
        if (gh->gh_flags & GL_EXACT)
                return 0;
        if (gl->gl_state == LM_ST_EXCLUSIVE) {
                if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
                        return 1;
                if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
                        return 1;
        }
        if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
                return 1;
        return 0;
}

static void gfs2_holder_wake(struct gfs2_holder *gh)
{
        clear_bit(HIF_WAIT, &gh->gh_iflags);
        smp_mb__after_clear_bit();
        wake_up_bit(&gh->gh_iflags, HIF_WAIT);
}
/**
 * do_promote - promote as many requests as possible on the current queue
 * @gl: The glock
 *
 * Returns: true if there is a blocked holder at the head of the list
 */

static int do_promote(struct gfs2_glock *gl)
{
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        struct gfs2_holder *gh, *tmp;
        int ret;

restart:
        list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
                if (test_bit(HIF_HOLDER, &gh->gh_iflags))
                        continue;
                if (may_grant(gl, gh)) {
                        if (gh->gh_list.prev == &gl->gl_holders &&
                            glops->go_lock) {
                                spin_unlock(&gl->gl_spin);
                                /* FIXME: eliminate this eventually */
                                ret = glops->go_lock(gh);
                                spin_lock(&gl->gl_spin);
                                if (ret) {
                                        gh->gh_error = ret;
                                        list_del_init(&gh->gh_list);
                                        gfs2_holder_wake(gh);
                                        goto restart;
                                }
                                set_bit(HIF_HOLDER, &gh->gh_iflags);
                                gfs2_holder_wake(gh);
                                goto restart;
                        }
                        set_bit(HIF_HOLDER, &gh->gh_iflags);
                        gfs2_holder_wake(gh);
                        continue;
                }
                if (gh->gh_list.prev == &gl->gl_holders)
                        return 1;
                break;
        }
        return 0;
}

/**
 * do_error - Something unexpected has happened during a lock request
 *
 */

static inline void do_error(struct gfs2_glock *gl, const int ret)
{
        struct gfs2_holder *gh, *tmp;

        list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
                if (test_bit(HIF_HOLDER, &gh->gh_iflags))
                        continue;
                if (ret & LM_OUT_ERROR)
                        gh->gh_error = -EIO;
                else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
                        gh->gh_error = GLR_TRYFAILED;
                else
                        continue;
                list_del_init(&gh->gh_list);
                gfs2_holder_wake(gh);
        }
}

/**
 * find_first_waiter - find the first gh that's waiting for the glock
 * @gl: the glock
 */

static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl)
{
        struct gfs2_holder *gh;

        list_for_each_entry(gh, &gl->gl_holders, gh_list) {
                if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
                        return gh;
        }
        return NULL;
}
/**
 * state_change - record that the glock is now in a different state
 * @gl: the glock
 * @new_state: the new state
 *
 */

static void state_change(struct gfs2_glock *gl, unsigned int new_state)
{
        int held1, held2;

        held1 = (gl->gl_state != LM_ST_UNLOCKED);
        held2 = (new_state != LM_ST_UNLOCKED);

        if (held1 != held2) {
                if (held2)
                        gfs2_glock_hold(gl);
                else
                        gfs2_glock_put(gl);
        }

        gl->gl_state = new_state;
        gl->gl_tchange = jiffies;
}

static void gfs2_demote_wake(struct gfs2_glock *gl)
{
        gl->gl_demote_state = LM_ST_EXCLUSIVE;
        clear_bit(GLF_DEMOTE, &gl->gl_flags);
        smp_mb__after_clear_bit();
        wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
}
/**
 * finish_xmote - The DLM has replied to one of our lock requests
 * @gl: The glock
 * @ret: The status from the DLM
 *
 */

static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
{
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        struct gfs2_holder *gh;
        unsigned state = ret & LM_OUT_ST_MASK;

        spin_lock(&gl->gl_spin);
        state_change(gl, state);
        gh = find_first_waiter(gl);

        /* Demote to UN request arrived during demote to SH or DF */
        if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
            state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED)
                gl->gl_target = LM_ST_UNLOCKED;

        /* Check for state != intended state */
        if (unlikely(state != gl->gl_target)) {
                if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
                        /* move to back of queue and try next entry */
                        if (ret & LM_OUT_CANCELED) {
                                if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0)
                                        list_move_tail(&gh->gh_list, &gl->gl_holders);
                                gh = find_first_waiter(gl);
                                gl->gl_target = gh->gh_state;
                                goto retry;
                        }
                        /* Some error or failed "try lock" - report it */
                        if ((ret & LM_OUT_ERROR) ||
                            (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
                                gl->gl_target = gl->gl_state;
                                do_error(gl, ret);
                                goto out;
                        }
                }
                switch(state) {
                /* Unlocked due to conversion deadlock, try again */
                case LM_ST_UNLOCKED:
retry:
                        do_xmote(gl, gh, gl->gl_target);
                        break;
                /* Conversion fails, unlock and try again */
                case LM_ST_SHARED:
                case LM_ST_DEFERRED:
                        do_xmote(gl, gh, LM_ST_UNLOCKED);
                        break;
                default: /* Everything else */
                        printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state);
                        GLOCK_BUG_ON(gl, 1);
                }
                spin_unlock(&gl->gl_spin);
                gfs2_glock_put(gl);
                return;
        }

        /* Fast path - we got what we asked for */
        if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
                gfs2_demote_wake(gl);
        if (state != LM_ST_UNLOCKED) {
                if (glops->go_xmote_bh) {
                        int rv;
                        spin_unlock(&gl->gl_spin);
                        rv = glops->go_xmote_bh(gl, gh);
                        if (rv == -EAGAIN)
                                return;
                        spin_lock(&gl->gl_spin);
                        if (rv) {
                                do_error(gl, rv);
                                goto out;
                        }
                }
                do_promote(gl);
        }
out:
        clear_bit(GLF_LOCK, &gl->gl_flags);
        spin_unlock(&gl->gl_spin);
        gfs2_glock_put(gl);
}
static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
                                 unsigned int cur_state, unsigned int req_state,
                                 unsigned int flags)
{
        int ret = LM_OUT_ERROR;

        if (!sdp->sd_lockstruct.ls_ops->lm_lock)
                return req_state == LM_ST_UNLOCKED ? 0 : req_state;

        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
                ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
                                                         req_state, flags);
        return ret;
}
/**
 * do_xmote - Calls the DLM to change the state of a lock
 * @gl: The lock state
 * @gh: The holder (only for promotes)
 * @target: The target lock state
 *
 */

static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
{
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        struct gfs2_sbd *sdp = gl->gl_sbd;
        unsigned int lck_flags = gh ? gh->gh_flags : 0;
        int ret;

        lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
                      LM_FLAG_PRIORITY);
        BUG_ON(gl->gl_state == target);
        BUG_ON(gl->gl_state == gl->gl_target);
        if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
            glops->go_inval) {
                set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
                do_error(gl, 0); /* Fail queued try locks */
        }
        spin_unlock(&gl->gl_spin);
        if (glops->go_xmote_th)
                glops->go_xmote_th(gl);
        if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
                glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
        clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);

        gfs2_glock_hold(gl);
        if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED ||
            gl->gl_state == LM_ST_DEFERRED) &&
            !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
                lck_flags |= LM_FLAG_TRY_1CB;
        ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags);

        if (!(ret & LM_OUT_ASYNC)) {
                finish_xmote(gl, ret);
                gfs2_glock_hold(gl);
                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                        gfs2_glock_put(gl);
        } else {
                GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
        }

        spin_lock(&gl->gl_spin);
}

/**
 * find_first_holder - find the first "holder" gh
 * @gl: the glock
 */

static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
{
        struct gfs2_holder *gh;

        if (!list_empty(&gl->gl_holders)) {
                gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
                if (test_bit(HIF_HOLDER, &gh->gh_iflags))
                        return gh;
        }
        return NULL;
}
/**
 * run_queue - do all outstanding tasks related to a glock
 * @gl: The glock in question
 * @nonblock: True if we must not block in run_queue
 *
 */

static void run_queue(struct gfs2_glock *gl, const int nonblock)
{
        struct gfs2_holder *gh = NULL;

        if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
                return;

        GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));

        if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
            gl->gl_demote_state != gl->gl_state) {
                if (find_first_holder(gl))
                        goto out;
                if (nonblock)
                        goto out_sched;
                set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
                GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
                gl->gl_target = gl->gl_demote_state;
        } else {
                if (test_bit(GLF_DEMOTE, &gl->gl_flags))
                        gfs2_demote_wake(gl);
                if (do_promote(gl) == 0)
                        goto out;
                gh = find_first_waiter(gl);
                gl->gl_target = gh->gh_state;
                if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
                        do_error(gl, 0); /* Fail queued try locks */
        }
        do_xmote(gl, gh, gl->gl_target);
        return;

out_sched:
        gfs2_glock_hold(gl);
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                gfs2_glock_put(gl);
out:
        clear_bit(GLF_LOCK, &gl->gl_flags);
}
static void glock_work_func(struct work_struct *work)
{
        unsigned long delay = 0;
        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);

        if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
                finish_xmote(gl, gl->gl_reply);
        spin_lock(&gl->gl_spin);
        if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
            gl->gl_state != LM_ST_UNLOCKED &&
            gl->gl_demote_state != LM_ST_EXCLUSIVE) {
                unsigned long holdtime, now = jiffies;
                holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
                if (time_before(now, holdtime))
                        delay = holdtime - now;
                set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags);
        }
        run_queue(gl, 0);
        spin_unlock(&gl->gl_spin);
        if (!delay ||
            queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
                gfs2_glock_put(gl);
}
static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
                            void **lockp)
{
        int error = -EIO;

        if (!sdp->sd_lockstruct.ls_ops->lm_get_lock)
                return 0;
        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
                error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
                                sdp->sd_lockstruct.ls_lockspace, name, lockp);
        return error;
}
/**
 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
 * @sdp: The GFS2 superblock
 * @number: the lock number
 * @glops: The glock_operations to use
 * @create: If 0, don't create the glock if it doesn't exist
 * @glp: the glock is returned here
 *
 * This does not lock a glock, just finds/creates structures for one.
 *
 * Returns: errno
 */

int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
                   const struct gfs2_glock_operations *glops, int create,
                   struct gfs2_glock **glp)
{
        struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
        struct gfs2_glock *gl, *tmp;
        unsigned int hash = gl_hash(sdp, &name);
        int error;

        read_lock(gl_lock_addr(hash));
        gl = search_bucket(hash, sdp, &name);
        read_unlock(gl_lock_addr(hash));

        if (gl || !create) {
                *glp = gl;
                return 0;
        }

        gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
        if (!gl)
                return -ENOMEM;

        gl->gl_flags = 0;
        gl->gl_name = name;
        atomic_set(&gl->gl_ref, 1);
        gl->gl_state = LM_ST_UNLOCKED;
        gl->gl_target = LM_ST_UNLOCKED;
        gl->gl_demote_state = LM_ST_EXCLUSIVE;
        gl->gl_hash = hash;
        gl->gl_ops = glops;
        gl->gl_stamp = jiffies;
        gl->gl_tchange = jiffies;
        gl->gl_object = NULL;
        gl->gl_sbd = sdp;
        gl->gl_aspace = NULL;
        INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);

        /* If this glock protects actual on-disk data or metadata blocks,
           create a VFS inode to manage the pages/buffers holding them. */
        if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) {
                gl->gl_aspace = gfs2_aspace_get(sdp);
                if (!gl->gl_aspace) {
                        error = -ENOMEM;
                        goto fail;
                }
        }

        error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
        if (error)
                goto fail_aspace;

        write_lock(gl_lock_addr(hash));
        tmp = search_bucket(hash, sdp, &name);
        if (tmp) {
                write_unlock(gl_lock_addr(hash));
                glock_free(gl);
                gl = tmp;
        } else {
                hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list);
                write_unlock(gl_lock_addr(hash));
        }

        *glp = gl;

        return 0;

fail_aspace:
        if (gl->gl_aspace)
                gfs2_aspace_put(gl->gl_aspace);
fail:
        kmem_cache_free(gfs2_glock_cachep, gl);
        return error;
}
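
/*
 * Illustrative sketch (not part of this file): a caller that only needs the
 * glock structure, not the lock itself, pairs gfs2_glock_get() with
 * gfs2_glock_put() to balance the reference the lookup takes. The function
 * name, the choice of gfs2_inode_glops and the block number parameter are
 * assumptions made for the example.
 */
static int example_peek_glock(struct gfs2_sbd *sdp, u64 blkno)
{
        struct gfs2_glock *gl;
        int error;

        error = gfs2_glock_get(sdp, blkno, &gfs2_inode_glops, CREATE, &gl);
        if (error)
                return error;
        /* ... inspect gl->gl_state or attach gl->gl_object here ... */
        gfs2_glock_put(gl); /* drop the reference taken by gfs2_glock_get() */
        return 0;
}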
/**
 * gfs2_holder_init - initialize a struct gfs2_holder in the default way
 * @gl: the glock
 * @state: the state we're requesting
 * @flags: the modifier flags
 * @gh: the holder structure
 *
 */

void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
                      struct gfs2_holder *gh)
{
        INIT_LIST_HEAD(&gh->gh_list);
        gh->gh_gl = gl;
        gh->gh_ip = (unsigned long)__builtin_return_address(0);
        gh->gh_owner_pid = get_pid(task_pid(current));
        gh->gh_state = state;
        gh->gh_flags = flags;
        gh->gh_error = 0;
        gh->gh_iflags = 0;
        gfs2_glock_hold(gl);
}

/**
 * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
 * @state: the state we're requesting
 * @flags: the modifier flags
 * @gh: the holder structure
 *
 * Don't mess with the glock.
 *
 */

void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
{
        gh->gh_state = state;
        gh->gh_flags = flags;
        gh->gh_iflags = 0;
        gh->gh_ip = (unsigned long)__builtin_return_address(0);
}

/**
 * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
 * @gh: the holder structure
 *
 */

void gfs2_holder_uninit(struct gfs2_holder *gh)
{
        put_pid(gh->gh_owner_pid);
        gfs2_glock_put(gh->gh_gl);
        gh->gh_gl = NULL;
        gh->gh_ip = 0;
}
static int just_schedule(void *word)
{
        schedule();
        return 0;
}

static void wait_on_holder(struct gfs2_holder *gh)
{
        might_sleep();
        wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE);
}

static void wait_on_demote(struct gfs2_glock *gl)
{
        might_sleep();
        wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE);
}
/**
 * handle_callback - process a demote request
 * @gl: the glock
 * @state: the state the caller wants us to change to
 *
 * There are only two requests that we are going to see in actual
 * practice: LM_ST_SHARED and LM_ST_UNLOCKED
 */

static void handle_callback(struct gfs2_glock *gl, unsigned int state,
                            int remote, unsigned long delay)
{
        int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;

        set_bit(bit, &gl->gl_flags);
        if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
                gl->gl_demote_state = state;
                gl->gl_demote_time = jiffies;
                if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
                    gl->gl_object)
                        gfs2_glock_schedule_for_reclaim(gl);
        } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
                   gl->gl_demote_state != state) {
                gl->gl_demote_state = LM_ST_UNLOCKED;
        }
}
/**
 * gfs2_glock_wait - wait on a glock acquisition
 * @gh: the glock holder
 *
 * Returns: 0 on success
 */

int gfs2_glock_wait(struct gfs2_holder *gh)
{
        wait_on_holder(gh);
        return gh->gh_error;
}

void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
{
        va_list args;

        va_start(args, fmt);
        if (seq) {
                struct gfs2_glock_iter *gi = seq->private;
                vsprintf(gi->string, fmt, args);
                seq_printf(seq, gi->string);
        } else {
                printk(KERN_ERR " ");
                vprintk(fmt, args);
        }
        va_end(args);
}
/**
 * add_to_queue - Add a holder to the wait queue (but look for recursion)
 * @gh: the holder structure to add
 *
 * Eventually we should move the recursive locking trap to a
 * debugging option or something like that. This is the fast
 * path and needs to have the minimum number of distractions.
 *
 */

static inline void add_to_queue(struct gfs2_holder *gh)
{
        struct gfs2_glock *gl = gh->gh_gl;
        struct gfs2_sbd *sdp = gl->gl_sbd;
        struct list_head *insert_pt = NULL;
        struct gfs2_holder *gh2;
        int try_lock = 0;

        BUG_ON(gh->gh_owner_pid == NULL);
        if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
                BUG();

        if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
                if (test_bit(GLF_LOCK, &gl->gl_flags))
                        try_lock = 1;
                if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
                        goto fail;
        }

        list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
                if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
                    (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
                        goto trap_recursive;
                if (try_lock &&
                    !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) &&
                    !may_grant(gl, gh)) {
fail:
                        gh->gh_error = GLR_TRYFAILED;
                        gfs2_holder_wake(gh);
                        return;
                }
                if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
                        continue;
                if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
                        insert_pt = &gh2->gh_list;
        }
        if (likely(insert_pt == NULL)) {
                list_add_tail(&gh->gh_list, &gl->gl_holders);
                if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
                        goto do_cancel;
                return;
        }
        list_add_tail(&gh->gh_list, insert_pt);
do_cancel:
        gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
        if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
                spin_unlock(&gl->gl_spin);
                if (sdp->sd_lockstruct.ls_ops->lm_cancel)
                        sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
                spin_lock(&gl->gl_spin);
        }
        return;

trap_recursive:
        print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip);
        printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
        printk(KERN_ERR "lock type: %d req lock state: %d\n",
               gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
        print_symbol(KERN_ERR "new: %s\n", gh->gh_ip);
        printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
        printk(KERN_ERR "lock type: %d req lock state: %d\n",
               gh->gh_gl->gl_name.ln_type, gh->gh_state);
        __dump_glock(NULL, gl);
        BUG();
}
/**
 * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
 * @gh: the holder structure
 *
 * if (gh->gh_flags & GL_ASYNC), this never returns an error
 *
 * Returns: 0, GLR_TRYFAILED, or errno on failure
 */

int gfs2_glock_nq(struct gfs2_holder *gh)
{
        struct gfs2_glock *gl = gh->gh_gl;
        struct gfs2_sbd *sdp = gl->gl_sbd;
        int error = 0;

        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
                return -EIO;

        spin_lock(&gl->gl_spin);
        add_to_queue(gh);
        run_queue(gl, 1);
        spin_unlock(&gl->gl_spin);

        if (!(gh->gh_flags & GL_ASYNC))
                error = gfs2_glock_wait(gh);

        return error;
}
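
/*
 * Illustrative sketch (not part of this file): asynchronous acquisition.
 * With GL_ASYNC set, gfs2_glock_nq() queues the holder and returns at once;
 * the caller can later poll with gfs2_glock_poll() and collect the result
 * with gfs2_glock_wait(). The function name, the glock passed in and the
 * requested state are assumptions made for the example.
 */
static int example_async_lock(struct gfs2_glock *gl)
{
        struct gfs2_holder gh;
        int error;

        gfs2_holder_init(gl, LM_ST_SHARED, GL_ASYNC, &gh);
        gfs2_glock_nq(&gh); /* queues the request and returns immediately */

        /* ... do other work; gfs2_glock_poll(&gh) says when it is ready ... */

        error = gfs2_glock_wait(&gh); /* collects gh.gh_error */
        if (error == 0)
                gfs2_glock_dq(&gh);
        gfs2_holder_uninit(&gh);
        return error;
}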
/**
 * gfs2_glock_poll - poll to see if an async request has been completed
 * @gh: the holder
 *
 * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
 */

int gfs2_glock_poll(struct gfs2_holder *gh)
{
        return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
}
/**
 * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
 * @gh: the glock holder
 *
 */

void gfs2_glock_dq(struct gfs2_holder *gh)
{
        struct gfs2_glock *gl = gh->gh_gl;
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        unsigned delay = 0;
        int fast_path = 0;

        spin_lock(&gl->gl_spin);
        if (gh->gh_flags & GL_NOCACHE)
                handle_callback(gl, LM_ST_UNLOCKED, 0, 0);

        list_del_init(&gh->gh_list);
        if (find_first_holder(gl) == NULL) {
                if (glops->go_unlock) {
                        GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
                        spin_unlock(&gl->gl_spin);
                        glops->go_unlock(gh);
                        spin_lock(&gl->gl_spin);
                        clear_bit(GLF_LOCK, &gl->gl_flags);
                }
                gl->gl_stamp = jiffies;
                if (list_empty(&gl->gl_holders) &&
                    !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
                    !test_bit(GLF_DEMOTE, &gl->gl_flags))
                        fast_path = 1;
        }
        spin_unlock(&gl->gl_spin);
        if (likely(fast_path))
                return;

        gfs2_glock_hold(gl);
        if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
            !test_bit(GLF_DEMOTE, &gl->gl_flags))
                delay = gl->gl_ops->go_min_hold_time;
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
                gfs2_glock_put(gl);
}
void gfs2_glock_dq_wait(struct gfs2_holder *gh)
{
        struct gfs2_glock *gl = gh->gh_gl;
        gfs2_glock_dq(gh);
        wait_on_demote(gl);
}

/**
 * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it
 * @gh: the holder structure
 *
 */

void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
{
        gfs2_glock_dq(gh);
        gfs2_holder_uninit(gh);
}
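
/*
 * Illustrative sketch (not part of this file): the common synchronous
 * pattern pairs gfs2_holder_init()/gfs2_glock_nq() with
 * gfs2_glock_dq_uninit(). The function name, the glock passed in, the
 * requested state and the zero flags are assumptions made for the example.
 */
static int example_sync_lock(struct gfs2_glock *gl)
{
        struct gfs2_holder gh;
        int error;

        gfs2_holder_init(gl, LM_ST_SHARED, 0, &gh);
        error = gfs2_glock_nq(&gh); /* blocks until granted or failed */
        if (error) {
                gfs2_holder_uninit(&gh);
                return error;
        }
        /* ... access the data protected by the glock ... */
        gfs2_glock_dq_uninit(&gh); /* releases the holder and its reference */
        return 0;
}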
/**
 * gfs2_glock_nq_num - acquire a glock based on lock number
 * @sdp: the filesystem
 * @number: the lock number
 * @glops: the glock operations for the type of glock
 * @state: the state to acquire the glock in
 * @flags: modifier flags for the acquisition
 * @gh: the struct gfs2_holder
 *
 * Returns: errno
 */

int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
                      const struct gfs2_glock_operations *glops,
                      unsigned int state, int flags, struct gfs2_holder *gh)
{
        struct gfs2_glock *gl;
        int error;

        error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
        if (!error) {
                error = gfs2_glock_nq_init(gl, state, flags, gh);
                gfs2_glock_put(gl);
        }

        return error;
}
/**
 * glock_compare - Compare two struct gfs2_glock structures for sorting
 * @arg_a: the first structure
 * @arg_b: the second structure
 *
 */

static int glock_compare(const void *arg_a, const void *arg_b)
{
        const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
        const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
        const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
        const struct lm_lockname *b = &gh_b->gh_gl->gl_name;

        if (a->ln_number > b->ln_number)
                return 1;
        if (a->ln_number < b->ln_number)
                return -1;
        BUG_ON(gh_a->gh_gl->gl_ops->go_type == gh_b->gh_gl->gl_ops->go_type);
        return 0;
}

/**
 * nq_m_sync - synchronously acquire more than one glock in deadlock free order
 * @num_gh: the number of structures
 * @ghs: an array of struct gfs2_holder structures
 *
 * Returns: 0 on success (all glocks acquired),
 *          errno on failure (no glocks acquired)
 */

static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
                     struct gfs2_holder **p)
{
        unsigned int x;
        int error = 0;

        for (x = 0; x < num_gh; x++)
                p[x] = &ghs[x];

        sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);

        for (x = 0; x < num_gh; x++) {
                p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);

                error = gfs2_glock_nq(p[x]);
                if (error) {
                        while (x--)
                                gfs2_glock_dq(p[x]);
                        break;
                }
        }

        return error;
}
/**
 * gfs2_glock_nq_m - acquire multiple glocks
 * @num_gh: the number of structures
 * @ghs: an array of struct gfs2_holder structures
 *
 *
 * Returns: 0 on success (all glocks acquired),
 *          errno on failure (no glocks acquired)
 */

int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
{
        struct gfs2_holder *tmp[4];
        struct gfs2_holder **pph = tmp;
        int error = 0;

        switch(num_gh) {
        case 0:
                return 0;
        case 1:
                ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
                return gfs2_glock_nq(ghs);
        default:
                if (num_gh <= 4)
                        break;
                pph = kmalloc(num_gh * sizeof(struct gfs2_holder *), GFP_NOFS);
                if (!pph)
                        return -ENOMEM;
        }

        error = nq_m_sync(num_gh, ghs, pph);

        if (pph != tmp)
                kfree(pph);

        return error;
}
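
/*
 * Illustrative sketch (not part of this file): acquiring two glocks in one
 * call so that nq_m_sync() can sort them and avoid deadlock between nodes
 * taking the same pair. The function name, the glocks passed in and the
 * exclusive states are assumptions made for the example.
 */
static int example_lock_pair(struct gfs2_glock *gl1, struct gfs2_glock *gl2)
{
        struct gfs2_holder ghs[2];
        int error;

        gfs2_holder_init(gl1, LM_ST_EXCLUSIVE, 0, &ghs[0]);
        gfs2_holder_init(gl2, LM_ST_EXCLUSIVE, 0, &ghs[1]);

        error = gfs2_glock_nq_m(2, ghs); /* all or nothing */
        if (error == 0)
                gfs2_glock_dq_m(2, ghs); /* release both when done */

        gfs2_holder_uninit(&ghs[0]);
        gfs2_holder_uninit(&ghs[1]);
        return error;
}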
/**
 * gfs2_glock_dq_m - release multiple glocks
 * @num_gh: the number of structures
 * @ghs: an array of struct gfs2_holder structures
 *
 */

void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
{
        unsigned int x;

        for (x = 0; x < num_gh; x++)
                gfs2_glock_dq(&ghs[x]);
}

/**
 * gfs2_glock_dq_uninit_m - release multiple glocks
 * @num_gh: the number of structures
 * @ghs: an array of struct gfs2_holder structures
 *
 */

void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
{
        unsigned int x;

        for (x = 0; x < num_gh; x++)
                gfs2_glock_dq_uninit(&ghs[x]);
}

static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
{
	int error = -EIO;
	if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb)
		return 0;
	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
		error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
	return error;
}

/**
 * gfs2_lvb_hold - attach a LVB to a glock
 * @gl: The glock in question
 *
 */

int gfs2_lvb_hold(struct gfs2_glock *gl)
{
	int error;

	if (!atomic_read(&gl->gl_lvb_count)) {
		error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
		if (error)
			return error;
		gfs2_glock_hold(gl);
	}
	atomic_inc(&gl->gl_lvb_count);

	return 0;
}

/**
 * gfs2_lvb_unhold - detach a LVB from a glock
 * @gl: The glock in question
 *
 */

void gfs2_lvb_unhold(struct gfs2_glock *gl)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;

	gfs2_glock_hold(gl);
	gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
	if (atomic_dec_and_test(&gl->gl_lvb_count)) {
		if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb)
			sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
		gl->gl_lvb = NULL;
		gfs2_glock_put(gl);
	}
	gfs2_glock_put(gl);
}
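
/*
 * Sketch of the hold/unhold pairing (hypothetical caller; "buf" is an
 * assumption for illustration): only the first hold asks the lock module
 * for the buffer and only the last unhold releases it, so nesting is safe
 * as long as every gfs2_lvb_hold() is balanced by a gfs2_lvb_unhold().
 *
 *	error = gfs2_lvb_hold(gl);
 *	if (error)
 *		return error;
 *	memcpy(buf, gl->gl_lvb, sizeof(buf));
 *	gfs2_lvb_unhold(gl);
 */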

static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
			unsigned int state)
{
	struct gfs2_glock *gl;
	unsigned long delay = 0;
	unsigned long holdtime;
	unsigned long now = jiffies;

	gl = gfs2_glock_find(sdp, name);
	if (!gl)
		return;

	holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
	if (time_before(now, holdtime))
		delay = holdtime - now;
	if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
		delay = gl->gl_ops->go_min_hold_time;

	spin_lock(&gl->gl_spin);
	handle_callback(gl, state, 1, delay);
	spin_unlock(&gl->gl_spin);
	if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
		gfs2_glock_put(gl);
}
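
/*
 * Worked example of the delay computation above (illustrative numbers,
 * HZ = 1000): with go_min_hold_time of HZ/10 = 100 jiffies and a glock
 * whose state last changed 30 jiffies ago, holdtime = gl_tchange + 100 =
 * now + 70, so the demote work is queued 70 jiffies out rather than run
 * immediately. That enforced minimum hold time is what lets a node get
 * useful work done under heavy cross-node contention.
 */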

/**
 * gfs2_glock_cb - Callback used by locking module
 * @sdp: Pointer to the superblock
 * @type: Type of callback
 * @data: Type dependent data pointer
 *
 * Called by the locking module when it wants to tell us something.
 * Either we need to drop a lock, one of our ASYNC requests completed, or
 * a journal from another client needs to be recovered.
 */

void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
{
	struct gfs2_sbd *sdp = cb_data;

	switch (type) {
	case LM_CB_NEED_E:
		blocking_cb(sdp, data, LM_ST_UNLOCKED);
		return;
	case LM_CB_NEED_D:
		blocking_cb(sdp, data, LM_ST_DEFERRED);
		return;
	case LM_CB_NEED_S:
		blocking_cb(sdp, data, LM_ST_SHARED);
		return;
	case LM_CB_ASYNC: {
		struct lm_async_cb *async = data;
		struct gfs2_glock *gl;

		down_read(&gfs2_umount_flush_sem);
		gl = gfs2_glock_find(sdp, &async->lc_name);
		if (gfs2_assert_warn(sdp, gl))
			return;
		gl->gl_reply = async->lc_ret;
		set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
			gfs2_glock_put(gl);
		up_read(&gfs2_umount_flush_sem);
		return;
	}
	case LM_CB_NEED_RECOVERY:
		gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
		if (sdp->sd_recoverd_process)
			wake_up_process(sdp->sd_recoverd_process);
		return;
	default:
		gfs2_assert_warn(sdp, 0);
		return;
	}
}
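
/*
 * Sketch of how a lock module might drive this callback (the lock-module
 * side shown here is hypothetical; only gfs2_glock_cb() and the LM_CB_*
 * and LM_ST_* values come from the lock module interface):
 *
 *	struct lm_async_cb async = {
 *		.lc_name = name,
 *		.lc_ret = LM_ST_SHARED,
 *	};
 *
 *	gfs2_glock_cb(sdp, LM_CB_NEED_S, &name);	(please demote to SH)
 *	gfs2_glock_cb(sdp, LM_CB_ASYNC, &async);	(async request completed)
 */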

/**
 * demote_ok - Check to see if it's ok to unlock a glock
 * @gl: the glock
 *
 * Returns: 1 if it's ok
 */

static int demote_ok(struct gfs2_glock *gl)
{
	const struct gfs2_glock_operations *glops = gl->gl_ops;
	int demote = 1;

	if (test_bit(GLF_STICKY, &gl->gl_flags))
		demote = 0;
	else if (glops->go_demote_ok)
		demote = glops->go_demote_ok(gl);

	return demote;
}
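
/*
 * Illustrative sketch (not code from this file): a glock type that should
 * never be demoted while its protected object is still attached could
 * supply a ->go_demote_ok hook along these lines; the function name and
 * the gl_object test are assumptions for illustration only.
 *
 *	static int example_go_demote_ok(struct gfs2_glock *gl)
 *	{
 *		return gl->gl_object == NULL;
 *	}
 */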

/**
 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
 * @gl: the glock
 *
 */

void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;

	spin_lock(&sdp->sd_reclaim_lock);
	if (list_empty(&gl->gl_reclaim)) {
		gfs2_glock_hold(gl);
		list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
		atomic_inc(&sdp->sd_reclaim_count);
		spin_unlock(&sdp->sd_reclaim_lock);
		wake_up(&sdp->sd_reclaim_wq);
	} else
		spin_unlock(&sdp->sd_reclaim_lock);
}

/**
 * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
 * @sdp: the filesystem
 *
 * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
 * different glock and we notice that there are a lot of glocks in the
 * reclaim list.
 *
 */

void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
{
	struct gfs2_glock *gl;
	int done_callback = 0;

	spin_lock(&sdp->sd_reclaim_lock);
	if (list_empty(&sdp->sd_reclaim_list)) {
		spin_unlock(&sdp->sd_reclaim_lock);
		return;
	}
	gl = list_entry(sdp->sd_reclaim_list.next,
			struct gfs2_glock, gl_reclaim);
	list_del_init(&gl->gl_reclaim);
	spin_unlock(&sdp->sd_reclaim_lock);

	atomic_dec(&sdp->sd_reclaim_count);
	atomic_inc(&sdp->sd_reclaimed);

	spin_lock(&gl->gl_spin);
	if (find_first_holder(gl) == NULL &&
	    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) {
		handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
		done_callback = 1;
	}
	spin_unlock(&gl->gl_spin);
	if (!done_callback ||
	    queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
		gfs2_glock_put(gl);
}

/**
 * examine_bucket - Call a function for each glock in a hash bucket
 * @examiner: the function
 * @sdp: the filesystem
 * @hash: the hash bucket index
 *
 * Returns: 1 if the bucket has entries
 */

static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
			  unsigned int hash)
{
	struct gfs2_glock *gl, *prev = NULL;
	int has_entries = 0;
	struct hlist_head *head = &gl_hash_table[hash].hb_list;

	read_lock(gl_lock_addr(hash));
	/* Can't use hlist_for_each_entry - don't want prefetch here */
	if (hlist_empty(head))
		goto out;
	gl = list_entry(head->first, struct gfs2_glock, gl_list);
	while (1) {
		if (!sdp || gl->gl_sbd == sdp) {
			gfs2_glock_hold(gl);
			read_unlock(gl_lock_addr(hash));
			if (prev)
				gfs2_glock_put(prev);
			prev = gl;
			examiner(gl);
			has_entries = 1;
			read_lock(gl_lock_addr(hash));
		}
		if (gl->gl_list.next == NULL)
			break;
		gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
	}
out:
	read_unlock(gl_lock_addr(hash));
	if (prev)
		gfs2_glock_put(prev);
	cond_resched();
	return has_entries;
}

/**
 * scan_glock - look at a glock and see if we can reclaim it
 * @gl: the glock to look at
 *
 */

static void scan_glock(struct gfs2_glock *gl)
{
	if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
		return;
	if (test_bit(GLF_LOCK, &gl->gl_flags))
		return;

	spin_lock(&gl->gl_spin);
	if (find_first_holder(gl) == NULL &&
	    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
		gfs2_glock_schedule_for_reclaim(gl);
	spin_unlock(&gl->gl_spin);
}

/**
 * clear_glock - look at a glock and see if we can free it from glock cache
 * @gl: the glock to look at
 *
 */

static void clear_glock(struct gfs2_glock *gl)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;
	int released;

	spin_lock(&sdp->sd_reclaim_lock);
	if (!list_empty(&gl->gl_reclaim)) {
		list_del_init(&gl->gl_reclaim);
		atomic_dec(&sdp->sd_reclaim_count);
		spin_unlock(&sdp->sd_reclaim_lock);
		released = gfs2_glock_put(gl);
		gfs2_assert(sdp, !released);
	} else {
		spin_unlock(&sdp->sd_reclaim_lock);
	}

	spin_lock(&gl->gl_spin);
	if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
		handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
	spin_unlock(&gl->gl_spin);
	gfs2_glock_hold(gl);
	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
		gfs2_glock_put(gl);
}

/**
 * gfs2_gl_hash_clear - Empty out the glock hash table
 * @sdp: the filesystem
 *
 * Called when unmounting the filesystem.
 */

void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
{
	unsigned long t;
	unsigned int x;
	int cont;

	t = jiffies;

	for (;;) {
		cont = 0;
		for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
			if (examine_bucket(clear_glock, sdp, x))
				cont = 1;
		}

		if (!cont)
			break;

		if (time_after_eq(jiffies,
				  t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
			fs_warn(sdp, "Unmount seems to be stalled. "
				     "Dumping lock state...\n");
			gfs2_dump_lockstate(sdp);
			t = jiffies;
		}

		down_write(&gfs2_umount_flush_sem);
		invalidate_inodes(sdp->sd_vfs);
		up_write(&gfs2_umount_flush_sem);
		msleep(10);
	}
}

static const char *state2str(unsigned state)
{
	switch (state) {
	case LM_ST_UNLOCKED:
		return "UN";
	case LM_ST_SHARED:
		return "SH";
	case LM_ST_DEFERRED:
		return "DF";
	case LM_ST_EXCLUSIVE:
		return "EX";
	}
	return "??";
}

static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
{
	char *p = buf;
	if (flags & LM_FLAG_TRY)
		*p++ = 't';
	if (flags & LM_FLAG_TRY_1CB)
		*p++ = 'T';
	if (flags & LM_FLAG_NOEXP)
		*p++ = 'e';
	if (flags & LM_FLAG_ANY)
		*p++ = 'a';
	if (flags & LM_FLAG_PRIORITY)
		*p++ = 'p';
	if (flags & GL_ASYNC)
		*p++ = 'a';
	if (flags & GL_EXACT)
		*p++ = 'E';
	if (flags & GL_NOCACHE)
		*p++ = 'c';
	if (test_bit(HIF_HOLDER, &iflags))
		*p++ = 'H';
	if (test_bit(HIF_WAIT, &iflags))
		*p++ = 'W';
	if (test_bit(HIF_FIRST, &iflags))
		*p++ = 'F';
	*p = 0;
	return buf;
}

/**
 * dump_holder - print information about a glock holder
 * @seq: the seq_file struct
 * @gh: the glock holder
 *
 * Returns: 0 on success, -ENOBUFS when we run out of space
 */

static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
{
	struct task_struct *gh_owner = NULL;
	char buffer[KSYM_SYMBOL_LEN];
	char flags_buf[32];

	sprint_symbol(buffer, gh->gh_ip);
	if (gh->gh_owner_pid)
		gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
	gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
		       state2str(gh->gh_state),
		       hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
		       gh->gh_error,
		       gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
		       gh_owner ? gh_owner->comm : "(ended)", buffer);
	return 0;
}

static const char *gflags2str(char *buf, const unsigned long *gflags)
{
	char *p = buf;
	if (test_bit(GLF_LOCK, gflags))
		*p++ = 'l';
	if (test_bit(GLF_STICKY, gflags))
		*p++ = 's';
	if (test_bit(GLF_DEMOTE, gflags))
		*p++ = 'D';
	if (test_bit(GLF_PENDING_DEMOTE, gflags))
		*p++ = 'd';
	if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags))
		*p++ = 'p';
	if (test_bit(GLF_DIRTY, gflags))
		*p++ = 'y';
	if (test_bit(GLF_LFLUSH, gflags))
		*p++ = 'f';
	if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
		*p++ = 'i';
	if (test_bit(GLF_REPLY_PENDING, gflags))
		*p++ = 'r';
	*p = 0;
	return buf;
}

/**
 * __dump_glock - print information about a glock
 * @seq: The seq_file struct
 * @gl: the glock
 *
 * The file format is as follows:
 * One line per object, capital letters are used to indicate objects
 * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented,
 * other objects are indented by a single space and follow the glock to
 * which they are related. Fields are indicated by lower case letters
 * followed by a colon and the field value, except for strings which are in
 * [] so that it's possible to see if they are composed of spaces for
 * example. The fields are: n = number (id of the object), f = flags,
 * t = type, s = state, r = refcount, e = error, p = pid.
 *
 * Returns: 0 on success, -ENOBUFS when we run out of space
 */

static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
{
	const struct gfs2_glock_operations *glops = gl->gl_ops;
	unsigned long long dtime;
	const struct gfs2_holder *gh;
	char gflags_buf[32];
	int error = 0;

	dtime = jiffies - gl->gl_demote_time;
	dtime *= 1000000/HZ; /* demote time in uSec */
	if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
		dtime = 0;
	gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n",
		       state2str(gl->gl_state),
		       gl->gl_name.ln_type,
		       (unsigned long long)gl->gl_name.ln_number,
		       gflags2str(gflags_buf, &gl->gl_flags),
		       state2str(gl->gl_target),
		       state2str(gl->gl_demote_state), dtime,
		       atomic_read(&gl->gl_lvb_count),
		       atomic_read(&gl->gl_ail_count),
		       atomic_read(&gl->gl_ref));

	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
		error = dump_holder(seq, gh);
		if (error)
			goto out;
	}
	if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
		error = glops->go_dump(seq, gl);
out:
	return error;
}
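
/*
 * Example of the resulting output (values invented for illustration):
 *
 *	G: s:SH n:2/33025 f:l t:SH d:EX/0 l:0 a:0 r:3
 *	 H: s:SH f:EH e:0 p:3249 [df] gfs2_inode_lookup+0x14e/0x260
 *
 * Decoded with the helpers above: the glock is held SHared, the glock
 * flag 'l' is GLF_LOCK, and the holder flags 'E' and 'H' are GL_EXACT
 * and HIF_HOLDER respectively.
 */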

static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
{
	int ret;
	spin_lock(&gl->gl_spin);
	ret = __dump_glock(seq, gl);
	spin_unlock(&gl->gl_spin);
	return ret;
}

/**
 * gfs2_dump_lockstate - print out the current lockstate
 * @sdp: the filesystem
 *
 * Dumps the state of every glock belonging to @sdp to the console.
 *
 */

static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
{
	struct gfs2_glock *gl;
	struct hlist_node *h;
	unsigned int x;
	int error = 0;

	for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {

		read_lock(gl_lock_addr(x));

		hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
			if (gl->gl_sbd != sdp)
				continue;

			error = dump_glock(NULL, gl);
			if (error)
				break;
		}

		read_unlock(gl_lock_addr(x));

		if (error)
			break;
	}

	return error;
}

/**
 * gfs2_scand - Look for cached glocks and inodes to toss from memory
 * @sdp: Pointer to GFS2 superblock
 *
 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
 * See gfs2_glockd()
 */

static int gfs2_scand(void *data)
{
	unsigned x;
	unsigned delay;

	while (!kthread_should_stop()) {
		for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
			examine_bucket(scan_glock, NULL, x);
		if (freezing(current))
			refrigerator();
		delay = scand_secs;
		if (delay < 1)
			delay = 1;
		schedule_timeout_interruptible(delay * HZ);
	}

	return 0;
}

int __init gfs2_glock_init(void)
{
	unsigned i;
	for (i = 0; i < GFS2_GL_HASH_SIZE; i++) {
		INIT_HLIST_HEAD(&gl_hash_table[i].hb_list);
	}
#ifdef GL_HASH_LOCK_SZ
	for (i = 0; i < GL_HASH_LOCK_SZ; i++) {
		rwlock_init(&gl_hash_locks[i]);
	}
#endif

	scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand");
	if (IS_ERR(scand_process))
		return PTR_ERR(scand_process);

	glock_workqueue = create_workqueue("glock_workqueue");
	if (IS_ERR(glock_workqueue)) {
		kthread_stop(scand_process);
		return PTR_ERR(glock_workqueue);
	}

	return 0;
}

void gfs2_glock_exit(void)
{
	destroy_workqueue(glock_workqueue);
	kthread_stop(scand_process);
}

module_param(scand_secs, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");

static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
{
	struct gfs2_glock *gl;

restart:
	read_lock(gl_lock_addr(gi->hash));
	gl = gi->gl;
	if (gl) {
		gi->gl = hlist_entry(gl->gl_list.next,
				     struct gfs2_glock, gl_list);
	} else {
		gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
				     struct gfs2_glock, gl_list);
	}
	if (gi->gl)
		gfs2_glock_hold(gi->gl);
	read_unlock(gl_lock_addr(gi->hash));
	if (gl)
		gfs2_glock_put(gl);
	while (gi->gl == NULL) {
		gi->hash++;
		if (gi->hash >= GFS2_GL_HASH_SIZE)
			return 1;
		read_lock(gl_lock_addr(gi->hash));
		gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
				     struct gfs2_glock, gl_list);
		if (gi->gl)
			gfs2_glock_hold(gi->gl);
		read_unlock(gl_lock_addr(gi->hash));
	}

	if (gi->sdp != gi->gl->gl_sbd)
		goto restart;

	return 0;
}

static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
{
	if (gi->gl)
		gfs2_glock_put(gi->gl);
	gi->gl = NULL;
}

static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct gfs2_glock_iter *gi = seq->private;
	loff_t n = *pos;

	gi->hash = 0;

	do {
		if (gfs2_glock_iter_next(gi)) {
			gfs2_glock_iter_free(gi);
			return NULL;
		}
	} while (n--);

	return gi->gl;
}

static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
				 loff_t *pos)
{
	struct gfs2_glock_iter *gi = seq->private;

	(*pos)++;

	if (gfs2_glock_iter_next(gi)) {
		gfs2_glock_iter_free(gi);
		return NULL;
	}

	return gi->gl;
}

static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
{
	struct gfs2_glock_iter *gi = seq->private;
	gfs2_glock_iter_free(gi);
}

static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
{
	return dump_glock(seq, iter_ptr);
}

static const struct seq_operations gfs2_glock_seq_ops = {
	.start = gfs2_glock_seq_start,
	.next  = gfs2_glock_seq_next,
	.stop  = gfs2_glock_seq_stop,
	.show  = gfs2_glock_seq_show,
};

static int gfs2_debugfs_open(struct inode *inode, struct file *file)
{
	int ret = seq_open_private(file, &gfs2_glock_seq_ops,
				   sizeof(struct gfs2_glock_iter));
	if (ret == 0) {
		struct seq_file *seq = file->private_data;
		struct gfs2_glock_iter *gi = seq->private;
		gi->sdp = inode->i_private;
	}
	return ret;
}

static const struct file_operations gfs2_debug_fops = {
	.owner   = THIS_MODULE,
	.open    = gfs2_debugfs_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};

int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
{
	sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
	if (!sdp->debugfs_dir)
		return -ENOMEM;
	sdp->debugfs_dentry_glocks = debugfs_create_file("glocks",
							 S_IFREG | S_IRUGO,
							 sdp->debugfs_dir, sdp,
							 &gfs2_debug_fops);
	if (!sdp->debugfs_dentry_glocks)
		return -ENOMEM;

	return 0;
}
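
/*
 * With the file created above, the glock state of a mounted filesystem can
 * be read from userspace through debugfs, typically at a path such as
 * (assuming debugfs is mounted at /sys/kernel/debug and the lock table name
 * is "mycluster:work"):
 *
 *	/sys/kernel/debug/gfs2/mycluster:work/glocks
 *
 * Reading it walks the glock hash table via gfs2_glock_seq_ops and prints
 * one "G:" line per glock followed by its "H:" holder lines.
 */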

void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
{
	if (sdp && sdp->debugfs_dir) {
		if (sdp->debugfs_dentry_glocks) {
			debugfs_remove(sdp->debugfs_dentry_glocks);
			sdp->debugfs_dentry_glocks = NULL;
		}
		debugfs_remove(sdp->debugfs_dir);
		sdp->debugfs_dir = NULL;
	}
}

int gfs2_register_debugfs(void)
{
	gfs2_root = debugfs_create_dir("gfs2", NULL);
	return gfs2_root ? 0 : -ENOMEM;
}

void gfs2_unregister_debugfs(void)
{
	debugfs_remove(gfs2_root);
	gfs2_root = NULL;
}