2006-01-16 19:50:04 +03:00
/*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
2008-01-31 19:31:39 +03:00
 * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
2006-01-16 19:50:04 +03:00
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
2006-09-01 19:05:15 +04:00
 * of the GNU General Public License version 2.
2006-01-16 19:50:04 +03:00
 */
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/buffer_head.h>
#include <linux/delay.h>
#include <linux/sort.h>
#include <linux/jhash.h>
2006-03-29 23:36:49 +04:00
#include <linux/kallsyms.h>
2006-02-28 01:23:27 +03:00
#include <linux/gfs2_ondisk.h>
2006-09-12 05:40:30 +04:00
#include <linux/list.h>
2007-01-17 18:33:23 +03:00
#include <linux/wait.h>
2007-03-06 10:10:39 +03:00
#include <linux/module.h>
2007-01-29 14:51:45 +03:00
#include <linux/rwsem.h>
2006-01-16 19:50:04 +03:00
#include <asm/uaccess.h>
2007-03-16 13:26:37 +03:00
#include <linux/seq_file.h>
#include <linux/debugfs.h>
2007-08-01 16:57:10 +04:00
#include <linux/kthread.h>
#include <linux/freezer.h>
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time of a tenth of a second. This ensures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acquire a lock on it, but it has already been locked at a higher level.
This patch makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, except that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
#include <linux/workqueue.h>
#include <linux/jiffies.h>
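
/*
 * Illustrative sketch (added commentary, not part of the original source):
 * the minimum hold time described in the patch note above turns into a
 * delay for the delayed work queue.  The helper below is hypothetical;
 * glock_work_func() and gfs2_glock_cb() later in this file compute the
 * same delay from gl->gl_tchange and gl->gl_ops->go_min_hold_time.
 */
static inline unsigned long glock_min_hold_delay(unsigned long tchange,
                                                 unsigned long min_hold_time)
{
        unsigned long holdtime = tchange + min_hold_time;

        /* Demote immediately once the minimum hold time has elapsed */
        return time_before(jiffies, holdtime) ? holdtime - jiffies : 0;
}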
2006-01-16 19:50:04 +03:00
# include "gfs2.h"
2006-02-28 01:23:27 +03:00
# include "incore.h"
2006-01-16 19:50:04 +03:00
# include "glock.h"
# include "glops.h"
# include "inode.h"
# include "lops.h"
# include "meta_io.h"
# include "quota.h"
# include "super.h"
2006-02-28 01:23:27 +03:00
# include "util.h"
2008-11-18 16:38:48 +03:00
# include "bmap.h"
2006-01-16 19:50:04 +03:00
2006-09-08 21:35:56 +04:00
struct gfs2_gl_hash_bucket {
2006-09-12 18:10:01 +04:00
        struct hlist_head hb_list;
2006-09-08 21:35:56 +04:00
};
2008-05-21 20:03:22 +04:00

struct gfs2_glock_iter {
        int hash;                       /* hash bucket index        */
        struct gfs2_sbd *sdp;           /* incore superblock        */
        struct gfs2_glock *gl;          /* current glock struct     */
        char string[512];               /* scratch space            */
2007-03-16 13:26:37 +03:00
};
2006-01-16 19:50:04 +03:00

typedef void (*glock_examiner) (struct gfs2_glock * gl);
2006-04-28 18:59:12 +04:00

static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
2008-05-21 20:03:22 +04:00
static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
2007-08-23 22:19:05 +04:00
2007-01-29 14:51:45 +03:00
static DECLARE_RWSEM(gfs2_umount_flush_sem);
2007-03-16 13:26:37 +03:00
static struct dentry *gfs2_root;
2007-08-23 22:19:05 +04:00
static struct workqueue_struct *glock_workqueue;
2008-11-20 16:39:47 +03:00
static LIST_HEAD(lru_list);
static atomic_t lru_count = ATOMIC_INIT(0);
2008-12-25 17:35:27 +03:00
static DEFINE_SPINLOCK(lru_lock);
2006-04-28 18:59:12 +04:00
2006-09-12 18:10:01 +04:00

#define GFS2_GL_HASH_SHIFT      15
2006-09-10 00:59:11 +04:00
#define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
#define GFS2_GL_HASH_MASK       (GFS2_GL_HASH_SIZE - 1)
2006-09-07 22:40:21 +04:00

static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
2006-09-10 00:59:11 +04:00
/*
 * Despite what you might think, the numbers below are not arbitrary :-)
 * They are taken from the ipv4 routing hash code, which is well tested
 * and thus should be nearly optimal. Later on we might tweak the numbers
 * but for now this should be fine.
 *
 * The reason for putting the locks in a separate array from the list heads
 * is that we can have fewer locks than list heads and save memory. We use
 * the same hash function for both, but with a different hash mask.
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)

#ifdef CONFIG_LOCKDEP
# define GL_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define GL_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define GL_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define GL_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define GL_HASH_LOCK_SZ       512
# else
#  define GL_HASH_LOCK_SZ       256
# endif
#endif

/* We never want more locks than chains */
#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
# undef GL_HASH_LOCK_SZ
# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
#endif

static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];

static inline rwlock_t *gl_lock_addr(unsigned int x)
{
2006-09-10 02:59:27 +04:00
        return &gl_hash_locks[x & (GL_HASH_LOCK_SZ - 1)];
2006-09-10 00:59:11 +04:00
}
#else /* not SMP, so no spinlocks required */
2006-11-29 09:29:19 +03:00
static inline rwlock_t *gl_lock_addr(unsigned int x)
2006-09-10 00:59:11 +04:00
{
        return NULL;
}
#endif
2006-09-07 22:40:21 +04:00
2006-01-16 19:50:04 +03:00
/**
 * gl_hash() - Turn glock number into hash bucket number
 * @lock: The glock number
 *
 * Returns: The number of the corresponding hash bucket
 */
2006-09-07 21:12:27 +04:00

static unsigned int gl_hash(const struct gfs2_sbd *sdp,
                            const struct lm_lockname *name)
2006-01-16 19:50:04 +03:00
{
        unsigned int h;
2006-09-04 20:49:07 +04:00

        h = jhash(&name->ln_number, sizeof(u64), 0);
2006-01-16 19:50:04 +03:00
        h = jhash(&name->ln_type, sizeof(unsigned int), h);
2006-09-07 21:12:27 +04:00
        h = jhash(&sdp, sizeof(struct gfs2_sbd *), h);
2006-01-16 19:50:04 +03:00
        h &= GFS2_GL_HASH_MASK;

        return h;
}
/**
 * glock_free() - Perform a few checks and then release struct gfs2_glock
 * @gl: The glock to release
 *
 * Also calls lock module to release its internal structure for this glock.
 *
 */

static void glock_free(struct gfs2_glock *gl)
{
        struct gfs2_sbd *sdp = gl->gl_sbd;
        struct inode *aspace = gl->gl_aspace;

        if (aspace)
                gfs2_aspace_put(aspace);
2009-01-12 13:43:39 +03:00
        sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl);
2006-01-16 19:50:04 +03:00
}
/**
 * gfs2_glock_hold() - increment reference count on glock
 * @gl: The glock to hold
 *
 */

2008-01-29 01:11:34 +03:00
static void gfs2_glock_hold(struct gfs2_glock *gl)
2006-01-16 19:50:04 +03:00
{
2009-02-05 13:12:38 +03:00
        GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
2006-09-13 18:43:37 +04:00
        atomic_inc(&gl->gl_ref);
2006-01-16 19:50:04 +03:00
}
2008-11-20 16:39:47 +03:00
/**
 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
 * @gl: the glock
 *
 */

static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
{
        spin_lock(&lru_lock);
        if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) {
                list_add_tail(&gl->gl_lru, &lru_list);
                atomic_inc(&lru_count);
        }
        spin_unlock(&lru_lock);
}
2006-01-16 19:50:04 +03:00
/**
 * gfs2_glock_put() - Decrement reference count on glock
 * @gl: The glock to put
 *
 */

int gfs2_glock_put(struct gfs2_glock *gl)
{
        int rv = 0;

2006-09-10 00:59:11 +04:00
        write_lock(gl_lock_addr(gl->gl_hash));
2006-09-13 18:43:37 +04:00
        if (atomic_dec_and_test(&gl->gl_ref)) {
2006-09-12 18:10:01 +04:00
                hlist_del(&gl->gl_list);
2006-09-10 00:59:11 +04:00
                write_unlock(gl_lock_addr(gl->gl_hash));
2008-11-20 16:39:47 +03:00
                spin_lock(&lru_lock);
                if (!list_empty(&gl->gl_lru)) {
                        list_del_init(&gl->gl_lru);
                        atomic_dec(&lru_count);
                }
                spin_unlock(&lru_lock);
2008-05-21 20:03:22 +04:00
                GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
2006-01-16 19:50:04 +03:00
                glock_free(gl);
                rv = 1;
                goto out;
        }
2008-11-20 16:39:47 +03:00
        /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
        if (atomic_read(&gl->gl_ref) == 2)
                gfs2_glock_schedule_for_reclaim(gl);
2009-02-05 13:12:38 +03:00
        write_unlock(gl_lock_addr(gl->gl_hash));
2006-08-25 01:03:05 +04:00
out:
2006-01-16 19:50:04 +03:00
        return rv;
}
/**
 * search_bucket() - Find struct gfs2_glock by lock number
 * @bucket: the bucket to search
 * @name: The lock name
 *
 * Returns: NULL, or the struct gfs2_glock with the requested number
 */

2006-09-08 21:35:56 +04:00
static struct gfs2_glock *search_bucket(unsigned int hash,
2006-08-30 20:50:28 +04:00
                                        const struct gfs2_sbd *sdp,
2006-08-30 19:16:23 +04:00
                                        const struct lm_lockname *name)
2006-01-16 19:50:04 +03:00
{
        struct gfs2_glock *gl;
2006-09-12 18:10:01 +04:00
        struct hlist_node *h;
2006-01-16 19:50:04 +03:00

2006-09-12 18:10:01 +04:00
        hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) {
2006-01-16 19:50:04 +03:00
                if (!lm_name_equal(&gl->gl_name, name))
                        continue;
2006-08-30 20:50:28 +04:00
                if (gl->gl_sbd != sdp)
                        continue;
2006-01-16 19:50:04 +03:00
2006-09-13 18:43:37 +04:00
                atomic_inc(&gl->gl_ref);
2006-01-16 19:50:04 +03:00
                return gl;
        }

        return NULL;
}
2008-05-21 20:03:22 +04:00
/**
 * may_grant - check if it's ok to grant a new lock
 * @gl: The glock
 * @gh: The lock request which we wish to grant
 *
 * Returns: true if it's ok to grant the lock
 */

static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
{
        const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list);
        if ((gh->gh_state == LM_ST_EXCLUSIVE ||
             gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head)
                return 0;
        if (gl->gl_state == gh->gh_state)
                return 1;
        if (gh->gh_flags & GL_EXACT)
                return 0;
2008-07-07 13:07:28 +04:00
        if (gl->gl_state == LM_ST_EXCLUSIVE) {
                if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
                        return 1;
                if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
                        return 1;
        }
2008-05-21 20:03:22 +04:00
        if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
                return 1;
        return 0;
}

static void gfs2_holder_wake(struct gfs2_holder *gh)
{
        clear_bit(HIF_WAIT, &gh->gh_iflags);
        smp_mb__after_clear_bit();
        wake_up_bit(&gh->gh_iflags, HIF_WAIT);
}
/**
 * do_promote - promote as many requests as possible on the current queue
 * @gl: The glock
 *
2008-11-18 16:38:48 +03:00
 * Returns: 1 if there is a blocked holder at the head of the list, or 2
 *          if a type specific operation is underway.
2008-05-21 20:03:22 +04:00
 */

static int do_promote(struct gfs2_glock *gl)
2008-10-24 22:31:12 +04:00
__releases(&gl->gl_spin)
__acquires(&gl->gl_spin)
2008-05-21 20:03:22 +04:00
{
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        struct gfs2_holder *gh, *tmp;
        int ret;

restart:
        list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
                if (test_bit(HIF_HOLDER, &gh->gh_iflags))
                        continue;
                if (may_grant(gl, gh)) {
                        if (gh->gh_list.prev == &gl->gl_holders &&
                            glops->go_lock) {
                                spin_unlock(&gl->gl_spin);
                                /* FIXME: eliminate this eventually */
                                ret = glops->go_lock(gh);
                                spin_lock(&gl->gl_spin);
                                if (ret) {
2008-11-18 16:38:48 +03:00
                                        if (ret == 1)
                                                return 2;
2008-05-21 20:03:22 +04:00
                                        gh->gh_error = ret;
                                        list_del_init(&gh->gh_list);
                                        gfs2_holder_wake(gh);
                                        goto restart;
                                }
                                set_bit(HIF_HOLDER, &gh->gh_iflags);
                                gfs2_holder_wake(gh);
                                goto restart;
                        }
                        set_bit(HIF_HOLDER, &gh->gh_iflags);
                        gfs2_holder_wake(gh);
                        continue;
                }
                if (gh->gh_list.prev == &gl->gl_holders)
                        return 1;
                break;
        }
        return 0;
}
/**
 * do_error - Something unexpected has happened during a lock request
 *
 */

static inline void do_error(struct gfs2_glock *gl, const int ret)
{
        struct gfs2_holder *gh, *tmp;

        list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
                if (test_bit(HIF_HOLDER, &gh->gh_iflags))
                        continue;
                if (ret & LM_OUT_ERROR)
                        gh->gh_error = -EIO;
                else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
                        gh->gh_error = GLR_TRYFAILED;
                else
                        continue;
                list_del_init(&gh->gh_list);
                gfs2_holder_wake(gh);
        }
}

/**
 * find_first_waiter - find the first gh that's waiting for the glock
 * @gl: the glock
 */

static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl)
{
        struct gfs2_holder *gh;

        list_for_each_entry(gh, &gl->gl_holders, gh_list) {
                if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
                        return gh;
        }
        return NULL;
}
/**
 * state_change - record that the glock is now in a different state
 * @gl: the glock
 * @new_state: the new state
 *
 */

static void state_change(struct gfs2_glock *gl, unsigned int new_state)
{
        int held1, held2;

        held1 = (gl->gl_state != LM_ST_UNLOCKED);
        held2 = (new_state != LM_ST_UNLOCKED);

        if (held1 != held2) {
                if (held2)
                        gfs2_glock_hold(gl);
                else
                        gfs2_glock_put(gl);
        }

        gl->gl_state = new_state;
        gl->gl_tchange = jiffies;
}

static void gfs2_demote_wake(struct gfs2_glock *gl)
{
        gl->gl_demote_state = LM_ST_EXCLUSIVE;
        clear_bit(GLF_DEMOTE, &gl->gl_flags);
        smp_mb__after_clear_bit();
        wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
}
/**
 * finish_xmote - The DLM has replied to one of our lock requests
 * @gl: The glock
 * @ret: The status from the DLM
 *
 */

static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
{
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        struct gfs2_holder *gh;
        unsigned state = ret & LM_OUT_ST_MASK;
2008-11-18 16:38:48 +03:00
        int rv;
2008-05-21 20:03:22 +04:00

        spin_lock(&gl->gl_spin);
        state_change(gl, state);
        gh = find_first_waiter(gl);

        /* Demote to UN request arrived during demote to SH or DF */
        if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
            state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED)
                gl->gl_target = LM_ST_UNLOCKED;

        /* Check for state != intended state */
        if (unlikely(state != gl->gl_target)) {
                if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
                        /* move to back of queue and try next entry */
                        if (ret & LM_OUT_CANCELED) {
                                if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0)
                                        list_move_tail(&gh->gh_list, &gl->gl_holders);
                                gh = find_first_waiter(gl);
                                gl->gl_target = gh->gh_state;
                                goto retry;
                        }
                        /* Some error or failed "try lock" - report it */
                        if ((ret & LM_OUT_ERROR) ||
                            (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
                                gl->gl_target = gl->gl_state;
                                do_error(gl, ret);
                                goto out;
                        }
                }
                switch(state) {
                /* Unlocked due to conversion deadlock, try again */
                case LM_ST_UNLOCKED:
retry:
                        do_xmote(gl, gh, gl->gl_target);
                        break;
                /* Conversion fails, unlock and try again */
                case LM_ST_SHARED:
                case LM_ST_DEFERRED:
                        do_xmote(gl, gh, LM_ST_UNLOCKED);
                        break;
                default: /* Everything else */
                        printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state);
                        GLOCK_BUG_ON(gl, 1);
                }
                spin_unlock(&gl->gl_spin);
                gfs2_glock_put(gl);
                return;
        }

        /* Fast path - we got what we asked for */
        if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
                gfs2_demote_wake(gl);
        if (state != LM_ST_UNLOCKED) {
                if (glops->go_xmote_bh) {
                        spin_unlock(&gl->gl_spin);
                        rv = glops->go_xmote_bh(gl, gh);
                        if (rv == -EAGAIN)
                                return;
                        spin_lock(&gl->gl_spin);
                        if (rv) {
                                do_error(gl, rv);
                                goto out;
                        }
                }
2008-11-18 16:38:48 +03:00
                rv = do_promote(gl);
                if (rv == 2)
                        goto out_locked;
2008-05-21 20:03:22 +04:00
        }
out:
        clear_bit(GLF_LOCK, &gl->gl_flags);
2008-11-18 16:38:48 +03:00
out_locked:
2008-05-21 20:03:22 +04:00
        spin_unlock(&gl->gl_spin);
        gfs2_glock_put(gl);
}
static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
2009-01-12 13:43:39 +03:00
                                 unsigned int req_state,
2008-05-21 20:03:22 +04:00
                                 unsigned int flags)
{
        int ret = LM_OUT_ERROR;
2008-05-23 17:46:04 +04:00

        if (!sdp->sd_lockstruct.ls_ops->lm_lock)
                return req_state == LM_ST_UNLOCKED ? 0 : req_state;

2008-05-21 20:03:22 +04:00
        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
2009-01-12 13:43:39 +03:00
                ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
2008-05-21 20:03:22 +04:00
                                                         req_state, flags);
        return ret;
}
/**
 * do_xmote - Calls the DLM to change the state of a lock
 * @gl: The lock state
 * @gh: The holder (only for promotes)
 * @target: The target lock state
 *
 */

static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
2008-10-24 22:31:12 +04:00
__releases(&gl->gl_spin)
__acquires(&gl->gl_spin)
2008-05-21 20:03:22 +04:00
{
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        struct gfs2_sbd *sdp = gl->gl_sbd;
        unsigned int lck_flags = gh ? gh->gh_flags : 0;
        int ret;

        lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
                      LM_FLAG_PRIORITY);
        BUG_ON(gl->gl_state == target);
        BUG_ON(gl->gl_state == gl->gl_target);
        if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
            glops->go_inval) {
                set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
                do_error(gl, 0); /* Fail queued try locks */
        }
        spin_unlock(&gl->gl_spin);
        if (glops->go_xmote_th)
                glops->go_xmote_th(gl);
        if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
                glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
        clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);

        gfs2_glock_hold(gl);
        if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED ||
            gl->gl_state == LM_ST_DEFERRED) &&
            !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
                lck_flags |= LM_FLAG_TRY_1CB;
2009-01-12 13:43:39 +03:00
        ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
2008-05-21 20:03:22 +04:00

        if (!(ret & LM_OUT_ASYNC)) {
                finish_xmote(gl, ret);
                gfs2_glock_hold(gl);
                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                        gfs2_glock_put(gl);
        } else {
                GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
        }

        spin_lock(&gl->gl_spin);
}
/**
 * find_first_holder - find the first "holder" gh
 * @gl: the glock
 */

static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
{
        struct gfs2_holder *gh;

        if (!list_empty(&gl->gl_holders)) {
                gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
                if (test_bit(HIF_HOLDER, &gh->gh_iflags))
                        return gh;
        }
        return NULL;
}
/**
 * run_queue - do all outstanding tasks related to a glock
 * @gl: The glock in question
 * @nonblock: True if we must not block in run_queue
 *
 */

static void run_queue(struct gfs2_glock *gl, const int nonblock)
2008-10-24 22:31:12 +04:00
__releases(&gl->gl_spin)
__acquires(&gl->gl_spin)
2008-05-21 20:03:22 +04:00
{
        struct gfs2_holder *gh = NULL;
2008-11-18 16:38:48 +03:00
        int ret;
2008-05-21 20:03:22 +04:00

        if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
                return;

        GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
2009-02-05 13:12:38 +03:00
        down_read(&gfs2_umount_flush_sem);
2008-05-21 20:03:22 +04:00
        if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
            gl->gl_demote_state != gl->gl_state) {
                if (find_first_holder(gl))
2009-02-05 13:12:38 +03:00
                        goto out_unlock;
2008-05-21 20:03:22 +04:00
                if (nonblock)
                        goto out_sched;
                set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
2008-07-07 13:02:36 +04:00
                GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
2008-05-21 20:03:22 +04:00
                gl->gl_target = gl->gl_demote_state;
        } else {
                if (test_bit(GLF_DEMOTE, &gl->gl_flags))
                        gfs2_demote_wake(gl);
2008-11-18 16:38:48 +03:00
                ret = do_promote(gl);
                if (ret == 0)
2009-02-05 13:12:38 +03:00
                        goto out_unlock;
2008-11-18 16:38:48 +03:00
                if (ret == 2)
2009-02-05 13:12:38 +03:00
                        goto out_sem;
2008-05-21 20:03:22 +04:00
                gh = find_first_waiter(gl);
                gl->gl_target = gh->gh_state;
                if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
                        do_error(gl, 0); /* Fail queued try locks */
        }
        do_xmote(gl, gh, gl->gl_target);
2009-02-05 13:12:38 +03:00
out_sem:
        up_read(&gfs2_umount_flush_sem);
2008-05-21 20:03:22 +04:00
        return;

out_sched:
        gfs2_glock_hold(gl);
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                gfs2_glock_put(gl);
2009-02-05 13:12:38 +03:00
out_unlock:
2008-05-21 20:03:22 +04:00
        clear_bit(GLF_LOCK, &gl->gl_flags);
2009-02-05 13:12:38 +03:00
        goto out_sem;
2008-05-21 20:03:22 +04:00
}
2007-08-23 22:19:05 +04:00
static void glock_work_func(struct work_struct *work)
{
2008-05-21 20:03:22 +04:00
        unsigned long delay = 0;
2007-08-23 22:19:05 +04:00
        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);

2008-05-21 20:03:22 +04:00
        if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
                finish_xmote(gl, gl->gl_reply);
2007-08-23 22:19:05 +04:00
        spin_lock(&gl->gl_spin);
2008-07-07 13:02:36 +04:00
        if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
            gl->gl_state != LM_ST_UNLOCKED &&
            gl->gl_demote_state != LM_ST_EXCLUSIVE) {
2008-05-21 20:03:22 +04:00
                unsigned long holdtime, now = jiffies;
                holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
                if (time_before(now, holdtime))
                        delay = holdtime - now;
                set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags);
        }
        run_queue(gl, 0);
2007-08-23 22:19:05 +04:00
        spin_unlock(&gl->gl_spin);
2008-05-21 20:03:22 +04:00
        if (!delay ||
            queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
                gfs2_glock_put(gl);
2007-08-23 22:19:05 +04:00
}
2006-01-16 19:50:04 +03:00
/**
 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
 * @sdp: The GFS2 superblock
 * @number: the lock number
 * @glops: The glock_operations to use
 * @create: If 0, don't create the glock if it doesn't exist
 * @glp: the glock is returned here
 *
 * This does not lock a glock, just finds/creates structures for one.
 *
 * Returns: errno
 */
2006-09-04 20:49:07 +04:00
int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
2006-08-30 17:30:00 +04:00
                   const struct gfs2_glock_operations *glops, int create,
2006-01-16 19:50:04 +03:00
                   struct gfs2_glock **glp)
{
2006-09-08 21:35:56 +04:00
        struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
2006-01-16 19:50:04 +03:00
        struct gfs2_glock *gl, *tmp;
2006-09-08 21:35:56 +04:00
        unsigned int hash = gl_hash(sdp, &name);
2006-01-16 19:50:04 +03:00
        int error;

2006-09-10 00:59:11 +04:00
        read_lock(gl_lock_addr(hash));
2006-09-08 21:35:56 +04:00
        gl = search_bucket(hash, sdp, &name);
2006-09-10 00:59:11 +04:00
        read_unlock(gl_lock_addr(hash));
2006-01-16 19:50:04 +03:00

        if (gl || !create) {
                *glp = gl;
                return 0;
        }

        gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
        if (!gl)
                return -ENOMEM;

2006-08-30 18:36:52 +04:00
        gl->gl_flags = 0;
2006-01-16 19:50:04 +03:00
        gl->gl_name = name;
2006-09-13 18:43:37 +04:00
        atomic_set(&gl->gl_ref, 1);
2006-01-16 19:50:04 +03:00
        gl->gl_state = LM_ST_UNLOCKED;
2008-05-21 20:03:22 +04:00
        gl->gl_target = LM_ST_UNLOCKED;
2007-08-23 22:19:05 +04:00
        gl->gl_demote_state = LM_ST_EXCLUSIVE;
2006-09-08 21:35:56 +04:00
        gl->gl_hash = hash;
2006-01-16 19:50:04 +03:00
        gl->gl_ops = glops;
2009-01-12 13:43:39 +03:00
        snprintf(gl->gl_strname, GDLM_STRNAME_BYTES, "%8x%16llx", name.ln_type, (unsigned long long)number);
        memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
        gl->gl_lksb.sb_lvbptr = gl->gl_lvb;
2007-08-23 22:19:05 +04:00
        gl->gl_tchange = jiffies;
2006-08-30 18:36:52 +04:00
        gl->gl_object = NULL;
2006-01-16 19:50:04 +03:00
        gl->gl_sbd = sdp;
2006-08-30 18:36:52 +04:00
        gl->gl_aspace = NULL;
2007-08-23 22:19:05 +04:00
        INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
2006-01-16 19:50:04 +03:00

        /* If this glock protects actual on-disk data or metadata blocks,
           create a VFS inode to manage the pages/buffers holding them. */
2006-09-04 17:49:55 +04:00
        if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) {
2006-01-16 19:50:04 +03:00
                gl->gl_aspace = gfs2_aspace_get(sdp);
                if (!gl->gl_aspace) {
                        error = -ENOMEM;
                        goto fail;
                }
        }

2006-09-10 00:59:11 +04:00
        write_lock(gl_lock_addr(hash));
2006-09-08 21:35:56 +04:00
        tmp = search_bucket(hash, sdp, &name);
2006-01-16 19:50:04 +03:00
        if (tmp) {
2006-09-10 00:59:11 +04:00
                write_unlock(gl_lock_addr(hash));
2006-01-16 19:50:04 +03:00
                glock_free(gl);
                gl = tmp;
        } else {
2006-09-12 18:10:01 +04:00
                hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list);
2006-09-10 00:59:11 +04:00
                write_unlock(gl_lock_addr(hash));
2006-01-16 19:50:04 +03:00
        }

        *glp = gl;

        return 0;

2006-08-30 18:36:52 +04:00
fail:
2006-09-25 17:26:04 +04:00
        kmem_cache_free(gfs2_glock_cachep, gl);
2006-01-16 19:50:04 +03:00
        return error;
}
/**
* gfs2_holder_init - initialize a struct gfs2_holder in the default way
* @ gl : the glock
* @ state : the state we ' re requesting
* @ flags : the modifier flags
* @ gh : the holder structure
*
*/
2006-04-21 00:57:23 +04:00
void gfs2_holder_init ( struct gfs2_glock * gl , unsigned int state , unsigned flags ,
2006-01-16 19:50:04 +03:00
struct gfs2_holder * gh )
{
INIT_LIST_HEAD ( & gh - > gh_list ) ;
gh - > gh_gl = gl ;
2006-03-29 23:36:49 +04:00
gh - > gh_ip = ( unsigned long ) __builtin_return_address ( 0 ) ;
2008-02-07 11:13:19 +03:00
gh - > gh_owner_pid = get_pid ( task_pid ( current ) ) ;
2006-01-16 19:50:04 +03:00
gh - > gh_state = state ;
gh - > gh_flags = flags ;
gh - > gh_error = 0 ;
gh - > gh_iflags = 0 ;
gfs2_glock_hold ( gl ) ;
}
/**
* gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
* @ state : the state we ' re requesting
* @ flags : the modifier flags
* @ gh : the holder structure
*
* Don ' t mess with the glock .
*
*/
2006-04-21 00:57:23 +04:00
void gfs2_holder_reinit ( unsigned int state , unsigned flags , struct gfs2_holder * gh )
2006-01-16 19:50:04 +03:00
{
gh - > gh_state = state ;
2006-04-26 22:58:26 +04:00
gh - > gh_flags = flags ;
2007-03-16 12:40:31 +03:00
gh - > gh_iflags = 0 ;
2006-03-29 23:36:49 +04:00
gh - > gh_ip = ( unsigned long ) __builtin_return_address ( 0 ) ;
2006-01-16 19:50:04 +03:00
}
/**
* gfs2_holder_uninit - uninitialize a holder structure ( drop glock reference )
* @ gh : the holder structure
*
*/
void gfs2_holder_uninit ( struct gfs2_holder * gh )
{
2008-02-07 11:13:19 +03:00
put_pid ( gh - > gh_owner_pid ) ;
2006-01-16 19:50:04 +03:00
gfs2_glock_put ( gh - > gh_gl ) ;
gh - > gh_gl = NULL ;
2006-03-29 23:36:49 +04:00
gh - > gh_ip = 0 ;
2006-01-16 19:50:04 +03:00
}
2007-06-11 11:22:32 +04:00
static int just_schedule ( void * word )
2007-01-17 18:33:23 +03:00
{
schedule ( ) ;
return 0 ;
}
2008-05-21 20:03:22 +04:00
static void wait_on_holder ( struct gfs2_holder * gh )
2008-01-30 18:34:04 +03:00
{
2008-05-21 20:03:22 +04:00
might_sleep ( ) ;
wait_on_bit ( & gh - > gh_iflags , HIF_WAIT , just_schedule , TASK_UNINTERRUPTIBLE ) ;
2008-01-30 18:34:04 +03:00
}
2008-05-21 20:03:22 +04:00
static void wait_on_demote ( struct gfs2_glock * gl )
2006-01-16 19:50:04 +03:00
{
2008-05-21 20:03:22 +04:00
might_sleep ( ) ;
wait_on_bit ( & gl - > gl_flags , GLF_DEMOTE , just_schedule , TASK_UNINTERRUPTIBLE ) ;
2006-01-16 19:50:04 +03:00
}
/**
2008-05-21 20:03:22 +04:00
 * handle_callback - process a demote request
 * @gl: the glock
 * @state: the state the caller wants us to change to
2006-01-16 19:50:04 +03:00
 *
2008-05-21 20:03:22 +04:00
 * There are only two requests that we are going to see in actual
 * practice: LM_ST_SHARED and LM_ST_UNLOCKED
2006-01-16 19:50:04 +03:00
 */
2008-05-21 20:03:22 +04:00
static void handle_callback ( struct gfs2_glock * gl , unsigned int state ,
2008-11-20 16:39:47 +03:00
unsigned long delay )
2006-01-16 19:50:04 +03:00
{
2008-05-21 20:03:22 +04:00
int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE ;
2006-01-16 19:50:04 +03:00
2008-05-21 20:03:22 +04:00
set_bit ( bit , & gl - > gl_flags ) ;
if ( gl - > gl_demote_state = = LM_ST_EXCLUSIVE ) {
gl - > gl_demote_state = state ;
gl - > gl_demote_time = jiffies ;
} else if ( gl - > gl_demote_state ! = LM_ST_UNLOCKED & &
gl - > gl_demote_state ! = state ) {
gl - > gl_demote_state = LM_ST_UNLOCKED ;
2006-01-16 19:50:04 +03:00
}
}
/**
2008-05-21 20:03:22 +04:00
* gfs2_glock_wait - wait on a glock acquisition
2006-01-16 19:50:04 +03:00
* @ gh : the glock holder
*
* Returns : 0 on success
*/
2008-05-21 20:03:22 +04:00
int gfs2_glock_wait ( struct gfs2_holder * gh )
2006-01-16 19:50:04 +03:00
{
2007-01-17 18:33:23 +03:00
wait_on_holder ( gh ) ;
2006-01-16 19:50:04 +03:00
return gh - > gh_error ;
}
2008-05-21 20:03:22 +04:00
void gfs2_print_dbg ( struct seq_file * seq , const char * fmt , . . . )
2007-03-16 13:26:37 +03:00
{
va_list args ;
va_start ( args , fmt ) ;
2008-05-21 20:03:22 +04:00
if ( seq ) {
struct gfs2_glock_iter * gi = seq - > private ;
2007-03-16 13:26:37 +03:00
vsprintf ( gi - > string , fmt , args ) ;
2008-05-21 20:03:22 +04:00
seq_printf ( seq , gi - > string ) ;
} else {
printk ( KERN_ERR " " ) ;
2007-03-16 13:26:37 +03:00
vprintk ( fmt , args ) ;
2008-05-21 20:03:22 +04:00
}
2007-03-16 13:26:37 +03:00
va_end ( args ) ;
}
2006-01-16 19:50:04 +03:00
/**
* add_to_queue - Add a holder to the wait queue ( but look for recursion )
* @ gh : the holder structure to add
*
2008-05-21 20:03:22 +04:00
* Eventually we should move the recursive locking trap to a
* debugging option or something like that . This is the fast
* path and needs to have the minimum number of distractions .
*
2006-01-16 19:50:04 +03:00
*/
2008-05-21 20:03:22 +04:00
static inline void add_to_queue ( struct gfs2_holder * gh )
2008-10-24 22:31:12 +04:00
__releases ( & gl - > gl_spin )
__acquires ( & gl - > gl_spin )
2006-01-16 19:50:04 +03:00
{
struct gfs2_glock * gl = gh - > gh_gl ;
2008-05-21 20:03:22 +04:00
struct gfs2_sbd * sdp = gl - > gl_sbd ;
struct list_head * insert_pt = NULL ;
struct gfs2_holder * gh2 ;
int try_lock = 0 ;
2006-01-16 19:50:04 +03:00
2008-02-07 11:13:19 +03:00
BUG_ON ( gh - > gh_owner_pid = = NULL ) ;
2007-01-17 18:33:23 +03:00
if ( test_and_set_bit ( HIF_WAIT , & gh - > gh_iflags ) )
BUG ( ) ;
2006-04-21 00:57:23 +04:00
2008-05-21 20:03:22 +04:00
if ( gh - > gh_flags & ( LM_FLAG_TRY | LM_FLAG_TRY_1CB ) ) {
if ( test_bit ( GLF_LOCK , & gl - > gl_flags ) )
try_lock = 1 ;
if ( test_bit ( GLF_INVALIDATE_IN_PROGRESS , & gl - > gl_flags ) )
goto fail ;
}
list_for_each_entry ( gh2 , & gl - > gl_holders , gh_list ) {
if ( unlikely ( gh2 - > gh_owner_pid = = gh - > gh_owner_pid & &
( gh - > gh_gl - > gl_ops - > go_type ! = LM_TYPE_FLOCK ) ) )
goto trap_recursive ;
if ( try_lock & &
! ( gh2 - > gh_flags & ( LM_FLAG_TRY | LM_FLAG_TRY_1CB ) ) & &
! may_grant ( gl , gh ) ) {
fail :
gh - > gh_error = GLR_TRYFAILED ;
gfs2_holder_wake ( gh ) ;
return ;
2007-09-14 08:35:27 +04:00
}
2008-05-21 20:03:22 +04:00
if ( test_bit ( HIF_HOLDER , & gh2 - > gh_iflags ) )
continue ;
if ( unlikely ( ( gh - > gh_flags & LM_FLAG_PRIORITY ) & & ! insert_pt ) )
insert_pt = & gh2 - > gh_list ;
}
if ( likely ( insert_pt = = NULL ) ) {
list_add_tail ( & gh - > gh_list , & gl - > gl_holders ) ;
if ( unlikely ( gh - > gh_flags & LM_FLAG_PRIORITY ) )
goto do_cancel ;
return ;
}
list_add_tail ( & gh - > gh_list , insert_pt ) ;
do_cancel :
gh = list_entry ( gl - > gl_holders . next , struct gfs2_holder , gh_list ) ;
if ( ! ( gh - > gh_flags & LM_FLAG_PRIORITY ) ) {
spin_unlock ( & gl - > gl_spin ) ;
2008-05-23 17:46:04 +04:00
if ( sdp - > sd_lockstruct . ls_ops - > lm_cancel )
2009-01-12 13:43:39 +03:00
sdp - > sd_lockstruct . ls_ops - > lm_cancel ( gl ) ;
2008-05-21 20:03:22 +04:00
spin_lock ( & gl - > gl_spin ) ;
2006-01-16 19:50:04 +03:00
}
2008-05-21 20:03:22 +04:00
return ;
2006-01-16 19:50:04 +03:00
2008-05-21 20:03:22 +04:00
trap_recursive:
        print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip);
        printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
        printk(KERN_ERR "lock type: %d req lock state : %d\n",
               gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
        print_symbol(KERN_ERR "new: %s\n", gh->gh_ip);
        printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
        printk(KERN_ERR "lock type: %d req lock state : %d\n",
               gh->gh_gl->gl_name.ln_type, gh->gh_state);
        __dump_glock(NULL, gl);
        BUG();
2006-01-16 19:50:04 +03:00
}
/**
* gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock ( acquire a glock )
* @ gh : the holder structure
*
* if ( gh - > gh_flags & GL_ASYNC ) , this never returns an error
*
* Returns : 0 , GLR_TRYFAILED , or errno on failure
*/
int gfs2_glock_nq ( struct gfs2_holder * gh )
{
struct gfs2_glock * gl = gh - > gh_gl ;
struct gfs2_sbd * sdp = gl - > gl_sbd ;
int error = 0 ;
2008-05-21 20:03:22 +04:00
if ( unlikely ( test_bit ( SDF_SHUTDOWN , & sdp - > sd_flags ) ) )
2006-01-16 19:50:04 +03:00
return - EIO ;
spin_lock ( & gl - > gl_spin ) ;
add_to_queue ( gh ) ;
2008-05-21 20:03:22 +04:00
run_queue ( gl , 1 ) ;
2006-01-16 19:50:04 +03:00
spin_unlock ( & gl - > gl_spin ) ;
2008-05-21 20:03:22 +04:00
if ( ! ( gh - > gh_flags & GL_ASYNC ) )
error = gfs2_glock_wait ( gh ) ;
2006-01-16 19:50:04 +03:00
return error ;
}
/**
* gfs2_glock_poll - poll to see if an async request has been completed
* @ gh : the holder
*
* Returns : 1 if the request is ready to be gfs2_glock_wait ( ) ed on
*/
int gfs2_glock_poll ( struct gfs2_holder * gh )
{
2008-05-21 20:03:22 +04:00
return test_bit ( HIF_WAIT , & gh - > gh_iflags ) ? 0 : 1 ;
2006-01-16 19:50:04 +03:00
}
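
/*
 * Illustrative sketch (added, not part of the original source): one hedged
 * way a caller might use GL_ASYNC with the functions above.  The helper
 * name is hypothetical and error handling is minimal; the holder is
 * expected to be released later with gfs2_glock_dq_uninit(), declared in
 * "glock.h".
 */
static int example_async_acquire(struct gfs2_glock *gl, struct gfs2_holder *gh)
{
        gfs2_holder_init(gl, LM_ST_SHARED, GL_ASYNC, gh);
        gfs2_glock_nq(gh);      /* with GL_ASYNC this never returns an error */

        /* ... other work can be done here, polling with gfs2_glock_poll(gh) ... */

        return gfs2_glock_wait(gh);     /* wait for and return the final result */
}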
/**
* gfs2_glock_dq - dequeue a struct gfs2_holder from a glock ( release a glock )
* @ gh : the glock holder
*
*/
void gfs2_glock_dq ( struct gfs2_holder * gh )
{
struct gfs2_glock * gl = gh - > gh_gl ;
2006-08-30 17:30:00 +04:00
const struct gfs2_glock_operations * glops = gl - > gl_ops ;
2007-08-23 22:19:05 +04:00
unsigned delay = 0 ;
2008-05-21 20:03:22 +04:00
int fast_path = 0 ;
2006-01-16 19:50:04 +03:00
2008-05-21 20:03:22 +04:00
spin_lock ( & gl - > gl_spin ) ;
2006-01-16 19:50:04 +03:00
if ( gh - > gh_flags & GL_NOCACHE )
2008-11-20 16:39:47 +03:00
handle_callback ( gl , LM_ST_UNLOCKED , 0 ) ;
2006-01-16 19:50:04 +03:00
list_del_init ( & gh - > gh_list ) ;
2008-05-21 20:03:22 +04:00
if ( find_first_holder ( gl ) = = NULL ) {
2007-11-02 11:39:34 +03:00
if ( glops - > go_unlock ) {
2008-05-21 20:03:22 +04:00
GLOCK_BUG_ON ( gl , test_and_set_bit ( GLF_LOCK , & gl - > gl_flags ) ) ;
2007-11-02 11:39:34 +03:00
spin_unlock ( & gl - > gl_spin ) ;
2006-01-16 19:50:04 +03:00
glops - > go_unlock ( gh ) ;
2007-11-02 11:39:34 +03:00
spin_lock ( & gl - > gl_spin ) ;
2008-05-21 20:03:22 +04:00
clear_bit ( GLF_LOCK , & gl - > gl_flags ) ;
2007-11-02 11:39:34 +03:00
}
2008-05-21 20:03:22 +04:00
if ( list_empty ( & gl - > gl_holders ) & &
! test_bit ( GLF_PENDING_DEMOTE , & gl - > gl_flags ) & &
! test_bit ( GLF_DEMOTE , & gl - > gl_flags ) )
fast_path = 1 ;
2006-01-16 19:50:04 +03:00
}
spin_unlock ( & gl - > gl_spin ) ;
2008-05-21 20:03:22 +04:00
if ( likely ( fast_path ) )
return ;
2007-08-23 22:19:05 +04:00
gfs2_glock_hold ( gl ) ;
if ( test_bit ( GLF_PENDING_DEMOTE , & gl - > gl_flags ) & &
! test_bit ( GLF_DEMOTE , & gl - > gl_flags ) )
delay = gl - > gl_ops - > go_min_hold_time ;
if ( queue_delayed_work ( glock_workqueue , & gl - > gl_work , delay ) = = 0 )
gfs2_glock_put ( gl ) ;
2006-01-16 19:50:04 +03:00
}
2007-06-11 11:22:32 +04:00
void gfs2_glock_dq_wait ( struct gfs2_holder * gh )
{
struct gfs2_glock * gl = gh - > gh_gl ;
gfs2_glock_dq ( gh ) ;
wait_on_demote ( gl ) ;
}
2006-01-16 19:50:04 +03:00
/**
* gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it
* @ gh : the holder structure
*
*/
void gfs2_glock_dq_uninit ( struct gfs2_holder * gh )
{
gfs2_glock_dq ( gh ) ;
gfs2_holder_uninit ( gh ) ;
}
/**
* gfs2_glock_nq_num - acquire a glock based on lock number
* @ sdp : the filesystem
* @ number : the lock number
* @ glops : the glock operations for the type of glock
* @ state : the state to acquire the glock in
* @ flags : modifier flags for the acquisition
* @ gh : the struct gfs2_holder
*
* Returns : errno
*/
2006-09-04 20:49:07 +04:00
int gfs2_glock_nq_num ( struct gfs2_sbd * sdp , u64 number ,
2006-08-30 17:30:00 +04:00
const struct gfs2_glock_operations * glops ,
unsigned int state , int flags , struct gfs2_holder * gh )
2006-01-16 19:50:04 +03:00
{
struct gfs2_glock * gl ;
int error ;
error = gfs2_glock_get ( sdp , number , glops , CREATE , & gl ) ;
if ( ! error ) {
error = gfs2_glock_nq_init ( gl , state , flags , gh ) ;
gfs2_glock_put ( gl ) ;
}
return error ;
}
/**
* glock_compare - Compare two struct gfs2_glock structures for sorting
* @ arg_a : the first structure
* @ arg_b : the second structure
*
*/
static int glock_compare ( const void * arg_a , const void * arg_b )
{
2006-09-10 01:07:05 +04:00
const struct gfs2_holder * gh_a = * ( const struct gfs2_holder * * ) arg_a ;
const struct gfs2_holder * gh_b = * ( const struct gfs2_holder * * ) arg_b ;
const struct lm_lockname * a = & gh_a - > gh_gl - > gl_name ;
const struct lm_lockname * b = & gh_b - > gh_gl - > gl_name ;
2006-01-16 19:50:04 +03:00
if ( a - > ln_number > b - > ln_number )
2006-09-10 01:07:05 +04:00
return 1 ;
if ( a - > ln_number < b - > ln_number )
return - 1 ;
2007-01-22 20:10:39 +03:00
BUG_ON ( gh_a - > gh_gl - > gl_ops - > go_type = = gh_b - > gh_gl - > gl_ops - > go_type ) ;
2006-09-10 01:07:05 +04:00
return 0 ;
2006-01-16 19:50:04 +03:00
}
/**
* nq_m_sync - synchronously acquire more than one glock in deadlock-free order
* @ num_gh : the number of structures
* @ ghs : an array of struct gfs2_holder structures
*
* Returns : 0 on success ( all glocks acquired ) ,
* errno on failure ( no glocks acquired )
*/
static int nq_m_sync ( unsigned int num_gh , struct gfs2_holder * ghs ,
struct gfs2_holder * * p )
{
unsigned int x ;
int error = 0 ;
for ( x = 0 ; x < num_gh ; x + + )
p [ x ] = & ghs [ x ] ;
sort ( p , num_gh , sizeof ( struct gfs2_holder * ) , glock_compare , NULL ) ;
for ( x = 0 ; x < num_gh ; x + + ) {
p [ x ] - > gh_flags & = ~ ( LM_FLAG_TRY | GL_ASYNC ) ;
error = gfs2_glock_nq ( p [ x ] ) ;
if ( error ) {
while ( x - - )
gfs2_glock_dq ( p [ x ] ) ;
break ;
}
}
return error ;
}
/**
* gfs2_glock_nq_m - acquire multiple glocks
* @ num_gh : the number of structures
* @ ghs : an array of struct gfs2_holder structures
*
*
* Returns : 0 on success ( all glocks acquired ) ,
* errno on failure ( no glocks acquired )
*/
int gfs2_glock_nq_m ( unsigned int num_gh , struct gfs2_holder * ghs )
{
2007-06-19 18:38:17 +04:00
struct gfs2_holder * tmp [ 4 ] ;
struct gfs2_holder * * pph = tmp ;
2006-01-16 19:50:04 +03:00
int error = 0 ;
2007-06-19 18:38:17 +04:00
switch ( num_gh ) {
case 0 :
2006-01-16 19:50:04 +03:00
return 0 ;
2007-06-19 18:38:17 +04:00
case 1 :
2006-01-16 19:50:04 +03:00
ghs - > gh_flags & = ~ ( LM_FLAG_TRY | GL_ASYNC ) ;
return gfs2_glock_nq ( ghs ) ;
2007-06-19 18:38:17 +04:00
default :
if ( num_gh < = 4 )
2006-01-16 19:50:04 +03:00
break ;
2007-06-19 18:38:17 +04:00
pph = kmalloc ( num_gh * sizeof ( struct gfs2_holder * ) , GFP_NOFS ) ;
if ( ! pph )
return - ENOMEM ;
2006-01-16 19:50:04 +03:00
}
2007-06-19 18:38:17 +04:00
error = nq_m_sync ( num_gh , ghs , pph ) ;
2006-01-16 19:50:04 +03:00
2007-06-19 18:38:17 +04:00
if ( pph ! = tmp )
kfree ( pph ) ;
2006-01-16 19:50:04 +03:00
return error ;
}
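
/*
 * Illustrative sketch (added, not part of the original source): acquiring two
 * glocks in deadlock-free order through gfs2_glock_nq_m().  The helper name
 * is hypothetical; on success the caller releases the holders later with
 * gfs2_glock_dq_m() or gfs2_glock_dq_uninit_m().
 */
static int example_nq_pair(struct gfs2_glock *a, struct gfs2_glock *b,
                           struct gfs2_holder ghs[2])
{
        int error;

        gfs2_holder_init(a, LM_ST_EXCLUSIVE, 0, &ghs[0]);
        gfs2_holder_init(b, LM_ST_EXCLUSIVE, 0, &ghs[1]);

        /* gfs2_glock_nq_m() sorts the requests to avoid lock-ordering deadlocks */
        error = gfs2_glock_nq_m(2, ghs);
        if (error) {
                gfs2_holder_uninit(&ghs[0]);
                gfs2_holder_uninit(&ghs[1]);
        }
        return error;
}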
/**
* gfs2_glock_dq_m - release multiple glocks
* @ num_gh : the number of structures
* @ ghs : an array of struct gfs2_holder structures
*
*/
void gfs2_glock_dq_m ( unsigned int num_gh , struct gfs2_holder * ghs )
{
unsigned int x ;
for ( x = 0 ; x < num_gh ; x + + )
gfs2_glock_dq ( & ghs [ x ] ) ;
}
/**
* gfs2_glock_dq_uninit_m - release multiple glocks
* @ num_gh : the number of structures
* @ ghs : an array of struct gfs2_holder structures
*
*/
void gfs2_glock_dq_uninit_m ( unsigned int num_gh , struct gfs2_holder * ghs )
{
unsigned int x ;
for ( x = 0 ; x < num_gh ; x + + )
gfs2_glock_dq_uninit ( & ghs [ x ] ) ;
}
2009-01-12 13:43:39 +03:00
void gfs2_glock_cb ( struct gfs2_glock * gl , unsigned int state )
2008-01-30 18:34:04 +03:00
{
2007-08-23 22:19:05 +04:00
	unsigned long delay = 0;
	unsigned long holdtime;
	unsigned long now = jiffies;

	gfs2_glock_hold(gl);
	/*
	 * Keep the glock for at least the minimum hold time after its last
	 * state change, so that under heavy contention a node still gets
	 * enough time to do useful work before handing the lock over.
	 */
	holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
	if (time_before(now, holdtime))
		delay = holdtime - now;
	if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
		delay = gl->gl_ops->go_min_hold_time;

	spin_lock(&gl->gl_spin);
	handle_callback(gl, state, delay);
	spin_unlock(&gl->gl_spin);
	if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
		gfs2_glock_put(gl);
}

/**
 * gfs2_glock_complete - Callback used by the locking module when a dlm reply arrives
 * @gl: Pointer to the glock
 * @ret: The return value from the dlm
 *
 */

void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
{
	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;

	gl->gl_reply = ret;

	if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
		struct gfs2_holder *gh;
		spin_lock(&gl->gl_spin);
		gh = find_first_waiter(gl);
		if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) &&
		     (gl->gl_target != LM_ST_UNLOCKED)) ||
		    ((ret & ~LM_OUT_ST_MASK) != 0))
			set_bit(GLF_FROZEN, &gl->gl_flags);
		spin_unlock(&gl->gl_spin);
		if (test_bit(GLF_FROZEN, &gl->gl_flags))
			return;
	}
	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
	gfs2_glock_hold(gl);
	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
		gfs2_glock_put(gl);
}

/**
 * demote_ok - Check to see if it's ok to unlock a glock
 * @gl: the glock
 *
 * Returns: 1 if it's ok
 */

static int demote_ok(const struct gfs2_glock *gl)
{
	const struct gfs2_glock_operations *glops = gl->gl_ops;

	if (gl->gl_state == LM_ST_UNLOCKED)
		return 0;
	if (!list_empty(&gl->gl_holders))
		return 0;
	if (glops->go_demote_ok)
		return glops->go_demote_ok(gl);
	return 1;
}
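
/*
 * Note on the shrinker callback below: with the shrinker interface used
 * here, ->shrink() is called with nr == 0 purely to query the cache size,
 * while a non-zero nr asks us to try to demote that many glocks from the
 * LRU list.  Either way the return value is an estimate of the remaining
 * freeable entries, scaled by sysctl_vfs_cache_pressure, or -1 when nothing
 * can be done in the current allocation context (no __GFP_FS).
 */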

static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
{
	struct gfs2_glock *gl;
	int may_demote;
	int nr_skipped = 0;
	int got_ref = 0;
	LIST_HEAD(skipped);

	if (nr == 0)
		goto out;

	if (!(gfp_mask & __GFP_FS))
		return -1;

	spin_lock(&lru_lock);
	while (nr && !list_empty(&lru_list)) {
		gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
		list_del_init(&gl->gl_lru);
		atomic_dec(&lru_count);

		/* Test for being demotable */
		if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
			gfs2_glock_hold(gl);
			got_ref = 1;
			spin_unlock(&lru_lock);
			spin_lock(&gl->gl_spin);
			may_demote = demote_ok(gl);
			spin_unlock(&gl->gl_spin);
			clear_bit(GLF_LOCK, &gl->gl_flags);
			if (may_demote) {
				handle_callback(gl, LM_ST_UNLOCKED, 0);
				nr--;
				if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
					gfs2_glock_put(gl);
			}
			spin_lock(&lru_lock);
			if (may_demote)
				continue;
		}
		if (list_empty(&gl->gl_lru) &&
		    (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
			nr_skipped++;
			list_add(&gl->gl_lru, &skipped);
		}
		if (got_ref) {
			spin_unlock(&lru_lock);
			gfs2_glock_put(gl);
			spin_lock(&lru_lock);
			got_ref = 0;
		}
	}
	list_splice(&skipped, &lru_list);
	atomic_add(nr_skipped, &lru_count);
	spin_unlock(&lru_lock);
out:
	return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
}

static struct shrinker glock_shrinker = {
	.shrink = gfs2_shrink_glock_memory,
	.seeks = DEFAULT_SEEKS,
};

/**
 * examine_bucket - Call a function for each glock in a hash bucket
 * @examiner: the function
 * @sdp: the filesystem
 * @hash: the hash bucket index
 *
 * Returns: 1 if the bucket has entries
 */

static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
			  unsigned int hash)
{
	struct gfs2_glock *gl, *prev = NULL;
	int has_entries = 0;
	struct hlist_head *head = &gl_hash_table[hash].hb_list;

	read_lock(gl_lock_addr(hash));
	/* Can't use hlist_for_each_entry - don't want prefetch here */
	if (hlist_empty(head))
		goto out;
	gl = list_entry(head->first, struct gfs2_glock, gl_list);
	while (1) {
		if (!sdp || gl->gl_sbd == sdp) {
			gfs2_glock_hold(gl);
			read_unlock(gl_lock_addr(hash));
			if (prev)
				gfs2_glock_put(prev);
			prev = gl;
			examiner(gl);
			has_entries = 1;
			read_lock(gl_lock_addr(hash));
		}
		if (gl->gl_list.next == NULL)
			break;
		gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
	}
out:
	read_unlock(gl_lock_addr(hash));
	if (prev)
		gfs2_glock_put(prev);
	cond_resched();
	return has_entries;
}

/**
 * thaw_glock - thaw out a glock which has an unprocessed reply waiting
 * @gl: The glock to thaw
 *
 * N.B. When we freeze a glock, we leave a ref to the glock outstanding,
 * so this has to result in the ref count being dropped by one.
 */

static void thaw_glock(struct gfs2_glock *gl)
{
	if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
		return;
	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
	gfs2_glock_hold(gl);
	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
		gfs2_glock_put(gl);
}

/**
 * clear_glock - look at a glock and see if we can free it from glock cache
 * @gl: the glock to look at
 *
 */

static void clear_glock(struct gfs2_glock *gl)
{
	spin_lock(&lru_lock);
	if (!list_empty(&gl->gl_lru)) {
		list_del_init(&gl->gl_lru);
		atomic_dec(&lru_count);
	}
	spin_unlock(&lru_lock);

	spin_lock(&gl->gl_spin);
	if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
		handle_callback(gl, LM_ST_UNLOCKED, 0);
	spin_unlock(&gl->gl_spin);
	gfs2_glock_hold(gl);
	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
		gfs2_glock_put(gl);
}

/**
 * gfs2_glock_thaw - Thaw any frozen glocks
 * @sdp: The super block
 *
 */

void gfs2_glock_thaw(struct gfs2_sbd *sdp)
{
	unsigned x;

	for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
		examine_bucket(thaw_glock, sdp, x);
}

/**
 * gfs2_gl_hash_clear - Empty out the glock hash table
 * @sdp: the filesystem
 *
 * Called when unmounting the filesystem.
 */

void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
{
	unsigned long t;
	unsigned int x;
	int cont;

	t = jiffies;

	for (;;) {
		cont = 0;

		for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
			if (examine_bucket(clear_glock, sdp, x))
				cont = 1;
		}

		if (!cont)
			break;

		if (time_after_eq(jiffies,
				  t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
			fs_warn(sdp, "Unmount seems to be stalled. "
				     "Dumping lock state...\n");
			gfs2_dump_lockstate(sdp);
			t = jiffies;
		}

		down_write(&gfs2_umount_flush_sem);
		invalidate_inodes(sdp->sd_vfs);
		up_write(&gfs2_umount_flush_sem);
		msleep(10);
	}
}

void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
{
	struct gfs2_glock *gl = ip->i_gl;
	int ret;

	ret = gfs2_truncatei_resume(ip);
	gfs2_assert_withdraw(gl->gl_sbd, ret == 0);

	spin_lock(&gl->gl_spin);
	clear_bit(GLF_LOCK, &gl->gl_flags);
	run_queue(gl, 1);
	spin_unlock(&gl->gl_spin);
}

static const char *state2str(unsigned state)
{
	switch (state) {
	case LM_ST_UNLOCKED:
		return "UN";
	case LM_ST_SHARED:
		return "SH";
	case LM_ST_DEFERRED:
		return "DF";
	case LM_ST_EXCLUSIVE:
		return "EX";
	}
	return "??";
}

static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
{
	char *p = buf;
	if (flags & LM_FLAG_TRY)
		*p++ = 't';
	if (flags & LM_FLAG_TRY_1CB)
		*p++ = 'T';
	if (flags & LM_FLAG_NOEXP)
		*p++ = 'e';
	if (flags & LM_FLAG_ANY)
		*p++ = 'A';
	if (flags & LM_FLAG_PRIORITY)
		*p++ = 'p';
	if (flags & GL_ASYNC)
		*p++ = 'a';
	if (flags & GL_EXACT)
		*p++ = 'E';
	if (flags & GL_NOCACHE)
		*p++ = 'c';
	if (test_bit(HIF_HOLDER, &iflags))
		*p++ = 'H';
	if (test_bit(HIF_WAIT, &iflags))
		*p++ = 'W';
	if (test_bit(HIF_FIRST, &iflags))
		*p++ = 'F';
	*p = 0;
	return buf;
}

/**
 * dump_holder - print information about a glock holder
 * @seq: the seq_file struct
 * @gh: the glock holder
 *
 * Returns: 0 on success, -ENOBUFS when we run out of space
 */

static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
{
	struct task_struct *gh_owner = NULL;
	char buffer[KSYM_SYMBOL_LEN];
	char flags_buf[32];

	sprint_symbol(buffer, gh->gh_ip);
	if (gh->gh_owner_pid)
		gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
	gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
		       state2str(gh->gh_state),
		       hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
		       gh->gh_error,
		       gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
		       gh_owner ? gh_owner->comm : "(ended)", buffer);
	return 0;
}

static const char *gflags2str(char *buf, const unsigned long *gflags)
{
	char *p = buf;
	if (test_bit(GLF_LOCK, gflags))
		*p++ = 'l';
	if (test_bit(GLF_DEMOTE, gflags))
		*p++ = 'D';
	if (test_bit(GLF_PENDING_DEMOTE, gflags))
		*p++ = 'd';
	if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags))
		*p++ = 'p';
	if (test_bit(GLF_DIRTY, gflags))
		*p++ = 'y';
	if (test_bit(GLF_LFLUSH, gflags))
		*p++ = 'f';
	if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
		*p++ = 'i';
	if (test_bit(GLF_REPLY_PENDING, gflags))
		*p++ = 'r';
	if (test_bit(GLF_INITIAL, gflags))
		*p++ = 'I';
	if (test_bit(GLF_FROZEN, gflags))
		*p++ = 'F';
	*p = 0;
	return buf;
}

/**
 * __dump_glock - print information about a glock
 * @seq: The seq_file struct
 * @gl: the glock
 *
 * The file format is as follows:
 * One line per object, capital letters are used to indicate objects
 * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented,
 * other objects are indented by a single space and follow the glock to
 * which they are related. Fields are indicated by lower case letters
 * followed by a colon and the field value, except for strings which are in
 * [] so that it's possible to see if they are composed of spaces for
 * example. The fields are: n = number (id of the object), f = flags,
 * t = type, s = state, r = refcount, e = error, p = pid.
 *
 * Returns: 0 on success, -ENOBUFS when we run out of space
 */

static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
{
	const struct gfs2_glock_operations *glops = gl->gl_ops;
	unsigned long long dtime;
	const struct gfs2_holder *gh;
	char gflags_buf[32];
	int error = 0;

	dtime = jiffies - gl->gl_demote_time;
	dtime *= 1000000/HZ; /* demote time in uSec */
	if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
		dtime = 0;
	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n",
		       state2str(gl->gl_state),
		       gl->gl_name.ln_type,
		       (unsigned long long)gl->gl_name.ln_number,
		       gflags2str(gflags_buf, &gl->gl_flags),
		       state2str(gl->gl_target),
		       state2str(gl->gl_demote_state), dtime,
		       atomic_read(&gl->gl_ail_count),
		       atomic_read(&gl->gl_ref));

	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
		error = dump_holder(seq, gh);
		if (error)
			goto out;
	}
	if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
		error = glops->go_dump(seq, gl);
out:
	return error;
}
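
/*
 * For illustration only, a made-up example of the resulting output; the
 * field letters correspond to the format strings above:
 *
 *	G:  s:SH n:2/4857 f:ly t:SH d:EX/13620000 a:0 r:3
 *	 H: s:SH f:EH e:0 p:3057 [cat] gfs2_inode_lookup+0x11d/0x2a0 [gfs2]
 */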

static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
{
	int ret;

	spin_lock(&gl->gl_spin);
	ret = __dump_glock(seq, gl);
	spin_unlock(&gl->gl_spin);
	return ret;
}

/**
 * gfs2_dump_lockstate - print out the current lockstate
 * @sdp: the filesystem
 *
 * Dumps the lock state of every glock belonging to @sdp to the console.
 *
 */

static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
{
	struct gfs2_glock *gl;
	struct hlist_node *h;
	unsigned int x;
	int error = 0;

	for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {

		read_lock(gl_lock_addr(x));

		hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
			if (gl->gl_sbd != sdp)
				continue;

			error = dump_glock(NULL, gl);
			if (error)
				break;
		}

		read_unlock(gl_lock_addr(x));

		if (error)
			break;
	}

	return error;
}

int __init gfs2_glock_init(void)
{
	unsigned i;

	for (i = 0; i < GFS2_GL_HASH_SIZE; i++) {
		INIT_HLIST_HEAD(&gl_hash_table[i].hb_list);
	}
#ifdef GL_HASH_LOCK_SZ
	for (i = 0; i < GL_HASH_LOCK_SZ; i++) {
		rwlock_init(&gl_hash_locks[i]);
	}
#endif

	glock_workqueue = create_workqueue("glock_workqueue");
	if (IS_ERR(glock_workqueue))
		return PTR_ERR(glock_workqueue);

	register_shrinker(&glock_shrinker);

	return 0;
}

void gfs2_glock_exit(void)
{
	unregister_shrinker(&glock_shrinker);
	destroy_workqueue(glock_workqueue);
}

static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
{
	struct gfs2_glock *gl;

restart:
	read_lock(gl_lock_addr(gi->hash));
	gl = gi->gl;
	if (gl) {
		gi->gl = hlist_entry(gl->gl_list.next,
				     struct gfs2_glock, gl_list);
	} else {
		gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
				     struct gfs2_glock, gl_list);
	}
	if (gi->gl)
		gfs2_glock_hold(gi->gl);
	read_unlock(gl_lock_addr(gi->hash));
	if (gl)
		gfs2_glock_put(gl);
	while (gi->gl == NULL) {
		gi->hash++;
		if (gi->hash >= GFS2_GL_HASH_SIZE)
			return 1;
		read_lock(gl_lock_addr(gi->hash));
		gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
				     struct gfs2_glock, gl_list);
		if (gi->gl)
			gfs2_glock_hold(gi->gl);
		read_unlock(gl_lock_addr(gi->hash));
	}

	if (gi->sdp != gi->gl->gl_sbd)
		goto restart;

	return 0;
}

static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
{
	if (gi->gl)
		gfs2_glock_put(gi->gl);
	gi->gl = NULL;
}

static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct gfs2_glock_iter *gi = seq->private;
	loff_t n = *pos;

	gi->hash = 0;

	do {
		if (gfs2_glock_iter_next(gi)) {
			gfs2_glock_iter_free(gi);
			return NULL;
		}
	} while (n--);

	return gi->gl;
}

static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
				 loff_t *pos)
{
	struct gfs2_glock_iter *gi = seq->private;

	(*pos)++;

	if (gfs2_glock_iter_next(gi)) {
		gfs2_glock_iter_free(gi);
		return NULL;
	}

	return gi->gl;
}

static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
{
	struct gfs2_glock_iter *gi = seq->private;

	gfs2_glock_iter_free(gi);
}

static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
{
	return dump_glock(seq, iter_ptr);
}

static const struct seq_operations gfs2_glock_seq_ops = {
	.start = gfs2_glock_seq_start,
	.next  = gfs2_glock_seq_next,
	.stop  = gfs2_glock_seq_stop,
	.show  = gfs2_glock_seq_show,
};

static int gfs2_debugfs_open(struct inode *inode, struct file *file)
{
	int ret = seq_open_private(file, &gfs2_glock_seq_ops,
				   sizeof(struct gfs2_glock_iter));
	if (ret == 0) {
		struct seq_file *seq = file->private_data;
		struct gfs2_glock_iter *gi = seq->private;
		gi->sdp = inode->i_private;
	}
	return ret;
}

static const struct file_operations gfs2_debug_fops = {
	.owner   = THIS_MODULE,
	.open    = gfs2_debugfs_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};

int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
{
	sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
	if (!sdp->debugfs_dir)
		return -ENOMEM;
	sdp->debugfs_dentry_glocks = debugfs_create_file("glocks",
							 S_IFREG | S_IRUGO,
							 sdp->debugfs_dir, sdp,
							 &gfs2_debug_fops);
	if (!sdp->debugfs_dentry_glocks)
		return -ENOMEM;

	return 0;
}
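
/*
 * With debugfs mounted in the usual place, the file created above is
 * normally visible as /sys/kernel/debug/gfs2/<table name>/glocks, where
 * <table name> is sdp->sd_table_name.
 */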

void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
{
	if (sdp && sdp->debugfs_dir) {
		if (sdp->debugfs_dentry_glocks) {
			debugfs_remove(sdp->debugfs_dentry_glocks);
			sdp->debugfs_dentry_glocks = NULL;
		}
		debugfs_remove(sdp->debugfs_dir);
		sdp->debugfs_dir = NULL;
	}
}

int gfs2_register_debugfs(void)
{
	gfs2_root = debugfs_create_dir("gfs2", NULL);
	return gfs2_root ? 0 : -ENOMEM;
}

void gfs2_unregister_debugfs(void)
{
	debugfs_remove(gfs2_root);
	gfs2_root = NULL;
}