2006-01-16 19:50:04 +03:00
/*
* Copyright ( C ) Sistina Software , Inc . 1997 - 2003 All rights reserved .
2008-01-31 19:31:39 +03:00
* Copyright ( C ) 2004 - 2008 Red Hat , Inc . All rights reserved .
2006-01-16 19:50:04 +03:00
*
* This copyrighted material is made available to anyone wishing to use ,
* modify , copy , or redistribute it subject to the terms and conditions
2006-09-01 19:05:15 +04:00
* of the GNU General Public License version 2.
2006-01-16 19:50:04 +03:00
*/
# include <linux/sched.h>
# include <linux/slab.h>
# include <linux/spinlock.h>
# include <linux/buffer_head.h>
# include <linux/delay.h>
# include <linux/sort.h>
# include <linux/jhash.h>
2006-03-29 23:36:49 +04:00
# include <linux/kallsyms.h>
2006-02-28 01:23:27 +03:00
# include <linux/gfs2_ondisk.h>
2006-09-12 05:40:30 +04:00
# include <linux/list.h>
2007-01-17 18:33:23 +03:00
# include <linux/wait.h>
2007-03-06 10:10:39 +03:00
# include <linux/module.h>
2006-01-16 19:50:04 +03:00
# include <asm/uaccess.h>
2007-03-16 13:26:37 +03:00
# include <linux/seq_file.h>
# include <linux/debugfs.h>
2007-08-01 16:57:10 +04:00
# include <linux/kthread.h>
# include <linux/freezer.h>
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time of a tenth of a second. This ensures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acquire a lock on it, but it has already been locked at a higher level.
This patch makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, except that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
# include <linux/workqueue.h>
# include <linux/jiffies.h>
2011-01-19 12:30:01 +03:00
# include <linux/rcupdate.h>
# include <linux/rculist_bl.h>
# include <linux/bit_spinlock.h>
2006-01-16 19:50:04 +03:00
# include "gfs2.h"
2006-02-28 01:23:27 +03:00
# include "incore.h"
2006-01-16 19:50:04 +03:00
# include "glock.h"
# include "glops.h"
# include "inode.h"
# include "lops.h"
# include "meta_io.h"
# include "quota.h"
# include "super.h"
2006-02-28 01:23:27 +03:00
# include "util.h"
2008-11-18 16:38:48 +03:00
# include "bmap.h"
2009-06-12 11:49:20 +04:00
# define CREATE_TRACE_POINTS
# include "trace_gfs2.h"
2006-01-16 19:50:04 +03:00
2008-05-21 20:03:22 +04:00
/* Cursor state kept while walking the glock hash table. */
struct gfs2_glock_iter {
	int hash;			/* index of the current hash bucket */
	struct gfs2_sbd *sdp;		/* in-core superblock */
	struct gfs2_glock *gl;		/* glock the cursor currently points at */
	char string[512];		/* scratch buffer */
};
2006-01-16 19:50:04 +03:00
/* Callback type: a function that is handed a single glock to examine. */
typedef void ( * glock_examiner ) ( struct gfs2_glock * gl ) ;
2008-05-21 20:03:22 +04:00
/* Forward declaration; needed by GLOCK_BUG_ON() below. */
static int __dump_glock ( struct seq_file * seq , const struct gfs2_glock * gl ) ;
/* If condition x is true, dump the glock via __dump_glock(NULL, gl) and BUG(). */
# define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
/* Forward declaration; finish_xmote() calls do_xmote() before it is defined. */
static void do_xmote ( struct gfs2_glock * gl , struct gfs2_holder * gh , unsigned int target ) ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time of a tenth of a second. This ensures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acquire a lock on it, but it has already been locked at a higher level.
This patch makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, except that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
2007-03-16 13:26:37 +03:00
/* NOTE(review): presumably the debugfs root directory for GFS2; it is not used in the visible part of this file - confirm. */
static struct dentry * gfs2_root ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time of a tenth of a second. This ensures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acquire a lock on it, but it has already been locked at a higher level.
This patch makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, except that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
/* Workqueue on which each glock's delayed work item (gl_work) is queued. */
static struct workqueue_struct * glock_workqueue ;
2009-07-24 03:52:34 +04:00
/* Non-static workqueue; presumably where gl_delete work (delete_work_func) is queued by other files - TODO confirm. */
struct workqueue_struct * gfs2_delete_workqueue ;
2008-11-20 16:39:47 +03:00
/* LRU list of demotable glocks, linked through gl_lru. */
static LIST_HEAD ( lru_list ) ;
/* Number of glocks currently on lru_list. */
static atomic_t lru_count = ATOMIC_INIT ( 0 ) ;
2008-12-25 17:35:27 +03:00
/* Held while lru_list is modified and lru_count is adjusted. */
static DEFINE_SPINLOCK ( lru_lock ) ;
2006-04-28 18:59:12 +04:00
2006-09-12 18:10:01 +04:00
/* The glock hash table has (1 << GFS2_GL_HASH_SHIFT) buckets. */
# define GFS2_GL_HASH_SHIFT 15
2006-09-10 00:59:11 +04:00
# define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
/* Mask used by gl_hash() to reduce a hash value to a bucket index. */
# define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
2011-01-19 12:30:01 +03:00
/* Hash buckets; bit 0 of each bucket head doubles as that bucket's bit spinlock. */
static struct hlist_bl_head gl_hash_table [ GFS2_GL_HASH_SIZE ] ;
2007-03-24 01:05:15 +03:00
/* NOTE(review): gfs2_root is already declared earlier in this file; this repeated declaration looks redundant. */
static struct dentry * gfs2_root ;
2006-09-10 00:59:11 +04:00
2006-01-16 19:50:04 +03:00
/**
* gl_hash ( ) - Turn glock number into hash bucket number
* @ lock : The glock number
*
* Returns : The number of the corresponding hash bucket
*/
2006-09-07 21:12:27 +04:00
static unsigned int gl_hash ( const struct gfs2_sbd * sdp ,
const struct lm_lockname * name )
2006-01-16 19:50:04 +03:00
{
unsigned int h ;
2006-09-04 20:49:07 +04:00
h = jhash ( & name - > ln_number , sizeof ( u64 ) , 0 ) ;
2006-01-16 19:50:04 +03:00
h = jhash ( & name - > ln_type , sizeof ( unsigned int ) , h ) ;
2006-09-07 21:12:27 +04:00
h = jhash ( & sdp , sizeof ( struct gfs2_sbd * ) , h ) ;
2006-01-16 19:50:04 +03:00
h & = GFS2_GL_HASH_MASK ;
return h ;
}
2011-01-19 12:30:01 +03:00
/* Acquire the bit spinlock (bit 0 of the bucket head) for hash bucket @hash. */
static inline void spin_lock_bucket(unsigned int hash)
{
	bit_spin_lock(0, (unsigned long *)&gl_hash_table[hash]);
}
2006-01-16 19:50:04 +03:00
2011-01-19 12:30:01 +03:00
/* Release the bit spinlock (bit 0 of the bucket head) for hash bucket @hash. */
static inline void spin_unlock_bucket(unsigned int hash)
{
	__bit_spin_unlock(0, (unsigned long *)&gl_hash_table[hash]);
}
2006-01-16 19:50:04 +03:00
2011-03-09 13:58:04 +03:00
static void gfs2_glock_dealloc ( struct rcu_head * rcu )
2006-01-16 19:50:04 +03:00
{
2011-01-19 12:30:01 +03:00
struct gfs2_glock * gl = container_of ( rcu , struct gfs2_glock , gl_rcu ) ;
2006-01-16 19:50:04 +03:00
2011-01-19 12:30:01 +03:00
if ( gl - > gl_ops - > go_flags & GLOF_ASPACE )
kmem_cache_free ( gfs2_glock_aspace_cachep , gl ) ;
else
kmem_cache_free ( gfs2_glock_cachep , gl ) ;
2011-03-09 13:58:04 +03:00
}
void gfs2_glock_free ( struct gfs2_glock * gl )
2006-01-16 19:50:04 +03:00
{
struct gfs2_sbd * sdp = gl - > gl_sbd ;
2011-03-09 13:58:04 +03:00
call_rcu ( & gl - > gl_rcu , gfs2_glock_dealloc ) ;
2011-01-19 12:30:01 +03:00
if ( atomic_dec_and_test ( & sdp - > sd_glock_disposal ) )
wake_up ( & sdp - > sd_glock_wait ) ;
2006-01-16 19:50:04 +03:00
}
/**
* gfs2_glock_hold ( ) - increment reference count on glock
* @ gl : The glock to hold
*
*/
2009-07-24 03:52:34 +04:00
void gfs2_glock_hold ( struct gfs2_glock * gl )
2006-01-16 19:50:04 +03:00
{
2009-02-05 13:12:38 +03:00
GLOCK_BUG_ON ( gl , atomic_read ( & gl - > gl_ref ) = = 0 ) ;
2006-09-13 18:43:37 +04:00
atomic_inc ( & gl - > gl_ref ) ;
2006-01-16 19:50:04 +03:00
}
2009-07-11 03:04:24 +04:00
/**
* demote_ok - Check to see if it ' s ok to unlock a glock
* @ gl : the glock
*
* Returns : 1 if it ' s ok
*/
static int demote_ok ( const struct gfs2_glock * gl )
{
const struct gfs2_glock_operations * glops = gl - > gl_ops ;
2011-01-19 12:30:01 +03:00
/* assert_spin_locked(&gl->gl_spin); */
2009-07-11 03:04:24 +04:00
if ( gl - > gl_state = = LM_ST_UNLOCKED )
return 0 ;
2011-01-19 12:30:01 +03:00
if ( test_bit ( GLF_LFLUSH , & gl - > gl_flags ) )
return 0 ;
if ( ( gl - > gl_name . ln_type ! = LM_TYPE_INODE ) & &
! list_empty ( & gl - > gl_holders ) )
2009-07-11 03:04:24 +04:00
return 0 ;
if ( glops - > go_demote_ok )
return glops - > go_demote_ok ( gl ) ;
return 1 ;
}
2011-01-19 12:30:01 +03:00
2008-11-20 16:39:47 +03:00
/**
2011-01-19 12:30:01 +03:00
* __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
2008-11-20 16:39:47 +03:00
* @ gl : the glock
*
2011-01-19 12:30:01 +03:00
* If the glock is demotable , then we add it ( or move it ) to the end
* of the glock LRU list .
2008-11-20 16:39:47 +03:00
*/
2011-01-19 12:30:01 +03:00
static void __gfs2_glock_schedule_for_reclaim ( struct gfs2_glock * gl )
2008-11-20 16:39:47 +03:00
{
2011-01-19 12:30:01 +03:00
if ( demote_ok ( gl ) ) {
spin_lock ( & lru_lock ) ;
if ( ! list_empty ( & gl - > gl_lru ) )
list_del_init ( & gl - > gl_lru ) ;
else
atomic_inc ( & lru_count ) ;
2008-11-20 16:39:47 +03:00
list_add_tail ( & gl - > gl_lru , & lru_list ) ;
2011-01-19 12:30:01 +03:00
spin_unlock ( & lru_lock ) ;
2008-11-20 16:39:47 +03:00
}
2011-01-19 12:30:01 +03:00
}
void gfs2_glock_schedule_for_reclaim ( struct gfs2_glock * gl )
{
spin_lock ( & gl - > gl_spin ) ;
__gfs2_glock_schedule_for_reclaim ( gl ) ;
spin_unlock ( & gl - > gl_spin ) ;
2008-11-20 16:39:47 +03:00
}
2009-07-11 03:04:24 +04:00
/**
* gfs2_glock_put_nolock ( ) - Decrement reference count on glock
* @ gl : The glock to put
*
* This function should only be used if the caller has its own reference
* to the glock , in addition to the one it is dropping .
*/
2009-07-24 03:52:34 +04:00
void gfs2_glock_put_nolock ( struct gfs2_glock * gl )
2009-07-11 03:04:24 +04:00
{
if ( atomic_dec_and_test ( & gl - > gl_ref ) )
GLOCK_BUG_ON ( gl , 1 ) ;
}
2006-01-16 19:50:04 +03:00
/**
* gfs2_glock_put ( ) - Decrement reference count on glock
* @ gl : The glock to put
*
*/
2011-01-19 12:30:01 +03:00
void gfs2_glock_put ( struct gfs2_glock * gl )
2006-01-16 19:50:04 +03:00
{
2011-01-19 12:30:01 +03:00
struct gfs2_sbd * sdp = gl - > gl_sbd ;
struct address_space * mapping = gfs2_glock2aspace ( gl ) ;
2006-01-16 19:50:04 +03:00
2011-01-19 12:30:01 +03:00
if ( atomic_dec_and_test ( & gl - > gl_ref ) ) {
spin_lock_bucket ( gl - > gl_hash ) ;
hlist_bl_del_rcu ( & gl - > gl_list ) ;
spin_unlock_bucket ( gl - > gl_hash ) ;
spin_lock ( & lru_lock ) ;
2008-11-20 16:39:47 +03:00
if ( ! list_empty ( & gl - > gl_lru ) ) {
list_del_init ( & gl - > gl_lru ) ;
atomic_dec ( & lru_count ) ;
}
spin_unlock ( & lru_lock ) ;
2008-05-21 20:03:22 +04:00
GLOCK_BUG_ON ( gl , ! list_empty ( & gl - > gl_holders ) ) ;
2011-01-19 12:30:01 +03:00
GLOCK_BUG_ON ( gl , mapping & & mapping - > nrpages ) ;
trace_gfs2_glock_put ( gl ) ;
sdp - > sd_lockstruct . ls_ops - > lm_put_lock ( gl ) ;
2006-01-16 19:50:04 +03:00
}
}
/**
* search_bucket ( ) - Find struct gfs2_glock by lock number
* @ bucket : the bucket to search
* @ name : The lock name
*
* Returns : NULL , or the struct gfs2_glock with the requested number
*/
2006-09-08 21:35:56 +04:00
static struct gfs2_glock * search_bucket ( unsigned int hash ,
2006-08-30 20:50:28 +04:00
const struct gfs2_sbd * sdp ,
2006-08-30 19:16:23 +04:00
const struct lm_lockname * name )
2006-01-16 19:50:04 +03:00
{
struct gfs2_glock * gl ;
2011-01-19 12:30:01 +03:00
struct hlist_bl_node * h ;
2006-01-16 19:50:04 +03:00
2011-01-19 12:30:01 +03:00
hlist_bl_for_each_entry_rcu ( gl , h , & gl_hash_table [ hash ] , gl_list ) {
2006-01-16 19:50:04 +03:00
if ( ! lm_name_equal ( & gl - > gl_name , name ) )
continue ;
2006-08-30 20:50:28 +04:00
if ( gl - > gl_sbd ! = sdp )
continue ;
2011-01-19 12:30:01 +03:00
if ( atomic_inc_not_zero ( & gl - > gl_ref ) )
return gl ;
2006-01-16 19:50:04 +03:00
}
return NULL ;
}
2008-05-21 20:03:22 +04:00
/**
* may_grant - check if its ok to grant a new lock
* @ gl : The glock
* @ gh : The lock request which we wish to grant
*
* Returns : true if its ok to grant the lock
*/
static inline int may_grant ( const struct gfs2_glock * gl , const struct gfs2_holder * gh )
{
const struct gfs2_holder * gh_head = list_entry ( gl - > gl_holders . next , const struct gfs2_holder , gh_list ) ;
if ( ( gh - > gh_state = = LM_ST_EXCLUSIVE | |
gh_head - > gh_state = = LM_ST_EXCLUSIVE ) & & gh ! = gh_head )
return 0 ;
if ( gl - > gl_state = = gh - > gh_state )
return 1 ;
if ( gh - > gh_flags & GL_EXACT )
return 0 ;
2008-07-07 13:07:28 +04:00
if ( gl - > gl_state = = LM_ST_EXCLUSIVE ) {
if ( gh - > gh_state = = LM_ST_SHARED & & gh_head - > gh_state = = LM_ST_SHARED )
return 1 ;
if ( gh - > gh_state = = LM_ST_DEFERRED & & gh_head - > gh_state = = LM_ST_DEFERRED )
return 1 ;
}
2008-05-21 20:03:22 +04:00
if ( gl - > gl_state ! = LM_ST_UNLOCKED & & ( gh - > gh_flags & LM_FLAG_ANY ) )
return 1 ;
return 0 ;
}
static void gfs2_holder_wake ( struct gfs2_holder * gh )
{
clear_bit ( HIF_WAIT , & gh - > gh_iflags ) ;
smp_mb__after_clear_bit ( ) ;
wake_up_bit ( & gh - > gh_iflags , HIF_WAIT ) ;
}
2010-07-23 17:05:51 +04:00
/**
* do_error - Something unexpected has happened during a lock request
*
*/
static inline void do_error ( struct gfs2_glock * gl , const int ret )
{
struct gfs2_holder * gh , * tmp ;
list_for_each_entry_safe ( gh , tmp , & gl - > gl_holders , gh_list ) {
if ( test_bit ( HIF_HOLDER , & gh - > gh_iflags ) )
continue ;
if ( ret & LM_OUT_ERROR )
gh - > gh_error = - EIO ;
else if ( gh - > gh_flags & ( LM_FLAG_TRY | LM_FLAG_TRY_1CB ) )
gh - > gh_error = GLR_TRYFAILED ;
else
continue ;
list_del_init ( & gh - > gh_list ) ;
trace_gfs2_glock_queue ( gh , 0 ) ;
gfs2_holder_wake ( gh ) ;
}
}
2008-05-21 20:03:22 +04:00
/**
 * do_promote - promote as many requests as possible on the current queue
 * @gl: The glock
 *
 * Walks the holder list and marks every grantable waiter as a holder.
 * gl->gl_spin is dropped and re-acquired around the type specific
 * go_lock() callback, after which the scan restarts from the beginning.
 *
 * Returns: 1 if there is a blocked holder at the head of the list, 2
 *          if a type specific operation is underway (go_lock() returned 1),
 *          or 0 otherwise.
 */
static int do_promote ( struct gfs2_glock * gl )
2008-10-24 22:31:12 +04:00
__releases ( & gl - > gl_spin )
__acquires ( & gl - > gl_spin )
2008-05-21 20:03:22 +04:00
{
const struct gfs2_glock_operations * glops = gl - > gl_ops ;
struct gfs2_holder * gh , * tmp ;
int ret ;
restart :
list_for_each_entry_safe ( gh , tmp , & gl - > gl_holders , gh_list ) {
/* Already granted - nothing to do for this entry */
if ( test_bit ( HIF_HOLDER , & gh - > gh_iflags ) )
continue ;
if ( may_grant ( gl , gh ) ) {
/* First entry on the list and the glock type has a go_lock hook */
if ( gh - > gh_list . prev = = & gl - > gl_holders & &
glops - > go_lock ) {
spin_unlock ( & gl - > gl_spin ) ;
/* FIXME: eliminate this eventually */
ret = glops - > go_lock ( gh ) ;
spin_lock ( & gl - > gl_spin ) ;
if ( ret ) {
2008-11-18 16:38:48 +03:00
/* go_lock() returned 1: reported to the caller as 2 */
if ( ret = = 1 )
return 2 ;
2008-05-21 20:03:22 +04:00
/* Any other non-zero value: fail this request, wake it, rescan */
gh - > gh_error = ret ;
list_del_init ( & gh - > gh_list ) ;
2009-06-12 11:49:20 +04:00
trace_gfs2_glock_queue ( gh , 0 ) ;
2008-05-21 20:03:22 +04:00
gfs2_holder_wake ( gh ) ;
goto restart ;
}
/* The lock was dropped above, so rescan after granting */
set_bit ( HIF_HOLDER , & gh - > gh_iflags ) ;
2009-06-12 11:49:20 +04:00
trace_gfs2_promote ( gh , 1 ) ;
2008-05-21 20:03:22 +04:00
gfs2_holder_wake ( gh ) ;
goto restart ;
}
/* No go_lock() call needed: grant and carry on with the next entry */
set_bit ( HIF_HOLDER , & gh - > gh_iflags ) ;
2009-06-12 11:49:20 +04:00
trace_gfs2_promote ( gh , 0 ) ;
2008-05-21 20:03:22 +04:00
gfs2_holder_wake ( gh ) ;
continue ;
}
/* Cannot be granted and it is the first entry on the list: blocked */
if ( gh - > gh_list . prev = = & gl - > gl_holders )
return 1 ;
2010-07-23 17:05:51 +04:00
/* Cannot be granted, not at the head: fail queued try locks and stop */
do_error ( gl , 0 ) ;
2008-05-21 20:03:22 +04:00
break ;
}
return 0 ;
}
/**
* find_first_waiter - find the first gh that ' s waiting for the glock
* @ gl : the glock
*/
static inline struct gfs2_holder * find_first_waiter ( const struct gfs2_glock * gl )
{
struct gfs2_holder * gh ;
list_for_each_entry ( gh , & gl - > gl_holders , gh_list ) {
if ( ! test_bit ( HIF_HOLDER , & gh - > gh_iflags ) )
return gh ;
}
return NULL ;
}
/**
* state_change - record that the glock is now in a different state
* @ gl : the glock
* @ new_state the new state
*
*/
static void state_change ( struct gfs2_glock * gl , unsigned int new_state )
{
int held1 , held2 ;
held1 = ( gl - > gl_state ! = LM_ST_UNLOCKED ) ;
held2 = ( new_state ! = LM_ST_UNLOCKED ) ;
if ( held1 ! = held2 ) {
if ( held2 )
gfs2_glock_hold ( gl ) ;
else
2009-07-11 03:04:24 +04:00
gfs2_glock_put_nolock ( gl ) ;
2008-05-21 20:03:22 +04:00
}
2010-09-03 12:39:20 +04:00
if ( held1 & & held2 & & list_empty ( & gl - > gl_holders ) )
clear_bit ( GLF_QUEUED , & gl - > gl_flags ) ;
2008-05-21 20:03:22 +04:00
gl - > gl_state = new_state ;
gl - > gl_tchange = jiffies ;
}
static void gfs2_demote_wake ( struct gfs2_glock * gl )
{
gl - > gl_demote_state = LM_ST_EXCLUSIVE ;
clear_bit ( GLF_DEMOTE , & gl - > gl_flags ) ;
smp_mb__after_clear_bit ( ) ;
wake_up_bit ( & gl - > gl_flags , GLF_DEMOTE ) ;
}
/**
 * finish_xmote - The DLM has replied to one of our lock requests
 * @gl: The glock
 * @ret: The status from the DLM; the bits under LM_OUT_ST_MASK carry the
 *       new lock state, LM_OUT_CANCELED and LM_OUT_ERROR are checked as flags
 *
 * Takes gl->gl_spin itself.  Records the new state and then either retries
 * via do_xmote() when the state is not the intended target, reports errors
 * and failed try locks through do_error(), or promotes waiters with
 * do_promote().  GLF_LOCK is cleared on the "out" path; it is left set when
 * do_xmote() was called again or when do_promote() returned 2.
 */
static void finish_xmote ( struct gfs2_glock * gl , unsigned int ret )
{
const struct gfs2_glock_operations * glops = gl - > gl_ops ;
struct gfs2_holder * gh ;
/* New lock state as reported in ret */
unsigned state = ret & LM_OUT_ST_MASK ;
2008-11-18 16:38:48 +03:00
int rv ;
2008-05-21 20:03:22 +04:00
spin_lock ( & gl - > gl_spin ) ;
2009-06-12 11:49:20 +04:00
trace_gfs2_glock_state_change ( gl , state ) ;
2008-05-21 20:03:22 +04:00
state_change ( gl , state ) ;
gh = find_first_waiter ( gl ) ;
/* Demote to UN request arrived during demote to SH or DF */
if ( test_bit ( GLF_DEMOTE_IN_PROGRESS , & gl - > gl_flags ) & &
state ! = LM_ST_UNLOCKED & & gl - > gl_demote_state = = LM_ST_UNLOCKED )
gl - > gl_target = LM_ST_UNLOCKED ;
/* Check for state != intended state */
if ( unlikely ( state ! = gl - > gl_target ) ) {
if ( gh & & ! test_bit ( GLF_DEMOTE_IN_PROGRESS , & gl - > gl_flags ) ) {
/* move to back of queue and try next entry */
if ( ret & LM_OUT_CANCELED ) {
/* Requests flagged LM_FLAG_PRIORITY keep their place in the queue */
if ( ( gh - > gh_flags & LM_FLAG_PRIORITY ) = = 0 )
list_move_tail ( & gh - > gh_list , & gl - > gl_holders ) ;
gh = find_first_waiter ( gl ) ;
gl - > gl_target = gh - > gh_state ;
goto retry ;
}
/* Some error or failed "try lock" - report it */
if ( ( ret & LM_OUT_ERROR ) | |
( gh - > gh_flags & ( LM_FLAG_TRY | LM_FLAG_TRY_1CB ) ) ) {
gl - > gl_target = gl - > gl_state ;
do_error ( gl , ret ) ;
goto out ;
}
}
switch ( state ) {
/* Unlocked due to conversion deadlock, try again */
case LM_ST_UNLOCKED :
retry :
do_xmote ( gl , gh , gl - > gl_target ) ;
break ;
/* Conversion fails, unlock and try again */
case LM_ST_SHARED :
case LM_ST_DEFERRED :
do_xmote ( gl , gh , LM_ST_UNLOCKED ) ;
break ;
default : /* Everything else */
printk ( KERN_ERR " GFS2: wanted %u got %u \n " , gl - > gl_target , state ) ;
GLOCK_BUG_ON ( gl , 1 ) ;
}
/* A new request was issued above: return with GLF_LOCK still set */
spin_unlock ( & gl - > gl_spin ) ;
return ;
}
/* Fast path - we got what we asked for */
if ( test_and_clear_bit ( GLF_DEMOTE_IN_PROGRESS , & gl - > gl_flags ) )
gfs2_demote_wake ( gl ) ;
if ( state ! = LM_ST_UNLOCKED ) {
if ( glops - > go_xmote_bh ) {
/* The type specific bottom half is called without gl_spin held */
spin_unlock ( & gl - > gl_spin ) ;
rv = glops - > go_xmote_bh ( gl , gh ) ;
spin_lock ( & gl - > gl_spin ) ;
if ( rv ) {
do_error ( gl , rv ) ;
goto out ;
}
}
2008-11-18 16:38:48 +03:00
rv = do_promote ( gl ) ;
/* do_promote() returned 2: skip clearing GLF_LOCK */
if ( rv = = 2 )
goto out_locked ;
2008-05-21 20:03:22 +04:00
}
out :
clear_bit ( GLF_LOCK , & gl - > gl_flags ) ;
2008-11-18 16:38:48 +03:00
out_locked :
2008-05-21 20:03:22 +04:00
spin_unlock ( & gl - > gl_spin ) ;
}
/**
 * do_xmote - Calls the DLM to change the state of a lock
 * @gl: The lock state
 * @gh: The holder (only for promotes); may be NULL, in which case no
 *      holder flags are passed on
 * @target: The target lock state
 *
 * Entered with gl->gl_spin held.  The spinlock is released while the type
 * specific hooks and the lock module are called and is re-acquired before
 * returning.  When the lock module provides no lm_lock operation
 * (lock_nolock) the request is completed directly through finish_xmote().
 */
static void do_xmote ( struct gfs2_glock * gl , struct gfs2_holder * gh , unsigned int target )
2008-10-24 22:31:12 +04:00
__releases ( & gl - > gl_spin )
__acquires ( & gl - > gl_spin )
2008-05-21 20:03:22 +04:00
{
const struct gfs2_glock_operations * glops = gl - > gl_ops ;
struct gfs2_sbd * sdp = gl - > gl_sbd ;
unsigned int lck_flags = gh ? gh - > gh_flags : 0 ;
int ret ;
/* Only these holder flags are forwarded to the lock module */
lck_flags & = ( LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
LM_FLAG_PRIORITY ) ;
2010-11-29 15:50:38 +03:00
/* A state change to the current state makes no sense */
GLOCK_BUG_ON ( gl , gl - > gl_state = = target ) ;
GLOCK_BUG_ON ( gl , gl - > gl_state = = gl - > gl_target ) ;
2008-05-21 20:03:22 +04:00
/* Going to UN or DF with a go_inval hook: invalidation will be needed */
if ( ( target = = LM_ST_UNLOCKED | | target = = LM_ST_DEFERRED ) & &
glops - > go_inval ) {
set_bit ( GLF_INVALIDATE_IN_PROGRESS , & gl - > gl_flags ) ;
do_error ( gl , 0 ) ; /* Fail queued try locks */
}
2010-11-30 18:49:31 +03:00
gl - > gl_req = target ;
2008-05-21 20:03:22 +04:00
spin_unlock ( & gl - > gl_spin ) ;
if ( glops - > go_xmote_th )
glops - > go_xmote_th ( gl ) ;
if ( test_bit ( GLF_INVALIDATE_IN_PROGRESS , & gl - > gl_flags ) )
glops - > go_inval ( gl , target = = LM_ST_DEFERRED ? 0 : DIO_METADATA ) ;
clear_bit ( GLF_INVALIDATE_IN_PROGRESS , & gl - > gl_flags ) ;
/* Take a reference before calling into the lock module */
gfs2_glock_hold ( gl ) ;
/* Converting from SH or DF to another locked state without a try flag: add LM_FLAG_TRY_1CB */
if ( target ! = LM_ST_UNLOCKED & & ( gl - > gl_state = = LM_ST_SHARED | |
gl - > gl_state = = LM_ST_DEFERRED ) & &
! ( lck_flags & ( LM_FLAG_TRY | LM_FLAG_TRY_1CB ) ) )
lck_flags | = LM_FLAG_TRY_1CB ;
2010-11-29 15:50:38 +03:00
if ( sdp - > sd_lockstruct . ls_ops - > lm_lock ) {
/* lock_dlm */
ret = sdp - > sd_lockstruct . ls_ops - > lm_lock ( gl , target , lck_flags ) ;
GLOCK_BUG_ON ( gl , ret ) ;
} else { /* lock_nolock */
finish_xmote ( gl , target ) ;
2008-05-21 20:03:22 +04:00
/* Work already queued: drop the reference taken above */
if ( queue_delayed_work ( glock_workqueue , & gl - > gl_work , 0 ) = = 0 )
gfs2_glock_put ( gl ) ;
}
2010-11-29 15:50:38 +03:00
2008-05-21 20:03:22 +04:00
spin_lock ( & gl - > gl_spin ) ;
}
/**
* find_first_holder - find the first " holder " gh
* @ gl : the glock
*/
static inline struct gfs2_holder * find_first_holder ( const struct gfs2_glock * gl )
{
struct gfs2_holder * gh ;
if ( ! list_empty ( & gl - > gl_holders ) ) {
gh = list_entry ( gl - > gl_holders . next , struct gfs2_holder , gh_list ) ;
if ( test_bit ( HIF_HOLDER , & gh - > gh_iflags ) )
return gh ;
}
return NULL ;
}
/**
 * run_queue - do all outstanding tasks related to a glock
 * @gl: The glock in question
 * @nonblock: True if we must not block in run_queue
 *
 * GLF_LOCK serialises this function: if the bit is already set the call
 * returns at once.  A pending demote request is started here (or handed to
 * the workqueue when @nonblock is set); otherwise waiters are promoted and,
 * if the first waiter is blocked, a state change is requested on its
 * behalf.  GLF_LOCK stays set whenever do_xmote() has been called or
 * do_promote() returned 2.
 */
static void run_queue ( struct gfs2_glock * gl , const int nonblock )
2008-10-24 22:31:12 +04:00
__releases ( & gl - > gl_spin )
__acquires ( & gl - > gl_spin )
2008-05-21 20:03:22 +04:00
{
struct gfs2_holder * gh = NULL ;
2008-11-18 16:38:48 +03:00
int ret ;
2008-05-21 20:03:22 +04:00
/* Somebody else is already working on this glock */
if ( test_and_set_bit ( GLF_LOCK , & gl - > gl_flags ) )
return ;
GLOCK_BUG_ON ( gl , test_bit ( GLF_DEMOTE_IN_PROGRESS , & gl - > gl_flags ) ) ;
if ( test_bit ( GLF_DEMOTE , & gl - > gl_flags ) & &
gl - > gl_demote_state ! = gl - > gl_state ) {
/* The head of the holder list is still a holder: cannot demote yet */
if ( find_first_holder ( gl ) )
2009-02-05 13:12:38 +03:00
goto out_unlock ;
2008-05-21 20:03:22 +04:00
/* Not allowed to block here: let the workqueue do the demote */
if ( nonblock )
goto out_sched ;
set_bit ( GLF_DEMOTE_IN_PROGRESS , & gl - > gl_flags ) ;
2008-07-07 13:02:36 +04:00
GLOCK_BUG_ON ( gl , gl - > gl_demote_state = = LM_ST_EXCLUSIVE ) ;
2008-05-21 20:03:22 +04:00
gl - > gl_target = gl - > gl_demote_state ;
} else {
/* Demote requested to the state we are already in: nothing to do for it */
if ( test_bit ( GLF_DEMOTE , & gl - > gl_flags ) )
gfs2_demote_wake ( gl ) ;
2008-11-18 16:38:48 +03:00
ret = do_promote ( gl ) ;
if ( ret = = 0 )
2009-02-05 13:12:38 +03:00
goto out_unlock ;
2008-11-18 16:38:48 +03:00
/* Type specific operation underway: return with GLF_LOCK still set */
if ( ret = = 2 )
2009-04-07 17:01:34 +04:00
goto out ;
2008-05-21 20:03:22 +04:00
/* do_promote() returned 1: a blocked waiter heads the list */
gh = find_first_waiter ( gl ) ;
gl - > gl_target = gh - > gh_state ;
if ( ! ( gh - > gh_flags & ( LM_FLAG_TRY | LM_FLAG_TRY_1CB ) ) )
do_error ( gl , 0 ) ; /* Fail queued try locks */
}
do_xmote ( gl , gh , gl - > gl_target ) ;
2009-04-07 17:01:34 +04:00
out :
2008-05-21 20:03:22 +04:00
return ;
out_sched :
2009-09-22 13:56:16 +04:00
clear_bit ( GLF_LOCK , & gl - > gl_flags ) ;
smp_mb__after_clear_bit ( ) ;
2008-05-21 20:03:22 +04:00
/* Reference for the queued work; dropped again if it was already queued */
gfs2_glock_hold ( gl ) ;
if ( queue_delayed_work ( glock_workqueue , & gl - > gl_work , 0 ) = = 0 )
2009-07-11 03:04:24 +04:00
gfs2_glock_put_nolock ( gl ) ;
2009-09-22 13:56:16 +04:00
return ;
2009-02-05 13:12:38 +03:00
out_unlock :
2008-05-21 20:03:22 +04:00
clear_bit ( GLF_LOCK , & gl - > gl_flags ) ;
2009-09-22 13:56:16 +04:00
smp_mb__after_clear_bit ( ) ;
return ;
2008-05-21 20:03:22 +04:00
}
2009-07-24 03:52:34 +04:00
static void delete_work_func ( struct work_struct * work )
{
struct gfs2_glock * gl = container_of ( work , struct gfs2_glock , gl_delete ) ;
struct gfs2_sbd * sdp = gl - > gl_sbd ;
2010-11-03 23:01:07 +03:00
struct gfs2_inode * ip ;
2009-07-24 03:52:34 +04:00
struct inode * inode ;
2010-11-03 23:01:07 +03:00
u64 no_addr = gl - > gl_name . ln_number ;
ip = gl - > gl_object ;
/* Note: Unsafe to dereference ip as we don't hold right refs/locks */
2009-07-24 03:52:34 +04:00
if ( ip )
inode = gfs2_ilookup ( sdp - > sd_vfs , no_addr ) ;
2010-11-03 23:01:07 +03:00
else
inode = gfs2_lookup_by_inum ( sdp , no_addr , NULL , GFS2_BLKST_UNLINKED ) ;
if ( inode & & ! IS_ERR ( inode ) ) {
d_prune_aliases ( inode ) ;
iput ( inode ) ;
2009-07-24 03:52:34 +04:00
}
gfs2_glock_put ( gl ) ;
}
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time of a tenth of a second. This ensures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acquire a lock on it, but it has already been locked at a higher level.
This patch makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, except that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
static void glock_work_func ( struct work_struct * work )
{
2008-05-21 20:03:22 +04:00
unsigned long delay = 0 ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time of a tenth of a second. This ensures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acquire a lock on it, but it has already been locked at a higher level.
This patch makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, except that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
struct gfs2_glock * gl = container_of ( work , struct gfs2_glock , gl_work . work ) ;
2009-11-27 13:31:11 +03:00
int drop_ref = 0 ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time of a tenth of a second. This ensures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acquire a lock on it, but it has already been locked at a higher level.
This patch makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, except that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
2009-11-27 13:31:11 +03:00
if ( test_and_clear_bit ( GLF_REPLY_PENDING , & gl - > gl_flags ) ) {
2008-05-21 20:03:22 +04:00
finish_xmote ( gl , gl - > gl_reply ) ;
2009-11-27 13:31:11 +03:00
drop_ref = 1 ;
}
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time of a tenth of a second. This ensures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acquire a lock on it, but it has already been locked at a higher level.
This patch makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, except that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
spin_lock ( & gl - > gl_spin ) ;
2008-07-07 13:02:36 +04:00
if ( test_and_clear_bit ( GLF_PENDING_DEMOTE , & gl - > gl_flags ) & &
gl - > gl_state ! = LM_ST_UNLOCKED & &
gl - > gl_demote_state ! = LM_ST_EXCLUSIVE ) {
2008-05-21 20:03:22 +04:00
unsigned long holdtime , now = jiffies ;
holdtime = gl - > gl_tchange + gl - > gl_ops - > go_min_hold_time ;
if ( time_before ( now , holdtime ) )
delay = holdtime - now ;
set_bit ( delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE , & gl - > gl_flags ) ;
}
run_queue ( gl , 0 ) ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time of a tenth of a second. This ensures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acquire a lock on it, but it has already been locked at a higher level.
This patch makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, except that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
spin_unlock ( & gl - > gl_spin ) ;
2008-05-21 20:03:22 +04:00
if ( ! delay | |
queue_delayed_work ( glock_workqueue , & gl - > gl_work , delay ) = = 0 )
gfs2_glock_put ( gl ) ;
2009-11-27 13:31:11 +03:00
if ( drop_ref )
gfs2_glock_put ( gl ) ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time of a tenth of a second. This ensures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acquire a lock on it, but it has already been locked at a higher level.
This patch makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, except that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
}
2006-01-16 19:50:04 +03:00
/**
* gfs2_glock_get ( ) - Get a glock , or create one if one doesn ' t exist
* @ sdp : The GFS2 superblock
* @ number : the lock number
* @ glops : The glock_operations to use
* @ create : If 0 , don ' t create the glock if it doesn ' t exist
* @ glp : the glock is returned here
*
* This does not lock a glock , just finds / creates structures for one .
*
* Returns : errno
*/
2006-09-04 20:49:07 +04:00
int gfs2_glock_get ( struct gfs2_sbd * sdp , u64 number ,
2006-08-30 17:30:00 +04:00
const struct gfs2_glock_operations * glops , int create ,
2006-01-16 19:50:04 +03:00
struct gfs2_glock * * glp )
{
2009-12-08 15:12:13 +03:00
struct super_block * s = sdp - > sd_vfs ;
2006-09-08 21:35:56 +04:00
struct lm_lockname name = { . ln_number = number , . ln_type = glops - > go_type } ;
2006-01-16 19:50:04 +03:00
struct gfs2_glock * gl , * tmp ;
2006-09-08 21:35:56 +04:00
unsigned int hash = gl_hash ( sdp , & name ) ;
2009-12-08 15:12:13 +03:00
struct address_space * mapping ;
2011-01-19 12:30:01 +03:00
struct kmem_cache * cachep ;
2006-01-16 19:50:04 +03:00
2011-01-19 12:30:01 +03:00
rcu_read_lock ( ) ;
2006-09-08 21:35:56 +04:00
gl = search_bucket ( hash , sdp , & name ) ;
2011-01-19 12:30:01 +03:00
rcu_read_unlock ( ) ;
2006-01-16 19:50:04 +03:00
GFS2: Add a "demote a glock" interface to sysfs
This adds a sysfs file called demote_rq to GFS2's
per filesystem directory. Its possible to use this
file to demote arbitrary glocks in exactly the same
way as if a request had come in from a remote node.
This is intended for testing issues relating to caching
of data under glocks. Despite that, the interface is
generic enough to send requests to any type of glock,
but be careful as its not always safe to send an
arbitrary message to an arbitrary glock. For that reason
and to prevent DoS, this interface is restricted to root
only.
The messages look like this:
<type>:<glocknumber> <mode>
Example:
echo -n "2:13324 EX" >/sys/fs/gfs2/unity:myfs/demote_rq
Which means "please demote inode glock (type 2) number 13324 so that
I can get an EX (exclusive) lock". The lock modes are those which
would normally be sent by a remote node in its callback so if you
want to unlock a glock, you use EX, to demote to shared, use SH or PR
(depending on whether you like GFS2 or DLM lock modes better!).
If the glock doesn't exist, you'll get -ENOENT returned. If the
arguments don't make sense, you'll get -EINVAL returned.
The plan is that this interface will be used in combination with
the blktrace patch which I recently posted for comments although
it is, of course, still useful in its own right.
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2009-02-12 16:31:58 +03:00
* glp = gl ;
if ( gl )
2006-01-16 19:50:04 +03:00
return 0 ;
GFS2: Add a "demote a glock" interface to sysfs
This adds a sysfs file called demote_rq to GFS2's
per filesystem directory. Its possible to use this
file to demote arbitrary glocks in exactly the same
way as if a request had come in from a remote node.
This is intended for testing issues relating to caching
of data under glocks. Despite that, the interface is
generic enough to send requests to any type of glock,
but be careful as its not always safe to send an
arbitrary message to an arbitrary glock. For that reason
and to prevent DoS, this interface is restricted to root
only.
The messages look like this:
<type>:<glocknumber> <mode>
Example:
echo -n "2:13324 EX" >/sys/fs/gfs2/unity:myfs/demote_rq
Which means "please demote inode glock (type 2) number 13324 so that
I can get an EX (exclusive) lock". The lock modes are those which
would normally be sent by a remote node in its callback so if you
want to unlock a glock, you use EX, to demote to shared, use SH or PR
(depending on whether you like GFS2 or DLM lock modes better!).
If the glock doesn't exist, you'll get -ENOENT returned. If the
arguments don't make sense, you'll get -EINVAL returned.
The plan is that this interface will be used in combination with
the blktrace patch which I recently posted for comments although
it is, of course, still useful in its own right.
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2009-02-12 16:31:58 +03:00
if ( ! create )
return - ENOENT ;
2006-01-16 19:50:04 +03:00
2009-12-08 15:12:13 +03:00
if ( glops - > go_flags & GLOF_ASPACE )
2011-01-19 12:30:01 +03:00
cachep = gfs2_glock_aspace_cachep ;
2009-12-08 15:12:13 +03:00
else
2011-01-19 12:30:01 +03:00
cachep = gfs2_glock_cachep ;
gl = kmem_cache_alloc ( cachep , GFP_KERNEL ) ;
2006-01-16 19:50:04 +03:00
if ( ! gl )
return - ENOMEM ;
2010-01-29 18:21:27 +03:00
atomic_inc ( & sdp - > sd_glock_disposal ) ;
2006-08-30 18:36:52 +04:00
gl - > gl_flags = 0 ;
2006-01-16 19:50:04 +03:00
gl - > gl_name = name ;
2006-09-13 18:43:37 +04:00
atomic_set ( & gl - > gl_ref , 1 ) ;
2006-01-16 19:50:04 +03:00
gl - > gl_state = LM_ST_UNLOCKED ;
2008-05-21 20:03:22 +04:00
gl - > gl_target = LM_ST_UNLOCKED ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time a tenth a second. This insures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acqire a lock on it, but it has already been locked at a higher level.
This patch puts makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, execpt that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
gl - > gl_demote_state = LM_ST_EXCLUSIVE ;
2006-09-08 21:35:56 +04:00
gl - > gl_hash = hash ;
2006-01-16 19:50:04 +03:00
gl - > gl_ops = glops ;
2009-01-12 13:43:39 +03:00
snprintf ( gl - > gl_strname , GDLM_STRNAME_BYTES , " %8x%16llx " , name . ln_type , ( unsigned long long ) number ) ;
memset ( & gl - > gl_lksb , 0 , sizeof ( struct dlm_lksb ) ) ;
gl - > gl_lksb . sb_lvbptr = gl - > gl_lvb ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time a tenth a second. This insures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acqire a lock on it, but it has already been locked at a higher level.
This patch puts makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, execpt that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
gl - > gl_tchange = jiffies ;
2006-08-30 18:36:52 +04:00
gl - > gl_object = NULL ;
2006-01-16 19:50:04 +03:00
gl - > gl_sbd = sdp ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time a tenth a second. This insures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acqire a lock on it, but it has already been locked at a higher level.
This patch puts makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, execpt that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
INIT_DELAYED_WORK ( & gl - > gl_work , glock_work_func ) ;
2009-07-24 03:52:34 +04:00
INIT_WORK ( & gl - > gl_delete , delete_work_func ) ;
2006-01-16 19:50:04 +03:00
2009-12-08 15:12:13 +03:00
mapping = gfs2_glock2aspace ( gl ) ;
if ( mapping ) {
mapping - > a_ops = & gfs2_meta_aops ;
mapping - > host = s - > s_bdev - > bd_inode ;
mapping - > flags = 0 ;
mapping_set_gfp_mask ( mapping , GFP_NOFS ) ;
mapping - > assoc_mapping = NULL ;
mapping - > backing_dev_info = s - > s_bdi ;
mapping - > writeback_index = 0 ;
2006-01-16 19:50:04 +03:00
}
2011-01-19 12:30:01 +03:00
spin_lock_bucket ( hash ) ;
2006-09-08 21:35:56 +04:00
tmp = search_bucket ( hash , sdp , & name ) ;
2006-01-16 19:50:04 +03:00
if ( tmp ) {
2011-01-19 12:30:01 +03:00
spin_unlock_bucket ( hash ) ;
kmem_cache_free ( cachep , gl ) ;
2011-03-09 13:58:04 +03:00
atomic_dec ( & sdp - > sd_glock_disposal ) ;
2006-01-16 19:50:04 +03:00
gl = tmp ;
} else {
2011-01-19 12:30:01 +03:00
hlist_bl_add_head_rcu ( & gl - > gl_list , & gl_hash_table [ hash ] ) ;
spin_unlock_bucket ( hash ) ;
2006-01-16 19:50:04 +03:00
}
* glp = gl ;
return 0 ;
}
/**
* gfs2_holder_init - initialize a struct gfs2_holder in the default way
* @ gl : the glock
* @ state : the state we ' re requesting
* @ flags : the modifier flags
* @ gh : the holder structure
*
*/
2006-04-21 00:57:23 +04:00
void gfs2_holder_init ( struct gfs2_glock * gl , unsigned int state , unsigned flags ,
2006-01-16 19:50:04 +03:00
struct gfs2_holder * gh )
{
INIT_LIST_HEAD ( & gh - > gh_list ) ;
gh - > gh_gl = gl ;
2006-03-29 23:36:49 +04:00
gh - > gh_ip = ( unsigned long ) __builtin_return_address ( 0 ) ;
2008-02-07 11:13:19 +03:00
gh - > gh_owner_pid = get_pid ( task_pid ( current ) ) ;
2006-01-16 19:50:04 +03:00
gh - > gh_state = state ;
gh - > gh_flags = flags ;
gh - > gh_error = 0 ;
gh - > gh_iflags = 0 ;
gfs2_glock_hold ( gl ) ;
}
/**
* gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
* @ state : the state we ' re requesting
* @ flags : the modifier flags
* @ gh : the holder structure
*
* Don ' t mess with the glock .
*
*/
2006-04-21 00:57:23 +04:00
void gfs2_holder_reinit ( unsigned int state , unsigned flags , struct gfs2_holder * gh )
2006-01-16 19:50:04 +03:00
{
gh - > gh_state = state ;
2006-04-26 22:58:26 +04:00
gh - > gh_flags = flags ;
2007-03-16 12:40:31 +03:00
gh - > gh_iflags = 0 ;
2006-03-29 23:36:49 +04:00
gh - > gh_ip = ( unsigned long ) __builtin_return_address ( 0 ) ;
2010-04-14 19:58:16 +04:00
if ( gh - > gh_owner_pid )
put_pid ( gh - > gh_owner_pid ) ;
gh - > gh_owner_pid = get_pid ( task_pid ( current ) ) ;
2006-01-16 19:50:04 +03:00
}
/**
* gfs2_holder_uninit - uninitialize a holder structure ( drop glock reference )
* @ gh : the holder structure
*
*/
void gfs2_holder_uninit ( struct gfs2_holder * gh )
{
2008-02-07 11:13:19 +03:00
put_pid ( gh - > gh_owner_pid ) ;
2006-01-16 19:50:04 +03:00
gfs2_glock_put ( gh - > gh_gl ) ;
gh - > gh_gl = NULL ;
2006-03-29 23:36:49 +04:00
gh - > gh_ip = 0 ;
2006-01-16 19:50:04 +03:00
}
2009-05-19 13:01:18 +04:00
/**
 * gfs2_glock_holder_wait - sleep callback used while waiting on a holder
 * @word: unused
 *
 * This function and gfs2_glock_demote_wait both show up in the WCHAN
 * field.  The two are otherwise identical; they are kept as separate
 * functions so that the WCHAN value is more informative to the user.
 *
 * Returns: always 0
 */
static int gfs2_glock_holder_wait(void *word)
{
	schedule();

	return 0;
}
2009-05-19 13:01:18 +04:00
/*
 * gfs2_glock_demote_wait - sleep callback passed to wait_on_bit() by
 * wait_on_demote().
 *
 * @word is unused.  The body is identical to gfs2_glock_holder_wait(); it is
 * a separate function so that the two kinds of wait can be told apart in the
 * WCHAN field.  Always returns 0.
 */
static int gfs2_glock_demote_wait ( void * word )
{
schedule ( ) ;
return 0 ;
}
2008-05-21 20:03:22 +04:00
static void wait_on_holder ( struct gfs2_holder * gh )
2008-01-30 18:34:04 +03:00
{
2008-05-21 20:03:22 +04:00
might_sleep ( ) ;
2009-05-19 13:01:18 +04:00
wait_on_bit ( & gh - > gh_iflags , HIF_WAIT , gfs2_glock_holder_wait , TASK_UNINTERRUPTIBLE ) ;
2008-01-30 18:34:04 +03:00
}
2008-05-21 20:03:22 +04:00
static void wait_on_demote ( struct gfs2_glock * gl )
2006-01-16 19:50:04 +03:00
{
2008-05-21 20:03:22 +04:00
might_sleep ( ) ;
2009-05-19 13:01:18 +04:00
wait_on_bit ( & gl - > gl_flags , GLF_DEMOTE , gfs2_glock_demote_wait , TASK_UNINTERRUPTIBLE ) ;
2006-01-16 19:50:04 +03:00
}
/**
2008-05-21 20:03:22 +04:00
* handle_callback - process a demote request
* @ gl : the glock
* @ state : the state the caller wants us to change to
2006-01-16 19:50:04 +03:00
*
2008-05-21 20:03:22 +04:00
* There are only two requests that we are going to see in actual
* practise : LM_ST_SHARED and LM_ST_UNLOCKED
2006-01-16 19:50:04 +03:00
*/
2008-05-21 20:03:22 +04:00
static void handle_callback ( struct gfs2_glock * gl , unsigned int state ,
2008-11-20 16:39:47 +03:00
unsigned long delay )
2006-01-16 19:50:04 +03:00
{
2008-05-21 20:03:22 +04:00
int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE ;
2006-01-16 19:50:04 +03:00
2008-05-21 20:03:22 +04:00
set_bit ( bit , & gl - > gl_flags ) ;
if ( gl - > gl_demote_state = = LM_ST_EXCLUSIVE ) {
gl - > gl_demote_state = state ;
gl - > gl_demote_time = jiffies ;
} else if ( gl - > gl_demote_state ! = LM_ST_UNLOCKED & &
gl - > gl_demote_state ! = state ) {
gl - > gl_demote_state = LM_ST_UNLOCKED ;
2006-01-16 19:50:04 +03:00
}
2009-07-24 03:52:34 +04:00
if ( gl - > gl_ops - > go_callback )
gl - > gl_ops - > go_callback ( gl ) ;
2009-06-12 11:49:20 +04:00
trace_gfs2_demote_rq ( gl ) ;
2006-01-16 19:50:04 +03:00
}
/**
2008-05-21 20:03:22 +04:00
* gfs2_glock_wait - wait on a glock acquisition
2006-01-16 19:50:04 +03:00
* @ gh : the glock holder
*
* Returns : 0 on success
*/
2008-05-21 20:03:22 +04:00
int gfs2_glock_wait ( struct gfs2_holder * gh )
2006-01-16 19:50:04 +03:00
{
2007-01-17 18:33:23 +03:00
wait_on_holder ( gh ) ;
2006-01-16 19:50:04 +03:00
return gh - > gh_error ;
}
2008-05-21 20:03:22 +04:00
void gfs2_print_dbg ( struct seq_file * seq , const char * fmt , . . . )
2007-03-16 13:26:37 +03:00
{
2010-11-10 03:35:20 +03:00
struct va_format vaf ;
2007-03-16 13:26:37 +03:00
va_list args ;
va_start ( args , fmt ) ;
2010-11-10 03:35:20 +03:00
2008-05-21 20:03:22 +04:00
if ( seq ) {
struct gfs2_glock_iter * gi = seq - > private ;
2007-03-16 13:26:37 +03:00
vsprintf ( gi - > string , fmt , args ) ;
2008-05-21 20:03:22 +04:00
seq_printf ( seq , gi - > string ) ;
} else {
2010-11-10 03:35:20 +03:00
vaf . fmt = fmt ;
vaf . va = & args ;
printk ( KERN_ERR " %pV " , & vaf ) ;
2008-05-21 20:03:22 +04:00
}
2010-11-10 03:35:20 +03:00
2007-03-16 13:26:37 +03:00
va_end ( args ) ;
}
2006-01-16 19:50:04 +03:00
/**
* add_to_queue - Add a holder to the wait queue ( but look for recursion )
* @ gh : the holder structure to add
*
2008-05-21 20:03:22 +04:00
* Eventually we should move the recursive locking trap to a
* debugging option or something like that . This is the fast
* path and needs to have the minimum number of distractions .
*
2006-01-16 19:50:04 +03:00
*/
2008-05-21 20:03:22 +04:00
static inline void add_to_queue ( struct gfs2_holder * gh )
2008-10-24 22:31:12 +04:00
__releases ( & gl - > gl_spin )
__acquires ( & gl - > gl_spin )
2006-01-16 19:50:04 +03:00
{
struct gfs2_glock * gl = gh - > gh_gl ;
2008-05-21 20:03:22 +04:00
struct gfs2_sbd * sdp = gl - > gl_sbd ;
struct list_head * insert_pt = NULL ;
struct gfs2_holder * gh2 ;
int try_lock = 0 ;
2006-01-16 19:50:04 +03:00
2008-02-07 11:13:19 +03:00
BUG_ON ( gh - > gh_owner_pid = = NULL ) ;
2007-01-17 18:33:23 +03:00
if ( test_and_set_bit ( HIF_WAIT , & gh - > gh_iflags ) )
BUG ( ) ;
2006-04-21 00:57:23 +04:00
2008-05-21 20:03:22 +04:00
if ( gh - > gh_flags & ( LM_FLAG_TRY | LM_FLAG_TRY_1CB ) ) {
if ( test_bit ( GLF_LOCK , & gl - > gl_flags ) )
try_lock = 1 ;
if ( test_bit ( GLF_INVALIDATE_IN_PROGRESS , & gl - > gl_flags ) )
goto fail ;
}
list_for_each_entry ( gh2 , & gl - > gl_holders , gh_list ) {
if ( unlikely ( gh2 - > gh_owner_pid = = gh - > gh_owner_pid & &
( gh - > gh_gl - > gl_ops - > go_type ! = LM_TYPE_FLOCK ) ) )
goto trap_recursive ;
if ( try_lock & &
! ( gh2 - > gh_flags & ( LM_FLAG_TRY | LM_FLAG_TRY_1CB ) ) & &
! may_grant ( gl , gh ) ) {
fail :
gh - > gh_error = GLR_TRYFAILED ;
gfs2_holder_wake ( gh ) ;
return ;
2007-09-14 08:35:27 +04:00
}
2008-05-21 20:03:22 +04:00
if ( test_bit ( HIF_HOLDER , & gh2 - > gh_iflags ) )
continue ;
if ( unlikely ( ( gh - > gh_flags & LM_FLAG_PRIORITY ) & & ! insert_pt ) )
insert_pt = & gh2 - > gh_list ;
}
2010-09-03 12:39:20 +04:00
set_bit ( GLF_QUEUED , & gl - > gl_flags ) ;
2011-01-31 12:38:12 +03:00
trace_gfs2_glock_queue ( gh , 1 ) ;
2008-05-21 20:03:22 +04:00
if ( likely ( insert_pt = = NULL ) ) {
list_add_tail ( & gh - > gh_list , & gl - > gl_holders ) ;
if ( unlikely ( gh - > gh_flags & LM_FLAG_PRIORITY ) )
goto do_cancel ;
return ;
}
list_add_tail ( & gh - > gh_list , insert_pt ) ;
do_cancel :
gh = list_entry ( gl - > gl_holders . next , struct gfs2_holder , gh_list ) ;
if ( ! ( gh - > gh_flags & LM_FLAG_PRIORITY ) ) {
spin_unlock ( & gl - > gl_spin ) ;
2008-05-23 17:46:04 +04:00
if ( sdp - > sd_lockstruct . ls_ops - > lm_cancel )
2009-01-12 13:43:39 +03:00
sdp - > sd_lockstruct . ls_ops - > lm_cancel ( gl ) ;
2008-05-21 20:03:22 +04:00
spin_lock ( & gl - > gl_spin ) ;
2006-01-16 19:50:04 +03:00
}
2008-05-21 20:03:22 +04:00
return ;
2006-01-16 19:50:04 +03:00
2008-05-21 20:03:22 +04:00
trap_recursive :
print_symbol ( KERN_ERR " original: %s \n " , gh2 - > gh_ip ) ;
printk ( KERN_ERR " pid: %d \n " , pid_nr ( gh2 - > gh_owner_pid ) ) ;
printk ( KERN_ERR " lock type: %d req lock state : %d \n " ,
gh2 - > gh_gl - > gl_name . ln_type , gh2 - > gh_state ) ;
print_symbol ( KERN_ERR " new: %s \n " , gh - > gh_ip ) ;
printk ( KERN_ERR " pid: %d \n " , pid_nr ( gh - > gh_owner_pid ) ) ;
printk ( KERN_ERR " lock type: %d req lock state : %d \n " ,
gh - > gh_gl - > gl_name . ln_type , gh - > gh_state ) ;
__dump_glock ( NULL , gl ) ;
BUG ( ) ;
2006-01-16 19:50:04 +03:00
}
/**
* gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock ( acquire a glock )
* @ gh : the holder structure
*
* if ( gh - > gh_flags & GL_ASYNC ) , this never returns an error
*
* Returns : 0 , GLR_TRYFAILED , or errno on failure
*/
int gfs2_glock_nq ( struct gfs2_holder * gh )
{
struct gfs2_glock * gl = gh - > gh_gl ;
struct gfs2_sbd * sdp = gl - > gl_sbd ;
int error = 0 ;
2008-05-21 20:03:22 +04:00
if ( unlikely ( test_bit ( SDF_SHUTDOWN , & sdp - > sd_flags ) ) )
2006-01-16 19:50:04 +03:00
return - EIO ;
spin_lock ( & gl - > gl_spin ) ;
add_to_queue ( gh ) ;
2010-08-02 13:15:17 +04:00
if ( ( LM_FLAG_NOEXP & gh - > gh_flags ) & &
test_and_clear_bit ( GLF_FROZEN , & gl - > gl_flags ) )
set_bit ( GLF_REPLY_PENDING , & gl - > gl_flags ) ;
2008-05-21 20:03:22 +04:00
run_queue ( gl , 1 ) ;
2006-01-16 19:50:04 +03:00
spin_unlock ( & gl - > gl_spin ) ;
2008-05-21 20:03:22 +04:00
if ( ! ( gh - > gh_flags & GL_ASYNC ) )
error = gfs2_glock_wait ( gh ) ;
2006-01-16 19:50:04 +03:00
return error ;
}
/**
* gfs2_glock_poll - poll to see if an async request has been completed
* @ gh : the holder
*
* Returns : 1 if the request is ready to be gfs2_glock_wait ( ) ed on
*/
int gfs2_glock_poll ( struct gfs2_holder * gh )
{
2008-05-21 20:03:22 +04:00
return test_bit ( HIF_WAIT , & gh - > gh_iflags ) ? 0 : 1 ;
2006-01-16 19:50:04 +03:00
}
/**
* gfs2_glock_dq - dequeue a struct gfs2_holder from a glock ( release a glock )
* @ gh : the glock holder
*
*/
void gfs2_glock_dq ( struct gfs2_holder * gh )
{
struct gfs2_glock * gl = gh - > gh_gl ;
2006-08-30 17:30:00 +04:00
const struct gfs2_glock_operations * glops = gl - > gl_ops ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time a tenth a second. This insures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acqire a lock on it, but it has already been locked at a higher level.
This patch puts makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, execpt that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
unsigned delay = 0 ;
2008-05-21 20:03:22 +04:00
int fast_path = 0 ;
2006-01-16 19:50:04 +03:00
2008-05-21 20:03:22 +04:00
spin_lock ( & gl - > gl_spin ) ;
2006-01-16 19:50:04 +03:00
if ( gh - > gh_flags & GL_NOCACHE )
2008-11-20 16:39:47 +03:00
handle_callback ( gl , LM_ST_UNLOCKED , 0 ) ;
2006-01-16 19:50:04 +03:00
list_del_init ( & gh - > gh_list ) ;
2008-05-21 20:03:22 +04:00
if ( find_first_holder ( gl ) = = NULL ) {
2007-11-02 11:39:34 +03:00
if ( glops - > go_unlock ) {
2008-05-21 20:03:22 +04:00
GLOCK_BUG_ON ( gl , test_and_set_bit ( GLF_LOCK , & gl - > gl_flags ) ) ;
2007-11-02 11:39:34 +03:00
spin_unlock ( & gl - > gl_spin ) ;
2006-01-16 19:50:04 +03:00
glops - > go_unlock ( gh ) ;
2007-11-02 11:39:34 +03:00
spin_lock ( & gl - > gl_spin ) ;
2008-05-21 20:03:22 +04:00
clear_bit ( GLF_LOCK , & gl - > gl_flags ) ;
2007-11-02 11:39:34 +03:00
}
2008-05-21 20:03:22 +04:00
if ( list_empty ( & gl - > gl_holders ) & &
! test_bit ( GLF_PENDING_DEMOTE , & gl - > gl_flags ) & &
! test_bit ( GLF_DEMOTE , & gl - > gl_flags ) )
fast_path = 1 ;
2006-01-16 19:50:04 +03:00
}
2011-01-19 12:30:01 +03:00
__gfs2_glock_schedule_for_reclaim ( gl ) ;
2009-06-12 11:49:20 +04:00
trace_gfs2_glock_queue ( gh , 0 ) ;
2006-01-16 19:50:04 +03:00
spin_unlock ( & gl - > gl_spin ) ;
2008-05-21 20:03:22 +04:00
if ( likely ( fast_path ) )
return ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time a tenth a second. This insures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acqire a lock on it, but it has already been locked at a higher level.
This patch puts makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, execpt that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
gfs2_glock_hold ( gl ) ;
if ( test_bit ( GLF_PENDING_DEMOTE , & gl - > gl_flags ) & &
! test_bit ( GLF_DEMOTE , & gl - > gl_flags ) )
delay = gl - > gl_ops - > go_min_hold_time ;
if ( queue_delayed_work ( glock_workqueue , & gl - > gl_work , delay ) = = 0 )
gfs2_glock_put ( gl ) ;
2006-01-16 19:50:04 +03:00
}
2007-06-11 11:22:32 +04:00
void gfs2_glock_dq_wait ( struct gfs2_holder * gh )
{
struct gfs2_glock * gl = gh - > gh_gl ;
gfs2_glock_dq ( gh ) ;
wait_on_demote ( gl ) ;
}
2006-01-16 19:50:04 +03:00
/**
 * gfs2_glock_dq_uninit - dequeue a holder from a glock and uninitialize it
 * @gh: the holder structure
 *
 * Equivalent to gfs2_glock_dq() followed by gfs2_holder_uninit().
 */
void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
{
	gfs2_glock_dq(gh);
	gfs2_holder_uninit(gh);
}
/**
* gfs2_glock_nq_num - acquire a glock based on lock number
* @ sdp : the filesystem
* @ number : the lock number
* @ glops : the glock operations for the type of glock
* @ state : the state to acquire the glock in
* @ flags : modifier flags for the aquisition
* @ gh : the struct gfs2_holder
*
* Returns : errno
*/
2006-09-04 20:49:07 +04:00
int gfs2_glock_nq_num ( struct gfs2_sbd * sdp , u64 number ,
2006-08-30 17:30:00 +04:00
const struct gfs2_glock_operations * glops ,
unsigned int state , int flags , struct gfs2_holder * gh )
2006-01-16 19:50:04 +03:00
{
struct gfs2_glock * gl ;
int error ;
error = gfs2_glock_get ( sdp , number , glops , CREATE , & gl ) ;
if ( ! error ) {
error = gfs2_glock_nq_init ( gl , state , flags , gh ) ;
gfs2_glock_put ( gl ) ;
}
return error ;
}
/**
* glock_compare - Compare two struct gfs2_glock structures for sorting
* @ arg_a : the first structure
* @ arg_b : the second structure
*
*/
static int glock_compare ( const void * arg_a , const void * arg_b )
{
2006-09-10 01:07:05 +04:00
const struct gfs2_holder * gh_a = * ( const struct gfs2_holder * * ) arg_a ;
const struct gfs2_holder * gh_b = * ( const struct gfs2_holder * * ) arg_b ;
const struct lm_lockname * a = & gh_a - > gh_gl - > gl_name ;
const struct lm_lockname * b = & gh_b - > gh_gl - > gl_name ;
2006-01-16 19:50:04 +03:00
if ( a - > ln_number > b - > ln_number )
2006-09-10 01:07:05 +04:00
return 1 ;
if ( a - > ln_number < b - > ln_number )
return - 1 ;
2007-01-22 20:10:39 +03:00
BUG_ON ( gh_a - > gh_gl - > gl_ops - > go_type = = gh_b - > gh_gl - > gl_ops - > go_type ) ;
2006-09-10 01:07:05 +04:00
return 0 ;
2006-01-16 19:50:04 +03:00
}
/**
* nq_m_sync - synchonously acquire more than one glock in deadlock free order
* @ num_gh : the number of structures
* @ ghs : an array of struct gfs2_holder structures
*
* Returns : 0 on success ( all glocks acquired ) ,
* errno on failure ( no glocks acquired )
*/
static int nq_m_sync ( unsigned int num_gh , struct gfs2_holder * ghs ,
struct gfs2_holder * * p )
{
unsigned int x ;
int error = 0 ;
for ( x = 0 ; x < num_gh ; x + + )
p [ x ] = & ghs [ x ] ;
sort ( p , num_gh , sizeof ( struct gfs2_holder * ) , glock_compare , NULL ) ;
for ( x = 0 ; x < num_gh ; x + + ) {
p [ x ] - > gh_flags & = ~ ( LM_FLAG_TRY | GL_ASYNC ) ;
error = gfs2_glock_nq ( p [ x ] ) ;
if ( error ) {
while ( x - - )
gfs2_glock_dq ( p [ x ] ) ;
break ;
}
}
return error ;
}
/**
* gfs2_glock_nq_m - acquire multiple glocks
* @ num_gh : the number of structures
* @ ghs : an array of struct gfs2_holder structures
*
*
* Returns : 0 on success ( all glocks acquired ) ,
* errno on failure ( no glocks acquired )
*/
int gfs2_glock_nq_m ( unsigned int num_gh , struct gfs2_holder * ghs )
{
2007-06-19 18:38:17 +04:00
struct gfs2_holder * tmp [ 4 ] ;
struct gfs2_holder * * pph = tmp ;
2006-01-16 19:50:04 +03:00
int error = 0 ;
2007-06-19 18:38:17 +04:00
switch ( num_gh ) {
case 0 :
2006-01-16 19:50:04 +03:00
return 0 ;
2007-06-19 18:38:17 +04:00
case 1 :
2006-01-16 19:50:04 +03:00
ghs - > gh_flags & = ~ ( LM_FLAG_TRY | GL_ASYNC ) ;
return gfs2_glock_nq ( ghs ) ;
2007-06-19 18:38:17 +04:00
default :
if ( num_gh < = 4 )
2006-01-16 19:50:04 +03:00
break ;
2007-06-19 18:38:17 +04:00
pph = kmalloc ( num_gh * sizeof ( struct gfs2_holder * ) , GFP_NOFS ) ;
if ( ! pph )
return - ENOMEM ;
2006-01-16 19:50:04 +03:00
}
2007-06-19 18:38:17 +04:00
error = nq_m_sync ( num_gh , ghs , pph ) ;
2006-01-16 19:50:04 +03:00
2007-06-19 18:38:17 +04:00
if ( pph ! = tmp )
kfree ( pph ) ;
2006-01-16 19:50:04 +03:00
return error ;
}
/**
* gfs2_glock_dq_m - release multiple glocks
* @ num_gh : the number of structures
* @ ghs : an array of struct gfs2_holder structures
*
*/
void gfs2_glock_dq_m ( unsigned int num_gh , struct gfs2_holder * ghs )
{
2011-03-10 19:41:57 +03:00
while ( num_gh - - )
gfs2_glock_dq ( & ghs [ num_gh ] ) ;
2006-01-16 19:50:04 +03:00
}
/**
* gfs2_glock_dq_uninit_m - release multiple glocks
* @ num_gh : the number of structures
* @ ghs : an array of struct gfs2_holder structures
*
*/
void gfs2_glock_dq_uninit_m ( unsigned int num_gh , struct gfs2_holder * ghs )
{
2011-03-10 19:41:57 +03:00
while ( num_gh - - )
gfs2_glock_dq_uninit ( & ghs [ num_gh ] ) ;
2006-01-16 19:50:04 +03:00
}
2009-01-12 13:43:39 +03:00
void gfs2_glock_cb ( struct gfs2_glock * gl , unsigned int state )
2008-01-30 18:34:04 +03:00
{
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time a tenth a second. This insures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acqire a lock on it, but it has already been locked at a higher level.
This patch puts makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, execpt that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
unsigned long delay = 0 ;
unsigned long holdtime ;
unsigned long now = jiffies ;
2006-01-16 19:50:04 +03:00
2009-01-12 13:43:39 +03:00
gfs2_glock_hold ( gl ) ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time a tenth a second. This insures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acqire a lock on it, but it has already been locked at a higher level.
This patch puts makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, execpt that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
holdtime = gl - > gl_tchange + gl - > gl_ops - > go_min_hold_time ;
2010-09-03 12:39:20 +04:00
if ( test_bit ( GLF_QUEUED , & gl - > gl_flags ) ) {
if ( time_before ( now , holdtime ) )
delay = holdtime - now ;
if ( test_bit ( GLF_REPLY_PENDING , & gl - > gl_flags ) )
delay = gl - > gl_ops - > go_min_hold_time ;
}
2006-01-16 19:50:04 +03:00
2008-05-21 20:03:22 +04:00
spin_lock ( & gl - > gl_spin ) ;
2008-11-20 16:39:47 +03:00
handle_callback ( gl , state , delay ) ;
2008-05-21 20:03:22 +04:00
spin_unlock ( & gl - > gl_spin ) ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time a tenth a second. This insures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acqire a lock on it, but it has already been locked at a higher level.
This patch puts makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, execpt that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
if ( queue_delayed_work ( glock_workqueue , & gl - > gl_work , delay ) = = 0 )
gfs2_glock_put ( gl ) ;
2006-01-16 19:50:04 +03:00
}
2010-08-02 13:15:17 +04:00
/**
* gfs2_should_freeze - Figure out if glock should be frozen
* @ gl : The glock in question
*
* Glocks are not frozen if ( a ) the result of the dlm operation is
* an error , ( b ) the locking operation was an unlock operation or
* ( c ) if there is a " noexp " flagged request anywhere in the queue
*
* Returns : 1 if freezing should occur , 0 otherwise
*/
static int gfs2_should_freeze ( const struct gfs2_glock * gl )
{
const struct gfs2_holder * gh ;
if ( gl - > gl_reply & ~ LM_OUT_ST_MASK )
return 0 ;
if ( gl - > gl_target = = LM_ST_UNLOCKED )
return 0 ;
list_for_each_entry ( gh , & gl - > gl_holders , gh_list ) {
if ( test_bit ( HIF_HOLDER , & gh - > gh_iflags ) )
continue ;
if ( LM_FLAG_NOEXP & gh - > gh_flags )
return 0 ;
}
return 1 ;
}
2006-01-16 19:50:04 +03:00
/**
2009-01-12 13:43:39 +03:00
* gfs2_glock_complete - Callback used by locking
* @ gl : Pointer to the glock
* @ ret : The return value from the dlm
2006-01-16 19:50:04 +03:00
*
2010-11-30 18:49:31 +03:00
* The gl_reply field is under the gl_spin lock so that it is ok
* to use a bitfield shared with other glock state fields .
2006-01-16 19:50:04 +03:00
*/
2009-01-12 13:43:39 +03:00
void gfs2_glock_complete ( struct gfs2_glock * gl , int ret )
2006-01-16 19:50:04 +03:00
{
2009-01-12 13:43:39 +03:00
struct lm_lockstruct * ls = & gl - > gl_sbd - > sd_lockstruct ;
2010-08-02 13:15:17 +04:00
2010-11-30 18:49:31 +03:00
spin_lock ( & gl - > gl_spin ) ;
2009-01-12 13:43:39 +03:00
gl - > gl_reply = ret ;
2010-08-02 13:15:17 +04:00
2009-01-12 13:43:39 +03:00
if ( unlikely ( test_bit ( DFL_BLOCK_LOCKS , & ls - > ls_flags ) ) ) {
2010-08-02 13:15:17 +04:00
if ( gfs2_should_freeze ( gl ) ) {
2009-01-12 13:43:39 +03:00
set_bit ( GLF_FROZEN , & gl - > gl_flags ) ;
2010-08-02 13:15:17 +04:00
spin_unlock ( & gl - > gl_spin ) ;
2006-01-16 19:50:04 +03:00
return ;
2010-08-02 13:15:17 +04:00
}
2006-01-16 19:50:04 +03:00
}
2010-11-30 18:49:31 +03:00
spin_unlock ( & gl - > gl_spin ) ;
2009-01-12 13:43:39 +03:00
set_bit ( GLF_REPLY_PENDING , & gl - > gl_flags ) ;
2010-11-30 18:49:31 +03:00
smp_wmb ( ) ;
2009-01-12 13:43:39 +03:00
gfs2_glock_hold ( gl ) ;
if ( queue_delayed_work ( glock_workqueue , & gl - > gl_work , 0 ) = = 0 )
gfs2_glock_put ( gl ) ;
2006-01-16 19:50:04 +03:00
}
2010-07-19 08:56:17 +04:00
static int gfs2_shrink_glock_memory ( struct shrinker * shrink , int nr , gfp_t gfp_mask )
2006-01-16 19:50:04 +03:00
{
struct gfs2_glock * gl ;
2008-11-20 16:39:47 +03:00
int may_demote ;
int nr_skipped = 0 ;
LIST_HEAD ( skipped ) ;
2006-01-16 19:50:04 +03:00
2008-11-20 16:39:47 +03:00
if ( nr = = 0 )
goto out ;
2006-01-16 19:50:04 +03:00
2008-11-20 16:39:47 +03:00
if ( ! ( gfp_mask & __GFP_FS ) )
return - 1 ;
2006-01-16 19:50:04 +03:00
2008-11-20 16:39:47 +03:00
spin_lock ( & lru_lock ) ;
while ( nr & & ! list_empty ( & lru_list ) ) {
gl = list_entry ( lru_list . next , struct gfs2_glock , gl_lru ) ;
list_del_init ( & gl - > gl_lru ) ;
atomic_dec ( & lru_count ) ;
/* Test for being demotable */
if ( ! test_and_set_bit ( GLF_LOCK , & gl - > gl_flags ) ) {
gfs2_glock_hold ( gl ) ;
spin_unlock ( & lru_lock ) ;
spin_lock ( & gl - > gl_spin ) ;
may_demote = demote_ok ( gl ) ;
if ( may_demote ) {
handle_callback ( gl , LM_ST_UNLOCKED , 0 ) ;
nr - - ;
}
2009-09-22 13:56:16 +04:00
clear_bit ( GLF_LOCK , & gl - > gl_flags ) ;
smp_mb__after_clear_bit ( ) ;
2009-06-25 19:30:26 +04:00
if ( queue_delayed_work ( glock_workqueue , & gl - > gl_work , 0 ) = = 0 )
2009-07-24 03:52:34 +04:00
gfs2_glock_put_nolock ( gl ) ;
spin_unlock ( & gl - > gl_spin ) ;
2008-11-20 16:39:47 +03:00
spin_lock ( & lru_lock ) ;
2009-06-25 19:30:26 +04:00
continue ;
2008-11-20 16:39:47 +03:00
}
2009-06-25 19:30:26 +04:00
nr_skipped + + ;
list_add ( & gl - > gl_lru , & skipped ) ;
2006-01-16 19:50:04 +03:00
}
2008-11-20 16:39:47 +03:00
list_splice ( & skipped , & lru_list ) ;
atomic_add ( nr_skipped , & lru_count ) ;
spin_unlock ( & lru_lock ) ;
out :
return ( atomic_read ( & lru_count ) / 100 ) * sysctl_vfs_cache_pressure ;
2006-01-16 19:50:04 +03:00
}
2008-11-20 16:39:47 +03:00
static struct shrinker glock_shrinker = {
. shrink = gfs2_shrink_glock_memory ,
. seeks = DEFAULT_SEEKS ,
} ;
2006-01-16 19:50:04 +03:00
/**
* examine_bucket - Call a function for glock in a hash bucket
* @ examiner : the function
* @ sdp : the filesystem
* @ bucket : the bucket
*
*/
2011-01-19 12:30:01 +03:00
static void examine_bucket ( glock_examiner examiner , const struct gfs2_sbd * sdp ,
2006-09-08 21:35:56 +04:00
unsigned int hash )
2006-01-16 19:50:04 +03:00
{
2011-01-19 12:30:01 +03:00
struct gfs2_glock * gl ;
struct hlist_bl_head * head = & gl_hash_table [ hash ] ;
struct hlist_bl_node * pos ;
2006-01-16 19:50:04 +03:00
2011-01-19 12:30:01 +03:00
rcu_read_lock ( ) ;
hlist_bl_for_each_entry_rcu ( gl , pos , head , gl_list ) {
if ( ( gl - > gl_sbd = = sdp ) & & atomic_read ( & gl - > gl_ref ) )
2006-09-12 05:40:30 +04:00
examiner ( gl ) ;
2006-01-16 19:50:04 +03:00
}
2011-01-19 12:30:01 +03:00
rcu_read_unlock ( ) ;
2007-08-01 16:57:10 +04:00
cond_resched ( ) ;
2011-01-19 12:30:01 +03:00
}
static void glock_hash_walk ( glock_examiner examiner , const struct gfs2_sbd * sdp )
{
unsigned x ;
for ( x = 0 ; x < GFS2_GL_HASH_SIZE ; x + + )
examine_bucket ( examiner , sdp , x ) ;
2006-01-16 19:50:04 +03:00
}
2009-01-12 13:43:39 +03:00
/**
* thaw_glock - thaw out a glock which has an unprocessed reply waiting
* @ gl : The glock to thaw
*
* N . B . When we freeze a glock , we leave a ref to the glock outstanding ,
* so this has to result in the ref count being dropped by one .
*/
static void thaw_glock ( struct gfs2_glock * gl )
{
if ( ! test_and_clear_bit ( GLF_FROZEN , & gl - > gl_flags ) )
return ;
set_bit ( GLF_REPLY_PENDING , & gl - > gl_flags ) ;
gfs2_glock_hold ( gl ) ;
if ( queue_delayed_work ( glock_workqueue , & gl - > gl_work , 0 ) = = 0 )
gfs2_glock_put ( gl ) ;
}
2006-01-16 19:50:04 +03:00
/**
* clear_glock - look at a glock and see if we can free it from glock cache
* @ gl : the glock to look at
*
*/
static void clear_glock ( struct gfs2_glock * gl )
{
2008-11-20 16:39:47 +03:00
spin_lock ( & lru_lock ) ;
if ( ! list_empty ( & gl - > gl_lru ) ) {
list_del_init ( & gl - > gl_lru ) ;
atomic_dec ( & lru_count ) ;
2006-01-16 19:50:04 +03:00
}
2008-11-20 16:39:47 +03:00
spin_unlock ( & lru_lock ) ;
2006-01-16 19:50:04 +03:00
2008-05-21 20:03:22 +04:00
spin_lock ( & gl - > gl_spin ) ;
2010-09-29 17:20:52 +04:00
if ( gl - > gl_state ! = LM_ST_UNLOCKED )
2008-11-20 16:39:47 +03:00
handle_callback ( gl , LM_ST_UNLOCKED , 0 ) ;
2008-05-21 20:03:22 +04:00
spin_unlock ( & gl - > gl_spin ) ;
gfs2_glock_hold ( gl ) ;
if ( queue_delayed_work ( glock_workqueue , & gl - > gl_work , 0 ) = = 0 )
gfs2_glock_put ( gl ) ;
2006-01-16 19:50:04 +03:00
}
2009-01-12 13:43:39 +03:00
/**
* gfs2_glock_thaw - Thaw any frozen glocks
* @ sdp : The super block
*
*/
void gfs2_glock_thaw ( struct gfs2_sbd * sdp )
{
2011-01-19 12:30:01 +03:00
glock_hash_walk ( thaw_glock , sdp ) ;
}
2009-01-12 13:43:39 +03:00
2011-01-19 12:30:01 +03:00
static int dump_glock ( struct seq_file * seq , struct gfs2_glock * gl )
{
int ret ;
spin_lock ( & gl - > gl_spin ) ;
ret = __dump_glock ( seq , gl ) ;
spin_unlock ( & gl - > gl_spin ) ;
return ret ;
}
static void dump_glock_func ( struct gfs2_glock * gl )
{
dump_glock ( NULL , gl ) ;
2009-01-12 13:43:39 +03:00
}
2006-01-16 19:50:04 +03:00
/**
* gfs2_gl_hash_clear - Empty out the glock hash table
* @ sdp : the filesystem
* @ wait : wait until it ' s all gone
*
2008-06-03 17:09:53 +04:00
* Called when unmounting the filesystem .
2006-01-16 19:50:04 +03:00
*/
2008-12-19 18:32:06 +03:00
void gfs2_gl_hash_clear ( struct gfs2_sbd * sdp )
2006-01-16 19:50:04 +03:00
{
2011-01-19 12:30:01 +03:00
glock_hash_walk ( clear_glock , sdp ) ;
2010-01-29 18:21:27 +03:00
flush_workqueue ( glock_workqueue ) ;
wait_event ( sdp - > sd_glock_wait , atomic_read ( & sdp - > sd_glock_disposal ) = = 0 ) ;
2011-01-19 12:30:01 +03:00
glock_hash_walk ( dump_glock_func , sdp ) ;
2006-01-16 19:50:04 +03:00
}
2008-11-18 16:38:48 +03:00
void gfs2_glock_finish_truncate ( struct gfs2_inode * ip )
{
struct gfs2_glock * gl = ip - > i_gl ;
int ret ;
ret = gfs2_truncatei_resume ( ip ) ;
gfs2_assert_withdraw ( gl - > gl_sbd , ret = = 0 ) ;
spin_lock ( & gl - > gl_spin ) ;
clear_bit ( GLF_LOCK , & gl - > gl_flags ) ;
run_queue ( gl , 1 ) ;
spin_unlock ( & gl - > gl_spin ) ;
}
2008-05-21 20:03:22 +04:00
static const char * state2str ( unsigned state )
2007-03-24 01:05:15 +03:00
{
2008-05-21 20:03:22 +04:00
switch ( state ) {
case LM_ST_UNLOCKED :
return " UN " ;
case LM_ST_SHARED :
return " SH " ;
case LM_ST_DEFERRED :
return " DF " ;
case LM_ST_EXCLUSIVE :
return " EX " ;
}
return " ?? " ;
}
static const char * hflags2str ( char * buf , unsigned flags , unsigned long iflags )
{
char * p = buf ;
if ( flags & LM_FLAG_TRY )
* p + + = ' t ' ;
if ( flags & LM_FLAG_TRY_1CB )
* p + + = ' T ' ;
if ( flags & LM_FLAG_NOEXP )
* p + + = ' e ' ;
if ( flags & LM_FLAG_ANY )
2009-01-12 13:43:39 +03:00
* p + + = ' A ' ;
2008-05-21 20:03:22 +04:00
if ( flags & LM_FLAG_PRIORITY )
* p + + = ' p ' ;
if ( flags & GL_ASYNC )
* p + + = ' a ' ;
if ( flags & GL_EXACT )
* p + + = ' E ' ;
if ( flags & GL_NOCACHE )
* p + + = ' c ' ;
if ( test_bit ( HIF_HOLDER , & iflags ) )
* p + + = ' H ' ;
if ( test_bit ( HIF_WAIT , & iflags ) )
* p + + = ' W ' ;
if ( test_bit ( HIF_FIRST , & iflags ) )
* p + + = ' F ' ;
* p = 0 ;
return buf ;
2007-03-24 01:05:15 +03:00
}
2006-01-16 19:50:04 +03:00
/**
* dump_holder - print information about a glock holder
2008-05-21 20:03:22 +04:00
* @ seq : the seq_file struct
2006-01-16 19:50:04 +03:00
* @ gh : the glock holder
*
* Returns : 0 on success , - ENOBUFS when we run out of space
*/
2008-05-21 20:03:22 +04:00
static int dump_holder ( struct seq_file * seq , const struct gfs2_holder * gh )
2006-01-16 19:50:04 +03:00
{
2008-05-21 20:03:22 +04:00
struct task_struct * gh_owner = NULL ;
char flags_buf [ 32 ] ;
2006-01-16 19:50:04 +03:00
2008-05-21 20:03:22 +04:00
if ( gh - > gh_owner_pid )
2008-02-07 11:13:19 +03:00
gh_owner = pid_task ( gh - > gh_owner_pid , PIDTYPE_PID ) ;
2010-11-06 02:12:36 +03:00
gfs2_print_dbg ( seq , " H: s:%s f:%s e:%d p:%ld [%s] %pS \n " ,
state2str ( gh - > gh_state ) ,
hflags2str ( flags_buf , gh - > gh_flags , gh - > gh_iflags ) ,
gh - > gh_error ,
gh - > gh_owner_pid ? ( long ) pid_nr ( gh - > gh_owner_pid ) : - 1 ,
gh_owner ? gh_owner - > comm : " (ended) " ,
( void * ) gh - > gh_ip ) ;
2007-03-16 13:26:37 +03:00
return 0 ;
2006-01-16 19:50:04 +03:00
}
2008-05-21 20:03:22 +04:00
static const char * gflags2str ( char * buf , const unsigned long * gflags )
{
char * p = buf ;
if ( test_bit ( GLF_LOCK , gflags ) )
* p + + = ' l ' ;
if ( test_bit ( GLF_DEMOTE , gflags ) )
* p + + = ' D ' ;
if ( test_bit ( GLF_PENDING_DEMOTE , gflags ) )
* p + + = ' d ' ;
if ( test_bit ( GLF_DEMOTE_IN_PROGRESS , gflags ) )
* p + + = ' p ' ;
if ( test_bit ( GLF_DIRTY , gflags ) )
* p + + = ' y ' ;
if ( test_bit ( GLF_LFLUSH , gflags ) )
* p + + = ' f ' ;
if ( test_bit ( GLF_INVALIDATE_IN_PROGRESS , gflags ) )
* p + + = ' i ' ;
if ( test_bit ( GLF_REPLY_PENDING , gflags ) )
* p + + = ' r ' ;
2009-01-12 13:43:39 +03:00
if ( test_bit ( GLF_INITIAL , gflags ) )
2009-02-05 13:12:38 +03:00
* p + + = ' I ' ;
2009-01-12 13:43:39 +03:00
if ( test_bit ( GLF_FROZEN , gflags ) )
* p + + = ' F ' ;
2010-09-03 12:39:20 +04:00
if ( test_bit ( GLF_QUEUED , gflags ) )
* p + + = ' q ' ;
2008-05-21 20:03:22 +04:00
* p = 0 ;
return buf ;
2006-01-16 19:50:04 +03:00
}
/**
2008-05-21 20:03:22 +04:00
* __dump_glock - print information about a glock
* @ seq : The seq_file struct
2006-01-16 19:50:04 +03:00
* @ gl : the glock
2008-05-21 20:03:22 +04:00
*
* The file format is as follows :
* One line per object , capital letters are used to indicate objects
* G = glock , I = Inode , R = rgrp , H = holder . Glocks are not indented ,
* other objects are indented by a single space and follow the glock to
* which they are related . Fields are indicated by lower case letters
* followed by a colon and the field value , except for strings which are in
* [ ] so that its possible to see if they are composed of spaces for
* example . The field ' s are n = number ( id of the object ) , f = flags ,
* t = type , s = state , r = refcount , e = error , p = pid .
2006-01-16 19:50:04 +03:00
*
* Returns : 0 on success , - ENOBUFS when we run out of space
*/
2008-05-21 20:03:22 +04:00
static int __dump_glock ( struct seq_file * seq , const struct gfs2_glock * gl )
2006-01-16 19:50:04 +03:00
{
2008-05-21 20:03:22 +04:00
const struct gfs2_glock_operations * glops = gl - > gl_ops ;
unsigned long long dtime ;
const struct gfs2_holder * gh ;
char gflags_buf [ 32 ] ;
int error = 0 ;
2006-01-16 19:50:04 +03:00
2008-05-21 20:03:22 +04:00
dtime = jiffies - gl - > gl_demote_time ;
dtime * = 1000000 / HZ ; /* demote time in uSec */
if ( ! test_bit ( GLF_DEMOTE , & gl - > gl_flags ) )
dtime = 0 ;
2010-02-23 20:20:00 +03:00
gfs2_print_dbg ( seq , " G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d \n " ,
2008-05-21 20:03:22 +04:00
state2str ( gl - > gl_state ) ,
gl - > gl_name . ln_type ,
( unsigned long long ) gl - > gl_name . ln_number ,
gflags2str ( gflags_buf , & gl - > gl_flags ) ,
state2str ( gl - > gl_target ) ,
state2str ( gl - > gl_demote_state ) , dtime ,
atomic_read ( & gl - > gl_ail_count ) ,
atomic_read ( & gl - > gl_ref ) ) ;
2006-01-16 19:50:04 +03:00
list_for_each_entry ( gh , & gl - > gl_holders , gh_list ) {
2008-05-21 20:03:22 +04:00
error = dump_holder ( seq , gh ) ;
2006-01-16 19:50:04 +03:00
if ( error )
goto out ;
}
2008-05-21 20:03:22 +04:00
if ( gl - > gl_state ! = LM_ST_UNLOCKED & & glops - > go_dump )
error = glops - > go_dump ( seq , gl ) ;
2006-09-04 20:04:26 +04:00
out :
2006-01-16 19:50:04 +03:00
return error ;
}
2008-05-21 20:03:22 +04:00
2006-01-16 19:50:04 +03:00
2007-08-01 16:57:10 +04:00
2006-09-07 22:40:21 +04:00
int __init gfs2_glock_init ( void )
{
unsigned i ;
for ( i = 0 ; i < GFS2_GL_HASH_SIZE ; i + + ) {
2011-01-19 12:30:01 +03:00
INIT_HLIST_BL_HEAD ( & gl_hash_table [ i ] ) ;
2006-09-10 00:59:11 +04:00
}
2007-08-01 16:57:10 +04:00
2010-11-03 22:58:53 +03:00
glock_workqueue = alloc_workqueue ( " glock_workqueue " , WQ_MEM_RECLAIM |
2011-02-16 11:25:31 +03:00
WQ_HIGHPRI | WQ_FREEZABLE , 0 ) ;
2008-11-20 16:39:47 +03:00
if ( IS_ERR ( glock_workqueue ) )
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time a tenth a second. This insures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acqire a lock on it, but it has already been locked at a higher level.
This patch puts makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, execpt that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
return PTR_ERR ( glock_workqueue ) ;
2010-11-03 22:58:53 +03:00
gfs2_delete_workqueue = alloc_workqueue ( " delete_workqueue " ,
2011-02-16 11:25:31 +03:00
WQ_MEM_RECLAIM | WQ_FREEZABLE ,
2010-11-03 22:58:53 +03:00
0 ) ;
2009-07-24 03:52:34 +04:00
if ( IS_ERR ( gfs2_delete_workqueue ) ) {
destroy_workqueue ( glock_workqueue ) ;
return PTR_ERR ( gfs2_delete_workqueue ) ;
}
2008-11-20 16:39:47 +03:00
register_shrinker ( & glock_shrinker ) ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time a tenth a second. This insures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acqire a lock on it, but it has already been locked at a higher level.
This patch puts makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, execpt that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
2006-09-07 22:40:21 +04:00
return 0 ;
}
2007-08-01 16:57:10 +04:00
void gfs2_glock_exit ( void )
{
2008-11-20 16:39:47 +03:00
unregister_shrinker ( & glock_shrinker ) ;
[GFS2] delay glock demote for a minimum hold time
When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time a tenth a second. This insures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.
A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acqire a lock on it, but it has already been locked at a higher level.
This patch puts makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, execpt that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.
Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2007-08-23 22:19:05 +04:00
destroy_workqueue ( glock_workqueue ) ;
2009-07-24 03:52:34 +04:00
destroy_workqueue ( gfs2_delete_workqueue ) ;
2007-08-01 16:57:10 +04:00
}
2011-01-19 12:30:01 +03:00
static inline struct gfs2_glock * glock_hash_chain ( unsigned hash )
{
return hlist_bl_entry ( hlist_bl_first_rcu ( & gl_hash_table [ hash ] ) ,
struct gfs2_glock , gl_list ) ;
}
static inline struct gfs2_glock * glock_hash_next ( struct gfs2_glock * gl )
{
2011-03-15 11:32:14 +03:00
return hlist_bl_entry ( rcu_dereference ( gl - > gl_list . next ) ,
2011-01-19 12:30:01 +03:00
struct gfs2_glock , gl_list ) ;
}
2008-05-21 20:03:22 +04:00
static int gfs2_glock_iter_next ( struct gfs2_glock_iter * gi )
2007-03-16 13:26:37 +03:00
{
2007-07-24 16:53:36 +04:00
struct gfs2_glock * gl ;
2011-01-19 12:30:01 +03:00
do {
gl = gi - > gl ;
if ( gl ) {
gi - > gl = glock_hash_next ( gl ) ;
} else {
gi - > gl = glock_hash_chain ( gi - > hash ) ;
}
while ( gi - > gl = = NULL ) {
gi - > hash + + ;
if ( gi - > hash > = GFS2_GL_HASH_SIZE ) {
rcu_read_unlock ( ) ;
return 1 ;
}
gi - > gl = glock_hash_chain ( gi - > hash ) ;
}
/* Skip entries for other sb and dead entries */
} while ( gi - > sdp ! = gi - > gl - > gl_sbd | | atomic_read ( & gi - > gl - > gl_ref ) = = 0 ) ;
2007-08-21 18:57:29 +04:00
2007-03-16 13:26:37 +03:00
return 0 ;
}
2008-05-21 20:03:22 +04:00
static void * gfs2_glock_seq_start ( struct seq_file * seq , loff_t * pos )
2007-03-16 13:26:37 +03:00
{
2008-05-21 20:03:22 +04:00
struct gfs2_glock_iter * gi = seq - > private ;
2007-03-16 13:26:37 +03:00
loff_t n = * pos ;
2008-05-21 20:03:22 +04:00
gi - > hash = 0 ;
2011-01-19 12:30:01 +03:00
rcu_read_lock ( ) ;
2007-03-16 13:26:37 +03:00
2008-05-21 20:03:22 +04:00
do {
2011-01-19 12:30:01 +03:00
if ( gfs2_glock_iter_next ( gi ) )
2007-03-16 13:26:37 +03:00
return NULL ;
2008-05-21 20:03:22 +04:00
} while ( n - - ) ;
2007-03-16 13:26:37 +03:00
2008-05-21 20:03:22 +04:00
return gi - > gl ;
2007-03-16 13:26:37 +03:00
}
2008-05-21 20:03:22 +04:00
static void * gfs2_glock_seq_next ( struct seq_file * seq , void * iter_ptr ,
2007-03-16 13:26:37 +03:00
loff_t * pos )
{
2008-05-21 20:03:22 +04:00
struct gfs2_glock_iter * gi = seq - > private ;
2007-03-16 13:26:37 +03:00
( * pos ) + + ;
2011-01-19 12:30:01 +03:00
if ( gfs2_glock_iter_next ( gi ) )
2007-03-16 13:26:37 +03:00
return NULL ;
2008-05-21 20:03:22 +04:00
return gi - > gl ;
2007-03-16 13:26:37 +03:00
}
2008-05-21 20:03:22 +04:00
static void gfs2_glock_seq_stop ( struct seq_file * seq , void * iter_ptr )
2007-03-16 13:26:37 +03:00
{
2008-05-21 20:03:22 +04:00
struct gfs2_glock_iter * gi = seq - > private ;
2011-01-19 12:30:01 +03:00
if ( gi - > gl )
rcu_read_unlock ( ) ;
gi - > gl = NULL ;
2007-03-16 13:26:37 +03:00
}
2008-05-21 20:03:22 +04:00
static int gfs2_glock_seq_show ( struct seq_file * seq , void * iter_ptr )
2007-03-16 13:26:37 +03:00
{
2008-05-21 20:03:22 +04:00
return dump_glock ( seq , iter_ptr ) ;
2007-03-16 13:26:37 +03:00
}
2007-07-31 14:31:11 +04:00
static const struct seq_operations gfs2_glock_seq_ops = {
2007-03-16 13:26:37 +03:00
. start = gfs2_glock_seq_start ,
. next = gfs2_glock_seq_next ,
. stop = gfs2_glock_seq_stop ,
. show = gfs2_glock_seq_show ,
} ;
static int gfs2_debugfs_open ( struct inode * inode , struct file * file )
{
2008-05-21 20:03:22 +04:00
int ret = seq_open_private ( file , & gfs2_glock_seq_ops ,
sizeof ( struct gfs2_glock_iter ) ) ;
if ( ret = = 0 ) {
struct seq_file * seq = file - > private_data ;
struct gfs2_glock_iter * gi = seq - > private ;
gi - > sdp = inode - > i_private ;
}
return ret ;
2007-03-16 13:26:37 +03:00
}
static const struct file_operations gfs2_debug_fops = {
. owner = THIS_MODULE ,
. open = gfs2_debugfs_open ,
. read = seq_read ,
. llseek = seq_lseek ,
2008-05-21 20:03:22 +04:00
. release = seq_release_private ,
2007-03-16 13:26:37 +03:00
} ;
int gfs2_create_debugfs_file ( struct gfs2_sbd * sdp )
{
2007-04-18 20:41:11 +04:00
sdp - > debugfs_dir = debugfs_create_dir ( sdp - > sd_table_name , gfs2_root ) ;
if ( ! sdp - > debugfs_dir )
return - ENOMEM ;
sdp - > debugfs_dentry_glocks = debugfs_create_file ( " glocks " ,
S_IFREG | S_IRUGO ,
sdp - > debugfs_dir , sdp ,
& gfs2_debug_fops ) ;
if ( ! sdp - > debugfs_dentry_glocks )
2007-03-16 13:26:37 +03:00
return - ENOMEM ;
return 0 ;
}
void gfs2_delete_debugfs_file ( struct gfs2_sbd * sdp )
{
2007-04-18 20:41:11 +04:00
if ( sdp & & sdp - > debugfs_dir ) {
if ( sdp - > debugfs_dentry_glocks ) {
debugfs_remove ( sdp - > debugfs_dentry_glocks ) ;
sdp - > debugfs_dentry_glocks = NULL ;
}
debugfs_remove ( sdp - > debugfs_dir ) ;
sdp - > debugfs_dir = NULL ;
}
2007-03-16 13:26:37 +03:00
}
int gfs2_register_debugfs ( void )
{
gfs2_root = debugfs_create_dir ( " gfs2 " , NULL ) ;
return gfs2_root ? 0 : - ENOMEM ;
}
void gfs2_unregister_debugfs ( void )
{
debugfs_remove ( gfs2_root ) ;
2007-04-18 20:41:11 +04:00
gfs2_root = NULL ;
2007-03-16 13:26:37 +03:00
}