performance/write-behind: fix fulfill and readdirp race
Current invalidation of stats in wb_readdirp_cbk is prone to races. As the deleted comment explains, <snip> We cannot guarantee integrity of entry->d_stat as there are cached writes. The stat is most likely stale as it doesn't account the cached writes. However, checking for non-empty liability list here is not a fool-proof solution as there can be races like, 1. readdirp is successful on posix 2. sync of cached write is successful on posix 3. write-behind received sync response and removed the request from liability queue 4. readdirp response is processed at write-behind. In the above scenario, stat for the file is sent back in readdirp response but it is stale. </snip> The fix is to mark readdirp sessions (tracked in this patch by non-zero value of "readdirps" on parent inode) and if fulfill completes when one or more readdirp sessions are in progress, mark the inode so that wb_readdirp_cbk doesn't send iatts for that inode in readdirp response. Note that wb_readdirp_cbk already checks for presence of a non-empty liability queue and invalidates iatt. Since the only way a liability queue can shrink is by fulfilling requests in liability queue, wb_fulfill_cbk indicates to wb_readdirp_cbk that a potential race could've happened b/w readdirp and fulfill. Change-Id: I12d167bf450648baa64be1cbe1ca0fddf5379521 Signed-off-by: Raghavendra G <rgowdapp@redhat.com> updates: bz#1512691
This commit is contained in:
parent
59e5602487
commit
370f05546e
@ -84,6 +84,13 @@ typedef struct wb_inode {
|
||||
writes are in progress at the same time. Modules
|
||||
like eager-lock in AFR depend on this behavior.
|
||||
*/
|
||||
list_head_t invalidate_list; /* list of wb_inodes that were marked for
|
||||
* iatt invalidation due to requests in
|
||||
* liability queue fulfilled while there
|
||||
* was a readdirp session on parent
|
||||
* directory. For a directory inode, this
|
||||
* list points to list of children.
|
||||
*/
|
||||
uint64_t gen; /* Liability generation number. Represents
|
||||
the current 'state' of liability. Every
|
||||
new addition to the liability list bumps
|
||||
@ -107,6 +114,7 @@ typedef struct wb_inode {
|
||||
size_t size; /* Size of the file to catch write after EOF. */
|
||||
gf_lock_t lock;
|
||||
xlator_t *this;
|
||||
inode_t *inode;
|
||||
int dontsync; /* If positive, don't pick lies for
|
||||
* winding. This is needed to break infinite
|
||||
* recursion during invocation of
|
||||
@ -114,6 +122,8 @@ typedef struct wb_inode {
|
||||
* wb_fulfill_cbk in case of an
|
||||
* error during fulfill.
|
||||
*/
|
||||
gf_atomic_int32_t readdirps;
|
||||
gf_atomic_int8_t invalidate;
|
||||
|
||||
} wb_inode_t;
|
||||
|
||||
@ -186,9 +196,6 @@ typedef struct wb_conf {
|
||||
} wb_conf_t;
|
||||
|
||||
|
||||
void
|
||||
wb_process_queue (wb_inode_t *wb_inode);
|
||||
|
||||
|
||||
wb_inode_t *
|
||||
__wb_inode_ctx_get (xlator_t *this, inode_t *inode)
|
||||
@ -225,6 +232,44 @@ out:
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
wb_set_invalidate (wb_inode_t *wb_inode, int set)
|
||||
{
|
||||
int readdirps = 0;
|
||||
inode_t *parent_inode = NULL;
|
||||
wb_inode_t *wb_parent_inode = NULL;
|
||||
|
||||
parent_inode = inode_parent (wb_inode->inode, NULL, NULL);
|
||||
if (parent_inode)
|
||||
wb_parent_inode = wb_inode_ctx_get (wb_inode->this,
|
||||
parent_inode);
|
||||
|
||||
if (wb_parent_inode) {
|
||||
LOCK (&wb_parent_inode->lock);
|
||||
{
|
||||
readdirps = GF_ATOMIC_GET (wb_parent_inode->readdirps);
|
||||
if (readdirps && set) {
|
||||
GF_ATOMIC_SWAP (wb_inode->invalidate, 1);
|
||||
list_del_init (&wb_inode->invalidate_list);
|
||||
list_add (&wb_inode->invalidate_list,
|
||||
&wb_parent_inode->invalidate_list);
|
||||
} else if (readdirps == 0) {
|
||||
GF_ATOMIC_SWAP (wb_inode->invalidate, 0);
|
||||
list_del_init (&wb_inode->invalidate_list);
|
||||
}
|
||||
}
|
||||
UNLOCK (&wb_parent_inode->lock);
|
||||
} else {
|
||||
GF_ATOMIC_SWAP (wb_inode->invalidate, 0);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void
|
||||
wb_process_queue (wb_inode_t *wb_inode);
|
||||
|
||||
|
||||
/*
|
||||
Below is a succinct explanation of the code deciding whether two regions
|
||||
overlap, from Pavan <tcp@gluster.com>.
|
||||
@ -646,12 +691,16 @@ __wb_inode_create (xlator_t *this, inode_t *inode)
|
||||
INIT_LIST_HEAD (&wb_inode->liability);
|
||||
INIT_LIST_HEAD (&wb_inode->temptation);
|
||||
INIT_LIST_HEAD (&wb_inode->wip);
|
||||
INIT_LIST_HEAD (&wb_inode->invalidate_list);
|
||||
|
||||
wb_inode->this = this;
|
||||
|
||||
wb_inode->window_conf = conf->window_size;
|
||||
wb_inode->inode = inode;
|
||||
|
||||
LOCK_INIT (&wb_inode->lock);
|
||||
GF_ATOMIC_INIT (wb_inode->invalidate, 0);
|
||||
GF_ATOMIC_INIT (wb_inode->readdirps, 0);
|
||||
|
||||
ret = __inode_ctx_put (inode, this, (uint64_t)(unsigned long)wb_inode);
|
||||
if (ret) {
|
||||
@ -1047,6 +1096,30 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
|
||||
wb_inode = head->wb_inode;
|
||||
|
||||
/* There could be a readdirp session in progress. Since wb_fulfill_cbk
|
||||
* can potentially remove a request from liability queue,
|
||||
* wb_readdirp_cbk will miss writes on this inode (as it invalidates
|
||||
* stats only if liability queue is not empty) and hence mark inode
|
||||
* for invalidation of stats in readdirp response. Specifically this
|
||||
* code fixes the following race mentioned in wb_readdirp_cbk:
|
||||
*/
|
||||
|
||||
/* <removed comment from wb_readdirp_cbk>
|
||||
* We cannot guarantee integrity of entry->d_stat as there are cached
|
||||
* writes. The stat is most likely stale as it doesn't account the
|
||||
* cached writes. However, checking for non-empty liability list here is
|
||||
* not a fool-proof solution as there can be races like,
|
||||
* 1. readdirp is successful on posix
|
||||
* 2. sync of cached write is successful on posix
|
||||
* 3. write-behind received sync response and removed the request from
|
||||
* liability queue
|
||||
* 4. readdirp response is processed at write-behind
|
||||
*
|
||||
* In the above scenario, stat for the file is sent back in readdirp
|
||||
* response but it is stale.
|
||||
* </comment> */
|
||||
wb_set_invalidate (wb_inode, 1);
|
||||
|
||||
if (op_ret == -1) {
|
||||
wb_fulfill_err (head, op_errno);
|
||||
} else if (op_ret < head->total_size) {
|
||||
@ -1070,14 +1143,13 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
head->total_size += req->write_size; \
|
||||
} while (0)
|
||||
|
||||
|
||||
int
|
||||
wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)
|
||||
{
|
||||
struct iovec vector[MAX_VECTOR_COUNT];
|
||||
int count = 0;
|
||||
wb_request_t *req = NULL;
|
||||
call_frame_t *frame = NULL;
|
||||
int count = 0;
|
||||
wb_request_t *req = NULL;
|
||||
call_frame_t *frame = NULL;
|
||||
|
||||
/* make sure head->total_size is updated before we run into any
|
||||
* errors
|
||||
@ -1148,7 +1220,7 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
|
||||
|
||||
conf = wb_inode->this->private;
|
||||
|
||||
list_for_each_entry_safe (req, tmp, liabilities, winds) {
|
||||
list_for_each_entry_safe (req, tmp, liabilities, winds) {
|
||||
list_del_init (&req->winds);
|
||||
|
||||
if (!head) {
|
||||
@ -2442,6 +2514,48 @@ noqueue:
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
wb_mark_readdirp_start (xlator_t *this, inode_t *directory)
|
||||
{
|
||||
wb_inode_t *wb_directory_inode = NULL;
|
||||
|
||||
wb_directory_inode = wb_inode_create (this, directory);
|
||||
|
||||
LOCK (&wb_directory_inode->lock);
|
||||
{
|
||||
GF_ATOMIC_INC (wb_directory_inode->readdirps);
|
||||
}
|
||||
UNLOCK (&wb_directory_inode->lock);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
static void
|
||||
wb_mark_readdirp_end (xlator_t *this, inode_t *directory)
|
||||
{
|
||||
wb_inode_t *wb_directory_inode = NULL, *wb_inode = NULL, *tmp = NULL;
|
||||
int readdirps = 0;
|
||||
|
||||
wb_directory_inode = wb_inode_ctx_get (this, directory);
|
||||
|
||||
LOCK (&wb_directory_inode->lock);
|
||||
{
|
||||
readdirps = GF_ATOMIC_DEC (wb_directory_inode->readdirps);
|
||||
if (readdirps)
|
||||
goto unlock;
|
||||
|
||||
list_for_each_entry_safe (wb_inode, tmp,
|
||||
&wb_directory_inode->invalidate_list,
|
||||
invalidate_list) {
|
||||
list_del_init (&wb_inode->invalidate_list);
|
||||
GF_ATOMIC_SWAP (wb_inode->invalidate, 0);
|
||||
}
|
||||
}
|
||||
unlock:
|
||||
UNLOCK (&wb_directory_inode->lock);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t
|
||||
wb_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
@ -2451,6 +2565,10 @@ wb_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
wb_inode_t *wb_inode = NULL;
|
||||
gf_dirent_t *entry = NULL;
|
||||
inode_t *inode = NULL;
|
||||
fd_t *fd = NULL;
|
||||
|
||||
fd = frame->local;
|
||||
frame->local = NULL;
|
||||
|
||||
if (op_ret <= 0)
|
||||
goto unwind;
|
||||
@ -2465,29 +2583,8 @@ wb_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
|
||||
LOCK (&wb_inode->lock);
|
||||
{
|
||||
if (!list_empty (&wb_inode->liability)) {
|
||||
/* We cannot guarantee integrity of
|
||||
entry->d_stat as there are cached writes.
|
||||
The stat is most likely stale as it doesn't
|
||||
account the cached writes. However, checking
|
||||
for non-empty liability list here is not a
|
||||
fool-proof solution as there can be races
|
||||
like,
|
||||
1. readdirp is successful on posix
|
||||
2. sync of cached write is successful on
|
||||
posix
|
||||
3. write-behind received sync response and
|
||||
removed the request from liability queue
|
||||
4. readdirp response is processed at
|
||||
write-behind
|
||||
|
||||
In the above scenario, stat for the file is
|
||||
sent back in readdirp response but it is
|
||||
stale.
|
||||
|
||||
For lack of better solutions I am sticking
|
||||
with current solution.
|
||||
*/
|
||||
if (!list_empty (&wb_inode->liability) ||
|
||||
GF_ATOMIC_GET (wb_inode->invalidate)) {
|
||||
inode = entry->inode;
|
||||
|
||||
entry->inode = NULL;
|
||||
@ -2500,9 +2597,11 @@ wb_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
UNLOCK (&wb_inode->lock);
|
||||
}
|
||||
|
||||
wb_mark_readdirp_end (this, fd->inode);
|
||||
|
||||
unwind:
|
||||
STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno,
|
||||
entries, xdata);
|
||||
frame->local = NULL;
|
||||
STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -2511,6 +2610,10 @@ int32_t
|
||||
wb_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
|
||||
off_t off, dict_t *xdata)
|
||||
{
|
||||
wb_mark_readdirp_start (this, fd->inode);
|
||||
|
||||
frame->local = fd;
|
||||
|
||||
STACK_WIND (frame, wb_readdirp_cbk, FIRST_CHILD(this),
|
||||
FIRST_CHILD(this)->fops->readdirp,
|
||||
fd, size, off, xdata);
|
||||
|
Loading…
x
Reference in New Issue
Block a user