cluster/afr: Don't let NFS cache stat after writes

Problem:
Afr does post-ops after write but the stat buffer it unwinds is at the
time of write, so if nfs client caches this, it will see different
ctime when it does stat on it after post-op is done. From NFS client's
perspective it thinks the file is changed. Tar which depends on this
to be correct keeps giving 'file changed as we read it' warning.
If Afr instead has to choose to unwind after post-op, eager-lock,
delayed-post-op will have to be disabled which will lead to bad
performance for all write usecases.

Fix:
Don't let client cache stat after write.

Change-Id: Ic6062acc6e5cdd97a9c83c56bd529ec83cee8a23
BUG: 1302948
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Signed-off-by: Anuradha Talur <atalur@redhat.com>
Reviewed-on: http://review.gluster.org/13785
Smoke: Gluster Build System <jenkins@build.gluster.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Niels de Vos <ndevos@redhat.com>
This commit is contained in:
Pranith Kumar K 2016-03-19 11:40:26 +05:30 committed by Pranith Kumar Karampuri
parent e3b2ed938a
commit 4c4624c9ba
10 changed files with 138 additions and 30 deletions

View File

@ -4425,3 +4425,27 @@ fop_enum_to_string (glusterfs_fop_t fop)
return "UNKNOWNFOP"; return "UNKNOWNFOP";
} }
gf_boolean_t
gf_is_zero_filled_stat (struct iatt *buf)
{
if (!buf)
return 1;
/* Do not use st_dev because it is transformed to store the xlator id
* in place of the device number. Do not use st_ino because by this time
* we've already mapped the root ino to 1 so it is not guaranteed to be
* 0.
*/
if ((buf->ia_nlink == 0) && (buf->ia_ctime == 0))
return 1;
return 0;
}
void
gf_zero_fill_stat (struct iatt *buf)
{
buf->ia_nlink = 0;
buf->ia_ctime = 0;
}

View File

@ -820,4 +820,10 @@ gf_nwrite (int fd, const void *buf, size_t count);
void _mask_cancellation (void); void _mask_cancellation (void);
void _unmask_cancellation (void); void _unmask_cancellation (void);
gf_boolean_t
gf_is_zero_filled_stat (struct iatt *buf);
void
gf_zero_fill_stat (struct iatt *buf);
#endif /* _COMMON_UTILS_H */ #endif /* _COMMON_UTILS_H */

View File

@ -0,0 +1,37 @@
#!/bin/bash
. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc
. $(dirname $0)/../../nfs.rc
TESTS_EXPECTED_IN_LOOP=10
cleanup;
#Basic checks
TEST glusterd
TEST pidof glusterd
#Create a distributed-replicate volume
TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1..6};
TEST $CLI volume set $V0 cluster.consistent-metadata on
TEST $CLI volume set $V0 cluster.post-op-delay-secs 0
TEST $CLI volume set $V0 nfs.rdirplus off
TEST $CLI volume set $V0 nfs.disable false
TEST $CLI volume start $V0
EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available;
# Mount NFS
mount_nfs $H0:/$V0 $N0 vers=3
#Create files
TEST mkdir -p $N0/nfs/dir1/dir2
for i in {1..10}; do
TEST_IN_LOOP dd if=/dev/urandom of=$N0/nfs/dir1/dir2/file$i bs=1024k count=1
done
TEST tar cvf /tmp/dir1.tar.gz $N0/nfs/dir1
TEST rm -f /tmp/dir1.tar.gz
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $N0
cleanup;

View File

@ -234,7 +234,9 @@ __afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_local_t *local = NULL; afr_local_t *local = NULL;
int child_index = (long) cookie; int child_index = (long) cookie;
int call_count = -1; int call_count = -1;
afr_private_t *priv = NULL;
priv = this->private;
local = frame->local; local = frame->local;
LOCK (&frame->lock); LOCK (&frame->lock);
@ -249,8 +251,13 @@ __afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (call_count == 0) { if (call_count == 0) {
__afr_dir_write_finalize (frame, this); __afr_dir_write_finalize (frame, this);
if (afr_txn_nothing_failed (frame, this)) if (afr_txn_nothing_failed (frame, this)) {
local->transaction.unwind (frame, this); /*if it did pre-op, it will do post-op changing ctime*/
if (priv->consistent_metadata &&
afr_needs_changelog_update (local))
afr_zero_fill_stat (local);
local->transaction.unwind (frame, this);
}
afr_mark_entry_pending_changelog (frame, this); afr_mark_entry_pending_changelog (frame, this);

View File

@ -182,7 +182,9 @@ __afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_local_t *local = NULL; afr_local_t *local = NULL;
int child_index = (long) cookie; int child_index = (long) cookie;
int call_count = -1; int call_count = -1;
afr_private_t *priv = NULL;
priv = this->private;
local = frame->local; local = frame->local;
LOCK (&frame->lock); LOCK (&frame->lock);
@ -198,8 +200,13 @@ __afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (call_count == 0) { if (call_count == 0) {
__afr_inode_write_finalize (frame, this); __afr_inode_write_finalize (frame, this);
if (afr_txn_nothing_failed (frame, this)) if (afr_txn_nothing_failed (frame, this)) {
local->transaction.unwind (frame, this); /*if it did pre-op, it will do post-op changing ctime*/
if (priv->consistent_metadata &&
afr_needs_changelog_update (local))
afr_zero_fill_stat (local);
local->transaction.unwind (frame, this);
}
local->transaction.resume (frame, this); local->transaction.resume (frame, this);
} }
@ -230,8 +237,13 @@ void
afr_writev_unwind (call_frame_t *frame, xlator_t *this) afr_writev_unwind (call_frame_t *frame, xlator_t *this)
{ {
afr_local_t * local = NULL; afr_local_t * local = NULL;
afr_private_t *priv = this->private;
local = frame->local; local = frame->local;
if (priv->consistent_metadata)
afr_zero_fill_stat (local);
AFR_STACK_UNWIND (writev, frame, AFR_STACK_UNWIND (writev, frame,
local->op_ret, local->op_errno, local->op_ret, local->op_errno,
&local->cont.inode_wfop.prebuf, &local->cont.inode_wfop.prebuf,

View File

@ -36,6 +36,37 @@ afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr,
afr_changelog_resume_t changelog_resume, afr_changelog_resume_t changelog_resume,
afr_xattrop_type_t op); afr_xattrop_type_t op);
void
afr_zero_fill_stat (afr_local_t *local)
{
if (!local)
return;
if (local->transaction.type == AFR_DATA_TRANSACTION ||
local->transaction.type == AFR_METADATA_TRANSACTION) {
gf_zero_fill_stat (&local->cont.inode_wfop.prebuf);
gf_zero_fill_stat (&local->cont.inode_wfop.postbuf);
} else if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
gf_zero_fill_stat (&local->cont.dir_fop.buf);
gf_zero_fill_stat (&local->cont.dir_fop.preparent);
gf_zero_fill_stat (&local->cont.dir_fop.postparent);
if (local->transaction.type == AFR_ENTRY_TRANSACTION)
return;
gf_zero_fill_stat (&local->cont.dir_fop.prenewparent);
gf_zero_fill_stat (&local->cont.dir_fop.postnewparent);
}
}
gf_boolean_t
afr_needs_changelog_update (afr_local_t *local)
{
if (local->transaction.type == AFR_DATA_TRANSACTION)
return _gf_true;
if (!local->optimistic_change_log)
return _gf_true;
return _gf_false;
}
static int32_t static int32_t
afr_quorum_errno (afr_private_t *priv) afr_quorum_errno (afr_private_t *priv)
{ {
@ -85,9 +116,21 @@ int
__afr_txn_write_done (call_frame_t *frame, xlator_t *this) __afr_txn_write_done (call_frame_t *frame, xlator_t *this)
{ {
afr_local_t *local = NULL; afr_local_t *local = NULL;
afr_private_t *priv = NULL;
gf_boolean_t unwind = _gf_false;
priv = this->private;
local = frame->local; local = frame->local;
if (priv->consistent_metadata) {
LOCK (&frame->lock);
{
unwind = (local->transaction.main_frame != NULL);
}
UNLOCK (&frame->lock);
if (unwind)/*It definitely did post-op*/
afr_zero_fill_stat (local);
}
local->transaction.unwind (frame, this); local->transaction.unwind (frame, this);
AFR_STACK_DESTROY (frame); AFR_STACK_DESTROY (frame);
@ -1232,8 +1275,7 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
goto err; goto err;
} }
if ((local->transaction.type == AFR_DATA_TRANSACTION || if (afr_needs_changelog_update (local)) {
!local->optimistic_change_log)) {
local->dirty[idx] = hton32(1); local->dirty[idx] = hton32(1);

View File

@ -52,5 +52,7 @@ int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this);
int __afr_txn_write_done (call_frame_t *frame, xlator_t *this); int __afr_txn_write_done (call_frame_t *frame, xlator_t *this);
call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame); call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame);
gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this); gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this);
gf_boolean_t afr_needs_changelog_update (afr_local_t *local);
void afr_zero_fill_stat (afr_local_t *local);
#endif /* __TRANSACTION_H__ */ #endif /* __TRANSACTION_H__ */

View File

@ -106,25 +106,6 @@ nfs_mntpath_to_xlator (xlator_list_t *cl, char *path)
} }
/* Returns 1 if the stat seems to be filled with zeroes. */
int
nfs_zero_filled_stat (struct iatt *buf)
{
if (!buf)
return 1;
/* Do not use st_dev because it is transformed to store the xlator id
* in place of the device number. Do not use st_ino because by this time
* we've already mapped the root ino to 1 so it is not guaranteed to be
* 0.
*/
if ((buf->ia_nlink == 0) && (buf->ia_ctime == 0))
return 1;
return 0;
}
void void
nfs_loc_wipe (loc_t *loc) nfs_loc_wipe (loc_t *loc)
{ {

View File

@ -37,9 +37,6 @@ nfs_path_to_xlator (xlator_list_t *cl, char *path);
extern xlator_t * extern xlator_t *
nfs_mntpath_to_xlator (xlator_list_t *cl, char *path); nfs_mntpath_to_xlator (xlator_list_t *cl, char *path);
extern int
nfs_zero_filled_stat (struct iatt *buf);
extern void extern void
nfs_loc_wipe (loc_t *loc); nfs_loc_wipe (loc_t *loc);

View File

@ -373,7 +373,7 @@ nfs3_stat_to_post_op_attr (struct iatt *buf)
* returning these zeroed out attrs. * returning these zeroed out attrs.
*/ */
attr.attributes_follow = FALSE; attr.attributes_follow = FALSE;
if (nfs_zero_filled_stat (buf)) if (gf_is_zero_filled_stat (buf))
goto out; goto out;
nfs3_stat_to_fattr3 (buf, &(attr.post_op_attr_u.attributes)); nfs3_stat_to_fattr3 (buf, &(attr.post_op_attr_u.attributes));
@ -394,7 +394,7 @@ nfs3_stat_to_pre_op_attr (struct iatt *pre)
* returning these zeroed out attrs. * returning these zeroed out attrs.
*/ */
poa.attributes_follow = FALSE; poa.attributes_follow = FALSE;
if (nfs_zero_filled_stat (pre)) if (gf_is_zero_filled_stat (pre))
goto out; goto out;
poa.attributes_follow = TRUE; poa.attributes_follow = TRUE;