lock: Add fencing support

design reference: https://review.gluster.org/#/c/glusterfs-specs/+/21925/

This patch adds the lock preempt support.

Note: The current model stores lock enforcement information as separate
xattr on disk. There is another effort going in parallel to store this
in stat(x) of the file. This patch is self sufficient to add fencing
support. Based on the availability of the stat(x) support, either this
patch will be rebased or the necessary bits can be modified after this
patch is merged.

Change-Id: If4a42f3e0afaee1f66cdb0360ad4e0c005b5b017
updates: #466
Signed-off-by: Susant Palai <spalai@redhat.com>
This commit is contained in:
Susant Palai 2018-11-30 15:04:17 +05:30 committed by Amar Tumballi
parent 11cf73bc41
commit ebaf09a2a3
10 changed files with 1029 additions and 122 deletions

View File

@ -179,6 +179,8 @@
#define GLUSTERFS_INTERNAL_FOP_KEY "glusterfs-internal-fop"
#define GF_ENFORCE_MANDATORY_LOCK "trusted.glusterfs.enforce-mandatory-lock"
/* GlusterFS Internal FOP Indicator flags
* (To pass information on the context in which a particular
* fop is performed between translators)

View File

@ -0,0 +1,229 @@
#include <glusterfs/api/glfs.h>
#include <glusterfs/api/glfs-handles.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define NO_INIT 1
#define GF_ENFORCE_MANDATORY_LOCK "trusted.glusterfs.enforce-mandatory-lock"
FILE *fp;
char *buf = "0123456789";
/* Log an error for the failing API call `func` (errno value `err`),
 * tagged with the source line.  Before the log file `fp` is opened the
 * message goes to stderr; afterwards it goes to the log file.
 * BUG FIX: the stderr format string began with "%\n", a stray '%' that
 * forms an invalid conversion specifier (undefined behavior in fprintf). */
#define LOG_ERR(func, err) \
    do { \
        if (!fp) { \
            fprintf(stderr, "\n%d %s : returned error (%s)\n", __LINE__, \
                    func, strerror(err)); \
            fflush(stderr); \
        } else { \
            fprintf(fp, "\n%d %s : returned error (%s)\n", __LINE__, func, \
                    strerror(err)); \
            fflush(fp); \
        } \
    } while (0)
/* Create a gfapi client handle for volume `volname` served by `hostname`,
 * logging at level 7 to `log_file`.  When `flag` == NO_INIT the handle is
 * returned before glfs_init() so the caller can drive initialization
 * itself.  Returns the handle, or NULL on any failure (errors are written
 * to the global log file `fp`). */
glfs_t *
setup_new_client(char *hostname, char *volname, char *log_file, int flag)
{
    int ret;
    glfs_t *fs = glfs_new(volname);

    if (!fs) {
        fprintf(fp, "\nglfs_new: returned NULL (%s)\n", strerror(errno));
        return NULL;
    }

    ret = glfs_set_volfile_server(fs, "tcp", hostname, 24007);
    if (ret < 0) {
        fprintf(fp, "\nglfs_set_volfile_server failed ret:%d (%s)\n", ret,
                strerror(errno));
        return NULL;
    }

    ret = glfs_set_logging(fs, log_file, 7);
    if (ret < 0) {
        fprintf(fp, "\nglfs_set_logging failed with ret: %d (%s)\n", ret,
                strerror(errno));
        return NULL;
    }

    /* NO_INIT callers want an un-initialized handle back */
    if (flag != NO_INIT) {
        ret = glfs_init(fs);
        if (ret < 0) {
            fprintf(fp, "\nglfs_init failed with ret: %d (%s)\n", ret,
                    strerror(errno));
            return NULL;
        }
    }

    return fs;
}
/* test plan
 *
 * - take mandatory lock from client 1
 * - preempt mandatory lock from client 2
 * - write from client 1 which should fail
 *
 * Returns 0 on success, -1 on failure.
 */
int
test(glfs_t *fs1, glfs_t *fs2, char *fname)
{
    struct flock lock;
    int ret = 0;
    glfs_fd_t *fd1 = NULL;
    glfs_fd_t *fd2 = NULL;

    fd1 = glfs_creat(fs1, fname, O_RDWR, 0777);
    if (!fd1) {
        /* BUG FIX: was "if (ret)", testing a stale value; glfs_creat
           signals failure by returning NULL with errno set */
        LOG_ERR("glfs_creat", errno);
        ret = -1;
        goto out;
    }

    fd2 = glfs_open(fs2, fname, O_RDWR | O_NONBLOCK);
    if (!fd2) {
        /* BUG FIX: same stale-ret check as above */
        LOG_ERR("glfs_open", errno);
        ret = -1;
        goto out;
    }

    /* initialize lock: write lock over the first 100 bytes */
    lock.l_type = F_WRLCK;
    lock.l_whence = SEEK_SET;
    lock.l_start = 0;
    lock.l_len = 100;

    /* mark the file for mandatory-lock enforcement.
       BUG FIX: pass the real value length instead of 8, which read
       5 bytes past the end of the "set" string literal */
    ret = glfs_fsetxattr(fd1, GF_ENFORCE_MANDATORY_LOCK, "set",
                         strlen("set"), 0);
    if (ret < 0) {
        LOG_ERR("glfs_fsetxattr", errno);
        ret = -1;
        goto out;
    }

    /* take a write mandatory lock */
    ret = glfs_file_lock(fd1, F_SETLKW, &lock, GLFS_LK_MANDATORY);
    if (ret) {
        LOG_ERR("glfs_file_lock", errno);
        goto out;
    }

    ret = glfs_write(fd1, buf, 10, 0);
    if (ret != 10) {
        LOG_ERR("glfs_write", errno);
        ret = -1;
        goto out;
    }

    /* write from client 2 should fail: client 1 holds the mandatory lock */
    ret = glfs_write(fd2, buf, 10, 0);
    if (ret != -1) {
        LOG_ERR("glfs_write", errno);
        ret = -1;
        goto out;
    }

    /* preempt mandatory lock from client 1 */
    ret = glfs_file_lock(fd2, F_SETLKW, &lock, GLFS_LK_MANDATORY);
    if (ret) {
        LOG_ERR("glfs_file_lock", errno);
        goto out;
    }

    /* write should succeed from client 2, which now owns the lock */
    ret = glfs_write(fd2, buf, 10, 0);
    if (ret == -1) {
        LOG_ERR("glfs_write", errno);
        goto out;
    }

    /* write should fail from client 1: its lock was preempted (fenced) */
    ret = glfs_write(fd1, buf, 10, 0);
    if (ret == 10) {
        LOG_ERR("glfs_write", errno);
        ret = -1;
        goto out;
    }

    ret = 0;

out:
    if (fd1) {
        glfs_close(fd1);
    }
    if (fd2) {
        glfs_close(fd2);
    }

    return ret;
}
/* Entry point: argv = <hostname> <volume> <log directory>.
 * Opens the test log, creates two independent gfapi clients on the same
 * volume and runs the fencing test.  Returns 0 on success, -1 on error. */
int
main(int argc, char *argv[])
{
    int ret = 0;
    glfs_t *fs1 = NULL;
    glfs_t *fs2 = NULL;
    char *volname = NULL;
    char log_file[100];
    char *hostname = NULL;
    char *fname = "/file";

    if (argc != 4) {
        fprintf(
            stderr,
            "Expect following args %s <hostname> <Vol> <log file location>\n",
            argv[0]);
        return -1;
    }

    hostname = argv[1];
    volname = argv[2];

    /* BUG FIX: snprintf instead of sprintf — argv[3] is caller-controlled
       and could overflow the fixed 100-byte buffer */
    snprintf(log_file, sizeof(log_file), "%s/%s", argv[3], "fence-basic.log");
    fp = fopen(log_file, "w");
    if (!fp) {
        fprintf(stderr, "\nfailed to open %s\n", log_file);
        fflush(stderr);
        return -1;
    }

    snprintf(log_file, sizeof(log_file), "%s/%s", argv[3],
             "glfs-client-1.log");
    fs1 = setup_new_client(hostname, volname, log_file, 0);
    if (!fs1) {
        LOG_ERR("setup_new_client", errno);
        ret = -1;
        goto error; /* BUG FIX: was "return -1", leaking the open fp */
    }

    snprintf(log_file, sizeof(log_file), "%s/%s", argv[3],
             "glfs-client-2.log");
    fs2 = setup_new_client(hostname, volname, log_file, 0);
    if (!fs2) {
        LOG_ERR("setup_new_client", errno);
        ret = -1;
        goto error;
    }

    ret = test(fs1, fs2, fname);

error:
    if (fs1) {
        glfs_fini(fs1);
    }
    if (fs2) {
        glfs_fini(fs2);
    }

    fclose(fp);

    return ret;
}

View File

@ -0,0 +1,29 @@
#!/bin/bash

# Basic lock-fencing test driver: builds fence-basic.c and runs it with
# two gfapi clients, where client 2 preempts client 1's mandatory lock.

. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc

cleanup;

TEST glusterd
TEST pidof glusterd
TEST $CLI volume info;

# a single-brick volume is enough to exercise lock semantics
TEST $CLI volume create $V0 $H0:$B0/brick1
EXPECT 'Created' volinfo_field $V0 'Status';
TEST $CLI volume start $V0;
EXPECT 'Started' volinfo_field $V0 'Status';

TEST $CLI volume set $V0 diagnostics.client-log-flush-timeout 30
# write-behind would buffer writes client-side and mask the server-side
# lock-enforcement failures the C test asserts on
TEST $CLI volume set $V0 performance.write-behind off
# mandatory locking must be forced for fencing to be enforced
TEST $CLI volume set $V0 locks.mandatory-locking forced

logdir=`gluster --print-logdir`

TEST build_tester $(dirname $0)/fence-basic.c -lgfapi -ggdb
TEST $(dirname $0)/fence-basic $H0 $V0 $logdir
cleanup_tester $(dirname $0)/fence-basic

cleanup;

View File

@ -0,0 +1,61 @@
#!/bin/bash

# With the lock-enforcement xattr set, plain writes (without holding the
# mandatory lock) should fail; removing the xattr re-enables them.  Also
# verifies the enforcement flag survives a brick restart (read from disk).

. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc

cleanup;

TEST glusterd
TEST pidof glusterd
TEST $CLI volume info;

TEST $CLI volume create $V0 $H0:$B0/${V0}1
EXPECT 'Created' volinfo_field $V0 'Status';
TEST $CLI volume start $V0;
EXPECT 'Started' volinfo_field $V0 'Status';

# write-behind would buffer writes and hide enforcement failures
TEST $CLI volume set $V0 performance.write-behind off

TEST glusterfs -s $H0 --volfile-id $V0 $M0

TEST touch $M0/file

#write should pass
TEST "echo "test" > $M0/file"
TEST "truncate -s 0 $M0/file"

#enable mandatory locking
TEST $CLI volume set $V0 locks.mandatory-locking forced

#write should pass (enforcement xattr not set yet)
TEST "echo "test" >> $M0/file"
TEST "truncate -s 0 $M0/file"

#enforce lock on the file
TEST setfattr -n trusted.glusterfs.enforce-mandatory-lock -v 1 $M0/file

#write should fail
TEST ! "echo "test" >> $M0/file"
TEST ! "truncate -s 0 $M0/file"

#remove lock enforcement flag
TEST setfattr -x trusted.glusterfs.enforce-mandatory-lock $M0/file

#write should pass
TEST "echo "test" >> $M0/file"
TEST "truncate -s 0 $M0/file"

#enforce lock on the file
TEST setfattr -n trusted.glusterfs.enforce-mandatory-lock -v 1 $M0/file

#kill brick
TEST kill_brick $V0 $H0 $B0/${V0}1
TEST $CLI volume start $V0 force

# wait a couple of seconds for the brick to come online
sleep 2

#write should fail (lock xlator gets lock enforcement info from disk)
TEST ! "echo "test" >> $M0/file"
TEST ! "truncate -s 0 $M0/file"

cleanup;

View File

@ -17,6 +17,7 @@
#include <glusterfs/xlator.h>
#include <glusterfs/logging.h>
#include <glusterfs/common-utils.h>
#include <glusterfs/syncop.h>
#include "locks.h"
#include "common.h"
@ -322,7 +323,7 @@ pl_trace_flush(xlator_t *this, call_frame_t *frame, fd_t *fd)
if (!priv->trace)
return;
pl_inode = pl_inode_get(this, fd->inode);
pl_inode = pl_inode_get(this, fd->inode, NULL);
if (pl_inode && __pl_inode_is_empty(pl_inode))
return;
@ -358,7 +359,7 @@ pl_update_refkeeper(xlator_t *this, inode_t *inode)
int need_unref = 0;
int need_ref = 0;
pl_inode = pl_inode_get(this, inode);
pl_inode = pl_inode_get(this, inode, NULL);
if (!pl_inode)
return;
@ -385,8 +386,51 @@ pl_update_refkeeper(xlator_t *this, inode_t *inode)
inode_ref(inode);
}
/* Get lock enforcement info from disk.
 *
 * Synchronously reads the GF_ENFORCE_MANDATORY_LOCK xattr of the file
 * (fd-based when local->fd is set, loc-based otherwise) and caches the
 * result in pl_inode->mlock_enforced under pl_inode->mutex.
 *
 * check_mlock_info stays true only on a transient getxattr error, so the
 * next caller retries; success or a definitive ENODATA (xattr absent)
 * clears it.
 *
 * Returns 0 unless local is NULL (then -1); the getxattr result itself is
 * not propagated to the caller.
 *
 * NOTE(review): runs syncop (blocking) getxattr — callers must be in a
 * synctask/safe context; confirm all call sites.
 * NOTE(review): xdata_rsp returned by syncop_*getxattr does not appear to
 * be unref'd here — possible dict leak; verify ownership convention. */
int
pl_fetch_mlock_info_from_disk(xlator_t *this, pl_inode_t *pl_inode,
pl_local_t *local)
{
dict_t *xdata_rsp = NULL;
int ret = 0;
int op_ret = 0;
if (!local) {
return -1;
}
/* prefer the fd-based variant when an open fd is available */
if (local->fd) {
op_ret = syncop_fgetxattr(this, local->fd, &xdata_rsp,
GF_ENFORCE_MANDATORY_LOCK, NULL, NULL);
} else {
op_ret = syncop_getxattr(this, &local->loc[0], &xdata_rsp,
GF_ENFORCE_MANDATORY_LOCK, NULL, NULL);
}
pthread_mutex_lock(&pl_inode->mutex);
{
if (op_ret >= 0) {
/* xattr present: enforcement is on; no need to re-read */
pl_inode->mlock_enforced = _gf_true;
pl_inode->check_mlock_info = _gf_false;
} else {
/* syncop returns -errno on failure */
gf_msg(this->name, GF_LOG_WARNING, -op_ret, 0,
"getxattr failed with %d", op_ret);
pl_inode->mlock_enforced = _gf_false;
if (-op_ret == ENODATA) {
/* xattr definitively absent: cache the negative result */
pl_inode->check_mlock_info = _gf_false;
} else {
/* transient error: retry on the next opportunity */
pl_inode->check_mlock_info = _gf_true;
}
}
}
pthread_mutex_unlock(&pl_inode->mutex);
return ret;
}
pl_inode_t *
pl_inode_get(xlator_t *this, inode_t *inode)
pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local)
{
uint64_t tmp_pl_inode = 0;
pl_inode_t *pl_inode = NULL;
@ -399,6 +443,7 @@ pl_inode_get(xlator_t *this, inode_t *inode)
pl_inode = (pl_inode_t *)(long)tmp_pl_inode;
goto unlock;
}
pl_inode = GF_CALLOC(1, sizeof(*pl_inode), gf_locks_mt_pl_inode_t);
if (!pl_inode) {
goto unlock;
@ -407,6 +452,7 @@ pl_inode_get(xlator_t *this, inode_t *inode)
gf_log(this->name, GF_LOG_TRACE, "Allocating new pl inode");
pthread_mutex_init(&pl_inode->mutex, NULL);
pthread_cond_init(&pl_inode->check_fop_wind_count, 0);
INIT_LIST_HEAD(&pl_inode->dom_list);
INIT_LIST_HEAD(&pl_inode->ext_list);
@ -418,6 +464,9 @@ pl_inode_get(xlator_t *this, inode_t *inode)
INIT_LIST_HEAD(&pl_inode->queued_locks);
gf_uuid_copy(pl_inode->gfid, inode->gfid);
pl_inode->check_mlock_info = _gf_true;
pl_inode->mlock_enforced = _gf_false;
ret = __inode_ctx_put(inode, this, (uint64_t)(long)(pl_inode));
if (ret) {
pthread_mutex_destroy(&pl_inode->mutex);
@ -429,6 +478,15 @@ pl_inode_get(xlator_t *this, inode_t *inode)
unlock:
UNLOCK(&inode->lock);
if (pl_is_mandatory_locking_enabled(pl_inode) &&
pl_inode->check_mlock_info && local) {
/* Note: The lock enforcement information per file can be stored in the
attribute flag of stat(x) in posix. With that there won't be a need
for doing getxattr post a reboot
*/
pl_fetch_mlock_info_from_disk(this, pl_inode, local);
}
return pl_inode;
}
@ -1070,3 +1128,148 @@ pl_does_monkey_want_stuck_lock()
return _gf_true;
return _gf_false;
}
/* Preempt every conflicting holder of @pl_inode so that @reqlock can be
 * granted to the new owner:
 *   - blocked lock requests are failed with EBUSY (unwound outside the
 *     mutex, collected on unwind_blist),
 *   - granted locks from other owners that overlap @reqlock are deleted,
 *   - same-owner locks are left for __insert_and_merge to add/subtract,
 *   - queued read/write requests are failed with EBUSY,
 *   - finally we wait for in-flight fops to drain (fop_wind_count) so the
 *     previous owner has no IO pending when the preemption completes.
 * Returns 0. */
int
pl_lock_preempt(pl_inode_t *pl_inode, posix_lock_t *reqlock)
{
    posix_lock_t *lock = NULL;
    posix_lock_t *i = NULL;
    pl_rw_req_t *rw = NULL;
    pl_rw_req_t *itr = NULL;
    struct list_head unwind_blist = {
        0,
    };
    struct list_head unwind_rw_list = {
        0,
    };
    int ret = 0;

    INIT_LIST_HEAD(&unwind_blist);
    INIT_LIST_HEAD(&unwind_rw_list);

    pthread_mutex_lock(&pl_inode->mutex);
    {
        /*
        - go through the lock list
        - remove all locks from different owners
        - same owner locks will be added or subtracted based on
          the new request
        - add the new lock
        */
        list_for_each_entry_safe(lock, i, &pl_inode->ext_list, list)
        {
            if (lock->blocked) {
                /* fail blocked requests later, outside the mutex */
                list_del_init(&lock->list);
                list_add(&lock->list, &unwind_blist);
                continue;
            }

            if (locks_overlap(lock, reqlock)) {
                if (same_owner(lock, reqlock))
                    continue;

                /* remove conflicting locks */
                list_del_init(&lock->list);
                __delete_lock(lock);
                __destroy_lock(lock);
            }
        }

        __insert_and_merge(pl_inode, reqlock);

        /* move queued IO requests aside to fail them outside the mutex */
        list_for_each_entry_safe(rw, itr, &pl_inode->rw_list, list)
        {
            list_del_init(&rw->list);
            list_add(&rw->list, &unwind_rw_list);
        }

        /* drain in-flight fops before handing the lock over */
        while (pl_inode->fop_wind_count != 0) {
            gf_msg(THIS->name, GF_LOG_TRACE, 0, 0,
                   "waiting for fops to be drained");
            pthread_cond_wait(&pl_inode->check_fop_wind_count,
                              &pl_inode->mutex);
        }
    }
    pthread_mutex_unlock(&pl_inode->mutex);

    /* unwind blocked locks */
    list_for_each_entry_safe(lock, i, &unwind_blist, list)
    {
        PL_STACK_UNWIND_AND_FREE(((pl_local_t *)lock->frame->local), lk,
                                 lock->frame, -1, EBUSY, &lock->user_flock,
                                 NULL);
        __destroy_lock(lock);
    }

    /* unwind blocked IOs */
    list_for_each_entry_safe(rw, itr, &unwind_rw_list, list)
    {
        pl_clean_local(rw->stub->frame->local);
        call_unwind_error(rw->stub, -1, EBUSY);
        /* BUG FIX: free the rw request itself; the original called
           GF_FREE(lock), freeing a stale cursor left over from the
           blocked-locks loop and leaking every pl_rw_req_t */
        GF_FREE(rw);
    }

    return ret;
}
/* Return true in case we need to ensure mandatory-locking
 * semantics under different modes.
 */
gf_boolean_t
pl_is_mandatory_locking_enabled(pl_inode_t *pl_inode)
{
    posix_locks_private_t *priv = THIS->private;

    switch (priv->mandatory_mode) {
        case MLK_FORCED:
        case MLK_OPTIMAL:
            /* volume-wide mandatory locking: always enforced */
            return _gf_true;
        case MLK_FILE_BASED:
            /* enforced only for files marked mandatory */
            return pl_inode->mandatory ? _gf_true : _gf_false;
        default:
            break;
    }

    return _gf_false;
}
/* Release every resource held by a pl_local_t and return it to the mem
 * pool.  Safe to call with NULL.  Mirrors the cleanup sequence of
 * PL_STACK_UNWIND_AND_FREE for paths that bypass the macro. */
void
pl_clean_local(pl_local_t *local)
{
    if (!local) {
        return;
    }

    if (local->inodelk_dom_count_req) {
        data_unref(local->inodelk_dom_count_req);
    }

    loc_wipe(&local->loc[0]);
    loc_wipe(&local->loc[1]);

    if (local->fd) {
        fd_unref(local->fd);
    }

    if (local->inode) {
        inode_unref(local->inode);
    }

    mem_put(local);
}
/*
 TODO: detach local initialization from PL_LOCAL_GET_REQUESTS and add it here
*/
/* Ensure frame->local is populated with a pl_local_t holding a ref on the
 * target inode (taken from fd if given, else from loc).  No-op when the
 * frame already has a local.  Returns 0 on success, -1 on bad args or
 * allocation failure. */
int
pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
{
    pl_local_t *local = NULL;

    if (!loc && !fd) {
        return -1;
    }

    if (frame->local) {
        /* already initialized by an earlier hop */
        return 0;
    }

    local = mem_get0(this->local_pool);
    if (!local) {
        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
               "mem allocation failed");
        return -1;
    }

    local->inode = (fd ? inode_ref(fd->inode) : inode_ref(loc->inode));
    frame->local = local;

    return 0;
}

View File

@ -32,12 +32,29 @@
#define SET_FLOCK_PID(flock, lock) ((flock)->l_pid = lock->client_pid)
/* Unwind fop on `frame` with `op_ret`/params and then release the
 * pl_local_t.  frame->local is detached BEFORE the unwind so that no
 * xlator further up the stack can see (or double-free) the local while
 * it is being torn down; the resources in __local are released only
 * after STACK_UNWIND_STRICT returns.  Keep this cleanup sequence in
 * sync with pl_clean_local(). */
#define PL_STACK_UNWIND_AND_FREE(__local, fop, frame, op_ret, params...) \
do { \
frame->local = NULL; \
STACK_UNWIND_STRICT(fop, frame, op_ret, params); \
if (__local) { \
if (__local->inodelk_dom_count_req) \
data_unref(__local->inodelk_dom_count_req); \
loc_wipe(&__local->loc[0]); \
loc_wipe(&__local->loc[1]); \
if (__local->fd) \
fd_unref(__local->fd); \
if (__local->inode) \
inode_unref(__local->inode); \
mem_put(__local); \
} \
} while (0)
posix_lock_t *
new_posix_lock(struct gf_flock *flock, client_t *client, pid_t client_pid,
gf_lkowner_t *owner, fd_t *fd, uint32_t lk_flags, int can_block);
pl_inode_t *
pl_inode_get(xlator_t *this, inode_t *inode);
pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local);
posix_lock_t *
pl_getlk(pl_inode_t *inode, posix_lock_t *lock);
@ -45,6 +62,9 @@ pl_getlk(pl_inode_t *inode, posix_lock_t *lock);
int
pl_setlk(xlator_t *this, pl_inode_t *inode, posix_lock_t *lock, int can_block);
int
pl_lock_preempt(pl_inode_t *pl_inode, posix_lock_t *reqlock);
void
grant_blocked_locks(xlator_t *this, pl_inode_t *inode);
@ -182,4 +202,14 @@ __pl_queue_lock(pl_inode_t *pl_inode, posix_lock_t *reqlock);
gf_boolean_t
pl_does_monkey_want_stuck_lock();
gf_boolean_t
pl_is_mandatory_locking_enabled(pl_inode_t *pl_inode);
void
pl_clean_local(pl_local_t *local);
int
pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd);
#endif /* __COMMON_H__ */

View File

@ -647,7 +647,7 @@ check_entrylk_on_basename(xlator_t *this, inode_t *parent, char *basename)
pl_dom_list_t *dom = NULL;
pl_entry_lock_t *conf = NULL;
pl_inode_t *pinode = pl_inode_get(this, parent);
pl_inode_t *pinode = pl_inode_get(this, parent, NULL);
if (!pinode)
goto out;
pthread_mutex_lock(&pinode->mutex);
@ -769,7 +769,7 @@ pl_common_entrylk(call_frame_t *frame, xlator_t *this, const char *volume,
if (xdata)
dict_ret = dict_get_str(xdata, "connection-id", &conn_id);
pinode = pl_inode_get(this, inode);
pinode = pl_inode_get(this, inode, NULL);
if (!pinode) {
op_errno = ENOMEM;
goto out;

View File

@ -992,7 +992,7 @@ pl_common_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
}
}
pinode = pl_inode_get(this, inode);
pinode = pl_inode_get(this, inode, NULL);
if (!pinode) {
op_errno = ENOMEM;
goto unwind;

View File

@ -179,6 +179,24 @@ struct __pl_inode {
of inode_t as long as there are
locks on it */
gf_boolean_t migrated;
/* Flag to indicate whether to read mlock-enforce xattr from disk */
gf_boolean_t check_mlock_info;
/* Mandatory_lock enforce: IO will be allowed if and only if the lkowner has
held the lock.
Note: An xattr is set on the file to recover this information post
reboot. If client does not want mandatory lock to be enforced, then it
should remove this xattr explicitly
*/
gf_boolean_t mlock_enforced;
/* There are scenarios where mandatory lock is granted but there are IOs
pending at posix level. To avoid this before preempting the previous lock
owner, we wait for all the fops to be unwound.
*/
int fop_wind_count;
pthread_cond_t check_fop_wind_count;
};
typedef struct __pl_inode pl_inode_t;
@ -213,12 +231,14 @@ typedef struct {
dict_t *xdata;
loc_t loc[2];
fd_t *fd;
inode_t *inode;
off_t offset;
glusterfs_fop_t op;
gf_boolean_t entrylk_count_req;
gf_boolean_t inodelk_count_req;
gf_boolean_t posixlk_count_req;
gf_boolean_t parent_entrylk_req;
int update_mlock_enforced_flag;
} pl_local_t;
typedef struct {
@ -239,6 +259,8 @@ typedef struct _locks_ctx {
struct list_head metalk_list;
} pl_ctx_t;
typedef enum { DECREMENT, INCREMENT } pl_count_op_t;
pl_ctx_t *
pl_ctx_get(client_t *client, xlator_t *xlator);

File diff suppressed because it is too large Load Diff