geo-rep: log ENTRY failures from slave on master

ENTRY operation failures on the slave left no trace for debugging purposes.
This patch captures such failures on the slave cluster, forwards them to
the master, and logs them there. Of specific interest are the failures
where the failing operation returns the error code EEXIST.

Change-Id: Iecab876f16593c746d53f4b7ec2e0783367856bb
BUG: 1207115
Signed-off-by: Milind Changire <mchangir@redhat.com>
Reviewed-on: http://review.gluster.org/10048
Reviewed-by: Aravinda VK <avishwan@redhat.com>
Tested-by: NetBSD Build System
Tested-by: Gluster Build System <jenkins@build.gluster.com>
This commit is contained in:
Milind Changire 2015-03-30 16:53:05 +05:30 committed by Vijay Bellur
parent 1adf231fc4
commit 70a729e975
3 changed files with 48 additions and 10 deletions

View File

@ -927,6 +927,12 @@ class GMasterChangelogMixin(GMasterCommon):
def purge_update():
files_pending['purge'] += 1
def log_failures(failures, entry_key, gfid_prefix, log_prefix):
    """Log the failures reported back by the slave for one batch of ops.

    failures    -- iterable of tuples whose first element is the entry
                   dict of the failed operation; None or empty means
                   there is nothing to report
    entry_key   -- key in that entry dict whose value names the path to
                   check (the callers pass 'gfid' or 'go')
    gfid_prefix -- directory the path is joined onto before the check
    log_prefix  -- tag put at the front of the log line ('ENTRY', 'META')
    """
    # A slave that does not carry this change returns nothing from
    # entry_ops()/meta_ops(); treat that as "no failures" instead of
    # raising TypeError while iterating over None.
    for failure in failures or ():
        st = lstat(os.path.join(gfid_prefix, failure[0][entry_key]))
        # An int result is skipped -- presumably lstat() hands back an
        # errno when the path cannot be stat'ed on the master; TODO confirm.
        if not isinstance(st, int):
            logging.warning('%s FAILED: %s' % (log_prefix, repr(failure)))
for e in clist:
e = e.strip()
et = e[self.IDX_START:self.IDX_END] # entry type
@ -1029,7 +1035,8 @@ class GMasterChangelogMixin(GMasterCommon):
self.update_worker_cumilitive_status(files_pending)
# sync namespace
if entries:
self.slave.server.entry_ops(entries)
failures = self.slave.server.entry_ops(entries)
log_failures(failures, 'gfid', gauxpfx(), 'ENTRY')
# sync metadata
if meta_gfid:
meta_entries = []
@ -1043,7 +1050,8 @@ class GMasterChangelogMixin(GMasterCommon):
continue
meta_entries.append(edct('META', go=go[0], stat=st))
if meta_entries:
self.slave.server.meta_ops(meta_entries)
failures = self.slave.server.meta_ops(meta_entries)
log_failures(failures, 'go', '', 'META')
# sync data
if datas:
self.a_syncdata(datas)

View File

@ -607,6 +607,19 @@ class Server(object):
er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY])
if er == ENOTEMPTY:
return er
def collect_failure(e, cmd_ret):
# Record a failed operation in the enclosing `failures` list, which the
# surrounding function returns to its caller. This is done for fops
# that fail on the slave; the master is expected to log them.
#   e       -- entry dict of the operation ('entry' and 'gfid' are read)
#   cmd_ret -- value returned by errno_wrap() for the operation
# NOTE(review): indentation is not preserved in this view, so which `if`
# the `else` below belongs to cannot be determined here -- confirm
# against the real file before relying on this description.
if cmd_ret == EEXIST:
# presumably resolves the gfid that currently exists at this path on
# the slave -- verify against gfid_mnt()
disk_gfid = cls.gfid_mnt(e['entry'])
# Only a string result is compared; any other result type is ignored.
if isinstance(disk_gfid, basestring):
# A differing gfid is reported together with the gfid found.
if e['gfid'] != disk_gfid:
failures.append((e, cmd_ret, disk_gfid))
else:
failures.append((e, cmd_ret))
failures = []
for e in entries:
blob = None
op = e['op']
@ -644,7 +657,10 @@ class Server(object):
(pg, bname) = entry2pb(entry)
blob = entry_pack_reg_stat(gfid, bname, e['stat'])
else:
errno_wrap(os.link, [slink, entry], [ENOENT, EEXIST])
cmd_ret = errno_wrap(os.link,
[slink, entry],
[ENOENT, EEXIST])
collect_failure(e, cmd_ret)
elif op == 'SYMLINK':
blob = entry_pack_symlink(gfid, bname, e['link'], e['stat'])
elif op == 'RENAME':
@ -655,16 +671,22 @@ class Server(object):
(pg, bname) = entry2pb(en)
blob = entry_pack_reg_stat(gfid, bname, e['stat'])
else:
errno_wrap(os.rename, [entry, en], [ENOENT, EEXIST])
cmd_ret = errno_wrap(os.rename,
[entry, en],
[ENOENT, EEXIST])
collect_failure(e, cmd_ret)
if blob:
errno_wrap(Xattr.lsetxattr,
[pg, 'glusterfs.gfid.newfile', blob],
[EEXIST],
[ENOENT, ESTALE, EINVAL])
cmd_ret = errno_wrap(Xattr.lsetxattr,
[pg, 'glusterfs.gfid.newfile', blob],
[EEXIST],
[ENOENT, ESTALE, EINVAL])
collect_failure(e, cmd_ret)
return failures
@classmethod
def meta_ops(cls, meta_entries):
logging.debug('Meta-entries: %s' % repr(meta_entries))
failures = []
for e in meta_entries:
mode = e['stat']['mode']
uid = e['stat']['uid']
@ -672,10 +694,18 @@ class Server(object):
atime = e['stat']['atime']
mtime = e['stat']['mtime']
go = e['go']
errno_wrap(os.chmod, [go, mode], [ENOENT], [ESTALE, EINVAL])
cmd_ret = errno_wrap(os.chmod, [go, mode],
[ENOENT], [ESTALE, EINVAL])
# This is a fail fast mechanism
# We do this for failing fops on Slave
# Master should be logging this
if isinstance(cmd_ret, int):
failures.append((e, cmd_ret))
continue
errno_wrap(os.chown, [go, uid, gid], [ENOENT], [ESTALE, EINVAL])
errno_wrap(os.utime, [go, (atime, mtime)],
[ENOENT], [ESTALE, EINVAL])
return failures
@classmethod
@_pathguard

View File

@ -484,7 +484,7 @@ def errno_wrap(call, arg=[], errnos=[], retry_errnos=[ESTALE]):
if nr_tries == GF_OP_RETRIES:
# probably a screwed state, cannot do much...
logging.warn('reached maximum retries (%s)...' % repr(arg))
return
return ex.errno
time.sleep(0.250) # retry the call