geo-rep: log ENTRY failures from slave on master
ENTRY operation failures on the slave left no trace for debugging purposes. This patch captures such failures on the slave cluster, forwards them to the master, and logs them there. Failures of specific interest are those where the failing operation returns the error code EEXIST. Change-Id: Iecab876f16593c746d53f4b7ec2e0783367856bb BUG: 1207115 Signed-off-by: Milind Changire <mchangir@redhat.com> Reviewed-on: http://review.gluster.org/10048 Reviewed-by: Aravinda VK <avishwan@redhat.com> Tested-by: NetBSD Build System Tested-by: Gluster Build System <jenkins@build.gluster.com>
This commit is contained in:
parent
1adf231fc4
commit
70a729e975
@ -927,6 +927,12 @@ class GMasterChangelogMixin(GMasterCommon):
|
||||
def purge_update():
    """Bump the 'purge' counter in the enclosing ``files_pending`` dict."""
    files_pending['purge'] = files_pending['purge'] + 1
|
||||
|
||||
def log_failures(failures, entry_key, gfid_prefix, log_prefix):
    """Log slave-reported failures whose object can be lstat'ed locally.

    failures    -- iterable of tuples handed back by the slave call;
                   element 0 of each tuple is the entry dict that failed.
                   None is accepted and treated as "no failures"
                   (presumably what a slave without failure reporting
                   hands back -- confirm against the RPC layer).
    entry_key   -- key of the entry dict whose value is joined onto
                   gfid_prefix to form the path that is lstat'ed.
    gfid_prefix -- leading path component for that lookup; may be ''.
    log_prefix  -- tag written at the start of every log line.
    """
    # A missing result must not abort processing with a TypeError.
    for failure in failures or ():
        st = lstat(os.path.join(gfid_prefix, failure[0][entry_key]))
        # An int result from lstat() is taken to mean the lookup failed
        # (presumably an errno -- verify against the helper); such
        # failures are not reported.
        if not isinstance(st, int):
            logging.warning('%s FAILED: %s' % (log_prefix, repr(failure)))
|
||||
|
||||
for e in clist:
|
||||
e = e.strip()
|
||||
et = e[self.IDX_START:self.IDX_END] # entry type
|
||||
@ -1029,7 +1035,8 @@ class GMasterChangelogMixin(GMasterCommon):
|
||||
self.update_worker_cumilitive_status(files_pending)
|
||||
# sync namespace
|
||||
if entries:
|
||||
self.slave.server.entry_ops(entries)
|
||||
failures = self.slave.server.entry_ops(entries)
|
||||
log_failures(failures, 'gfid', gauxpfx(), 'ENTRY')
|
||||
# sync metadata
|
||||
if meta_gfid:
|
||||
meta_entries = []
|
||||
@ -1043,7 +1050,8 @@ class GMasterChangelogMixin(GMasterCommon):
|
||||
continue
|
||||
meta_entries.append(edct('META', go=go[0], stat=st))
|
||||
if meta_entries:
|
||||
self.slave.server.meta_ops(meta_entries)
|
||||
failures = self.slave.server.meta_ops(meta_entries)
|
||||
log_failures(failures, 'go', '', 'META')
|
||||
# sync data
|
||||
if datas:
|
||||
self.a_syncdata(datas)
|
||||
|
@ -607,6 +607,19 @@ class Server(object):
|
||||
er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY])
|
||||
if er == ENOTEMPTY:
|
||||
return er
|
||||
|
||||
def collect_failure(e, cmd_ret):
    """Append a failed fop to the enclosing ``failures`` list.

    e       -- entry dict being processed; 'entry' and 'gfid' are read.
    cmd_ret -- value returned by errno_wrap() for the fop.

    Only int results are treated as failures, the same test meta_ops
    applies to its own errno_wrap() result.  Anything else (presumably
    the wrapped call's own return value, e.g. None from os.link or
    os.rename -- confirm against errno_wrap) means the fop succeeded and
    nothing is recorded.

    For EEXIST the gfid found on disk is compared with the entry's gfid
    and a failure is recorded only when they differ, as the triple
    (e, cmd_ret, disk_gfid).  Every other errno is recorded as the pair
    (e, cmd_ret).
    """
    # We do this for failing fops on Slave
    # Master should be logging this
    if not isinstance(cmd_ret, int):
        # Not an errno: do not report a successful fop as a failure.
        return

    if cmd_ret != EEXIST:
        failures.append((e, cmd_ret))
        return

    disk_gfid = cls.gfid_mnt(e['entry'])
    # A non-string result from gfid_mnt() is skipped (presumably an errno
    # from a failed lookup -- verify); matching gfids are not reported.
    if isinstance(disk_gfid, basestring) and e['gfid'] != disk_gfid:
        failures.append((e, cmd_ret, disk_gfid))
|
||||
|
||||
failures = []
|
||||
for e in entries:
|
||||
blob = None
|
||||
op = e['op']
|
||||
@ -644,7 +657,10 @@ class Server(object):
|
||||
(pg, bname) = entry2pb(entry)
|
||||
blob = entry_pack_reg_stat(gfid, bname, e['stat'])
|
||||
else:
|
||||
errno_wrap(os.link, [slink, entry], [ENOENT, EEXIST])
|
||||
cmd_ret = errno_wrap(os.link,
|
||||
[slink, entry],
|
||||
[ENOENT, EEXIST])
|
||||
collect_failure(e, cmd_ret)
|
||||
elif op == 'SYMLINK':
|
||||
blob = entry_pack_symlink(gfid, bname, e['link'], e['stat'])
|
||||
elif op == 'RENAME':
|
||||
@ -655,16 +671,22 @@ class Server(object):
|
||||
(pg, bname) = entry2pb(en)
|
||||
blob = entry_pack_reg_stat(gfid, bname, e['stat'])
|
||||
else:
|
||||
errno_wrap(os.rename, [entry, en], [ENOENT, EEXIST])
|
||||
cmd_ret = errno_wrap(os.rename,
|
||||
[entry, en],
|
||||
[ENOENT, EEXIST])
|
||||
collect_failure(e, cmd_ret)
|
||||
if blob:
|
||||
errno_wrap(Xattr.lsetxattr,
|
||||
[pg, 'glusterfs.gfid.newfile', blob],
|
||||
[EEXIST],
|
||||
[ENOENT, ESTALE, EINVAL])
|
||||
cmd_ret = errno_wrap(Xattr.lsetxattr,
|
||||
[pg, 'glusterfs.gfid.newfile', blob],
|
||||
[EEXIST],
|
||||
[ENOENT, ESTALE, EINVAL])
|
||||
collect_failure(e, cmd_ret)
|
||||
return failures
|
||||
|
||||
@classmethod
|
||||
def meta_ops(cls, meta_entries):
|
||||
logging.debug('Meta-entries: %s' % repr(meta_entries))
|
||||
failures = []
|
||||
for e in meta_entries:
|
||||
mode = e['stat']['mode']
|
||||
uid = e['stat']['uid']
|
||||
@ -672,10 +694,18 @@ class Server(object):
|
||||
atime = e['stat']['atime']
|
||||
mtime = e['stat']['mtime']
|
||||
go = e['go']
|
||||
errno_wrap(os.chmod, [go, mode], [ENOENT], [ESTALE, EINVAL])
|
||||
cmd_ret = errno_wrap(os.chmod, [go, mode],
|
||||
[ENOENT], [ESTALE, EINVAL])
|
||||
# This is a fail fast mechanism
|
||||
# We do this for failing fops on Slave
|
||||
# Master should be logging this
|
||||
if isinstance(cmd_ret, int):
|
||||
failures.append((e, cmd_ret))
|
||||
continue
|
||||
errno_wrap(os.chown, [go, uid, gid], [ENOENT], [ESTALE, EINVAL])
|
||||
errno_wrap(os.utime, [go, (atime, mtime)],
|
||||
[ENOENT], [ESTALE, EINVAL])
|
||||
return failures
|
||||
|
||||
@classmethod
|
||||
@_pathguard
|
||||
|
@ -484,7 +484,7 @@ def errno_wrap(call, arg=[], errnos=[], retry_errnos=[ESTALE]):
|
||||
if nr_tries == GF_OP_RETRIES:
|
||||
# probably a screwed state, cannot do much...
|
||||
logging.warn('reached maximum retries (%s)...' % repr(arg))
|
||||
return
|
||||
return ex.errno
|
||||
time.sleep(0.250) # retry the call
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user