glusterfs/extras/rebalance.py
Nigel Babu 8297b909ab python3: Fix python3 compatibility
Updates: #411
Change-Id: I7c1eaa92cd4ac05c3667b760e0db2cddcfbbaed8
Signed-off-by: Nigel Babu <nigelb@redhat.com>
2018-09-24 03:43:39 +00:00

310 lines
12 KiB
Python
Executable File

#!/usr/bin/python3
from __future__ import print_function
import atexit
import copy
import optparse
import os
import pipes
import shutil
import string
import subprocess
import sys
import tempfile
import volfilter
import platform
# It's just more convenient to have named fields.
class Brick:
def __init__ (self, path, name):
self.path = path
self.sv_name = name
self.size = 0
self.curr_size = 0
self.good_size = 0
def set_size (self, size):
self.size = size
def set_range (self, rs, re):
self.r_start = rs
self.r_end = re
self.curr_size = self.r_end - self.r_start + 1
def __repr__ (self):
value = self.path[:]
value += "(%d," % self.size
if self.curr_size:
value += "0x%x,0x%x)" % (self.r_start, self.r_end)
else:
value += "-)"
return value
def get_bricks (host, vol):
t = pipes.Template()
t.prepend("gluster --remote-host=%s system getspec %s"%(host, vol), ".-")
return t.open(None, "r")
def generate_stanza (vf, all_xlators, cur_subvol):
sv_list = []
for sv in cur_subvol.subvols:
generate_stanza(vf, all_xlators, sv)
sv_list.append(sv.name)
vf.write("volume %s\n" % cur_subvol.name)
vf.write(" type %s\n" % cur_subvol.type)
for kvpair in cur_subvol.opts.items():
vf.write(" option %s %s\n" % kvpair)
if sv_list:
vf.write(" subvolumes %s\n" % ''.join(sv_list))
vf.write("end-volume\n\n")
def mount_brick (localpath, all_xlators, dht_subvol):
# Generate a volfile.
vf_name = localpath + ".vol"
vf = open(vf_name, "w")
generate_stanza(vf, all_xlators, dht_subvol)
vf.flush()
vf.close()
# Create a brick directory and mount the brick there.
os.mkdir(localpath)
subprocess.call(["glusterfs", "-f", vf_name, localpath])
# We use the command-line tools because there's no getxattr support in the
# Python standard library (which is ridiculous IMO). Adding the xattr package
# from PyPI would create a new and difficult dependency because the bits to
# satisfy it don't seem to exist in Fedora. We already expect the command-line
# tools to be there, so it's safer just to rely on them.
#
# We might have to revisit this if we get as far as actually issuing millions
# of setxattr requests. Even then, it might be better to do that part with a C
# program which has only a build-time dependency.
def get_range (brick):
t = pipes.Template()
cmd = "getfattr -e hex -n trusted.glusterfs.dht %s 2> /dev/null"
t.prepend(cmd%brick, ".-")
t.append("grep ^trusted.glusterfs.dht=", "--")
f = t.open(None, "r")
try:
value = f.readline().rstrip().split('=')[1][2:]
except:
print("could not get layout for %s (might be OK)" % brick)
return None
v_start = int("0x"+value[16:24], 16)
v_end = int("0x"+value[24:32], 16)
return (v_start, v_end)
def calc_sizes (bricks, total):
leftover = 1 << 32
for b in bricks:
if b.size:
b.good_size = (b.size << 32) / total
leftover -= b.good_size
else:
b.good_size = 0
if leftover:
# Add the leftover to an old brick if we can.
for b in bricks:
if b.good_size:
b.good_size += leftover
break
else:
# Fine, just add it wherever.
bricks[0].good_size += leftover
# Normalization means sorting the bricks by r_start and (b) ensuring that there
# are no gaps.
def normalize (in_bricks):
out_bricks = []
curr_hash = 0
used = 0
while curr_hash < (1<<32):
curr_best = None
for b in in_bricks:
if b.r_start == curr_hash:
used += 1
out_bricks.append(b)
in_bricks.remove(b)
curr_hash = b.r_end + 1
break
else:
print("gap found at 0x%08x" % curr_hash)
sys.exit(1)
return out_bricks + in_bricks, used
def get_score (bricks):
score = 0
curr_hash = 0
for b in bricks:
if not b.curr_size:
curr_hash += b.good_size
continue
new_start = curr_hash
curr_hash += b.good_size
new_end = curr_hash - 1
if new_start > b.r_start:
max_start = new_start
else:
max_start = b.r_start
if new_end < b.r_end:
min_end = new_end
else:
min_end = b.r_end
if max_start <= min_end:
score += (min_end - max_start + 1)
return score
if __name__ == "__main__":
my_usage = "%prog [options] server volume [directory]"
parser = optparse.OptionParser(usage=my_usage)
parser.add_option("-f", "--free-space", dest="free_space",
default=False, action="store_true",
help="use free space instead of total space")
parser.add_option("-l", "--leave-mounted", dest="leave_mounted",
default=False, action="store_true",
help="leave subvolumes mounted")
parser.add_option("-v", "--verbose", dest="verbose",
default=False, action="store_true",
help="verbose output")
options, args = parser.parse_args()
if len(args) == 3:
fix_dir = args[2]
else:
if len(args) != 2:
parser.print_help()
sys.exit(1)
fix_dir = None
hostname, volname = args[:2]
# Make sure stuff gets cleaned up, even if there are exceptions.
orig_dir = os.getcwd()
work_dir = tempfile.mkdtemp()
bricks = []
def cleanup_workdir ():
os.chdir(orig_dir)
if options.verbose:
print("Cleaning up %s" % work_dir)
for b in bricks:
subprocess.call(["umount", b.path])
shutil.rmtree(work_dir)
if not options.leave_mounted:
atexit.register(cleanup_workdir)
os.chdir(work_dir)
# Mount each brick individually, so we can issue brick-specific calls.
if options.verbose:
print("Mounting subvolumes...")
index = 0
volfile_pipe = get_bricks(hostname, volname)
all_xlators, last_xlator = volfilter.load(volfile_pipe)
for dht_vol in all_xlators.itervalues():
if dht_vol.type == "cluster/distribute":
break
else:
print("no DHT volume found")
sys.exit(1)
for sv in dht_vol.subvols:
#print "found subvol %s" % sv.name
lpath = "%s/brick%s" % (work_dir, index)
index += 1
mount_brick(lpath, all_xlators, sv)
bricks.append(Brick(lpath, sv.name))
if index == 0:
print("no bricks")
sys.exit(1)
# Collect all of the sizes.
if options.verbose:
print("Collecting information...")
total = 0
for b in bricks:
info = os.statvfs(b.path)
# On FreeBSD f_bsize (info[0]) contains the optimal I/O size,
# not the block size as it's found on Linux. In this case we
# use f_frsize (info[1]).
if platform.system() == 'FreeBSD':
bsize = info[1]
else:
bsize = info[0]
# We want a standard unit even if different bricks use
# different block sizes. The size is chosen to avoid overflows
# for very large bricks with very small block sizes, but also
# accommodate filesystems which use very large block sizes to
# cheat on benchmarks.
blocksper100mb = 104857600 / bsize
if options.free_space:
size = info[3] / blocksper100mb
else:
size = info[2] / blocksper100mb
if size <= 0:
print("brick %s has invalid size %d" % (b.path, size))
sys.exit(1)
b.set_size(size)
total += size
# Collect all of the layout information.
for b in bricks:
hash_range = get_range(b.path)
if hash_range is not None:
rs, re = hash_range
if rs > re:
print("%s has backwards hash range" % b.path)
sys.exit(1)
b.set_range(hash_range[0], hash_range[1])
if options.verbose:
print("Calculating new layouts...")
calc_sizes(bricks, total)
bricks, used = normalize(bricks)
# We can't afford O(n!) here, but O(n^2) should be OK and the result
# should be almost as good.
while used < len(bricks):
best_place = used
best_score = get_score(bricks)
for i in range(used):
new_bricks = bricks[:]
del new_bricks[used]
new_bricks.insert(i, bricks[used])
new_score = get_score(new_bricks)
if new_score > best_score:
best_place = i
best_score = new_score
if best_place != used:
nb = bricks[used]
del bricks[used]
bricks.insert(best_place, nb)
used += 1
# Finalize whatever we decided on.
curr_hash = 0
for b in bricks:
b.r_start = curr_hash
curr_hash += b.good_size
b.r_end = curr_hash - 1
print("Here are the xattr values for your size-weighted layout:")
for b in bricks:
print(" %s: 0x0000000200000000%08x%08x" % (
b.sv_name, b.r_start, b.r_end))
if fix_dir:
if options.verbose:
print("Fixing layout for %s" % fix_dir)
for b in bricks:
value = "0x0000000200000000%08x%08x" % (
b.r_start, b.r_end)
path = "%s/%s" % (b.path, fix_dir)
cmd = "setfattr -n trusted.glusterfs.dht -v %s %s" % (
value, path)
print(cmd)
if options.leave_mounted:
print("The following subvolumes are still mounted:")
for b in bricks:
print("%s on %s" % (b.sv_name, b.path))
print("Don't forget to clean up when you're done.")