Pranith Kumar K 2da6650dfa storage/posix: Add active-fd-count option in gluster
when dd happens on sharded replicate volume all the writes on shards happen
through anon-fd. When the writes don't come quick enough, old anon-fd closes
and new fd gets created to serve the new writes. open-fd-count is decremented
only after the fd is closed as part of fd_destroy(). So even when one fd is on
the way to be closed a new fd will be created and during this short period it
appears as though there are multiple fds opened on the file. AFR thinks another
application opened the same file and switches off eager-lock leading to
extra latency.

Have a different option called active-fd whose life cycle starts at
fd_bind() and ends just before fd_destroy()

BUG: 1557932
Change-Id: I2e221f6030feeedf29fbb3bd6554673b8a5b9c94
Signed-off-by: Pranith Kumar K <>
2018-03-21 10:36:31 +05:30

845 lines
21 KiB

# Print the value of one "<field>: <value>" line of `volume info` output.
#   $1 - volume name
#   $2 - field label exactly as it appears at the start of the line
function volinfo_field() {
    local vol=$1
    local field=$2

    $CLI volume info $vol | grep "^$field: " | sed 's/.*: //'
}
# Print the second column of the last line of `volume get <vol> <field>`.
#   $1 - volume name
#   $2 - option name to query
function volume_get_field() {
    local vol=$1
    local field=$2

    $CLI volume get $vol $field | tail -1 | awk '{print $2}'
}
# Print the number of "BrickN: ..." lines in `volume info` output.
#   $1 - volume name
function brick_count() {
    local vol=$1

    # grep -E replaces the deprecated egrep alias; the pattern is unchanged.
    $CLI volume info $vol | grep -E "^Brick[0-9]+: " | wc -l
}
# Count '<status>1' entries in `gluster --xml volume status`.
#   $1 - optional text to match; when given, only the 5 lines following each
#        matching line are inspected, otherwise the whole output is counted.
function check_brick_status() {
    local cmd="gluster --xml volume status"
    local daemon=$1

    # $cmd is deliberately unquoted: it holds a command plus its arguments.
    if [[ -z $daemon ]]; then
        echo $($cmd | grep '<status>1' | wc -l)
    else
        echo $($cmd | grep -A 5 ${daemon} | grep '<status>1' | wc -l)
    fi
}
# Print the number of online bricks: the total count of online entries minus
# the online Self-heal, Quota, Snapshot and Tier daemon entries.
function online_brick_count() {
    local v1=0
    local v2=0
    local v3=0
    local v4=0
    local v5=0
    local tot=0

    #First count total Number of bricks and then subtract daemon status
    v1=$(check_brick_status)
    v2=$(check_brick_status "Self-heal")
    v3=$(check_brick_status "Quota")
    v4=$(check_brick_status "Snapshot")
    v5=$(check_brick_status "Tier")
    tot=$((v1 - v2 - v3 - v4 - v5))
    echo $tot
}
# Print the 0/1 value of the <status> element reported for a single brick.
#   $1 - volume name
#   $2 - brick host
#   $3 - brick path
function brick_up_status {
    local vol=$1
    local host=$2
    local brick=$3

    $CLI volume status $vol $host:$brick --xml \
        | sed -ne 's/.*<status>\([01]\)<\/status>/\1/p'
}
# Print the value shown for "<key>: " in `volume info` output.
#   $1 - volume name
#   $2 - option key as printed at the start of the line
function volume_option() {
    local vol=$1
    local key=$2

    # grep -E replaces the deprecated egrep alias; the pattern is unchanged.
    $CLI volume info $vol | grep -E "^$key: " | cut -f2 -d' '
}
# Print column 7 of the third line of `volume rebalance <vol> status`.
#   $1 - volume name
function rebalance_status_field {
    $CLI volume rebalance $1 status | awk '{print $7}' | sed -n 3p
}
# Print the fix-layout status text from `volume rebalance <vol> status`.
#   $1 - volume name
function fix-layout_status_field {
    #The fix-layout status can be up to 3 words, (ex:'fix-layout in progress'), hence the awk-print $2 thru $4.
    #But if the status is less than 3 words, it also prints the next field i.e the run_time_in_secs.(ex:'completed 3.00').
    #So we trim the numbers out with `tr`. Finally remove the trailing white spaces with sed. What we get is one of the
    #strings in the 'cli_vol_task_status_str' char array of cli-rpc-ops.c
    $CLI volume rebalance $1 status | awk '{print $2,$3,$4}' |sed -n 3p |tr -d '[^0-9+\.]'|sed 's/ *$//g'
}
# Print column 7 of the fourth line of `volume tier <vol> detach status`.
#   $1 - volume name
function detach_tier_status_field_complete {
    $CLI volume tier $1 detach status | awk '{print $7}' | sed -n 4p
}
# Print column 7 of the third line of `volume remove-brick ... status`.
#   $1 - volume name
#   $2 - brick list; left unquoted below so it can expand to several bricks
function remove_brick_status_completed_field {
    local vol=$1
    local brick_list=$2

    $CLI volume remove-brick $vol $brick_list status | awk '{print $7}' | sed -n 3p
}
# Print the pid (ps column 2) of the first glusterfs process whose command
# line names the given volfile-id followed by the given mount point.
#   $1 - volume name
#   $2 - mount point (may be empty)
function get_mount_process_pid {
    local vol=$1
    local mnt=$2

    ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol .*$mnt" | awk '{print $2}' | head -1
}
# Print the pid (ps column 2) of the first process whose command line
# contains "volfile-id gluster/nfs".
function get_nfs_pid() {
    # Space and slash need no escaping in a grep pattern; the stray
    # backslashes only provoke warnings from newer GNU grep.
    ps auxww | grep "volfile-id gluster/nfs" | awk '{print $2}' | head -1
}
# Print the contents of the NFS server pidfile under $GLUSTERD_PIDFILEDIR.
# NOTE(review): the file name was missing from the path (it pointed at the
# nfs/ directory itself); nfs.pid is assumed - confirm against glusterd.
function read_nfs_pidfile() {
    echo $(cat $GLUSTERD_PIDFILEDIR/nfs/nfs.pid)
}
# Delete the statedump files of one process from $statedumpdir.
#   $1 - pid whose dumps are removed; when omitted, a $pid already set by
#        the caller is used instead
function cleanup_statedump {
    local pid=${1:-$pid}

    rm -f $statedumpdir/*$pid.dump.*
    #.vimrc friendly comment */
}
# Send SIGUSR1 to a process and print the path of the resulting statedump
# file in $statedumpdir.
#   $1 - pid to signal; when omitted, a $pid already set by the caller is used
function generate_statedump {
    local pid=${1:-$pid}
    local fname=""

    #remove old stale statedumps
    cleanup_statedump $pid
    kill -USR1 $pid
    #Wait till the statedump is generated
    sleep 1
    fname=$(ls $statedumpdir | grep -E "\.$pid\.dump\.")
    echo $statedumpdir/$fname
}
# Generate a statedump of the mount process for a volume and print its path.
#   $1 - volume name
#   $2 - mount point (optional)
function generate_mount_statedump {
    local vol=$1
    local mnt=$2

    generate_statedump $(get_mount_process_pid $vol $mnt)
}
# Remove the statedumps of the mount process serving a volume.
#   $1 - volume name
function cleanup_mount_statedump {
    local vol=$1

    cleanup_statedump $(get_mount_process_pid $vol)
}
# Take a mount statedump and print the value on the second line after the
# "<vol>-snapd-client.priv" section header; the dump file is deleted.
#   $1 - volume name
function snap_client_connected_status {
    local vol=$1
    local fpath=$(generate_mount_statedump $vol)
    local up

    up=$(grep -a -A2 xlator.protocol.client.$vol-snapd-client.priv $fpath | tail -1 | cut -f 2 -d'=')
    rm -f $fpath
    echo "$up"
}
# Print the value on the line preceding "child_<id>=<vol>-client-<id>" in a
# freshly generated statedump; the dump file is deleted afterwards.
#   $1 - volume name
#   $2 - brick id
#   $3 - name of the function that generates the statedump and prints its path
function _jbrc_child_up_status {
    local vol=$1
    #brick_id is (brick-num in volume info - 1)
    local brick_id=$2
    local gen_state_dump=$3
    local fpath=$($gen_state_dump $vol)
    local up

    up=$(grep -a -B1 child_$brick_id=$vol-client-$brick_id $fpath | head -1 | cut -f2 -d'=')
    rm -f $fpath
    echo "$up"
}
# _jbrc_child_up_status evaluated against a statedump of the mount process.
#   $1 - volume name
#   $2 - brick id
function jbrc_child_up_status {
    local vol=$1
    #brick_id is (brick-num in volume info - 1)
    local brick_id=$2

    _jbrc_child_up_status $vol $brick_id generate_mount_statedump
}
# Print the value on the line preceding "trusted.afr.<vol>-client-<id>" in a
# freshly generated statedump; the dump file is deleted afterwards.
#   $1 - volume name
#   $2 - brick id
#   $3 - name of the function that generates the statedump and prints its path
function _afr_child_up_status {
    local vol=$1
    #brick_id is (brick-num in volume info - 1)
    local brick_id=$2
    local gen_state_dump=$3
    local fpath=$($gen_state_dump $vol)
    local up

    up=$(grep -a -B1 trusted.afr.$vol-client-$brick_id $fpath | head -1 | cut -f2 -d'=')
    rm -f $fpath
    echo "$up"
}
# Print the third field of the "child_up[<child>]" line in a replicate
# xlator's private file under the mount's .meta tree.
#   $1 - mount point
#   $2 - xlator name
#   $3 - child index
function afr_child_up_status_meta {
    local mnt=$1
    local repl=$2
    local child=$3

    grep "child_up\[$child\]" $mnt/.meta/graphs/active/$repl/private | awk '{print $3}'
}
# Print the third field of the "connected" line in a client xlator's private
# file under the mount's .meta tree.
#   $1 - mount point
#   $2 - xlator name
function client_connected_status_meta {
    local mnt=$1
    local client=$2

    grep "connected" $mnt/.meta/graphs/active/$client/private | awk '{print $3}'
}
# _afr_child_up_status evaluated against a statedump of the mount process.
#   $1 - volume name
#   $2 - brick id
function afr_child_up_status {
    local vol=$1
    #brick_id is (brick-num in volume info - 1)
    local brick_id=$2

    _afr_child_up_status $vol $brick_id generate_mount_statedump
}
# Print the first "<key>=<value>" value found inside the
# [cluster/disperse.<vol>-disperse-<dist_id>] section of a statedump file,
# then delete that file.
#   $1 - volume name
#   $2 - disperse subvolume index
#   $3 - key to look up
#   $4 - statedump file path (removed before returning)
function ec_get_info {
    local vol=$1
    local dist_id=$2
    local key=$3
    local fpath=$4
    local value=$(sed -n "/^\[cluster\/disperse\.$vol-disperse-$dist_id\]/,/^\[/{s/^$key=\(.*\)/\1/p;}" $fpath | head -1)

    rm -f $fpath
    echo "$value"
}
# Print one character of the childs_up_mask taken from a mount statedump:
# the character (brick + 1) positions from the end of the mask.
#   $1 - volume name
#   $2 - disperse subvolume index
#   $3 - brick index (0-based)
#   $4 - mount point
function ec_child_up_status {
    local vol=$1
    local dist_id=$2
    local brick_id=$(($3 + 1))
    local mnt=$4
    local mask=$(ec_get_info $vol $dist_id "childs_up_mask" $(generate_mount_statedump $vol $mnt))

    echo "${mask: -$brick_id:1}"
}
# Print the "childs_up" value of a disperse subvolume from a mount statedump.
#   $1 - volume name
#   $2 - disperse subvolume index
#   $3 - mount point
function ec_child_up_count {
    local vol=$1
    local dist_id=$2
    local mnt=$3

    ec_get_info $vol $dist_id "childs_up" $(generate_mount_statedump $vol $mnt)
}
# Same as ec_child_up_status, but reads the mask from a self-heal daemon
# statedump instead of a mount statedump.
#   $1 - volume name
#   $2 - disperse subvolume index
#   $3 - brick index (0-based)
function ec_child_up_status_shd {
    local vol=$1
    local dist_id=$2
    local brick_id=$(($3 + 1))
    local mask=$(ec_get_info $vol $dist_id "childs_up_mask" $(generate_shd_statedump $vol))

    echo "${mask: -$brick_id:1}"
}
# Print the "childs_up" value of a disperse subvolume from a self-heal
# daemon statedump.
#   $1 - volume name
#   $2 - disperse subvolume index
function ec_child_up_count_shd {
    local vol=$1
    local dist_id=$2

    ec_get_info $vol $dist_id "childs_up" $(generate_shd_statedump $vol)
}
# Print the pid (ps column 2) of the first glusterfs process whose command
# line contains "glustershd/".
function get_shd_process_pid {
    ps auxww | grep glusterfs | grep -E "glustershd/" | awk '{print $2}' | head -1
}
# Generate a statedump of the self-heal daemon and print its path.
function generate_shd_statedump {
    generate_statedump $(get_shd_process_pid)
}
# Generate a statedump of the gluster NFS server process and print its path.
function generate_nfs_statedump {
    generate_statedump $(get_nfs_pid)
}
# Generate a statedump of one brick process and print its path.
#   $1 - volume name
#   $2 - brick host
#   $3 - brick path
function generate_brick_statedump {
    local vol=$1
    local host=$2
    local brick=$3

    generate_statedump $(get_brick_pid $vol $host $brick)
}
# _afr_child_up_status evaluated against a self-heal daemon statedump.
#   $1 - volume name
#   $2 - brick id
function afr_child_up_status_in_shd {
    local vol=$1
    #brick_id is (brick-num in volume info - 1)
    local brick_id=$2

    _afr_child_up_status $vol $brick_id generate_shd_statedump
}
# _afr_child_up_status evaluated against a gluster NFS server statedump.
#   $1 - volume name
#   $2 - brick id
function afr_child_up_status_in_nfs {
    local vol=$1
    #brick_id is (brick-num in volume info - 1)
    local brick_id=$2

    _afr_child_up_status $vol $brick_id generate_nfs_statedump
}
# Print column 7 of each "NFS Server" line of `gluster volume status`.
function nfs_up_status {
    gluster volume status | grep "NFS Server" | awk '{print $7}'
}
# Print column 7 of each "Self-heal Daemon" line of `gluster volume status`.
function glustershd_up_status {
    gluster volume status | grep "Self-heal Daemon" | awk '{print $7}'
}
# Print column 7 of each "Quota Daemon" line of `gluster volume status`.
function quotad_up_status {
    gluster volume status | grep "Quota Daemon" | awk '{print $7}'
}
# Print the pidfile path of a brick:
# $GLUSTERD_PIDFILEDIR/vols/<vol>/<host><brick path with '/' turned into '-'>.pid
#   $1 - volume name
#   $2 - brick host
#   $3 - brick path
function get_brick_pidfile {
    local vol=$1
    local host=$2
    local brick=$3
    # Replace every '/' with '-' without spawning echo | tr.
    local brick_hiphenated=${brick//\//-}

    echo $GLUSTERD_PIDFILEDIR/vols/$vol/${host}${brick_hiphenated}.pid
}
# Print the contents of a brick's pidfile.
# Arguments are passed through to get_brick_pidfile (volume, host, brick).
function get_brick_pid {
    cat $(get_brick_pidfile "$@")
}
# Detach a brick from its process with `gf_attach -d`, using the *.socket
# argument found on the command line of the pid recorded in the pidfile.
#   $1 - volume name
#   $2 - brick host
#   $3 - brick path
function kill_brick {
    local vol=$1
    local host=$2
    local brick=$3

    local pidfile=$(get_brick_pidfile $vol $host $brick)
    local cmdline="/proc/$(cat $pidfile)/cmdline"
    # /proc/<pid>/cmdline is NUL-separated; split it into one argument per line.
    local socket=$(cat $cmdline | tr '\0' '\n' | grep '\.socket$')

    gf_attach -d $socket $brick
    # When the last brick in a process is terminated, the process has to
    # sleep for a second to give the RPC response a chance to get back to
    # GlusterD.  Without that, we get random failures in tests that use
    # "volume stop" whenever the process termination is observed before the
    # RPC response.  However, that same one-second sleep can cause other
    # random failures in tests that assume a brick will already be gone
    # before "gf_attach -d" returns.  There are too many of those to fix,
    # so we compensate by putting the same one-second sleep here.
    sleep 1
}
# Print the "Option:" line(s) of `volume set help` that contain the given
# option as a whole word; prints nothing when there is none.
#   $1 - option name
function check_option_help_presence {
    local option=$1

    $CLI volume set help | grep "^Option:" | grep -w $option
}
# Print the hex value of an AFR changelog xattr on a file; when the xattr
# cannot be read, print an all-zero value instead.
#   $1 - file path
#   $2 - xattr key
function afr_get_changelog_xattr {
    local file=$1
    local xkey=$2
    local xval=$(getfattr -n $xkey -e hex $file 2>/dev/null | grep "$xkey" | cut -f2 -d'=')

    if [ -z "$xval" ]; then
        # Missing xattr: report "0x" followed by 24 zero hex digits.
        xval="0x000000000000000000000000"
    fi
    echo $xval
}
# Print the sum of all "Number of entries" counts from `volume heal info`.
#   $1 - volume name
function get_pending_heal_count {
    local vol=$1

    gluster volume heal $vol info | grep "Number of entries" | awk '{ sum+=$4} END {print sum}'
}
# Print the sum of all "Number of entries in split-brain" counts from
# `volume heal info split-brain`.
#   $1 - volume name
function afr_get_split_brain_count {
    local vol=$1

    gluster volume heal $vol info split-brain | grep "Number of entries in split-brain" | awk '{ sum+=$6} END {print sum}'
}
# Print the xattrop index directory of a brick.
#   $1 - brick path
function afr_get_index_path {
    local brick_path=$1

    echo "$brick_path/.glusterfs/indices/xattrop"
}
# Print how many entries in a brick's xattrop index directory do not have
# "xattrop" in their name.
#   $1 - brick path
function afr_get_num_indices_in_brick {
    local brick_path=$1

    echo $(ls $(afr_get_index_path $brick_path) | grep -v xattrop | wc -l)
}
# Print the hex value of the trusted.gfid xattr of a file.
#   $1 - file path
function gf_get_gfid_xattr {
    local file=$1

    getfattr -n trusted.gfid -e hex $file 2>/dev/null | grep "trusted.gfid" | cut -f2 -d'='
}
# Convert a "0x"-prefixed 32-digit hex gfid into dashed 8-4-4-4-12 form.
#   $1 - hex gfid including the leading 0x
function gf_gfid_xattr_to_str {
    local xval=$1

    # Offsets start at 2 to skip the "0x" prefix.
    echo "${xval:2:8}-${xval:10:4}-${xval:14:4}-${xval:18:4}-${xval:22:12}"
}
# Print the value of the xattr line(s) matching a key, using text encoding.
#   $1 - key (used as a grep pattern)
#   $2 - path
function get_text_xattr {
    local key=$1
    local path=$2

    getfattr -h -d -m. -e text $path 2>/dev/null | grep -a $key | cut -f2 -d'='
}
# Print the raw value of the glusterfs.gfidtopath virtual xattr of a path.
#   $1 - path
function get_gfid2path {
    local path=$1

    getfattr -h --only-values -n glusterfs.gfidtopath $path 2>/dev/null
}
# Print the name part (left of '=') of the xattr line(s) matching a key.
#   $1 - key (used as a grep pattern)
#   $2 - path
function get_xattr_key {
    local key=$1
    local path=$2

    getfattr -h -d -m. -e text $path 2>/dev/null | grep -a $key | cut -f1 -d'='
}
# Print "Y" when the brick process has an open fd whose listing ends with
# the given path, otherwise "N".
#   $1 - volume name
#   $2 - brick host
#   $3 - brick path
#   $4 - path of the file as seen in the brick process' fd listing
function gf_check_file_opened_in_brick {
    local vol=$1
    local host=$2
    local brick=$3
    local realpath=$4

    # Redirect stdout first, then stderr, so grep stays completely silent.
    if ls -l /proc/$(get_brick_pid $vol $host $brick)/fd | grep "${realpath}$" > /dev/null 2>&1; then
        echo "Y"
    else
        echo "N"
    fi
}
# Print the .glusterfs/<aa>/<bb>/<gfid> backend path for a file in a brick.
#   $1 - brick path
#   $2 - file path relative to the brick
function gf_get_gfid_backend_file_path {
    local brickpath=$1
    local filepath_in_brick=$2
    local gfid
    local gfidstr

    gfid=$(gf_get_gfid_xattr "$brickpath/$filepath_in_brick")
    gfidstr=$(gf_gfid_xattr_to_str $gfid)
    echo "$brickpath/.glusterfs/${gfidstr:0:2}/${gfidstr:2:2}/$gfidstr"
}
# Remove a file from a brick together with its .glusterfs gfid link.
#   $1 - brick path
#   $2 - file path relative to the brick
function gf_rm_file_and_gfid_link {
    local brickpath=$1
    local filepath_in_brick=$2

    # The gfid link must go first: its path is derived from the file itself.
    rm -f $(gf_get_gfid_backend_file_path $brickpath $filepath_in_brick)
    rm -f "$brickpath/$filepath_in_brick"
}
# Print "Y" when `volume replace-brick ... status` reports "Migration
# complete" (case-insensitive), otherwise "N". The matching line itself is
# also printed, before the verdict.
#   $1 - host (not used by the command)
#   $2 - volume name
#   $3 - source brick
#   $4 - destination brick
function gd_is_replace_brick_completed {
    local host=$1
    local vol=$2
    local src_brick=$3
    local dst_brick=$4

    if $CLI volume replace-brick $vol $src_brick $dst_brick status | grep -i "Migration complete"; then
        echo "Y"
    else
        echo "N"
    fi
}
# Print the hex value of the trusted.glusterfs.dht xattr of a path.
#   $1 - path
function dht_get_layout {
    local my_xa=trusted.glusterfs.dht

    getfattr -d -e hex -n $my_xa $1 2> /dev/null | grep "$my_xa=" | cut -d= -f2
}
# Print one 8-hex-digit counter of an AFR changelog xattr.
#   $1 - file path
#   $2 - xattr key
#   $3 - "data", "metadata" or "entry"; anything else prints "error"
# NOTE(review): the counter offsets (data, metadata, entry in that order
# after the "0x" prefix) were reconstructed - confirm against the AFR
# changelog layout.
function afr_get_specific_changelog_xattr() {
    local path=$1
    local key=$2
    local type=$3
    local specific_changelog=""
    local changelog_xattr

    changelog_xattr=$(afr_get_changelog_xattr "$path" "$key")
    if [ "$type" == "data" ]; then
        specific_changelog=${changelog_xattr:2:8}
    elif [ "$type" == "metadata" ]; then
        specific_changelog=${changelog_xattr:10:8}
    elif [ "$type" == "entry" ]; then
        specific_changelog=${changelog_xattr:18:8}
    else
        specific_changelog="error"
    fi

    echo $specific_changelog
}
# query pathinfo xattr and extract POSIX pathname(s)
#   $1 - path on the mount
function get_backend_paths {
    local path=$1

    # One whitespace-separated token per line, then keep what follows the
    # second ':' inside each <POSIX...> token.
    getfattr -m . -n trusted.glusterfs.pathinfo $path | tr ' ' '\n' | sed -n 's/<POSIX.*:.*:\(.*\)>.*/\1/p'
}
#Gets the xattr value in hex, also removed 0x in front of the value
#   $1 - key (used as a grep pattern)
#   $2 - path
function get_hex_xattr {
    local key=$1
    local path=$2

    getfattr -d -m. -e hex $path 2>/dev/null | grep $key | cut -f2 -d'=' | cut -f2 -d'x'
}
# Count the lines of the given text that contain "Cumulative Stats:".
#   $1 - text to scan
function cumulative_stat_count {
    echo "$1" | grep "Cumulative Stats:" | wc -l
}
# Count the lines of the given text matching "Interval<$2>Stats:".
#   $1 - text to scan
#   $2 - pattern placed between "Interval" and "Stats:" (include any spaces)
function incremental_stat_count {
    echo "$1" | grep "Interval${2}Stats:" | wc -l
}
# Count the lines of the given text that match "Cleared stats.".
#   $1 - text to scan
function cleared_stat_count {
    echo "$1" | grep "Cleared stats." | wc -l
}
# Count the lines of the given text matching "Data Read:<$2>bytes".
#   $1 - text to scan
#   $2 - pattern placed between "Data Read:" and "bytes" (include any spaces)
function data_read_count {
    echo "$1" | grep "Data Read:${2}bytes" | wc -l
}
# Count the lines of the given text matching "Data Written:<$2>bytes".
#   $1 - text to scan
#   $2 - pattern placed between "Data Written:" and "bytes" (include any spaces)
function data_written_count {
    echo "$1" | grep "Data Written:${2}bytes" | wc -l
}
# Print "1" when a file occupies fewer bytes on disk than its apparent size
# (allocated blocks * block size < size), otherwise "0".
#   $1 - file path
function has_holes {
    # stat emits the expression "<blocks>*<blocksize>-<size>" for $(( )).
    if [ $(($(stat -c '%b*%B-%s' $1))) -lt 0 ]; then
        echo "1"
    else
        echo "0"
    fi
}
# Run `volume <operation>` on several volumes in parallel and wait for all
# of them. Volume i (1-based) is issued through the command stored in the
# variable CLI_<i> against the volume named by the variable V<i-1>.
#   $1 - operation (start, stop, delete, ...)
#   $2 - number of volumes
#   $3 - optional extra word appended to the command (e.g. force)
# NOTE(review): the CLI_<i> naming is reconstructed from the indirect
# ${!cli} expansion - confirm against the definitions of the CLI variables.
function do_volume_operations() {
    local operation=$1
    local count=$2
    local force=$3

    local pids=()
    local cli
    local v
    local i

    for ((i = 1; i <= count; i++)); do
        cli="CLI_$i"
        v="V$((i - 1))"
        # $force stays unquoted so it disappears entirely when empty.
        ${!cli} volume $operation ${!v} $force &
        pids[$i]=$!
    done

    for ((i = 1; i <= count; i++)); do
        wait ${pids[$i]}
    done
}
# Start the first $1 volumes in parallel.
function start_volumes() {
    do_volume_operations start $1
}
# Stop the first $1 volumes in parallel.
function stop_volumes() {
    do_volume_operations stop $1
}
# Start the first $1 volumes in parallel, passing "force".
function start_force_volumes() {
    do_volume_operations start $1 force
}
# Stop the first $1 volumes in parallel, passing "force".
function stop_force_volumes() {
    do_volume_operations stop $1 force
}
# Delete the first $1 volumes in parallel.
function delete_volumes() {
    do_volume_operations delete $1
}
# Print "Y" when `volume info <vol>` succeeds, otherwise "N".
#   $1 - volume name
function volume_exists() {
    if $CLI volume info $1 > /dev/null 2>&1; then
        echo "Y"
    else
        echo "N"
    fi
}
# Kill every process matching "gluster" and remove all *.pid files found
# under $GLUSTERD_PIDFILEDIR, then pause for a second.
function killall_gluster() {
    pkill gluster
    find $GLUSTERD_PIDFILEDIR -name '*.pid' | xargs rm -f
    sleep 1
}
# Count the entries in a brick's xattrop index directory whose name does
# not contain "xattrop".
#   $1 - brick path
function afr_get_index_count {
    local brick=$1

    ls $brick/.glusterfs/indices/xattrop | grep -v xattrop | wc -l
}
# Count the entries in a brick's .glusterfs/landfill directory.
#   $1 - brick path
function landfill_entry_count {
    local brick=$1

    ls $brick/.glusterfs/landfill | wc -l
}
# Print "Y" when stat succeeds on the path, otherwise "N". stat's own
# output is not suppressed, so the verdict is the last line printed.
#   $1 - path
function path_exists {
    if stat $1; then
        echo "Y"
    else
        echo "N"
    fi
}
# Print the size in bytes of a path, or an empty line when stat fails.
#   $1 - path
function path_size {
    # Declared separately so the status of stat is what gets tested;
    # `local size=$(...)` would always report success.
    local size

    if size=$(stat -c %s $1); then
        echo $size
    else
        echo ""
    fi
}
# Run the forced-unmount command held in $UMOUNT_F with the given arguments
# and print "Y" on success, "N" on failure.
function force_umount {
    # ${UMOUNT_F} is unquoted on purpose: it holds a command plus options.
    if ${UMOUNT_F} "$@"; then
        echo "Y"
    else
        echo "N"
    fi
}
# Set the trusted.gfid xattr of a file.
#   $1 - gfid value handed to setfattr -v
#   $2 - file path
function assign_gfid {
    local gfid=$1
    local file=$2

    setfattr -n trusted.gfid -v $gfid $file
}
# Print a fresh uuid from uuidgen as "0x" followed by its hex digits with
# the dashes removed.
function get_random_gfid {
    echo "0x"$(uuidgen | tr -d '-')
}
# Print "Y" when a volfile has a blank-line-separated stanza containing
# both "volume <xl_vol>" and "type <xl_type>/<xl_feature>", otherwise "N".
#   $1 - volfile path
#   $2 - xlator volume name
#   $3 - xlator type (e.g. cluster)
#   $4 - xlator feature (e.g. distribute)
function volgen_volume_exists {
    local volfile="$1"
    local xl_vol="$2"
    local xl_type="$3"
    local xl_feature="$4"
    local xl

    # First sed script collects each stanza into the hold space; the second
    # drops stanzas missing either pattern.
    xl=$(sed -e "/./{H;\$!d;}" -e "x;/volume $xl_vol/!d;/type $xl_type\/$xl_feature/!d" $volfile)
    if [ -z "$xl" ]; then
        echo "N"
    else
        echo "Y"
    fi
}
# Print the value (third field) of "option <xl_option> <value>" from the
# volfile stanza matching the given volume name and type/feature.
#   $1 - volfile path
#   $2 - xlator volume name
#   $3 - xlator type
#   $4 - xlator feature
#   $5 - option name
function volgen_volume_option {
    local volfile="$1"
    local xl_vol="$2"
    local xl_type="$3"
    local xl_feature="$4"
    local xl_option="$5"

    sed -e "/./{H;\$!d;}" -e "x;/volume $xl_vol/!d;/type $xl_type\/$xl_feature/!d;/option $xl_option/!d" $volfile \
        | grep " $xl_option " | awk '{print $3}'
}
# Print the third field of the line matching a key (as a whole word) in an
# xlator's private file under the mount's .meta tree.
#   $1 - mount point
#   $2 - xlator name
#   $3 - key
function mount_get_option_value {
    local m=$1
    local subvol=$2
    local key=$3

    grep -w "$key" $m/.meta/graphs/active/$subvol/private | awk '{print $3}'
}
# Print hex characters 5-36 of the trusted.glusterfs.volume-mark xattr,
# formatted with dashes as 8-4-4-4-12.
#   $1 - path
function get_volume_mark {
    getfattr -n trusted.glusterfs.volume-mark -ehex $1 \
        | sed -n 's/^trusted.glusterfs.volume-mark=0x//p' \
        | cut -b5-36 \
        | sed 's/\([a-f0-9]\{8\}\)\([a-f0-9]\{4\}\)\([a-f0-9]\{4\}\)\([a-f0-9]\{4\}\)/\1-\2-\3-\4-/'
}
# setup geo-rep in a single node.
# Creates and starts the volumes $GMV0 and $GSV0 (replica 2, four bricks
# each under $H0:$B0), then creates and starts a geo-rep session from
# $GMV0 to $H0::$GSV0.
function setup_georep {
    $CLI volume create $GMV0 replica 2 $H0:$B0/${GMV0}{1,2,3,4}
    $CLI volume start $GMV0

    $CLI volume create $GSV0 replica 2 $H0:$B0/${GSV0}{1,2,3,4}
    $CLI volume start $GSV0

    $CLI system:: execute gsec_create

    $CLI volume geo-rep $GMV0 $H0::$GSV0 create push-pem
    $CLI volume geo-rep $GMV0 $H0::$GSV0 start

    sleep 80 # after start geo-rep takes a minute to get stable
}
# stop and delete geo-rep session
# Acts on the session from $GMV0 to $H0::$GSV0.
function cleanup_georep {
    $CLI volume geo-rep $GMV0 $H0::$GSV0 stop
    $CLI volume geo-rep $GMV0 $H0::$GSV0 delete
}
# Count the entries under <mountpoint>/.meta/graphs/ whose name does not
# contain "active".
#   $1 - mount point
function num_graphs {
    local mountpoint=$1

    echo $(ls $mountpoint/.meta/graphs/ | grep -v active | wc -l)
}
# Print "0" when an auxiliary mount is running, i.e. the pidfile
# <statedumpdir>/<V0><aux_suffix>.pid exists and its pid is among the pids
# reported by `pidof glusterfs`; print "1" otherwise.
#   $1 - suffix appended to $V0 to form the pidfile name
# NOTE(review): the pidfile name is reconstructed from a truncated line -
# confirm the "<V0><suffix>.pid" naming.
function get_aux() {
    ##Check if a auxiliary mount is there
    local aux_suffix=$1
    local rundir=$(gluster --print-statedumpdir)
    local pidfile="${rundir}/${V0}${aux_suffix}.pid"
    local pid

    if [ -f "$pidfile" ]; then
        # Read the same file that was just tested for existence.
        pid=$(cat "$pidfile")
        if [ -n "$pid" ] && pidof glusterfs 2>&1 | grep -w "$pid" > /dev/null; then
            echo "0"
        else
            echo "1"
        fi
    else
        echo "1"
    fi
}
# Report (via get_aux) whether the quota list aux mount is present.
function get_list_aux() {
    # check for quota list aux mount
    get_aux "_quota_list"
}
# Report (via get_aux) whether the quota limit aux mount is present.
function get_limit_aux() {
    # check for quota limit aux mount
    get_aux "_quota_limit"
}
# Print the xattr name when it is set on the file; print nothing otherwise.
#   $1 - xattr name
#   $2 - file path
function check_for_xattr {
    local xattr=$1
    local filepath=$2

    getfattr -n $xattr $filepath 2>/dev/null | grep "$xattr" | cut -f1 -d'='
}
# Count running glusterfs processes whose command line mentions bitd.pid.
# NOTE(review): the grep pattern was missing; "bitd.pid" is assumed -
# confirm against the bitrot daemon's pidfile name.
function get_bitd_count {
    ps auxww | grep glusterfs | grep 'bitd\.pid' | grep -v grep | wc -l
}
# Count running glusterfs processes whose command line mentions scrub.pid.
# NOTE(review): the grep pattern was missing; "scrub.pid" is assumed -
# confirm against the scrubber daemon's pidfile name.
function get_scrubd_count {
    ps auxww | grep glusterfs | grep 'scrub\.pid' | grep -v grep | wc -l
}
# Print the number of lines `ls -l` produces for a brick's quarantine
# directory (this includes ls's leading "total" line).
#   $1 - brick path
function get_quarantine_count {
    ls -l "$1/.glusterfs/quarantine" | wc -l
}
# Count running glusterfs processes whose command line mentions quotad.pid.
# NOTE(review): the grep pattern was missing; "quotad.pid" is assumed -
# confirm against the quota daemon's pidfile name.
function get_quotad_count {
    ps auxww | grep glusterfs | grep 'quotad\.pid' | grep -v grep | wc -l
}
# Count running glusterfs processes whose command line mentions nfs.pid.
# NOTE(review): the grep pattern was missing; "nfs.pid" is assumed -
# confirm against the NFS server's pidfile name.
function get_nfs_count {
    ps auxww | grep glusterfs | grep 'nfs\.pid' | grep -v grep | wc -l
}
# Count running glusterfs processes whose command line mentions snapd.pid.
# NOTE(review): the grep pattern was missing; "snapd.pid" is assumed -
# confirm against the snapshot daemon's pidfile name.
function get_snapd_count {
    ps auxww | grep glusterfs | grep 'snapd\.pid' | grep -v grep | wc -l
}
# Flush kernel caches. On Linux this writes 3 to /proc/sys/vm/drop_caches;
# elsewhere it attempts to unmount $1 from inside it.
#   $1 - mount point (only used on non-Linux systems)
# NOTE(review): the case labels were missing; "Linux" assumes $OSTYPE holds
# `uname -s` output - confirm where OSTYPE is set.
function drop_cache() {
    case $OSTYPE in
    Linux)
        echo 3 > /proc/sys/vm/drop_caches
        ;;
    *)
        # fail but flush caches
        ( cd $1 && umount $1 2>/dev/null )
        ;;
    esac
}
# Print one column of the `volume quota $V0 list <path>` line for a path.
#   $1 - quota path
#   $2 - 1-based column number
function quota_list_field() {
    local QUOTA_PATH=$1
    local FIELD=$2
    local awk_arg="{print \$$FIELD}"

    $CLI volume quota $V0 list $QUOTA_PATH | grep $QUOTA_PATH | awk "$awk_arg"
}
# Print one column of the `volume quota $V0 list-objects <path>` line for a
# path.
#   $1 - quota path
#   $2 - 1-based column number
function quota_object_list_field() {
    local QUOTA_PATH=$1
    local FIELD=$2
    local awk_arg="{print \$$FIELD}"

    $CLI volume quota $V0 list-objects $QUOTA_PATH | grep $QUOTA_PATH | awk "$awk_arg"
}
# Print column 4 of the quota list line for path $1.
function quotausage() {
    quota_list_field $1 4
}
# Print column 2 of the quota list line for path $1.
function quota_hard_limit() {
    quota_list_field $1 2
}
# Print column 3 of the quota list line for path $1.
function quota_soft_limit() {
    quota_list_field $1 3
}
# Print column 6 of the quota list line for path $1.
function quota_sl_exceeded() {
    quota_list_field $1 6
}
# Print column 7 of the quota list line for path $1.
function quota_hl_exceeded() {
    quota_list_field $1 7
}
# Print column 2 of the quota list-objects line for path $1.
function quota_object_hard_limit() {
    quota_object_list_field $1 2
}
# Print the value of one "<field>: <value>" line of
# `volume bitrot <vol> scrub status` output.
#   $1 - volume name
#   $2 - field label exactly as it appears at the start of the line
function scrub_status() {
    local vol=$1
    local field=$2

    $CLI volume bitrot $vol scrub status | grep "^$field: " | sed 's/.*: //'
}
# Print the text between the first pair of double quotes of the
# glusterfs.gfid.string xattr line reported for a path.
#   $1 - path
function get_gfid_string {
    local path=$1

    getfattr -n glusterfs.gfid.string $path 2>/dev/null \
        | grep glusterfs.gfid.string | cut -d '"' -f 2
}
# Print "1" when a file contains nothing but NUL bytes (or is empty);
# print nothing otherwise.
#   $1 - file path
function file_all_zeroes {
    # With every NUL stripped, read only succeeds if some other byte is left.
    tr -d '\0' < $1 | read -n 1 || echo 1
}
# Print the hard-link count of a path.
#   $1 - path
function get_hard_link_count {
    local path=$1

    stat -c %h $path
}
# Count the entries in a brick's xattrop index directory whose name does
# not contain "xattrop-".
#   $1 - brick path
function count_sh_entries() {
    ls $1/.glusterfs/indices/xattrop | grep -v "xattrop-" | wc -l
}
# Print the fd-count recorded for a file's inode in a fresh brick
# statedump; the dump file is deleted afterwards.
#   $1 - volume name
#   $2 - brick host
#   $3 - brick path
#   $4 - file name relative to the brick
function get_fd_count {
    local vol=$1
    local host=$2
    local brick=$3
    local fname=$4
    local gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $brick/$fname))
    local statedump=$(generate_brick_statedump $vol $host $brick)
    # Take the lines around the gfid entry, keep the group that mentions
    # this brick, and read the last fd-count value in it.
    local count=$(grep "gfid=$gfid_str" $statedump -A2 -B1 | grep $brick -A3 | grep -w fd-count | cut -f2 -d'=' | tail -1)

    rm -f $statedump
    echo $count
}
# Print the active-fd-count recorded for a file's inode in a fresh brick
# statedump; the dump file is deleted afterwards.
#   $1 - volume name
#   $2 - brick host
#   $3 - brick path
#   $4 - file name relative to the brick
function get_active_fd_count {
    local vol=$1
    local host=$2
    local brick=$3
    local fname=$4
    local gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $brick/$fname))
    local statedump=$(generate_brick_statedump $vol $host $brick)
    # Take the lines around the gfid entry, keep the group that mentions
    # this brick, and read the last active-fd-count value in it.
    local count=$(grep "gfid=$gfid_str" $statedump -A2 -B1 | grep $brick -A3 | grep -w active-fd-count | cut -f2 -d'=' | tail -1)

    rm -f $statedump
    echo $count
}
# Print the last "active_size" value found in a fresh mount statedump; the
# dump file is deleted afterwards.
#   $1 - volume name
function get_mount_active_size_value {
    local vol=$1
    local statedump=$(generate_mount_statedump $vol)

    sleep 1
    local val=$(grep "active_size" $statedump | cut -f2 -d'=' | tail -1)
    rm -f $statedump
    echo $val
}
# Print the last "lru_size" value found in a fresh mount statedump; the
# dump file is deleted afterwards.
#   $1 - volume name
function get_mount_lru_size_value {
    local vol=$1
    local statedump=$(generate_mount_statedump $vol)

    sleep 1
    local val=$(grep "lru_size" $statedump | cut -f2 -d'=' | tail -1)
    rm -f $statedump
    echo $val
}