2019-03-01 23:38:49 +03:00
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation .
*
* Example program for Host Bandwidth Management
*
* This program loads a cgroup skb BPF program to enforce cgroup output
* ( egress ) or input ( ingress ) bandwidth limits .
*
* USAGE : hbm [ - d ] [ - l ] [ - n < id > ] [ - - no_cn ] [ - r < rate > ] [ - s ] [ - t < secs > ] [ - w ] [ - h ] [ prog ]
* Where :
* - d Print BPF trace debug buffer
* - l Also limit flows doing loopback
* - n < # > To create cgroup "/hbm#" and attach prog
* Default is / hbm1
2019-05-29 02:59:39 +03:00
* - - no_cn Do not return cn notifications
2019-03-01 23:38:49 +03:00
* - r < rate > Rate limit in Mbps
* - s Get HBM stats ( marked , dropped , etc . )
2019-03-05 20:31:13 +03:00
* - t < time > Exit after specified seconds ( default is 1 )
2019-03-01 23:38:49 +03:00
* - w Work conserving flag . cgroup can increase its bandwidth
* beyond the rate limit specified while there is available
* bandwidth . The current implementation assumes there is only
* one NIC ( eth0 ) , but can be extended to support multiple NICs .
* Currently only supported for egress .
* - h Print this info
* prog BPF program file name . Name defaults to hbm_out_kern . o
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/unistd.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>
#include <getopt.h>

#include "bpf_load.h"
#include "bpf_rlimit.h"
#include "cgroup_helpers.h"
#include "hbm.h"
#include "bpf_util.h"
#include "bpf/bpf.h"
#include "bpf/libbpf.h"
/*
 * Run-time configuration.  main() fills these in from the command line and
 * run_bpf_prog() consumes them.
 */
bool outFlag = true;		/* true: egress; false selects the ingress attach type */
int minRate = 1000;		/* cgroup rate limit in Mbps */
int rate = 1000;		/* can grow if rate conserving is enabled */
int dur = 1;			/* run time in seconds (-t) */
bool stats_flag;		/* -s: collect stats and write the stats file */
bool loopback_flag;		/* -l: also limit loopback flows */
bool debugFlag;			/* -d: dump the BPF trace pipe after the run */
bool work_conserving_flag;	/* -w: adapt the limit to eth0 utilisation */
bool no_cn_flag;		/* --no_cn: copied into qstats.no_cn */

static void Usage(void);
static void read_trace_pipe2(void);
static void do_error(char *msg, bool errno_flag);

/*
 * Directory prefix of the tracing files.  It is pasted in front of a file
 * name by string-literal concatenation, so it must not contain any padding
 * and must end with a slash.
 */
#define DEBUGFS "/sys/kernel/debug/tracing/"

struct bpf_object *obj;		/* object filled in by bpf_prog_load_xattr() */
int bpfprog_fd;			/* fd of the loaded program, used for the attach */
int cgroup_storage_fd;		/* not referenced in the visible part of this file */
/*
 * Copy the kernel trace pipe to stdout and, when the log file can be
 * created, to hbm_out.log as well.
 *
 * Returns immediately if the trace pipe cannot be opened.  Otherwise the
 * function keeps reading until read() reports a hard error; an interrupted
 * or would-block read is simply retried.  Failure to create the log file is
 * reported but is not fatal: output then goes to stdout only.
 */
static void read_trace_pipe2(void)
{
	/* Full path spelled out here so the open does not depend on padding. */
	static const char trace_pipe_path[] =
		"/sys/kernel/debug/tracing/trace_pipe";
	/* One spare byte so the data can always be NUL-terminated. */
	static char buf[4097];
	/* Ingress ("hbm_in.log") is not supported yet; only egress is logged. */
	const char *out_fname = "hbm_out.log";
	FILE *outf;
	int trace_fd;

	trace_fd = open(trace_pipe_path, O_RDONLY, 0);
	if (trace_fd < 0) {
		printf(" Error opening trace_pipe \n ");
		return;
	}

	outf = fopen(out_fname, "w");
	if (outf == NULL)
		printf(" Error creating %s \n ", out_fname);

	for (;;) {
		ssize_t sz = read(trace_fd, buf, sizeof(buf) - 1);

		if (sz > 0) {
			buf[sz] = 0;
			puts(buf);
			if (outf != NULL) {
				fprintf(outf, " %s \n ", buf);
				fflush(outf);
			}
			continue;
		}
		if (sz == 0 || errno == EINTR || errno == EAGAIN)
			continue;

		/* Hard error: retrying would only spin at full CPU. */
		printf("Error reading trace_pipe, errno: %d\n", errno);
		break;
	}

	if (outf != NULL)
		fclose(outf);
	close(trace_fd);
}
/*
 * Print a fatal error message on stdout and terminate with exit status 1.
 *
 * msg:        text describing what failed
 * errno_flag: when true, the current errno value is appended to the message
 *
 * Never returns.
 */
static void do_error(char *msg, bool errno_flag)
{
	if (!errno_flag) {
		printf(" ERROR: %s \n ", msg);
		exit(1);
	}

	printf(" ERROR: %s, errno: %d \n ", msg, errno);
	exit(1);
}
/*
 * Load the cgroup-skb BPF object file `prog` and locate its "queue_stats"
 * map.
 *
 * On success the globals `obj` and `bpfprog_fd` are filled in by
 * bpf_prog_load_xattr() and the map's file descriptor is returned.
 * Every failure returns -1, so that a caller can never mistake an error
 * code for a usable descriptor.
 */
static int prog_load(char *prog)
{
	struct bpf_prog_load_attr prog_load_attr = {
		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
		.file = prog,
		.expected_attach_type = BPF_CGROUP_INET_EGRESS,
	};
	struct bpf_map *map;
	int map_fd;

	/* R_OK asks for read permission, not merely for existence. */
	if (access(prog, R_OK) < 0) {
		printf(" Error accessing file %s: %s \n ", prog, strerror(errno));
		return -1;
	}

	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd))
		goto fail;

	map = bpf_object__find_map_by_name(obj, "queue_stats");
	map_fd = map ? bpf_map__fd(map) : -ENOENT;
	if (map_fd < 0) {
		/* map_fd holds a negated errno value here. */
		printf(" Map not found: %s \n ", strerror(-map_fd));
		goto fail;
	}

	return map_fd;

fail:
	printf(" ERROR: load_bpf_file failed for: %s \n ", prog);
	printf(" Output from verifier: \n %s \n ------ \n ", bpf_log_buf);
	return -1;
}
static int run_bpf_prog ( char * prog , int cg_id )
{
int map_fd ;
int rc = 0 ;
int key = 0 ;
int cg1 = 0 ;
int type = BPF_CGROUP_INET_EGRESS ;
char cg_dir [ 100 ] ;
struct hbm_queue_stats qstats = { 0 } ;
sprintf ( cg_dir , " /hbm%d " , cg_id ) ;
map_fd = prog_load ( prog ) ;
if ( map_fd = = - 1 )
return 1 ;
if ( setup_cgroup_environment ( ) ) {
printf ( " ERROR: setting cgroup environment \n " ) ;
goto err ;
}
cg1 = create_and_get_cgroup ( cg_dir ) ;
if ( ! cg1 ) {
printf ( " ERROR: create_and_get_cgroup \n " ) ;
goto err ;
}
if ( join_cgroup ( cg_dir ) ) {
printf ( " ERROR: join_cgroup \n " ) ;
goto err ;
}
qstats . rate = rate ;
qstats . stats = stats_flag ? 1 : 0 ;
qstats . loopback = loopback_flag ? 1 : 0 ;
2019-05-29 02:59:39 +03:00
qstats . no_cn = no_cn_flag ? 1 : 0 ;
2019-03-01 23:38:49 +03:00
if ( bpf_map_update_elem ( map_fd , & key , & qstats , BPF_ANY ) ) {
printf ( " ERROR: Could not update map element \n " ) ;
goto err ;
}
if ( ! outFlag )
type = BPF_CGROUP_INET_INGRESS ;
if ( bpf_prog_attach ( bpfprog_fd , cg1 , type , 0 ) ) {
printf ( " ERROR: bpf_prog_attach fails! \n " ) ;
log_err ( " Attaching prog " ) ;
goto err ;
}
if ( work_conserving_flag ) {
struct timeval t0 , t_last , t_new ;
FILE * fin ;
unsigned long long last_eth_tx_bytes , new_eth_tx_bytes ;
signed long long last_cg_tx_bytes , new_cg_tx_bytes ;
signed long long delta_time , delta_bytes , delta_rate ;
int delta_ms ;
# define DELTA_RATE_CHECK 10000 /* in us */
# define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */
bpf_map_lookup_elem ( map_fd , & key , & qstats ) ;
if ( gettimeofday ( & t0 , NULL ) < 0 )
do_error ( " gettimeofday failed " , true ) ;
t_last = t0 ;
fin = fopen ( " /sys/class/net/eth0/statistics/tx_bytes " , " r " ) ;
if ( fscanf ( fin , " %llu " , & last_eth_tx_bytes ) ! = 1 )
do_error ( " fscanf fails " , false ) ;
fclose ( fin ) ;
last_cg_tx_bytes = qstats . bytes_total ;
while ( true ) {
usleep ( DELTA_RATE_CHECK ) ;
if ( gettimeofday ( & t_new , NULL ) < 0 )
do_error ( " gettimeofday failed " , true ) ;
delta_ms = ( t_new . tv_sec - t0 . tv_sec ) * 1000 +
( t_new . tv_usec - t0 . tv_usec ) / 1000 ;
if ( delta_ms > dur * 1000 )
break ;
delta_time = ( t_new . tv_sec - t_last . tv_sec ) * 1000000 +
( t_new . tv_usec - t_last . tv_usec ) ;
if ( delta_time = = 0 )
continue ;
t_last = t_new ;
fin = fopen ( " /sys/class/net/eth0/statistics/tx_bytes " ,
" r " ) ;
if ( fscanf ( fin , " %llu " , & new_eth_tx_bytes ) ! = 1 )
do_error ( " fscanf fails " , false ) ;
fclose ( fin ) ;
printf ( " new_eth_tx_bytes:%llu \n " ,
new_eth_tx_bytes ) ;
bpf_map_lookup_elem ( map_fd , & key , & qstats ) ;
new_cg_tx_bytes = qstats . bytes_total ;
delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes ;
last_eth_tx_bytes = new_eth_tx_bytes ;
delta_rate = ( delta_bytes * 8000000 ) / delta_time ;
printf ( " %5d - eth_rate:%.1fGbps cg_rate:%.3fGbps " ,
delta_ms , delta_rate / 1000000000.0 ,
rate / 1000.0 ) ;
if ( delta_rate < RATE_THRESHOLD ) {
/* can increase cgroup rate limit, but first
* check if we are using the current limit .
* Currently increasing by 6.25 % , unknown
* if that is the optimal rate .
*/
int rate_diff100 ;
delta_bytes = new_cg_tx_bytes -
last_cg_tx_bytes ;
last_cg_tx_bytes = new_cg_tx_bytes ;
delta_rate = ( delta_bytes * 8000000 ) /
delta_time ;
printf ( " rate:%.3fGbps " ,
delta_rate / 1000000000.0 ) ;
rate_diff100 = ( ( ( long long ) rate ) * 1000000 -
delta_rate ) * 100 /
( ( ( long long ) rate ) * 1000000 ) ;
printf ( " rdiff:%d " , rate_diff100 ) ;
if ( rate_diff100 < = 3 ) {
rate + = ( rate > > 4 ) ;
if ( rate > RATE_THRESHOLD / 1000000 )
rate = RATE_THRESHOLD / 1000000 ;
qstats . rate = rate ;
printf ( " INC \n " ) ;
} else {
printf ( " \n " ) ;
}
} else {
/* Need to decrease cgroup rate limit.
* Currently decreasing by 12.5 % , unknown
* if that is optimal
*/
printf ( " DEC \n " ) ;
rate - = ( rate > > 3 ) ;
if ( rate < minRate )
rate = minRate ;
qstats . rate = rate ;
}
if ( bpf_map_update_elem ( map_fd , & key , & qstats , BPF_ANY ) )
do_error ( " update map element fails " , false ) ;
}
} else {
sleep ( dur ) ;
}
// Get stats!
if ( stats_flag & & bpf_map_lookup_elem ( map_fd , & key , & qstats ) ) {
char fname [ 100 ] ;
FILE * fout ;
if ( ! outFlag )
sprintf ( fname , " hbm.%d.in " , cg_id ) ;
else
sprintf ( fname , " hbm.%d.out " , cg_id ) ;
fout = fopen ( fname , " w " ) ;
fprintf ( fout , " id:%d \n " , cg_id ) ;
fprintf ( fout , " ERROR: Could not lookup queue_stats \n " ) ;
} else if ( stats_flag & & qstats . lastPacketTime >
qstats . firstPacketTime ) {
long long delta_us = ( qstats . lastPacketTime -
qstats . firstPacketTime ) / 1000 ;
unsigned int rate_mbps = ( ( qstats . bytes_total -
qstats . bytes_dropped ) * 8 /
delta_us ) ;
double percent_pkts , percent_bytes ;
char fname [ 100 ] ;
FILE * fout ;
// Future support of ingress
// if (!outFlag)
// sprintf(fname, "hbm.%d.in", cg_id);
// else
sprintf ( fname , " hbm.%d.out " , cg_id ) ;
fout = fopen ( fname , " w " ) ;
fprintf ( fout , " id:%d \n " , cg_id ) ;
fprintf ( fout , " rate_mbps:%d \n " , rate_mbps ) ;
fprintf ( fout , " duration:%.1f secs \n " ,
( qstats . lastPacketTime - qstats . firstPacketTime ) /
1000000000.0 ) ;
fprintf ( fout , " packets:%d \n " , ( int ) qstats . pkts_total ) ;
fprintf ( fout , " bytes_MB:%d \n " , ( int ) ( qstats . bytes_total /
1000000 ) ) ;
fprintf ( fout , " pkts_dropped:%d \n " , ( int ) qstats . pkts_dropped ) ;
fprintf ( fout , " bytes_dropped_MB:%d \n " ,
( int ) ( qstats . bytes_dropped /
1000000 ) ) ;
// Marked Pkts and Bytes
percent_pkts = ( qstats . pkts_marked * 100.0 ) /
( qstats . pkts_total + 1 ) ;
percent_bytes = ( qstats . bytes_marked * 100.0 ) /
( qstats . bytes_total + 1 ) ;
fprintf ( fout , " pkts_marked_percent:%6.2f \n " , percent_pkts ) ;
fprintf ( fout , " bytes_marked_percent:%6.2f \n " , percent_bytes ) ;
// Dropped Pkts and Bytes
percent_pkts = ( qstats . pkts_dropped * 100.0 ) /
( qstats . pkts_total + 1 ) ;
percent_bytes = ( qstats . bytes_dropped * 100.0 ) /
( qstats . bytes_total + 1 ) ;
fprintf ( fout , " pkts_dropped_percent:%6.2f \n " , percent_pkts ) ;
fprintf ( fout , " bytes_dropped_percent:%6.2f \n " , percent_bytes ) ;
fclose ( fout ) ;
}
if ( debugFlag )
read_trace_pipe2 ( ) ;
return rc ;
err :
rc = 1 ;
if ( cg1 )
close ( cg1 ) ;
cleanup_cgroup_environment ( ) ;
return rc ;
}
/*
 * Print the command-line help text on stdout.
 * The documented -t default matches the initial value of `dur` (1 second).
 */
static void Usage(void)
{
	printf(" This program loads a cgroup skb BPF program to enforce \n "
	       " cgroup output (egress) bandwidth limits. \n \n "
	       " USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>] \n "
	       " [-s] [-t <secs>] [-w] [-h] [prog] \n "
	       " Where: \n "
	       " -o indicates egress direction (default) \n "
	       " -d print BPF trace debug buffer \n "
	       " -l also limit flows using loopback \n "
	       " -n <#> to create cgroup \" /hbm# \" and attach prog \n "
	       " Default is /hbm1 \n "
	       " --no_cn disable CN notifications \n "
	       " -r <rate> Rate in Mbps \n "
	       " -s Update HBM stats \n "
	       " -t <time> Exit after specified seconds (default is 1) \n "
	       " -w Work conserving flag. cgroup can increase \n "
	       " bandwidth beyond the rate limit specified \n "
	       " while there is available bandwidth. Current \n "
	       " implementation assumes there is only eth0 \n "
	       " but can be extended to support multiple NICs \n "
	       " -h print this info \n "
	       " prog BPF program file name. Name defaults to \n "
	       " hbm_out_kern.o \n ");
}
int main ( int argc , char * * argv )
{
char * prog = " hbm_out_kern.o " ;
int k ;
int cg_id = 1 ;
char * optstring = " iodln:r:st:wh " ;
2019-05-29 02:59:39 +03:00
struct option loptions [ ] = {
{ " no_cn " , 0 , NULL , 1 } ,
{ NULL , 0 , NULL , 0 }
} ;
2019-03-01 23:38:49 +03:00
2019-05-29 02:59:39 +03:00
while ( ( k = getopt_long ( argc , argv , optstring , loptions , NULL ) ) ! = - 1 ) {
2019-03-01 23:38:49 +03:00
switch ( k ) {
2019-05-29 02:59:39 +03:00
case 1 :
no_cn_flag = true ;
break ;
2019-03-01 23:38:49 +03:00
case ' o ' :
break ;
case ' d ' :
debugFlag = true ;
break ;
case ' l ' :
loopback_flag = true ;
break ;
case ' n ' :
cg_id = atoi ( optarg ) ;
break ;
case ' r ' :
minRate = atoi ( optarg ) * 1.024 ;
rate = minRate ;
break ;
case ' s ' :
stats_flag = true ;
break ;
case ' t ' :
dur = atoi ( optarg ) ;
break ;
case ' w ' :
work_conserving_flag = true ;
break ;
case ' ? ' :
if ( optopt = = ' n ' | | optopt = = ' r ' | | optopt = = ' t ' )
fprintf ( stderr ,
" Option -%c requires an argument. \n \n " ,
optopt ) ;
case ' h ' :
// fallthrough
default :
Usage ( ) ;
return 0 ;
}
}
if ( optind < argc )
prog = argv [ optind ] ;
printf ( " HBM prog: %s \n " , prog ! = NULL ? prog : " NULL " ) ;
return run_bpf_prog ( prog , cg_id ) ;
}