2016-11-26 19:48:49 +01:00
package healthcheck
import (
2016-11-30 22:49:57 +01:00
"context"
2021-06-25 21:08:11 +02:00
"errors"
2017-03-24 09:36:33 +01:00
"fmt"
2017-05-10 14:28:57 -04:00
"net"
2016-11-26 19:48:49 +01:00
"net/http"
"net/url"
2017-05-10 14:28:57 -04:00
"strconv"
2022-08-08 10:22:07 -03:00
"strings"
2016-11-26 19:48:49 +01:00
"sync"
"time"
2017-01-31 22:55:02 +01:00
2020-09-26 13:30:03 +02:00
gokitmetrics "github.com/go-kit/kit/metrics"
2021-06-25 21:08:11 +02:00
"github.com/traefik/traefik/v2/pkg/config/dynamic"
2020-09-16 15:46:04 +02:00
"github.com/traefik/traefik/v2/pkg/config/runtime"
"github.com/traefik/traefik/v2/pkg/log"
2020-09-26 13:30:03 +02:00
"github.com/traefik/traefik/v2/pkg/metrics"
2020-09-16 15:46:04 +02:00
"github.com/traefik/traefik/v2/pkg/safe"
2022-12-05 11:30:05 +01:00
"github.com/vulcand/oxy/v2/roundrobin"
2016-11-26 19:48:49 +01:00
)
2019-05-16 10:58:06 +02:00
// Status strings recorded in the runtime service info for a server.
const (
	// serverUp is recorded when a server is upserted into its balancer.
	serverUp = "UP"
	// serverDown is recorded when a server is removed from its balancer.
	serverDown = "DOWN"
)
2020-07-07 14:42:03 +02:00
var (
	// singleton is the process-wide HealthCheck handed out by GetHealthCheck.
	singleton *HealthCheck
	// once guards the one-time initialization of singleton.
	once sync.Once
)
2016-11-26 19:48:49 +01:00
2020-05-11 12:06:07 +02:00
// Balancer is the set of operations required to manage the list of servers in a load-balancer.
2019-11-29 12:40:05 +01:00
type Balancer interface {
2018-06-11 11:36:03 +02:00
Servers ( ) [ ] * url . URL
RemoveServer ( u * url . URL ) error
UpsertServer ( u * url . URL , options ... roundrobin . ServerOption ) error
}
2019-11-29 12:40:05 +01:00
// BalancerHandler is a Balancer that can also serve HTTP requests, i.e. it
// combines load-balancing management with request handling.
type BalancerHandler interface {
	ServeHTTP(w http.ResponseWriter, req *http.Request)
	Balancer
}
2021-06-25 21:08:11 +02:00
// BalancerStatusHandler is an http Handler that does load-balancing,
2022-02-21 06:40:09 -05:00
// and updates its parents of its status.
2021-06-25 21:08:11 +02:00
type BalancerStatusHandler interface {
BalancerHandler
StatusUpdater
}
2020-09-26 13:30:03 +02:00
type metricsHealthcheck struct {
serverUpGauge gokitmetrics . Gauge
2016-11-26 19:48:49 +01:00
}
2017-03-15 19:16:06 +01:00
// Options are the public health check options.
type Options struct {
2020-02-26 17:28:04 +01:00
Headers map [ string ] string
Hostname string
Scheme string
Path string
2022-08-08 10:22:07 -03:00
Method string
2020-02-26 17:28:04 +01:00
Port int
FollowRedirects bool
Transport http . RoundTripper
Interval time . Duration
Timeout time . Duration
LB Balancer
2017-03-15 19:16:06 +01:00
}
2017-03-24 09:36:33 +01:00
func ( opt Options ) String ( ) string {
2022-08-08 10:22:07 -03:00
return fmt . Sprintf ( "[Hostname: %s Headers: %v Path: %s Method: %s Port: %d Interval: %s Timeout: %s FollowRedirects: %v]" , opt . Hostname , opt . Headers , opt . Path , opt . Method , opt . Port , opt . Interval , opt . Timeout , opt . FollowRedirects )
2017-03-24 09:36:33 +01:00
}
2019-08-07 08:14:04 -07:00
// backendURL remembers a server that was taken out of the balancer, together
// with the weight to restore when it is added back.
// The field order matters: the type is built with unkeyed literals.
type backendURL struct {
	url    *url.URL
	weight int
}
2020-05-11 12:06:07 +02:00
// BackendConfig HealthCheck configuration for a backend.
2018-06-11 11:36:03 +02:00
type BackendConfig struct {
2017-03-15 19:16:06 +01:00
Options
2018-11-27 17:42:04 +01:00
name string
2019-08-07 08:14:04 -07:00
disabledURLs [ ] backendURL
2016-11-26 19:48:49 +01:00
}
2018-06-11 11:36:03 +02:00
func ( b * BackendConfig ) newRequest ( serverURL * url . URL ) ( * http . Request , error ) {
2018-11-15 15:50:03 +01:00
u , err := serverURL . Parse ( b . Path )
if err != nil {
return nil , err
}
2016-11-26 19:48:49 +01:00
2018-06-11 11:36:03 +02:00
if len ( b . Scheme ) > 0 {
u . Scheme = b . Scheme
}
2016-11-26 19:48:49 +01:00
2018-06-11 11:36:03 +02:00
if b . Port != 0 {
u . Host = net . JoinHostPort ( u . Hostname ( ) , strconv . Itoa ( b . Port ) )
2017-03-09 16:27:31 +01:00
}
2016-11-26 19:48:49 +01:00
2018-10-23 10:10:04 +02:00
return http . NewRequest ( http . MethodGet , u . String ( ) , http . NoBody )
2018-01-26 11:58:03 +01:00
}
2022-08-08 10:22:07 -03:00
// setRequestOptions sets all request options present on the BackendConfig.
func ( b * BackendConfig ) setRequestOptions ( req * http . Request ) * http . Request {
2018-06-11 11:36:03 +02:00
if b . Options . Hostname != "" {
req . Host = b . Options . Hostname
}
for k , v := range b . Options . Headers {
req . Header . Set ( k , v )
2017-03-09 16:27:31 +01:00
}
2022-08-08 10:22:07 -03:00
if b . Options . Method != "" {
req . Method = strings . ToUpper ( b . Options . Method )
}
2018-06-11 11:36:03 +02:00
return req
}
2020-05-11 12:06:07 +02:00
// HealthCheck struct.
2018-06-11 11:36:03 +02:00
type HealthCheck struct {
Backends map [ string ] * BackendConfig
2020-09-26 13:30:03 +02:00
metrics metricsHealthcheck
2018-06-11 11:36:03 +02:00
cancel context . CancelFunc
2016-11-26 19:48:49 +01:00
}
2020-05-11 12:06:07 +02:00
// SetBackendsConfiguration set backends configuration.
2018-06-11 11:36:03 +02:00
func ( hc * HealthCheck ) SetBackendsConfiguration ( parentCtx context . Context , backends map [ string ] * BackendConfig ) {
2016-11-26 19:48:49 +01:00
hc . Backends = backends
2016-11-30 22:49:57 +01:00
if hc . cancel != nil {
hc . cancel ( )
}
2017-01-31 22:55:02 +01:00
ctx , cancel := context . WithCancel ( parentCtx )
2016-11-30 22:49:57 +01:00
hc . cancel = cancel
2016-11-26 19:48:49 +01:00
2018-01-15 17:27:37 +01:00
for _ , backend := range backends {
2017-01-31 22:55:02 +01:00
safe . Go ( func ( ) {
2024-03-20 10:26:03 +01:00
hc . execute ( ctx , backend )
2017-01-31 22:55:02 +01:00
} )
2016-11-30 22:49:57 +01:00
}
2016-11-26 19:48:49 +01:00
}
2018-06-11 11:36:03 +02:00
func ( hc * HealthCheck ) execute ( ctx context . Context , backend * BackendConfig ) {
2019-09-13 19:28:04 +02:00
logger := log . FromContext ( ctx )
2021-06-25 21:08:11 +02:00
2019-09-13 19:28:04 +02:00
logger . Debugf ( "Initial health check for backend: %q" , backend . name )
2021-06-25 21:08:11 +02:00
hc . checkServersLB ( ctx , backend )
2019-09-13 19:28:04 +02:00
2017-03-09 16:27:31 +01:00
ticker := time . NewTicker ( backend . Interval )
defer ticker . Stop ( )
2017-04-11 17:10:46 +02:00
for {
select {
case <- ctx . Done ( ) :
2019-09-13 19:28:04 +02:00
logger . Debugf ( "Stopping current health check goroutines of backend: %s" , backend . name )
2017-04-11 17:10:46 +02:00
return
case <- ticker . C :
2021-06-25 21:08:11 +02:00
logger . Debugf ( "Routine health check refresh for backend: %s" , backend . name )
hc . checkServersLB ( ctx , backend )
2017-03-09 16:27:31 +01:00
}
}
}
2021-06-25 21:08:11 +02:00
func ( hc * HealthCheck ) checkServersLB ( ctx context . Context , backend * BackendConfig ) {
2019-09-13 19:28:04 +02:00
logger := log . FromContext ( ctx )
2018-01-03 12:32:03 +01:00
enabledURLs := backend . LB . Servers ( )
2020-09-26 13:30:03 +02:00
2019-08-07 08:14:04 -07:00
var newDisabledURLs [ ] backendURL
2019-11-29 12:40:05 +01:00
for _ , disabledURL := range backend . disabledURLs {
2020-09-26 13:30:03 +02:00
serverUpMetricValue := float64 ( 0 )
2019-11-29 12:40:05 +01:00
if err := checkHealth ( disabledURL . url , backend ) ; err == nil {
2021-06-25 21:08:11 +02:00
logger . Warnf ( "Health check up: returning to server list. Backend: %q URL: %q Weight: %d" ,
2019-11-29 12:40:05 +01:00
backend . name , disabledURL . url . String ( ) , disabledURL . weight )
if err = backend . LB . UpsertServer ( disabledURL . url , roundrobin . Weight ( disabledURL . weight ) ) ; err != nil {
2019-09-13 19:28:04 +02:00
logger . Error ( err )
2018-08-06 20:00:03 +02:00
}
2020-09-26 13:30:03 +02:00
serverUpMetricValue = 1
2017-04-11 17:10:46 +02:00
} else {
2019-11-29 12:40:05 +01:00
logger . Warnf ( "Health check still failing. Backend: %q URL: %q Reason: %s" , backend . name , disabledURL . url . String ( ) , err )
newDisabledURLs = append ( newDisabledURLs , disabledURL )
2017-04-11 17:10:46 +02:00
}
2020-09-26 13:30:03 +02:00
labelValues := [ ] string { "service" , backend . name , "url" , disabledURL . url . String ( ) }
hc . metrics . serverUpGauge . With ( labelValues ... ) . Set ( serverUpMetricValue )
2017-04-11 17:10:46 +02:00
}
2020-09-26 13:30:03 +02:00
2018-01-03 12:32:03 +01:00
backend . disabledURLs = newDisabledURLs
2017-03-09 16:27:31 +01:00
2021-06-25 21:08:11 +02:00
for _ , enabledURL := range enabledURLs {
2020-09-26 13:30:03 +02:00
serverUpMetricValue := float64 ( 1 )
2021-06-25 21:08:11 +02:00
if err := checkHealth ( enabledURL , backend ) ; err != nil {
2019-08-07 08:14:04 -07:00
weight := 1
rr , ok := backend . LB . ( * roundrobin . RoundRobin )
if ok {
var gotWeight bool
2021-06-25 21:08:11 +02:00
weight , gotWeight = rr . ServerWeight ( enabledURL )
2019-08-07 08:14:04 -07:00
if ! gotWeight {
weight = 1
}
}
2020-09-26 13:30:03 +02:00
logger . Warnf ( "Health check failed, removing from server list. Backend: %q URL: %q Weight: %d Reason: %s" ,
2021-06-25 21:08:11 +02:00
backend . name , enabledURL . String ( ) , weight , err )
if err := backend . LB . RemoveServer ( enabledURL ) ; err != nil {
2019-09-13 19:28:04 +02:00
logger . Error ( err )
2018-08-06 20:00:03 +02:00
}
2020-09-26 13:30:03 +02:00
2021-06-25 21:08:11 +02:00
backend . disabledURLs = append ( backend . disabledURLs , backendURL { enabledURL , weight } )
2020-09-26 13:30:03 +02:00
serverUpMetricValue = 0
2017-04-11 17:10:46 +02:00
}
2020-09-26 13:30:03 +02:00
2021-06-25 21:08:11 +02:00
labelValues := [ ] string { "service" , backend . name , "url" , enabledURL . String ( ) }
2020-09-26 13:30:03 +02:00
hc . metrics . serverUpGauge . With ( labelValues ... ) . Set ( serverUpMetricValue )
2017-04-11 17:10:46 +02:00
}
2017-03-09 16:27:31 +01:00
}
2018-06-11 11:36:03 +02:00
// GetHealthCheck returns the health check which is guaranteed to be a singleton.
2020-09-26 13:30:03 +02:00
func GetHealthCheck ( registry metrics . Registry ) * HealthCheck {
2018-06-11 11:36:03 +02:00
once . Do ( func ( ) {
2020-09-26 13:30:03 +02:00
singleton = newHealthCheck ( registry )
2018-06-11 11:36:03 +02:00
} )
return singleton
2017-05-10 14:28:57 -04:00
}
2020-09-26 13:30:03 +02:00
func newHealthCheck ( registry metrics . Registry ) * HealthCheck {
2018-06-11 11:36:03 +02:00
return & HealthCheck {
Backends : make ( map [ string ] * BackendConfig ) ,
2020-09-26 13:30:03 +02:00
metrics : metricsHealthcheck {
serverUpGauge : registry . ServiceServerUpGauge ( ) ,
} ,
2018-04-16 11:40:03 +02:00
}
2018-06-11 11:36:03 +02:00
}
2018-05-22 09:22:03 +02:00
2020-05-11 12:06:07 +02:00
// NewBackendConfig Instantiate a new BackendConfig.
2018-06-11 11:36:03 +02:00
func NewBackendConfig ( options Options , backendName string ) * BackendConfig {
return & BackendConfig {
2018-09-27 13:16:03 -05:00
Options : options ,
name : backendName ,
2018-04-16 11:40:03 +02:00
}
}
2018-01-03 12:32:03 +01:00
// checkHealth returns a nil error in case it was successful and otherwise
// a non-nil error with a meaningful description why the health check failed.
2018-06-11 11:36:03 +02:00
func checkHealth ( serverURL * url . URL , backend * BackendConfig ) error {
2017-05-10 14:28:57 -04:00
req , err := backend . newRequest ( serverURL )
if err != nil {
2020-05-11 12:06:07 +02:00
return fmt . Errorf ( "failed to create HTTP request: %w" , err )
2017-05-10 14:28:57 -04:00
}
2018-05-22 09:22:03 +02:00
2022-08-08 10:22:07 -03:00
req = backend . setRequestOptions ( req )
2017-05-10 14:28:57 -04:00
2018-05-22 09:22:03 +02:00
client := http . Client {
2018-09-27 13:16:03 -05:00
Timeout : backend . Options . Timeout ,
2018-05-22 09:22:03 +02:00
Transport : backend . Options . Transport ,
2016-11-26 19:48:49 +01:00
}
2018-01-03 12:32:03 +01:00
2020-02-26 17:28:04 +01:00
if ! backend . FollowRedirects {
client . CheckRedirect = func ( req * http . Request , via [ ] * http . Request ) error {
return http . ErrUseLastResponse
}
}
2018-05-22 09:22:03 +02:00
resp , err := client . Do ( req )
if err != nil {
2020-05-11 12:06:07 +02:00
return fmt . Errorf ( "HTTP request failed: %w" , err )
2018-01-03 12:32:03 +01:00
}
2018-05-22 09:22:03 +02:00
defer resp . Body . Close ( )
2018-05-23 17:06:04 +02:00
if resp . StatusCode < http . StatusOK || resp . StatusCode >= http . StatusBadRequest {
return fmt . Errorf ( "received error status code: %v" , resp . StatusCode )
2018-05-22 09:22:03 +02:00
}
2018-01-03 12:32:03 +01:00
return nil
2016-11-26 19:48:49 +01:00
}
2019-05-16 10:58:06 +02:00
2021-06-25 21:08:11 +02:00
// StatusUpdater should be implemented by a service that needs to propagate
// its status changes (e.g. all of its children are down) upwards, to its
// parent(s). Parents register a callback that receives the new status.
type StatusUpdater interface {
	RegisterStatusUpdater(fn func(up bool)) error
}
2020-05-11 12:06:07 +02:00
// NewLBStatusUpdater returns a new LbStatusUpdater.
2021-06-25 21:08:11 +02:00
func NewLBStatusUpdater ( bh BalancerHandler , info * runtime . ServiceInfo , hc * dynamic . ServerHealthCheck ) * LbStatusUpdater {
2019-05-16 10:58:06 +02:00
return & LbStatusUpdater {
2021-06-25 21:08:11 +02:00
BalancerHandler : bh ,
serviceInfo : info ,
wantsHealthCheck : hc != nil ,
2019-05-16 10:58:06 +02:00
}
}
// LbStatusUpdater wraps a BalancerHandler and a ServiceInfo,
// so it can keep track of the status of a server in the ServiceInfo.
type LbStatusUpdater struct {
BalancerHandler
2021-06-25 21:08:11 +02:00
serviceInfo * runtime . ServiceInfo // can be nil
updaters [ ] func ( up bool )
wantsHealthCheck bool
}
// RegisterStatusUpdater adds fn to the list of hooks that are run when the
// status of the Balancer changes.
// Not thread safe.
func ( lb * LbStatusUpdater ) RegisterStatusUpdater ( fn func ( up bool ) ) error {
if ! lb . wantsHealthCheck {
return errors . New ( "healthCheck not enabled in config for this loadbalancer service" )
}
lb . updaters = append ( lb . updaters , fn )
return nil
2019-05-16 10:58:06 +02:00
}
// RemoveServer removes the given server from the BalancerHandler,
// and updates the status of the server to "DOWN".
func ( lb * LbStatusUpdater ) RemoveServer ( u * url . URL ) error {
2021-06-25 21:08:11 +02:00
// TODO(mpl): when we have the freedom to change the signature of RemoveServer
// (kinda stuck because of oxy for now), let's pass around a context to improve
// logging.
ctx := context . TODO ( )
upBefore := len ( lb . BalancerHandler . Servers ( ) ) > 0
2019-05-16 10:58:06 +02:00
err := lb . BalancerHandler . RemoveServer ( u )
2021-06-25 21:08:11 +02:00
if err != nil {
return err
}
if lb . serviceInfo != nil {
2019-07-15 17:04:04 +02:00
lb . serviceInfo . UpdateServerStatus ( u . String ( ) , serverDown )
2019-05-16 10:58:06 +02:00
}
2021-06-25 21:08:11 +02:00
log . FromContext ( ctx ) . Debugf ( "child %s now %s" , u . String ( ) , serverDown )
if ! upBefore {
// we were already down, and we still are, no need to propagate.
log . FromContext ( ctx ) . Debugf ( "Still %s, no need to propagate" , serverDown )
return nil
}
if len ( lb . BalancerHandler . Servers ( ) ) > 0 {
// we were up, and we still are, no need to propagate
log . FromContext ( ctx ) . Debugf ( "Still %s, no need to propagate" , serverUp )
return nil
}
log . FromContext ( ctx ) . Debugf ( "Propagating new %s status" , serverDown )
for _ , fn := range lb . updaters {
fn ( false )
}
return nil
2019-05-16 10:58:06 +02:00
}
// UpsertServer adds the given server to the BalancerHandler,
// and updates the status of the server to "UP".
func ( lb * LbStatusUpdater ) UpsertServer ( u * url . URL , options ... roundrobin . ServerOption ) error {
2021-06-25 21:08:11 +02:00
ctx := context . TODO ( )
upBefore := len ( lb . BalancerHandler . Servers ( ) ) > 0
2019-05-16 10:58:06 +02:00
err := lb . BalancerHandler . UpsertServer ( u , options ... )
2021-06-25 21:08:11 +02:00
if err != nil {
return err
}
if lb . serviceInfo != nil {
2019-07-15 17:04:04 +02:00
lb . serviceInfo . UpdateServerStatus ( u . String ( ) , serverUp )
2019-05-16 10:58:06 +02:00
}
2021-06-25 21:08:11 +02:00
log . FromContext ( ctx ) . Debugf ( "child %s now %s" , u . String ( ) , serverUp )
if upBefore {
// we were up, and we still are, no need to propagate
log . FromContext ( ctx ) . Debugf ( "Still %s, no need to propagate" , serverUp )
return nil
}
log . FromContext ( ctx ) . Debugf ( "Propagating new %s status" , serverUp )
for _ , fn := range lb . updaters {
fn ( true )
}
return nil
2019-05-16 10:58:06 +02:00
}
2019-11-29 12:40:05 +01:00
// Balancers is a list of Balancer values that itself implements the Balancer
// interface by fanning operations out to every element.
type Balancers []Balancer
2022-06-22 21:46:08 +02:00
// Servers returns the deduplicated server URLs from all the Balancer.
// Note that the deduplication is only possible because all the underlying
// balancers are of the same kind (the oxy implementation).
// The comparison property is the same as the one found at:
// https://github.com/vulcand/oxy/blob/fb2728c857b7973a27f8de2f2190729c0f22cf49/roundrobin/rr.go#L347.
2019-11-29 12:40:05 +01:00
func ( b Balancers ) Servers ( ) [ ] * url . URL {
2022-06-22 21:46:08 +02:00
seen := make ( map [ string ] struct { } )
2019-11-29 12:40:05 +01:00
var servers [ ] * url . URL
for _ , lb := range b {
2022-06-22 21:46:08 +02:00
for _ , server := range lb . Servers ( ) {
key := serverKey ( server )
if _ , ok := seen [ key ] ; ok {
continue
}
servers = append ( servers , server )
seen [ key ] = struct { } { }
}
2019-11-29 12:40:05 +01:00
}
return servers
}
2022-06-22 21:46:08 +02:00
// RemoveServer removes the given server from all the Balancer,
2019-11-29 12:40:05 +01:00
// and updates the status of the server to "DOWN".
func ( b Balancers ) RemoveServer ( u * url . URL ) error {
for _ , lb := range b {
if err := lb . RemoveServer ( u ) ; err != nil {
return err
}
}
return nil
}
2022-06-22 21:46:08 +02:00
// UpsertServer adds the given server to all the Balancer,
2019-11-29 12:40:05 +01:00
// and updates the status of the server to "UP".
func ( b Balancers ) UpsertServer ( u * url . URL , options ... roundrobin . ServerOption ) error {
for _ , lb := range b {
if err := lb . UpsertServer ( u , options ... ) ; err != nil {
return err
}
}
return nil
}
2022-06-22 21:46:08 +02:00
// serverKey returns the deduplication key of a server URL. Two URLs get the
// same key exactly when their Scheme, Host and Path are all equal; query,
// fragment and user info are ignored.
//
// The components are joined with a NUL separator: plain concatenation would
// make distinct servers collide (e.g. "http://c:80/ab" and "http://bc:80/a"),
// and one of them would then silently be dropped from health checking.
func serverKey(u *url.URL) string {
	const sep = "\x00"

	return u.Scheme + sep + u.Host + sep + u.Path
}