From e8f088fea63797e190909a8bd0ce05ad95798a4e Mon Sep 17 00:00:00 2001 From: Aaron Lehmann Date: Wed, 19 Aug 2015 17:57:18 -0700 Subject: [PATCH] Add a TCP health checker Also, add timeout and status code parameters to the HTTP checker, and remove the threshold parameter for the file checker. Signed-off-by: Aaron Lehmann --- configuration/configuration.go | 34 +++++-- docs/configuration.md | 154 +++++++++++++++++++++++++++---- health/checks/checks.go | 32 +++++-- health/checks/checks_test.go | 4 +- registry/handlers/app.go | 38 ++++++-- registry/handlers/health_test.go | 63 +++++++++++++ 6 files changed, 279 insertions(+), 46 deletions(-) diff --git a/configuration/configuration.go b/configuration/configuration.go index 970a6ef47..b96857410 100644 --- a/configuration/configuration.go +++ b/configuration/configuration.go @@ -181,9 +181,9 @@ type MailOptions struct { To []string `yaml:"to,omitempty"` } -// FileChecker is a type of entry in the checkers section for checking files +// FileChecker is a type of entry in the health section for checking files. type FileChecker struct { - // Interval is the number of seconds in between checks + // Interval is the duration in between checks Interval time.Duration `yaml:"interval,omitempty"` // File is the path to check File string `yaml:"file,omitempty"` @@ -192,10 +192,13 @@ type FileChecker struct { Threshold int `yaml:"threshold,omitempty"` } -// HTTPChecker is a type of entry in the checkers section for checking HTTP -// URIs +// HTTPChecker is a type of entry in the health section for checking HTTP URIs. 
type HTTPChecker struct { - // Interval is the number of seconds in between checks + // Timeout is the duration to wait before timing out the HTTP request + Timeout time.Duration `yaml:"timeout,omitempty"` + // StatusCode is the expected status code + StatusCode int `yaml:"statuscode,omitempty"` + // Interval is the duration in between checks Interval time.Duration `yaml:"interval,omitempty"` // URI is the HTTP URI to check URI string `yaml:"uri,omitempty"` @@ -204,18 +207,33 @@ type HTTPChecker struct { Threshold int `yaml:"threshold,omitempty"` } +// TCPChecker is a type of entry in the health section for checking TCP servers. +type TCPChecker struct { + // Timeout is the duration to wait before timing out the TCP connection + Timeout time.Duration `yaml:"timeout,omitempty"` + // Interval is the duration in between checks + Interval time.Duration `yaml:"interval,omitempty"` + // Addr is the TCP address to check + Addr string `yaml:"addr,omitempty"` + // Threshold is the number of times a check must fail to trigger an + // unhealthy state + Threshold int `yaml:"threshold,omitempty"` +} + // Health provides the configuration section for health checks. 
type Health struct { - // FileChecker is a list of paths to check + // FileCheckers is a list of paths to check FileCheckers []FileChecker `yaml:"file,omitempty"` - // HTTPChecker is a list of URIs to check + // HTTPCheckers is a list of URIs to check HTTPCheckers []HTTPChecker `yaml:"http,omitempty"` + // TCPCheckers is a list of TCP addresses to check + TCPCheckers []TCPChecker `yaml:"tcp,omitempty"` // StorageDriver configures a health check on the configured storage // driver StorageDriver struct { // Enabled turns on the health check for the storage driver Enabled bool `yaml:"enabled,omitempty"` - // Interval is the number of seconds in between checks + // Interval is the duration in between checks Interval time.Duration `yaml:"interval,omitempty"` // Threshold is the number of times a check must fail to trigger an // unhealthy state diff --git a/docs/configuration.md b/docs/configuration.md index a0ddc6fd5..3e4bacc82 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -203,9 +203,15 @@ information about each option that appears later in this page. file: - file: /path/to/checked/file interval: 10s - threshold: 3 http: - uri: http://server.to.check/must/return/200 + statuscode: 200 + timeout: 3s + interval: 10s + threshold: 3 + tcp: + - addr: redis-server.domain.com:6379 + timeout: 3s interval: 10s threshold: 3 @@ -1611,17 +1617,23 @@ Configure the behavior of the Redis connection pool. file: - file: /path/to/checked/file interval: 10s - threshold: 3 http: - uri: http://server.to.check/must/return/200 + statuscode: 200 + timeout: 3s + interval: 10s + threshold: 3 + tcp: + - addr: redis-server.domain.com:6379 + timeout: 3s interval: 10s threshold: 3 The health option is **optional**. It may contain preferences for a periodic health check on the storage driver's backend storage, and optional periodic -checks on local files and/or HTTP URIs. 
The results of the health checks are -available at /debug/health on the debug HTTP server if the debug HTTP server is -enabled (see http section). +checks on local files, HTTP URIs, and/or TCP servers. The results of the health +checks are available at /debug/health on the debug HTTP server if the debug +HTTP server is enabled (see http section). ### storagedriver @@ -1730,25 +1742,13 @@ The path to check for the existence of a file. The default value is 10 seconds if this field is omitted. - - - threshold - - - no - - - An integer specifying the number of times the check must fail before the - check triggers an unhealthy state. If this filed is not specified, a - single failure will trigger an unhealthy state. - - ### http http is a list of HTTP URIs to be periodically checked with HEAD requests. If -a HEAD request returns a status code other than 200, the health check will fail. +a HEAD request doesn't complete or returns an unexpected status code, the +health check will fail. @@ -1767,6 +1767,122 @@ a HEAD request returns a status code other than 200, the health check will fail. The URI to check. + + + + + + + + + + + + + + + + + + + + +
+ statuscode + + no + +Expected status code from the HTTP URI. Defaults to 200. +
+ timeout + + no + + The length of time to wait before timing out the HTTP request. This field + takes a positive integer and an optional suffix indicating the unit of + time. Possible units are: +
    +
  • ns (nanoseconds)
  • +
  • us (microseconds)
  • +
  • ms (milliseconds)
  • +
  • s (seconds)
  • +
  • m (minutes)
  • +
  • h (hours)
  • +
+ If you omit the suffix, the system interprets the value as nanoseconds. +
+ interval + + no + + The length of time to wait between repetitions of the check. This field + takes a positive integer and an optional suffix indicating the unit of + time. Possible units are: +
    +
  • ns (nanoseconds)
  • +
  • us (microseconds)
  • +
  • ms (milliseconds)
  • +
  • s (seconds)
  • +
  • m (minutes)
  • +
  • h (hours)
  • +
+ If you omit the suffix, the system interprets the value as nanoseconds. + The default value is 10 seconds if this field is omitted. +
+ threshold + + no + + An integer specifying the number of times the check must fail before the + check triggers an unhealthy state. If this field is not specified, a + single failure will trigger an unhealthy state. +
+ +### tcp + +tcp is a list of TCP addresses to be periodically checked with connection +attempts. The addresses must include port numbers. If a connection attempt +fails, the health check will fail. + + + + + + + + + + + + + + + + +
ParameterRequiredDescription
+ addr + + yes + +The TCP address to connect to, including a port number. +
+ timeout + + no + + The length of time to wait before timing out the TCP connection. This + field takes a positive integer and an optional suffix indicating the unit + of time. Possible units are: +
    +
  • ns (nanoseconds)
  • +
  • us (microseconds)
  • +
  • ms (milliseconds)
  • +
  • s (seconds)
  • +
  • m (minutes)
  • +
  • h (hours)
  • +
+ If you omit the suffix, the system interprets the value as nanoseconds. +
interval diff --git a/health/checks/checks.go b/health/checks/checks.go index 89d5f3db3..86e914b1b 100644 --- a/health/checks/checks.go +++ b/health/checks/checks.go @@ -2,15 +2,17 @@ package checks import ( "errors" + "net" "net/http" "os" "strconv" + "time" "github.com/docker/distribution/health" ) -// FileChecker checks the existence of a file and returns and error -// if the file exists, taking the application out of rotation +// FileChecker checks the existence of a file and returns an error +// if the file exists. func FileChecker(f string) health.Checker { return health.CheckFunc(func() error { if _, err := os.Stat(f); err == nil { @@ -20,18 +22,33 @@ func FileChecker(f string) health.Checker { }) } -// HTTPChecker does a HEAD request and verifies if the HTTP status -// code return is a 200, taking the application out of rotation if -// otherwise -func HTTPChecker(r string) health.Checker { +// HTTPChecker does a HEAD request and verifies that the HTTP status code +// returned matches statusCode. +func HTTPChecker(r string, statusCode int, timeout time.Duration) health.Checker { return health.CheckFunc(func() error { - response, err := http.Head(r) + client := http.Client{ + Timeout: timeout, + } + response, err := client.Head(r) if err != nil { return errors.New("error while checking: " + r) } + defer response.Body.Close() - if response.StatusCode != http.StatusOK { + if response.StatusCode != statusCode { return errors.New("downstream service returned unexpected status: " + strconv.Itoa(response.StatusCode)) } return nil }) } + +// TCPChecker attempts to open a TCP connection. 
+func TCPChecker(addr string, timeout time.Duration) health.Checker { + return health.CheckFunc(func() error { + conn, err := net.DialTimeout("tcp", addr, timeout) + if err != nil { + return errors.New("connection to " + addr + " failed") + } + conn.Close() + return nil + }) +} diff --git a/health/checks/checks_test.go b/health/checks/checks_test.go index 4e49d1182..8ba24d33f 100644 --- a/health/checks/checks_test.go +++ b/health/checks/checks_test.go @@ -15,11 +15,11 @@ func TestFileChecker(t *testing.T) { } func TestHTTPChecker(t *testing.T) { - if err := HTTPChecker("https://www.google.cybertron").Check(); err == nil { + if err := HTTPChecker("https://www.google.cybertron", 200, 0).Check(); err == nil { t.Errorf("Google on Cybertron was expected as not exists") } - if err := HTTPChecker("https://www.google.pt").Check(); err != nil { + if err := HTTPChecker("https://www.google.pt", 200, 0).Check(); err != nil { t.Errorf("Google at Portugal was expected as exists, error:%v", err) } } diff --git a/registry/handlers/app.go b/registry/handlers/app.go index 91f4e1a37..24f43f370 100644 --- a/registry/handlers/app.go +++ b/registry/handlers/app.go @@ -266,13 +266,8 @@ func (app *App) RegisterHealthChecks(healthRegistries ...*health.Registry) { if interval == 0 { interval = defaultCheckInterval } - if fileChecker.Threshold != 0 { - ctxu.GetLogger(app).Infof("configuring file health check path=%s, interval=%d, threshold=%d", fileChecker.File, interval/time.Second, fileChecker.Threshold) - healthRegistry.Register(fileChecker.File, health.PeriodicThresholdChecker(checks.FileChecker(fileChecker.File), interval, fileChecker.Threshold)) - } else { - ctxu.GetLogger(app).Infof("configuring file health check path=%s, interval=%d", fileChecker.File, interval/time.Second) - healthRegistry.Register(fileChecker.File, health.PeriodicChecker(checks.FileChecker(fileChecker.File), interval)) - } + ctxu.GetLogger(app).Infof("configuring file health check path=%s, interval=%d", 
fileChecker.File, interval/time.Second) + healthRegistry.Register(fileChecker.File, health.PeriodicChecker(checks.FileChecker(fileChecker.File), interval)) } for _, httpChecker := range app.Config.Health.HTTPCheckers { @@ -280,12 +275,37 @@ func (app *App) RegisterHealthChecks(healthRegistries ...*health.Registry) { if interval == 0 { interval = defaultCheckInterval } + + statusCode := httpChecker.StatusCode + if statusCode == 0 { + statusCode = 200 + } + + checker := checks.HTTPChecker(httpChecker.URI, statusCode, httpChecker.Timeout) + if httpChecker.Threshold != 0 { ctxu.GetLogger(app).Infof("configuring HTTP health check uri=%s, interval=%d, threshold=%d", httpChecker.URI, interval/time.Second, httpChecker.Threshold) - healthRegistry.Register(httpChecker.URI, health.PeriodicThresholdChecker(checks.HTTPChecker(httpChecker.URI), interval, httpChecker.Threshold)) + healthRegistry.Register(httpChecker.URI, health.PeriodicThresholdChecker(checker, interval, httpChecker.Threshold)) } else { ctxu.GetLogger(app).Infof("configuring HTTP health check uri=%s, interval=%d", httpChecker.URI, interval/time.Second) - healthRegistry.Register(httpChecker.URI, health.PeriodicChecker(checks.HTTPChecker(httpChecker.URI), interval)) + healthRegistry.Register(httpChecker.URI, health.PeriodicChecker(checker, interval)) + } + } + + for _, tcpChecker := range app.Config.Health.TCPCheckers { + interval := tcpChecker.Interval + if interval == 0 { + interval = defaultCheckInterval + } + + checker := checks.TCPChecker(tcpChecker.Addr, tcpChecker.Timeout) + + if tcpChecker.Threshold != 0 { + ctxu.GetLogger(app).Infof("configuring TCP health check addr=%s, interval=%d, threshold=%d", tcpChecker.Addr, interval/time.Second, tcpChecker.Threshold) + healthRegistry.Register(tcpChecker.Addr, health.PeriodicThresholdChecker(checker, interval, tcpChecker.Threshold)) + } else { + ctxu.GetLogger(app).Infof("configuring TCP health check addr=%s, interval=%d", tcpChecker.Addr, interval/time.Second) + 
healthRegistry.Register(tcpChecker.Addr, health.PeriodicChecker(checker, interval)) } } } diff --git a/registry/handlers/health_test.go b/registry/handlers/health_test.go index de2b71ccb..bb460b47a 100644 --- a/registry/handlers/health_test.go +++ b/registry/handlers/health_test.go @@ -2,6 +2,7 @@ package handlers import ( "io/ioutil" + "net" "net/http" "net/http/httptest" "os" @@ -61,6 +62,68 @@ func TestFileHealthCheck(t *testing.T) { } } +func TestTCPHealthCheck(t *testing.T) { + interval := time.Second + + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("could not create listener: %v", err) + } + addrStr := ln.Addr().String() + + // Start accepting; close each connection immediately so none accumulate + go func() { + for { + conn, err := ln.Accept() + if err != nil { + // listener was closed + return + } + conn.Close() + } + }() + + config := configuration.Configuration{ + Storage: configuration.Storage{ + "inmemory": configuration.Parameters{}, + }, + Health: configuration.Health{ + TCPCheckers: []configuration.TCPChecker{ + { + Interval: interval, + Addr: addrStr, + Timeout: 500 * time.Millisecond, + }, + }, + }, + } + + ctx := context.Background() + + app := NewApp(ctx, config) + healthRegistry := health.NewRegistry() + app.RegisterHealthChecks(healthRegistry) + + // Wait for health check to happen + <-time.After(2 * interval) + + if len(healthRegistry.CheckStatus()) != 0 { + t.Fatal("expected 0 items in health check results") + } + + ln.Close() + <-time.After(2 * interval) + + // Health check should now fail + status := healthRegistry.CheckStatus() + if len(status) != 1 { + t.Fatal("expected 1 item in health check results") + } + if status[addrStr] != "connection to "+addrStr+" failed" { + t.Fatal(`did not get "connection failed" result for health check`) + } +} + func TestHTTPHealthCheck(t *testing.T) { interval := time.Second threshold := 3