From b09b0ffcf9466e924691ef289d83e0d08f25901a Mon Sep 17 00:00:00 2001 From: Aaron Lehmann Date: Tue, 18 Aug 2015 17:19:46 -0700 Subject: [PATCH] Add configurable file-existence and HTTP health checks Add a section to the config file called "health". Within this section, "filecheckers" and "httpcheckers" list checks to run. Each check specifies a file or URI, a time interval for the check, and a threshold specifying how many times the check must fail to reach an unhealthy state. Document the new options in docs/configuration.md. Add unit testing for both types of checkers. Add an UnregisterAll function in the health package to support the unit tests, and an Unregister function for consistency with Register. Fix a string conversion problem in the health package's HTTP checker. Signed-off-by: Aaron Lehmann --- configuration/configuration.go | 33 +++++ docs/configuration.md | 144 ++++++++++++++++++++++ health/checks/checks.go | 6 +- health/health.go | 14 +++ registry/handlers/app.go | 34 +++++- registry/handlers/health_test.go | 200 +++++++++++++++++++++++++++++++ 6 files changed, 428 insertions(+), 3 deletions(-) create mode 100644 registry/handlers/health_test.go diff --git a/configuration/configuration.go b/configuration/configuration.go index c6554f45..60440e05 100644 --- a/configuration/configuration.go +++ b/configuration/configuration.go @@ -135,6 +135,8 @@ type Configuration struct { } `yaml:"pool,omitempty"` } `yaml:"redis,omitempty"` + Health Health `yaml:"health,omitempty"` + Proxy Proxy `yaml:"proxy,omitempty"` } @@ -179,6 +181,37 @@ type MailOptions struct { To []string `yaml:"to,omitempty"` } +// FileChecker is a type of entry in the checkers section for checking files +type FileChecker struct { + // Interval is the number of seconds in between checks + Interval time.Duration `yaml:"interval,omitempty"` + // File is the path to check + File string `yaml:"file,omitempty"` + // Threshold is the number of times a check must fail to trigger an + // unhealthy state + Threshold int `yaml:"threshold,omitempty"` +} + +// HTTPChecker is a type of entry in the checkers section for checking HTTP +// URIs +type HTTPChecker struct { + // Interval is the number of seconds in between checks + Interval time.Duration `yaml:"interval,omitempty"` + // URI is the HTTP URI to check + URI string `yaml:"uri,omitempty"` + // Threshold is the number of times a check must fail to trigger an + // unhealthy state + Threshold int `yaml:"threshold,omitempty"` +} + +// Health provides the configuration section for health checks. +type Health struct { + // FileChecker is a list of paths to check + FileCheckers []FileChecker `yaml:"file,omitempty"` + // HTTPChecker is a list of URIs to check + HTTPCheckers []HTTPChecker `yaml:"http,omitempty"` +} + // v0_1Configuration is a Version 0.1 Configuration struct // This is currently aliased to Configuration, as it is the current version type v0_1Configuration Configuration diff --git a/docs/configuration.md b/docs/configuration.md index a9017bb5..3e16ca5d 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -195,6 +195,15 @@ information about each option that appears later in this page. maxidle: 16 maxactive: 64 idletimeout: 300s + health: + file: + - file: /path/to/checked/file + interval: 10s + threshold: 3 + http: + - uri: http://server.to.check/must/return/200 + interval: 10s + threshold: 3 In some instances a configuration option is **optional** but it contains child options marked as **required**. This indicates that you can omit the parent with @@ -1588,6 +1597,141 @@ Configure the behavior of the Redis connection pool. +## health + + health: + file: + - file: /path/to/checked/file + interval: 10s + threshold: 3 + http: + - uri: http://server.to.check/must/return/200 + interval: 10s + threshold: 3 + +The health option is **optional**. It may contain lists of file checkers +and/or HTTP checkers. + +### file + +file is a list of paths to be periodically checked for the existence of a file. +If a file exists at the given path, the health check will fail. This can be +used as a way of bringing a registry out of rotation by creating a file. + + + + + + + + + + + + + + + + + + + + + + +
ParameterRequiredDescription
+ file + + yes + +The path to check for the existence of a file. +
+ interval + + no + + The length of time to wait between repetitions of the check. This field + takes a positive integer and an optional suffix indicating the unit of + time. Possible units are: +
    +
  • ns (nanoseconds)
  • +
  • us (microseconds)
  • +
  • ms (milliseconds)
  • +
  • s (seconds)
  • +
  • m (minutes)
  • +
  • h (hours)
  • +
+ If you omit the suffix, the system interprets the value as nanoseconds. + The default value is 10 seconds if this field is omitted. +
+ threshold + + no + + An integer specifying the number of times the check must fail before the + check triggers an unhealthy state. If this filed is not specified, a + single failure will trigger an unhealthy state. +
+ +### http + +http is a list of HTTP URIs to be periodically checked with HEAD requests. If +a HEAD request returns a status code other than 200, the health check will fail. + + + + + + + + + + + + + + + + + + + + + + +
ParameterRequiredDescription
+ uri + + yes + +The URI to check. +
+ interval + + no + + The length of time to wait between repetitions of the check. This field + takes a positive integer and an optional suffix indicating the unit of + time. Possible units are: +
    +
  • ns (nanoseconds)
  • +
  • us (microseconds)
  • +
  • ms (milliseconds)
  • +
  • s (seconds)
  • +
  • m (minutes)
  • +
  • h (hours)
  • +
+ If you omit the suffix, the system interprets the value as nanoseconds. + The default value is 10 seconds if this field is omitted. +
+ threshold + + no + + An integer specifying the number of times the check must fail before the + check triggers an unhealthy state. If this filed is not specified, a + single failure will trigger an unhealthy state. +
## Example: Development configuration diff --git a/health/checks/checks.go b/health/checks/checks.go index 9de14010..89d5f3db 100644 --- a/health/checks/checks.go +++ b/health/checks/checks.go @@ -2,9 +2,11 @@ package checks import ( "errors" - "github.com/docker/distribution/health" "net/http" "os" + "strconv" + + "github.com/docker/distribution/health" ) // FileChecker checks the existence of a file and returns and error @@ -28,7 +30,7 @@ func HTTPChecker(r string) health.Checker { return errors.New("error while checking: " + r) } if response.StatusCode != http.StatusOK { - return errors.New("downstream service returned unexpected status: " + string(response.StatusCode)) + return errors.New("downstream service returned unexpected status: " + strconv.Itoa(response.StatusCode)) } return nil }) diff --git a/health/health.go b/health/health.go index ba954919..dab2794d 100644 --- a/health/health.go +++ b/health/health.go @@ -170,6 +170,20 @@ func Register(name string, check Checker) { registeredChecks[name] = check } +// Unregister removes the named checker. +func Unregister(name string) { + mutex.Lock() + defer mutex.Unlock() + delete(registeredChecks, name) +} + +// UnregisterAll removes all registered checkers. +func UnregisterAll() { + mutex.Lock() + defer mutex.Unlock() + registeredChecks = make(map[string]Checker) +} + // RegisterFunc allows the convenience of registering a checker directly // from an arbitrary func() error func RegisterFunc(name string, check func() error) { diff --git a/registry/handlers/app.go b/registry/handlers/app.go index 7d1f1cf5..8b8543dd 100644 --- a/registry/handlers/app.go +++ b/registry/handlers/app.go @@ -15,6 +15,7 @@ import ( "github.com/docker/distribution/configuration" ctxu "github.com/docker/distribution/context" "github.com/docker/distribution/health" + "github.com/docker/distribution/health/checks" "github.com/docker/distribution/notifications" "github.com/docker/distribution/registry/api/errcode" "github.com/docker/distribution/registry/api/v2" @@ -37,6 +38,9 @@ import ( // was specified. const randomSecretSize = 32 +// defaultCheckInterval is the default time in between health checks +const defaultCheckInterval = 10 * time.Second + // App is a global registry application object. Shared resources can be placed // on this object that will be accessible from all requests. Any writable // fields should be protected. @@ -231,10 +235,38 @@ func NewApp(ctx context.Context, configuration configuration.Configuration) *App // implementing this properly will require a refactor. This method may panic // if called twice in the same process. func (app *App) RegisterHealthChecks() { - health.RegisterPeriodicThresholdFunc("storagedriver_"+app.Config.Storage.Type(), 10*time.Second, 3, func() error { + health.RegisterPeriodicThresholdFunc("storagedriver_"+app.Config.Storage.Type(), defaultCheckInterval, 3, func() error { _, err := app.driver.List(app, "/") // "/" should always exist return err // any error will be treated as failure }) + + for _, fileChecker := range app.Config.Health.FileCheckers { + interval := fileChecker.Interval + if interval == 0 { + interval = defaultCheckInterval + } + if fileChecker.Threshold != 0 { + ctxu.GetLogger(app).Infof("configuring file health check path=%s, interval=%d, threshold=%d", fileChecker.File, interval/time.Second, fileChecker.Threshold) + health.Register(fileChecker.File, health.PeriodicThresholdChecker(checks.FileChecker(fileChecker.File), interval, fileChecker.Threshold)) + } else { + ctxu.GetLogger(app).Infof("configuring file health check path=%s, interval=%d", fileChecker.File, interval/time.Second) + health.Register(fileChecker.File, health.PeriodicChecker(checks.FileChecker(fileChecker.File), interval)) + } + } + + for _, httpChecker := range app.Config.Health.HTTPCheckers { + interval := httpChecker.Interval + if interval == 0 { + interval = defaultCheckInterval + } + if httpChecker.Threshold != 0 { + ctxu.GetLogger(app).Infof("configuring HTTP health check uri=%s, interval=%d, threshold=%d", httpChecker.URI, interval/time.Second, httpChecker.Threshold) + health.Register(httpChecker.URI, health.PeriodicThresholdChecker(checks.HTTPChecker(httpChecker.URI), interval, httpChecker.Threshold)) + } else { + ctxu.GetLogger(app).Infof("configuring HTTP health check uri=%s, interval=%d", httpChecker.URI, interval/time.Second) + health.Register(httpChecker.URI, health.PeriodicChecker(checks.HTTPChecker(httpChecker.URI), interval)) + } + } } // register a handler with the application, by route name. The handler will be diff --git a/registry/handlers/health_test.go b/registry/handlers/health_test.go new file mode 100644 index 00000000..ce5860a8 --- /dev/null +++ b/registry/handlers/health_test.go @@ -0,0 +1,200 @@ +package handlers + +import ( + "encoding/json" + "io/ioutil" + "net/http" + "net/http/httptest" + "os" + "testing" + "time" + + "github.com/docker/distribution/configuration" + "github.com/docker/distribution/health" + "golang.org/x/net/context" +) + +func TestFileHealthCheck(t *testing.T) { + // In case other tests registered checks before this one + health.UnregisterAll() + + interval := time.Second + + tmpfile, err := ioutil.TempFile(os.TempDir(), "healthcheck") + if err != nil { + t.Fatalf("could not create temporary file: %v", err) + } + defer tmpfile.Close() + + config := configuration.Configuration{ + Storage: configuration.Storage{ + "inmemory": configuration.Parameters{}, + }, + Health: configuration.Health{ + FileCheckers: []configuration.FileChecker{ + { + Interval: interval, + File: tmpfile.Name(), + }, + }, + }, + } + + ctx := context.Background() + + app := NewApp(ctx, config) + app.RegisterHealthChecks() + + debugServer := httptest.NewServer(nil) + + // Wait for health check to happen + <-time.After(2 * interval) + + resp, err := http.Get(debugServer.URL + "/debug/health") + if err != nil { + t.Fatalf("error performing HTTP GET: %v", err) + } + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatalf("error reading HTTP body: %v", err) + } + resp.Body.Close() + var decoded map[string]string + err = json.Unmarshal(body, &decoded) + if err != nil { + t.Fatalf("error unmarshaling json: %v", err) + } + if len(decoded) != 1 { + t.Fatal("expected 1 item in returned json") + } + if decoded[tmpfile.Name()] != "file exists" { + t.Fatal(`did not get "file exists" result for health check`) + } + + os.Remove(tmpfile.Name()) + + <-time.After(2 * interval) + resp, err = http.Get(debugServer.URL + "/debug/health") + if err != nil { + t.Fatalf("error performing HTTP GET: %v", err) + } + body, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatalf("error reading HTTP body: %v", err) + } + resp.Body.Close() + var decoded2 map[string]string + err = json.Unmarshal(body, &decoded2) + if err != nil { + t.Fatalf("error unmarshaling json: %v", err) + } + if len(decoded2) != 0 { + t.Fatal("expected 0 items in returned json") + } +} + +func TestHTTPHealthCheck(t *testing.T) { + // In case other tests registered checks before this one + health.UnregisterAll() + + interval := time.Second + threshold := 3 + + stopFailing := make(chan struct{}) + + checkedServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != "HEAD" { + t.Fatalf("expected HEAD request, got %s", r.Method) + } + select { + case <-stopFailing: + w.WriteHeader(http.StatusOK) + default: + w.WriteHeader(http.StatusInternalServerError) + } + })) + + config := configuration.Configuration{ + Storage: configuration.Storage{ + "inmemory": configuration.Parameters{}, + }, + Health: configuration.Health{ + HTTPCheckers: []configuration.HTTPChecker{ + { + Interval: interval, + URI: checkedServer.URL, + Threshold: threshold, + }, + }, + }, + } + + ctx := context.Background() + + app := NewApp(ctx, config) + app.RegisterHealthChecks() + + debugServer := httptest.NewServer(nil) + + for i := 0; ; i++ { + <-time.After(interval) + + resp, err := http.Get(debugServer.URL + "/debug/health") + if err != nil { + t.Fatalf("error performing HTTP GET: %v", err) + } + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatalf("error reading HTTP body: %v", err) + } + resp.Body.Close() + var decoded map[string]string + err = json.Unmarshal(body, &decoded) + if err != nil { + t.Fatalf("error unmarshaling json: %v", err) + } + + if i < threshold-1 { + // definitely shouldn't have hit the threshold yet + if len(decoded) != 0 { + t.Fatal("expected 1 items in returned json") + } + continue + } + if i < threshold+1 { + // right on the threshold - don't expect a failure yet + continue + } + + if len(decoded) != 1 { + t.Fatal("expected 1 item in returned json") + } + if decoded[checkedServer.URL] != "downstream service returned unexpected status: 500" { + t.Fatal("did not get expected result for health check") + } + + break + } + + // Signal HTTP handler to start returning 200 + close(stopFailing) + + <-time.After(2 * interval) + resp, err := http.Get(debugServer.URL + "/debug/health") + if err != nil { + t.Fatalf("error performing HTTP GET: %v", err) + } + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatalf("error reading HTTP body: %v", err) + } + resp.Body.Close() + var decoded map[string]string + err = json.Unmarshal(body, &decoded) + if err != nil { + t.Fatalf("error unmarshaling json: %v", err) + } + if len(decoded) != 0 { + t.Fatal("expected 0 items in returned json") + } +}