Add configurable file-existence and HTTP health checks
Add a section to the config file called "health". Within this section, "filecheckers" and "httpcheckers" list checks to run. Each check specifies a file or URI, a time interval for the check, and a threshold specifying how many times the check must fail to reach an unhealthy state. Document the new options in docs/configuration.md. Add unit testing for both types of checkers. Add an UnregisterAll function in the health package to support the unit tests, and an Unregister function for consistency with Register. Fix a string conversion problem in the health package's HTTP checker. Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>
This commit is contained in:
		
							parent
							
								
									e4b93d1e6d
								
							
						
					
					
						commit
						b09b0ffcf9
					
				
					 6 changed files with 428 additions and 3 deletions
				
			
		|  | @ -135,6 +135,8 @@ type Configuration struct { | |||
| 		} `yaml:"pool,omitempty"` | ||||
| 	} `yaml:"redis,omitempty"` | ||||
| 
 | ||||
| 	Health Health `yaml:"health,omitempty"` | ||||
| 
 | ||||
| 	Proxy Proxy `yaml:"proxy,omitempty"` | ||||
| } | ||||
| 
 | ||||
|  | @ -179,6 +181,37 @@ type MailOptions struct { | |||
| 	To []string `yaml:"to,omitempty"` | ||||
| } | ||||
| 
 | ||||
| // FileChecker is a type of entry in the checkers section for checking files | ||||
| type FileChecker struct { | ||||
| 	// Interval is the number of seconds in between checks | ||||
| 	Interval time.Duration `yaml:"interval,omitempty"` | ||||
| 	// File is the path to check | ||||
| 	File string `yaml:"file,omitempty"` | ||||
| 	// Threshold is the number of times a check must fail to trigger an | ||||
| 	// unhealthy state | ||||
| 	Threshold int `yaml:"threshold,omitempty"` | ||||
| } | ||||
| 
 | ||||
| // HTTPChecker is a type of entry in the checkers section for checking HTTP | ||||
| // URIs | ||||
| type HTTPChecker struct { | ||||
| 	// Interval is the number of seconds in between checks | ||||
| 	Interval time.Duration `yaml:"interval,omitempty"` | ||||
| 	// URI is the HTTP URI to check | ||||
| 	URI string `yaml:"uri,omitempty"` | ||||
| 	// Threshold is the number of times a check must fail to trigger an | ||||
| 	// unhealthy state | ||||
| 	Threshold int `yaml:"threshold,omitempty"` | ||||
| } | ||||
| 
 | ||||
| // Health provides the configuration section for health checks. | ||||
| type Health struct { | ||||
| 	// FileChecker is a list of paths to check | ||||
| 	FileCheckers []FileChecker `yaml:"file,omitempty"` | ||||
| 	// HTTPChecker is a list of URIs to check | ||||
| 	HTTPCheckers []HTTPChecker `yaml:"http,omitempty"` | ||||
| } | ||||
| 
 | ||||
| // v0_1Configuration is a Version 0.1 Configuration struct | ||||
| // This is currently aliased to Configuration, as it is the current version | ||||
| type v0_1Configuration Configuration | ||||
|  |  | |||
|  | @ -195,6 +195,15 @@ information about each option that appears later in this page. | |||
|         maxidle: 16 | ||||
|         maxactive: 64 | ||||
|         idletimeout: 300s | ||||
|     health: | ||||
|       file: | ||||
|         - file: /path/to/checked/file | ||||
|           interval: 10s | ||||
|           threshold: 3 | ||||
|       http: | ||||
|         - uri: http://server.to.check/must/return/200 | ||||
|           interval: 10s | ||||
|           threshold: 3 | ||||
| 
 | ||||
| In some instances a configuration option is **optional** but it contains child | ||||
| options marked as **required**. This indicates that you can omit the parent with | ||||
|  | @ -1588,6 +1597,141 @@ Configure the behavior of the Redis connection pool. | |||
|   </tr> | ||||
| </table> | ||||
| 
 | ||||
| ## health | ||||
| 
 | ||||
|     health: | ||||
|       file: | ||||
|         - file: /path/to/checked/file | ||||
|           interval: 10s | ||||
|           threshold: 3 | ||||
|       http: | ||||
|         - uri: http://server.to.check/must/return/200 | ||||
|           interval: 10s | ||||
|           threshold: 3 | ||||
| 
 | ||||
| The health option is **optional**. It may contain lists of file checkers | ||||
| and/or HTTP checkers. | ||||
| 
 | ||||
| ### file | ||||
| 
 | ||||
| file is a list of paths to be periodically checked for the existence of a file. | ||||
| If a file exists at the given path, the health check will fail. This can be | ||||
| used as a way of bringing a registry out of rotation by creating a file. | ||||
| 
 | ||||
| <table> | ||||
|   <tr> | ||||
|     <th>Parameter</th> | ||||
|     <th>Required</th> | ||||
|     <th>Description</th> | ||||
|   </tr> | ||||
|   <tr> | ||||
|     <td> | ||||
|       <code>file</code> | ||||
|     </td> | ||||
|     <td> | ||||
|       yes | ||||
|     </td> | ||||
|     <td> | ||||
| The path to check for the existence of a file. | ||||
| </td> | ||||
|   </tr> | ||||
|   <tr> | ||||
|     <td> | ||||
|       <code>interval</code> | ||||
|     </td> | ||||
|     <td> | ||||
|       no | ||||
|     </td> | ||||
|     <td> | ||||
|       The length of time to wait between repetitions of the check. This field | ||||
|       takes a positive integer and an optional suffix indicating the unit of | ||||
|       time. Possible units are: | ||||
|       <ul> | ||||
|         <li><code>ns</code> (nanoseconds)</li> | ||||
|         <li><code>us</code> (microseconds)</li> | ||||
|         <li><code>ms</code> (milliseconds)</li> | ||||
|         <li><code>s</code> (seconds)</li> | ||||
|         <li><code>m</code> (minutes)</li> | ||||
|         <li><code>h</code> (hours)</li> | ||||
|       </ul> | ||||
|     If you omit the suffix, the system interprets the value as nanoseconds. | ||||
|     The default value is 10 seconds if this field is omitted. | ||||
|     </td> | ||||
|   </tr> | ||||
|   <tr> | ||||
|     <td> | ||||
|       <code>threshold</code> | ||||
|     </td> | ||||
|     <td> | ||||
|       no | ||||
|     </td> | ||||
|     <td> | ||||
|       An integer specifying the number of times the check must fail before the | ||||
|       check triggers an unhealthy state. If this filed is not specified, a | ||||
|       single failure will trigger an unhealthy state. | ||||
|     </td> | ||||
|   </tr> | ||||
| </table> | ||||
| 
 | ||||
| ### http | ||||
| 
 | ||||
| http is a list of HTTP URIs to be periodically checked with HEAD requests. If | ||||
| a HEAD request returns a status code other than 200, the health check will fail. | ||||
| 
 | ||||
| <table> | ||||
|   <tr> | ||||
|     <th>Parameter</th> | ||||
|     <th>Required</th> | ||||
|     <th>Description</th> | ||||
|   </tr> | ||||
|   <tr> | ||||
|     <td> | ||||
|       <code>uri</code> | ||||
|     </td> | ||||
|     <td> | ||||
|       yes | ||||
|     </td> | ||||
|     <td> | ||||
| The URI to check. | ||||
| </td> | ||||
|   </tr> | ||||
|   <tr> | ||||
|     <td> | ||||
|       <code>interval</code> | ||||
|     </td> | ||||
|     <td> | ||||
|       no | ||||
|     </td> | ||||
|     <td> | ||||
|       The length of time to wait between repetitions of the check. This field | ||||
|       takes a positive integer and an optional suffix indicating the unit of | ||||
|       time. Possible units are: | ||||
|       <ul> | ||||
|         <li><code>ns</code> (nanoseconds)</li> | ||||
|         <li><code>us</code> (microseconds)</li> | ||||
|         <li><code>ms</code> (milliseconds)</li> | ||||
|         <li><code>s</code> (seconds)</li> | ||||
|         <li><code>m</code> (minutes)</li> | ||||
|         <li><code>h</code> (hours)</li> | ||||
|       </ul> | ||||
|     If you omit the suffix, the system interprets the value as nanoseconds. | ||||
|     The default value is 10 seconds if this field is omitted. | ||||
|     </td> | ||||
|   </tr> | ||||
|   <tr> | ||||
|     <td> | ||||
|       <code>threshold</code> | ||||
|     </td> | ||||
|     <td> | ||||
|       no | ||||
|     </td> | ||||
|     <td> | ||||
|       An integer specifying the number of times the check must fail before the | ||||
|       check triggers an unhealthy state. If this filed is not specified, a | ||||
|       single failure will trigger an unhealthy state. | ||||
|     </td> | ||||
|   </tr> | ||||
| </table> | ||||
| 
 | ||||
| ## Example: Development configuration | ||||
| 
 | ||||
|  |  | |||
|  | @ -2,9 +2,11 @@ package checks | |||
| 
 | ||||
| import ( | ||||
| 	"errors" | ||||
| 	"github.com/docker/distribution/health" | ||||
| 	"net/http" | ||||
| 	"os" | ||||
| 	"strconv" | ||||
| 
 | ||||
| 	"github.com/docker/distribution/health" | ||||
| ) | ||||
| 
 | ||||
| // FileChecker checks the existence of a file and returns and error | ||||
|  | @ -28,7 +30,7 @@ func HTTPChecker(r string) health.Checker { | |||
| 			return errors.New("error while checking: " + r) | ||||
| 		} | ||||
| 		if response.StatusCode != http.StatusOK { | ||||
| 			return errors.New("downstream service returned unexpected status: " + string(response.StatusCode)) | ||||
| 			return errors.New("downstream service returned unexpected status: " + strconv.Itoa(response.StatusCode)) | ||||
| 		} | ||||
| 		return nil | ||||
| 	}) | ||||
|  |  | |||
|  | @ -170,6 +170,20 @@ func Register(name string, check Checker) { | |||
| 	registeredChecks[name] = check | ||||
| } | ||||
| 
 | ||||
| // Unregister removes the named checker. | ||||
| func Unregister(name string) { | ||||
| 	mutex.Lock() | ||||
| 	defer mutex.Unlock() | ||||
| 	delete(registeredChecks, name) | ||||
| } | ||||
| 
 | ||||
| // UnregisterAll removes all registered checkers. | ||||
| func UnregisterAll() { | ||||
| 	mutex.Lock() | ||||
| 	defer mutex.Unlock() | ||||
| 	registeredChecks = make(map[string]Checker) | ||||
| } | ||||
| 
 | ||||
| // RegisterFunc allows the convenience of registering a checker directly | ||||
| // from an arbitrary func() error | ||||
| func RegisterFunc(name string, check func() error) { | ||||
|  |  | |||
|  | @ -15,6 +15,7 @@ import ( | |||
| 	"github.com/docker/distribution/configuration" | ||||
| 	ctxu "github.com/docker/distribution/context" | ||||
| 	"github.com/docker/distribution/health" | ||||
| 	"github.com/docker/distribution/health/checks" | ||||
| 	"github.com/docker/distribution/notifications" | ||||
| 	"github.com/docker/distribution/registry/api/errcode" | ||||
| 	"github.com/docker/distribution/registry/api/v2" | ||||
|  | @ -37,6 +38,9 @@ import ( | |||
| // was specified. | ||||
| const randomSecretSize = 32 | ||||
| 
 | ||||
| // defaultCheckInterval is the default time in between health checks | ||||
| const defaultCheckInterval = 10 * time.Second | ||||
| 
 | ||||
| // App is a global registry application object. Shared resources can be placed | ||||
| // on this object that will be accessible from all requests. Any writable | ||||
| // fields should be protected. | ||||
|  | @ -231,10 +235,38 @@ func NewApp(ctx context.Context, configuration configuration.Configuration) *App | |||
| // implementing this properly will require a refactor. This method may panic | ||||
| // if called twice in the same process. | ||||
| func (app *App) RegisterHealthChecks() { | ||||
| 	health.RegisterPeriodicThresholdFunc("storagedriver_"+app.Config.Storage.Type(), 10*time.Second, 3, func() error { | ||||
| 	health.RegisterPeriodicThresholdFunc("storagedriver_"+app.Config.Storage.Type(), defaultCheckInterval, 3, func() error { | ||||
| 		_, err := app.driver.List(app, "/") // "/" should always exist | ||||
| 		return err                          // any error will be treated as failure | ||||
| 	}) | ||||
| 
 | ||||
| 	for _, fileChecker := range app.Config.Health.FileCheckers { | ||||
| 		interval := fileChecker.Interval | ||||
| 		if interval == 0 { | ||||
| 			interval = defaultCheckInterval | ||||
| 		} | ||||
| 		if fileChecker.Threshold != 0 { | ||||
| 			ctxu.GetLogger(app).Infof("configuring file health check path=%s, interval=%d, threshold=%d", fileChecker.File, interval/time.Second, fileChecker.Threshold) | ||||
| 			health.Register(fileChecker.File, health.PeriodicThresholdChecker(checks.FileChecker(fileChecker.File), interval, fileChecker.Threshold)) | ||||
| 		} else { | ||||
| 			ctxu.GetLogger(app).Infof("configuring file health check path=%s, interval=%d", fileChecker.File, interval/time.Second) | ||||
| 			health.Register(fileChecker.File, health.PeriodicChecker(checks.FileChecker(fileChecker.File), interval)) | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	for _, httpChecker := range app.Config.Health.HTTPCheckers { | ||||
| 		interval := httpChecker.Interval | ||||
| 		if interval == 0 { | ||||
| 			interval = defaultCheckInterval | ||||
| 		} | ||||
| 		if httpChecker.Threshold != 0 { | ||||
| 			ctxu.GetLogger(app).Infof("configuring HTTP health check uri=%s, interval=%d, threshold=%d", httpChecker.URI, interval/time.Second, httpChecker.Threshold) | ||||
| 			health.Register(httpChecker.URI, health.PeriodicThresholdChecker(checks.HTTPChecker(httpChecker.URI), interval, httpChecker.Threshold)) | ||||
| 		} else { | ||||
| 			ctxu.GetLogger(app).Infof("configuring HTTP health check uri=%s, interval=%d", httpChecker.URI, interval/time.Second) | ||||
| 			health.Register(httpChecker.URI, health.PeriodicChecker(checks.HTTPChecker(httpChecker.URI), interval)) | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| // register a handler with the application, by route name. The handler will be | ||||
|  |  | |||
							
								
								
									
										200
									
								
								registry/handlers/health_test.go
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										200
									
								
								registry/handlers/health_test.go
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,200 @@ | |||
| package handlers | ||||
| 
 | ||||
| import ( | ||||
| 	"encoding/json" | ||||
| 	"io/ioutil" | ||||
| 	"net/http" | ||||
| 	"net/http/httptest" | ||||
| 	"os" | ||||
| 	"testing" | ||||
| 	"time" | ||||
| 
 | ||||
| 	"github.com/docker/distribution/configuration" | ||||
| 	"github.com/docker/distribution/health" | ||||
| 	"golang.org/x/net/context" | ||||
| ) | ||||
| 
 | ||||
| func TestFileHealthCheck(t *testing.T) { | ||||
| 	// In case other tests registered checks before this one | ||||
| 	health.UnregisterAll() | ||||
| 
 | ||||
| 	interval := time.Second | ||||
| 
 | ||||
| 	tmpfile, err := ioutil.TempFile(os.TempDir(), "healthcheck") | ||||
| 	if err != nil { | ||||
| 		t.Fatalf("could not create temporary file: %v", err) | ||||
| 	} | ||||
| 	defer tmpfile.Close() | ||||
| 
 | ||||
| 	config := configuration.Configuration{ | ||||
| 		Storage: configuration.Storage{ | ||||
| 			"inmemory": configuration.Parameters{}, | ||||
| 		}, | ||||
| 		Health: configuration.Health{ | ||||
| 			FileCheckers: []configuration.FileChecker{ | ||||
| 				{ | ||||
| 					Interval: interval, | ||||
| 					File:     tmpfile.Name(), | ||||
| 				}, | ||||
| 			}, | ||||
| 		}, | ||||
| 	} | ||||
| 
 | ||||
| 	ctx := context.Background() | ||||
| 
 | ||||
| 	app := NewApp(ctx, config) | ||||
| 	app.RegisterHealthChecks() | ||||
| 
 | ||||
| 	debugServer := httptest.NewServer(nil) | ||||
| 
 | ||||
| 	// Wait for health check to happen | ||||
| 	<-time.After(2 * interval) | ||||
| 
 | ||||
| 	resp, err := http.Get(debugServer.URL + "/debug/health") | ||||
| 	if err != nil { | ||||
| 		t.Fatalf("error performing HTTP GET: %v", err) | ||||
| 	} | ||||
| 	body, err := ioutil.ReadAll(resp.Body) | ||||
| 	if err != nil { | ||||
| 		t.Fatalf("error reading HTTP body: %v", err) | ||||
| 	} | ||||
| 	resp.Body.Close() | ||||
| 	var decoded map[string]string | ||||
| 	err = json.Unmarshal(body, &decoded) | ||||
| 	if err != nil { | ||||
| 		t.Fatalf("error unmarshaling json: %v", err) | ||||
| 	} | ||||
| 	if len(decoded) != 1 { | ||||
| 		t.Fatal("expected 1 item in returned json") | ||||
| 	} | ||||
| 	if decoded[tmpfile.Name()] != "file exists" { | ||||
| 		t.Fatal(`did not get "file exists" result for health check`) | ||||
| 	} | ||||
| 
 | ||||
| 	os.Remove(tmpfile.Name()) | ||||
| 
 | ||||
| 	<-time.After(2 * interval) | ||||
| 	resp, err = http.Get(debugServer.URL + "/debug/health") | ||||
| 	if err != nil { | ||||
| 		t.Fatalf("error performing HTTP GET: %v", err) | ||||
| 	} | ||||
| 	body, err = ioutil.ReadAll(resp.Body) | ||||
| 	if err != nil { | ||||
| 		t.Fatalf("error reading HTTP body: %v", err) | ||||
| 	} | ||||
| 	resp.Body.Close() | ||||
| 	var decoded2 map[string]string | ||||
| 	err = json.Unmarshal(body, &decoded2) | ||||
| 	if err != nil { | ||||
| 		t.Fatalf("error unmarshaling json: %v", err) | ||||
| 	} | ||||
| 	if len(decoded2) != 0 { | ||||
| 		t.Fatal("expected 0 items in returned json") | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| func TestHTTPHealthCheck(t *testing.T) { | ||||
| 	// In case other tests registered checks before this one | ||||
| 	health.UnregisterAll() | ||||
| 
 | ||||
| 	interval := time.Second | ||||
| 	threshold := 3 | ||||
| 
 | ||||
| 	stopFailing := make(chan struct{}) | ||||
| 
 | ||||
| 	checkedServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { | ||||
| 		if r.Method != "HEAD" { | ||||
| 			t.Fatalf("expected HEAD request, got %s", r.Method) | ||||
| 		} | ||||
| 		select { | ||||
| 		case <-stopFailing: | ||||
| 			w.WriteHeader(http.StatusOK) | ||||
| 		default: | ||||
| 			w.WriteHeader(http.StatusInternalServerError) | ||||
| 		} | ||||
| 	})) | ||||
| 
 | ||||
| 	config := configuration.Configuration{ | ||||
| 		Storage: configuration.Storage{ | ||||
| 			"inmemory": configuration.Parameters{}, | ||||
| 		}, | ||||
| 		Health: configuration.Health{ | ||||
| 			HTTPCheckers: []configuration.HTTPChecker{ | ||||
| 				{ | ||||
| 					Interval:  interval, | ||||
| 					URI:       checkedServer.URL, | ||||
| 					Threshold: threshold, | ||||
| 				}, | ||||
| 			}, | ||||
| 		}, | ||||
| 	} | ||||
| 
 | ||||
| 	ctx := context.Background() | ||||
| 
 | ||||
| 	app := NewApp(ctx, config) | ||||
| 	app.RegisterHealthChecks() | ||||
| 
 | ||||
| 	debugServer := httptest.NewServer(nil) | ||||
| 
 | ||||
| 	for i := 0; ; i++ { | ||||
| 		<-time.After(interval) | ||||
| 
 | ||||
| 		resp, err := http.Get(debugServer.URL + "/debug/health") | ||||
| 		if err != nil { | ||||
| 			t.Fatalf("error performing HTTP GET: %v", err) | ||||
| 		} | ||||
| 		body, err := ioutil.ReadAll(resp.Body) | ||||
| 		if err != nil { | ||||
| 			t.Fatalf("error reading HTTP body: %v", err) | ||||
| 		} | ||||
| 		resp.Body.Close() | ||||
| 		var decoded map[string]string | ||||
| 		err = json.Unmarshal(body, &decoded) | ||||
| 		if err != nil { | ||||
| 			t.Fatalf("error unmarshaling json: %v", err) | ||||
| 		} | ||||
| 
 | ||||
| 		if i < threshold-1 { | ||||
| 			// definitely shouldn't have hit the threshold yet | ||||
| 			if len(decoded) != 0 { | ||||
| 				t.Fatal("expected 1 items in returned json") | ||||
| 			} | ||||
| 			continue | ||||
| 		} | ||||
| 		if i < threshold+1 { | ||||
| 			// right on the threshold - don't expect a failure yet | ||||
| 			continue | ||||
| 		} | ||||
| 
 | ||||
| 		if len(decoded) != 1 { | ||||
| 			t.Fatal("expected 1 item in returned json") | ||||
| 		} | ||||
| 		if decoded[checkedServer.URL] != "downstream service returned unexpected status: 500" { | ||||
| 			t.Fatal("did not get expected result for health check") | ||||
| 		} | ||||
| 
 | ||||
| 		break | ||||
| 	} | ||||
| 
 | ||||
| 	// Signal HTTP handler to start returning 200 | ||||
| 	close(stopFailing) | ||||
| 
 | ||||
| 	<-time.After(2 * interval) | ||||
| 	resp, err := http.Get(debugServer.URL + "/debug/health") | ||||
| 	if err != nil { | ||||
| 		t.Fatalf("error performing HTTP GET: %v", err) | ||||
| 	} | ||||
| 	body, err := ioutil.ReadAll(resp.Body) | ||||
| 	if err != nil { | ||||
| 		t.Fatalf("error reading HTTP body: %v", err) | ||||
| 	} | ||||
| 	resp.Body.Close() | ||||
| 	var decoded map[string]string | ||||
| 	err = json.Unmarshal(body, &decoded) | ||||
| 	if err != nil { | ||||
| 		t.Fatalf("error unmarshaling json: %v", err) | ||||
| 	} | ||||
| 	if len(decoded) != 0 { | ||||
| 		t.Fatal("expected 0 items in returned json") | ||||
| 	} | ||||
| } | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue