Add a TCP health checker

Also, add timeout and status code parameters to the HTTP checker, and
remove the threshold parameter for the file checker.

Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>
This commit is contained in:
Aaron Lehmann 2015-08-19 17:57:18 -07:00
parent b9b9cafa8f
commit e8f088fea6
6 changed files with 279 additions and 46 deletions

View file

@ -181,9 +181,9 @@ type MailOptions struct {
To []string `yaml:"to,omitempty"` To []string `yaml:"to,omitempty"`
} }
// FileChecker is a type of entry in the checkers section for checking files // FileChecker is a type of entry in the health section for checking files.
type FileChecker struct { type FileChecker struct {
// Interval is the number of seconds in between checks // Interval is the duration in between checks
Interval time.Duration `yaml:"interval,omitempty"` Interval time.Duration `yaml:"interval,omitempty"`
// File is the path to check // File is the path to check
File string `yaml:"file,omitempty"` File string `yaml:"file,omitempty"`
@ -192,10 +192,13 @@ type FileChecker struct {
Threshold int `yaml:"threshold,omitempty"` Threshold int `yaml:"threshold,omitempty"`
} }
// HTTPChecker is a type of entry in the checkers section for checking HTTP // HTTPChecker is a type of entry in the health section for checking HTTP URIs.
// URIs
type HTTPChecker struct { type HTTPChecker struct {
// Interval is the number of seconds in between checks // Timeout is the duration to wait before timing out the HTTP request
Timeout time.Duration `yaml:"timeout,omitempty"`
// StatusCode is the expected status code
StatusCode int
// Interval is the duration in between checks
Interval time.Duration `yaml:"interval,omitempty"` Interval time.Duration `yaml:"interval,omitempty"`
// URI is the HTTP URI to check // URI is the HTTP URI to check
URI string `yaml:"uri,omitempty"` URI string `yaml:"uri,omitempty"`
@ -204,18 +207,33 @@ type HTTPChecker struct {
Threshold int `yaml:"threshold,omitempty"` Threshold int `yaml:"threshold,omitempty"`
} }
// TCPChecker is a type of entry in the health section for checking TCP servers.
type TCPChecker struct {
	// Timeout is the duration to wait before timing out the TCP connection.
	// It is read from the "timeout" key; it must not share the "interval"
	// key with the Interval field below.
	Timeout time.Duration `yaml:"timeout,omitempty"`
	// Interval is the duration in between checks
	Interval time.Duration `yaml:"interval,omitempty"`
	// Addr is the TCP address to check
	Addr string `yaml:"addr,omitempty"`
	// Threshold is the number of times a check must fail to trigger an
	// unhealthy state
	Threshold int `yaml:"threshold,omitempty"`
}
// Health provides the configuration section for health checks. // Health provides the configuration section for health checks.
type Health struct { type Health struct {
// FileChecker is a list of paths to check // FileCheckers is a list of paths to check
FileCheckers []FileChecker `yaml:"file,omitempty"` FileCheckers []FileChecker `yaml:"file,omitempty"`
// HTTPChecker is a list of URIs to check // HTTPCheckers is a list of URIs to check
HTTPCheckers []HTTPChecker `yaml:"http,omitempty"` HTTPCheckers []HTTPChecker `yaml:"http,omitempty"`
// TCPCheckers is a list of TCP addresses to check
TCPCheckers []TCPChecker `yaml:"tcp,omitempty"`
// StorageDriver configures a health check on the configured storage // StorageDriver configures a health check on the configured storage
// driver // driver
StorageDriver struct { StorageDriver struct {
// Enabled turns on the health check for the storage driver // Enabled turns on the health check for the storage driver
Enabled bool `yaml:"enabled,omitempty"` Enabled bool `yaml:"enabled,omitempty"`
// Interval is the number of seconds in between checks // Interval is the duration in between checks
Interval time.Duration `yaml:"interval,omitempty"` Interval time.Duration `yaml:"interval,omitempty"`
// Threshold is the number of times a check must fail to trigger an // Threshold is the number of times a check must fail to trigger an
// unhealthy state // unhealthy state

View file

@ -203,9 +203,15 @@ information about each option that appears later in this page.
file: file:
- file: /path/to/checked/file - file: /path/to/checked/file
interval: 10s interval: 10s
threshold: 3
http: http:
- uri: http://server.to.check/must/return/200 - uri: http://server.to.check/must/return/200
statuscode: 200
timeout: 3s
interval: 10s
threshold: 3
tcp:
- addr: redis-server.domain.com:6379
timeout: 3s
interval: 10s interval: 10s
threshold: 3 threshold: 3
@ -1611,17 +1617,23 @@ Configure the behavior of the Redis connection pool.
file: file:
- file: /path/to/checked/file - file: /path/to/checked/file
interval: 10s interval: 10s
threshold: 3
http: http:
- uri: http://server.to.check/must/return/200 - uri: http://server.to.check/must/return/200
statuscode: 200
timeout: 3s
interval: 10s
threshold: 3
tcp:
- addr: redis-server.domain.com:6379
timeout: 3s
interval: 10s interval: 10s
threshold: 3 threshold: 3
The health option is **optional**. It may contain preferences for a periodic The health option is **optional**. It may contain preferences for a periodic
health check on the storage driver's backend storage, and optional periodic health check on the storage driver's backend storage, and optional periodic
checks on local files and/or HTTP URIs. The results of the health checks are checks on local files, HTTP URIs, and/or TCP servers. The results of the health
available at /debug/health on the debug HTTP server if the debug HTTP server is checks are available at /debug/health on the debug HTTP server if the debug
enabled (see http section). HTTP server is enabled (see http section).
### storagedriver ### storagedriver
@ -1730,25 +1742,13 @@ The path to check for the existence of a file.
The default value is 10 seconds if this field is omitted. The default value is 10 seconds if this field is omitted.
</td> </td>
</tr> </tr>
<tr>
<td>
<code>threshold</code>
</td>
<td>
no
</td>
<td>
An integer specifying the number of times the check must fail before the
check triggers an unhealthy state. If this field is not specified, a
single failure will trigger an unhealthy state.
</td>
</tr>
</table> </table>
### http ### http
http is a list of HTTP URIs to be periodically checked with HEAD requests. If http is a list of HTTP URIs to be periodically checked with HEAD requests. If
a HEAD request returns a status code other than 200, the health check will fail. a HEAD request doesn't complete or returns an unexpected status code, the
health check will fail.
<table> <table>
<tr> <tr>
@ -1767,6 +1767,122 @@ a HEAD request returns a status code other than 200, the health check will fail.
The URI to check. The URI to check.
</td> </td>
</tr> </tr>
<tr>
<td>
<code>statuscode</code>
</td>
<td>
no
</td>
<td>
Expected status code from the HTTP URI. Defaults to 200.
</td>
</tr>
<tr>
<td>
<code>timeout</code>
</td>
<td>
no
</td>
<td>
The length of time to wait before timing out the HTTP request. This field
takes a positive integer and an optional suffix indicating the unit of
time. Possible units are:
<ul>
<li><code>ns</code> (nanoseconds)</li>
<li><code>us</code> (microseconds)</li>
<li><code>ms</code> (milliseconds)</li>
<li><code>s</code> (seconds)</li>
<li><code>m</code> (minutes)</li>
<li><code>h</code> (hours)</li>
</ul>
If you omit the suffix, the system interprets the value as nanoseconds.
</td>
</tr>
<tr>
<td>
<code>interval</code>
</td>
<td>
no
</td>
<td>
The length of time to wait between repetitions of the check. This field
takes a positive integer and an optional suffix indicating the unit of
time. Possible units are:
<ul>
<li><code>ns</code> (nanoseconds)</li>
<li><code>us</code> (microseconds)</li>
<li><code>ms</code> (milliseconds)</li>
<li><code>s</code> (seconds)</li>
<li><code>m</code> (minutes)</li>
<li><code>h</code> (hours)</li>
</ul>
If you omit the suffix, the system interprets the value as nanoseconds.
The default value is 10 seconds if this field is omitted.
</td>
</tr>
<tr>
<td>
<code>threshold</code>
</td>
<td>
no
</td>
<td>
An integer specifying the number of times the check must fail before the
check triggers an unhealthy state. If this field is not specified, a
single failure will trigger an unhealthy state.
</td>
</tr>
</table>
### tcp
tcp is a list of TCP addresses to be periodically checked with connection
attempts. The addresses must include port numbers. If a connection attempt
fails, the health check will fail.
<table>
<tr>
<th>Parameter</th>
<th>Required</th>
<th>Description</th>
</tr>
<tr>
<td>
<code>addr</code>
</td>
<td>
yes
</td>
<td>
The TCP address to connect to, including a port number.
</td>
</tr>
<tr>
<td>
<code>timeout</code>
</td>
<td>
no
</td>
<td>
The length of time to wait before timing out the TCP connection. This
field takes a positive integer and an optional suffix indicating the unit
of time. Possible units are:
<ul>
<li><code>ns</code> (nanoseconds)</li>
<li><code>us</code> (microseconds)</li>
<li><code>ms</code> (milliseconds)</li>
<li><code>s</code> (seconds)</li>
<li><code>m</code> (minutes)</li>
<li><code>h</code> (hours)</li>
</ul>
If you omit the suffix, the system interprets the value as nanoseconds.
</td>
</tr>
<tr> <tr>
<td> <td>
<code>interval</code> <code>interval</code>

View file

@ -2,15 +2,17 @@ package checks
import ( import (
"errors" "errors"
"net"
"net/http" "net/http"
"os" "os"
"strconv" "strconv"
"time"
"github.com/docker/distribution/health" "github.com/docker/distribution/health"
) )
// FileChecker checks the existence of a file and returns and error // FileChecker checks the existence of a file and returns an error
// if the file exists, taking the application out of rotation // if the file exists.
func FileChecker(f string) health.Checker { func FileChecker(f string) health.Checker {
return health.CheckFunc(func() error { return health.CheckFunc(func() error {
if _, err := os.Stat(f); err == nil { if _, err := os.Stat(f); err == nil {
@ -20,18 +22,32 @@ func FileChecker(f string) health.Checker {
}) })
} }
// HTTPChecker does a HEAD request and verifies if the HTTP status // HTTPChecker does a HEAD request and verifies that the HTTP status code
// code return is a 200, taking the application out of rotation if // returned matches statusCode.
// otherwise func HTTPChecker(r string, statusCode int, timeout time.Duration) health.Checker {
func HTTPChecker(r string) health.Checker {
return health.CheckFunc(func() error { return health.CheckFunc(func() error {
response, err := http.Head(r) client := http.Client{
Timeout: timeout,
}
response, err := client.Head(r)
if err != nil { if err != nil {
return errors.New("error while checking: " + r) return errors.New("error while checking: " + r)
} }
if response.StatusCode != http.StatusOK { if response.StatusCode != statusCode {
return errors.New("downstream service returned unexpected status: " + strconv.Itoa(response.StatusCode)) return errors.New("downstream service returned unexpected status: " + strconv.Itoa(response.StatusCode))
} }
return nil return nil
}) })
} }
// TCPChecker returns a health.Checker that dials addr over TCP, waiting at
// most timeout for the connection to be established. A successful dial is
// closed immediately and reported as healthy; a failed dial is reported as
// an error naming addr.
func TCPChecker(addr string, timeout time.Duration) health.Checker {
	probe := func() error {
		c, dialErr := net.DialTimeout("tcp", addr, timeout)
		if dialErr == nil {
			// Only reachability matters; the connection itself is not used.
			c.Close()
			return nil
		}
		return errors.New("connection to " + addr + " failed")
	}
	return health.CheckFunc(probe)
}

View file

@ -15,11 +15,11 @@ func TestFileChecker(t *testing.T) {
} }
func TestHTTPChecker(t *testing.T) { func TestHTTPChecker(t *testing.T) {
if err := HTTPChecker("https://www.google.cybertron").Check(); err == nil { if err := HTTPChecker("https://www.google.cybertron", 200, 0).Check(); err == nil {
t.Errorf("Google on Cybertron was expected as not exists") t.Errorf("Google on Cybertron was expected as not exists")
} }
if err := HTTPChecker("https://www.google.pt").Check(); err != nil { if err := HTTPChecker("https://www.google.pt", 200, 0).Check(); err != nil {
t.Errorf("Google at Portugal was expected as exists, error:%v", err) t.Errorf("Google at Portugal was expected as exists, error:%v", err)
} }
} }

View file

@ -266,26 +266,46 @@ func (app *App) RegisterHealthChecks(healthRegistries ...*health.Registry) {
if interval == 0 { if interval == 0 {
interval = defaultCheckInterval interval = defaultCheckInterval
} }
if fileChecker.Threshold != 0 {
ctxu.GetLogger(app).Infof("configuring file health check path=%s, interval=%d, threshold=%d", fileChecker.File, interval/time.Second, fileChecker.Threshold)
healthRegistry.Register(fileChecker.File, health.PeriodicThresholdChecker(checks.FileChecker(fileChecker.File), interval, fileChecker.Threshold))
} else {
ctxu.GetLogger(app).Infof("configuring file health check path=%s, interval=%d", fileChecker.File, interval/time.Second) ctxu.GetLogger(app).Infof("configuring file health check path=%s, interval=%d", fileChecker.File, interval/time.Second)
healthRegistry.Register(fileChecker.File, health.PeriodicChecker(checks.FileChecker(fileChecker.File), interval)) healthRegistry.Register(fileChecker.File, health.PeriodicChecker(checks.FileChecker(fileChecker.File), interval))
} }
}
for _, httpChecker := range app.Config.Health.HTTPCheckers { for _, httpChecker := range app.Config.Health.HTTPCheckers {
interval := httpChecker.Interval interval := httpChecker.Interval
if interval == 0 { if interval == 0 {
interval = defaultCheckInterval interval = defaultCheckInterval
} }
statusCode := httpChecker.StatusCode
if statusCode == 0 {
statusCode = 200
}
checker := checks.HTTPChecker(httpChecker.URI, statusCode, httpChecker.Timeout)
if httpChecker.Threshold != 0 { if httpChecker.Threshold != 0 {
ctxu.GetLogger(app).Infof("configuring HTTP health check uri=%s, interval=%d, threshold=%d", httpChecker.URI, interval/time.Second, httpChecker.Threshold) ctxu.GetLogger(app).Infof("configuring HTTP health check uri=%s, interval=%d, threshold=%d", httpChecker.URI, interval/time.Second, httpChecker.Threshold)
healthRegistry.Register(httpChecker.URI, health.PeriodicThresholdChecker(checks.HTTPChecker(httpChecker.URI), interval, httpChecker.Threshold)) healthRegistry.Register(httpChecker.URI, health.PeriodicThresholdChecker(checker, interval, httpChecker.Threshold))
} else { } else {
ctxu.GetLogger(app).Infof("configuring HTTP health check uri=%s, interval=%d", httpChecker.URI, interval/time.Second) ctxu.GetLogger(app).Infof("configuring HTTP health check uri=%s, interval=%d", httpChecker.URI, interval/time.Second)
healthRegistry.Register(httpChecker.URI, health.PeriodicChecker(checks.HTTPChecker(httpChecker.URI), interval)) healthRegistry.Register(httpChecker.URI, health.PeriodicChecker(checker, interval))
}
}
for _, tcpChecker := range app.Config.Health.TCPCheckers {
interval := tcpChecker.Interval
if interval == 0 {
interval = defaultCheckInterval
}
checker := checks.TCPChecker(tcpChecker.Addr, tcpChecker.Timeout)
if tcpChecker.Threshold != 0 {
ctxu.GetLogger(app).Infof("configuring TCP health check addr=%s, interval=%d, threshold=%d", tcpChecker.Addr, interval/time.Second, tcpChecker.Threshold)
healthRegistry.Register(tcpChecker.Addr, health.PeriodicThresholdChecker(checker, interval, tcpChecker.Threshold))
} else {
ctxu.GetLogger(app).Infof("configuring TCP health check addr=%s, interval=%d", tcpChecker.Addr, interval/time.Second)
healthRegistry.Register(tcpChecker.Addr, health.PeriodicChecker(checker, interval))
} }
} }
} }

View file

@ -2,6 +2,7 @@ package handlers
import ( import (
"io/ioutil" "io/ioutil"
"net"
"net/http" "net/http"
"net/http/httptest" "net/http/httptest"
"os" "os"
@ -61,6 +62,68 @@ func TestFileHealthCheck(t *testing.T) {
} }
} }
// TestTCPHealthCheck registers a TCP health check against a local listener.
// It expects no failing checks while the listener is accepting connections,
// and exactly one failing check, keyed by the listener address, after the
// listener has been closed.
func TestTCPHealthCheck(t *testing.T) {
	interval := time.Second

	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("could not create listener: %v", err)
	}
	addrStr := ln.Addr().String()

	// Start accepting
	go func() {
		for {
			conn, err := ln.Accept()
			if err != nil {
				// listener was closed
				return
			}
			// Close right away. A defer inside this loop would only run
			// when the goroutine returns, so every accepted connection
			// would stay open until the listener is closed.
			conn.Close()
		}
	}()

	config := configuration.Configuration{
		Storage: configuration.Storage{
			"inmemory": configuration.Parameters{},
		},
		Health: configuration.Health{
			TCPCheckers: []configuration.TCPChecker{
				{
					Interval: interval,
					Addr:     addrStr,
					Timeout:  500 * time.Millisecond,
				},
			},
		},
	}

	ctx := context.Background()

	app := NewApp(ctx, config)
	healthRegistry := health.NewRegistry()
	app.RegisterHealthChecks(healthRegistry)

	// Wait for health check to happen
	<-time.After(2 * interval)

	if len(healthRegistry.CheckStatus()) != 0 {
		t.Fatal("expected 0 items in health check results")
	}

	// Closing the listener makes subsequent dials to addrStr fail.
	ln.Close()
	<-time.After(2 * interval)

	// Health check should now fail
	status := healthRegistry.CheckStatus()
	if len(status) != 1 {
		t.Fatal("expected 1 item in health check results")
	}
	if status[addrStr] != "connection to "+addrStr+" failed" {
		t.Fatal(`did not get "connection failed" result for health check`)
	}
}
func TestHTTPHealthCheck(t *testing.T) { func TestHTTPHealthCheck(t *testing.T) {
interval := time.Second interval := time.Second
threshold := 3 threshold := 3