Add timeout flag for container start times
This currently depends on a runc PR: https://github.com/opencontainers/runc/pull/703. We need that PR because we have to SIGKILL runc, which leaves the container root dir behind. As for the containerd changes, this adds a flag to containerd so that the start timeout can be configured without any further code changes. It also adds better handling of the error cases and will kill the containerd-shim and runc (as well as the user process, if it exists) if the timeout is hit. Signed-off-by: Michael Crosby <crosbymichael@gmail.com>
This commit is contained in:
parent
604c8d7832
commit
3742ae3ec8
6 changed files with 79 additions and 37 deletions
|
@ -59,6 +59,11 @@ var daemonFlags = []cli.Flag{
|
||||||
Name: "pprof-address",
|
Name: "pprof-address",
|
||||||
Usage: "http address to listen for pprof events",
|
Usage: "http address to listen for pprof events",
|
||||||
},
|
},
|
||||||
|
cli.DurationFlag{
|
||||||
|
Name: "start-timeout",
|
||||||
|
Value: 15 * time.Second,
|
||||||
|
Usage: "timeout duration for waiting on a container to start before it is killed",
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
@ -81,6 +86,7 @@ func main() {
|
||||||
10,
|
10,
|
||||||
context.String("runtime"),
|
context.String("runtime"),
|
||||||
context.StringSlice("runtime-args"),
|
context.StringSlice("runtime-args"),
|
||||||
|
context.Duration("start-timeout"),
|
||||||
); err != nil {
|
); err != nil {
|
||||||
logrus.Fatal(err)
|
logrus.Fatal(err)
|
||||||
}
|
}
|
||||||
|
@ -90,7 +96,7 @@ func main() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func daemon(address, stateDir string, concurrency int, runtimeName string, runtimeArgs []string) error {
|
func daemon(address, stateDir string, concurrency int, runtimeName string, runtimeArgs []string, timeout time.Duration) error {
|
||||||
// setup a standard reaper so that we don't leave any zombies if we are still alive
|
// setup a standard reaper so that we don't leave any zombies if we are still alive
|
||||||
// this is just good practice because we are spawning new processes
|
// this is just good practice because we are spawning new processes
|
||||||
s := make(chan os.Signal, 2048)
|
s := make(chan os.Signal, 2048)
|
||||||
|
@ -98,7 +104,7 @@ func daemon(address, stateDir string, concurrency int, runtimeName string, runti
|
||||||
if err := osutils.SetSubreaper(1); err != nil {
|
if err := osutils.SetSubreaper(1); err != nil {
|
||||||
logrus.WithField("error", err).Error("containerd: set subpreaper")
|
logrus.WithField("error", err).Error("containerd: set subpreaper")
|
||||||
}
|
}
|
||||||
sv, err := supervisor.New(stateDir, runtimeName, runtimeArgs)
|
sv, err := supervisor.New(stateDir, runtimeName, runtimeArgs, timeout)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,6 +7,7 @@ import (
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/Sirupsen/logrus"
|
"github.com/Sirupsen/logrus"
|
||||||
"github.com/docker/containerd/specs"
|
"github.com/docker/containerd/specs"
|
||||||
|
@ -90,6 +91,7 @@ type ContainerOpts struct {
|
||||||
RuntimeArgs []string
|
RuntimeArgs []string
|
||||||
Labels []string
|
Labels []string
|
||||||
NoPivotRoot bool
|
NoPivotRoot bool
|
||||||
|
Timeout time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
// New returns a new container
|
// New returns a new container
|
||||||
|
@ -103,6 +105,7 @@ func New(opts ContainerOpts) (Container, error) {
|
||||||
runtime: opts.Runtime,
|
runtime: opts.Runtime,
|
||||||
runtimeArgs: opts.RuntimeArgs,
|
runtimeArgs: opts.RuntimeArgs,
|
||||||
noPivotRoot: opts.NoPivotRoot,
|
noPivotRoot: opts.NoPivotRoot,
|
||||||
|
timeout: opts.Timeout,
|
||||||
}
|
}
|
||||||
if err := os.Mkdir(filepath.Join(c.root, c.id), 0755); err != nil {
|
if err := os.Mkdir(filepath.Join(c.root, c.id), 0755); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -191,6 +194,7 @@ type container struct {
|
||||||
labels []string
|
labels []string
|
||||||
oomFds []int
|
oomFds []int
|
||||||
noPivotRoot bool
|
noPivotRoot bool
|
||||||
|
timeout time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *container) ID() string {
|
func (c *container) ID() string {
|
||||||
|
@ -223,8 +227,9 @@ func (c *container) Delete() error {
|
||||||
|
|
||||||
args := c.runtimeArgs
|
args := c.runtimeArgs
|
||||||
args = append(args, "delete", c.id)
|
args = append(args, "delete", c.id)
|
||||||
exec.Command(c.runtime, args...).Run()
|
if derr := exec.Command(c.runtime, args...).Run(); err == nil {
|
||||||
|
err = derr
|
||||||
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -224,7 +224,7 @@ func (c *container) startCmd(pid string, cmd *exec.Cmd, p *process) error {
|
||||||
}
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if err := waitForStart(p, cmd); err != nil {
|
if err := c.waitForStart(p, cmd); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
c.processes[pid] = p
|
c.processes[pid] = p
|
||||||
|
@ -335,49 +335,76 @@ func (c *container) writeEventFD(root string, cfd, efd int) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func waitForStart(p *process, cmd *exec.Cmd) error {
|
type waitArgs struct {
|
||||||
for i := 0; i < 300; i++ {
|
pid int
|
||||||
|
err error
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *container) waitForStart(p *process, cmd *exec.Cmd) error {
|
||||||
|
wc := make(chan error, 1)
|
||||||
|
go func() {
|
||||||
|
for {
|
||||||
if _, err := p.getPidFromFile(); err != nil {
|
if _, err := p.getPidFromFile(); err != nil {
|
||||||
if os.IsNotExist(err) || err == errInvalidPidInt {
|
if os.IsNotExist(err) || err == errInvalidPidInt {
|
||||||
alive, err := isAlive(cmd)
|
alive, err := isAlive(cmd)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
wc <- err
|
||||||
|
return
|
||||||
}
|
}
|
||||||
if !alive {
|
if !alive {
|
||||||
// runc could have failed to run the container so lets get the error
|
// runc could have failed to run the container so lets get the error
|
||||||
// out of the logs or the shim could have encountered an error
|
// out of the logs or the shim could have encountered an error
|
||||||
messages, err := readLogMessages(filepath.Join(p.root, "shim-log.json"))
|
messages, err := readLogMessages(filepath.Join(p.root, "shim-log.json"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
wc <- err
|
||||||
|
return
|
||||||
}
|
}
|
||||||
for _, m := range messages {
|
for _, m := range messages {
|
||||||
if m.Level == "error" {
|
if m.Level == "error" {
|
||||||
return fmt.Errorf("shim error: %v", m.Msg)
|
wc <- fmt.Errorf("shim error: %v", m.Msg)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// no errors reported back from shim, check for runc/runtime errors
|
// no errors reported back from shim, check for runc/runtime errors
|
||||||
messages, err = readLogMessages(filepath.Join(p.root, "log.json"))
|
messages, err = readLogMessages(filepath.Join(p.root, "log.json"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if os.IsNotExist(err) {
|
if os.IsNotExist(err) {
|
||||||
return ErrContainerNotStarted
|
err = ErrContainerNotStarted
|
||||||
}
|
}
|
||||||
return err
|
wc <- err
|
||||||
|
return
|
||||||
}
|
}
|
||||||
for _, m := range messages {
|
for _, m := range messages {
|
||||||
if m.Level == "error" {
|
if m.Level == "error" {
|
||||||
return fmt.Errorf("oci runtime error: %v", m.Msg)
|
wc <- fmt.Errorf("oci runtime error: %v", m.Msg)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ErrContainerNotStarted
|
wc <- ErrContainerNotStarted
|
||||||
|
return
|
||||||
}
|
}
|
||||||
time.Sleep(50 * time.Millisecond)
|
time.Sleep(50 * time.Millisecond)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
wc <- err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// the pid file was read successfully
|
||||||
|
wc <- nil
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
select {
|
||||||
|
case err := <-wc:
|
||||||
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
|
case <-time.After(c.timeout):
|
||||||
|
cmd.Process.Kill()
|
||||||
|
cmd.Wait()
|
||||||
|
return ErrContainerStartTimeout
|
||||||
}
|
}
|
||||||
return errNoPidFile
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// isAlive checks if the shim that launched the container is still alive
|
// isAlive checks if the shim that launched the container is still alive
|
||||||
|
|
|
@ -17,6 +17,7 @@ var (
|
||||||
ErrProcessNotExited = errors.New("containerd: process has not exited")
|
ErrProcessNotExited = errors.New("containerd: process has not exited")
|
||||||
ErrProcessExited = errors.New("containerd: process has exited")
|
ErrProcessExited = errors.New("containerd: process has exited")
|
||||||
ErrContainerNotStarted = errors.New("containerd: container not started")
|
ErrContainerNotStarted = errors.New("containerd: container not started")
|
||||||
|
ErrContainerStartTimeout = errors.New("containerd: container did not start before the specified timeout")
|
||||||
|
|
||||||
errNoPidFile = errors.New("containerd: no process pid file found")
|
errNoPidFile = errors.New("containerd: no process pid file found")
|
||||||
errInvalidPidInt = errors.New("containerd: process pid is invalid")
|
errInvalidPidInt = errors.New("containerd: process pid is invalid")
|
||||||
|
|
|
@ -29,6 +29,7 @@ func (s *Supervisor) start(t *StartTask) error {
|
||||||
RuntimeArgs: s.runtimeArgs,
|
RuntimeArgs: s.runtimeArgs,
|
||||||
Labels: t.Labels,
|
Labels: t.Labels,
|
||||||
NoPivotRoot: t.NoPivotRoot,
|
NoPivotRoot: t.NoPivotRoot,
|
||||||
|
Timeout: s.timeout,
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
|
|
@ -18,7 +18,7 @@ const (
|
||||||
)
|
)
|
||||||
|
|
||||||
// New returns an initialized Process supervisor.
|
// New returns an initialized Process supervisor.
|
||||||
func New(stateDir string, runtimeName string, runtimeArgs []string) (*Supervisor, error) {
|
func New(stateDir string, runtimeName string, runtimeArgs []string, timeout time.Duration) (*Supervisor, error) {
|
||||||
startTasks := make(chan *startTask, 10)
|
startTasks := make(chan *startTask, 10)
|
||||||
if err := os.MkdirAll(stateDir, 0755); err != nil {
|
if err := os.MkdirAll(stateDir, 0755); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -41,6 +41,7 @@ func New(stateDir string, runtimeName string, runtimeArgs []string) (*Supervisor
|
||||||
monitor: monitor,
|
monitor: monitor,
|
||||||
runtime: runtimeName,
|
runtime: runtimeName,
|
||||||
runtimeArgs: runtimeArgs,
|
runtimeArgs: runtimeArgs,
|
||||||
|
timeout: timeout,
|
||||||
}
|
}
|
||||||
if err := setupEventLog(s); err != nil {
|
if err := setupEventLog(s); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -118,6 +119,7 @@ type Supervisor struct {
|
||||||
tasks chan Task
|
tasks chan Task
|
||||||
monitor *Monitor
|
monitor *Monitor
|
||||||
eventLog []Event
|
eventLog []Event
|
||||||
|
timeout time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stop closes all startTasks and sends a SIGTERM to each container's pid1 then waits for they to
|
// Stop closes all startTasks and sends a SIGTERM to each container's pid1 then waits for they to
|
||||||
|
|
Loading…
Reference in a new issue