From 3742ae3ec888da95554edc7705272bc42d193680 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 28 Mar 2016 13:30:37 -0700 Subject: [PATCH] Add timeout flag for container start times This currently depends on a runc PR: https://github.com/opencontainers/runc/pull/703 We need this pr because we have to SIGKILL runc and the container root dir will still be left around. As for the containerd changes this adds a flag to containerd so that you can configure the timeout without any more code changes. It also adds better handling in the error cases and will kill the containerd-shim and runc ( as well as the user process if it exists ) if the timeout is hit. Signed-off-by: Michael Crosby --- containerd/main.go | 10 ++++- runtime/container.go | 9 +++- runtime/container_linux.go | 91 ++++++++++++++++++++++++-------------- runtime/runtime.go | 1 + supervisor/create.go | 1 + supervisor/supervisor.go | 4 +- 6 files changed, 79 insertions(+), 37 deletions(-) diff --git a/containerd/main.go b/containerd/main.go index ee2931b..6f238bb 100644 --- a/containerd/main.go +++ b/containerd/main.go @@ -59,6 +59,11 @@ var daemonFlags = []cli.Flag{ Name: "pprof-address", Usage: "http address to listen for pprof events", }, + cli.DurationFlag{ + Name: "start-timeout", + Value: 15 * time.Second, + Usage: "timeout duration for waiting on a container to start before it is killed", + }, } func main() { @@ -81,6 +86,7 @@ func main() { 10, context.String("runtime"), context.StringSlice("runtime-args"), + context.Duration("start-timeout"), ); err != nil { logrus.Fatal(err) } @@ -90,7 +96,7 @@ func main() { } } -func daemon(address, stateDir string, concurrency int, runtimeName string, runtimeArgs []string) error { +func daemon(address, stateDir string, concurrency int, runtimeName string, runtimeArgs []string, timeout time.Duration) error { // setup a standard reaper so that we don't leave any zombies if we are still alive // this is just good practice because we are spawning new processes s := make(chan os.Signal, 2048) @@ -98,7 +104,7 @@ func daemon(address, stateDir string, concurrency int, runtimeName string, runti if err := osutils.SetSubreaper(1); err != nil { logrus.WithField("error", err).Error("containerd: set subpreaper") } - sv, err := supervisor.New(stateDir, runtimeName, runtimeArgs) + sv, err := supervisor.New(stateDir, runtimeName, runtimeArgs, timeout) if err != nil { return err } diff --git a/runtime/container.go b/runtime/container.go index 1fa4218..0d38ee9 100644 --- a/runtime/container.go +++ b/runtime/container.go @@ -7,6 +7,7 @@ import ( "os" "os/exec" "path/filepath" + "time" "github.com/Sirupsen/logrus" "github.com/docker/containerd/specs" @@ -90,6 +91,7 @@ type ContainerOpts struct { RuntimeArgs []string Labels []string NoPivotRoot bool + Timeout time.Duration } // New returns a new container @@ -103,6 +105,7 @@ func New(opts ContainerOpts) (Container, error) { runtime: opts.Runtime, runtimeArgs: opts.RuntimeArgs, noPivotRoot: opts.NoPivotRoot, + timeout: opts.Timeout, } if err := os.Mkdir(filepath.Join(c.root, c.id), 0755); err != nil { return nil, err @@ -191,6 +194,7 @@ type container struct { labels []string oomFds []int noPivotRoot bool + timeout time.Duration } func (c *container) ID() string { @@ -223,8 +227,9 @@ func (c *container) Delete() error { args := c.runtimeArgs args = append(args, "delete", c.id) - exec.Command(c.runtime, args...).Run() - + if derr := exec.Command(c.runtime, args...).Run(); err == nil { + err = derr + } return err } diff --git a/runtime/container_linux.go b/runtime/container_linux.go index 2969076..721b4b9 100644 --- a/runtime/container_linux.go +++ b/runtime/container_linux.go @@ -224,7 +224,7 @@ func (c *container) startCmd(pid string, cmd *exec.Cmd, p *process) error { } return err } - if err := waitForStart(p, cmd); err != nil { + if err := c.waitForStart(p, cmd); err != nil { return err } c.processes[pid] = p @@ -335,49 +335,76 @@ func (c *container) writeEventFD(root string, cfd, efd int) error { return err } -func waitForStart(p *process, cmd *exec.Cmd) error { - for i := 0; i < 300; i++ { - if _, err := p.getPidFromFile(); err != nil { - if os.IsNotExist(err) || err == errInvalidPidInt { - alive, err := isAlive(cmd) - if err != nil { - return err - } - if !alive { - // runc could have failed to run the container so lets get the error - // out of the logs or the shim could have encountered an error - messages, err := readLogMessages(filepath.Join(p.root, "shim-log.json")) +type waitArgs struct { + pid int + err error +} + +func (c *container) waitForStart(p *process, cmd *exec.Cmd) error { + wc := make(chan error, 1) + go func() { + for { + if _, err := p.getPidFromFile(); err != nil { + if os.IsNotExist(err) || err == errInvalidPidInt { + alive, err := isAlive(cmd) if err != nil { - return err + wc <- err + return } - for _, m := range messages { - if m.Level == "error" { - return fmt.Errorf("shim error: %v", m.Msg) + if !alive { + // runc could have failed to run the container so lets get the error + // out of the logs or the shim could have encountered an error + messages, err := readLogMessages(filepath.Join(p.root, "shim-log.json")) + if err != nil { + wc <- err + return } - } - // no errors reported back from shim, check for runc/runtime errors - messages, err = readLogMessages(filepath.Join(p.root, "log.json")) - if err != nil { - if os.IsNotExist(err) { - return ErrContainerNotStarted + for _, m := range messages { + if m.Level == "error" { + wc <- fmt.Errorf("shim error: %v", m.Msg) + return + } } - return err - } - for _, m := range messages { - if m.Level == "error" { - return fmt.Errorf("oci runtime error: %v", m.Msg) + // no errors reported back from shim, check for runc/runtime errors + messages, err = readLogMessages(filepath.Join(p.root, "log.json")) + if err != nil { + if os.IsNotExist(err) { + err = ErrContainerNotStarted + } + wc <- err + return } + for _, m := range messages { + if m.Level == "error" { + wc <- fmt.Errorf("oci runtime error: %v", m.Msg) + return + } + } + wc <- ErrContainerNotStarted + return } - return ErrContainerNotStarted + time.Sleep(50 * time.Millisecond) + continue } - time.Sleep(50 * time.Millisecond) - continue + wc <- err + return } + // the pid file was read successfully + wc <- nil + return + } + }() + select { + case err := <-wc: + if err != nil { return err } return nil + case <-time.After(c.timeout): + cmd.Process.Kill() + cmd.Wait() + return ErrContainerStartTimeout } - return errNoPidFile } // isAlive checks if the shim that launched the container is still alive diff --git a/runtime/runtime.go b/runtime/runtime.go index b37a401..e6d2060 100644 --- a/runtime/runtime.go +++ b/runtime/runtime.go @@ -17,6 +17,7 @@ var ( ErrProcessNotExited = errors.New("containerd: process has not exited") ErrProcessExited = errors.New("containerd: process has exited") ErrContainerNotStarted = errors.New("containerd: container not started") + ErrContainerStartTimeout = errors.New("containerd: container did not start before the specified timeout") errNoPidFile = errors.New("containerd: no process pid file found") errInvalidPidInt = errors.New("containerd: process pid is invalid") diff --git a/supervisor/create.go b/supervisor/create.go index f173e6e..7561fa7 100644 --- a/supervisor/create.go +++ b/supervisor/create.go @@ -29,6 +29,7 @@ func (s *Supervisor) start(t *StartTask) error { RuntimeArgs: s.runtimeArgs, Labels: t.Labels, NoPivotRoot: t.NoPivotRoot, + Timeout: s.timeout, }) if err != nil { return err diff --git a/supervisor/supervisor.go b/supervisor/supervisor.go index c4e3dcc..c642a18 100644 --- a/supervisor/supervisor.go +++ b/supervisor/supervisor.go @@ -18,7 +18,7 @@ const ( ) // New returns an initialized Process supervisor. -func New(stateDir string, runtimeName string, runtimeArgs []string) (*Supervisor, error) { +func New(stateDir string, runtimeName string, runtimeArgs []string, timeout time.Duration) (*Supervisor, error) { startTasks := make(chan *startTask, 10) if err := os.MkdirAll(stateDir, 0755); err != nil { return nil, err @@ -41,6 +41,7 @@ func New(stateDir string, runtimeName string, runtimeArgs []string) (*Supervisor monitor: monitor, runtime: runtimeName, runtimeArgs: runtimeArgs, + timeout: timeout, } if err := setupEventLog(s); err != nil { return nil, err @@ -118,6 +119,7 @@ type Supervisor struct { tasks chan Task monitor *Monitor eventLog []Event + timeout time.Duration } // Stop closes all startTasks and sends a SIGTERM to each container's pid1 then waits for they to