From 7482962b9d458ee78481e64926852bb14af2271b Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 6 Apr 2017 18:22:30 -0700 Subject: [PATCH] Handle restore with dead shim Add functionality for restoring containers after containerd dies and is restarted with terminated shims. This ensures that on restore, if a container no longer has a running shim, containerd will kill and cleanup the container. Signed-off-by: Michael Crosby --- linux/runtime.go | 54 +++++++++++++++++++++++++++++++++-- linux/shim.go | 3 -- runtime.go | 2 +- services/execution/service.go | 2 +- 4 files changed, 53 insertions(+), 8 deletions(-) diff --git a/linux/runtime.go b/linux/runtime.go index 39d62b5..75861ee 100644 --- a/linux/runtime.go +++ b/linux/runtime.go @@ -9,6 +9,7 @@ import ( "io/ioutil" "os" "path/filepath" + "syscall" "time" "github.com/containerd/containerd" @@ -17,6 +18,7 @@ import ( "github.com/containerd/containerd/api/types/mount" "github.com/containerd/containerd/log" "github.com/containerd/containerd/plugin" + runc "github.com/crosbymichael/go-runc" "golang.org/x/net/context" ) @@ -35,6 +37,8 @@ func init() { }) } +var _ = (containerd.Runtime)(&Runtime{}) + type Config struct { // Runtime is a path or name of an OCI runtime used by the shim Runtime string `toml:"runtime"` @@ -137,7 +141,7 @@ func (r *Runtime) Delete(ctx context.Context, c containerd.Container) (uint32, e return rsp.ExitStatus, r.deleteBundle(lc.id) } -func (r *Runtime) Containers() ([]containerd.Container, error) { +func (r *Runtime) Containers(ctx context.Context) ([]containerd.Container, error) { dir, err := ioutil.ReadDir(r.root) if err != nil { return nil, err @@ -147,9 +151,16 @@ func (r *Runtime) Containers() ([]containerd.Container, error) { if !fi.IsDir() { continue } - c, err := r.loadContainer(filepath.Join(r.root, fi.Name())) + id := fi.Name() + // TODO: optimize this if it is call frequently to list all containers + // i.e. 
don't reconnect to the shims every time + c, err := r.loadContainer(filepath.Join(r.root, id)) if err != nil { - return nil, err + log.G(ctx).WithError(err).Warnf("failed to load container %s", id) + // if we fail to load the container, connect to the shim, make sure if the shim has + // been killed and cleanup the resources still being held by the container + r.killContainer(ctx, id) + continue } o = append(o, c) } @@ -232,3 +243,40 @@ func (r *Runtime) loadContainer(path string) (*Container, error) { shim: s, }, nil } + +// killContainer is used whenever the runtime fails to connect to a shim (it died) +// and needs to cleanup the container resources in the underlying runtime (runc, etc...) +func (r *Runtime) killContainer(ctx context.Context, id string) { + log.G(ctx).Debug("terminating container after failed load") + runtime := &runc.Runc{ + // TODO: get Command provided for initial container creation + // Command: r.Runtime, + LogFormat: runc.JSON, + PdeathSignal: syscall.SIGKILL, + } + if err := runtime.Kill(ctx, id, int(syscall.SIGKILL), &runc.KillOpts{ + All: true, + }); err != nil { + log.G(ctx).WithError(err).Warnf("kill all processes for %s", id) + } + // it can take a while for the container to be killed so poll for the container's status + // until it is in a stopped state + status := "running" + for status != "stopped" { + c, err := runtime.State(ctx, id) + if err != nil { + break + } + status = c.Status + time.Sleep(10 * time.Millisecond) + } + if err := runtime.Delete(ctx, id); err != nil { + log.G(ctx).WithError(err).Warnf("delete container %s", id) + } + // try to unmount the rootfs if it was not held by an external shim + syscall.Unmount(filepath.Join(r.root, id, "rootfs"), 0) + // remove container bundle + if err := r.deleteBundle(id); err != nil { + log.G(ctx).WithError(err).Warnf("delete container bundle %s", id) + } +} diff --git a/linux/shim.go b/linux/shim.go index 2d62663..eb06d65 100644 --- a/linux/shim.go +++ b/linux/shim.go @@ -60,9 
+60,6 @@ func loadShim(path string, remote bool) (shim.ShimClient, error) { } socket := filepath.Join(path, "shim.sock") return connectShim(socket) - // TODO: failed to connect to the shim, check if it's alive - // - if it is kill it - // - in both case call runc killall and runc delete on the id } func connectShim(socket string) (shim.ShimClient, error) { diff --git a/runtime.go b/runtime.go index 797e985..7bb00c0 100644 --- a/runtime.go +++ b/runtime.go @@ -24,7 +24,7 @@ type Runtime interface { // Create creates a container with the provided id and options Create(ctx context.Context, id string, opts CreateOpts) (Container, error) // Containers returns all the current containers for the runtime - Containers() ([]Container, error) + Containers(context.Context) ([]Container, error) // Delete removes the container in the runtime Delete(context.Context, Container) (uint32, error) // Events returns events for the runtime and all containers created by the runtime diff --git a/services/execution/service.go b/services/execution/service.go index 0d8fa4a..7937916 100644 --- a/services/execution/service.go +++ b/services/execution/service.go @@ -49,7 +49,7 @@ func (s *Service) Register(server *grpc.Server) error { api.RegisterContainerServiceServer(server, s) // load all containers for _, r := range s.runtimes { - containers, err := r.Containers() + containers, err := r.Containers(context.Background()) if err != nil { return err }