Handle restore with dead shim

Add functionality for restoring containers after containerd dies and is
restarted with terminated shims.

This ensures that on restore, if a container no longer has a running
shim, containerd will kill and clean up the container.

Signed-off-by: Michael Crosby <crosbymichael@gmail.com>
This commit is contained in:
Michael Crosby 2017-04-06 18:22:30 -07:00
parent c1325a5aa9
commit 7482962b9d
4 changed files with 53 additions and 8 deletions

View file

@ -9,6 +9,7 @@ import (
"io/ioutil"
"os"
"path/filepath"
"syscall"
"time"
"github.com/containerd/containerd"
@ -17,6 +18,7 @@ import (
"github.com/containerd/containerd/api/types/mount"
"github.com/containerd/containerd/log"
"github.com/containerd/containerd/plugin"
runc "github.com/crosbymichael/go-runc"
"golang.org/x/net/context"
)
@ -35,6 +37,8 @@ func init() {
})
}
var _ = (containerd.Runtime)(&Runtime{})
type Config struct {
// Runtime is a path or name of an OCI runtime used by the shim
Runtime string `toml:"runtime"`
@ -137,7 +141,7 @@ func (r *Runtime) Delete(ctx context.Context, c containerd.Container) (uint32, e
return rsp.ExitStatus, r.deleteBundle(lc.id)
}
func (r *Runtime) Containers() ([]containerd.Container, error) {
func (r *Runtime) Containers(ctx context.Context) ([]containerd.Container, error) {
dir, err := ioutil.ReadDir(r.root)
if err != nil {
return nil, err
@ -147,9 +151,16 @@ func (r *Runtime) Containers() ([]containerd.Container, error) {
if !fi.IsDir() {
continue
}
c, err := r.loadContainer(filepath.Join(r.root, fi.Name()))
id := fi.Name()
// TODO: optimize this if it is called frequently to list all containers
// i.e. don't reconnect to the shims every time
c, err := r.loadContainer(filepath.Join(r.root, id))
if err != nil {
return nil, err
log.G(ctx).WithError(err).Warnf("failed to load container %s", id)
// if we fail to load the container or connect to its shim, assume the shim has
// been killed and clean up the resources still being held by the container
r.killContainer(ctx, id)
continue
}
o = append(o, c)
}
@ -232,3 +243,40 @@ func (r *Runtime) loadContainer(path string) (*Container, error) {
shim: s,
}, nil
}
// killContainer is used whenever the runtime fails to connect to a shim (it died)
// and needs to cleanup the container resources in the underlying runtime (runc, etc...)
//
// It performs the following steps, each of them best effort (failures are
// logged and the remaining steps still run):
//   - send SIGKILL to all processes of the container
//   - poll the runtime until it reports the container as "stopped", the state
//     can no longer be queried, or ctx is done
//   - delete the container from the runtime
//   - unmount the container's rootfs and remove its bundle directory
func (r *Runtime) killContainer(ctx context.Context, id string) {
	log.G(ctx).Debug("terminating container after failed load")
	runtime := &runc.Runc{
		// TODO: get Command provided for initial container creation
		//	Command:      r.Runtime,
		LogFormat:    runc.JSON,
		PdeathSignal: syscall.SIGKILL,
	}
	if err := runtime.Kill(ctx, id, int(syscall.SIGKILL), &runc.KillOpts{
		All: true,
	}); err != nil {
		log.G(ctx).WithError(err).Warnf("kill all processes for %s", id)
	}
	// it can take a while for the container to be killed so poll for the container's status
	// until it is in a stopped state
poll:
	for {
		c, err := runtime.State(ctx, id)
		if err != nil || c.Status == "stopped" {
			// either the runtime no longer knows the container or it has
			// stopped; there is nothing left to wait for
			break
		}
		select {
		case <-ctx.Done():
			// do not spin forever when the caller gave up; carry on with the
			// remaining cleanup steps anyway
			log.G(ctx).WithError(ctx.Err()).Warnf("wait for container %s to stop", id)
			break poll
		case <-time.After(10 * time.Millisecond):
		}
	}
	if err := runtime.Delete(ctx, id); err != nil {
		log.G(ctx).WithError(err).Warnf("delete container %s", id)
	}
	// try to unmount the rootfs if it was not held by an external shim; the
	// error is deliberately ignored because the rootfs may not be mounted at all
	syscall.Unmount(filepath.Join(r.root, id, "rootfs"), 0)
	// remove container bundle
	if err := r.deleteBundle(id); err != nil {
		log.G(ctx).WithError(err).Warnf("delete container bundle %s", id)
	}
}

View file

@ -60,9 +60,6 @@ func loadShim(path string, remote bool) (shim.ShimClient, error) {
}
socket := filepath.Join(path, "shim.sock")
return connectShim(socket)
// TODO: failed to connect to the shim, check if it's alive
// - if it is kill it
// - in both case call runc killall and runc delete on the id
}
func connectShim(socket string) (shim.ShimClient, error) {

View file

@ -24,7 +24,7 @@ type Runtime interface {
// Create creates a container with the provided id and options
Create(ctx context.Context, id string, opts CreateOpts) (Container, error)
// Containers returns all the current containers for the runtime
Containers() ([]Container, error)
Containers(context.Context) ([]Container, error)
// Delete removes the container in the runtime
Delete(context.Context, Container) (uint32, error)
// Events returns events for the runtime and all containers created by the runtime

View file

@ -49,7 +49,7 @@ func (s *Service) Register(server *grpc.Server) error {
api.RegisterContainerServiceServer(server, s)
// load all containers
for _, r := range s.runtimes {
containers, err := r.Containers()
containers, err := r.Containers(context.Background())
if err != nil {
return err
}