Handle restore with dead shim
Add functionality for restoring containers after containerd dies and is restarted with terminated shims. This ensures that on restore, if a container no longer has a running shim, containerd will kill and cleanup the container. Signed-off-by: Michael Crosby <crosbymichael@gmail.com>
This commit is contained in:
parent
c1325a5aa9
commit
7482962b9d
4 changed files with 53 additions and 8 deletions
|
@ -9,6 +9,7 @@ import (
|
|||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/containerd/containerd"
|
||||
|
@ -17,6 +18,7 @@ import (
|
|||
"github.com/containerd/containerd/api/types/mount"
|
||||
"github.com/containerd/containerd/log"
|
||||
"github.com/containerd/containerd/plugin"
|
||||
runc "github.com/crosbymichael/go-runc"
|
||||
|
||||
"golang.org/x/net/context"
|
||||
)
|
||||
|
@ -35,6 +37,8 @@ func init() {
|
|||
})
|
||||
}
|
||||
|
||||
var _ = (containerd.Runtime)(&Runtime{})
|
||||
|
||||
type Config struct {
|
||||
// Runtime is a path or name of an OCI runtime used by the shim
|
||||
Runtime string `toml:"runtime"`
|
||||
|
@ -137,7 +141,7 @@ func (r *Runtime) Delete(ctx context.Context, c containerd.Container) (uint32, e
|
|||
return rsp.ExitStatus, r.deleteBundle(lc.id)
|
||||
}
|
||||
|
||||
func (r *Runtime) Containers() ([]containerd.Container, error) {
|
||||
func (r *Runtime) Containers(ctx context.Context) ([]containerd.Container, error) {
|
||||
dir, err := ioutil.ReadDir(r.root)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -147,9 +151,16 @@ func (r *Runtime) Containers() ([]containerd.Container, error) {
|
|||
if !fi.IsDir() {
|
||||
continue
|
||||
}
|
||||
c, err := r.loadContainer(filepath.Join(r.root, fi.Name()))
|
||||
id := fi.Name()
|
||||
// TODO: optimize this if it is called frequently to list all containers
|
||||
// i.e. don't reconnect to the shims every time
|
||||
c, err := r.loadContainer(filepath.Join(r.root, id))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
log.G(ctx).WithError(err).Warnf("failed to load container %s", id)
|
||||
// if we fail to load the container (e.g. we cannot connect to its shim), assume the shim has
|
||||
// been killed and clean up the resources still being held by the container
|
||||
r.killContainer(ctx, id)
|
||||
continue
|
||||
}
|
||||
o = append(o, c)
|
||||
}
|
||||
|
@ -232,3 +243,40 @@ func (r *Runtime) loadContainer(path string) (*Container, error) {
|
|||
shim: s,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// killContainer is used whenever the runtime fails to connect to a shim (it died)
|
||||
// and needs to cleanup the container resources in the underlying runtime (runc, etc...)
|
||||
func (r *Runtime) killContainer(ctx context.Context, id string) {
|
||||
log.G(ctx).Debug("terminating container after failed load")
|
||||
runtime := &runc.Runc{
|
||||
// TODO: get Command provided for initial container creation
|
||||
// Command: r.Runtime,
|
||||
LogFormat: runc.JSON,
|
||||
PdeathSignal: syscall.SIGKILL,
|
||||
}
|
||||
if err := runtime.Kill(ctx, id, int(syscall.SIGKILL), &runc.KillOpts{
|
||||
All: true,
|
||||
}); err != nil {
|
||||
log.G(ctx).WithError(err).Warnf("kill all processes for %s", id)
|
||||
}
|
||||
// it can take a while for the container to be killed so poll for the container's status
|
||||
// until it is in a stopped state
|
||||
status := "running"
|
||||
for status != "stopped" {
|
||||
c, err := runtime.State(ctx, id)
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
status = c.Status
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
if err := runtime.Delete(ctx, id); err != nil {
|
||||
log.G(ctx).WithError(err).Warnf("delete container %s", id)
|
||||
}
|
||||
// try to unmount the rootfs is it was not held by an external shim
|
||||
syscall.Unmount(filepath.Join(r.root, id, "rootfs"), 0)
|
||||
// remove container bundle
|
||||
if err := r.deleteBundle(id); err != nil {
|
||||
log.G(ctx).WithError(err).Warnf("delete container bundle %s", id)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -60,9 +60,6 @@ func loadShim(path string, remote bool) (shim.ShimClient, error) {
|
|||
}
|
||||
socket := filepath.Join(path, "shim.sock")
|
||||
return connectShim(socket)
|
||||
// TODO: failed to connect to the shim, check if it's alive
|
||||
// - if it is kill it
|
||||
// - in both case call runc killall and runc delete on the id
|
||||
}
|
||||
|
||||
func connectShim(socket string) (shim.ShimClient, error) {
|
||||
|
|
|
@ -24,7 +24,7 @@ type Runtime interface {
|
|||
// Create creates a container with the provided id and options
|
||||
Create(ctx context.Context, id string, opts CreateOpts) (Container, error)
|
||||
// Containers returns all the current containers for the runtime
|
||||
Containers() ([]Container, error)
|
||||
Containers(context.Context) ([]Container, error)
|
||||
// Delete removes the container in the runtime
|
||||
Delete(context.Context, Container) (uint32, error)
|
||||
// Events returns events for the runtime and all containers created by the runtime
|
||||
|
|
|
@ -49,7 +49,7 @@ func (s *Service) Register(server *grpc.Server) error {
|
|||
api.RegisterContainerServiceServer(server, s)
|
||||
// load all containers
|
||||
for _, r := range s.runtimes {
|
||||
containers, err := r.Containers()
|
||||
containers, err := r.Containers(context.Background())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue