Handle restore with dead shim
Add functionality for restoring containers after containerd dies and is restarted with terminated shims. This ensures that on restore, if a container no longer has a running shim, containerd will kill and clean up the container. Signed-off-by: Michael Crosby <crosbymichael@gmail.com>
This commit is contained in:
parent
c1325a5aa9
commit
7482962b9d
4 changed files with 53 additions and 8 deletions
|
@ -9,6 +9,7 @@ import (
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/containerd/containerd"
|
"github.com/containerd/containerd"
|
||||||
|
@ -17,6 +18,7 @@ import (
|
||||||
"github.com/containerd/containerd/api/types/mount"
|
"github.com/containerd/containerd/api/types/mount"
|
||||||
"github.com/containerd/containerd/log"
|
"github.com/containerd/containerd/log"
|
||||||
"github.com/containerd/containerd/plugin"
|
"github.com/containerd/containerd/plugin"
|
||||||
|
runc "github.com/crosbymichael/go-runc"
|
||||||
|
|
||||||
"golang.org/x/net/context"
|
"golang.org/x/net/context"
|
||||||
)
|
)
|
||||||
|
@ -35,6 +37,8 @@ func init() {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var _ = (containerd.Runtime)(&Runtime{})
|
||||||
|
|
||||||
type Config struct {
|
type Config struct {
|
||||||
// Runtime is a path or name of an OCI runtime used by the shim
|
// Runtime is a path or name of an OCI runtime used by the shim
|
||||||
Runtime string `toml:"runtime"`
|
Runtime string `toml:"runtime"`
|
||||||
|
@ -137,7 +141,7 @@ func (r *Runtime) Delete(ctx context.Context, c containerd.Container) (uint32, e
|
||||||
return rsp.ExitStatus, r.deleteBundle(lc.id)
|
return rsp.ExitStatus, r.deleteBundle(lc.id)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *Runtime) Containers() ([]containerd.Container, error) {
|
func (r *Runtime) Containers(ctx context.Context) ([]containerd.Container, error) {
|
||||||
dir, err := ioutil.ReadDir(r.root)
|
dir, err := ioutil.ReadDir(r.root)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -147,9 +151,16 @@ func (r *Runtime) Containers() ([]containerd.Container, error) {
|
||||||
if !fi.IsDir() {
|
if !fi.IsDir() {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
c, err := r.loadContainer(filepath.Join(r.root, fi.Name()))
|
id := fi.Name()
|
||||||
|
// TODO: optimize this if it is called frequently to list all containers
|
||||||
|
// i.e. don't reconnect to the shims every time
|
||||||
|
c, err := r.loadContainer(filepath.Join(r.root, id))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
log.G(ctx).WithError(err).Warnf("failed to load container %s", id)
|
||||||
|
// if we fail to load the container (i.e. we cannot connect to the shim), assume the shim has
|
||||||
|
// been killed and clean up the resources still being held by the container
|
||||||
|
r.killContainer(ctx, id)
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
o = append(o, c)
|
o = append(o, c)
|
||||||
}
|
}
|
||||||
|
@ -232,3 +243,40 @@ func (r *Runtime) loadContainer(path string) (*Container, error) {
|
||||||
shim: s,
|
shim: s,
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// killContainer is used whenever the runtime fails to connect to a shim (it died)
|
||||||
|
// and needs to clean up the container resources in the underlying runtime (runc, etc...)
|
||||||
|
func (r *Runtime) killContainer(ctx context.Context, id string) {
|
||||||
|
log.G(ctx).Debug("terminating container after failed load")
|
||||||
|
runtime := &runc.Runc{
|
||||||
|
// TODO: get Command provided for initial container creation
|
||||||
|
// Command: r.Runtime,
|
||||||
|
LogFormat: runc.JSON,
|
||||||
|
PdeathSignal: syscall.SIGKILL,
|
||||||
|
}
|
||||||
|
if err := runtime.Kill(ctx, id, int(syscall.SIGKILL), &runc.KillOpts{
|
||||||
|
All: true,
|
||||||
|
}); err != nil {
|
||||||
|
log.G(ctx).WithError(err).Warnf("kill all processes for %s", id)
|
||||||
|
}
|
||||||
|
// it can take a while for the container to be killed so poll for the container's status
|
||||||
|
// until it is in a stopped state
|
||||||
|
status := "running"
|
||||||
|
for status != "stopped" {
|
||||||
|
c, err := runtime.State(ctx, id)
|
||||||
|
if err != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
status = c.Status
|
||||||
|
time.Sleep(10 * time.Millisecond)
|
||||||
|
}
|
||||||
|
if err := runtime.Delete(ctx, id); err != nil {
|
||||||
|
log.G(ctx).WithError(err).Warnf("delete container %s", id)
|
||||||
|
}
|
||||||
|
// try to unmount the rootfs if it was not held by an external shim
|
||||||
|
syscall.Unmount(filepath.Join(r.root, id, "rootfs"), 0)
|
||||||
|
// remove container bundle
|
||||||
|
if err := r.deleteBundle(id); err != nil {
|
||||||
|
log.G(ctx).WithError(err).Warnf("delete container bundle %s", id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -60,9 +60,6 @@ func loadShim(path string, remote bool) (shim.ShimClient, error) {
|
||||||
}
|
}
|
||||||
socket := filepath.Join(path, "shim.sock")
|
socket := filepath.Join(path, "shim.sock")
|
||||||
return connectShim(socket)
|
return connectShim(socket)
|
||||||
// TODO: failed to connect to the shim, check if it's alive
|
|
||||||
// - if it is kill it
|
|
||||||
// - in both cases call runc killall and runc delete on the id
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func connectShim(socket string) (shim.ShimClient, error) {
|
func connectShim(socket string) (shim.ShimClient, error) {
|
||||||
|
|
|
@ -24,7 +24,7 @@ type Runtime interface {
|
||||||
// Create creates a container with the provided id and options
|
// Create creates a container with the provided id and options
|
||||||
Create(ctx context.Context, id string, opts CreateOpts) (Container, error)
|
Create(ctx context.Context, id string, opts CreateOpts) (Container, error)
|
||||||
// Containers returns all the current containers for the runtime
|
// Containers returns all the current containers for the runtime
|
||||||
Containers() ([]Container, error)
|
Containers(context.Context) ([]Container, error)
|
||||||
// Delete removes the container in the runtime
|
// Delete removes the container in the runtime
|
||||||
Delete(context.Context, Container) (uint32, error)
|
Delete(context.Context, Container) (uint32, error)
|
||||||
// Events returns events for the runtime and all containers created by the runtime
|
// Events returns events for the runtime and all containers created by the runtime
|
||||||
|
|
|
@ -49,7 +49,7 @@ func (s *Service) Register(server *grpc.Server) error {
|
||||||
api.RegisterContainerServiceServer(server, s)
|
api.RegisterContainerServiceServer(server, s)
|
||||||
// load all containers
|
// load all containers
|
||||||
for _, r := range s.runtimes {
|
for _, r := range s.runtimes {
|
||||||
containers, err := r.Containers()
|
containers, err := r.Containers(context.Background())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue