supervisor: implement monitoring

Signed-off-by: Kenfe-Mickael Laventure <mickael.laventure@gmail.com>
This commit is contained in:
Kenfe-Mickael Laventure 2017-02-06 14:57:43 -08:00
parent 31f26fed18
commit 78d7e8b256
13 changed files with 1311 additions and 276 deletions

View file

@ -3,12 +3,19 @@ package supervisor
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"
"sync"
"time"
api "github.com/docker/containerd/api/execution"
"github.com/docker/containerd/api/shim"
"github.com/docker/containerd/events"
"github.com/docker/containerd/execution"
"github.com/docker/containerd/log"
google_protobuf "github.com/golang/protobuf/ptypes/empty"
"github.com/pkg/errors"
"golang.org/x/net/context"
)
@ -19,16 +26,22 @@ var (
// New creates a new GRPC services for execution
func New(ctx context.Context, root string) (*Service, error) {
clients, err := loadClients(root)
ctx = log.WithModule(ctx, "supervisor")
log.G(ctx).WithField("root", root).Debugf("New()")
if err := os.MkdirAll(root, 0700); err != nil {
return nil, errors.Wrapf(err, "unable to create root directory %q", root)
}
clients, err := loadClients(ctx, root)
if err != nil {
return nil, err
}
s := &Service{
root: root,
shims: clients,
ctx: ctx,
}
for _, c := range clients {
if err := s.monitor(c); err != nil {
if err := s.monitor(events.GetPoster(ctx), c); err != nil {
return nil, err
}
}
@ -38,24 +51,23 @@ func New(ctx context.Context, root string) (*Service, error) {
// Service is the supervisor's execution service. It keeps one shim client per
// container and forwards shim events to the events poster.
//
// NOTE(review): this listing looks like a diff whose +/- markers were lost;
// the two `shims` lines below appear to be the removed and the added version
// of the same field — confirm against the commit.
type Service struct {
// mu guards shims; newShim, getShim and removeShim all take it before
// touching the map.
mu sync.Mutex
// ctx is the service-wide context; monitor uses it for the shim event
// stream so the stream is not tied to a single rpc call.
ctx context.Context
// root is the directory under which each container gets a sub-directory
// named after its id.
root string
// shims maps a container id to the client of the shim serving it.
shims map[string]shim.ShimClient
shims map[string]*shimClient
}
func (s *Service) CreateContainer(ctx context.Context, r *api.CreateContainerRequest) (*api.CreateContainerResponse, error) {
s.mu.Lock()
if _, ok := s.shims[r.ID]; ok {
s.mu.Unlock()
return nil, fmt.Errorf("container already exists %q", r.ID)
}
client, err := newShimClient(filepath.Join(s.root, r.ID))
client, err := s.newShim(r.ID)
if err != nil {
s.mu.Unlock()
return nil, err
}
s.shims[r.ID] = client
s.mu.Unlock()
if err := s.monitor(client); err != nil {
defer func() {
if err != nil {
s.removeShim(r.ID)
}
}()
if err := s.monitor(events.GetPoster(ctx), client); err != nil {
return nil, err
}
createResponse, err := client.Create(ctx, &shim.CreateRequest{
@ -67,8 +79,9 @@ func (s *Service) CreateContainer(ctx context.Context, r *api.CreateContainerReq
Stderr: r.Stderr,
})
if err != nil {
return nil, err
return nil, errors.Wrapf(err, "shim create request failed")
}
client.initPid = createResponse.Pid
return &api.CreateContainerResponse{
Container: &api.Container{
ID: r.ID,
@ -96,11 +109,12 @@ func (s *Service) DeleteContainer(ctx context.Context, r *api.DeleteContainerReq
return nil, err
}
_, err = client.Delete(ctx, &shim.DeleteRequest{
Pid: r.Pid,
Pid: client.initPid,
})
if err != nil {
return nil, err
}
s.removeShim(r.ID)
return empty, nil
}
@ -180,13 +194,65 @@ func (s *Service) ListProcesses(ctx context.Context, r *api.ListProcessesRequest
// monitor monitors the shim's event rpc and forwards container and process
// events to callers
func (s *Service) monitor(client shim.ShimClient) error {
// monitor subscribes to the event stream of the given shim client and
// republishes every event it receives through poster as an
// execution.ContainerEvent.
//
// CREATE events are posted on the "containers" topic; every other event is
// posted on "containers.<id>". The forwarding goroutine ends when the stream
// returns an error: an "EOF" or "transport is closing" error ends it silently,
// any other error is logged as a warning first.
//
// It returns an error only when the event stream cannot be opened.
func (s *Service) monitor(poster events.Poster, client *shimClient) error {
	// The stream is bound to the service context, not to a request context,
	// because we don't want to be tied to the Create rpc call.
	stream, err := client.Events(s.ctx, &shim.EventsRequest{})
	if err != nil {
		return errors.Wrapf(err, "failed to get events stream for client at %q", client.root)
	}

	forward := func() {
		for {
			evt, recvErr := stream.Recv()
			if recvErr != nil {
				msg := recvErr.Error()
				expected := msg == "EOF" || strings.Contains(msg, "transport is closing")
				if !expected {
					log.G(s.ctx).WithError(recvErr).WithField("container", client.id).
						Warnf("event stream for client at %q got terminated", client.root)
				}
				return
			}

			topic := "containers"
			if evt.Type != shim.EventType_CREATE {
				topic = fmt.Sprintf("containers.%s", evt.ID)
			}

			poster.Post(events.WithTopic(s.ctx, topic), execution.ContainerEvent{
				Timestamp:  time.Now(),
				ID:         evt.ID,
				Type:       toExecutionEventType(evt.Type),
				Pid:        evt.Pid,
				ExitStatus: evt.ExitStatus,
			})
		}
	}
	go forward()

	return nil
}
func (s *Service) getShim(id string) (shim.ShimClient, error) {
// newShim creates a shim client for the container id via newShimClient, using
// <root>/<id> as its directory, and registers it in s.shims.
//
// The whole operation runs under s.mu. It fails if a shim is already
// registered for id or if newShimClient returns an error; in both cases the
// map is left untouched.
func (s *Service) newShim(id string) (*shimClient, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	if _, exists := s.shims[id]; exists {
		return nil, errors.Errorf("container %q already exists", id)
	}

	shimDir := filepath.Join(s.root, id)
	created, err := newShimClient(shimDir, id)
	if err != nil {
		return nil, err
	}

	s.shims[id] = created
	return created, nil
}
func (s *Service) getShim(id string) (*shimClient, error) {
s.mu.Lock()
defer s.mu.Unlock()
client, ok := s.shims[id]
if !ok {
return nil, fmt.Errorf("container does not exist %q", id)
@ -194,22 +260,40 @@ func (s *Service) getShim(id string) (shim.ShimClient, error) {
return client, nil
}
func loadClients(root string) (map[string]shim.ShimClient, error) {
// removeShim stops the shim registered for id and drops it from s.shims.
// It is a no-op when no shim is registered under that id. The shim is stopped
// while s.mu is held.
func (s *Service) removeShim(id string) {
	s.mu.Lock()
	defer s.mu.Unlock()

	registered, found := s.shims[id]
	if !found {
		return
	}

	registered.stop()
	delete(s.shims, id)
}
// loadClients reads the entries of root and, for every directory found, tries
// to attach to an already running shim, using the directory name as the
// container id. It returns the loaded clients keyed by that name. An error is
// returned when root itself cannot be read.
//
// NOTE(review): this listing looks like a diff whose +/- markers were lost, so
// several statements below show up in both their old and new form (for example
// the two `out := make(...)` lines). Confirm against the commit before
// treating the body as compilable code.
func loadClients(ctx context.Context, root string) (map[string]*shimClient, error) {
files, err := ioutil.ReadDir(root)
if err != nil {
return nil, err
}
out := make(map[string]shim.ShimClient)
out := make(map[string]*shimClient)
for _, f := range files {
// Only directories are considered; any other entry is skipped.
if !f.IsDir() {
continue
}
socket := filepath.Join(root, f.Name(), "shim.sock")
client, err := connectToShim(socket)
//
id := f.Name()
client, err := loadShimClient(filepath.Join(root, id), id)
if err != nil {
// NOTE(review): presumably the bare return below is the removed line and
// the intended behaviour is to log the failure and skip this container
// — confirm.
return nil, err
log.G(ctx).WithError(err).WithField("id", id).Warn("failed to load container")
// TODO: send an exit event with 255 as exit status
continue
}
out[f.Name()] = client
}
return out, nil
}
// toExecutionEventType converts a shim event type into the string used for
// execution events: the String() form of et, lower-cased, with every
// underscore replaced by a dash.
func toExecutionEventType(et shim.EventType) string {
	lowered := strings.ToLower(et.String())
	return strings.Replace(lowered, "_", "-", -1)
}

View file

@ -1,29 +1,106 @@
package supervisor
import (
"context"
"fmt"
"io/ioutil"
"log"
"net"
"os"
"os/exec"
"path/filepath"
"syscall"
"time"
"github.com/docker/containerd/api/shim"
"github.com/pkg/errors"
"google.golang.org/grpc"
"google.golang.org/grpc/grpclog"
)
func newShimClient(root string) (shim.ShimClient, error) {
// TODO: start the shim process
cmd := exec.Command("containerd-shim")
if err := cmd.Start(); err != nil {
return nil, err
// newShimClient creates the working directory root, starts a
// "containerd-shim" process inside it and connects to the shim's socket at
// <root>/shim.sock.
//
// On success the returned client owns the started command; a background
// goroutine waits for the process and closes syncCh once it has exited.
// If the connection to the socket fails, the shim process is killed and
// reaped before the error is returned.
//
// NOTE(review): this listing looks like a diff whose +/- markers were lost;
// the bare `return connectToShim(socket)` below appears to be the removed
// pre-commit line — confirm against the commit.
func newShimClient(root, id string) (*shimClient, error) {
// os.Mkdir (not MkdirAll) is used, so an already existing directory is
// reported as an error.
if err := os.Mkdir(root, 0700); err != nil {
return nil, errors.Wrap(err, "failed to create shim working dir")
}
cmd := exec.Command("containerd-shim")
cmd.Dir = root
// Setpgid places the shim in a process group of its own.
cmd.SysProcAttr = &syscall.SysProcAttr{
Setpgid: true,
}
if err := cmd.Start(); err != nil {
return nil, errors.Wrapf(err, "failed to start shim")
}
socket := filepath.Join(root, "shim.sock")
return connectToShim(socket)
sc, err := connectToShim(socket)
if err != nil {
// The shim is unusable without a connection: kill it and collect its
// exit status so it does not linger as a zombie.
syscall.Kill(cmd.Process.Pid, syscall.SIGKILL)
cmd.Wait()
return nil, err
}
s := &shimClient{
ShimClient: sc,
shimCmd: cmd,
syncCh: make(chan struct{}),
root: root,
id: id,
}
// Closing syncCh is how stop() learns that the shim process has exited.
go func() {
cmd.Wait()
close(s.syncCh)
}()
return s, nil
}
// loadShimClient attaches to an existing shim whose socket lives at
// <root>/shim.sock and queries its state to recover the init pid.
//
// The returned client has no shimCmd or syncCh set: the shim process was not
// started by this call. An error is returned when the socket cannot be
// reached or when the State request fails.
func loadShimClient(root, id string) (*shimClient, error) {
	conn, err := connectToShim(filepath.Join(root, "shim.sock"))
	if err != nil {
		// TODO: failed to connect to the shim, check if it's alive
		//   - if it is kill it
		//   - in both case call runc killall and runc delete on the id
		return nil, err
	}

	state, err := conn.State(context.Background(), &shim.StateRequest{})
	if err != nil {
		return nil, errors.Wrapf(err, "failed to fetch state for container %s", id)
	}

	loaded := &shimClient{
		ShimClient: conn,
		root:       root,
		id:         id,
		initPid:    state.InitPid,
	}
	return loaded, nil
}
// shimClient couples the grpc client of a shim with the bookkeeping the
// supervisor needs to manage that shim.
type shimClient struct {
// ShimClient is the embedded grpc client used to talk to the shim.
shim.ShimClient
// shimCmd is the shim process started by newShimClient; it is left nil by
// loadShimClient.
shimCmd *exec.Cmd
// syncCh is closed by newShimClient's background goroutine once shimCmd
// has exited; stop() only reads it when shimCmd is non-nil.
syncCh chan struct{}
// root is the shim's working directory; stop() removes it.
root string
// id is the container id this shim serves.
id string
// initPid is the pid of the container's init process, taken from the
// shim's State response on load or from the Create response.
initPid uint32
}
// stop shuts the shim down and removes its working directory.
//
// When the shim process was started by this client and has not exited yet, it
// is sent SIGTERM and given 10 seconds to exit before SIGKILL is sent. When
// the client has no shimCmd, no signal is sent. The directory s.root is
// removed in every case; errors from signalling and removal are ignored.
func (s *shimClient) stop() {
	// alreadyExited reports, without blocking, whether syncCh has been closed.
	alreadyExited := func() bool {
		select {
		case <-s.syncCh:
			return true
		default:
			return false
		}
	}

	if s.shimCmd != nil && !alreadyExited() {
		pid := s.shimCmd.Process.Pid
		syscall.Kill(pid, syscall.SIGTERM)

		grace := time.After(10 * time.Second)
		select {
		case <-s.syncCh:
			// exited within the grace period
		case <-grace:
			syscall.Kill(pid, syscall.SIGKILL)
		}
	}

	os.RemoveAll(s.root)
}
func connectToShim(socket string) (shim.ShimClient, error) {
@ -33,12 +110,13 @@ func connectToShim(socket string) (shim.ShimClient, error) {
dialOpts = append(dialOpts,
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
return net.DialTimeout("unix", socket, timeout)
},
))
// FIXME: probably need a retry here
}),
grpc.WithBlock(),
grpc.WithTimeout(2*time.Second),
)
conn, err := grpc.Dial(fmt.Sprintf("unix://%s", socket), dialOpts...)
if err != nil {
return nil, err
return nil, errors.Wrapf(err, "failed to connect to shim via \"%s\"", fmt.Sprintf("unix://%s", socket))
}
return shim.NewShimClient(conn), nil
}