supervisor: implement monitoring
Signed-off-by: Kenfe-Mickael Laventure <mickael.laventure@gmail.com>
This commit is contained in:
parent
31f26fed18
commit
78d7e8b256
13 changed files with 1311 additions and 276 deletions
|
@ -3,12 +3,19 @@ package supervisor
|
|||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
api "github.com/docker/containerd/api/execution"
|
||||
"github.com/docker/containerd/api/shim"
|
||||
"github.com/docker/containerd/events"
|
||||
"github.com/docker/containerd/execution"
|
||||
"github.com/docker/containerd/log"
|
||||
google_protobuf "github.com/golang/protobuf/ptypes/empty"
|
||||
"github.com/pkg/errors"
|
||||
"golang.org/x/net/context"
|
||||
)
|
||||
|
||||
|
@ -19,16 +26,22 @@ var (
|
|||
|
||||
// New creates a new GRPC services for execution
|
||||
func New(ctx context.Context, root string) (*Service, error) {
|
||||
clients, err := loadClients(root)
|
||||
ctx = log.WithModule(ctx, "supervisor")
|
||||
log.G(ctx).WithField("root", root).Debugf("New()")
|
||||
if err := os.MkdirAll(root, 0700); err != nil {
|
||||
return nil, errors.Wrapf(err, "unable to create root directory %q", root)
|
||||
}
|
||||
clients, err := loadClients(ctx, root)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
s := &Service{
|
||||
root: root,
|
||||
shims: clients,
|
||||
ctx: ctx,
|
||||
}
|
||||
for _, c := range clients {
|
||||
if err := s.monitor(c); err != nil {
|
||||
if err := s.monitor(events.GetPoster(ctx), c); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
@ -38,24 +51,23 @@ func New(ctx context.Context, root string) (*Service, error) {
|
|||
type Service struct {
|
||||
mu sync.Mutex
|
||||
|
||||
ctx context.Context
|
||||
root string
|
||||
shims map[string]shim.ShimClient
|
||||
shims map[string]*shimClient
|
||||
}
|
||||
|
||||
func (s *Service) CreateContainer(ctx context.Context, r *api.CreateContainerRequest) (*api.CreateContainerResponse, error) {
|
||||
s.mu.Lock()
|
||||
if _, ok := s.shims[r.ID]; ok {
|
||||
s.mu.Unlock()
|
||||
return nil, fmt.Errorf("container already exists %q", r.ID)
|
||||
}
|
||||
client, err := newShimClient(filepath.Join(s.root, r.ID))
|
||||
client, err := s.newShim(r.ID)
|
||||
if err != nil {
|
||||
s.mu.Unlock()
|
||||
return nil, err
|
||||
}
|
||||
s.shims[r.ID] = client
|
||||
s.mu.Unlock()
|
||||
if err := s.monitor(client); err != nil {
|
||||
defer func() {
|
||||
if err != nil {
|
||||
s.removeShim(r.ID)
|
||||
}
|
||||
}()
|
||||
|
||||
if err := s.monitor(events.GetPoster(ctx), client); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
createResponse, err := client.Create(ctx, &shim.CreateRequest{
|
||||
|
@ -67,8 +79,9 @@ func (s *Service) CreateContainer(ctx context.Context, r *api.CreateContainerReq
|
|||
Stderr: r.Stderr,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, errors.Wrapf(err, "shim create request failed")
|
||||
}
|
||||
client.initPid = createResponse.Pid
|
||||
return &api.CreateContainerResponse{
|
||||
Container: &api.Container{
|
||||
ID: r.ID,
|
||||
|
@ -96,11 +109,12 @@ func (s *Service) DeleteContainer(ctx context.Context, r *api.DeleteContainerReq
|
|||
return nil, err
|
||||
}
|
||||
_, err = client.Delete(ctx, &shim.DeleteRequest{
|
||||
Pid: r.Pid,
|
||||
Pid: client.initPid,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
s.removeShim(r.ID)
|
||||
return empty, nil
|
||||
}
|
||||
|
||||
|
@ -180,13 +194,65 @@ func (s *Service) ListProcesses(ctx context.Context, r *api.ListProcessesRequest
|
|||
|
||||
// monitor monitors the shim's event rpc and forwards container and process
|
||||
// events to callers
|
||||
func (s *Service) monitor(client shim.ShimClient) error {
|
||||
func (s *Service) monitor(poster events.Poster, client *shimClient) error {
|
||||
// we use the service context here because we don't want to be
|
||||
// tied to the Create rpc call
|
||||
stream, err := client.Events(s.ctx, &shim.EventsRequest{})
|
||||
if err != nil {
|
||||
return errors.Wrapf(err, "failed to get events stream for client at %q", client.root)
|
||||
}
|
||||
|
||||
go func() {
|
||||
for {
|
||||
e, err := stream.Recv()
|
||||
if err != nil {
|
||||
if err.Error() == "EOF" || strings.Contains(err.Error(), "transport is closing") {
|
||||
break
|
||||
}
|
||||
log.G(s.ctx).WithError(err).WithField("container", client.id).
|
||||
Warnf("event stream for client at %q got terminated", client.root)
|
||||
break
|
||||
}
|
||||
|
||||
var topic string
|
||||
if e.Type == shim.EventType_CREATE {
|
||||
topic = "containers"
|
||||
} else {
|
||||
topic = fmt.Sprintf("containers.%s", e.ID)
|
||||
}
|
||||
|
||||
ctx := events.WithTopic(s.ctx, topic)
|
||||
poster.Post(ctx, execution.ContainerEvent{
|
||||
Timestamp: time.Now(),
|
||||
ID: e.ID,
|
||||
Type: toExecutionEventType(e.Type),
|
||||
Pid: e.Pid,
|
||||
ExitStatus: e.ExitStatus,
|
||||
})
|
||||
}
|
||||
}()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Service) getShim(id string) (shim.ShimClient, error) {
|
||||
func (s *Service) newShim(id string) (*shimClient, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
if _, ok := s.shims[id]; ok {
|
||||
return nil, errors.Errorf("container %q already exists", id)
|
||||
}
|
||||
client, err := newShimClient(filepath.Join(s.root, id), id)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
s.shims[id] = client
|
||||
return client, nil
|
||||
}
|
||||
|
||||
func (s *Service) getShim(id string) (*shimClient, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
client, ok := s.shims[id]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("container does not exist %q", id)
|
||||
|
@ -194,22 +260,40 @@ func (s *Service) getShim(id string) (shim.ShimClient, error) {
|
|||
return client, nil
|
||||
}
|
||||
|
||||
func loadClients(root string) (map[string]shim.ShimClient, error) {
|
||||
func (s *Service) removeShim(id string) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
client, ok := s.shims[id]
|
||||
if ok {
|
||||
client.stop()
|
||||
delete(s.shims, id)
|
||||
}
|
||||
}
|
||||
|
||||
func loadClients(ctx context.Context, root string) (map[string]*shimClient, error) {
|
||||
files, err := ioutil.ReadDir(root)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out := make(map[string]shim.ShimClient)
|
||||
out := make(map[string]*shimClient)
|
||||
for _, f := range files {
|
||||
if !f.IsDir() {
|
||||
continue
|
||||
}
|
||||
socket := filepath.Join(root, f.Name(), "shim.sock")
|
||||
client, err := connectToShim(socket)
|
||||
//
|
||||
id := f.Name()
|
||||
client, err := loadShimClient(filepath.Join(root, id), id)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
log.G(ctx).WithError(err).WithField("id", id).Warn("failed to load container")
|
||||
// TODO: send an exit event with 255 as exit status
|
||||
continue
|
||||
}
|
||||
out[f.Name()] = client
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func toExecutionEventType(et shim.EventType) string {
|
||||
return strings.Replace(strings.ToLower(et.String()), "_", "-", -1)
|
||||
}
|
||||
|
|
|
@ -1,29 +1,106 @@
|
|||
package supervisor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/docker/containerd/api/shim"
|
||||
"github.com/pkg/errors"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/grpclog"
|
||||
)
|
||||
|
||||
func newShimClient(root string) (shim.ShimClient, error) {
|
||||
// TODO: start the shim process
|
||||
cmd := exec.Command("containerd-shim")
|
||||
if err := cmd.Start(); err != nil {
|
||||
return nil, err
|
||||
func newShimClient(root, id string) (*shimClient, error) {
|
||||
if err := os.Mkdir(root, 0700); err != nil {
|
||||
return nil, errors.Wrap(err, "failed to create shim working dir")
|
||||
}
|
||||
|
||||
cmd := exec.Command("containerd-shim")
|
||||
cmd.Dir = root
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{
|
||||
Setpgid: true,
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
return nil, errors.Wrapf(err, "failed to start shim")
|
||||
}
|
||||
|
||||
socket := filepath.Join(root, "shim.sock")
|
||||
return connectToShim(socket)
|
||||
sc, err := connectToShim(socket)
|
||||
if err != nil {
|
||||
syscall.Kill(cmd.Process.Pid, syscall.SIGKILL)
|
||||
cmd.Wait()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
s := &shimClient{
|
||||
ShimClient: sc,
|
||||
shimCmd: cmd,
|
||||
syncCh: make(chan struct{}),
|
||||
root: root,
|
||||
id: id,
|
||||
}
|
||||
go func() {
|
||||
cmd.Wait()
|
||||
close(s.syncCh)
|
||||
}()
|
||||
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func loadShimClient(root, id string) (*shimClient, error) {
|
||||
socket := filepath.Join(root, "shim.sock")
|
||||
client, err := connectToShim(socket)
|
||||
if err != nil {
|
||||
// TODO: failed to connect to the shim, check if it's alive
|
||||
// - if it is kill it
|
||||
// - in both case call runc killall and runc delete on the id
|
||||
return nil, err
|
||||
}
|
||||
|
||||
resp, err := client.State(context.Background(), &shim.StateRequest{})
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "failed to fetch state for container %s", id)
|
||||
}
|
||||
|
||||
return &shimClient{
|
||||
ShimClient: client,
|
||||
root: root,
|
||||
id: id,
|
||||
initPid: resp.InitPid,
|
||||
}, nil
|
||||
}
|
||||
|
||||
type shimClient struct {
|
||||
shim.ShimClient
|
||||
shimCmd *exec.Cmd
|
||||
syncCh chan struct{}
|
||||
root string
|
||||
id string
|
||||
initPid uint32
|
||||
}
|
||||
|
||||
func (s *shimClient) stop() {
|
||||
if s.shimCmd != nil {
|
||||
select {
|
||||
case <-s.syncCh:
|
||||
default:
|
||||
syscall.Kill(s.shimCmd.Process.Pid, syscall.SIGTERM)
|
||||
select {
|
||||
case <-s.syncCh:
|
||||
case <-time.After(10 * time.Second):
|
||||
syscall.Kill(s.shimCmd.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
}
|
||||
}
|
||||
os.RemoveAll(s.root)
|
||||
}
|
||||
|
||||
func connectToShim(socket string) (shim.ShimClient, error) {
|
||||
|
@ -33,12 +110,13 @@ func connectToShim(socket string) (shim.ShimClient, error) {
|
|||
dialOpts = append(dialOpts,
|
||||
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
|
||||
return net.DialTimeout("unix", socket, timeout)
|
||||
},
|
||||
))
|
||||
// FIXME: probably need a retry here
|
||||
}),
|
||||
grpc.WithBlock(),
|
||||
grpc.WithTimeout(2*time.Second),
|
||||
)
|
||||
conn, err := grpc.Dial(fmt.Sprintf("unix://%s", socket), dialOpts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, errors.Wrapf(err, "failed to connect to shim via \"%s\"", fmt.Sprintf("unix://%s", socket))
|
||||
}
|
||||
return shim.NewShimClient(conn), nil
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue