containerd/supervisor/supervisor.go

247 lines
7 KiB
Go
Raw Normal View History

package supervisor
2015-11-05 23:29:53 +00:00
import (
"os"
"os/signal"
"path/filepath"
"sync"
"syscall"
"time"
2015-11-05 23:29:53 +00:00
"github.com/Sirupsen/logrus"
"github.com/docker/containerd/chanotify"
"github.com/docker/containerd/eventloop"
"github.com/docker/containerd/runtime"
2015-11-06 21:01:55 +00:00
"github.com/opencontainers/runc/libcontainer"
2015-11-05 23:29:53 +00:00
)
const (
statsInterval = 1 * time.Second
defaultBufferSize = 2048 // size of queue in eventloop
)
// New returns an initialized Process supervisor.
func New(id, stateDir string, tasks chan *StartTask, oom bool) (*Supervisor, error) {
2015-11-05 23:29:53 +00:00
if err := os.MkdirAll(stateDir, 0755); err != nil {
return nil, err
}
2015-11-06 00:16:11 +00:00
// register counters
r, err := newRuntime(filepath.Join(stateDir, id))
2015-11-05 23:49:13 +00:00
if err != nil {
return nil, err
2015-11-05 23:29:53 +00:00
}
machine, err := CollectMachineInformation(id)
if err != nil {
return nil, err
}
2015-11-05 23:49:13 +00:00
s := &Supervisor{
stateDir: stateDir,
containers: make(map[string]*containerInfo),
processes: make(map[int]*containerInfo),
runtime: r,
tasks: tasks,
machine: machine,
subscribers: make(map[chan *Event]struct{}),
statsCollector: newStatsCollector(statsInterval),
el: eventloop.NewChanLoop(defaultBufferSize),
2015-11-07 00:44:52 +00:00
}
if oom {
s.notifier = chanotify.New()
go func() {
for id := range s.notifier.Chan() {
e := NewEvent(OOMEventType)
e.ID = id
s.SendEvent(e)
}
}()
}
// register default event handlers
s.handlers = map[EventType]Handler{
ExecExitEventType: &ExecExitEvent{s},
ExitEventType: &ExitEvent{s},
StartContainerEventType: &StartEvent{s},
DeleteEventType: &DeleteEvent{s},
GetContainerEventType: &GetContainersEvent{s},
SignalEventType: &SignalEvent{s},
AddProcessEventType: &AddProcessEvent{s},
UpdateContainerEventType: &UpdateEvent{s},
CreateCheckpointEventType: &CreateCheckpointEvent{s},
DeleteCheckpointEventType: &DeleteCheckpointEvent{s},
StatsEventType: &StatsEvent{s},
UnsubscribeStatsEventType: &UnsubscribeStatsEvent{s},
StopStatsEventType: &StopStatsEvent{s},
}
// start the container workers for concurrent container starts
2015-11-05 23:29:53 +00:00
return s, nil
}
type containerInfo struct {
container runtime.Container
copier *copier
}
2015-11-05 23:29:53 +00:00
type Supervisor struct {
// stateDir is the directory on the system to store container runtime state information.
stateDir string
containers map[string]*containerInfo
processes map[int]*containerInfo
handlers map[EventType]Handler
runtime runtime.Runtime
events chan *Event
tasks chan *StartTask
// we need a lock around the subscribers map only because additions and deletions from
// the map are via the API so we cannot really control the concurrency
subscriberLock sync.RWMutex
subscribers map[chan *Event]struct{}
machine Machine
containerGroup sync.WaitGroup
statsCollector *statsCollector
notifier *chanotify.Notifier
el eventloop.EventLoop
2015-11-05 23:29:53 +00:00
}
// Stop closes all tasks and sends a SIGTERM to each container's pid1 then waits for they to
// terminate. After it has handled all the SIGCHILD events it will close the signals chan
// and exit. Stop is a non-blocking call and will return after the containers have been signaled
func (s *Supervisor) Stop(sig chan os.Signal) {
// Close the tasks channel so that no new containers get started
close(s.tasks)
// send a SIGTERM to all containers
for id, i := range s.containers {
c := i.container
logrus.WithField("id", id).Debug("sending TERM to container processes")
procs, err := c.Processes()
if err != nil {
logrus.WithField("id", id).Warn("get container processes")
continue
}
if len(procs) == 0 {
continue
}
mainProc := procs[0]
if err := mainProc.Signal(syscall.SIGTERM); err != nil {
pid, _ := mainProc.Pid()
logrus.WithFields(logrus.Fields{
"id": id,
"pid": pid,
"error": err,
}).Error("send SIGTERM to process")
}
}
go func() {
logrus.Debug("waiting for containers to exit")
s.containerGroup.Wait()
logrus.Debug("all containers exited")
if s.notifier != nil {
s.notifier.Close()
}
// stop receiving signals and close the channel
signal.Stop(sig)
close(sig)
}()
}
// Close closes any open files in the supervisor but expects that Stop has been
// callsed so that no more containers are started.
2015-11-10 22:24:34 +00:00
func (s *Supervisor) Close() error {
return nil
2015-11-10 22:24:34 +00:00
}
// Events returns an event channel that external consumers can use to receive updates
// on container events
func (s *Supervisor) Events() chan *Event {
s.subscriberLock.Lock()
defer s.subscriberLock.Unlock()
c := make(chan *Event, defaultBufferSize)
EventSubscriberCounter.Inc(1)
s.subscribers[c] = struct{}{}
return c
}
// Unsubscribe removes the provided channel from receiving any more events
func (s *Supervisor) Unsubscribe(sub chan *Event) {
s.subscriberLock.Lock()
defer s.subscriberLock.Unlock()
delete(s.subscribers, sub)
close(sub)
EventSubscriberCounter.Dec(1)
}
// notifySubscribers will send the provided event to the external subscribers
// of the events channel
func (s *Supervisor) notifySubscribers(e *Event) {
s.subscriberLock.RLock()
defer s.subscriberLock.RUnlock()
for sub := range s.subscribers {
// do a non-blocking send for the channel
select {
case sub <- e:
default:
logrus.WithField("event", e.Type).Warn("event not sent to subscriber")
}
}
}
2015-11-06 00:40:57 +00:00
// Start is a non-blocking call that runs the supervisor for monitoring contianer processes and
2015-11-05 23:29:53 +00:00
// executing new containers.
//
// This event loop is the only thing that is allowed to modify state of containers and processes
// therefore it is save to do operations in the handlers that modify state of the system or
// state of the Supervisor
func (s *Supervisor) Start() error {
logrus.WithFields(logrus.Fields{
"runtime": s.runtime.Type(),
"stateDir": s.stateDir,
}).Debug("Supervisor started")
return s.el.Start()
2015-11-05 23:29:53 +00:00
}
// Machine returns the machine information for which the
// supervisor is executing on.
func (s *Supervisor) Machine() Machine {
return s.machine
}
// getContainerForPid returns the container where the provided pid is the pid1 or main
// process in the container
func (s *Supervisor) getContainerForPid(pid int) (runtime.Container, error) {
for _, i := range s.containers {
container := i.container
2015-11-06 21:01:55 +00:00
cpid, err := container.Pid()
if err != nil {
if lerr, ok := err.(libcontainer.Error); ok {
if lerr.Code() == libcontainer.ProcessNotExecuted {
continue
}
}
logrus.WithField("error", err).Error("containerd: get container pid")
}
if pid == cpid {
return container, nil
}
}
return nil, errNoContainerForPid
}
// SendEvent sends the provided event the the supervisors main event loop
2015-11-10 22:24:34 +00:00
func (s *Supervisor) SendEvent(evt *Event) {
EventsCounter.Inc(1)
s.el.Send(&commonEvent{data: evt, sv: s})
2015-11-05 23:29:53 +00:00
}
func (s *Supervisor) copyIO(stdin, stdout, stderr string, i *runtime.IO) (*copier, error) {
config := &ioConfig{
Stdin: i.Stdin,
Stdout: i.Stdout,
Stderr: i.Stderr,
StdoutPath: stdout,
StderrPath: stderr,
StdinPath: stdin,
}
l, err := newCopier(config)
if err != nil {
return nil, err
}
return l, nil
}