Move supervisor to its own package

This keeps the main namespace cleaner.

Signed-off-by: Alexander Morozov <lk4d4@docker.com>
This commit is contained in:
parent
b296d50493
commit
69f8f566a2
24 changed files with 61 additions and 59 deletions
supervisor/add_process.go (new file, 36 lines)
@@ -0,0 +1,36 @@
package supervisor

import "github.com/Sirupsen/logrus"

type AddProcessEvent struct {
    s *Supervisor
}

// TODO: add this to worker for concurrent starts??? maybe not because of races where the container
// could be stopped and removed...
func (h *AddProcessEvent) Handle(e *Event) error {
    ci, ok := h.s.containers[e.ID]
    if !ok {
        return ErrContainerNotFound
    }
    p, io, err := h.s.runtime.StartProcess(ci.container, *e.Process, e.Console)
    if err != nil {
        return err
    }
    if e.Pid, err = p.Pid(); err != nil {
        return err
    }
    h.s.processes[e.Pid] = &containerInfo{
        container: ci.container,
    }
    l, err := h.s.copyIO(e.Stdin, e.Stdout, e.Stderr, io)
    if err != nil {
        // log the error but continue with the other commands
        logrus.WithFields(logrus.Fields{
            "error": err,
            "id":    e.ID,
        }).Error("log stdio")
    }
    h.s.processes[e.Pid].copier = l
    return nil
}

supervisor/checkpoint.go (new file, 25 lines)
@@ -0,0 +1,25 @@
package supervisor

type CreateCheckpointEvent struct {
    s *Supervisor
}

func (h *CreateCheckpointEvent) Handle(e *Event) error {
    i, ok := h.s.containers[e.ID]
    if !ok {
        return ErrContainerNotFound
    }
    return i.container.Checkpoint(*e.Checkpoint)
}

type DeleteCheckpointEvent struct {
    s *Supervisor
}

func (h *DeleteCheckpointEvent) Handle(e *Event) error {
    i, ok := h.s.containers[e.ID]
    if !ok {
        return ErrContainerNotFound
    }
    return i.container.DeleteCheckpoint(e.Checkpoint.Name)
}

supervisor/create.go (new file, 31 lines)
@@ -0,0 +1,31 @@
package supervisor

type StartEvent struct {
    s *Supervisor
}

func (h *StartEvent) Handle(e *Event) error {
    container, io, err := h.s.runtime.Create(e.ID, e.BundlePath, e.Console)
    if err != nil {
        return err
    }
    h.s.containerGroup.Add(1)
    h.s.containers[e.ID] = &containerInfo{
        container: container,
    }
    ContainersCounter.Inc(1)
    task := &StartTask{
        Err:           e.Err,
        IO:            io,
        Container:     container,
        Stdin:         e.Stdin,
        Stdout:        e.Stdout,
        Stderr:        e.Stderr,
        StartResponse: e.StartResponse,
    }
    if e.Checkpoint != nil {
        task.Checkpoint = e.Checkpoint.Name
    }
    h.s.tasks <- task
    return errDeferedResponse
}

supervisor/delete.go (new file, 37 lines)
@@ -0,0 +1,37 @@
package supervisor

import (
    "github.com/Sirupsen/logrus"
    "github.com/docker/containerd/runtime"
)

type DeleteEvent struct {
    s *Supervisor
}

func (h *DeleteEvent) Handle(e *Event) error {
    if i, ok := h.s.containers[e.ID]; ok {
        if err := h.deleteContainer(i.container); err != nil {
            logrus.WithField("error", err).Error("containerd: deleting container")
        }
        if i.copier != nil {
            if err := i.copier.Close(); err != nil {
                logrus.WithField("error", err).Error("containerd: close container copier")
            }
        }
        h.s.notifySubscribers(&Event{
            Type:   ExitEventType,
            ID:     e.ID,
            Status: e.Status,
            Pid:    e.Pid,
        })
        ContainersCounter.Dec(1)
        h.s.containerGroup.Done()
    }
    return nil
}

func (h *DeleteEvent) deleteContainer(container runtime.Container) error {
    delete(h.s.containers, container.ID())
    return container.Delete()
}

supervisor/errors.go (new file, 24 lines)
@@ -0,0 +1,24 @@
package supervisor

import "errors"

var (
    // External errors
    ErrEventChanNil           = errors.New("containerd: event channel is nil")
    ErrBundleNotFound         = errors.New("containerd: bundle not found")
    ErrContainerNotFound      = errors.New("containerd: container not found")
    ErrContainerExists        = errors.New("containerd: container already exists")
    ErrProcessNotFound        = errors.New("containerd: processs not found for container")
    ErrUnknownContainerStatus = errors.New("containerd: unknown container status ")
    ErrUnknownEvent           = errors.New("containerd: unknown event type")

    // Internal errors
    errShutdown          = errors.New("containerd: supervisor is shutdown")
    errRootNotAbs        = errors.New("containerd: rootfs path is not an absolute path")
    errNoContainerForPid = errors.New("containerd: pid not registered for any container")
    // internal error where the handler will defer to another for the final response
    //
    // TODO: we could probably do a typed error with another error channel for this to make it
    // less like magic
    errDeferedResponse = errors.New("containerd: defered response")
)

supervisor/event.go (new file, 84 lines)
@@ -0,0 +1,84 @@
package supervisor

import (
    "os"
    "time"

    "github.com/docker/containerd/runtime"
    "github.com/opencontainers/specs"
)

type EventType string

const (
    ExecExitEventType         EventType = "execExit"
    ExitEventType             EventType = "exit"
    StartContainerEventType   EventType = "startContainer"
    DeleteEventType           EventType = "deleteContainerEvent"
    GetContainerEventType     EventType = "getContainer"
    SignalEventType           EventType = "signal"
    AddProcessEventType       EventType = "addProcess"
    UpdateContainerEventType  EventType = "updateContainer"
    CreateCheckpointEventType EventType = "createCheckpoint"
    DeleteCheckpointEventType EventType = "deleteCheckpoint"
    StatsEventType            EventType = "events"
    UnsubscribeStatsEventType EventType = "unsubscribeStats"
    StopStatsEventType        EventType = "stopStats"
    OOMEventType              EventType = "oom"
)

func NewEvent(t EventType) *Event {
    return &Event{
        Type:      t,
        Timestamp: time.Now(),
        Err:       make(chan error, 1),
    }
}

type StartResponse struct {
    Pid int
}

type Event struct {
    Type          EventType
    Timestamp     time.Time
    ID            string
    BundlePath    string
    Stdout        string
    Stderr        string
    Stdin         string
    Console       string
    Pid           int
    Status        int
    Signal        os.Signal
    Process       *specs.Process
    State         *runtime.State
    Containers    []runtime.Container
    Checkpoint    *runtime.Checkpoint
    Err           chan error
    StartResponse chan StartResponse
    Stats         chan interface{}
}

type Handler interface {
    Handle(*Event) error
}

type commonEvent struct {
    data *Event
    sv   *Supervisor
}

func (e *commonEvent) Handle() {
    h, ok := e.sv.handlers[e.data.Type]
    if !ok {
        e.data.Err <- ErrUnknownEvent
        return
    }
    err := h.Handle(e.data)
    if err != errDeferedResponse {
        e.data.Err <- err
        close(e.data.Err)
        return
    }
}

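Not part of the diff: a minimal fragment sketching the request/response pattern these types implement. A caller builds an Event with NewEvent, pushes it into the supervisor's loop with SendEvent, and blocks on the buffered Err channel for the handler's result. The variable sv (a *Supervisor obtained from New) and the calling context are assumptions for illustration only.

    // Hypothetical usage fragment, not code from this commit.
    e := NewEvent(GetContainerEventType)
    sv.SendEvent(e) // dispatched asynchronously through the event loop
    if err := <-e.Err; err != nil {
        // the handler returned an error, e.g. ErrContainerNotFound
        return err
    }
    // for GetContainerEventType the handler has filled e.Containers
    for _, c := range e.Containers {
        fmt.Println(c.ID())
    }
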
supervisor/exit.go (new file, 58 lines)
@@ -0,0 +1,58 @@
package supervisor

import "github.com/Sirupsen/logrus"

type ExitEvent struct {
    s *Supervisor
}

func (h *ExitEvent) Handle(e *Event) error {
    logrus.WithFields(logrus.Fields{"pid": e.Pid, "status": e.Status}).
        Debug("containerd: process exited")
    // is it the child process of a container
    if info, ok := h.s.processes[e.Pid]; ok {
        ne := NewEvent(ExecExitEventType)
        ne.ID = info.container.ID()
        ne.Pid = e.Pid
        ne.Status = e.Status
        h.s.SendEvent(ne)
        return nil
    }
    // is it the main container's process
    container, err := h.s.getContainerForPid(e.Pid)
    if err != nil {
        if err != errNoContainerForPid {
            logrus.WithField("error", err).Error("containerd: find containers main pid")
        }
        return nil
    }
    container.SetExited(e.Status)
    ne := NewEvent(DeleteEventType)
    ne.ID = container.ID()
    ne.Pid = e.Pid
    ne.Status = e.Status
    h.s.SendEvent(ne)

    stopCollect := NewEvent(StopStatsEventType)
    stopCollect.ID = container.ID()
    h.s.SendEvent(stopCollect)
    return nil
}

type ExecExitEvent struct {
    s *Supervisor
}

func (h *ExecExitEvent) Handle(e *Event) error {
    // exec process: we remove this process without notifying the main event loop
    info := h.s.processes[e.Pid]
    if err := info.container.RemoveProcess(e.Pid); err != nil {
        logrus.WithField("error", err).Error("containerd: find container for pid")
    }
    if err := info.copier.Close(); err != nil {
        logrus.WithField("error", err).Error("containerd: close process IO")
    }
    delete(h.s.processes, e.Pid)
    h.s.notifySubscribers(e)
    return nil
}

supervisor/get_containers.go (new file, 12 lines)
@@ -0,0 +1,12 @@
package supervisor

type GetContainersEvent struct {
    s *Supervisor
}

func (h *GetContainersEvent) Handle(e *Event) error {
    for _, i := range h.s.containers {
        e.Containers = append(e.Containers, i.container)
    }
    return nil
}

supervisor/io.go (new file, 66 lines)
@@ -0,0 +1,66 @@
package supervisor

import (
    "io"
    "os"
)

type ioConfig struct {
    StdoutPath string
    StderrPath string
    StdinPath  string

    Stdin  io.WriteCloser
    Stdout io.ReadCloser
    Stderr io.ReadCloser
}

func newCopier(i *ioConfig) (*copier, error) {
    l := &copier{
        config: i,
    }
    if i.StdinPath != "" {
        f, err := os.OpenFile(i.StdinPath, os.O_RDONLY, 0)
        if err != nil {
            return nil, err
        }
        l.closers = append(l.closers, f)
        go func() {
            io.Copy(i.Stdin, f)
            i.Stdin.Close()
        }()
    }
    if i.StdoutPath != "" {
        f, err := os.OpenFile(i.StdoutPath, os.O_RDWR, 0)
        if err != nil {
            return nil, err
        }
        l.closers = append(l.closers, f)
        go io.Copy(f, i.Stdout)
    }
    if i.StderrPath != "" {
        f, err := os.OpenFile(i.StderrPath, os.O_RDWR, 0)
        if err != nil {
            return nil, err
        }
        l.closers = append(l.closers, f)
        go io.Copy(f, i.Stderr)
    }
    return l, nil
}

type copier struct {
    config  *ioConfig
    closers []io.Closer
}

func (l *copier) Close() (err error) {
    for _, c := range append(l.closers, l.config.Stdin, l.config.Stdout, l.config.Stderr) {
        if c != nil {
            if cerr := c.Close(); err == nil {
                err = cerr
            }
        }
    }
    return err
}

supervisor/machine.go (new file, 26 lines)
@@ -0,0 +1,26 @@
package supervisor

import "github.com/cloudfoundry/gosigar"

type Machine struct {
    ID     string
    Cpus   int
    Memory int64
}

func CollectMachineInformation(id string) (Machine, error) {
    m := Machine{
        ID: id,
    }
    cpu := sigar.CpuList{}
    if err := cpu.Get(); err != nil {
        return m, err
    }
    m.Cpus = len(cpu.List)
    mem := sigar.Mem{}
    if err := mem.Get(); err != nil {
        return m, err
    }
    m.Memory = int64(mem.Total)
    return m, nil
}

supervisor/oom.go (new file, 80 lines)
@@ -0,0 +1,80 @@
package supervisor

import (
    "reflect"
    "sync"
)

func newNotifier(s *Supervisor) *notifier {
    n := &notifier{
        s:          s,
        channels:   make(map[<-chan struct{}]string),
        controller: make(chan struct{}),
    }
    go n.start()
    return n
}

type notifier struct {
    m          sync.Mutex
    channels   map[<-chan struct{}]string
    controller chan struct{}
    s          *Supervisor
}

func (n *notifier) start() {
    for {
        c := n.createCase()
        i, _, ok := reflect.Select(c)
        if i == 0 {
            continue
        }
        if ok {
            ch := c[i].Chan.Interface().(<-chan struct{})
            id := n.channels[ch]
            e := NewEvent(OOMEventType)
            e.ID = id
            n.s.SendEvent(e)
            continue
        }
        // the channel was closed and we should remove it
        ch := c[i].Chan.Interface().(<-chan struct{})
        n.removeChan(ch)
    }
}

func (n *notifier) Add(ch <-chan struct{}, id string) {
    n.m.Lock()
    n.channels[ch] = id
    n.m.Unlock()
    // signal the main loop to break and add the new
    // channels
    n.controller <- struct{}{}
}

func (n *notifier) createCase() []reflect.SelectCase {
    var out []reflect.SelectCase
    // add controller chan so that we can signal when we need to make
    // changes in the select. The controller chan will always be at
    // index 0 in the slice
    out = append(out, reflect.SelectCase{
        Dir:  reflect.SelectRecv,
        Chan: reflect.ValueOf(n.controller),
    })
    n.m.Lock()
    for ch := range n.channels {
        v := reflect.ValueOf(ch)
        out = append(out, reflect.SelectCase{
            Dir:  reflect.SelectRecv,
            Chan: v,
        })
    }
    n.m.Unlock()
    return out
}

func (n *notifier) removeChan(ch <-chan struct{}) {
    n.m.Lock()
    delete(n.channels, ch)
    n.m.Unlock()
}

supervisor/signal.go (new file, 22 lines)
@@ -0,0 +1,22 @@
package supervisor

type SignalEvent struct {
    s *Supervisor
}

func (h *SignalEvent) Handle(e *Event) error {
    i, ok := h.s.containers[e.ID]
    if !ok {
        return ErrContainerNotFound
    }
    processes, err := i.container.Processes()
    if err != nil {
        return err
    }
    for _, p := range processes {
        if pid, err := p.Pid(); err == nil && pid == e.Pid {
            return p.Signal(e.Signal)
        }
    }
    return ErrProcessNotFound
}

supervisor/stats.go (new file, 58 lines)
@@ -0,0 +1,58 @@
package supervisor

import "github.com/rcrowley/go-metrics"

var (
    ContainerStartTimer    = metrics.NewTimer()
    ContainersCounter      = metrics.NewCounter()
    EventsCounter          = metrics.NewCounter()
    EventSubscriberCounter = metrics.NewCounter()
)

func Metrics() map[string]interface{} {
    return map[string]interface{}{
        "container-start-time": ContainerStartTimer,
        "containers":           ContainersCounter,
        "events":               EventsCounter,
        "events-subscribers":   EventSubscriberCounter,
    }
}

type StatsEvent struct {
    s *Supervisor
}

type UnsubscribeStatsEvent struct {
    s *Supervisor
}

type StopStatsEvent struct {
    s *Supervisor
}

func (h *StatsEvent) Handle(e *Event) error {
    i, ok := h.s.containers[e.ID]
    if !ok {
        return ErrContainerNotFound
    }
    e.Stats = h.s.statsCollector.collect(i.container)
    return nil
}

func (h *UnsubscribeStatsEvent) Handle(e *Event) error {
    i, ok := h.s.containers[e.ID]
    if !ok {
        return ErrContainerNotFound
    }
    h.s.statsCollector.unsubscribe(i.container, e.Stats)
    return nil
}

func (h *StopStatsEvent) Handle(e *Event) error {
    i, ok := h.s.containers[e.ID]
    if !ok {
        return ErrContainerNotFound
    }
    h.s.statsCollector.stopCollection(i.container)
    return nil
}

supervisor/stats_collector.go (new file, 240 lines)
@@ -0,0 +1,240 @@
package supervisor

import (
    "bufio"
    "fmt"
    "os"
    "strconv"
    "strings"
    "sync"
    "time"

    "github.com/Sirupsen/logrus"
    "github.com/docker/containerd/api/grpc/types"
    "github.com/docker/containerd/runtime"
    "github.com/docker/docker/pkg/pubsub"
    "github.com/opencontainers/runc/libcontainer"
    "github.com/opencontainers/runc/libcontainer/cgroups"
    "github.com/opencontainers/runc/libcontainer/system"
)

func convertBlkioEntryToPb(b []cgroups.BlkioStatEntry) []*types.BlkioStatsEntry {
    var pbEs []*types.BlkioStatsEntry
    for _, e := range b {
        pbEs = append(pbEs, &types.BlkioStatsEntry{
            Major: e.Major,
            Minor: e.Minor,
            Op:    e.Op,
            Value: e.Value,
        })
    }
    return pbEs
}

func convertToPb(st *runtime.Stat) *types.Stats {
    pbSt := &types.Stats{
        Timestamp:   uint64(st.Timestamp.Unix()),
        CgroupStats: &types.CgroupStats{},
    }
    lcSt, ok := st.Data.(*libcontainer.Stats)
    if !ok {
        return pbSt
    }
    cpuSt := lcSt.CgroupStats.CpuStats
    pbSt.CgroupStats.CpuStats = &types.CpuStats{
        CpuUsage: &types.CpuUsage{
            TotalUsage:        cpuSt.CpuUsage.TotalUsage,
            PercpuUsage:       cpuSt.CpuUsage.PercpuUsage,
            UsageInKernelmode: cpuSt.CpuUsage.UsageInKernelmode,
            UsageInUsermode:   cpuSt.CpuUsage.UsageInUsermode,
        },
        ThrottlingData: &types.ThrottlingData{
            Periods:          cpuSt.ThrottlingData.Periods,
            ThrottledPeriods: cpuSt.ThrottlingData.ThrottledPeriods,
            ThrottledTime:    cpuSt.ThrottlingData.ThrottledTime,
        },
    }
    memSt := lcSt.CgroupStats.MemoryStats
    pbSt.CgroupStats.MemoryStats = &types.MemoryStats{
        Cache: memSt.Cache,
        Usage: &types.MemoryData{
            Usage:    memSt.Usage.Usage,
            MaxUsage: memSt.Usage.MaxUsage,
            Failcnt:  memSt.Usage.Failcnt,
        },
        SwapUsage: &types.MemoryData{
            Usage:    memSt.SwapUsage.Usage,
            MaxUsage: memSt.SwapUsage.MaxUsage,
            Failcnt:  memSt.SwapUsage.Failcnt,
        },
    }
    blkSt := lcSt.CgroupStats.BlkioStats
    pbSt.CgroupStats.BlkioStats = &types.BlkioStats{
        IoServiceBytesRecursive: convertBlkioEntryToPb(blkSt.IoServiceBytesRecursive),
        IoServicedRecursive:     convertBlkioEntryToPb(blkSt.IoServicedRecursive),
        IoQueuedRecursive:       convertBlkioEntryToPb(blkSt.IoQueuedRecursive),
        IoServiceTimeRecursive:  convertBlkioEntryToPb(blkSt.IoServiceTimeRecursive),
        IoWaitTimeRecursive:     convertBlkioEntryToPb(blkSt.IoWaitTimeRecursive),
        IoMergedRecursive:       convertBlkioEntryToPb(blkSt.IoMergedRecursive),
        IoTimeRecursive:         convertBlkioEntryToPb(blkSt.IoTimeRecursive),
        SectorsRecursive:        convertBlkioEntryToPb(blkSt.SectorsRecursive),
    }
    pbSt.CgroupStats.HugetlbStats = make(map[string]*types.HugetlbStats)
    for k, st := range lcSt.CgroupStats.HugetlbStats {
        pbSt.CgroupStats.HugetlbStats[k] = &types.HugetlbStats{
            Usage:    st.Usage,
            MaxUsage: st.MaxUsage,
            Failcnt:  st.Failcnt,
        }
    }
    return pbSt
}

type statsPair struct {
    ct  runtime.Container
    pub *pubsub.Publisher
}

func newStatsCollector(interval time.Duration) *statsCollector {
    s := &statsCollector{
        interval:            interval,
        clockTicksPerSecond: uint64(system.GetClockTicks()),
        bufReader:           bufio.NewReaderSize(nil, 128),
        publishers:          make(map[string]*statsPair),
    }
    go s.run()
    return s
}

// statsCollector manages and provides container resource stats
type statsCollector struct {
    m                   sync.Mutex
    supervisor          *Supervisor
    interval            time.Duration
    clockTicksPerSecond uint64
    publishers          map[string]*statsPair
    bufReader           *bufio.Reader
}

// collect registers the container with the collector and adds it to
// the event loop for collection on the specified interval returning
// a channel for the subscriber to receive on.
func (s *statsCollector) collect(c runtime.Container) chan interface{} {
    s.m.Lock()
    defer s.m.Unlock()
    publisher, exists := s.publishers[c.ID()]
    if !exists {
        pub := pubsub.NewPublisher(100*time.Millisecond, 1024)
        publisher = &statsPair{ct: c, pub: pub}
        s.publishers[c.ID()] = publisher
    }
    return publisher.pub.Subscribe()
}

// stopCollection closes the channels for all subscribers and removes
// the container from metrics collection.
func (s *statsCollector) stopCollection(c runtime.Container) {
    s.m.Lock()
    if publisher, exists := s.publishers[c.ID()]; exists {
        publisher.pub.Close()
        delete(s.publishers, c.ID())
    }
    s.m.Unlock()
}

// unsubscribe removes a specific subscriber from receiving updates for a container's stats.
func (s *statsCollector) unsubscribe(c runtime.Container, ch chan interface{}) {
    s.m.Lock()
    publisher := s.publishers[c.ID()]
    if publisher != nil {
        publisher.pub.Evict(ch)
        if publisher.pub.Len() == 0 {
            delete(s.publishers, c.ID())
        }
    }
    s.m.Unlock()
}

func (s *statsCollector) run() {
    type publishersPair struct {
        container runtime.Container
        publisher *pubsub.Publisher
    }
    // we cannot determine the capacity here.
    // it will grow enough in first iteration
    var pairs []*statsPair

    for range time.Tick(s.interval) {
        // it does not make sense in the first iteration,
        // but saves allocations in further iterations
        pairs = pairs[:0]

        s.m.Lock()
        for _, publisher := range s.publishers {
            // copy pointers here to release the lock ASAP
            pairs = append(pairs, publisher)
        }
        s.m.Unlock()
        if len(pairs) == 0 {
            continue
        }

        for _, pair := range pairs {
            stats, err := pair.ct.Stats()
            if err != nil {
                logrus.Errorf("Error getting stats for container ID %s", pair.ct.ID())
                continue
            }

            pair.pub.Publish(convertToPb(stats))
        }
    }
}

const nanoSecondsPerSecond = 1e9

// getSystemCPUUsage returns the host system's cpu usage in
// nanoseconds. An error is returned if the format of the underlying
// file does not match.
//
// Uses /proc/stat defined by POSIX. Looks for the cpu
// statistics line and then sums up the first seven fields
// provided. See `man 5 proc` for details on specific field
// information.
func (s *statsCollector) getSystemCPUUsage() (uint64, error) {
    var line string
    f, err := os.Open("/proc/stat")
    if err != nil {
        return 0, err
    }
    defer func() {
        s.bufReader.Reset(nil)
        f.Close()
    }()
    s.bufReader.Reset(f)
    err = nil
    for err == nil {
        line, err = s.bufReader.ReadString('\n')
        if err != nil {
            break
        }
        parts := strings.Fields(line)
        switch parts[0] {
        case "cpu":
            if len(parts) < 8 {
                return 0, fmt.Errorf("bad format of cpu stats")
            }
            var totalClockTicks uint64
            for _, i := range parts[1:8] {
                v, err := strconv.ParseUint(i, 10, 64)
                if err != nil {
                    return 0, fmt.Errorf("error parsing cpu stats")
                }
                totalClockTicks += v
            }
            return (totalClockTicks * nanoSecondsPerSecond) /
                s.clockTicksPerSecond, nil
        }
    }
    return 0, fmt.Errorf("bad stats format")
}

supervisor/supervisor.go (new file, 235 lines)
@@ -0,0 +1,235 @@
package supervisor

import (
    "os"
    "os/signal"
    "path/filepath"
    "sync"
    "syscall"
    "time"

    "github.com/Sirupsen/logrus"
    "github.com/docker/containerd/eventloop"
    "github.com/docker/containerd/runtime"
    "github.com/opencontainers/runc/libcontainer"
)

const (
    statsInterval     = 1 * time.Second
    defaultBufferSize = 2048 // size of queue in eventloop
)

// New returns an initialized Process supervisor.
func New(id, stateDir string, tasks chan *StartTask, oom bool) (*Supervisor, error) {
    if err := os.MkdirAll(stateDir, 0755); err != nil {
        return nil, err
    }
    // register counters
    r, err := newRuntime(filepath.Join(stateDir, id))
    if err != nil {
        return nil, err
    }
    machine, err := CollectMachineInformation(id)
    if err != nil {
        return nil, err
    }
    s := &Supervisor{
        stateDir:       stateDir,
        containers:     make(map[string]*containerInfo),
        processes:      make(map[int]*containerInfo),
        runtime:        r,
        tasks:          tasks,
        machine:        machine,
        subscribers:    make(map[chan *Event]struct{}),
        statsCollector: newStatsCollector(statsInterval),
        el:             eventloop.NewChanLoop(defaultBufferSize),
    }
    if oom {
        s.notifier = newNotifier(s)
    }
    // register default event handlers
    s.handlers = map[EventType]Handler{
        ExecExitEventType:         &ExecExitEvent{s},
        ExitEventType:             &ExitEvent{s},
        StartContainerEventType:   &StartEvent{s},
        DeleteEventType:           &DeleteEvent{s},
        GetContainerEventType:     &GetContainersEvent{s},
        SignalEventType:           &SignalEvent{s},
        AddProcessEventType:       &AddProcessEvent{s},
        UpdateContainerEventType:  &UpdateEvent{s},
        CreateCheckpointEventType: &CreateCheckpointEvent{s},
        DeleteCheckpointEventType: &DeleteCheckpointEvent{s},
        StatsEventType:            &StatsEvent{s},
        UnsubscribeStatsEventType: &UnsubscribeStatsEvent{s},
        StopStatsEventType:        &StopStatsEvent{s},
    }
    // start the container workers for concurrent container starts
    return s, nil
}

type containerInfo struct {
    container runtime.Container
    copier    *copier
}

type Supervisor struct {
    // stateDir is the directory on the system to store container runtime state information.
    stateDir   string
    containers map[string]*containerInfo
    processes  map[int]*containerInfo
    handlers   map[EventType]Handler
    runtime    runtime.Runtime
    events     chan *Event
    tasks      chan *StartTask
    // we need a lock around the subscribers map only because additions and deletions from
    // the map are via the API so we cannot really control the concurrency
    subscriberLock sync.RWMutex
    subscribers    map[chan *Event]struct{}
    machine        Machine
    containerGroup sync.WaitGroup
    statsCollector *statsCollector
    notifier       *notifier
    el             eventloop.EventLoop
}

// Stop closes all tasks and sends a SIGTERM to each container's pid1, then waits for them to
// terminate. After it has handled all the SIGCHLD events it will close the signals chan
// and exit. Stop is a non-blocking call and will return after the containers have been signaled
func (s *Supervisor) Stop(sig chan os.Signal) {
    // Close the tasks channel so that no new containers get started
    close(s.tasks)
    // send a SIGTERM to all containers
    for id, i := range s.containers {
        c := i.container
        logrus.WithField("id", id).Debug("sending TERM to container processes")
        procs, err := c.Processes()
        if err != nil {
            logrus.WithField("id", id).Warn("get container processes")
            continue
        }
        if len(procs) == 0 {
            continue
        }
        mainProc := procs[0]
        if err := mainProc.Signal(syscall.SIGTERM); err != nil {
            pid, _ := mainProc.Pid()
            logrus.WithFields(logrus.Fields{
                "id":    id,
                "pid":   pid,
                "error": err,
            }).Error("send SIGTERM to process")
        }
    }
    go func() {
        logrus.Debug("waiting for containers to exit")
        s.containerGroup.Wait()
        logrus.Debug("all containers exited")
        // stop receiving signals and close the channel
        signal.Stop(sig)
        close(sig)
    }()
}

// Close closes any open files in the supervisor but expects that Stop has been
// called so that no more containers are started.
func (s *Supervisor) Close() error {
    return nil
}

// Events returns an event channel that external consumers can use to receive updates
// on container events
func (s *Supervisor) Events() chan *Event {
    s.subscriberLock.Lock()
    defer s.subscriberLock.Unlock()
    c := make(chan *Event, defaultBufferSize)
    EventSubscriberCounter.Inc(1)
    s.subscribers[c] = struct{}{}
    return c
}

// Unsubscribe removes the provided channel from receiving any more events
func (s *Supervisor) Unsubscribe(sub chan *Event) {
    s.subscriberLock.Lock()
    defer s.subscriberLock.Unlock()
    delete(s.subscribers, sub)
    close(sub)
    EventSubscriberCounter.Dec(1)
}

// notifySubscribers will send the provided event to the external subscribers
// of the events channel
func (s *Supervisor) notifySubscribers(e *Event) {
    s.subscriberLock.RLock()
    defer s.subscriberLock.RUnlock()
    for sub := range s.subscribers {
        // do a non-blocking send for the channel
        select {
        case sub <- e:
        default:
            logrus.WithField("event", e.Type).Warn("event not sent to subscriber")
        }
    }
}

// Start is a non-blocking call that runs the supervisor for monitoring container processes and
// executing new containers.
//
// This event loop is the only thing that is allowed to modify state of containers and processes
// therefore it is safe to do operations in the handlers that modify state of the system or
// state of the Supervisor
func (s *Supervisor) Start() error {
    logrus.WithFields(logrus.Fields{
        "runtime":  s.runtime.Type(),
        "stateDir": s.stateDir,
    }).Debug("Supervisor started")
    return s.el.Start()
}

// Machine returns the machine information for which the
// supervisor is executing on.
func (s *Supervisor) Machine() Machine {
    return s.machine
}

// getContainerForPid returns the container where the provided pid is the pid1 or main
// process in the container
func (s *Supervisor) getContainerForPid(pid int) (runtime.Container, error) {
    for _, i := range s.containers {
        container := i.container
        cpid, err := container.Pid()
        if err != nil {
            if lerr, ok := err.(libcontainer.Error); ok {
                if lerr.Code() == libcontainer.ProcessNotExecuted {
                    continue
                }
            }
            logrus.WithField("error", err).Error("containerd: get container pid")
        }
        if pid == cpid {
            return container, nil
        }
    }
    return nil, errNoContainerForPid
}

// SendEvent sends the provided event to the supervisor's main event loop
func (s *Supervisor) SendEvent(evt *Event) {
    EventsCounter.Inc(1)
    s.el.Send(&commonEvent{data: evt, sv: s})
}

func (s *Supervisor) copyIO(stdin, stdout, stderr string, i *runtime.IO) (*copier, error) {
    config := &ioConfig{
        Stdin:      i.Stdin,
        Stdout:     i.Stdout,
        Stderr:     i.Stderr,
        StdoutPath: stdout,
        StderrPath: stderr,
        StdinPath:  stdin,
    }
    l, err := newCopier(config)
    if err != nil {
        return nil, err
    }
    return l, nil
}

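Not part of the diff: a minimal sketch of how a daemon might wire this package together after the move. The import path github.com/docker/containerd/supervisor, the id/stateDir arguments, and the worker count of 2 are assumptions for illustration; New, NewWorker, and Start are the functions introduced above.

    package main

    import (
        "log"
        "sync"

        "github.com/docker/containerd/supervisor" // assumed import path for the new package
    )

    func main() {
        tasks := make(chan *supervisor.StartTask, 10)
        // id and stateDir are illustrative values; oom notification disabled here
        sv, err := supervisor.New("my-machine-id", "/run/containerd", tasks, false)
        if err != nil {
            log.Fatal(err)
        }
        var wg sync.WaitGroup
        for i := 0; i < 2; i++ { // arbitrary number of workers draining the tasks channel
            wg.Add(1)
            w := supervisor.NewWorker(sv, &wg)
            go w.Start()
        }
        if err := sv.Start(); err != nil { // non-blocking: starts the event loop
            log.Fatal(err)
        }
        select {} // block; a real daemon would serve its API here
    }
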
supervisor/supervisor_linux.go (new file, 12 lines)
@@ -0,0 +1,12 @@
// +build libcontainer

package supervisor

import (
    "github.com/docker/containerd/linux"
    "github.com/docker/containerd/runtime"
)

func newRuntime(stateDir string) (runtime.Runtime, error) {
    return linux.NewRuntime(stateDir)
}

supervisor/supervisor_runc.go (new file, 12 lines)
@@ -0,0 +1,12 @@
// +build runc

package supervisor

import (
    "github.com/docker/containerd/runc"
    "github.com/docker/containerd/runtime"
)

func newRuntime(stateDir string) (runtime.Runtime, error) {
    return runc.NewRuntime(stateDir)
}

supervisor/supervisor_unsupported.go (new file, 13 lines)
@@ -0,0 +1,13 @@
// +build !libcontainer,!runc

package supervisor

import (
    "errors"

    "github.com/docker/containerd/runtime"
)

func newRuntime(stateDir string) (runtime.Runtime, error) {
    return nil, errors.New("unsupported platform")
}

supervisor/update.go (new file, 41 lines)
@@ -0,0 +1,41 @@
package supervisor

import "github.com/docker/containerd/runtime"

type UpdateEvent struct {
    s *Supervisor
}

func (h *UpdateEvent) Handle(e *Event) error {
    i, ok := h.s.containers[e.ID]
    if !ok {
        return ErrContainerNotFound
    }
    container := i.container
    if e.State.Status != "" {
        switch e.State.Status {
        case runtime.Running:
            if err := container.Resume(); err != nil {
                return ErrUnknownContainerStatus
            }
        case runtime.Paused:
            if err := container.Pause(); err != nil {
                return ErrUnknownContainerStatus
            }
        default:
            return ErrUnknownContainerStatus
        }
    }
    if e.Signal != nil {
        // signal the pid1/main process of the container
        processes, err := container.Processes()
        if err != nil {
            return err
        }
        if len(processes) == 0 {
            return ErrProcessNotFound
        }
        return processes[0].Signal(e.Signal)
    }
    return nil
}

supervisor/worker.go (new file, 86 lines)
@@ -0,0 +1,86 @@
package supervisor

import (
    "sync"
    "time"

    "github.com/Sirupsen/logrus"
    "github.com/docker/containerd/runtime"
)

type Worker interface {
    Start()
}

type StartTask struct {
    Container     runtime.Container
    Checkpoint    string
    IO            *runtime.IO
    Stdin         string
    Stdout        string
    Stderr        string
    Err           chan error
    StartResponse chan StartResponse
}

func NewWorker(s *Supervisor, wg *sync.WaitGroup) Worker {
    return &worker{
        s:  s,
        wg: wg,
    }
}

type worker struct {
    wg *sync.WaitGroup
    s  *Supervisor
}

func (w *worker) Start() {
    defer w.wg.Done()
    for t := range w.s.tasks {
        started := time.Now()
        l, err := w.s.copyIO(t.Stdin, t.Stdout, t.Stderr, t.IO)
        if err != nil {
            evt := NewEvent(DeleteEventType)
            evt.ID = t.Container.ID()
            w.s.SendEvent(evt)
            t.Err <- err
            continue
        }
        w.s.containers[t.Container.ID()].copier = l
        if t.Checkpoint != "" {
            if err := t.Container.Restore(t.Checkpoint); err != nil {
                evt := NewEvent(DeleteEventType)
                evt.ID = t.Container.ID()
                w.s.SendEvent(evt)
                t.Err <- err
                continue
            }
        } else {
            if err := t.Container.Start(); err != nil {
                evt := NewEvent(DeleteEventType)
                evt.ID = t.Container.ID()
                w.s.SendEvent(evt)
                t.Err <- err
                continue
            }
        }
        pid, err := t.Container.Pid()
        if err != nil {
            logrus.WithField("error", err).Error("containerd: get container main pid")
        }
        if w.s.notifier != nil {
            n, err := t.Container.OOM()
            if err != nil {
                logrus.WithField("error", err).Error("containerd: notify OOM events")
            } else {
                w.s.notifier.Add(n, t.Container.ID())
            }
        }
        ContainerStartTimer.UpdateSince(started)
        t.Err <- nil
        t.StartResponse <- StartResponse{
            Pid: pid,
        }
    }
}