From 33fab479ef9243cbb29769ea3ddff5f9d64d9211 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Fri, 21 Feb 2014 14:35:43 +0100 Subject: [PATCH] cgroups: Add systemd implementation of cgroups This implements cgroup.Apply() using the systemd apis. We create a transient unit called "docker-$id.scope" that contains the container processes. We also have a way to set unit specific properties, currently only defining the Slice to put the scope in. Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- cgroups/apply_nosystemd.go | 15 ++++ cgroups/apply_systemd.go | 158 +++++++++++++++++++++++++++++++++++++ cgroups/cgroups.go | 13 ++- 3 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 cgroups/apply_nosystemd.go create mode 100644 cgroups/apply_systemd.go diff --git a/cgroups/apply_nosystemd.go b/cgroups/apply_nosystemd.go new file mode 100644 index 0000000..f94d475 --- /dev/null +++ b/cgroups/apply_nosystemd.go @@ -0,0 +1,15 @@ +// +build !linux + +package cgroups + +import ( + "fmt" +) + +func useSystemd() bool { + return false +} + +func systemdApply(c *Cgroup, pid int) (ActiveCgroup, error) { + return nil, fmt.Errorf("Systemd not supported") +} diff --git a/cgroups/apply_systemd.go b/cgroups/apply_systemd.go new file mode 100644 index 0000000..c689d57 --- /dev/null +++ b/cgroups/apply_systemd.go @@ -0,0 +1,158 @@ +// +build linux + +package cgroups + +import ( + "fmt" + systemd1 "github.com/coreos/go-systemd/dbus" + "github.com/dotcloud/docker/pkg/systemd" + "github.com/godbus/dbus" + "path/filepath" + "strings" + "sync" +) + +type systemdCgroup struct { +} + +var ( + connLock sync.Mutex + theConn *systemd1.Conn + hasStartTransientUnit bool +) + +func useSystemd() bool { + if !systemd.SdBooted() { + return false + } + + connLock.Lock() + defer connLock.Unlock() + + if theConn == nil { + var err error + theConn, err = systemd1.New() + if err != nil { + return false + } + + // Assume we have StartTransientUnit + hasStartTransientUnit = true + + // But if we get UnknownMethod error we don't + if _, err := theConn.StartTransientUnit("test.scope", "invalid"); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" { + hasStartTransientUnit = false + } + } + } + } + + return hasStartTransientUnit +} + +type DeviceAllow struct { + Node string + Permissions string +} + +func getIfaceForUnit(unitName string) string { + if strings.HasSuffix(unitName, ".scope") { + return "Scope" + } + if strings.HasSuffix(unitName, ".service") { + return "Service" + } + return "Unit" +} + +func systemdApply(c *Cgroup, pid int) (ActiveCgroup, error) { + unitName := c.Parent + "-" + c.Name + ".scope" + slice := "system.slice" + + var properties []systemd1.Property + + for _, v := range c.UnitProperties { + switch v[0] { + case "Slice": + slice = v[1] + default: + return nil, fmt.Errorf("Unknown unit propery %s", v[0]) + } + } + + properties = append(properties, + systemd1.Property{"Slice", dbus.MakeVariant(slice)}, + systemd1.Property{"Description", dbus.MakeVariant("docker container " + c.Name)}, + systemd1.Property{"PIDs", dbus.MakeVariant([]uint32{uint32(pid)})}) + + if !c.DeviceAccess { + properties = append(properties, + systemd1.Property{"DevicePolicy", dbus.MakeVariant("strict")}, + systemd1.Property{"DeviceAllow", dbus.MakeVariant([]DeviceAllow{ + {"/dev/null", "rwm"}, + {"/dev/zero", "rwm"}, + {"/dev/full", "rwm"}, + {"/dev/random", "rwm"}, + {"/dev/urandom", "rwm"}, + {"/dev/tty", "rwm"}, + {"/dev/console", "rwm"}, + {"/dev/tty0", "rwm"}, + {"/dev/tty1", "rwm"}, + {"/dev/pts/ptmx", "rwm"}, + // There is no way to add /dev/pts/* here atm, so we hack this manually below + // /dev/pts/* (how to add this?) + // Same with tuntap, which doesn't exist as a node most of the time + })}) + } + + if c.Memory != 0 { + properties = append(properties, + systemd1.Property{"MemoryLimit", dbus.MakeVariant(uint64(c.Memory))}) + } + // TODO: MemorySwap not available in systemd + + if c.CpuShares != 0 { + properties = append(properties, + systemd1.Property{"CPUShares", dbus.MakeVariant(uint64(c.CpuShares))}) + } + + if _, err := theConn.StartTransientUnit(unitName, "replace", properties...); err != nil { + return nil, err + } + + // To work around the lack of /dev/pts/* support above we need to manually add these + // so, ask systemd for the cgroup used + props, err := theConn.GetUnitTypeProperties(unitName, getIfaceForUnit(unitName)) + if err != nil { + return nil, err + } + + cgroup := props["ControlGroup"].(string) + + if !c.DeviceAccess { + mountpoint, err := FindCgroupMountpoint("devices") + if err != nil { + return nil, err + } + + path := filepath.Join(mountpoint, cgroup) + + // /dev/pts/* + if err := writeFile(path, "devices.allow", "c 136:* rwm"); err != nil { + return nil, err + } + // tuntap + if err := writeFile(path, "devices.allow", "c 10:200 rwm"); err != nil { + return nil, err + } + } + + return &systemdCgroup{}, nil +} + +func (c *systemdCgroup) Cleanup() error { + // systemd cleans up, we don't need to do anything + return nil +} diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index f35556f..8554ba9 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -19,6 +19,8 @@ type Cgroup struct { Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) + + UnitProperties [][2]string `json:"unit_properties,omitempty"` // systemd unit properties } type ActiveCgroup interface { @@ -87,5 +89,14 @@ func writeFile(dir, file, data string) error { } func (c *Cgroup) Apply(pid int) (ActiveCgroup, error) { - return rawApply(c, pid) + // We have two implementation of cgroups support, one is based on + // systemd and the dbus api, and one is based on raw cgroup fs operations + // following the pre-single-writer model docs at: + // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ + + if useSystemd() { + return systemdApply(c, pid) + } else { + return rawApply(c, pid) + } }