From 3d2d4a062464d5c9bbde9a56ea40e611f5125ae3 Mon Sep 17 00:00:00 2001 From: Darren Shepherd Date: Mon, 3 Mar 2014 21:53:57 -0700 Subject: [PATCH 1/7] Support hairpin NAT without going through docker server Hairpin NAT is currently done by passing through the docker server. If two containers on the same box try to access each other through exposed ports and using the host IP the current iptables rules will not match the DNAT and thus the traffic goes to 'docker -d' This change drops the restriction that DNAT traffic must not originate from docker0. It should be safe to drop this restriction because the DOCKER chain is already gated by jumps that check for the destination address to be a local address. Docker-DCO-1.1-Signed-off-by: Darren Shepherd (github: ibuildthecloud) --- iptables/iptables.go | 1 - 1 file changed, 1 deletion(-) diff --git a/iptables/iptables.go b/iptables/iptables.go index 4cdd67e..1f25952 100644 --- a/iptables/iptables.go +++ b/iptables/iptables.go @@ -66,7 +66,6 @@ func (c *Chain) Forward(action Action, ip net.IP, port int, proto, dest_addr str "-p", proto, "-d", daddr, "--dport", strconv.Itoa(port), - "!", "-i", c.Bridge, "-j", "DNAT", "--to-destination", net.JoinHostPort(dest_addr, strconv.Itoa(dest_port))); err != nil { return err From 2b01982d1fb1d16d0f8aea89c56ebd70d2a97e2f Mon Sep 17 00:00:00 2001 From: Dan Walsh Date: Tue, 18 Mar 2014 16:49:16 -0400 Subject: [PATCH 2/7] This patch adds SELinux labeling support. docker will run the process(es) within the container with an SELinux label and will label all of the content within the container with mount label. Any temporary file systems created within the container need to be mounted with the same mount label. The user can override the process label by specifying -Z With a string of space separated options. -Z "user=unconfined_u role=unconfined_r type=unconfined_t level=s0" Would cause the process label to run with unconfined_u:unconfined_r:unconfined_t:s0" By default the processes will run execute within the container as svirt_lxc_net_t. All of the content in the container as svirt_sandbox_file_t. The process mcs level is based of the PID of the docker process that is creating the container. If you run the container in --priv mode, the labeling will be disabled. Docker-DCO-1.1-Signed-off-by: Dan Walsh (github: rhatdan) --- label/label.go | 23 ++ label/label_selinux.go | 69 ++++++ libcontainer/nsinit/execin.go | 11 +- libcontainer/nsinit/init.go | 8 +- libcontainer/nsinit/mount.go | 22 +- selinux/selinux.go | 387 ++++++++++++++++++++++++++++++++++ selinux/selinux_test.go | 64 ++++++ 7 files changed, 573 insertions(+), 11 deletions(-) create mode 100644 label/label.go create mode 100644 label/label_selinux.go create mode 100644 selinux/selinux.go create mode 100644 selinux/selinux_test.go diff --git a/label/label.go b/label/label.go new file mode 100644 index 0000000..ba1e9f4 --- /dev/null +++ b/label/label.go @@ -0,0 +1,23 @@ +// +build !selinux !linux + +package label + +func GenLabels(options string) (string, string, error) { + return "", "", nil +} + +func FormatMountLabel(src string, MountLabel string) string { + return src +} + +func SetProcessLabel(processLabel string) error { + return nil +} + +func SetFileLabel(path string, fileLabel string) error { + return nil +} + +func GetPidCon(pid int) (string, error) { + return "", nil +} diff --git a/label/label_selinux.go b/label/label_selinux.go new file mode 100644 index 0000000..300a8b6 --- /dev/null +++ b/label/label_selinux.go @@ -0,0 +1,69 @@ +// +build selinux,linux + +package label + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/selinux" + "strings" +) + +func GenLabels(options string) (string, string, error) { + processLabel, mountLabel := selinux.GetLxcContexts() + var err error + if processLabel == "" { // SELinux is disabled + return "", "", err + } + s := strings.Fields(options) + l := len(s) + if l > 0 { + pcon := selinux.NewContext(processLabel) + for i := 0; i < l; i++ { + o := strings.Split(s[i], "=") + pcon[o[0]] = o[1] + } + processLabel = pcon.Get() + mountLabel, err = selinux.CopyLevel(processLabel, mountLabel) + } + return processLabel, mountLabel, err +} + +func FormatMountLabel(src string, MountLabel string) string { + var mountLabel string + if src != "" { + mountLabel = src + if MountLabel != "" { + mountLabel = fmt.Sprintf("%s,context=\"%s\"", mountLabel, MountLabel) + } + } else { + if MountLabel != "" { + mountLabel = fmt.Sprintf("context=\"%s\"", MountLabel) + } + } + return mountLabel +} + +func SetProcessLabel(processLabel string) error { + if selinux.SelinuxEnabled() { + return selinux.Setexeccon(processLabel) + } + return nil +} + +func GetProcessLabel() (string, error) { + if selinux.SelinuxEnabled() { + return selinux.Getexeccon() + } + return "", nil +} + +func SetFileLabel(path string, fileLabel string) error { + if selinux.SelinuxEnabled() && fileLabel != "" { + return selinux.Setfilecon(path, fileLabel) + } + return nil +} + +func GetPidCon(pid int) (string, error) { + return selinux.Getpidcon(pid) +} diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index f8b8931..9017af0 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -4,6 +4,7 @@ package nsinit import ( "fmt" + "github.com/dotcloud/docker/pkg/label" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/system" "os" @@ -32,7 +33,11 @@ func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []s closeFds() return -1, err } - + processLabel, err := label.GetPidCon(nspid) + if err != nil { + closeFds() + return -1, err + } // foreach namespace fd, use setns to join an existing container's namespaces for _, fd := range fds { if fd > 0 { @@ -80,6 +85,10 @@ dropAndExec: if err := finalizeNamespace(container); err != nil { return -1, err } + err = label.SetProcessLabel(processLabel) + if err != nil { + return -1, err + } if err := system.Execv(args[0], args[0:], container.Env); err != nil { return -1, err } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 117ae87..5aa5f9f 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -4,6 +4,7 @@ package nsinit import ( "fmt" + "github.com/dotcloud/docker/pkg/label" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/apparmor" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" @@ -12,6 +13,7 @@ import ( "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/user" "os" + "runtime" "syscall" ) @@ -57,7 +59,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol return fmt.Errorf("parent death signal %s", err) } ns.logger.Println("setup mount namespace") - if err := setupNewMountNamespace(rootfs, container.Mounts, console, container.ReadonlyFs, container.NoPivotRoot); err != nil { + if err := setupNewMountNamespace(rootfs, container.Mounts, console, container.ReadonlyFs, container.NoPivotRoot, container.Context["mount_label"]); err != nil { return fmt.Errorf("setup mount namespace %s", err) } if err := setupNetwork(container, context); err != nil { @@ -76,6 +78,10 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol return err } } + runtime.LockOSThread() + if err := label.SetProcessLabel(container.Context["process_label"]); err != nil { + return fmt.Errorf("SetProcessLabel label %s", err) + } ns.logger.Printf("execing %s\n", args[0]) return system.Execv(args[0], args[0:], container.Env) } diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 61a9012..796143c 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -4,6 +4,7 @@ package nsinit import ( "fmt" + "github.com/dotcloud/docker/pkg/label" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/system" "io/ioutil" @@ -20,7 +21,7 @@ const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NOD // // There is no need to unmount the new mounts because as soon as the mount namespace // is no longer in use, the mounts will be removed automatically -func setupNewMountNamespace(rootfs string, bindMounts []libcontainer.Mount, console string, readonly, noPivotRoot bool) error { +func setupNewMountNamespace(rootfs string, bindMounts []libcontainer.Mount, console string, readonly, noPivotRoot bool, mountLabel string) error { flag := syscall.MS_PRIVATE if noPivotRoot { flag = syscall.MS_SLAVE @@ -36,7 +37,7 @@ func setupNewMountNamespace(rootfs string, bindMounts []libcontainer.Mount, cons return fmt.Errorf("mounting %s as readonly %s", rootfs, err) } } - if err := mountSystem(rootfs); err != nil { + if err := mountSystem(rootfs, mountLabel); err != nil { return fmt.Errorf("mount system %s", err) } @@ -64,7 +65,7 @@ func setupNewMountNamespace(rootfs string, bindMounts []libcontainer.Mount, cons if err := setupDev(rootfs); err != nil { return err } - if err := setupPtmx(rootfs, console); err != nil { + if err := setupPtmx(rootfs, console, mountLabel); err != nil { return err } if err := system.Chdir(rootfs); err != nil { @@ -196,7 +197,7 @@ func setupDev(rootfs string) error { } // setupConsole ensures that the container has a proper /dev/console setup -func setupConsole(rootfs, console string) error { +func setupConsole(rootfs, console string, mountLabel string) error { oldMask := system.Umask(0000) defer system.Umask(oldMask) @@ -220,6 +221,9 @@ func setupConsole(rootfs, console string) error { if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { return fmt.Errorf("mknod %s %s", dest, err) } + if err := label.SetFileLabel(console, mountLabel); err != nil { + return fmt.Errorf("SetFileLabel Failed %s %s", dest, err) + } if err := system.Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { return fmt.Errorf("bind %s to %s %s", console, dest, err) } @@ -228,7 +232,7 @@ func setupConsole(rootfs, console string) error { // mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts // inside the mount namespace -func mountSystem(rootfs string) error { +func mountSystem(rootfs string, mountLabel string) error { for _, m := range []struct { source string path string @@ -238,8 +242,8 @@ func mountSystem(rootfs string) error { }{ {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}, - {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: "mode=1777,size=65536k"}, - {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"}, + {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1755,size=65536k", mountLabel)}, + {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)}, } { if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { return fmt.Errorf("mkdirall %s %s", m.path, err) @@ -253,7 +257,7 @@ func mountSystem(rootfs string) error { // setupPtmx adds a symlink to pts/ptmx for /dev/ptmx and // finishes setting up /dev/console -func setupPtmx(rootfs, console string) error { +func setupPtmx(rootfs, console string, mountLabel string) error { ptmx := filepath.Join(rootfs, "dev/ptmx") if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { return err @@ -262,7 +266,7 @@ func setupPtmx(rootfs, console string) error { return fmt.Errorf("symlink dev ptmx %s", err) } if console != "" { - if err := setupConsole(rootfs, console); err != nil { + if err := setupConsole(rootfs, console, mountLabel); err != nil { return err } } diff --git a/selinux/selinux.go b/selinux/selinux.go new file mode 100644 index 0000000..5236d3f --- /dev/null +++ b/selinux/selinux.go @@ -0,0 +1,387 @@ +package selinux + +import ( + "bufio" + "crypto/rand" + "encoding/binary" + "fmt" + "github.com/dotcloud/docker/pkg/mount" + "github.com/dotcloud/docker/pkg/system" + "io" + "os" + "regexp" + "strconv" + "strings" + "syscall" +) + +const ( + Enforcing = 1 + Permissive = 0 + Disabled = -1 + selinuxDir = "/etc/selinux/" + selinuxConfig = selinuxDir + "config" + selinuxTypeTag = "SELINUXTYPE" + selinuxTag = "SELINUX" + selinuxPath = "/sys/fs/selinux" + xattrNameSelinux = "security.selinux" + stRdOnly = 0x01 +) + +var ( + assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`) + spaceRegex = regexp.MustCompile(`^([^=]+) (.*)$`) + mcsList = make(map[string]bool) + selinuxfs = "unknown" + selinuxEnabled = false + selinuxEnabledChecked = false +) + +type SELinuxContext map[string]string + +func GetSelinuxMountPoint() string { + if selinuxfs != "unknown" { + return selinuxfs + } + selinuxfs = "" + + mounts, err := mount.GetMounts() + if err != nil { + return selinuxfs + } + for _, mount := range mounts { + if mount.Fstype == "selinuxfs" { + selinuxfs = mount.Mountpoint + break + } + } + if selinuxfs != "" { + var buf syscall.Statfs_t + syscall.Statfs(selinuxfs, &buf) + if (buf.Flags & stRdOnly) == 1 { + selinuxfs = "" + } + } + return selinuxfs +} + +func SelinuxEnabled() bool { + if selinuxEnabledChecked { + return selinuxEnabled + } + selinuxEnabledChecked = true + if fs := GetSelinuxMountPoint(); fs != "" { + if con, _ := Getcon(); con != "kernel" { + selinuxEnabled = true + } + } + return selinuxEnabled +} + +func ReadConfig(target string) (value string) { + var ( + val, key string + bufin *bufio.Reader + ) + + in, err := os.Open(selinuxConfig) + if err != nil { + return "" + } + defer in.Close() + + bufin = bufio.NewReader(in) + + for done := false; !done; { + var line string + if line, err = bufin.ReadString('\n'); err != nil { + if err != io.EOF { + return "" + } + done = true + } + line = strings.TrimSpace(line) + if len(line) == 0 { + // Skip blank lines + continue + } + if line[0] == ';' || line[0] == '#' { + // Skip comments + continue + } + if groups := assignRegex.FindStringSubmatch(line); groups != nil { + key, val = strings.TrimSpace(groups[1]), strings.TrimSpace(groups[2]) + if key == target { + return strings.Trim(val, "\"") + } + } + } + return "" +} + +func GetSELinuxPolicyRoot() string { + return selinuxDir + ReadConfig(selinuxTypeTag) +} + +func readCon(name string) (string, error) { + var val string + + in, err := os.Open(name) + if err != nil { + return "", err + } + defer in.Close() + + _, err = fmt.Fscanf(in, "%s", &val) + return val, err +} + +func Setfilecon(path string, scon string) error { + return system.Lsetxattr(path, xattrNameSelinux, []byte(scon), 0) +} + +func Getfilecon(path string) (string, error) { + var scon []byte + + cnt, err := syscall.Getxattr(path, xattrNameSelinux, scon) + scon = make([]byte, cnt) + cnt, err = syscall.Getxattr(path, xattrNameSelinux, scon) + return string(scon), err +} + +func Setfscreatecon(scon string) error { + return writeCon("/proc/self/attr/fscreate", scon) +} + +func Getfscreatecon() (string, error) { + return readCon("/proc/self/attr/fscreate") +} + +func Getcon() (string, error) { + return readCon("/proc/self/attr/current") +} + +func Getpidcon(pid int) (string, error) { + return readCon(fmt.Sprintf("/proc/%d/attr/current", pid)) +} + +func Getexeccon() (string, error) { + return readCon("/proc/self/attr/exec") +} + +func writeCon(name string, val string) error { + if !SelinuxEnabled() { + return nil + } + out, err := os.OpenFile(name, os.O_WRONLY, 0) + if err != nil { + return err + } + defer out.Close() + + if val != "" { + _, err = out.Write([]byte(val)) + } else { + _, err = out.Write(nil) + } + return err +} + +func Setexeccon(scon string) error { + return writeCon(fmt.Sprintf("/proc/self/task/%d/attr/exec", syscall.Gettid()), scon) +} + +func (c SELinuxContext) Get() string { + return fmt.Sprintf("%s:%s:%s:%s", c["user"], c["role"], c["type"], c["level"]) +} + +func NewContext(scon string) SELinuxContext { + c := make(SELinuxContext) + + if len(scon) != 0 { + con := strings.SplitN(scon, ":", 4) + c["user"] = con[0] + c["role"] = con[1] + c["type"] = con[2] + c["level"] = con[3] + } + return c +} + +func SelinuxGetEnforce() int { + var enforce int + + enforceS, err := readCon(fmt.Sprintf("%s/enforce", selinuxPath)) + if err != nil { + return -1 + } + + enforce, err = strconv.Atoi(string(enforceS)) + if err != nil { + return -1 + } + return enforce +} + +func SelinuxGetEnforceMode() int { + switch ReadConfig(selinuxTag) { + case "enforcing": + return Enforcing + case "permissive": + return Permissive + } + return Disabled +} + +func mcsAdd(mcs string) { + mcsList[mcs] = true +} + +func mcsDelete(mcs string) { + mcsList[mcs] = false +} + +func mcsExists(mcs string) bool { + return mcsList[mcs] +} + +func IntToMcs(id int, catRange uint32) string { + var ( + SETSIZE = int(catRange) + TIER = SETSIZE + ORD = id + ) + + if id < 1 || id > 523776 { + return "" + } + + for ORD > TIER { + ORD = ORD - TIER + TIER -= 1 + } + TIER = SETSIZE - TIER + ORD = ORD + TIER + return fmt.Sprintf("s0:c%d,c%d", TIER, ORD) +} + +func uniqMcs(catRange uint32) string { + var ( + n uint32 + c1, c2 uint32 + mcs string + ) + + for { + binary.Read(rand.Reader, binary.LittleEndian, &n) + c1 = n % catRange + binary.Read(rand.Reader, binary.LittleEndian, &n) + c2 = n % catRange + if c1 == c2 { + continue + } else { + if c1 > c2 { + t := c1 + c1 = c2 + c2 = t + } + } + mcs = fmt.Sprintf("s0:c%d,c%d", c1, c2) + if mcsExists(mcs) { + continue + } + mcsAdd(mcs) + break + } + return mcs +} + +func FreeContext(con string) { + if con != "" { + scon := NewContext(con) + mcsDelete(scon["level"]) + } +} + +func GetLxcContexts() (processLabel string, fileLabel string) { + var ( + val, key string + bufin *bufio.Reader + ) + + if !SelinuxEnabled() { + return "", "" + } + lxcPath := fmt.Sprintf("%s/content/lxc_contexts", GetSELinuxPolicyRoot()) + fileLabel = "system_u:object_r:svirt_sandbox_file_t:s0" + processLabel = "system_u:system_r:svirt_lxc_net_t:s0" + + in, err := os.Open(lxcPath) + if err != nil { + goto exit + } + defer in.Close() + + bufin = bufio.NewReader(in) + + for done := false; !done; { + var line string + if line, err = bufin.ReadString('\n'); err != nil { + if err == io.EOF { + done = true + } else { + goto exit + } + } + line = strings.TrimSpace(line) + if len(line) == 0 { + // Skip blank lines + continue + } + if line[0] == ';' || line[0] == '#' { + // Skip comments + continue + } + if groups := assignRegex.FindStringSubmatch(line); groups != nil { + key, val = strings.TrimSpace(groups[1]), strings.TrimSpace(groups[2]) + if key == "process" { + processLabel = strings.Trim(val, "\"") + } + if key == "file" { + fileLabel = strings.Trim(val, "\"") + } + } + } +exit: + mcs := IntToMcs(os.Getpid(), 1024) + scon := NewContext(processLabel) + scon["level"] = mcs + processLabel = scon.Get() + scon = NewContext(fileLabel) + scon["level"] = mcs + fileLabel = scon.Get() + return processLabel, fileLabel +} + +func SecurityCheckContext(val string) error { + return writeCon(fmt.Sprintf("%s.context", selinuxPath), val) +} + +func CopyLevel(src, dest string) (string, error) { + if !SelinuxEnabled() { + return "", nil + } + if src == "" { + return "", nil + } + if err := SecurityCheckContext(src); err != nil { + return "", err + } + if err := SecurityCheckContext(dest); err != nil { + return "", err + } + scon := NewContext(src) + tcon := NewContext(dest) + tcon["level"] = scon["level"] + return tcon.Get(), nil +} diff --git a/selinux/selinux_test.go b/selinux/selinux_test.go new file mode 100644 index 0000000..6b59c1d --- /dev/null +++ b/selinux/selinux_test.go @@ -0,0 +1,64 @@ +package selinux_test + +import ( + "github.com/dotcloud/docker/pkg/selinux" + "os" + "testing" +) + +func testSetfilecon(t *testing.T) { + if selinux.SelinuxEnabled() { + tmp := "selinux_test" + out, _ := os.OpenFile(tmp, os.O_WRONLY, 0) + out.Close() + err := selinux.Setfilecon(tmp, "system_u:object_r:bin_t:s0") + if err == nil { + t.Log(selinux.Getfilecon(tmp)) + } else { + t.Log("Setfilecon failed") + t.Fatal(err) + } + os.Remove(tmp) + } +} + +func TestSELinux(t *testing.T) { + var ( + err error + plabel, flabel string + ) + + if selinux.SelinuxEnabled() { + t.Log("Enabled") + plabel, flabel = selinux.GetLxcContexts() + t.Log(plabel) + t.Log(flabel) + plabel, flabel = selinux.GetLxcContexts() + t.Log(plabel) + t.Log(flabel) + t.Log("getenforce ", selinux.SelinuxGetEnforce()) + t.Log("getenforcemode ", selinux.SelinuxGetEnforceMode()) + pid := os.Getpid() + t.Log("PID:%d MCS:%s\n", pid, selinux.IntToMcs(pid, 1023)) + t.Log(selinux.Getcon()) + t.Log(selinux.Getfilecon("/etc/passwd")) + err = selinux.Setfscreatecon("unconfined_u:unconfined_r:unconfined_t:s0") + if err == nil { + t.Log(selinux.Getfscreatecon()) + } else { + t.Log("setfscreatecon failed", err) + t.Fatal(err) + } + err = selinux.Setfscreatecon("") + if err == nil { + t.Log(selinux.Getfscreatecon()) + } else { + t.Log("setfscreatecon failed", err) + t.Fatal(err) + } + t.Log(selinux.Getpidcon(1)) + t.Log(selinux.GetSelinuxMountPoint()) + } else { + t.Log("Disabled") + } +} From fce532280b856229287e7bf5b57835ab0c7accd2 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Fri, 14 Mar 2014 10:47:49 +0100 Subject: [PATCH 3/7] cgroups: Splity out Apply/Cleanup to separate file/interface This leaves only the generic cgroup helper functions in cgroups.go and will allow easy implementations of other cgroup managers. This also wires up the call to Cleanup the cgroup which was missing before. Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- cgroups/apply_raw.go | 189 ++++++++++++++++++++++++++++++++++++ cgroups/cgroups.go | 171 ++------------------------------ libcontainer/nsinit/exec.go | 16 +-- 3 files changed, 205 insertions(+), 171 deletions(-) create mode 100644 cgroups/apply_raw.go diff --git a/cgroups/apply_raw.go b/cgroups/apply_raw.go new file mode 100644 index 0000000..bce96f4 --- /dev/null +++ b/cgroups/apply_raw.go @@ -0,0 +1,189 @@ +package cgroups + +import ( + "fmt" + "os" + "path/filepath" + "strconv" +) + +type rawCgroup struct { + root string + cgroup string +} + +func rawApply(c *Cgroup, pid int) (ActiveCgroup, error) { + // We have two implementation of cgroups support, one is based on + // systemd and the dbus api, and one is based on raw cgroup fs operations + // following the pre-single-writer model docs at: + // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ + // + // we can pick any subsystem to find the root + + cgroupRoot, err := FindCgroupMountpoint("cpu") + if err != nil { + return nil, err + } + cgroupRoot = filepath.Dir(cgroupRoot) + + if _, err := os.Stat(cgroupRoot); err != nil { + return nil, fmt.Errorf("cgroups fs not found") + } + + cgroup := c.Name + if c.Parent != "" { + cgroup = filepath.Join(c.Parent, cgroup) + } + + raw := &rawCgroup{ + root: cgroupRoot, + cgroup: cgroup, + } + + if err := raw.setupDevices(c, pid); err != nil { + return nil, err + } + if err := raw.setupMemory(c, pid); err != nil { + return nil, err + } + if err := raw.setupCpu(c, pid); err != nil { + return nil, err + } + return raw, nil +} + +func (raw *rawCgroup) path(subsystem string) (string, error) { + initPath, err := GetInitCgroupDir(subsystem) + if err != nil { + return "", err + } + return filepath.Join(raw.root, subsystem, initPath, raw.cgroup), nil +} + +func (raw *rawCgroup) join(subsystem string, pid int) (string, error) { + path, err := raw.path(subsystem) + if err != nil { + return "", err + } + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return "", err + } + if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil { + return "", err + } + return path, nil +} + +func (raw *rawCgroup) setupDevices(c *Cgroup, pid int) (err error) { + if !c.DeviceAccess { + dir, err := raw.join("devices", pid) + if err != nil { + return err + } + + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if err := writeFile(dir, "devices.deny", "a"); err != nil { + return err + } + + allow := []string{ + // /dev/null, zero, full + "c 1:3 rwm", + "c 1:5 rwm", + "c 1:7 rwm", + + // consoles + "c 5:1 rwm", + "c 5:0 rwm", + "c 4:0 rwm", + "c 4:1 rwm", + + // /dev/urandom,/dev/random + "c 1:9 rwm", + "c 1:8 rwm", + + // /dev/pts/ - pts namespaces are "coming soon" + "c 136:* rwm", + "c 5:2 rwm", + + // tuntap + "c 10:200 rwm", + } + + for _, val := range allow { + if err := writeFile(dir, "devices.allow", val); err != nil { + return err + } + } + } + return nil +} + +func (raw *rawCgroup) setupMemory(c *Cgroup, pid int) (err error) { + if c.Memory != 0 || c.MemorySwap != 0 { + dir, err := raw.join("memory", pid) + if err != nil { + return err + } + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if c.Memory != 0 { + if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { + return err + } + if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { + return err + } + } + // By default, MemorySwap is set to twice the size of RAM. + // If you want to omit MemorySwap, set it to `-1'. + if c.MemorySwap != -1 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil { + return err + } + } + } + return nil +} + +func (raw *rawCgroup) setupCpu(c *Cgroup, pid int) (err error) { + // We always want to join the cpu group, to allow fair cpu scheduling + // on a container basis + dir, err := raw.join("cpu", pid) + if err != nil { + return err + } + if c.CpuShares != 0 { + if err := writeFile(dir, "cpu.shares", strconv.FormatInt(c.CpuShares, 10)); err != nil { + return err + } + } + return nil +} + +func (raw *rawCgroup) Cleanup() error { + get := func(subsystem string) string { + path, _ := raw.path(subsystem) + return path + } + + for _, path := range []string{ + get("memory"), + get("devices"), + get("cpu"), + } { + if path != "" { + os.RemoveAll(path) + } + } + return nil +} diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index b40e1a3..f35556f 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -8,7 +8,6 @@ import ( "io/ioutil" "os" "path/filepath" - "strconv" "strings" ) @@ -22,6 +21,10 @@ type Cgroup struct { CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) } +type ActiveCgroup interface { + Cleanup() error +} + // https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt func FindCgroupMountpoint(subsystem string) (string, error) { mounts, err := mount.GetMounts() @@ -62,48 +65,6 @@ func GetInitCgroupDir(subsystem string) (string, error) { return parseCgroupFile(subsystem, f) } -func (c *Cgroup) Path(root, subsystem string) (string, error) { - cgroup := c.Name - if c.Parent != "" { - cgroup = filepath.Join(c.Parent, cgroup) - } - initPath, err := GetInitCgroupDir(subsystem) - if err != nil { - return "", err - } - return filepath.Join(root, subsystem, initPath, cgroup), nil -} - -func (c *Cgroup) Join(root, subsystem string, pid int) (string, error) { - path, err := c.Path(root, subsystem) - if err != nil { - return "", err - } - if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { - return "", err - } - if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil { - return "", err - } - return path, nil -} - -func (c *Cgroup) Cleanup(root string) error { - get := func(subsystem string) string { - path, _ := c.Path(root, subsystem) - return path - } - - for _, path := range []string{ - get("memory"), - get("devices"), - get("cpu"), - } { - os.RemoveAll(path) - } - return nil -} - func parseCgroupFile(subsystem string, r io.Reader) (string, error) { s := bufio.NewScanner(r) for s.Scan() { @@ -125,126 +86,6 @@ func writeFile(dir, file, data string) error { return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) } -func (c *Cgroup) Apply(pid int) error { - // We have two implementation of cgroups support, one is based on - // systemd and the dbus api, and one is based on raw cgroup fs operations - // following the pre-single-writer model docs at: - // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ - // - // we can pick any subsystem to find the root - cgroupRoot, err := FindCgroupMountpoint("cpu") - if err != nil { - return err - } - cgroupRoot = filepath.Dir(cgroupRoot) - - if _, err := os.Stat(cgroupRoot); err != nil { - return fmt.Errorf("cgroups fs not found") - } - if err := c.setupDevices(cgroupRoot, pid); err != nil { - return err - } - if err := c.setupMemory(cgroupRoot, pid); err != nil { - return err - } - if err := c.setupCpu(cgroupRoot, pid); err != nil { - return err - } - return nil -} - -func (c *Cgroup) setupDevices(cgroupRoot string, pid int) (err error) { - if !c.DeviceAccess { - dir, err := c.Join(cgroupRoot, "devices", pid) - if err != nil { - return err - } - - defer func() { - if err != nil { - os.RemoveAll(dir) - } - }() - - if err := writeFile(dir, "devices.deny", "a"); err != nil { - return err - } - - allow := []string{ - // /dev/null, zero, full - "c 1:3 rwm", - "c 1:5 rwm", - "c 1:7 rwm", - - // consoles - "c 5:1 rwm", - "c 5:0 rwm", - "c 4:0 rwm", - "c 4:1 rwm", - - // /dev/urandom,/dev/random - "c 1:9 rwm", - "c 1:8 rwm", - - // /dev/pts/ - pts namespaces are "coming soon" - "c 136:* rwm", - "c 5:2 rwm", - - // tuntap - "c 10:200 rwm", - } - - for _, val := range allow { - if err := writeFile(dir, "devices.allow", val); err != nil { - return err - } - } - } - return nil -} - -func (c *Cgroup) setupMemory(cgroupRoot string, pid int) (err error) { - if c.Memory != 0 || c.MemorySwap != 0 { - dir, err := c.Join(cgroupRoot, "memory", pid) - if err != nil { - return err - } - defer func() { - if err != nil { - os.RemoveAll(dir) - } - }() - - if c.Memory != 0 { - if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { - return err - } - if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { - return err - } - } - // By default, MemorySwap is set to twice the size of RAM. - // If you want to omit MemorySwap, set it to `-1'. - if c.MemorySwap != -1 { - if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil { - return err - } - } - } - return nil -} - -func (c *Cgroup) setupCpu(cgroupRoot string, pid int) (err error) { - // We always want to join the cpu group, to allow fair cpu scheduling - // on a container basis - dir, err := c.Join(cgroupRoot, "cpu", pid) - if err != nil { - return err - } - if c.CpuShares != 0 { - if err := writeFile(dir, "cpu.shares", strconv.FormatInt(c.CpuShares, 10)); err != nil { - return err - } - } - return nil +func (c *Cgroup) Apply(pid int) (ActiveCgroup, error) { + return rawApply(c, pid) } diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 61286cc..a44faaf 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -3,6 +3,7 @@ package nsinit import ( + "github.com/dotcloud/docker/pkg/cgroups" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" @@ -61,10 +62,15 @@ func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args [ // Do this before syncing with child so that no children // can escape the cgroup ns.logger.Println("setting cgroups") - if err := ns.SetupCgroups(container, command.Process.Pid); err != nil { + activeCgroup, err := ns.SetupCgroups(container, command.Process.Pid) + if err != nil { command.Process.Kill() return -1, err } + if activeCgroup != nil { + defer activeCgroup.Cleanup() + } + ns.logger.Println("setting up network") if err := ns.InitializeNetworking(container, command.Process.Pid, syncPipe); err != nil { command.Process.Kill() @@ -85,13 +91,11 @@ func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args [ return status, err } -func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) error { +func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) (cgroups.ActiveCgroup, error) { if container.Cgroups != nil { - if err := container.Cgroups.Apply(nspid); err != nil { - return err - } + return container.Cgroups.Apply(nspid) } - return nil + return nil, nil } func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error { From 7cd539bc3d3125352b16d6c7670cbc1f5fa5fe8f Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Fri, 14 Mar 2014 11:45:29 +0100 Subject: [PATCH 4/7] cgroups: Join groups by writing to cgroups.procs, not tasks cgroups.procs moves all the threads of the process, and "tasks" just the one thread. I believe there is a risk that we move the main thread, but then we accidentally fork off one of the other threads if the go scheduler randomly switched to another thread. So, it seems safer (and more correct) to use cgroups.procs. Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- cgroups/apply_raw.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cgroups/apply_raw.go b/cgroups/apply_raw.go index bce96f4..47a2a00 100644 --- a/cgroups/apply_raw.go +++ b/cgroups/apply_raw.go @@ -68,7 +68,7 @@ func (raw *rawCgroup) join(subsystem string, pid int) (string, error) { if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { return "", err } - if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil { + if err := writeFile(path, "cgroup.procs", strconv.Itoa(pid)); err != nil { return "", err } return path, nil From 7d3f869c32f606911f076192696ce4d06b9905f8 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Tue, 18 Mar 2014 18:40:00 +0100 Subject: [PATCH 5/7] pkg/systemd: Drop our copy-pasted version of go-systemd/activation Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- systemd/activation/files.go | 55 --------------------------------- systemd/activation/listeners.go | 37 ---------------------- systemd/listendfd.go | 2 +- 3 files changed, 1 insertion(+), 93 deletions(-) delete mode 100644 systemd/activation/files.go delete mode 100644 systemd/activation/listeners.go diff --git a/systemd/activation/files.go b/systemd/activation/files.go deleted file mode 100644 index 0281146..0000000 --- a/systemd/activation/files.go +++ /dev/null @@ -1,55 +0,0 @@ -/* -Copyright 2013 CoreOS Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Package activation implements primitives for systemd socket activation. -package activation - -import ( - "os" - "strconv" - "syscall" -) - -// based on: https://gist.github.com/alberts/4640792 -const ( - listenFdsStart = 3 -) - -func Files(unsetEnv bool) []*os.File { - if unsetEnv { - // there is no way to unset env in golang os package for now - // https://code.google.com/p/go/issues/detail?id=6423 - defer os.Setenv("LISTEN_PID", "") - defer os.Setenv("LISTEN_FDS", "") - } - - pid, err := strconv.Atoi(os.Getenv("LISTEN_PID")) - if err != nil || pid != os.Getpid() { - return nil - } - - nfds, err := strconv.Atoi(os.Getenv("LISTEN_FDS")) - if err != nil || nfds == 0 { - return nil - } - - var files []*os.File - for fd := listenFdsStart; fd < listenFdsStart+nfds; fd++ { - syscall.CloseOnExec(fd) - files = append(files, os.NewFile(uintptr(fd), "LISTEN_FD_"+strconv.Itoa(fd))) - } - - return files -} diff --git a/systemd/activation/listeners.go b/systemd/activation/listeners.go deleted file mode 100644 index 3296a08..0000000 --- a/systemd/activation/listeners.go +++ /dev/null @@ -1,37 +0,0 @@ -/* -Copyright 2014 CoreOS Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package activation - -import ( - "fmt" - "net" -) - -// Listeners returns net.Listeners for all socket activated fds passed to this process. -func Listeners(unsetEnv bool) ([]net.Listener, error) { - files := Files(unsetEnv) - listeners := make([]net.Listener, len(files)) - - for i, f := range files { - var err error - listeners[i], err = net.FileListener(f) - if err != nil { - return nil, fmt.Errorf("Error setting up FileListener for fd %d: %s", f.Fd(), err.Error()) - } - } - - return listeners, nil -} diff --git a/systemd/listendfd.go b/systemd/listendfd.go index f604432..0fbc0a6 100644 --- a/systemd/listendfd.go +++ b/systemd/listendfd.go @@ -5,7 +5,7 @@ import ( "net" "strconv" - "github.com/dotcloud/docker/pkg/systemd/activation" + "github.com/coreos/go-systemd/activation" ) // ListenFD returns the specified socket activated files as a slice of From 73971df484ac4fac1717fc5965e2adb5b33ca0d2 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Fri, 21 Feb 2014 10:48:02 +0100 Subject: [PATCH 6/7] Add systemd.SdBooted() This is a conversion of sd_booted() from libsystemd to go and checks if the system was booted with systemd. Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- systemd/booted.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 systemd/booted.go diff --git a/systemd/booted.go b/systemd/booted.go new file mode 100644 index 0000000..2aae931 --- /dev/null +++ b/systemd/booted.go @@ -0,0 +1,15 @@ +package systemd + +import ( + "os" +) + +// Conversion to Go of systemd's sd_booted() +func SdBooted() bool { + s, err := os.Stat("/run/systemd/system") + if err != nil { + return false + } + + return s.IsDir() +} From 33fab479ef9243cbb29769ea3ddff5f9d64d9211 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Fri, 21 Feb 2014 14:35:43 +0100 Subject: [PATCH 7/7] cgroups: Add systemd implementation of cgroups This implements cgroup.Apply() using the systemd apis. We create a transient unit called "docker-$id.scope" that contains the container processes. We also have a way to set unit specific properties, currently only defining the Slice to put the scope in. Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- cgroups/apply_nosystemd.go | 15 ++++ cgroups/apply_systemd.go | 158 +++++++++++++++++++++++++++++++++++++ cgroups/cgroups.go | 13 ++- 3 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 cgroups/apply_nosystemd.go create mode 100644 cgroups/apply_systemd.go diff --git a/cgroups/apply_nosystemd.go b/cgroups/apply_nosystemd.go new file mode 100644 index 0000000..f94d475 --- /dev/null +++ b/cgroups/apply_nosystemd.go @@ -0,0 +1,15 @@ +// +build !linux + +package cgroups + +import ( + "fmt" +) + +func useSystemd() bool { + return false +} + +func systemdApply(c *Cgroup, pid int) (ActiveCgroup, error) { + return nil, fmt.Errorf("Systemd not supported") +} diff --git a/cgroups/apply_systemd.go b/cgroups/apply_systemd.go new file mode 100644 index 0000000..c689d57 --- /dev/null +++ b/cgroups/apply_systemd.go @@ -0,0 +1,158 @@ +// +build linux + +package cgroups + +import ( + "fmt" + systemd1 "github.com/coreos/go-systemd/dbus" + "github.com/dotcloud/docker/pkg/systemd" + "github.com/godbus/dbus" + "path/filepath" + "strings" + "sync" +) + +type systemdCgroup struct { +} + +var ( + connLock sync.Mutex + theConn *systemd1.Conn + hasStartTransientUnit bool +) + +func useSystemd() bool { + if !systemd.SdBooted() { + return false + } + + connLock.Lock() + defer connLock.Unlock() + + if theConn == nil { + var err error + theConn, err = systemd1.New() + if err != nil { + return false + } + + // Assume we have StartTransientUnit + hasStartTransientUnit = true + + // But if we get UnknownMethod error we don't + if _, err := theConn.StartTransientUnit("test.scope", "invalid"); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" { + hasStartTransientUnit = false + } + } + } + } + + return hasStartTransientUnit +} + +type DeviceAllow struct { + Node string + Permissions string +} + +func getIfaceForUnit(unitName string) string { + if strings.HasSuffix(unitName, ".scope") { + return "Scope" + } + if strings.HasSuffix(unitName, ".service") { + return "Service" + } + return "Unit" +} + +func systemdApply(c *Cgroup, pid int) (ActiveCgroup, error) { + unitName := c.Parent + "-" + c.Name + ".scope" + slice := "system.slice" + + var properties []systemd1.Property + + for _, v := range c.UnitProperties { + switch v[0] { + case "Slice": + slice = v[1] + default: + return nil, fmt.Errorf("Unknown unit propery %s", v[0]) + } + } + + properties = append(properties, + systemd1.Property{"Slice", dbus.MakeVariant(slice)}, + systemd1.Property{"Description", dbus.MakeVariant("docker container " + c.Name)}, + systemd1.Property{"PIDs", dbus.MakeVariant([]uint32{uint32(pid)})}) + + if !c.DeviceAccess { + properties = append(properties, + systemd1.Property{"DevicePolicy", dbus.MakeVariant("strict")}, + systemd1.Property{"DeviceAllow", dbus.MakeVariant([]DeviceAllow{ + {"/dev/null", "rwm"}, + {"/dev/zero", "rwm"}, + {"/dev/full", "rwm"}, + {"/dev/random", "rwm"}, + {"/dev/urandom", "rwm"}, + {"/dev/tty", "rwm"}, + {"/dev/console", "rwm"}, + {"/dev/tty0", "rwm"}, + {"/dev/tty1", "rwm"}, + {"/dev/pts/ptmx", "rwm"}, + // There is no way to add /dev/pts/* here atm, so we hack this manually below + // /dev/pts/* (how to add this?) + // Same with tuntap, which doesn't exist as a node most of the time + })}) + } + + if c.Memory != 0 { + properties = append(properties, + systemd1.Property{"MemoryLimit", dbus.MakeVariant(uint64(c.Memory))}) + } + // TODO: MemorySwap not available in systemd + + if c.CpuShares != 0 { + properties = append(properties, + systemd1.Property{"CPUShares", dbus.MakeVariant(uint64(c.CpuShares))}) + } + + if _, err := theConn.StartTransientUnit(unitName, "replace", properties...); err != nil { + return nil, err + } + + // To work around the lack of /dev/pts/* support above we need to manually add these + // so, ask systemd for the cgroup used + props, err := theConn.GetUnitTypeProperties(unitName, getIfaceForUnit(unitName)) + if err != nil { + return nil, err + } + + cgroup := props["ControlGroup"].(string) + + if !c.DeviceAccess { + mountpoint, err := FindCgroupMountpoint("devices") + if err != nil { + return nil, err + } + + path := filepath.Join(mountpoint, cgroup) + + // /dev/pts/* + if err := writeFile(path, "devices.allow", "c 136:* rwm"); err != nil { + return nil, err + } + // tuntap + if err := writeFile(path, "devices.allow", "c 10:200 rwm"); err != nil { + return nil, err + } + } + + return &systemdCgroup{}, nil +} + +func (c *systemdCgroup) Cleanup() error { + // systemd cleans up, we don't need to do anything + return nil +} diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index f35556f..8554ba9 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -19,6 +19,8 @@ type Cgroup struct { Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) + + UnitProperties [][2]string `json:"unit_properties,omitempty"` // systemd unit properties } type ActiveCgroup interface { @@ -87,5 +89,14 @@ func writeFile(dir, file, data string) error { } func (c *Cgroup) Apply(pid int) (ActiveCgroup, error) { - return rawApply(c, pid) + // We have two implementation of cgroups support, one is based on + // systemd and the dbus api, and one is based on raw cgroup fs operations + // following the pre-single-writer model docs at: + // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ + + if useSystemd() { + return systemdApply(c, pid) + } else { + return rawApply(c, pid) + } }