diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go
index 91ac384..b9318f9 100644
--- a/cgroups/cgroups.go
+++ b/cgroups/cgroups.go
@@ -40,6 +40,18 @@ func GetThisCgroupDir(subsystem string) (string, error) {
 	return parseCgroupFile(subsystem, f)
 }
 
+// GetInitCgroupDir returns the cgroup directory recorded for subsystem in
+// /proc/1/cgroup, i.e. the cgroup that PID 1 belongs to.
+func GetInitCgroupDir(subsystem string) (string, error) {
+	f, err := os.Open("/proc/1/cgroup")
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+
+	return parseCgroupFile(subsystem, f)
+}
+
 func parseCgroupFile(subsystem string, r io.Reader) (string, error) {
 	s := bufio.NewScanner(r)
 
@@ -49,8 +61,10 @@ func parseCgroupFile(subsystem string, r io.Reader) (string, error) {
 		}
 		text := s.Text()
 		parts := strings.Split(text, ":")
-		if parts[1] == subsystem {
-			return parts[2], nil
+		for _, subs := range strings.Split(parts[1], ",") {
+			if subs == subsystem {
+				return parts[2], nil
+			}
 		}
 	}
 	return "", fmt.Errorf("cgroup '%s' not found in /proc/self/cgroup", subsystem)
diff --git a/libcontainer/cgroup/cgroup.go b/libcontainer/cgroup/cgroup.go
new file mode 100644
index 0000000..e30262c
--- /dev/null
+++ b/libcontainer/cgroup/cgroup.go
@@ -0,0 +1,207 @@
+package cgroup
+
+import (
+	"fmt"
+	"github.com/dotcloud/docker/pkg/cgroups"
+	"github.com/dotcloud/docker/pkg/libcontainer"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strconv"
+)
+
+// We have two implementations of cgroups support, one is based on
+// systemd and the dbus api, and one is based on raw cgroup fs operations
+// following the pre-single-writer model docs at:
+// http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/
+const (
+	cgroupRoot = "/sys/fs/cgroup"
+)
+
+// useSystemd reports whether the systemd backend should be used; it is
+// hard-wired to false for now.
+func useSystemd() bool {
+	return false
+}
+
+// applyCgroupSystemd is a placeholder for the systemd backend and always
+// returns an error.
+func applyCgroupSystemd(container *libcontainer.Container, pid int) error {
+	return fmt.Errorf("not supported yet")
+}
+
+// writeFile writes data to the file named file inside dir.
+func writeFile(dir, file, data string) error {
+	return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
+}
+
+// getCgroup returns the directory of the container's cgroup for subsystem:
+// cgroupRoot/subsystem/<cgroup of PID 1>/[CgroupParent/]CgroupName.
+func getCgroup(subsystem string, container *libcontainer.Container) (string, error) {
+	cgroup := container.CgroupName
+	if container.CgroupParent != "" {
+		cgroup = filepath.Join(container.CgroupParent, cgroup)
+	}
+
+	initPath, err := cgroups.GetInitCgroupDir(subsystem)
+	if err != nil {
+		return "", err
+	}
+
+	path := filepath.Join(cgroupRoot, subsystem, initPath, cgroup)
+
+	return path, nil
+}
+
+// joinCgroup creates the container's cgroup directory for subsystem if
+// needed, writes pid to its tasks file and returns the directory.
+func joinCgroup(subsystem string, container *libcontainer.Container, pid int) (string, error) {
+	path, err := getCgroup(subsystem, container)
+	if err != nil {
+		return "", err
+	}
+
+	if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) {
+		return "", err
+	}
+
+	if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil {
+		return "", err
+	}
+
+	return path, nil
+}
+
+// applyCgroupRaw places pid into the container's devices, memory and cpu
+// cgroups by writing to the cgroup filesystem directly. On failure, the
+// cgroup directories joined so far are removed again.
+func applyCgroupRaw(container *libcontainer.Container, pid int) (retErr error) {
+	if _, err := os.Stat(cgroupRoot); err != nil {
+		return fmt.Errorf("cgroups fs not found")
+	}
+
+	if !container.DeviceAccess {
+		dir, err := joinCgroup("devices", container, pid)
+		if err != nil {
+			return err
+		}
+		defer func() {
+			if retErr != nil {
+				os.RemoveAll(dir)
+			}
+		}()
+
+		if err := writeFile(dir, "devices.deny", "a"); err != nil {
+			return err
+		}
+
+		allow := []string{
+			// /dev/null, zero, full
+			"c 1:3 rwm",
+			"c 1:5 rwm",
+			"c 1:7 rwm",
+
+			// consoles
+			"c 5:1 rwm",
+			"c 5:0 rwm",
+			"c 4:0 rwm",
+			"c 4:1 rwm",
+
+			// /dev/urandom,/dev/random
+			"c 1:9 rwm",
+			"c 1:8 rwm",
+
+			// /dev/pts/ - pts namespaces are "coming soon"
+			"c 136:* rwm",
+			"c 5:2 rwm",
+
+			// tuntap
+			"c 10:200 rwm",
+		}
+
+		for _, val := range allow {
+			if err := writeFile(dir, "devices.allow", val); err != nil {
+				return err
+			}
+		}
+	}
+
+	if container.Memory != 0 || container.MemorySwap != 0 {
+		dir, err := joinCgroup("memory", container, pid)
+		if err != nil {
+			return err
+		}
+		defer func() {
+			if retErr != nil {
+				os.RemoveAll(dir)
+			}
+		}()
+
+		if container.Memory != 0 {
+			if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil {
+				return err
+			}
+			if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil {
+				return err
+			}
+		}
+		if container.MemorySwap != 0 {
+			if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(container.MemorySwap, 10)); err != nil {
+				return err
+			}
+		}
+	}
+
+	// We always want to join the cpu group, to allow fair cpu scheduling
+	// on a container basis
+	dir, err := joinCgroup("cpu", container, pid)
+	if err != nil {
+		return err
+	}
+	// Mirror the devices/memory branches: drop the cpu cgroup again if a
+	// later step fails.
+	defer func() {
+		if retErr != nil {
+			os.RemoveAll(dir)
+		}
+	}()
+	if container.CpuShares != 0 {
+		if err := writeFile(dir, "cpu.shares", strconv.FormatInt(container.CpuShares, 10)); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// CleanupCgroup removes the container's memory, devices and cpu cgroup
+// directories. Removal is best effort: failures are ignored and the
+// returned error is always nil.
+func CleanupCgroup(container *libcontainer.Container) error {
+	// Without a name getCgroup resolves to the parent (or init) cgroup
+	// directory itself, which must not be removed. ApplyCgroup is a no-op here too.
+	if container.CgroupName == "" {
+		return nil
+	}
+
+	for _, subsystem := range []string{"memory", "devices", "cpu"} {
+		path, err := getCgroup(subsystem, container)
+		if err != nil {
+			continue
+		}
+		os.RemoveAll(path)
+	}
+	return nil
+}
+
+// ApplyCgroup places pid into the cgroups described by container. It is a
+// no-op when the container has no CgroupName.
+func ApplyCgroup(container *libcontainer.Container, pid int) error {
+	if container.CgroupName == "" {
+		return nil
+	}
+
+	if useSystemd() {
+		return applyCgroupSystemd(container, pid)
+	}
+	return applyCgroupRaw(container, pid)
+}
diff --git a/libcontainer/container.go b/libcontainer/container.go
index a6a57da..b34ac8b 100644
--- a/libcontainer/container.go
+++ b/libcontainer/container.go
@@ -11,6 +11,13 @@ type Container struct {
 	Namespaces   Namespaces   `json:"namespaces,omitempty"`   // namespaces to apply
 	Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop
 	Network      *Network     `json:"network,omitempty"`      // nil for host's network stack
+
+	CgroupName   string `json:"cgroup_name,omitempty"`   // name of cgroup
+	CgroupParent string `json:"cgroup_parent,omitempty"` // name of parent cgroup or slice
+	DeviceAccess bool   `json:"device_access,omitempty"` // if false, device access is limited to a whitelist via the devices cgroup
+	Memory       int64  `json:"memory,omitempty"`        // Memory limit (in bytes)
+	MemorySwap   int64  `json:"memory_swap,omitempty"`   // Total memory usage (memory + swap); set `-1' to disable swap
+	CpuShares    int64  `json:"cpu_shares,omitempty"`    // CPU shares (relative weight vs. other containers)
 }
 
 // Network defines configuration for a container's networking stack
diff --git a/libcontainer/container.json b/libcontainer/container.json
index ccc9abb..3e23600 100644
--- a/libcontainer/container.json
+++ b/libcontainer/container.json
@@ -34,5 +34,8 @@
         "gateway": "172.17.42.1",
         "bridge": "docker0",
         "mtu": 1500
-    }
+    },
+    "cgroup_name": "docker-koye",
+    "cgroup_parent": "docker",
+    "memory": 524800
 }
diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go
index 202cfca..acff647 100644
--- a/libcontainer/nsinit/exec.go
+++ b/libcontainer/nsinit/exec.go
@@ -5,6 +5,7 @@ package main
 import (
 	"fmt"
 	"github.com/dotcloud/docker/pkg/libcontainer"
+	"github.com/dotcloud/docker/pkg/libcontainer/cgroup"
 	"github.com/dotcloud/docker/pkg/libcontainer/network"
 	"github.com/dotcloud/docker/pkg/libcontainer/utils"
 	"github.com/dotcloud/docker/pkg/system"
@@ -33,10 +34,18 @@ func execCommand(container *libcontainer.Container, args []string) (int, error)
 		return -1, err
 	}
 	if err := writePidFile(command); err != nil {
+		command.Process.Kill()
 		return -1, err
 	}
 	defer deletePidFile()
 
+	// Do this before syncing with child so that no children
+	// can escape the cgroup
+	if err := cgroup.ApplyCgroup(container, command.Process.Pid); err != nil {
+		command.Process.Kill()
+		return -1, err
+	}
+
 	if container.Network != nil {
 		vethPair, err := initializeContainerVeth(container.Network.Bridge, command.Process.Pid)
 		if err != nil {
@@ -45,6 +54,9 @@ func execCommand(container *libcontainer.Container, args []string) (int, error)
 		sendVethName(vethPair, inPipe)
 	}
 
+	// Sync with child
+	inPipe.Close()
+
 	go io.Copy(os.Stdout, master)
 	go io.Copy(master, os.Stdin)
 
@@ -67,7 +79,6 @@ func execCommand(container *libcontainer.Container, args []string) (int, error)
 // pipe so that the child stops waiting for more data
 func sendVethName(name string, pipe io.WriteCloser) {
 	fmt.Fprint(pipe, name)
-	pipe.Close()
 }
 
 // initializeContainerVeth will create a veth pair and setup the host's
diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go
index c77fd90..f619276 100644
--- a/libcontainer/nsinit/init.go
+++ b/libcontainer/nsinit/init.go
@@ -20,12 +20,10 @@ func initCommand(container *libcontainer.Container, console string, args []strin
 		return err
 	}
 
-	var tempVethName string
-	if container.Network != nil {
-		tempVethName, err = getVethName()
-		if err != nil {
-			return err
-		}
+	// We always read this as it is a way to sync with the parent as well
+	tempVethName, err := getVethName()
+	if err != nil {
+		return err
 	}
 
 	// close pipes so that we can replace it with the pty