libcontainer: Initial version of cgroups support
This is a minimal version of raw cgroup support for libcontainer. It has only enough for what docker needs, and it has no support for systemd yet. Docker-DCO-1.1-Signed-off-by: Alexander Larsson <alexl@redhat.com> (github: alexlarsson)
This commit is contained in:
		
							parent
							
								
									8590435fa0
								
							
						
					
					
						commit
						3de41b34a2
					
				
					 6 changed files with 218 additions and 10 deletions
				
			
		|  | @ -40,6 +40,16 @@ func GetThisCgroupDir(subsystem string) (string, error) { | |||
| 	return parseCgroupFile(subsystem, f) | ||||
| } | ||||
| 
 | ||||
| func GetInitCgroupDir(subsystem string) (string, error) { | ||||
| 	f, err := os.Open("/proc/1/cgroup") | ||||
| 	if err != nil { | ||||
| 		return "", err | ||||
| 	} | ||||
| 	defer f.Close() | ||||
| 
 | ||||
| 	return parseCgroupFile(subsystem, f) | ||||
| } | ||||
| 
 | ||||
| func parseCgroupFile(subsystem string, r io.Reader) (string, error) { | ||||
| 	s := bufio.NewScanner(r) | ||||
| 
 | ||||
|  | @ -49,8 +59,10 @@ func parseCgroupFile(subsystem string, r io.Reader) (string, error) { | |||
| 		} | ||||
| 		text := s.Text() | ||||
| 		parts := strings.Split(text, ":") | ||||
| 		if parts[1] == subsystem { | ||||
| 			return parts[2], nil | ||||
| 		for _, subs := range strings.Split(parts[1], ",") { | ||||
| 			if subs == subsystem { | ||||
| 				return parts[2], nil | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 	return "", fmt.Errorf("cgroup '%s' not found in /proc/self/cgroup", subsystem) | ||||
|  |  | |||
							
								
								
									
										177
									
								
								libcontainer/cgroup/cgroup.go
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										177
									
								
								libcontainer/cgroup/cgroup.go
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,177 @@ | |||
| package cgroup | ||||
| 
 | ||||
| import ( | ||||
| 	"fmt" | ||||
| 	"github.com/dotcloud/docker/pkg/cgroups" | ||||
| 	"github.com/dotcloud/docker/pkg/libcontainer" | ||||
| 	"io/ioutil" | ||||
| 	"os" | ||||
| 	"path/filepath" | ||||
| 	"strconv" | ||||
| ) | ||||
| 
 | ||||
// There are two implementations of cgroups support: one based on systemd
// and its dbus API, and one based on raw cgroup filesystem operations that
// follows the pre-single-writer model described at:
// http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/
//
// cgroupRoot is the directory under which the raw implementation expects
// each cgroup subsystem hierarchy to be found.
const cgroupRoot = "/sys/fs/cgroup"
| 
 | ||||
// useSystemd reports whether the systemd-based cgroup implementation should
// be used instead of the raw one. It is currently hard-wired to false.
func useSystemd() bool {
	const systemdEnabled = false
	return systemdEnabled
}
| 
 | ||||
| func applyCgroupSystemd(container *libcontainer.Container, pid int) error { | ||||
| 	return fmt.Errorf("not supported yet") | ||||
| } | ||||
| 
 | ||||
// writeFile writes data to the file named file inside dir, replacing any
// previous content. The file is created with mode 0700 if it does not exist.
func writeFile(dir, file, data string) error {
	target := filepath.Join(dir, file)
	payload := []byte(data)
	return ioutil.WriteFile(target, payload, 0700)
}
| 
 | ||||
| func getCgroup(subsystem string, container *libcontainer.Container) (string, error) { | ||||
| 	cgroup := container.CgroupName | ||||
| 	if container.CgroupParent != "" { | ||||
| 		cgroup = filepath.Join(container.CgroupParent, cgroup) | ||||
| 	} | ||||
| 
 | ||||
| 	initPath, err := cgroups.GetInitCgroupDir(subsystem) | ||||
| 	if err != nil { | ||||
| 		return "", err | ||||
| 	} | ||||
| 
 | ||||
| 	path := filepath.Join(cgroupRoot, subsystem, initPath, cgroup) | ||||
| 
 | ||||
| 	return path, nil | ||||
| } | ||||
| 
 | ||||
| func joinCgroup(subsystem string, container *libcontainer.Container, pid int) (string, error) { | ||||
| 	path, err := getCgroup(subsystem, container) | ||||
| 	if err != nil { | ||||
| 		return "", err | ||||
| 	} | ||||
| 
 | ||||
| 	if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { | ||||
| 		return "", err | ||||
| 	} | ||||
| 
 | ||||
| 	if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil { | ||||
| 		return "", err | ||||
| 	} | ||||
| 
 | ||||
| 	return path, nil | ||||
| } | ||||
| 
 | ||||
| func applyCgroupRaw(container *libcontainer.Container, pid int) (retErr error) { | ||||
| 	if _, err := os.Stat(cgroupRoot); err != nil { | ||||
| 		return fmt.Errorf("cgroups fs not found") | ||||
| 	} | ||||
| 
 | ||||
| 	if !container.DeviceAccess { | ||||
| 		dir, err := joinCgroup("devices", container, pid) | ||||
| 		if err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| 		defer func() { | ||||
| 			if retErr != nil { | ||||
| 				os.RemoveAll(dir) | ||||
| 			} | ||||
| 		}() | ||||
| 
 | ||||
| 		if err := writeFile(dir, "devices.deny", "a"); err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| 
 | ||||
| 		allow := []string{ | ||||
| 			// /dev/null, zero, full | ||||
| 			"c 1:3 rwm", | ||||
| 			"c 1:5 rwm", | ||||
| 			"c 1:7 rwm", | ||||
| 
 | ||||
| 			// consoles | ||||
| 			"c 5:1 rwm", | ||||
| 			"c 5:0 rwm", | ||||
| 			"c 4:0 rwm", | ||||
| 			"c 4:1 rwm", | ||||
| 
 | ||||
| 			// /dev/urandom,/dev/random | ||||
| 			"c 1:9 rwm", | ||||
| 			"c 1:8 rwm", | ||||
| 
 | ||||
| 			// /dev/pts/ - pts namespaces are "coming soon" | ||||
| 			"c 136:* rwm", | ||||
| 			"c 5:2 rwm", | ||||
| 
 | ||||
| 			// tuntap | ||||
| 			"c 10:200 rwm", | ||||
| 		} | ||||
| 
 | ||||
| 		for _, val := range allow { | ||||
| 			if err := writeFile(dir, "devices.allow", val); err != nil { | ||||
| 				return err | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if container.Memory != 0 || container.MemorySwap != 0 { | ||||
| 		dir, err := joinCgroup("memory", container, pid) | ||||
| 		if err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| 		defer func() { | ||||
| 			if retErr != nil { | ||||
| 				os.RemoveAll(dir) | ||||
| 			} | ||||
| 		}() | ||||
| 
 | ||||
| 		if container.Memory != 0 { | ||||
| 			if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil { | ||||
| 				return err | ||||
| 			} | ||||
| 			if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil { | ||||
| 				return err | ||||
| 			} | ||||
| 		} | ||||
| 		if container.MemorySwap != 0 { | ||||
| 			if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(container.MemorySwap, 10)); err != nil { | ||||
| 				return err | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	// We always want to join the cpu group, to allow fair cpu scheduling | ||||
| 	// on a container basis | ||||
| 	dir, err := joinCgroup("cpu", container, pid) | ||||
| 	if err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| 	if container.CpuShares != 0 { | ||||
| 		if err := writeFile(dir, "cpu.shares", strconv.FormatInt(container.CpuShares, 10)); err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| 	} | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| func CleanupCgroup(container *libcontainer.Container) error { | ||||
| 	path, _ := getCgroup("memory", container) | ||||
| 	os.RemoveAll(path) | ||||
| 	path, _ = getCgroup("devices", container) | ||||
| 	os.RemoveAll(path) | ||||
| 	path, _ = getCgroup("cpu", container) | ||||
| 	os.RemoveAll(path) | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| func ApplyCgroup(container *libcontainer.Container, pid int) error { | ||||
| 	if container.CgroupName == "" { | ||||
| 		return nil | ||||
| 	} | ||||
| 
 | ||||
| 	if useSystemd() { | ||||
| 		return applyCgroupSystemd(container, pid) | ||||
| 	} else { | ||||
| 		return applyCgroupRaw(container, pid) | ||||
| 	} | ||||
| } | ||||
|  | @ -11,6 +11,13 @@ type Container struct { | |||
| 	Namespaces   Namespaces   `json:"namespaces,omitempty"`   // namespaces to apply | ||||
| 	Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop | ||||
| 	Network      *Network     `json:"network,omitempty"`      // nil for host's network stack | ||||
| 
 | ||||
| 	CgroupName   string `json:"cgroup_name,omitempty"`   // name of cgroup | ||||
| 	CgroupParent string `json:"cgroup_parent,omitempty"` // name of parent cgroup or slice | ||||
| 	DeviceAccess bool   `json:"device_access,omitempty"` // when false, the container is confined to a whitelist of devices via the devices cgroup | ||||
| 	Memory       int64  `json:"memory,omitempty"`        // Memory limit (in bytes) | ||||
| 	MemorySwap   int64  `json:"memory_swap,omitempty"`   // Total memory usage (memory + swap); set `-1' to disable swap | ||||
| 	CpuShares    int64  `json:"cpu_shares,omitempty"`    // CPU shares (relative weight vs. other containers) | ||||
| } | ||||
| 
 | ||||
| // Network defines configuration for a container's networking stack | ||||
|  |  | |||
|  | @ -34,5 +34,8 @@ | |||
|         "gateway": "172.17.42.1", | ||||
|         "bridge": "docker0", | ||||
|         "mtu": 1500 | ||||
|     } | ||||
|     }, | ||||
|     "cgroup_name": "docker-koye", | ||||
|     "cgroup_parent": "docker", | ||||
|     "memory": 524800 | ||||
| } | ||||
|  |  | |||
|  | @ -5,6 +5,7 @@ package main | |||
| import ( | ||||
| 	"fmt" | ||||
| 	"github.com/dotcloud/docker/pkg/libcontainer" | ||||
| 	"github.com/dotcloud/docker/pkg/libcontainer/cgroup" | ||||
| 	"github.com/dotcloud/docker/pkg/libcontainer/network" | ||||
| 	"github.com/dotcloud/docker/pkg/libcontainer/utils" | ||||
| 	"github.com/dotcloud/docker/pkg/system" | ||||
|  | @ -33,10 +34,18 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) | |||
| 		return -1, err | ||||
| 	} | ||||
| 	if err := writePidFile(command); err != nil { | ||||
| 		command.Process.Kill() | ||||
| 		return -1, err | ||||
| 	} | ||||
| 	defer deletePidFile() | ||||
| 
 | ||||
| 	// Do this before syncing with child so that no children | ||||
| 	// can escape the cgroup | ||||
| 	if err := cgroup.ApplyCgroup(container, command.Process.Pid); err != nil { | ||||
| 		command.Process.Kill() | ||||
| 		return -1, err | ||||
| 	} | ||||
| 
 | ||||
| 	if container.Network != nil { | ||||
| 		vethPair, err := initializeContainerVeth(container.Network.Bridge, command.Process.Pid) | ||||
| 		if err != nil { | ||||
|  | @ -45,6 +54,9 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) | |||
| 		sendVethName(vethPair, inPipe) | ||||
| 	} | ||||
| 
 | ||||
| 	// Sync with child | ||||
| 	inPipe.Close() | ||||
| 
 | ||||
| 	go io.Copy(os.Stdout, master) | ||||
| 	go io.Copy(master, os.Stdin) | ||||
| 
 | ||||
|  | @ -67,7 +79,6 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) | |||
| // pipe so that the child stops waiting for more data | ||||
| func sendVethName(name string, pipe io.WriteCloser) { | ||||
| 	fmt.Fprint(pipe, name) | ||||
| 	pipe.Close() | ||||
| } | ||||
| 
 | ||||
| // initializeContainerVeth will create a veth pair and setup the host's | ||||
|  |  | |||
|  | @ -20,12 +20,10 @@ func initCommand(container *libcontainer.Container, console string, args []strin | |||
| 		return err | ||||
| 	} | ||||
| 
 | ||||
| 	var tempVethName string | ||||
| 	if container.Network != nil { | ||||
| 		tempVethName, err = getVethName() | ||||
| 		if err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| 	// We always read this as it is a way to sync with the parent as well | ||||
| 	tempVethName, err := getVethName() | ||||
| 	if err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| 
 | ||||
| 	// close pipes so that we can replace it with the pty | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue