diff --git a/libcontainer/cgroups/cgroups.go b/libcontainer/cgroups/cgroups.go index 0f93320..537a27f 100644 --- a/libcontainer/cgroups/cgroups.go +++ b/libcontainer/cgroups/cgroups.go @@ -2,6 +2,8 @@ package cgroups import ( "errors" + + "github.com/dotcloud/docker/pkg/libcontainer/devices" ) var ( @@ -10,17 +12,18 @@ var ( type Cgroup struct { Name string `json:"name,omitempty"` - Parent string `json:"parent,omitempty"` + Parent string `json:"parent,omitempty"` // name of parent cgroup or slice - DeviceAccess bool `json:"device_access,omitempty"` // name of parent cgroup or slice - Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) - MemoryReservation int64 `json:"memory_reservation,omitempty"` // Memory reservation or soft_limit (in bytes) - MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap - CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) - CpuQuota int64 `json:"cpu_quota,omitempty"` // CPU hardcap limit (in usecs). Allowed cpu time in a given period. - CpuPeriod int64 `json:"cpu_period,omitempty"` // CPU period to be used for hardcapping (in usecs). 0 to use system default. - CpusetCpus string `json:"cpuset_cpus,omitempty"` // CPU to use - Freezer string `json:"freezer,omitempty"` // set the freeze value for the process + AllowAllDevices bool `json:"allow_all_devices,omitempty"` // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. 
+ AllowedDevices []devices.Device `json:"allowed_devices,omitempty"` + Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) + MemoryReservation int64 `json:"memory_reservation,omitempty"` // Memory reservation or soft_limit (in bytes) + MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap + CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) + CpuQuota int64 `json:"cpu_quota,omitempty"` // CPU hardcap limit (in usecs). Allowed cpu time in a given period. + CpuPeriod int64 `json:"cpu_period,omitempty"` // CPU period to be used for hardcapping (in usecs). 0 to use system default. + CpusetCpus string `json:"cpuset_cpus,omitempty"` // CPU to use + Freezer string `json:"freezer,omitempty"` // set the freeze value for the process Slice string `json:"slice,omitempty"` // Parent slice to use for systemd } diff --git a/libcontainer/cgroups/fs/devices.go b/libcontainer/cgroups/fs/devices.go index 569cbbf..45c3b48 100644 --- a/libcontainer/cgroups/fs/devices.go +++ b/libcontainer/cgroups/fs/devices.go @@ -11,41 +11,13 @@ func (s *devicesGroup) Set(d *data) error { return err } - if !d.c.DeviceAccess { + if !d.c.AllowAllDevices { if err := writeFile(dir, "devices.deny", "a"); err != nil { return err } - allow := []string{ - // allow mknod for any device - "c *:* m", - "b *:* m", - - // /dev/null, zero, full - "c 1:3 rwm", - "c 1:5 rwm", - "c 1:7 rwm", - - // consoles - "c 5:1 rwm", - "c 5:0 rwm", - "c 4:0 rwm", - "c 4:1 rwm", - - // /dev/urandom,/dev/random - "c 1:9 rwm", - "c 1:8 rwm", - - // /dev/pts/ - pts namespaces are "coming soon" - "c 136:* rwm", - "c 5:2 rwm", - - // tuntap - "c 10:200 rwm", - } - - for _, val := range allow { - if err := writeFile(dir, "devices.allow", val); err != nil { + for _, dev := range d.c.AllowedDevices { + if err := writeFile(dir, "devices.allow", dev.GetCgroupAllowString()); err != nil { return err } } diff --git 
a/libcontainer/cgroups/systemd/apply_systemd.go b/libcontainer/cgroups/systemd/apply_systemd.go index 0f6beb6..e57cf16 100644 --- a/libcontainer/cgroups/systemd/apply_systemd.go +++ b/libcontainer/cgroups/systemd/apply_systemd.go @@ -21,11 +21,6 @@ type systemdCgroup struct { cleanupDirs []string } -type DeviceAllow struct { - Node string - Permissions string -} - var ( connLock sync.Mutex theConn *systemd1.Conn @@ -116,24 +111,9 @@ func Apply(c *cgroups.Cgroup, pid int) (cgroups.ActiveCgroup, error) { systemd1.Property{"PIDs", dbus.MakeVariant([]uint32{uint32(pid)})}, ) - if !c.DeviceAccess { + if !c.AllowAllDevices { properties = append(properties, - systemd1.Property{"DevicePolicy", dbus.MakeVariant("strict")}, - systemd1.Property{"DeviceAllow", dbus.MakeVariant([]DeviceAllow{ - {"/dev/null", "rwm"}, - {"/dev/zero", "rwm"}, - {"/dev/full", "rwm"}, - {"/dev/random", "rwm"}, - {"/dev/urandom", "rwm"}, - {"/dev/tty", "rwm"}, - {"/dev/console", "rwm"}, - {"/dev/tty0", "rwm"}, - {"/dev/tty1", "rwm"}, - {"/dev/pts/ptmx", "rwm"}, - // There is no way to add /dev/pts/* here atm, so we hack this manually below - // /dev/pts/* (how to add this?) 
- // Same with tuntap, which doesn't exist as a node most of the time - })}) + systemd1.Property{"DevicePolicy", dbus.MakeVariant("strict")}) } // Always enable accounting, this gets us the same behaviour as the fs implementation, @@ -167,28 +147,16 @@ func Apply(c *cgroups.Cgroup, pid int) (cgroups.ActiveCgroup, error) { cgroup := props["ControlGroup"].(string) - if !c.DeviceAccess { + if !c.AllowAllDevices { mountpoint, err := cgroups.FindCgroupMountpoint("devices") if err != nil { return nil, err } - path := filepath.Join(mountpoint, cgroup) - - allow := []string{ - // allow mknod for any device - "c *:* m", - "b *:* m", - - // /dev/pts/ - pts namespaces are "coming soon" - "c 136:* rwm", - - // tuntap - "c 10:200 rwm", - } - - for _, val := range allow { - if err := ioutil.WriteFile(filepath.Join(path, "devices.allow"), []byte(val), 0700); err != nil { + dir := filepath.Join(mountpoint, cgroup) + // We use the same method of allowing devices as in the fs backend. This needs to be changed to use DBUS as soon as possible. However, that change has to wait until http://cgit.freedesktop.org/systemd/systemd/commit/?id=90060676c442604780634c0a993e3f9c3733f8e6 has been applied in most commonly used systemd versions. 
+ for _, dev := range c.AllowedDevices { + if err := writeFile(dir, "devices.allow", dev.GetCgroupAllowString()); err != nil { return nil, err } } @@ -295,6 +263,10 @@ func Apply(c *cgroups.Cgroup, pid int) (cgroups.ActiveCgroup, error) { return &res, nil } +func writeFile(dir, file, data string) error { + return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) +} + func (c *systemdCgroup) Cleanup() error { // systemd cleans up, we don't need to do much diff --git a/libcontainer/container.go b/libcontainer/container.go index 27a4235..d56e037 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -2,6 +2,7 @@ package libcontainer import ( "github.com/dotcloud/docker/pkg/libcontainer/cgroups" + "github.com/dotcloud/docker/pkg/libcontainer/devices" ) // Context is a generic key value pair that allows arbatrary data to be sent @@ -63,13 +64,8 @@ type Container struct { // rootfs and mount namespace if specified Mounts Mounts `json:"mounts,omitempty"` - // RequiredDeviceNodes are a list of device nodes that will be mknod into the container's rootfs at /dev - // If the host system does not support the device that the container requests an error is returned - RequiredDeviceNodes []string `json:"required_device_nodes,omitempty"` - - // OptionalDeviceNodes are a list of device nodes that will be mknod into the container's rootfs at /dev - // If the host system does not support the device that the container requests the error is ignored - OptionalDeviceNodes []string `json:"optional_device_nodes,omitempty"` + // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well! 
+ DeviceNodes []devices.Device `json:"device_nodes,omitempty"` } // Network defines configuration for a container's networking stack diff --git a/libcontainer/container.json b/libcontainer/container.json index 8d845dc..7448a07 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -54,12 +54,54 @@ "type": "devtmpfs" } ], - "required_device_nodes": [ - "/dev/null", - "/dev/zero", - "/dev/full", - "/dev/random", - "/dev/urandom", - "/dev/tty" + "device_nodes": [ + { + "path": "/dev/null", + "type": 99, + "major_number": 1, + "minor_number": 3, + "cgroup_permissions": "rwm", + "file_mode": 438 + }, + { + "path": "/dev/zero", + "type": 99, + "major_number": 1, + "minor_number": 5, + "cgroup_permissions": "rwm", + "file_mode": 438 + }, + { + "path": "/dev/full", + "type": 99, + "major_number": 1, + "minor_number": 7, + "cgroup_permissions": "rwm", + "file_mode": 438 + }, + { + "path": "/dev/tty", + "type": 99, + "major_number": 5, + "minor_number": 0, + "cgroup_permissions": "rwm", + "file_mode": 438 + }, + { + "path": "/dev/urandom", + "type": 99, + "major_number": 1, + "minor_number": 9, + "cgroup_permissions": "rwm", + "file_mode": 438 + }, + { + "path": "/dev/random", + "type": 99, + "major_number": 1, + "minor_number": 8, + "cgroup_permissions": "rwm", + "file_mode": 438 + } ] } diff --git a/libcontainer/container_test.go b/libcontainer/container_test.go index 23b15d2..8382818 100644 --- a/libcontainer/container_test.go +++ b/libcontainer/container_test.go @@ -4,8 +4,6 @@ import ( "encoding/json" "os" "testing" - - "github.com/dotcloud/docker/pkg/libcontainer/mount/nodes" ) // Checks whether the expected capability is specified in the capabilities. 
@@ -68,11 +66,4 @@ func TestContainerJsonFormat(t *testing.T) { t.Log("capabilities mask should not contain SYS_CHROOT") t.Fail() } - - for _, n := range nodes.DefaultNodes { - if !contains(n, container.RequiredDeviceNodes) { - t.Logf("devices should contain %s", n) - t.Fail() - } - } } diff --git a/libcontainer/devices/devices.go b/libcontainer/devices/devices.go new file mode 100644 index 0000000..6423337 --- /dev/null +++ b/libcontainer/devices/devices.go @@ -0,0 +1,239 @@ +package devices + +import ( + "fmt" + "os" + "syscall" +) + +const ( + Wildcard = -1 +) + +type Device struct { + Type rune `json:"type,omitempty"` + Path string `json:"path,omitempty"` // It is fine if this is an empty string in the case that you are using Wildcards + MajorNumber int64 `json:"major_number,omitempty"` // Use the wildcard constant for wildcards. + MinorNumber int64 `json:"minor_number,omitempty"` // Use the wildcard constant for wildcards. + CgroupPermissions string `json:"cgroup_permissions,omitempty"` // Typically just "rwm" + FileMode os.FileMode `json:"file_mode,omitempty"` // The permission bits of the file's mode +} + +func GetDeviceNumberString(deviceNumber int64) string { + if deviceNumber == Wildcard { + return "*" + } else { + return fmt.Sprintf("%d", deviceNumber) + } +} + +func (device Device) GetCgroupAllowString() string { + return fmt.Sprintf("%c %s:%s %s", device.Type, GetDeviceNumberString(device.MajorNumber), GetDeviceNumberString(device.MinorNumber), device.CgroupPermissions) +} + +// Given the path to a device and its cgroup_permissions (which cannot be easily queried), look up the information about a linux device and return that information as a Device struct. 
+func GetDevice(path string, cgroupPermissions string) (Device, error) { + var ( + err error + fileInfo os.FileInfo + mode os.FileMode + fileModePermissionBits os.FileMode + devType rune + devNumber int + stat_t *syscall.Stat_t + ok bool + device Device + ) + + fileInfo, err = os.Stat(path) + if err != nil { + return Device{}, err + } + + mode = fileInfo.Mode() + fileModePermissionBits = os.FileMode.Perm(mode) + switch { + case (mode & os.ModeDevice) == 0: + return Device{}, fmt.Errorf("%s is not a device", path) + case (mode & os.ModeCharDevice) != 0: + fileModePermissionBits |= syscall.S_IFCHR + devType = 'c' + default: + fileModePermissionBits |= syscall.S_IFBLK + devType = 'b' + } + + stat_t, ok = fileInfo.Sys().(*syscall.Stat_t) + if !ok { + return Device{}, fmt.Errorf("cannot determine the device number for device %s", path) + } + devNumber = int(stat_t.Rdev) + + device = Device{ + Type: devType, + Path: path, + MajorNumber: Major(devNumber), + MinorNumber: Minor(devNumber), + CgroupPermissions: cgroupPermissions, + FileMode: fileModePermissionBits, + } + return device, nil +} + +var ( + // These are devices that are to be both allowed and created. 
+ + DefaultSimpleDevices = []Device{ + // /dev/null and zero + { + Path: "/dev/null", + Type: 'c', + MajorNumber: 1, + MinorNumber: 3, + CgroupPermissions: "rwm", + FileMode: 0666, + }, + { + Path: "/dev/zero", + Type: 'c', + MajorNumber: 1, + MinorNumber: 5, + CgroupPermissions: "rwm", + FileMode: 0666, + }, + + { + Path: "/dev/full", + Type: 'c', + MajorNumber: 1, + MinorNumber: 7, + CgroupPermissions: "rwm", + FileMode: 0666, + }, + + // consoles and ttys + { + Path: "/dev/tty", + Type: 'c', + MajorNumber: 5, + MinorNumber: 0, + CgroupPermissions: "rwm", + FileMode: 0666, + }, + + // /dev/urandom,/dev/random + { + Path: "/dev/urandom", + Type: 'c', + MajorNumber: 1, + MinorNumber: 9, + CgroupPermissions: "rwm", + FileMode: 0666, + }, + { + Path: "/dev/random", + Type: 'c', + MajorNumber: 1, + MinorNumber: 8, + CgroupPermissions: "rwm", + FileMode: 0666, + }, + } + + DefaultAllowedDevices = append([]Device{ + // allow mknod for any device + { + Type: 'c', + MajorNumber: Wildcard, + MinorNumber: Wildcard, + CgroupPermissions: "m", + }, + { + Type: 'b', + MajorNumber: Wildcard, + MinorNumber: Wildcard, + CgroupPermissions: "m", + }, + + { + Path: "/dev/console", + Type: 'c', + MajorNumber: 5, + MinorNumber: 1, + CgroupPermissions: "rwm", + }, + { + Path: "/dev/tty0", + Type: 'c', + MajorNumber: 4, + MinorNumber: 0, + CgroupPermissions: "rwm", + }, + { + Path: "/dev/tty1", + Type: 'c', + MajorNumber: 4, + MinorNumber: 1, + CgroupPermissions: "rwm", + }, + // /dev/pts/ - pts namespaces are "coming soon" + { + Path: "", + Type: 'c', + MajorNumber: 136, + MinorNumber: Wildcard, + CgroupPermissions: "rwm", + }, + { + Path: "", + Type: 'c', + MajorNumber: 5, + MinorNumber: 2, + CgroupPermissions: "rwm", + }, + + // tuntap + { + Path: "", + Type: 'c', + MajorNumber: 10, + MinorNumber: 200, + CgroupPermissions: "rwm", + }, + + /*// fuse + { + Path: "", + Type: 'c', + MajorNumber: 10, + MinorNumber: 229, + CgroupPermissions: "rwm", + }, + + // rtc + { + Path: "", + Type: 
'c', + MajorNumber: 254, + MinorNumber: 0, + CgroupPermissions: "rwm", + }, + */ + }, DefaultSimpleDevices...) + + DefaultAutoCreatedDevices = append([]Device{ + { + // /dev/fuse is created but not allowed. + // This is to allow java to work, because java + // insists on there being a /dev/fuse + // https://github.com/dotcloud/docker/issues/514 + // https://github.com/dotcloud/docker/issues/2393 + // + Path: "/dev/fuse", + Type: 'c', + MajorNumber: 10, + MinorNumber: 229, + CgroupPermissions: "rwm", + }, + }, DefaultSimpleDevices...) +) diff --git a/libcontainer/devices/number.go b/libcontainer/devices/number.go new file mode 100644 index 0000000..3aae380 --- /dev/null +++ b/libcontainer/devices/number.go @@ -0,0 +1,26 @@ +package devices + +/* + +This code provides support for manipulating linux device numbers. It should be replaced by normal syscall functions once http://code.google.com/p/go/issues/detail?id=8106 is solved. + +You can read what they are here: + + - http://www.makelinux.net/ldd3/chp-3-sect-2 + - http://www.linux-tutorial.info/modules.php?name=MContent&pageid=94 + +Note! These are NOT the same as the MAJOR(dev_t device);, MINOR(dev_t device); and MKDEV(int major, int minor); functions as defined in linux/kdev_t.h, as the representation of device numbers used by go is different from the one used internally by the kernel! 
- https://github.com/torvalds/linux/blob/master/include/linux/kdev_t.h#L9 + +*/ + +func Major(devNumber int) int64 { + return int64((devNumber >> 8) & 0xfff) +} + +func Minor(devNumber int) int64 { + return int64((devNumber & 0xff) | ((devNumber >> 12) & 0xfff00)) +} + +func Mkdev(majorNumber int64, minorNumber int64) int { + return int((majorNumber << 8) | (minorNumber & 0xff) | ((minorNumber & 0xfff00) << 12)) +} diff --git a/libcontainer/mount/init.go b/libcontainer/mount/init.go index 82c76aa..af7a521 100644 --- a/libcontainer/mount/init.go +++ b/libcontainer/mount/init.go @@ -48,11 +48,8 @@ func InitializeMountNamespace(rootfs, console string, container *libcontainer.Co if err := setupBindmounts(rootfs, container.Mounts); err != nil { return fmt.Errorf("bind mounts %s", err) } - if err := nodes.CopyN(rootfs, container.RequiredDeviceNodes, true); err != nil { - return fmt.Errorf("copy required dev nodes %s", err) - } - if err := nodes.CopyN(rootfs, container.OptionalDeviceNodes, false); err != nil { - return fmt.Errorf("copy optional dev nodes %s", err) + if err := nodes.CreateDeviceNodes(rootfs, container.DeviceNodes); err != nil { + return fmt.Errorf("create device nodes %s", err) } if err := SetupPtmx(rootfs, console, container.Context["mount_label"]); err != nil { return err diff --git a/libcontainer/mount/nodes/nodes.go b/libcontainer/mount/nodes/nodes.go index f8e6e97..18ef487 100644 --- a/libcontainer/mount/nodes/nodes.go +++ b/libcontainer/mount/nodes/nodes.go @@ -9,47 +9,27 @@ import ( "path/filepath" "syscall" + "github.com/dotcloud/docker/pkg/libcontainer/devices" "github.com/dotcloud/docker/pkg/system" ) -// Default list of device nodes to copy -var DefaultNodes = []string{ - "/dev/null", - "/dev/zero", - "/dev/full", - "/dev/random", - "/dev/urandom", - "/dev/tty", -} - -// CopyN copies the device node from the host into the rootfs -func CopyN(rootfs string, nodesToCopy []string, shouldExist bool) error { +// Create the device nodes in the 
container. +func CreateDeviceNodes(rootfs string, nodesToCreate []devices.Device) error { oldMask := system.Umask(0000) defer system.Umask(oldMask) - for _, node := range nodesToCopy { - if err := Copy(rootfs, node, shouldExist); err != nil { + for _, node := range nodesToCreate { + if err := CreateDeviceNode(rootfs, node); err != nil { return err } } return nil } -// Copy copies the device node into the rootfs. If the node -// on the host system does not exist and the boolean flag is passed -// an error will be returned -func Copy(rootfs, node string, shouldExist bool) error { - stat, err := os.Stat(node) - if err != nil { - if os.IsNotExist(err) && !shouldExist { - return nil - } - return err - } - +// Creates the device node in the rootfs of the container. +func CreateDeviceNode(rootfs string, node devices.Device) error { var ( - dest = filepath.Join(rootfs, node) - st = stat.Sys().(*syscall.Stat_t) + dest = filepath.Join(rootfs, node.Path) parent = filepath.Dir(dest) ) @@ -57,13 +37,23 @@ func Copy(rootfs, node string, shouldExist bool) error { return err } - if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { - return fmt.Errorf("mknod %s %s", node, err) + fileMode := node.FileMode + switch node.Type { + case 'c': + fileMode |= syscall.S_IFCHR + case 'b': + fileMode |= syscall.S_IFBLK + default: + return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path) + } + + if err := system.Mknod(dest, uint32(fileMode), devices.Mkdev(node.MajorNumber, node.MinorNumber)); err != nil && !os.IsExist(err) { + return fmt.Errorf("mknod %s %s", node.Path, err) } return nil } -func getNodes(path string) ([]string, error) { +func getDeviceNodes(path string) ([]string, error) { out := []string{} files, err := ioutil.ReadDir(path) if err != nil { @@ -71,7 +61,7 @@ func getNodes(path string) ([]string, error) { } for _, f := range files { if f.IsDir() && f.Name() != "pts" && f.Name() != "shm" { - sub, err := 
getNodes(filepath.Join(path, f.Name())) + sub, err := getDeviceNodes(filepath.Join(path, f.Name())) if err != nil { return nil, err } @@ -84,5 +74,5 @@ func getNodes(path string) ([]string, error) { } func GetHostDeviceNodes() ([]string, error) { - return getNodes("/dev") + return getDeviceNodes("/dev") } diff --git a/libcontainer/mount/nodes/nodes_unsupported.go b/libcontainer/mount/nodes/nodes_unsupported.go index 24409f4..b92f89b 100644 --- a/libcontainer/mount/nodes/nodes_unsupported.go +++ b/libcontainer/mount/nodes/nodes_unsupported.go @@ -2,10 +2,15 @@ package nodes -import "github.com/dotcloud/docker/pkg/libcontainer" - -var DefaultNodes = []string{} +import ( + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/devices" +) func GetHostDeviceNodes() ([]string, error) { return nil, libcontainer.ErrUnsupported } + +func CreateDeviceNodes(rootfs string, nodesToCreate []devices.Device) error { + return libcontainer.ErrUnsupported +}