diff --git a/cgroups/apply_nosystemd.go b/cgroups/apply_nosystemd.go new file mode 100644 index 0000000..f94d475 --- /dev/null +++ b/cgroups/apply_nosystemd.go @@ -0,0 +1,15 @@ +// +build !linux + +package cgroups + +import ( + "fmt" +) + +func useSystemd() bool { + return false +} + +func systemdApply(c *Cgroup, pid int) (ActiveCgroup, error) { + return nil, fmt.Errorf("Systemd not supported") +} diff --git a/cgroups/apply_raw.go b/cgroups/apply_raw.go new file mode 100644 index 0000000..47a2a00 --- /dev/null +++ b/cgroups/apply_raw.go @@ -0,0 +1,189 @@ +package cgroups + +import ( + "fmt" + "os" + "path/filepath" + "strconv" +) + +type rawCgroup struct { + root string + cgroup string +} + +func rawApply(c *Cgroup, pid int) (ActiveCgroup, error) { + // We have two implementation of cgroups support, one is based on + // systemd and the dbus api, and one is based on raw cgroup fs operations + // following the pre-single-writer model docs at: + // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ + // + // we can pick any subsystem to find the root + + cgroupRoot, err := FindCgroupMountpoint("cpu") + if err != nil { + return nil, err + } + cgroupRoot = filepath.Dir(cgroupRoot) + + if _, err := os.Stat(cgroupRoot); err != nil { + return nil, fmt.Errorf("cgroups fs not found") + } + + cgroup := c.Name + if c.Parent != "" { + cgroup = filepath.Join(c.Parent, cgroup) + } + + raw := &rawCgroup{ + root: cgroupRoot, + cgroup: cgroup, + } + + if err := raw.setupDevices(c, pid); err != nil { + return nil, err + } + if err := raw.setupMemory(c, pid); err != nil { + return nil, err + } + if err := raw.setupCpu(c, pid); err != nil { + return nil, err + } + return raw, nil +} + +func (raw *rawCgroup) path(subsystem string) (string, error) { + initPath, err := GetInitCgroupDir(subsystem) + if err != nil { + return "", err + } + return filepath.Join(raw.root, subsystem, initPath, raw.cgroup), nil +} + +func (raw *rawCgroup) join(subsystem string, pid int) (string, error) { + path, err := raw.path(subsystem) + if err != nil { + return "", err + } + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return "", err + } + if err := writeFile(path, "cgroup.procs", strconv.Itoa(pid)); err != nil { + return "", err + } + return path, nil +} + +func (raw *rawCgroup) setupDevices(c *Cgroup, pid int) (err error) { + if !c.DeviceAccess { + dir, err := raw.join("devices", pid) + if err != nil { + return err + } + + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if err := writeFile(dir, "devices.deny", "a"); err != nil { + return err + } + + allow := []string{ + // /dev/null, zero, full + "c 1:3 rwm", + "c 1:5 rwm", + "c 1:7 rwm", + + // consoles + "c 5:1 rwm", + "c 5:0 rwm", + "c 4:0 rwm", + "c 4:1 rwm", + + // /dev/urandom,/dev/random + "c 1:9 rwm", + "c 1:8 rwm", + + // /dev/pts/ - pts namespaces are "coming soon" + "c 136:* rwm", + "c 5:2 rwm", + + // tuntap + "c 10:200 rwm", + } + + for _, val := range allow { + if err := writeFile(dir, "devices.allow", val); err != nil { + return err + } + } + } + return nil +} + +func (raw *rawCgroup) setupMemory(c *Cgroup, pid int) (err error) { + if c.Memory != 0 || c.MemorySwap != 0 { + dir, err := raw.join("memory", pid) + if err != nil { + return err + } + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if c.Memory != 0 { + if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { + return err + } + if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { + return err + } + } + // By default, MemorySwap is set to twice the size of RAM. + // If you want to omit MemorySwap, set it to `-1'. + if c.MemorySwap != -1 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil { + return err + } + } + } + return nil +} + +func (raw *rawCgroup) setupCpu(c *Cgroup, pid int) (err error) { + // We always want to join the cpu group, to allow fair cpu scheduling + // on a container basis + dir, err := raw.join("cpu", pid) + if err != nil { + return err + } + if c.CpuShares != 0 { + if err := writeFile(dir, "cpu.shares", strconv.FormatInt(c.CpuShares, 10)); err != nil { + return err + } + } + return nil +} + +func (raw *rawCgroup) Cleanup() error { + get := func(subsystem string) string { + path, _ := raw.path(subsystem) + return path + } + + for _, path := range []string{ + get("memory"), + get("devices"), + get("cpu"), + } { + if path != "" { + os.RemoveAll(path) + } + } + return nil +} diff --git a/cgroups/apply_systemd.go b/cgroups/apply_systemd.go new file mode 100644 index 0000000..c689d57 --- /dev/null +++ b/cgroups/apply_systemd.go @@ -0,0 +1,158 @@ +// +build linux + +package cgroups + +import ( + "fmt" + systemd1 "github.com/coreos/go-systemd/dbus" + "github.com/dotcloud/docker/pkg/systemd" + "github.com/godbus/dbus" + "path/filepath" + "strings" + "sync" +) + +type systemdCgroup struct { +} + +var ( + connLock sync.Mutex + theConn *systemd1.Conn + hasStartTransientUnit bool +) + +func useSystemd() bool { + if !systemd.SdBooted() { + return false + } + + connLock.Lock() + defer connLock.Unlock() + + if theConn == nil { + var err error + theConn, err = systemd1.New() + if err != nil { + return false + } + + // Assume we have StartTransientUnit + hasStartTransientUnit = true + + // But if we get UnknownMethod error we don't + if _, err := theConn.StartTransientUnit("test.scope", "invalid"); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" { + hasStartTransientUnit = false + } + } + } + } + + return hasStartTransientUnit +} + +type DeviceAllow struct { + Node string + Permissions string +} + +func getIfaceForUnit(unitName string) string { + if strings.HasSuffix(unitName, ".scope") { + return "Scope" + } + if strings.HasSuffix(unitName, ".service") { + return "Service" + } + return "Unit" +} + +func systemdApply(c *Cgroup, pid int) (ActiveCgroup, error) { + unitName := c.Parent + "-" + c.Name + ".scope" + slice := "system.slice" + + var properties []systemd1.Property + + for _, v := range c.UnitProperties { + switch v[0] { + case "Slice": + slice = v[1] + default: + return nil, fmt.Errorf("Unknown unit propery %s", v[0]) + } + } + + properties = append(properties, + systemd1.Property{"Slice", dbus.MakeVariant(slice)}, + systemd1.Property{"Description", dbus.MakeVariant("docker container " + c.Name)}, + systemd1.Property{"PIDs", dbus.MakeVariant([]uint32{uint32(pid)})}) + + if !c.DeviceAccess { + properties = append(properties, + systemd1.Property{"DevicePolicy", dbus.MakeVariant("strict")}, + systemd1.Property{"DeviceAllow", dbus.MakeVariant([]DeviceAllow{ + {"/dev/null", "rwm"}, + {"/dev/zero", "rwm"}, + {"/dev/full", "rwm"}, + {"/dev/random", "rwm"}, + {"/dev/urandom", "rwm"}, + {"/dev/tty", "rwm"}, + {"/dev/console", "rwm"}, + {"/dev/tty0", "rwm"}, + {"/dev/tty1", "rwm"}, + {"/dev/pts/ptmx", "rwm"}, + // There is no way to add /dev/pts/* here atm, so we hack this manually below + // /dev/pts/* (how to add this?) + // Same with tuntap, which doesn't exist as a node most of the time + })}) + } + + if c.Memory != 0 { + properties = append(properties, + systemd1.Property{"MemoryLimit", dbus.MakeVariant(uint64(c.Memory))}) + } + // TODO: MemorySwap not available in systemd + + if c.CpuShares != 0 { + properties = append(properties, + systemd1.Property{"CPUShares", dbus.MakeVariant(uint64(c.CpuShares))}) + } + + if _, err := theConn.StartTransientUnit(unitName, "replace", properties...); err != nil { + return nil, err + } + + // To work around the lack of /dev/pts/* support above we need to manually add these + // so, ask systemd for the cgroup used + props, err := theConn.GetUnitTypeProperties(unitName, getIfaceForUnit(unitName)) + if err != nil { + return nil, err + } + + cgroup := props["ControlGroup"].(string) + + if !c.DeviceAccess { + mountpoint, err := FindCgroupMountpoint("devices") + if err != nil { + return nil, err + } + + path := filepath.Join(mountpoint, cgroup) + + // /dev/pts/* + if err := writeFile(path, "devices.allow", "c 136:* rwm"); err != nil { + return nil, err + } + // tuntap + if err := writeFile(path, "devices.allow", "c 10:200 rwm"); err != nil { + return nil, err + } + } + + return &systemdCgroup{}, nil +} + +func (c *systemdCgroup) Cleanup() error { + // systemd cleans up, we don't need to do anything + return nil +} diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index 9d485d1..cdf2687 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -8,7 +8,6 @@ import ( "io/ioutil" "os" "path/filepath" - "strconv" "strings" ) @@ -21,6 +20,12 @@ type Cgroup struct { MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) CpusetCpus string `json:"cpuset_cpus,omitempty"` // CPU to use + + UnitProperties [][2]string `json:"unit_properties,omitempty"` // systemd unit properties +} + +type ActiveCgroup interface { + Cleanup() error } // https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt @@ -63,49 +68,6 @@ func GetInitCgroupDir(subsystem string) (string, error) { return parseCgroupFile(subsystem, f) } -func (c *Cgroup) Path(root, subsystem string) (string, error) { - cgroup := c.Name - if c.Parent != "" { - cgroup = filepath.Join(c.Parent, cgroup) - } - initPath, err := GetInitCgroupDir(subsystem) - if err != nil { - return "", err - } - return filepath.Join(root, subsystem, initPath, cgroup), nil -} - -func (c *Cgroup) Join(root, subsystem string, pid int) (string, error) { - path, err := c.Path(root, subsystem) - if err != nil { - return "", err - } - if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { - return "", err - } - if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil { - return "", err - } - return path, nil -} - -func (c *Cgroup) Cleanup(root string) error { - get := func(subsystem string) string { - path, _ := c.Path(root, subsystem) - return path - } - - for _, path := range []string{ - get("memory"), - get("devices"), - get("cpu"), - get("cpuset"), - } { - os.RemoveAll(path) - } - return nil -} - func parseCgroupFile(subsystem string, r io.Reader) (string, error) { s := bufio.NewScanner(r) for s.Scan() { @@ -127,131 +89,17 @@ func writeFile(dir, file, data string) error { return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) } -func (c *Cgroup) Apply(pid int) error { +func (c *Cgroup) Apply(pid int) (ActiveCgroup, error) { // We have two implementation of cgroups support, one is based on // systemd and the dbus api, and one is based on raw cgroup fs operations // following the pre-single-writer model docs at: // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ - // - // we can pick any subsystem to find the root - cgroupRoot, err := FindCgroupMountpoint("cpu") - if err != nil { - return err + + if useSystemd() { + return systemdApply(c, pid) + } else { + return rawApply(c, pid) } - cgroupRoot = filepath.Dir(cgroupRoot) - - if _, err := os.Stat(cgroupRoot); err != nil { - return fmt.Errorf("cgroups fs not found") - } - if err := c.setupDevices(cgroupRoot, pid); err != nil { - return err - } - if err := c.setupMemory(cgroupRoot, pid); err != nil { - return err - } - if err := c.setupCpu(cgroupRoot, pid); err != nil { - return err - } - if err := c.setupCpuset(cgroupRoot, pid); err != nil { - return err - } - return nil -} - -func (c *Cgroup) setupDevices(cgroupRoot string, pid int) (err error) { - if !c.DeviceAccess { - dir, err := c.Join(cgroupRoot, "devices", pid) - if err != nil { - return err - } - - defer func() { - if err != nil { - os.RemoveAll(dir) - } - }() - - if err := writeFile(dir, "devices.deny", "a"); err != nil { - return err - } - - allow := []string{ - // /dev/null, zero, full - "c 1:3 rwm", - "c 1:5 rwm", - "c 1:7 rwm", - - // consoles - "c 5:1 rwm", - "c 5:0 rwm", - "c 4:0 rwm", - "c 4:1 rwm", - - // /dev/urandom,/dev/random - "c 1:9 rwm", - "c 1:8 rwm", - - // /dev/pts/ - pts namespaces are "coming soon" - "c 136:* rwm", - "c 5:2 rwm", - - // tuntap - "c 10:200 rwm", - } - - for _, val := range allow { - if err := writeFile(dir, "devices.allow", val); err != nil { - return err - } - } - } - return nil -} - -func (c *Cgroup) setupMemory(cgroupRoot string, pid int) (err error) { - if c.Memory != 0 || c.MemorySwap != 0 { - dir, err := c.Join(cgroupRoot, "memory", pid) - if err != nil { - return err - } - defer func() { - if err != nil { - os.RemoveAll(dir) - } - }() - - if c.Memory != 0 { - if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { - return err - } - if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { - return err - } - } - // By default, MemorySwap is set to twice the size of RAM. - // If you want to omit MemorySwap, set it to `-1'. - if c.MemorySwap != -1 { - if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil { - return err - } - } - } - return nil -} - -func (c *Cgroup) setupCpu(cgroupRoot string, pid int) (err error) { - // We always want to join the cpu group, to allow fair cpu scheduling - // on a container basis - dir, err := c.Join(cgroupRoot, "cpu", pid) - if err != nil { - return err - } - if c.CpuShares != 0 { - if err := writeFile(dir, "cpu.shares", strconv.FormatInt(c.CpuShares, 10)); err != nil { - return err - } - } - return nil } func (c *Cgroup) setupCpuset(cgroupRoot string, pid int) (err error) { diff --git a/iptables/iptables.go b/iptables/iptables.go index 4cdd67e..1f25952 100644 --- a/iptables/iptables.go +++ b/iptables/iptables.go @@ -66,7 +66,6 @@ func (c *Chain) Forward(action Action, ip net.IP, port int, proto, dest_addr str "-p", proto, "-d", daddr, "--dport", strconv.Itoa(port), - "!", "-i", c.Bridge, "-j", "DNAT", "--to-destination", net.JoinHostPort(dest_addr, strconv.Itoa(dest_port))); err != nil { return err diff --git a/label/label.go b/label/label.go new file mode 100644 index 0000000..ba1e9f4 --- /dev/null +++ b/label/label.go @@ -0,0 +1,23 @@ +// +build !selinux !linux + +package label + +func GenLabels(options string) (string, string, error) { + return "", "", nil +} + +func FormatMountLabel(src string, MountLabel string) string { + return src +} + +func SetProcessLabel(processLabel string) error { + return nil +} + +func SetFileLabel(path string, fileLabel string) error { + return nil +} + +func GetPidCon(pid int) (string, error) { + return "", nil +} diff --git a/label/label_selinux.go b/label/label_selinux.go new file mode 100644 index 0000000..300a8b6 --- /dev/null +++ b/label/label_selinux.go @@ -0,0 +1,69 @@ +// +build selinux,linux + +package label + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/selinux" + "strings" +) + +func GenLabels(options string) (string, string, error) { + processLabel, mountLabel := selinux.GetLxcContexts() + var err error + if processLabel == "" { // SELinux is disabled + return "", "", err + } + s := strings.Fields(options) + l := len(s) + if l > 0 { + pcon := selinux.NewContext(processLabel) + for i := 0; i < l; i++ { + o := strings.Split(s[i], "=") + pcon[o[0]] = o[1] + } + processLabel = pcon.Get() + mountLabel, err = selinux.CopyLevel(processLabel, mountLabel) + } + return processLabel, mountLabel, err +} + +func FormatMountLabel(src string, MountLabel string) string { + var mountLabel string + if src != "" { + mountLabel = src + if MountLabel != "" { + mountLabel = fmt.Sprintf("%s,context=\"%s\"", mountLabel, MountLabel) + } + } else { + if MountLabel != "" { + mountLabel = fmt.Sprintf("context=\"%s\"", MountLabel) + } + } + return mountLabel +} + +func SetProcessLabel(processLabel string) error { + if selinux.SelinuxEnabled() { + return selinux.Setexeccon(processLabel) + } + return nil +} + +func GetProcessLabel() (string, error) { + if selinux.SelinuxEnabled() { + return selinux.Getexeccon() + } + return "", nil +} + +func SetFileLabel(path string, fileLabel string) error { + if selinux.SelinuxEnabled() && fileLabel != "" { + return selinux.Setfilecon(path, fileLabel) + } + return nil +} + +func GetPidCon(pid int) (string, error) { + return selinux.Getpidcon(pid) +} diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 6e902d1..73842f7 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -7,6 +7,7 @@ import ( "os/exec" "syscall" + "github.com/dotcloud/docker/pkg/cgroups" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" @@ -62,10 +63,15 @@ func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args [ // Do this before syncing with child so that no children // can escape the cgroup ns.logger.Println("setting cgroups") - if err := ns.SetupCgroups(container, command.Process.Pid); err != nil { + activeCgroup, err := ns.SetupCgroups(container, command.Process.Pid) + if err != nil { command.Process.Kill() return -1, err } + if activeCgroup != nil { + defer activeCgroup.Cleanup() + } + ns.logger.Println("setting up network") if err := ns.InitializeNetworking(container, command.Process.Pid, syncPipe); err != nil { command.Process.Kill() @@ -86,13 +92,11 @@ func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args [ return status, err } -func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) error { +func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) (cgroups.ActiveCgroup, error) { if container.Cgroups != nil { - if err := container.Cgroups.Apply(nspid); err != nil { - return err - } + return container.Cgroups.Apply(nspid) } - return nil + return nil, nil } func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error { diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index f8b8931..9017af0 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -4,6 +4,7 @@ package nsinit import ( "fmt" + "github.com/dotcloud/docker/pkg/label" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/system" "os" @@ -32,7 +33,11 @@ func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []s closeFds() return -1, err } - + processLabel, err := label.GetPidCon(nspid) + if err != nil { + closeFds() + return -1, err + } // foreach namespace fd, use setns to join an existing container's namespaces for _, fd := range fds { if fd > 0 { @@ -80,6 +85,10 @@ dropAndExec: if err := finalizeNamespace(container); err != nil { return -1, err } + err = label.SetProcessLabel(processLabel) + if err != nil { + return -1, err + } if err := system.Execv(args[0], args[0:], container.Env); err != nil { return -1, err } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index c39928d..8518232 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -5,8 +5,10 @@ package nsinit import ( "fmt" "os" + "runtime" "syscall" + "github.com/dotcloud/docker/pkg/label" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/apparmor" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" @@ -61,7 +63,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol return fmt.Errorf("setup networking %s", err) } ns.logger.Println("setup mount namespace") - if err := setupNewMountNamespace(rootfs, container.Mounts, console, container.ReadonlyFs, container.NoPivotRoot); err != nil { + if err := setupNewMountNamespace(rootfs, container.Mounts, console, container.ReadonlyFs, container.NoPivotRoot, container.Context["mount_label"]); err != nil { return fmt.Errorf("setup mount namespace %s", err) } if err := system.Sethostname(container.Hostname); err != nil { @@ -77,6 +79,10 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol return err } } + runtime.LockOSThread() + if err := label.SetProcessLabel(container.Context["process_label"]); err != nil { + return fmt.Errorf("SetProcessLabel label %s", err) + } ns.logger.Printf("execing %s\n", args[0]) return system.Execv(args[0], args[0:], container.Env) } diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 19dacfa..a10f2b5 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -4,6 +4,7 @@ package nsinit import ( "fmt" + "github.com/dotcloud/docker/pkg/label" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/system" "io/ioutil" @@ -20,7 +21,7 @@ const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NOD // // There is no need to unmount the new mounts because as soon as the mount namespace // is no longer in use, the mounts will be removed automatically -func setupNewMountNamespace(rootfs string, bindMounts []libcontainer.Mount, console string, readonly, noPivotRoot bool) error { +func setupNewMountNamespace(rootfs string, bindMounts []libcontainer.Mount, console string, readonly, noPivotRoot bool, mountLabel string) error { flag := syscall.MS_PRIVATE if noPivotRoot { flag = syscall.MS_SLAVE @@ -31,7 +32,7 @@ func setupNewMountNamespace(rootfs string, bindMounts []libcontainer.Mount, cons if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mouting %s as bind %s", rootfs, err) } - if err := mountSystem(rootfs); err != nil { + if err := mountSystem(rootfs, mountLabel); err != nil { return fmt.Errorf("mount system %s", err) } @@ -59,7 +60,7 @@ func setupNewMountNamespace(rootfs string, bindMounts []libcontainer.Mount, cons if err := setupDev(rootfs); err != nil { return err } - if err := setupPtmx(rootfs, console); err != nil { + if err := setupPtmx(rootfs, console, mountLabel); err != nil { return err } if err := system.Chdir(rootfs); err != nil { @@ -197,7 +198,7 @@ func setupDev(rootfs string) error { } // setupConsole ensures that the container has a proper /dev/console setup -func setupConsole(rootfs, console string) error { +func setupConsole(rootfs, console string, mountLabel string) error { oldMask := system.Umask(0000) defer system.Umask(oldMask) @@ -221,6 +222,9 @@ func setupConsole(rootfs, console string) error { if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { return fmt.Errorf("mknod %s %s", dest, err) } + if err := label.SetFileLabel(console, mountLabel); err != nil { + return fmt.Errorf("SetFileLabel Failed %s %s", dest, err) + } if err := system.Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { return fmt.Errorf("bind %s to %s %s", console, dest, err) } @@ -229,7 +233,7 @@ func setupConsole(rootfs, console string) error { // mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts // inside the mount namespace -func mountSystem(rootfs string) error { +func mountSystem(rootfs string, mountLabel string) error { for _, m := range []struct { source string path string @@ -239,8 +243,8 @@ func mountSystem(rootfs string) error { }{ {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}, - {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: "mode=1777,size=65536k"}, - {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"}, + {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1755,size=65536k", mountLabel)}, + {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)}, } { if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { return fmt.Errorf("mkdirall %s %s", m.path, err) @@ -254,7 +258,7 @@ func mountSystem(rootfs string) error { // setupPtmx adds a symlink to pts/ptmx for /dev/ptmx and // finishes setting up /dev/console -func setupPtmx(rootfs, console string) error { +func setupPtmx(rootfs, console string, mountLabel string) error { ptmx := filepath.Join(rootfs, "dev/ptmx") if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { return err @@ -263,7 +267,7 @@ func setupPtmx(rootfs, console string) error { return fmt.Errorf("symlink dev ptmx %s", err) } if console != "" { - if err := setupConsole(rootfs, console); err != nil { + if err := setupConsole(rootfs, console, mountLabel); err != nil { return err } } diff --git a/selinux/selinux.go b/selinux/selinux.go new file mode 100644 index 0000000..5236d3f --- /dev/null +++ b/selinux/selinux.go @@ -0,0 +1,387 @@ +package selinux + +import ( + "bufio" + "crypto/rand" + "encoding/binary" + "fmt" + "github.com/dotcloud/docker/pkg/mount" + "github.com/dotcloud/docker/pkg/system" + "io" + "os" + "regexp" + "strconv" + "strings" + "syscall" +) + +const ( + Enforcing = 1 + Permissive = 0 + Disabled = -1 + selinuxDir = "/etc/selinux/" + selinuxConfig = selinuxDir + "config" + selinuxTypeTag = "SELINUXTYPE" + selinuxTag = "SELINUX" + selinuxPath = "/sys/fs/selinux" + xattrNameSelinux = "security.selinux" + stRdOnly = 0x01 +) + +var ( + assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`) + spaceRegex = regexp.MustCompile(`^([^=]+) (.*)$`) + mcsList = make(map[string]bool) + selinuxfs = "unknown" + selinuxEnabled = false + selinuxEnabledChecked = false +) + +type SELinuxContext map[string]string + +func GetSelinuxMountPoint() string { + if selinuxfs != "unknown" { + return selinuxfs + } + selinuxfs = "" + + mounts, err := mount.GetMounts() + if err != nil { + return selinuxfs + } + for _, mount := range mounts { + if mount.Fstype == "selinuxfs" { + selinuxfs = mount.Mountpoint + break + } + } + if selinuxfs != "" { + var buf syscall.Statfs_t + syscall.Statfs(selinuxfs, &buf) + if (buf.Flags & stRdOnly) == 1 { + selinuxfs = "" + } + } + return selinuxfs +} + +func SelinuxEnabled() bool { + if selinuxEnabledChecked { + return selinuxEnabled + } + selinuxEnabledChecked = true + if fs := GetSelinuxMountPoint(); fs != "" { + if con, _ := Getcon(); con != "kernel" { + selinuxEnabled = true + } + } + return selinuxEnabled +} + +func ReadConfig(target string) (value string) { + var ( + val, key string + bufin *bufio.Reader + ) + + in, err := os.Open(selinuxConfig) + if err != nil { + return "" + } + defer in.Close() + + bufin = bufio.NewReader(in) + + for done := false; !done; { + var line string + if line, err = bufin.ReadString('\n'); err != nil { + if err != io.EOF { + return "" + } + done = true + } + line = strings.TrimSpace(line) + if len(line) == 0 { + // Skip blank lines + continue + } + if line[0] == ';' || line[0] == '#' { + // Skip comments + continue + } + if groups := assignRegex.FindStringSubmatch(line); groups != nil { + key, val = strings.TrimSpace(groups[1]), strings.TrimSpace(groups[2]) + if key == target { + return strings.Trim(val, "\"") + } + } + } + return "" +} + +func GetSELinuxPolicyRoot() string { + return selinuxDir + ReadConfig(selinuxTypeTag) +} + +func readCon(name string) (string, error) { + var val string + + in, err := os.Open(name) + if err != nil { + return "", err + } + defer in.Close() + + _, err = fmt.Fscanf(in, "%s", &val) + return val, err +} + +func Setfilecon(path string, scon string) error { + return system.Lsetxattr(path, xattrNameSelinux, []byte(scon), 0) +} + +func Getfilecon(path string) (string, error) { + var scon []byte + + cnt, err := syscall.Getxattr(path, xattrNameSelinux, scon) + scon = make([]byte, cnt) + cnt, err = syscall.Getxattr(path, xattrNameSelinux, scon) + return string(scon), err +} + +func Setfscreatecon(scon string) error { + return writeCon("/proc/self/attr/fscreate", scon) +} + +func Getfscreatecon() (string, error) { + return readCon("/proc/self/attr/fscreate") +} + +func Getcon() (string, error) { + return readCon("/proc/self/attr/current") +} + +func Getpidcon(pid int) (string, error) { + return readCon(fmt.Sprintf("/proc/%d/attr/current", pid)) +} + +func Getexeccon() (string, error) { + return readCon("/proc/self/attr/exec") +} + +func writeCon(name string, val string) error { + if !SelinuxEnabled() { + return nil + } + out, err := os.OpenFile(name, os.O_WRONLY, 0) + if err != nil { + return err + } + defer out.Close() + + if val != "" { + _, err = out.Write([]byte(val)) + } else { + _, err = out.Write(nil) + } + return err +} + +func Setexeccon(scon string) error { + return writeCon(fmt.Sprintf("/proc/self/task/%d/attr/exec", syscall.Gettid()), scon) +} + +func (c SELinuxContext) Get() string { + return fmt.Sprintf("%s:%s:%s:%s", c["user"], c["role"], c["type"], c["level"]) +} + +func NewContext(scon string) SELinuxContext { + c := make(SELinuxContext) + + if len(scon) != 0 { + con := strings.SplitN(scon, ":", 4) + c["user"] = con[0] + c["role"] = con[1] + c["type"] = con[2] + c["level"] = con[3] + } + return c +} + +func SelinuxGetEnforce() int { + var enforce int + + enforceS, err := readCon(fmt.Sprintf("%s/enforce", selinuxPath)) + if err != nil { + return -1 + } + + enforce, err = strconv.Atoi(string(enforceS)) + if err != nil { + return -1 + } + return enforce +} + +func SelinuxGetEnforceMode() int { + switch ReadConfig(selinuxTag) { + case "enforcing": + return Enforcing + case "permissive": + return Permissive + } + return Disabled +} + +func mcsAdd(mcs string) { + mcsList[mcs] = true +} + +func mcsDelete(mcs string) { + mcsList[mcs] = false +} + +func mcsExists(mcs string) bool { + return mcsList[mcs] +} + +func IntToMcs(id int, catRange uint32) string { + var ( + SETSIZE = int(catRange) + TIER = SETSIZE + ORD = id + ) + + if id < 1 || id > 523776 { + return "" + } + + for ORD > TIER { + ORD = ORD - TIER + TIER -= 1 + } + TIER = SETSIZE - TIER + ORD = ORD + TIER + return fmt.Sprintf("s0:c%d,c%d", TIER, ORD) +} + +func uniqMcs(catRange uint32) string { + var ( + n uint32 + c1, c2 uint32 + mcs string + ) + + for { + binary.Read(rand.Reader, binary.LittleEndian, &n) + c1 = n % catRange + binary.Read(rand.Reader, binary.LittleEndian, &n) + c2 = n % catRange + if c1 == c2 { + continue + } else { + if c1 > c2 { + t := c1 + c1 = c2 + c2 = t + } + } + mcs = fmt.Sprintf("s0:c%d,c%d", c1, c2) + if mcsExists(mcs) { + continue + } + mcsAdd(mcs) + break + } + return mcs +} + +func FreeContext(con string) { + if con != "" { + scon := NewContext(con) + mcsDelete(scon["level"]) + } +} + +func GetLxcContexts() (processLabel string, fileLabel string) { + var ( + val, key string + bufin *bufio.Reader + ) + + if !SelinuxEnabled() { + return "", "" + } + lxcPath := fmt.Sprintf("%s/content/lxc_contexts", GetSELinuxPolicyRoot()) + fileLabel = "system_u:object_r:svirt_sandbox_file_t:s0" + processLabel = "system_u:system_r:svirt_lxc_net_t:s0" + + in, err := os.Open(lxcPath) + if err != nil { + goto exit + } + defer in.Close() + + bufin = bufio.NewReader(in) + + for done := false; !done; { + var line string + if line, err = bufin.ReadString('\n'); err != nil { + if err == io.EOF { + done = true + } else { + goto exit + } + } + line = strings.TrimSpace(line) + if len(line) == 0 { + // Skip blank lines + continue + } + if line[0] == ';' || line[0] == '#' { + // Skip comments + continue + } + if groups := assignRegex.FindStringSubmatch(line); groups != nil { + key, val = strings.TrimSpace(groups[1]), strings.TrimSpace(groups[2]) + if key == "process" { + processLabel = strings.Trim(val, "\"") + } + if key == "file" { + fileLabel = strings.Trim(val, "\"") + } + } + } +exit: + mcs := IntToMcs(os.Getpid(), 1024) + scon := NewContext(processLabel) + scon["level"] = mcs + processLabel = scon.Get() + scon = NewContext(fileLabel) + scon["level"] = mcs + fileLabel = scon.Get() + return processLabel, fileLabel +} + +func SecurityCheckContext(val string) error { + return writeCon(fmt.Sprintf("%s.context", selinuxPath), val) +} + +func CopyLevel(src, dest string) (string, error) { + if !SelinuxEnabled() { + return "", nil + } + if src == "" { + return "", nil + } + if err := SecurityCheckContext(src); err != nil { + return "", err + } + if err := SecurityCheckContext(dest); err != nil { + return "", err + } + scon := NewContext(src) + tcon := NewContext(dest) + tcon["level"] = scon["level"] + return tcon.Get(), nil +} diff --git a/selinux/selinux_test.go b/selinux/selinux_test.go new file mode 100644 index 0000000..6b59c1d --- /dev/null +++ b/selinux/selinux_test.go @@ -0,0 +1,64 @@ +package selinux_test + +import ( + "github.com/dotcloud/docker/pkg/selinux" + "os" + "testing" +) + +func testSetfilecon(t *testing.T) { + if selinux.SelinuxEnabled() { + tmp := "selinux_test" + out, _ := os.OpenFile(tmp, os.O_WRONLY, 0) + out.Close() + err := selinux.Setfilecon(tmp, "system_u:object_r:bin_t:s0") + if err == nil { + t.Log(selinux.Getfilecon(tmp)) + } else { + t.Log("Setfilecon failed") + t.Fatal(err) + } + os.Remove(tmp) + } +} + +func TestSELinux(t *testing.T) { + var ( + err error + plabel, flabel string + ) + + if selinux.SelinuxEnabled() { + t.Log("Enabled") + plabel, flabel = selinux.GetLxcContexts() + t.Log(plabel) + t.Log(flabel) + plabel, flabel = selinux.GetLxcContexts() + t.Log(plabel) + t.Log(flabel) + t.Log("getenforce ", selinux.SelinuxGetEnforce()) + t.Log("getenforcemode ", selinux.SelinuxGetEnforceMode()) + pid := os.Getpid() + t.Log("PID:%d MCS:%s\n", pid, selinux.IntToMcs(pid, 1023)) + t.Log(selinux.Getcon()) + t.Log(selinux.Getfilecon("/etc/passwd")) + err = selinux.Setfscreatecon("unconfined_u:unconfined_r:unconfined_t:s0") + if err == nil { + t.Log(selinux.Getfscreatecon()) + } else { + t.Log("setfscreatecon failed", err) + t.Fatal(err) + } + err = selinux.Setfscreatecon("") + if err == nil { + t.Log(selinux.Getfscreatecon()) + } else { + t.Log("setfscreatecon failed", err) + t.Fatal(err) + } + t.Log(selinux.Getpidcon(1)) + t.Log(selinux.GetSelinuxMountPoint()) + } else { + t.Log("Disabled") + } +} diff --git a/systemd/activation/files.go b/systemd/activation/files.go deleted file mode 100644 index 0281146..0000000 --- a/systemd/activation/files.go +++ /dev/null @@ -1,55 +0,0 @@ -/* -Copyright 2013 CoreOS Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Package activation implements primitives for systemd socket activation. -package activation - -import ( - "os" - "strconv" - "syscall" -) - -// based on: https://gist.github.com/alberts/4640792 -const ( - listenFdsStart = 3 -) - -func Files(unsetEnv bool) []*os.File { - if unsetEnv { - // there is no way to unset env in golang os package for now - // https://code.google.com/p/go/issues/detail?id=6423 - defer os.Setenv("LISTEN_PID", "") - defer os.Setenv("LISTEN_FDS", "") - } - - pid, err := strconv.Atoi(os.Getenv("LISTEN_PID")) - if err != nil || pid != os.Getpid() { - return nil - } - - nfds, err := strconv.Atoi(os.Getenv("LISTEN_FDS")) - if err != nil || nfds == 0 { - return nil - } - - var files []*os.File - for fd := listenFdsStart; fd < listenFdsStart+nfds; fd++ { - syscall.CloseOnExec(fd) - files = append(files, os.NewFile(uintptr(fd), "LISTEN_FD_"+strconv.Itoa(fd))) - } - - return files -} diff --git a/systemd/activation/listeners.go b/systemd/activation/listeners.go deleted file mode 100644 index 3296a08..0000000 --- a/systemd/activation/listeners.go +++ /dev/null @@ -1,37 +0,0 @@ -/* -Copyright 2014 CoreOS Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package activation - -import ( - "fmt" - "net" -) - -// Listeners returns net.Listeners for all socket activated fds passed to this process. -func Listeners(unsetEnv bool) ([]net.Listener, error) { - files := Files(unsetEnv) - listeners := make([]net.Listener, len(files)) - - for i, f := range files { - var err error - listeners[i], err = net.FileListener(f) - if err != nil { - return nil, fmt.Errorf("Error setting up FileListener for fd %d: %s", f.Fd(), err.Error()) - } - } - - return listeners, nil -} diff --git a/systemd/booted.go b/systemd/booted.go new file mode 100644 index 0000000..2aae931 --- /dev/null +++ b/systemd/booted.go @@ -0,0 +1,15 @@ +package systemd + +import ( + "os" +) + +// Conversion to Go of systemd's sd_booted() +func SdBooted() bool { + s, err := os.Stat("/run/systemd/system") + if err != nil { + return false + } + + return s.IsDir() +} diff --git a/systemd/listendfd.go b/systemd/listendfd.go index f604432..0fbc0a6 100644 --- a/systemd/listendfd.go +++ b/systemd/listendfd.go @@ -5,7 +5,7 @@ import ( "net" "strconv" - "github.com/dotcloud/docker/pkg/systemd/activation" + "github.com/coreos/go-systemd/activation" ) // ListenFD returns the specified socket activated files as a slice of