a5364236a7
It has been pointed out that some files in /proc and /sys can be used to break out of containers. However, if those filesystems are mounted read-only, most of the known exploits are mitigated, since they rely on writing some file in those filesystems. This does not replace security modules (like SELinux or AppArmor), it is just another layer of security. Likewise, it doesn't mean that the other mitigations (shadowing parts of /proc or /sys with bind mounts) are useless. Those measures are still useful. As such, the shadowing of /proc/kcore is still enabled with both LXC and native drivers. Special care has to be taken with /proc/1/attr, which still needs to be mounted read-write in order to enable the AppArmor profile. It is bind-mounted from a private read-write mount of procfs. All that enforcement is done in dockerinit. The code doing the real work is in libcontainer. The init function for the LXC driver calls the function from libcontainer to avoid code duplication. Docker-DCO-1.1-Signed-off-by: Jérôme Petazzoni <jerome@docker.com> (github: jpetazzo)
137 lines
4.5 KiB
Go
137 lines
4.5 KiB
Go
// +build linux
|
|
|
|
package mount
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"syscall"
|
|
|
|
"github.com/dotcloud/docker/pkg/label"
|
|
"github.com/dotcloud/docker/pkg/libcontainer"
|
|
"github.com/dotcloud/docker/pkg/libcontainer/mount/nodes"
|
|
"github.com/dotcloud/docker/pkg/system"
|
|
)
|
|
|
|
// defaultMountFlags is the flag set applied to the proc, sysfs and shm
// system mounts built by newSystemMounts: no setuid binaries, no device
// nodes, and no program execution.
const defaultMountFlags = syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC
|
|
|
|
// mount describes a single filesystem mount to perform while preparing the
// container's root filesystem. Its fields are handed, in order, to
// system.Mount by mountSystem.
type mount struct {
	// source is the mount source argument (e.g. "proc", "sysfs", "tmpfs").
	source string
	// path is the mount point; mountSystem creates it before mounting.
	path string
	// device is the filesystem type argument.
	device string
	// flags holds the syscall.MS_* mount flags.
	flags int
	// data is the filesystem-specific option string.
	data string
}
|
|
|
|
// InitializeMountNamespace setups up the devices, mount points, and filesystems for use inside a
|
|
// new mount namepsace
|
|
func InitializeMountNamespace(rootfs, console string, container *libcontainer.Container) error {
|
|
var (
|
|
err error
|
|
flag = syscall.MS_PRIVATE
|
|
)
|
|
if container.NoPivotRoot {
|
|
flag = syscall.MS_SLAVE
|
|
}
|
|
if err := system.Mount("", "/", "", uintptr(flag|syscall.MS_REC), ""); err != nil {
|
|
return fmt.Errorf("mounting / as slave %s", err)
|
|
}
|
|
if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
|
|
return fmt.Errorf("mouting %s as bind %s", rootfs, err)
|
|
}
|
|
if err := mountSystem(rootfs, container); err != nil {
|
|
return fmt.Errorf("mount system %s", err)
|
|
}
|
|
if err := setupBindmounts(rootfs, container.Mounts); err != nil {
|
|
return fmt.Errorf("bind mounts %s", err)
|
|
}
|
|
if err := nodes.CopyN(rootfs, nodes.DefaultNodes); err != nil {
|
|
return fmt.Errorf("copy dev nodes %s", err)
|
|
}
|
|
if err := SetupPtmx(rootfs, console, container.Context["mount_label"]); err != nil {
|
|
return err
|
|
}
|
|
if err := system.Chdir(rootfs); err != nil {
|
|
return fmt.Errorf("chdir into %s %s", rootfs, err)
|
|
}
|
|
|
|
if container.NoPivotRoot {
|
|
err = MsMoveRoot(rootfs)
|
|
} else {
|
|
err = PivotRoot(rootfs)
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if container.ReadonlyFs {
|
|
if err := SetReadonly(); err != nil {
|
|
return fmt.Errorf("set readonly %s", err)
|
|
}
|
|
}
|
|
|
|
system.Umask(0022)
|
|
|
|
return nil
|
|
}
|
|
|
|
// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts
|
|
// inside the mount namespace
|
|
func mountSystem(rootfs string, container *libcontainer.Container) error {
|
|
for _, m := range newSystemMounts(rootfs, container.Context["mount_label"], container.Mounts) {
|
|
if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) {
|
|
return fmt.Errorf("mkdirall %s %s", m.path, err)
|
|
}
|
|
if err := system.Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil {
|
|
return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func setupBindmounts(rootfs string, bindMounts libcontainer.Mounts) error {
|
|
for _, m := range bindMounts.OfType("bind") {
|
|
var (
|
|
flags = syscall.MS_BIND | syscall.MS_REC
|
|
dest = filepath.Join(rootfs, m.Destination)
|
|
)
|
|
if !m.Writable {
|
|
flags = flags | syscall.MS_RDONLY
|
|
}
|
|
if err := system.Mount(m.Source, dest, "bind", uintptr(flags), ""); err != nil {
|
|
return fmt.Errorf("mounting %s into %s %s", m.Source, dest, err)
|
|
}
|
|
if !m.Writable {
|
|
if err := system.Mount(m.Source, dest, "bind", uintptr(flags|syscall.MS_REMOUNT), ""); err != nil {
|
|
return fmt.Errorf("remounting %s into %s %s", m.Source, dest, err)
|
|
}
|
|
}
|
|
if m.Private {
|
|
if err := system.Mount("", dest, "none", uintptr(syscall.MS_PRIVATE), ""); err != nil {
|
|
return fmt.Errorf("mounting %s private %s", dest, err)
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// TODO: this is crappy right now and should be cleaned up with a better way of handling system and
|
|
// standard bind mounts allowing them to be more dynamic
|
|
func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mount {
|
|
systemMounts := []mount{
|
|
{source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags},
|
|
{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags},
|
|
}
|
|
|
|
if len(mounts.OfType("devtmpfs")) == 1 {
|
|
systemMounts = append(systemMounts, mount{source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: label.FormatMountLabel("mode=755", mountLabel)})
|
|
}
|
|
systemMounts = append(systemMounts,
|
|
mount{source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1777,size=65536k", mountLabel)},
|
|
mount{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)},
|
|
)
|
|
|
|
return systemMounts
|
|
}
|