Mount /proc and /sys read-only, except in privileged containers.

It has been pointed out that some files in /proc and /sys can be used
to break out of containers. However, if those filesystems are mounted
read-only, most of the known exploits are mitigated, since they rely
on writing to files in those filesystems.

This does not replace security modules (like SELinux or AppArmor); it
is just another layer of security. Likewise, it doesn't mean that the
other mitigations (shadowing parts of /proc or /sys with bind mounts)
are useless. Those measures are still useful. As such, the shadowing
of /proc/kcore is still enabled with both LXC and native drivers.

Special care has to be taken with /proc/1/attr, which still needs to
be mounted read-write in order to enable the AppArmor profile. It is
bind-mounted from a private read-write mount of procfs.

All that enforcement is done in dockerinit. The code doing the real
work is in libcontainer. The init function for the LXC driver calls
the function from libcontainer to avoid code duplication.

Docker-DCO-1.1-Signed-off-by: Jérôme Petazzoni <jerome@docker.com> (github: jpetazzo)
This commit is contained in:
Jérôme Petazzoni 2014-04-30 18:00:42 -07:00 committed by Michael Crosby
parent ecb2b00021
commit a5364236a7
3 changed files with 68 additions and 44 deletions

View file

@ -11,7 +11,6 @@ import (
"github.com/dotcloud/docker/pkg/label" "github.com/dotcloud/docker/pkg/label"
"github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer"
"github.com/dotcloud/docker/pkg/libcontainer/mount/nodes" "github.com/dotcloud/docker/pkg/libcontainer/mount/nodes"
"github.com/dotcloud/docker/pkg/libcontainer/security/restrict"
"github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/system"
) )
@ -51,11 +50,6 @@ func InitializeMountNamespace(rootfs, console string, container *libcontainer.Co
if err := nodes.CopyN(rootfs, nodes.DefaultNodes); err != nil { if err := nodes.CopyN(rootfs, nodes.DefaultNodes); err != nil {
return fmt.Errorf("copy dev nodes %s", err) return fmt.Errorf("copy dev nodes %s", err)
} }
if restrictionPath := container.Context["restriction_path"]; restrictionPath != "" {
if err := restrict.Restrict(rootfs, restrictionPath); err != nil {
return fmt.Errorf("restrict %s", err)
}
}
if err := SetupPtmx(rootfs, console, container.Context["mount_label"]); err != nil { if err := SetupPtmx(rootfs, console, container.Context["mount_label"]); err != nil {
return err return err
} }
@ -124,10 +118,11 @@ func setupBindmounts(rootfs string, bindMounts libcontainer.Mounts) error {
} }
// TODO: this is crappy right now and should be cleaned up with a better way of handling system and // TODO: this is crappy right now and should be cleaned up with a better way of handling system and
// standard bind mounts allowing them to be more dymanic // standard bind mounts allowing them to be more dynamic
func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mount { func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mount {
systemMounts := []mount{ systemMounts := []mount{
{source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags},
{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags},
} }
if len(mounts.OfType("devtmpfs")) == 1 { if len(mounts.OfType("devtmpfs")) == 1 {
@ -138,8 +133,5 @@ func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mo
mount{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)}, mount{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)},
) )
if len(mounts.OfType("sysfs")) == 1 {
systemMounts = append(systemMounts, mount{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags})
}
return systemMounts return systemMounts
} }

View file

@ -16,6 +16,7 @@ import (
"github.com/dotcloud/docker/pkg/libcontainer/mount" "github.com/dotcloud/docker/pkg/libcontainer/mount"
"github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/libcontainer/network"
"github.com/dotcloud/docker/pkg/libcontainer/security/capabilities" "github.com/dotcloud/docker/pkg/libcontainer/security/capabilities"
"github.com/dotcloud/docker/pkg/libcontainer/security/restrict"
"github.com/dotcloud/docker/pkg/libcontainer/utils" "github.com/dotcloud/docker/pkg/libcontainer/utils"
"github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/system"
"github.com/dotcloud/docker/pkg/user" "github.com/dotcloud/docker/pkg/user"
@ -68,18 +69,25 @@ func Init(container *libcontainer.Container, uncleanRootfs, consolePath string,
if err := system.Sethostname(container.Hostname); err != nil { if err := system.Sethostname(container.Hostname); err != nil {
return fmt.Errorf("sethostname %s", err) return fmt.Errorf("sethostname %s", err)
} }
if err := FinalizeNamespace(container); err != nil {
return fmt.Errorf("finalize namespace %s", err)
}
runtime.LockOSThread() runtime.LockOSThread()
if restrictionPath := container.Context["restriction_path"]; restrictionPath != "" {
if err := restrict.Restrict("/", restrictionPath); err != nil {
return err
}
}
if err := apparmor.ApplyProfile(os.Getpid(), container.Context["apparmor_profile"]); err != nil { if err := apparmor.ApplyProfile(os.Getpid(), container.Context["apparmor_profile"]); err != nil {
return err return err
} }
if err := label.SetProcessLabel(container.Context["process_label"]); err != nil { if err := label.SetProcessLabel(container.Context["process_label"]); err != nil {
return fmt.Errorf("set process label %s", err) return fmt.Errorf("set process label %s", err)
} }
if err := FinalizeNamespace(container); err != nil {
return fmt.Errorf("finalize namespace %s", err)
}
return system.Execv(args[0], args[0:], container.Env) return system.Execv(args[0], args[0:], container.Env)
} }

View file

@ -9,43 +9,67 @@ import (
"github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/system"
) )
const flags = syscall.MS_BIND | syscall.MS_REC | syscall.MS_RDONLY // "restrictions" are container paths (files, directories, whatever) that have to be masked.
// maskPath is a "safe" path to be mounted over maskedPath. It can take two special values:
var restrictions = map[string]string{ // - if it is "", then nothing is mounted;
// dirs // - if it is "EMPTY", then an empty directory is mounted instead.
"/proc/sys": "", // If remountRO is true then the maskedPath is remounted read-only (regardless of whether a maskPath was used).
"/proc/irq": "", type restriction struct {
"/proc/acpi": "", maskedPath string
maskPath string
// files remountRO bool
"/proc/sysrq-trigger": "/dev/null",
"/proc/kcore": "/dev/null",
} }
// Restrict locks down access to many areas of proc var restrictions = []restriction{
// by using the asumption that the user does not have mount caps to {"/proc", "", true},
// revert the changes made here {"/sys", "", true},
{"/proc/kcore", "/dev/null", false},
}
// This has to be called while the container still has CAP_SYS_ADMIN (to be able to perform mounts).
// However, afterwards, CAP_SYS_ADMIN should be dropped (otherwise the user will be able to revert those changes).
// "empty" should be the path to an empty directory.
func Restrict(rootfs, empty string) error { func Restrict(rootfs, empty string) error {
for dest, source := range restrictions { for _, restriction := range restrictions {
dest = filepath.Join(rootfs, dest) dest := filepath.Join(rootfs, restriction.maskedPath)
if restriction.maskPath != "" {
// we don't have a "/dev/null" for dirs so have the requester pass a dir var source string
// for us to bind mount if restriction.maskPath == "EMPTY" {
switch source {
case "":
source = empty source = empty
default: } else {
source = filepath.Join(rootfs, source) source = filepath.Join(rootfs, restriction.maskPath)
} }
if err := system.Mount(source, dest, "bind", flags, ""); err != nil { if err := system.Mount(source, dest, "", syscall.MS_BIND, ""); err != nil {
if os.IsNotExist(err) { return fmt.Errorf("unable to bind-mount %s over %s: %s", source, dest, err)
continue
} }
return fmt.Errorf("unable to mount %s over %s %s", source, dest, err)
} }
if err := system.Mount("", dest, "bind", flags|syscall.MS_REMOUNT, ""); err != nil { if restriction.remountRO {
return fmt.Errorf("unable to mount %s over %s %s", source, dest, err) if err := system.Mount("", dest, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil {
return fmt.Errorf("unable to remount %s readonly: %s", dest, err)
} }
} }
}
// This weird trick will allow us to mount /proc read-only, while being able to use AppArmor.
// This is because apparently, loading an AppArmor profile requires write access to /proc/1/attr.
// So we do another mount of procfs, ensure it's write-able, and bind-mount a subset of it.
tmpProcPath := filepath.Join(rootfs, ".proc")
if err := os.Mkdir(tmpProcPath, 0700); err != nil {
return fmt.Errorf("unable to create temporary proc mountpoint %s: %s", tmpProcPath, err)
}
if err := system.Mount("proc", tmpProcPath, "proc", 0, ""); err != nil {
return fmt.Errorf("unable to mount proc on temporary proc mountpoint: %s", err)
}
if err := system.Mount("proc", tmpProcPath, "", syscall.MS_REMOUNT, ""); err != nil {
return fmt.Errorf("unable to remount proc read-write: %s", err)
}
rwAttrPath := filepath.Join(rootfs, ".proc", "1", "attr")
roAttrPath := filepath.Join(rootfs, "proc", "1", "attr")
if err := system.Mount(rwAttrPath, roAttrPath, "", syscall.MS_BIND, ""); err != nil {
return fmt.Errorf("unable to bind-mount %s on %s: %s", rwAttrPath, roAttrPath, err)
}
if err := system.Unmount(tmpProcPath, 0); err != nil {
return fmt.Errorf("unable to unmount temporary proc filesystem: %s", err)
}
return nil return nil
} }