a5364236a7
It has been pointed out that some files in /proc and /sys can be used to break out of containers. However, if those filesystems are mounted read-only, most of the known exploits are mitigated, since they rely on writing some file in those filesystems. This does not replace security modules (like SELinux or AppArmor), it is just another layer of security. Likewise, it doesn't mean that the other mitigations (shadowing parts of /proc or /sys with bind mounts) are useless. Those measures are still useful. As such, the shadowing of /proc/kcore is still enabled with both LXC and native drivers. Special care has to be taken with /proc/1/attr, which still needs to be mounted read-write in order to enable the AppArmor profile. It is bind-mounted from a private read-write mount of procfs. All that enforcement is done in dockerinit. The code doing the real work is in libcontainer. The init function for the LXC driver calls the function from libcontainer to avoid code duplication. Docker-DCO-1.1-Signed-off-by: Jérôme Petazzoni <jerome@docker.com> (github: jpetazzo)
160 lines
4.7 KiB
Go
160 lines
4.7 KiB
Go
// +build linux
|
|
|
|
package nsinit
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"runtime"
|
|
"strings"
|
|
"syscall"
|
|
|
|
"github.com/dotcloud/docker/pkg/apparmor"
|
|
"github.com/dotcloud/docker/pkg/label"
|
|
"github.com/dotcloud/docker/pkg/libcontainer"
|
|
"github.com/dotcloud/docker/pkg/libcontainer/console"
|
|
"github.com/dotcloud/docker/pkg/libcontainer/mount"
|
|
"github.com/dotcloud/docker/pkg/libcontainer/network"
|
|
"github.com/dotcloud/docker/pkg/libcontainer/security/capabilities"
|
|
"github.com/dotcloud/docker/pkg/libcontainer/security/restrict"
|
|
"github.com/dotcloud/docker/pkg/libcontainer/utils"
|
|
"github.com/dotcloud/docker/pkg/system"
|
|
"github.com/dotcloud/docker/pkg/user"
|
|
)
|
|
|
|
// Init is the init process that first runs inside a new namespace to setup mounts, users, networking,
|
|
// and other options required for the new container.
|
|
func Init(container *libcontainer.Container, uncleanRootfs, consolePath string, syncPipe *SyncPipe, args []string) error {
|
|
rootfs, err := utils.ResolveRootfs(uncleanRootfs)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// clear the current processes env and replace it with the environment
|
|
// defined on the container
|
|
if err := LoadContainerEnvironment(container); err != nil {
|
|
return err
|
|
}
|
|
|
|
// We always read this as it is a way to sync with the parent as well
|
|
context, err := syncPipe.ReadFromParent()
|
|
if err != nil {
|
|
syncPipe.Close()
|
|
return err
|
|
}
|
|
syncPipe.Close()
|
|
|
|
if consolePath != "" {
|
|
if err := console.OpenAndDup(consolePath); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if _, err := system.Setsid(); err != nil {
|
|
return fmt.Errorf("setsid %s", err)
|
|
}
|
|
if consolePath != "" {
|
|
if err := system.Setctty(); err != nil {
|
|
return fmt.Errorf("setctty %s", err)
|
|
}
|
|
}
|
|
if err := setupNetwork(container, context); err != nil {
|
|
return fmt.Errorf("setup networking %s", err)
|
|
}
|
|
|
|
label.Init()
|
|
|
|
if err := mount.InitializeMountNamespace(rootfs, consolePath, container); err != nil {
|
|
return fmt.Errorf("setup mount namespace %s", err)
|
|
}
|
|
if err := system.Sethostname(container.Hostname); err != nil {
|
|
return fmt.Errorf("sethostname %s", err)
|
|
}
|
|
|
|
runtime.LockOSThread()
|
|
|
|
if restrictionPath := container.Context["restriction_path"]; restrictionPath != "" {
|
|
if err := restrict.Restrict("/", restrictionPath); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
if err := apparmor.ApplyProfile(os.Getpid(), container.Context["apparmor_profile"]); err != nil {
|
|
return err
|
|
}
|
|
if err := label.SetProcessLabel(container.Context["process_label"]); err != nil {
|
|
return fmt.Errorf("set process label %s", err)
|
|
}
|
|
|
|
if err := FinalizeNamespace(container); err != nil {
|
|
return fmt.Errorf("finalize namespace %s", err)
|
|
}
|
|
return system.Execv(args[0], args[0:], container.Env)
|
|
}
|
|
|
|
// SetupUser changes the groups, gid, and uid for the user inside the container
|
|
func SetupUser(u string) error {
|
|
uid, gid, suppGids, err := user.GetUserGroupSupplementary(u, syscall.Getuid(), syscall.Getgid())
|
|
if err != nil {
|
|
return fmt.Errorf("get supplementary groups %s", err)
|
|
}
|
|
if err := system.Setgroups(suppGids); err != nil {
|
|
return fmt.Errorf("setgroups %s", err)
|
|
}
|
|
if err := system.Setgid(gid); err != nil {
|
|
return fmt.Errorf("setgid %s", err)
|
|
}
|
|
if err := system.Setuid(uid); err != nil {
|
|
return fmt.Errorf("setuid %s", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// setupVethNetwork uses the Network config if it is not nil to initialize
|
|
// the new veth interface inside the container for use by changing the name to eth0
|
|
// setting the MTU and IP address along with the default gateway
|
|
func setupNetwork(container *libcontainer.Container, context libcontainer.Context) error {
|
|
for _, config := range container.Networks {
|
|
strategy, err := network.GetStrategy(config.Type)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
err1 := strategy.Initialize(config, context)
|
|
if err1 != nil {
|
|
return err1
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// FinalizeNamespace drops the caps, sets the correct user
|
|
// and working dir, and closes any leaky file descriptors
|
|
// before execing the command inside the namespace
|
|
func FinalizeNamespace(container *libcontainer.Container) error {
|
|
if err := capabilities.DropCapabilities(container); err != nil {
|
|
return fmt.Errorf("drop capabilities %s", err)
|
|
}
|
|
if err := system.CloseFdsFrom(3); err != nil {
|
|
return fmt.Errorf("close open file descriptors %s", err)
|
|
}
|
|
if err := SetupUser(container.User); err != nil {
|
|
return fmt.Errorf("setup user %s", err)
|
|
}
|
|
if container.WorkingDir != "" {
|
|
if err := system.Chdir(container.WorkingDir); err != nil {
|
|
return fmt.Errorf("chdir to %s %s", container.WorkingDir, err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func LoadContainerEnvironment(container *libcontainer.Container) error {
|
|
os.Clearenv()
|
|
for _, pair := range container.Env {
|
|
p := strings.SplitN(pair, "=", 2)
|
|
if err := os.Setenv(p[0], p[1]); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|