From 81d2c6749227e2959d48da20de1f5b050d6da292 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 18 Feb 2014 16:56:11 -0800 Subject: [PATCH 01/64] Initial commit of libcontainer Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/MAINTAINERS | 2 + libcontainer/README.md | 63 +++++ libcontainer/capabilities/capabilities.go | 49 ++++ libcontainer/cli/main.go | 171 ++++++++++++++ libcontainer/container.go | 27 +++ libcontainer/container.json | 38 ++++ libcontainer/errors.go | 9 + libcontainer/namespaces/calls_linux.go | 164 +++++++++++++ libcontainer/namespaces/exec.go | 266 ++++++++++++++++++++++ libcontainer/namespaces/linux_x86_64.go | 7 + libcontainer/namespaces/mount.go | 207 +++++++++++++++++ libcontainer/namespaces/namespaces.go | 70 ++++++ libcontainer/namespaces/ns_linux.go | 35 +++ libcontainer/namespaces/utils.go | 108 +++++++++ libcontainer/network/network.go | 104 +++++++++ libcontainer/network/veth.go | 85 +++++++ libcontainer/privileged.json | 22 ++ libcontainer/types.go | 49 ++++ libcontainer/ubuntu.json | 22 ++ libcontainer/utils/utils.go | 33 +++ 20 files changed, 1531 insertions(+) create mode 100644 libcontainer/MAINTAINERS create mode 100644 libcontainer/README.md create mode 100644 libcontainer/capabilities/capabilities.go create mode 100644 libcontainer/cli/main.go create mode 100644 libcontainer/container.go create mode 100644 libcontainer/container.json create mode 100644 libcontainer/errors.go create mode 100644 libcontainer/namespaces/calls_linux.go create mode 100644 libcontainer/namespaces/exec.go create mode 100644 libcontainer/namespaces/linux_x86_64.go create mode 100644 libcontainer/namespaces/mount.go create mode 100644 libcontainer/namespaces/namespaces.go create mode 100644 libcontainer/namespaces/ns_linux.go create mode 100644 libcontainer/namespaces/utils.go create mode 100644 libcontainer/network/network.go create mode 100644 libcontainer/network/veth.go create mode 100644 libcontainer/privileged.json create mode 100644 libcontainer/types.go create mode 100644 libcontainer/ubuntu.json create mode 100644 libcontainer/utils/utils.go diff --git a/libcontainer/MAINTAINERS b/libcontainer/MAINTAINERS new file mode 100644 index 0000000..e53d933 --- /dev/null +++ b/libcontainer/MAINTAINERS @@ -0,0 +1,2 @@ +Michael Crosby (@crosbymichael) +Guillaume Charmes (@creack) diff --git a/libcontainer/README.md b/libcontainer/README.md new file mode 100644 index 0000000..91d7478 --- /dev/null +++ b/libcontainer/README.md @@ -0,0 +1,63 @@ +## libcontainer - reference implementation for containers + +#### playground + + +Use the cli package to test out functionality + +First setup a container configuration. You will need a root fs, better go the path to a +stopped docker container and use that. + + +```json +{ + "id": "koye", + "namespace_pid": 12265, + "command": { + "args": [ + "/bin/bash" + ], + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm" + ] + }, + "rootfs": "/root/development/gocode/src/github.com/docker/libcontainer/namespaces/ubuntu", + "network": null, + "user": "", + "working_dir": "", + "namespaces": [ + "NEWNET", + "NEWIPC", + "NEWNS", + "NEWPID", + "NEWUTS" + ], + "capabilities": [ + "SETPCAP", + "SYS_MODULE", + "SYS_RAWIO", + "SYS_PACCT", + "SYS_ADMIN", + "SYS_NICE", + "SYS_RESOURCE", + "SYS_TIME", + "SYS_TTY_CONFIG", + "MKNOD", + "AUDIT_WRITE", + "AUDIT_CONTROL", + "MAC_OVERRIDE", + "MAC_ADMIN" + ] +} +``` + +After you have a json file and a rootfs path to use just run: +`./cli exec container.json` + + +If you want to attach to an existing namespace just use the same json +file with the container still running and do: +`./cli execin container.json` diff --git a/libcontainer/capabilities/capabilities.go b/libcontainer/capabilities/capabilities.go new file mode 100644 index 0000000..3301e10 --- /dev/null +++ b/libcontainer/capabilities/capabilities.go @@ -0,0 +1,49 @@ +package capabilities + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/syndtr/gocapability/capability" + "os" +) + +var capMap = map[libcontainer.Capability]capability.Cap{ + libcontainer.CAP_SETPCAP: capability.CAP_SETPCAP, + libcontainer.CAP_SYS_MODULE: capability.CAP_SYS_MODULE, + libcontainer.CAP_SYS_RAWIO: capability.CAP_SYS_RAWIO, + libcontainer.CAP_SYS_PACCT: capability.CAP_SYS_PACCT, + libcontainer.CAP_SYS_ADMIN: capability.CAP_SYS_ADMIN, + libcontainer.CAP_SYS_NICE: capability.CAP_SYS_NICE, + libcontainer.CAP_SYS_RESOURCE: capability.CAP_SYS_RESOURCE, + libcontainer.CAP_SYS_TIME: capability.CAP_SYS_TIME, + libcontainer.CAP_SYS_TTY_CONFIG: capability.CAP_SYS_TTY_CONFIG, + libcontainer.CAP_MKNOD: capability.CAP_MKNOD, + libcontainer.CAP_AUDIT_WRITE: capability.CAP_AUDIT_WRITE, + libcontainer.CAP_AUDIT_CONTROL: capability.CAP_AUDIT_CONTROL, + libcontainer.CAP_MAC_OVERRIDE: capability.CAP_MAC_OVERRIDE, + libcontainer.CAP_MAC_ADMIN: capability.CAP_MAC_ADMIN, +} + +// DropCapabilities drops capabilities for the current process based +// on the container's configuration. +func DropCapabilities(container *libcontainer.Container) error { + if drop := getCapabilities(container); len(drop) > 0 { + c, err := capability.NewPid(os.Getpid()) + if err != nil { + return err + } + c.Unset(capability.CAPS|capability.BOUNDS, drop...) + + if err := c.Apply(capability.CAPS | capability.BOUNDS); err != nil { + return err + } + } + return nil +} + +func getCapabilities(container *libcontainer.Container) []capability.Cap { + drop := []capability.Cap{} + for _, c := range container.Capabilities { + drop = append(drop, capMap[c]) + } + return drop +} diff --git a/libcontainer/cli/main.go b/libcontainer/cli/main.go new file mode 100644 index 0000000..490135e --- /dev/null +++ b/libcontainer/cli/main.go @@ -0,0 +1,171 @@ +package main + +import ( + "encoding/json" + "flag" + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/namespaces" + "github.com/dotcloud/docker/pkg/libcontainer/network" + "github.com/dotcloud/docker/pkg/libcontainer/utils" + "os" +) + +var ( + displayPid bool + newCommand string + usrNet bool +) + +func init() { + flag.BoolVar(&displayPid, "pid", false, "display the pid before waiting") + flag.StringVar(&newCommand, "cmd", "/bin/bash", "command to run in the existing namespace") + flag.BoolVar(&usrNet, "net", false, "user a net namespace") + flag.Parse() +} + +func exec(container *libcontainer.Container) error { + var ( + netFile *os.File + err error + ) + container.NetNsFd = 0 + + if usrNet { + netFile, err = os.Open("/root/nsroot/test") + if err != nil { + return err + } + container.NetNsFd = netFile.Fd() + } + + pid, err := namespaces.Exec(container) + if err != nil { + return fmt.Errorf("error exec container %s", err) + } + + if displayPid { + fmt.Println(pid) + } + + exitcode, err := utils.WaitOnPid(pid) + if err != nil { + return fmt.Errorf("error waiting on child %s", err) + } + fmt.Println(exitcode) + if usrNet { + netFile.Close() + if err := network.DeleteNetworkNamespace("/root/nsroot/test"); err != nil { + return err + } + } + os.Exit(exitcode) + return nil +} + +func execIn(container *libcontainer.Container) error { + f, err := os.Open("/root/nsroot/test") + if err != nil { + return err + } + container.NetNsFd = f.Fd() + pid, err := namespaces.ExecIn(container, &libcontainer.Command{ + Env: container.Command.Env, + Args: []string{ + newCommand, + }, + }) + if err != nil { + return fmt.Errorf("error exexin container %s", err) + } + exitcode, err := utils.WaitOnPid(pid) + if err != nil { + return fmt.Errorf("error waiting on child %s", err) + } + os.Exit(exitcode) + return nil +} + +func createNet(config *libcontainer.Network) error { + root := "/root/nsroot" + if err := network.SetupNamespaceMountDir(root); err != nil { + return err + } + + nspath := root + "/test" + if err := network.CreateNetworkNamespace(nspath); err != nil { + return nil + } + if err := network.CreateVethPair("veth0", config.TempVethName); err != nil { + return err + } + if err := network.SetInterfaceMaster("veth0", config.Bridge); err != nil { + return err + } + if err := network.InterfaceUp("veth0"); err != nil { + return err + } + + f, err := os.Open(nspath) + if err != nil { + return err + } + defer f.Close() + + if err := network.SetInterfaceInNamespaceFd("veth1", int(f.Fd())); err != nil { + return err + } + + /* + if err := network.SetupVethInsideNamespace(f.Fd(), config); err != nil { + return err + } + */ + return nil +} + +func printErr(err error) { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) +} + +func main() { + var ( + err error + cliCmd = flag.Arg(0) + config = flag.Arg(1) + ) + f, err := os.Open(config) + if err != nil { + printErr(err) + } + + dec := json.NewDecoder(f) + var container *libcontainer.Container + + if err := dec.Decode(&container); err != nil { + printErr(err) + } + f.Close() + + switch cliCmd { + case "exec": + err = exec(container) + case "execin": + err = execIn(container) + case "net": + err = createNet(&libcontainer.Network{ + TempVethName: "veth1", + IP: "172.17.0.100/16", + Gateway: "172.17.42.1", + Mtu: 1500, + Bridge: "docker0", + }) + default: + err = fmt.Errorf("command not supported: %s", cliCmd) + } + + if err != nil { + printErr(err) + } +} diff --git a/libcontainer/container.go b/libcontainer/container.go new file mode 100644 index 0000000..b77890f --- /dev/null +++ b/libcontainer/container.go @@ -0,0 +1,27 @@ +package libcontainer + +type Container struct { + ID string `json:"id,omitempty"` + NsPid int `json:"namespace_pid,omitempty"` + Command *Command `json:"command,omitempty"` + RootFs string `json:"rootfs,omitempty"` + ReadonlyFs bool `json:"readonly_fs,omitempty"` + NetNsFd uintptr `json:"network_namespace_fd,omitempty"` + User string `json:"user,omitempty"` + WorkingDir string `json:"working_dir,omitempty"` + Namespaces Namespaces `json:"namespaces,omitempty"` + Capabilities Capabilities `json:"capabilities,omitempty"` +} + +type Command struct { + Args []string `json:"args,omitempty"` + Env []string `json:"environment,omitempty"` +} + +type Network struct { + TempVethName string `json:"temp_veth,omitempty"` + IP string `json:"ip,omitempty"` + Gateway string `json:"gateway,omitempty"` + Bridge string `json:"bridge,omitempty"` + Mtu int `json:"mtu,omitempty"` +} diff --git a/libcontainer/container.json b/libcontainer/container.json new file mode 100644 index 0000000..ed8eb1b --- /dev/null +++ b/libcontainer/container.json @@ -0,0 +1,38 @@ +{ + "id": "koye", + "namespace_pid": 3117, + "command": { + "args": [ + "/bin/bash" + ], + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm" + ] + }, + "rootfs": "/root/main/mycontainer", + "namespaces": [ + "NEWIPC", + "NEWNS", + "NEWPID", + "NEWUTS" + ], + "capabilities": [ + "SETPCAP", + "SYS_MODULE", + "SYS_RAWIO", + "SYS_PACCT", + "SYS_ADMIN", + "SYS_NICE", + "SYS_RESOURCE", + "SYS_TIME", + "SYS_TTY_CONFIG", + "MKNOD", + "AUDIT_WRITE", + "AUDIT_CONTROL", + "MAC_OVERRIDE", + "MAC_ADMIN" + ] +} diff --git a/libcontainer/errors.go b/libcontainer/errors.go new file mode 100644 index 0000000..c6964ee --- /dev/null +++ b/libcontainer/errors.go @@ -0,0 +1,9 @@ +package libcontainer + +import ( + "errors" +) + +var ( + ErrInvalidPid = errors.New("no ns pid found") +) diff --git a/libcontainer/namespaces/calls_linux.go b/libcontainer/namespaces/calls_linux.go new file mode 100644 index 0000000..793e940 --- /dev/null +++ b/libcontainer/namespaces/calls_linux.go @@ -0,0 +1,164 @@ +package namespaces + +import ( + "fmt" + "os" + "syscall" + "unsafe" +) + +const ( + TIOCGPTN = 0x80045430 + TIOCSPTLCK = 0x40045431 +) + +func chroot(dir string) error { + return syscall.Chroot(dir) +} + +func chdir(dir string) error { + return syscall.Chdir(dir) +} + +func exec(cmd string, args []string, env []string) error { + return syscall.Exec(cmd, args, env) +} + +func fork() (int, error) { + syscall.ForkLock.Lock() + pid, _, err := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) + syscall.ForkLock.Unlock() + if err != 0 { + return -1, err + } + return int(pid), nil +} + +func vfork() (int, error) { + syscall.ForkLock.Lock() + pid, _, err := syscall.Syscall(syscall.SYS_VFORK, 0, 0, 0) + syscall.ForkLock.Unlock() + if err != 0 { + return -1, err + } + return int(pid), nil +} + +func mount(source, target, fstype string, flags uintptr, data string) error { + return syscall.Mount(source, target, fstype, flags, data) +} + +func unmount(target string, flags int) error { + return syscall.Unmount(target, flags) +} + +func pivotroot(newroot, putold string) error { + return syscall.PivotRoot(newroot, putold) +} + +func unshare(flags int) error { + return syscall.Unshare(flags) +} + +func clone(flags uintptr) (int, error) { + syscall.ForkLock.Lock() + pid, _, err := syscall.RawSyscall(syscall.SYS_CLONE, flags, 0, 0) + syscall.ForkLock.Unlock() + if err != 0 { + return -1, err + } + return int(pid), nil +} + +func setns(fd uintptr, flags uintptr) error { + _, _, err := syscall.RawSyscall(SYS_SETNS, fd, flags, 0) + if err != 0 { + return err + } + return nil +} + +func usetCloseOnExec(fd uintptr) error { + if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, fd, syscall.F_SETFD, 0); err != 0 { + return err + } + return nil +} + +func setgroups(gids []int) error { + return syscall.Setgroups(gids) +} + +func setresgid(rgid, egid, sgid int) error { + return syscall.Setresgid(rgid, egid, sgid) +} + +func setresuid(ruid, euid, suid int) error { + return syscall.Setresuid(ruid, euid, suid) +} + +func sethostname(name string) error { + return syscall.Sethostname([]byte(name)) +} + +func setsid() (int, error) { + return syscall.Setsid() +} + +func ioctl(fd uintptr, flag, data uintptr) error { + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 { + return err + } + return nil +} + +func openpmtx() (*os.File, error) { + return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) +} + +func unlockpt(f *os.File) error { + var u int + return ioctl(f.Fd(), TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) +} + +func ptsname(f *os.File) (string, error) { + var n int + if err := ioctl(f.Fd(), TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { + return "", err + } + return fmt.Sprintf("/dev/pts/%d", n), nil +} + +func closefd(fd uintptr) error { + return syscall.Close(int(fd)) +} + +func dup2(fd1, fd2 uintptr) error { + return syscall.Dup2(int(fd1), int(fd2)) +} + +func mknod(path string, mode uint32, dev int) error { + return syscall.Mknod(path, mode, dev) +} + +func parentDeathSignal() error { + if _, _, err := syscall.RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0, 0, 0, 0); err != 0 { + return err + } + return nil +} + +func setctty() error { + if _, _, err := syscall.RawSyscall(syscall.SYS_IOCTL, 0, uintptr(syscall.TIOCSCTTY), 0); err != 0 { + return err + } + return nil +} + +func mkfifo(name string, mode uint32) error { + return syscall.Mkfifo(name, mode) +} + +func umask(mask int) int { + return syscall.Umask(mask) +} diff --git a/libcontainer/namespaces/exec.go b/libcontainer/namespaces/exec.go new file mode 100644 index 0000000..893b302 --- /dev/null +++ b/libcontainer/namespaces/exec.go @@ -0,0 +1,266 @@ +/* + Higher level convience functions for setting up a container +*/ + +package namespaces + +import ( + "errors" + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/capabilities" + "github.com/dotcloud/docker/pkg/libcontainer/utils" + "io" + "log" + "os" + "path/filepath" + "syscall" +) + +var ( + ErrExistingNetworkNamespace = errors.New("specified both CLONE_NEWNET and an existing network namespace") +) + +// Exec will spawn new namespaces with the specified Container configuration +// in the RootFs path and return the pid of the new containerized process. +// +// If an existing network namespace is specified the container +// will join that namespace. If an existing network namespace is not specified but CLONE_NEWNET is, +// the container will be spawned with a new network namespace with no configuration. Omiting an +// existing network namespace and the CLONE_NEWNET option in the container configuration will allow +// the container to the the host's networking options and configuration. +func Exec(container *libcontainer.Container) (pid int, err error) { + // a user cannot pass CLONE_NEWNET and an existing net namespace fd to join + if container.NetNsFd > 0 && container.Namespaces.Contains(libcontainer.CLONE_NEWNET) { + return -1, ErrExistingNetworkNamespace + } + + rootfs, err := resolveRootfs(container) + if err != nil { + return -1, err + } + + master, console, err := createMasterAndConsole() + if err != nil { + return -1, err + } + + logger, err := os.OpenFile("/root/logs", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0755) + if err != nil { + return -1, err + } + log.SetOutput(logger) + + // we need CLONE_VFORK so we can wait on the child + flag := getNamespaceFlags(container.Namespaces) | CLONE_VFORK + + if pid, err = clone(uintptr(flag | SIGCHLD)); err != nil { + return -1, fmt.Errorf("error cloning process: %s", err) + } + + if pid == 0 { + // welcome to your new namespace ;) + // + // any errors encoutered inside the namespace we should write + // out to a log or a pipe to our parent and exit(1) + // because writing to stderr will not work after we close + if err := closeMasterAndStd(master); err != nil { + writeError("close master and std %s", err) + } + slave, err := openTerminal(console, syscall.O_RDWR) + if err != nil { + writeError("open terminal %s", err) + } + if err := dupSlave(slave); err != nil { + writeError("dup2 slave %s", err) + } + + if container.NetNsFd > 0 { + if err := JoinExistingNamespace(container.NetNsFd, libcontainer.CLONE_NEWNET); err != nil { + writeError("join existing net namespace %s", err) + } + } + + if _, err := setsid(); err != nil { + writeError("setsid %s", err) + } + if err := setctty(); err != nil { + writeError("setctty %s", err) + } + if err := parentDeathSignal(); err != nil { + writeError("parent deth signal %s", err) + } + if err := SetupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { + writeError("setup mount namespace %s", err) + } + if err := sethostname(container.ID); err != nil { + writeError("sethostname %s", err) + } + if err := capabilities.DropCapabilities(container); err != nil { + writeError("drop capabilities %s", err) + } + if err := setupUser(container); err != nil { + writeError("setup user %s", err) + } + if container.WorkingDir != "" { + if err := chdir(container.WorkingDir); err != nil { + writeError("chdir to %s %s", container.WorkingDir, err) + } + } + if err := exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + writeError("exec %s", err) + } + panic("unreachable") + } + + go func() { + if _, err := io.Copy(os.Stdout, master); err != nil { + log.Println(err) + } + }() + go func() { + if _, err := io.Copy(master, os.Stdin); err != nil { + log.Println(err) + } + }() + return pid, nil +} + +// ExecIn will spawn a new command inside an existing container's namespaces. The existing container's +// pid and namespace configuration is needed along with the specific capabilities that should +// be dropped once inside the namespace. +func ExecIn(container *libcontainer.Container, cmd *libcontainer.Command) (int, error) { + if container.NsPid <= 0 { + return -1, libcontainer.ErrInvalidPid + } + + fds, err := getNsFds(container) + if err != nil { + return -1, err + } + + if container.NetNsFd > 0 { + fds = append(fds, container.NetNsFd) + } + + pid, err := fork() + if err != nil { + for _, fd := range fds { + syscall.Close(int(fd)) + } + return -1, err + } + + if pid == 0 { + for _, fd := range fds { + if fd > 0 { + if err := JoinExistingNamespace(fd, ""); err != nil { + for _, fd := range fds { + syscall.Close(int(fd)) + } + writeError("join existing namespace for %d %s", fd, err) + } + } + syscall.Close(int(fd)) + } + + if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && + container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { + // important: + // + // we need to fork and unshare so that re can remount proc and sys within + // the namespace so the CLONE_NEWPID namespace will take effect + // if we don't fork we would end up unmounting proc and sys for the entire + // namespace + child, err := fork() + if err != nil { + writeError("fork child %s", err) + } + + if child == 0 { + if err := unshare(CLONE_NEWNS); err != nil { + writeError("unshare newns %s", err) + } + if err := remountProc(); err != nil { + writeError("remount proc %s", err) + } + if err := remountSys(); err != nil { + writeError("remount sys %s", err) + } + if err := capabilities.DropCapabilities(container); err != nil { + writeError("drop caps %s", err) + } + if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { + writeError("exec %s", err) + } + panic("unreachable") + } + exit, err := utils.WaitOnPid(child) + if err != nil { + writeError("wait on child %s", err) + } + os.Exit(exit) + } + if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { + writeError("exec %s", err) + } + panic("unreachable") + } + return pid, err +} + +func resolveRootfs(container *libcontainer.Container) (string, error) { + rootfs, err := filepath.Abs(container.RootFs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} + +func createMasterAndConsole() (*os.File, string, error) { + master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) + if err != nil { + return nil, "", err + } + + console, err := ptsname(master) + if err != nil { + return nil, "", err + } + + if err := unlockpt(master); err != nil { + return nil, "", err + } + return master, console, nil +} + +func closeMasterAndStd(master *os.File) error { + closefd(master.Fd()) + closefd(0) + closefd(1) + closefd(2) + + return nil +} + +func dupSlave(slave *os.File) error { + // we close Stdin,etc so our pty slave should have fd 0 + if slave.Fd() != 0 { + return fmt.Errorf("slave fd not 0 %d", slave.Fd()) + } + if err := dup2(slave.Fd(), 1); err != nil { + return err + } + if err := dup2(slave.Fd(), 2); err != nil { + return err + } + return nil +} + +func openTerminal(name string, flag int) (*os.File, error) { + r, e := syscall.Open(name, flag, 0) + if e != nil { + return nil, &os.PathError{"open", name, e} + } + return os.NewFile(uintptr(r), name), nil +} diff --git a/libcontainer/namespaces/linux_x86_64.go b/libcontainer/namespaces/linux_x86_64.go new file mode 100644 index 0000000..ac9a014 --- /dev/null +++ b/libcontainer/namespaces/linux_x86_64.go @@ -0,0 +1,7 @@ +// +build linux,x86_64 +package namespaces + +// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 +const ( + SYS_SETNS = 308 +) diff --git a/libcontainer/namespaces/mount.go b/libcontainer/namespaces/mount.go new file mode 100644 index 0000000..6d867c9 --- /dev/null +++ b/libcontainer/namespaces/mount.go @@ -0,0 +1,207 @@ +package namespaces + +import ( + "fmt" + "log" + "os" + "path/filepath" + "syscall" +) + +var ( + // default mount point options + defaults = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV +) + +func SetupNewMountNamespace(rootfs, console string, readonly bool) error { + if err := mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mounting / as slave %s", err) + } + + if err := mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mouting %s as bind %s", rootfs, err) + } + + if readonly { + if err := mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mounting %s as readonly %s", rootfs, err) + } + } + + if err := mountSystem(rootfs); err != nil { + return fmt.Errorf("mount system %s", err) + } + + if err := copyDevNodes(rootfs); err != nil { + return fmt.Errorf("copy dev nodes %s", err) + } + + ptmx := filepath.Join(rootfs, "dev/ptmx") + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Symlink(filepath.Join(rootfs, "pts/ptmx"), ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + + if err := setupDev(rootfs); err != nil { + return err + } + + if err := setupConsole(rootfs, console); err != nil { + return err + } + + if err := chdir(rootfs); err != nil { + return fmt.Errorf("chdir into %s %s", rootfs, err) + } + + if err := mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { + return fmt.Errorf("mount move %s into / %s", rootfs, err) + } + + if err := chroot("."); err != nil { + return fmt.Errorf("chroot . %s", err) + } + + if err := chdir("/"); err != nil { + return fmt.Errorf("chdir / %s", err) + } + + umask(0022) + + return nil +} + +func copyDevNodes(rootfs string) error { + umask(0000) + + for _, node := range []string{ + "null", + "zero", + "full", + "random", + "urandom", + "tty", + } { + stat, err := os.Stat(filepath.Join("/dev", node)) + if err != nil { + return err + } + + var ( + dest = filepath.Join(rootfs, "dev", node) + st = stat.Sys().(*syscall.Stat_t) + ) + + log.Printf("copy %s to %s %d\n", node, dest, st.Rdev) + if err := mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { + return fmt.Errorf("copy %s %s", node, err) + } + } + return nil +} + +func setupDev(rootfs string) error { + for _, link := range []struct { + from string + to string + }{ + {"/proc/kcore", "/dev/core"}, + {"/proc/self/fd", "/dev/fd"}, + {"/proc/self/fd/0", "/dev/stdin"}, + {"/proc/self/fd/1", "/dev/stdout"}, + {"/proc/self/fd/2", "/dev/stderr"}, + } { + dest := filepath.Join(rootfs, link.to) + if err := os.Remove(dest); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("remove %s %s", dest, err) + } + if err := os.Symlink(link.from, dest); err != nil { + return fmt.Errorf("symlink %s %s", dest, err) + } + } + return nil +} + +func setupConsole(rootfs, console string) error { + umask(0000) + + stat, err := os.Stat(console) + if err != nil { + return fmt.Errorf("stat console %s %s", console, err) + } + st := stat.Sys().(*syscall.Stat_t) + + dest := filepath.Join(rootfs, "dev/console") + if err := os.Remove(dest); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("remove %s %s", dest, err) + } + + if err := os.Chmod(console, 0600); err != nil { + return err + } + if err := os.Chown(console, 0, 0); err != nil { + return err + } + + if err := mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { + return fmt.Errorf("mknod %s %s", dest, err) + } + + if err := mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { + return fmt.Errorf("bind %s to %s %s", console, dest, err) + } + return nil +} + +// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts +// inside the mount namespace +func mountSystem(rootfs string) error { + mounts := []struct { + source string + path string + device string + flags int + data string + }{ + {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaults}, + {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaults}, + {source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: "mode=755"}, + {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaults, data: "mode=1777"}, + {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"}, + {source: "tmpfs", path: filepath.Join(rootfs, "run"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_STRICTATIME, data: "mode=755"}, + } + for _, m := range mounts { + if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { + return fmt.Errorf("mkdirall %s %s", m.path, err) + } + if err := mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { + return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err) + } + } + return nil +} + +func remountProc() error { + if err := unmount("/proc", syscall.MNT_DETACH); err != nil { + return err + } + if err := mount("proc", "/proc", "proc", uintptr(defaults), ""); err != nil { + return err + } + return nil +} + +func remountSys() error { + if err := unmount("/sys", syscall.MNT_DETACH); err != nil { + if err != syscall.EINVAL { + return err + } + } else { + if err := mount("sysfs", "/sys", "sysfs", uintptr(defaults), ""); err != nil { + return err + } + } + return nil +} diff --git a/libcontainer/namespaces/namespaces.go b/libcontainer/namespaces/namespaces.go new file mode 100644 index 0000000..2a50847 --- /dev/null +++ b/libcontainer/namespaces/namespaces.go @@ -0,0 +1,70 @@ +/* + TODO + pivot root + cgroups + more mount stuff that I probably am forgetting + apparmor +*/ + +package namespaces + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/utils" + "os" + "path/filepath" + "syscall" +) + +// CreateNewNamespace creates a new namespace and binds it's fd to the specified path +func CreateNewNamespace(namespace libcontainer.Namespace, bindTo string) error { + var ( + flag = namespaceMap[namespace] + name = namespaceFileMap[namespace] + nspath = filepath.Join("/proc/self/ns", name) + ) + // TODO: perform validation on name and flag + + pid, err := fork() + if err != nil { + return err + } + + if pid == 0 { + if err := unshare(flag); err != nil { + writeError("unshare %s", err) + } + if err := mount(nspath, bindTo, "none", syscall.MS_BIND, ""); err != nil { + writeError("bind mount %s", err) + } + os.Exit(0) + } + exit, err := utils.WaitOnPid(pid) + if err != nil { + return err + } + if exit != 0 { + return fmt.Errorf("exit status %d", exit) + } + return err +} + +// JoinExistingNamespace uses the fd of an existing linux namespace and +// has the current process join that namespace or the spacespace specified by ns +func JoinExistingNamespace(fd uintptr, ns libcontainer.Namespace) error { + flag := namespaceMap[ns] + if err := setns(fd, uintptr(flag)); err != nil { + return err + } + return nil +} + +// getNamespaceFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare, and setns +func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { + for _, ns := range namespaces { + flag |= namespaceMap[ns] + } + return +} diff --git a/libcontainer/namespaces/ns_linux.go b/libcontainer/namespaces/ns_linux.go new file mode 100644 index 0000000..b0e5119 --- /dev/null +++ b/libcontainer/namespaces/ns_linux.go @@ -0,0 +1,35 @@ +package namespaces + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" +) + +const ( + SIGCHLD = 0x14 + CLONE_VFORK = 0x00004000 + CLONE_NEWNS = 0x00020000 + CLONE_NEWUTS = 0x04000000 + CLONE_NEWIPC = 0x08000000 + CLONE_NEWUSER = 0x10000000 + CLONE_NEWPID = 0x20000000 + CLONE_NEWNET = 0x40000000 +) + +var namespaceMap = map[libcontainer.Namespace]int{ + "": 0, + libcontainer.CLONE_NEWNS: CLONE_NEWNS, + libcontainer.CLONE_NEWUTS: CLONE_NEWUTS, + libcontainer.CLONE_NEWIPC: CLONE_NEWIPC, + libcontainer.CLONE_NEWUSER: CLONE_NEWUSER, + libcontainer.CLONE_NEWPID: CLONE_NEWPID, + libcontainer.CLONE_NEWNET: CLONE_NEWNET, +} + +var namespaceFileMap = map[libcontainer.Namespace]string{ + libcontainer.CLONE_NEWNS: "mnt", + libcontainer.CLONE_NEWUTS: "uts", + libcontainer.CLONE_NEWIPC: "ipc", + libcontainer.CLONE_NEWUSER: "user", + libcontainer.CLONE_NEWPID: "pid", + libcontainer.CLONE_NEWNET: "net", +} diff --git a/libcontainer/namespaces/utils.go b/libcontainer/namespaces/utils.go new file mode 100644 index 0000000..438d896 --- /dev/null +++ b/libcontainer/namespaces/utils.go @@ -0,0 +1,108 @@ +package namespaces + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "os" + "path/filepath" + "strconv" + "strings" + "syscall" +) + +func addEnvIfNotSet(container *libcontainer.Container, key, value string) { + jv := fmt.Sprintf("%s=%s", key, value) + if len(container.Command.Env) == 0 { + container.Command.Env = []string{jv} + return + } + + for _, v := range container.Command.Env { + parts := strings.Split(v, "=") + if parts[0] == key { + return + } + } + container.Command.Env = append(container.Command.Env, jv) +} + +// print and error to stderr and exit(1) +func writeError(format string, v ...interface{}) { + fmt.Fprintf(os.Stderr, format, v...) + os.Exit(1) +} + +// getNsFds inspects the container's namespace configuration and opens the fds to +// each of the namespaces. +func getNsFds(container *libcontainer.Container) ([]uintptr, error) { + var ( + namespaces = []string{} + fds = []uintptr{} + ) + + for _, ns := range container.Namespaces { + namespaces = append(namespaces, namespaceFileMap[ns]) + } + + for _, ns := range namespaces { + fd, err := getNsFd(container.NsPid, ns) + if err != nil { + for _, fd = range fds { + syscall.Close(int(fd)) + } + return nil, err + } + fds = append(fds, fd) + } + return fds, nil +} + +// getNsFd returns the fd for a specific pid and namespace option +func getNsFd(pid int, ns string) (uintptr, error) { + nspath := filepath.Join("/proc", strconv.Itoa(pid), "ns", ns) + // OpenFile adds closOnExec + f, err := os.OpenFile(nspath, os.O_RDONLY, 0666) + if err != nil { + return 0, err + } + return f.Fd(), nil +} + +// setupEnvironment adds additional environment variables to the container's +// Command such as USER, LOGNAME, container, and TERM +func setupEnvironment(container *libcontainer.Container) { + addEnvIfNotSet(container, "container", "docker") + // TODO: check if pty + addEnvIfNotSet(container, "TERM", "xterm") + // TODO: get username from container + addEnvIfNotSet(container, "USER", "root") + addEnvIfNotSet(container, "LOGNAME", "root") +} + +func setupUser(container *libcontainer.Container) error { + // TODO: honor user passed on container + if err := setgroups(nil); err != nil { + return err + } + if err := setresgid(0, 0, 0); err != nil { + return err + } + if err := setresuid(0, 0, 0); err != nil { + return err + } + return nil +} + +func getMasterAndConsole(container *libcontainer.Container) (string, *os.File, error) { + master, err := openpmtx() + if err != nil { + return "", nil, err + } + + console, err := ptsname(master) + if err != nil { + master.Close() + return "", nil, err + } + return console, master, nil +} diff --git a/libcontainer/network/network.go b/libcontainer/network/network.go new file mode 100644 index 0000000..31c5d32 --- /dev/null +++ b/libcontainer/network/network.go @@ -0,0 +1,104 @@ +package network + +import ( + "errors" + "github.com/dotcloud/docker/pkg/netlink" + "net" +) + +var ( + ErrNoDefaultRoute = errors.New("no default network route found") +) + +func InterfaceUp(name string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkLinkUp(iface) +} + +func InterfaceDown(name string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkLinkDown(iface) +} + +func ChangeInterfaceName(old, newName string) error { + iface, err := net.InterfaceByName(old) + if err != nil { + return err + } + return netlink.NetworkChangeName(iface, newName) +} + +func CreateVethPair(name1, name2 string) error { + return netlink.NetworkCreateVethPair(name1, name2) +} + +func SetInterfaceInNamespacePid(name string, nsPid int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetNsPid(iface, nsPid) +} + +func SetInterfaceInNamespaceFd(name string, fd int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetNsFd(iface, fd) +} + +func SetInterfaceMaster(name, master string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + masterIface, err := net.InterfaceByName(master) + if err != nil { + return err + } + return netlink.NetworkSetMaster(iface, masterIface) +} + +func SetDefaultGateway(ip string) error { + return netlink.AddDefaultGw(net.ParseIP(ip)) +} + +func SetInterfaceIp(name string, rawIp string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + ip, ipNet, err := net.ParseCIDR(rawIp) + if err != nil { + return err + } + return netlink.NetworkLinkAddIp(iface, ip, ipNet) +} + +func SetMtu(name string, mtu int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetMTU(iface, mtu) +} + +func GetDefaultMtu() (int, error) { + routes, err := netlink.NetworkGetRoutes() + if err != nil { + return -1, err + } + for _, r := range routes { + if r.Default { + return r.Iface.MTU, nil + } + } + return -1, ErrNoDefaultRoute +} diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go new file mode 100644 index 0000000..dc207b3 --- /dev/null +++ b/libcontainer/network/veth.go @@ -0,0 +1,85 @@ +package network + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/namespaces" + "os" + "syscall" +) + +// SetupVeth sets up an existing network namespace with the specified +// network configuration. +func SetupVeth(config *libcontainer.Network) error { + if err := InterfaceDown(config.TempVethName); err != nil { + return fmt.Errorf("interface down %s %s", config.TempVethName, err) + } + if err := ChangeInterfaceName(config.TempVethName, "eth0"); err != nil { + return fmt.Errorf("change %s to eth0 %s", config.TempVethName, err) + } + if err := SetInterfaceIp("eth0", config.IP); err != nil { + return fmt.Errorf("set eth0 ip %s", err) + } + + if err := SetMtu("eth0", config.Mtu); err != nil { + return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) + } + if err := InterfaceUp("eth0"); err != nil { + return fmt.Errorf("eth0 up %s", err) + } + + if err := SetMtu("lo", config.Mtu); err != nil { + return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) + } + if err := InterfaceUp("lo"); err != nil { + return fmt.Errorf("lo up %s", err) + } + + if config.Gateway != "" { + if err := SetDefaultGateway(config.Gateway); err != nil { + return fmt.Errorf("set gateway to %s %s", config.Gateway, err) + } + } + return nil +} + +// SetupNamespaceMountDir prepares a new root for use as a mount +// source for bind mounting namespace fd to an outside path +func SetupNamespaceMountDir(root string) error { + if err := os.MkdirAll(root, 0666); err != nil { + return err + } + // make sure mounts are not unmounted by other mnt namespaces + if err := syscall.Mount("", root, "none", syscall.MS_SHARED|syscall.MS_REC, ""); err != nil && err != syscall.EINVAL { + return err + } + if err := syscall.Mount(root, root, "none", syscall.MS_BIND, ""); err != nil { + return err + } + return nil +} + +// CreateNetworkNamespace creates a new network namespace and binds it's fd +// at the binding path +func CreateNetworkNamespace(bindingPath string) error { + f, err := os.OpenFile(bindingPath, os.O_RDONLY|os.O_CREATE|os.O_EXCL, 0) + if err != nil { + return err + } + f.Close() + + if err := namespaces.CreateNewNamespace(libcontainer.CLONE_NEWNET, bindingPath); err != nil { + return err + } + return nil +} + +// DeleteNetworkNamespace unmounts the binding path and removes the +// file so that no references to the fd are present and the network +// namespace is automatically cleaned up +func DeleteNetworkNamespace(bindingPath string) error { + if err := syscall.Unmount(bindingPath, 0); err != nil { + return err + } + return os.Remove(bindingPath) +} diff --git a/libcontainer/privileged.json b/libcontainer/privileged.json new file mode 100644 index 0000000..be877ad --- /dev/null +++ b/libcontainer/privileged.json @@ -0,0 +1,22 @@ +{ + "id": "koye", + "namespace_pid": 3745, + "command": { + "args": [ + "/usr/lib/systemd/systemd" + ], + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=" + ] + }, + "rootfs": "/root/main/mycontainer", + "namespaces": [ + "NEWIPC", + "NEWNS", + "NEWPID", + "NEWUTS" + ] +} diff --git a/libcontainer/types.go b/libcontainer/types.go new file mode 100644 index 0000000..db1c3b9 --- /dev/null +++ b/libcontainer/types.go @@ -0,0 +1,49 @@ +package libcontainer + +type Namespace string +type Namespaces []Namespace + +func (n Namespaces) Contains(ns Namespace) bool { + for _, nns := range n { + if nns == ns { + return true + } + } + return false +} + +type Capability string +type Capabilities []Capability + +func (c Capabilities) Contains(capp Capability) bool { + for _, cc := range c { + if cc == capp { + return true + } + } + return false +} + +const ( + CAP_SETPCAP Capability = "SETPCAP" + CAP_SYS_MODULE Capability = "SYS_MODULE" + CAP_SYS_RAWIO Capability = "SYS_RAWIO" + CAP_SYS_PACCT Capability = "SYS_PACCT" + CAP_SYS_ADMIN Capability = "SYS_ADMIN" + CAP_SYS_NICE Capability = "SYS_NICE" + CAP_SYS_RESOURCE Capability = "SYS_RESOURCE" + CAP_SYS_TIME Capability = "SYS_TIME" + CAP_SYS_TTY_CONFIG Capability = "SYS_TTY_CONFIG" + CAP_MKNOD Capability = "MKNOD" + CAP_AUDIT_WRITE Capability = "AUDIT_WRITE" + CAP_AUDIT_CONTROL Capability = "AUDIT_CONTROL" + CAP_MAC_OVERRIDE Capability = "MAC_OVERRIDE" + CAP_MAC_ADMIN Capability = "MAC_ADMIN" + + CLONE_NEWNS Namespace = "NEWNS" // mount + CLONE_NEWUTS Namespace = "NEWUTS" // utsname + CLONE_NEWIPC Namespace = "NEWIPC" // ipc + CLONE_NEWUSER Namespace = "NEWUSER" // user + CLONE_NEWPID Namespace = "NEWPID" // pid + CLONE_NEWNET Namespace = "NEWNET" // network +) diff --git a/libcontainer/ubuntu.json b/libcontainer/ubuntu.json new file mode 100644 index 0000000..0a450ae --- /dev/null +++ b/libcontainer/ubuntu.json @@ -0,0 +1,22 @@ +{ + "id": "koye", + "namespace_pid": 3745, + "command": { + "args": [ + "/sbin/init" + ], + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm" + ] + }, + "rootfs": "/var/lib/docker/btrfs/subvolumes/7c0f15df1ad2e2fe04d7a6e079aec17406e9465a6a37dd16cb0dd754fc0167b3", + "namespaces": [ + "NEWIPC", + "NEWNS", + "NEWPID", + "NEWUTS" + ] +} diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go new file mode 100644 index 0000000..7289fec --- /dev/null +++ b/libcontainer/utils/utils.go @@ -0,0 +1,33 @@ +package utils + +import ( + "crypto/rand" + "encoding/hex" + "io" + "os" + "syscall" +) + +func WaitOnPid(pid int) (exitcode int, err error) { + child, err := os.FindProcess(pid) + if err != nil { + return -1, err + } + state, err := child.Wait() + if err != nil { + return -1, err + } + return getExitCode(state), nil +} + +func getExitCode(state *os.ProcessState) int { + return state.Sys().(syscall.WaitStatus).ExitStatus() +} + +func GenerateRandomName(size int) (string, error) { + id := make([]byte, size) + if _, err := io.ReadFull(rand.Reader, id); err != nil { + return "", err + } + return hex.EncodeToString(id), nil +} From b0cdb601b9728975e697a4762b886acc58342b66 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 18 Feb 2014 17:52:06 -0800 Subject: [PATCH 02/64] Make separate nsinit pkg for a dockerinit like init Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/container.go | 2 + libcontainer/namespaces/calls_linux.go | 80 ++++---- libcontainer/namespaces/exec.go | 270 ++++++++----------------- libcontainer/namespaces/mount.go | 36 ++-- libcontainer/namespaces/namespaces.go | 40 +--- libcontainer/namespaces/nsinit/init.go | 140 +++++++++++++ libcontainer/namespaces/utils.go | 24 +-- 7 files changed, 285 insertions(+), 307 deletions(-) create mode 100644 libcontainer/namespaces/nsinit/init.go diff --git a/libcontainer/container.go b/libcontainer/container.go index b77890f..dd5e728 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -11,6 +11,8 @@ type Container struct { WorkingDir string `json:"working_dir,omitempty"` Namespaces Namespaces `json:"namespaces,omitempty"` Capabilities Capabilities `json:"capabilities,omitempty"` + Master uintptr `json:"master"` + Console string `json:"console"` } type Command struct { diff --git a/libcontainer/namespaces/calls_linux.go b/libcontainer/namespaces/calls_linux.go index 793e940..f006d56 100644 --- a/libcontainer/namespaces/calls_linux.go +++ b/libcontainer/namespaces/calls_linux.go @@ -12,19 +12,19 @@ const ( TIOCSPTLCK = 0x40045431 ) -func chroot(dir string) error { +func Chroot(dir string) error { return syscall.Chroot(dir) } -func chdir(dir string) error { +func Chdir(dir string) error { return syscall.Chdir(dir) } -func exec(cmd string, args []string, env []string) error { +func Exec(cmd string, args []string, env []string) error { return syscall.Exec(cmd, args, env) } -func fork() (int, error) { +func Fork() (int, error) { syscall.ForkLock.Lock() pid, _, err := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) syscall.ForkLock.Unlock() @@ -34,33 +34,23 @@ func fork() (int, error) { return int(pid), nil } -func vfork() (int, error) { - syscall.ForkLock.Lock() - pid, _, err := syscall.Syscall(syscall.SYS_VFORK, 0, 0, 0) - syscall.ForkLock.Unlock() - if err != 0 { - return -1, err - } - return int(pid), nil -} - -func mount(source, target, fstype string, flags uintptr, data string) error { +func Mount(source, target, fstype string, flags uintptr, data string) error { return syscall.Mount(source, target, fstype, flags, data) } -func unmount(target string, flags int) error { +func Unmount(target string, flags int) error { return syscall.Unmount(target, flags) } -func pivotroot(newroot, putold string) error { +func Pivotroot(newroot, putold string) error { return syscall.PivotRoot(newroot, putold) } -func unshare(flags int) error { +func Unshare(flags int) error { return syscall.Unshare(flags) } -func clone(flags uintptr) (int, error) { +func Clone(flags uintptr) (int, error) { syscall.ForkLock.Lock() pid, _, err := syscall.RawSyscall(syscall.SYS_CLONE, flags, 0, 0) syscall.ForkLock.Unlock() @@ -70,7 +60,7 @@ func clone(flags uintptr) (int, error) { return int(pid), nil } -func setns(fd uintptr, flags uintptr) error { +func Setns(fd uintptr, flags uintptr) error { _, _, err := syscall.RawSyscall(SYS_SETNS, fd, flags, 0) if err != 0 { return err @@ -78,87 +68,87 @@ func setns(fd uintptr, flags uintptr) error { return nil } -func usetCloseOnExec(fd uintptr) error { +func UsetCloseOnExec(fd uintptr) error { if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, fd, syscall.F_SETFD, 0); err != 0 { return err } return nil } -func setgroups(gids []int) error { +func Setgroups(gids []int) error { return syscall.Setgroups(gids) } -func setresgid(rgid, egid, sgid int) error { +func Setresgid(rgid, egid, sgid int) error { return syscall.Setresgid(rgid, egid, sgid) } -func setresuid(ruid, euid, suid int) error { +func Setresuid(ruid, euid, suid int) error { return syscall.Setresuid(ruid, euid, suid) } -func sethostname(name string) error { +func Sethostname(name string) error { return syscall.Sethostname([]byte(name)) } -func setsid() (int, error) { +func Setsid() (int, error) { return syscall.Setsid() } -func ioctl(fd uintptr, flag, data uintptr) error { +func Unlockpt(f *os.File) error { + var u int + return Ioctl(f.Fd(), TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) +} + +func Ioctl(fd uintptr, flag, data uintptr) error { if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 { return err } return nil } -func openpmtx() (*os.File, error) { - return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) -} - -func unlockpt(f *os.File) error { - var u int - return ioctl(f.Fd(), TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) -} - -func ptsname(f *os.File) (string, error) { +func Ptsname(f *os.File) (string, error) { var n int - if err := ioctl(f.Fd(), TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { + if err := Ioctl(f.Fd(), TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { return "", err } return fmt.Sprintf("/dev/pts/%d", n), nil } -func closefd(fd uintptr) error { +func Openpmtx() (*os.File, error) { + return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) +} + +func Closefd(fd uintptr) error { return syscall.Close(int(fd)) } -func dup2(fd1, fd2 uintptr) error { +func Dup2(fd1, fd2 uintptr) error { return syscall.Dup2(int(fd1), int(fd2)) } -func mknod(path string, mode uint32, dev int) error { +func Mknod(path string, mode uint32, dev int) error { return syscall.Mknod(path, mode, dev) } -func parentDeathSignal() error { +func ParentDeathSignal() error { if _, _, err := syscall.RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0, 0, 0, 0); err != 0 { return err } return nil } -func setctty() error { +func Setctty() error { if _, _, err := syscall.RawSyscall(syscall.SYS_IOCTL, 0, uintptr(syscall.TIOCSCTTY), 0); err != 0 { return err } return nil } -func mkfifo(name string, mode uint32) error { +func Mkfifo(name string, mode uint32) error { return syscall.Mkfifo(name, mode) } -func umask(mask int) int { +func Umask(mask int) int { return syscall.Umask(mask) } diff --git a/libcontainer/namespaces/exec.go b/libcontainer/namespaces/exec.go index 893b302..0077a0b 100644 --- a/libcontainer/namespaces/exec.go +++ b/libcontainer/namespaces/exec.go @@ -8,12 +8,10 @@ import ( "errors" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/libcontainer/capabilities" - "github.com/dotcloud/docker/pkg/libcontainer/utils" "io" "log" "os" - "path/filepath" + "os/exec" "syscall" ) @@ -29,89 +27,31 @@ var ( // the container will be spawned with a new network namespace with no configuration. Omiting an // existing network namespace and the CLONE_NEWNET option in the container configuration will allow // the container to the the host's networking options and configuration. -func Exec(container *libcontainer.Container) (pid int, err error) { +func ExecContainer(container *libcontainer.Container) (pid int, err error) { // a user cannot pass CLONE_NEWNET and an existing net namespace fd to join if container.NetNsFd > 0 && container.Namespaces.Contains(libcontainer.CLONE_NEWNET) { return -1, ErrExistingNetworkNamespace } - rootfs, err := resolveRootfs(container) - if err != nil { - return -1, err - } - master, console, err := createMasterAndConsole() if err != nil { return -1, err } - - logger, err := os.OpenFile("/root/logs", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0755) - if err != nil { - return -1, err - } - log.SetOutput(logger) + container.Console = console + container.Master = master.Fd() // we need CLONE_VFORK so we can wait on the child - flag := getNamespaceFlags(container.Namespaces) | CLONE_VFORK + flag := uintptr(getNamespaceFlags(container.Namespaces) | CLONE_VFORK) - if pid, err = clone(uintptr(flag | SIGCHLD)); err != nil { - return -1, fmt.Errorf("error cloning process: %s", err) - } - - if pid == 0 { - // welcome to your new namespace ;) - // - // any errors encoutered inside the namespace we should write - // out to a log or a pipe to our parent and exit(1) - // because writing to stderr will not work after we close - if err := closeMasterAndStd(master); err != nil { - writeError("close master and std %s", err) - } - slave, err := openTerminal(console, syscall.O_RDWR) - if err != nil { - writeError("open terminal %s", err) - } - if err := dupSlave(slave); err != nil { - writeError("dup2 slave %s", err) - } - - if container.NetNsFd > 0 { - if err := JoinExistingNamespace(container.NetNsFd, libcontainer.CLONE_NEWNET); err != nil { - writeError("join existing net namespace %s", err) - } - } - - if _, err := setsid(); err != nil { - writeError("setsid %s", err) - } - if err := setctty(); err != nil { - writeError("setctty %s", err) - } - if err := parentDeathSignal(); err != nil { - writeError("parent deth signal %s", err) - } - if err := SetupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { - writeError("setup mount namespace %s", err) - } - if err := sethostname(container.ID); err != nil { - writeError("sethostname %s", err) - } - if err := capabilities.DropCapabilities(container); err != nil { - writeError("drop capabilities %s", err) - } - if err := setupUser(container); err != nil { - writeError("setup user %s", err) - } - if container.WorkingDir != "" { - if err := chdir(container.WorkingDir); err != nil { - writeError("chdir to %s %s", container.WorkingDir, err) - } - } - if err := exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { - writeError("exec %s", err) - } - panic("unreachable") + command := exec.Command("/.nsinit") + command.SysProcAttr = &syscall.SysProcAttr{} + command.SysProcAttr.Cloneflags = flag + command.SysProcAttr.Setctty = true + + if err := command.Start(); err != nil { + return -1, err } + pid = command.Process.Pid go func() { if _, err := io.Copy(os.Stdout, master); err != nil { @@ -130,91 +70,86 @@ func Exec(container *libcontainer.Container) (pid int, err error) { // pid and namespace configuration is needed along with the specific capabilities that should // be dropped once inside the namespace. func ExecIn(container *libcontainer.Container, cmd *libcontainer.Command) (int, error) { - if container.NsPid <= 0 { - return -1, libcontainer.ErrInvalidPid - } - - fds, err := getNsFds(container) - if err != nil { - return -1, err - } - - if container.NetNsFd > 0 { - fds = append(fds, container.NetNsFd) - } - - pid, err := fork() - if err != nil { - for _, fd := range fds { - syscall.Close(int(fd)) + return -1, fmt.Errorf("not implemented") + /* + if container.NsPid <= 0 { + return -1, libcontainer.ErrInvalidPid } - return -1, err - } - if pid == 0 { - for _, fd := range fds { - if fd > 0 { - if err := JoinExistingNamespace(fd, ""); err != nil { - for _, fd := range fds { - syscall.Close(int(fd)) + fds, err := getNsFds(container) + if err != nil { + return -1, err + } + + if container.NetNsFd > 0 { + fds = append(fds, container.NetNsFd) + } + + pid, err := fork() + if err != nil { + for _, fd := range fds { + syscall.Close(int(fd)) + } + return -1, err + } + + if pid == 0 { + for _, fd := range fds { + if fd > 0 { + if err := JoinExistingNamespace(fd, ""); err != nil { + for _, fd := range fds { + syscall.Close(int(fd)) + } + writeError("join existing namespace for %d %s", fd, err) } - writeError("join existing namespace for %d %s", fd, err) } - } - syscall.Close(int(fd)) - } - - if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && - container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { - // important: - // - // we need to fork and unshare so that re can remount proc and sys within - // the namespace so the CLONE_NEWPID namespace will take effect - // if we don't fork we would end up unmounting proc and sys for the entire - // namespace - child, err := fork() - if err != nil { - writeError("fork child %s", err) + syscall.Close(int(fd)) } - if child == 0 { - if err := unshare(CLONE_NEWNS); err != nil { - writeError("unshare newns %s", err) + if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && + container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { + // important: + // + // we need to fork and unshare so that re can remount proc and sys within + // the namespace so the CLONE_NEWPID namespace will take effect + // if we don't fork we would end up unmounting proc and sys for the entire + // namespace + child, err := fork() + if err != nil { + writeError("fork child %s", err) } - if err := remountProc(); err != nil { - writeError("remount proc %s", err) - } - if err := remountSys(); err != nil { - writeError("remount sys %s", err) - } - if err := capabilities.DropCapabilities(container); err != nil { - writeError("drop caps %s", err) - } - if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { - writeError("exec %s", err) - } - panic("unreachable") - } - exit, err := utils.WaitOnPid(child) - if err != nil { - writeError("wait on child %s", err) - } - os.Exit(exit) - } - if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { - writeError("exec %s", err) - } - panic("unreachable") - } - return pid, err -} -func resolveRootfs(container *libcontainer.Container) (string, error) { - rootfs, err := filepath.Abs(container.RootFs) - if err != nil { - return "", err - } - return filepath.EvalSymlinks(rootfs) + if child == 0 { + if err := unshare(CLONE_NEWNS); err != nil { + writeError("unshare newns %s", err) + } + if err := remountProc(); err != nil { + writeError("remount proc %s", err) + } + if err := remountSys(); err != nil { + writeError("remount sys %s", err) + } + if err := capabilities.DropCapabilities(container); err != nil { + writeError("drop caps %s", err) + } + if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { + writeError("exec %s", err) + } + panic("unreachable") + } + exit, err := utils.WaitOnPid(child) + if err != nil { + writeError("wait on child %s", err) + } + os.Exit(exit) + } + if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { + writeError("exec %s", err) + } + panic("unreachable") + } + return pid, err + */ } func createMasterAndConsole() (*os.File, string, error) { @@ -223,44 +158,13 @@ func createMasterAndConsole() (*os.File, string, error) { return nil, "", err } - console, err := ptsname(master) + console, err := Ptsname(master) if err != nil { return nil, "", err } - if err := unlockpt(master); err != nil { + if err := Unlockpt(master); err != nil { return nil, "", err } return master, console, nil } - -func closeMasterAndStd(master *os.File) error { - closefd(master.Fd()) - closefd(0) - closefd(1) - closefd(2) - - return nil -} - -func dupSlave(slave *os.File) error { - // we close Stdin,etc so our pty slave should have fd 0 - if slave.Fd() != 0 { - return fmt.Errorf("slave fd not 0 %d", slave.Fd()) - } - if err := dup2(slave.Fd(), 1); err != nil { - return err - } - if err := dup2(slave.Fd(), 2); err != nil { - return err - } - return nil -} - -func openTerminal(name string, flag int) (*os.File, error) { - r, e := syscall.Open(name, flag, 0) - if e != nil { - return nil, &os.PathError{"open", name, e} - } - return os.NewFile(uintptr(r), name), nil -} diff --git a/libcontainer/namespaces/mount.go b/libcontainer/namespaces/mount.go index 6d867c9..8e7c54b 100644 --- a/libcontainer/namespaces/mount.go +++ b/libcontainer/namespaces/mount.go @@ -14,16 +14,16 @@ var ( ) func SetupNewMountNamespace(rootfs, console string, readonly bool) error { - if err := mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + if err := Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting / as slave %s", err) } - if err := mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + if err := Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mouting %s as bind %s", rootfs, err) } if readonly { - if err := mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { + if err := Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting %s as readonly %s", rootfs, err) } } @@ -52,29 +52,29 @@ func SetupNewMountNamespace(rootfs, console string, readonly bool) error { return err } - if err := chdir(rootfs); err != nil { + if err := Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) } - if err := mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { + if err := Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { return fmt.Errorf("mount move %s into / %s", rootfs, err) } - if err := chroot("."); err != nil { + if err := Chroot("."); err != nil { return fmt.Errorf("chroot . %s", err) } - if err := chdir("/"); err != nil { + if err := Chdir("/"); err != nil { return fmt.Errorf("chdir / %s", err) } - umask(0022) + Umask(0022) return nil } func copyDevNodes(rootfs string) error { - umask(0000) + Umask(0000) for _, node := range []string{ "null", @@ -95,7 +95,7 @@ func copyDevNodes(rootfs string) error { ) log.Printf("copy %s to %s %d\n", node, dest, st.Rdev) - if err := mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { + if err := Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { return fmt.Errorf("copy %s %s", node, err) } } @@ -125,7 +125,7 @@ func setupDev(rootfs string) error { } func setupConsole(rootfs, console string) error { - umask(0000) + Umask(0000) stat, err := os.Stat(console) if err != nil { @@ -145,11 +145,11 @@ func setupConsole(rootfs, console string) error { return err } - if err := mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { + if err := Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { return fmt.Errorf("mknod %s %s", dest, err) } - if err := mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { + if err := Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { return fmt.Errorf("bind %s to %s %s", console, dest, err) } return nil @@ -176,7 +176,7 @@ func mountSystem(rootfs string) error { if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { return fmt.Errorf("mkdirall %s %s", m.path, err) } - if err := mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { + if err := Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err) } } @@ -184,22 +184,22 @@ func mountSystem(rootfs string) error { } func remountProc() error { - if err := unmount("/proc", syscall.MNT_DETACH); err != nil { + if err := Unmount("/proc", syscall.MNT_DETACH); err != nil { return err } - if err := mount("proc", "/proc", "proc", uintptr(defaults), ""); err != nil { + if err := Mount("proc", "/proc", "proc", uintptr(defaults), ""); err != nil { return err } return nil } func remountSys() error { - if err := unmount("/sys", syscall.MNT_DETACH); err != nil { + if err := Unmount("/sys", syscall.MNT_DETACH); err != nil { if err != syscall.EINVAL { return err } } else { - if err := mount("sysfs", "/sys", "sysfs", uintptr(defaults), ""); err != nil { + if err := Mount("sysfs", "/sys", "sysfs", uintptr(defaults), ""); err != nil { return err } } diff --git a/libcontainer/namespaces/namespaces.go b/libcontainer/namespaces/namespaces.go index 2a50847..05ef0ac 100644 --- a/libcontainer/namespaces/namespaces.go +++ b/libcontainer/namespaces/namespaces.go @@ -9,52 +9,14 @@ package namespaces import ( - "fmt" "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/libcontainer/utils" - "os" - "path/filepath" - "syscall" ) -// CreateNewNamespace creates a new namespace and binds it's fd to the specified path -func CreateNewNamespace(namespace libcontainer.Namespace, bindTo string) error { - var ( - flag = namespaceMap[namespace] - name = namespaceFileMap[namespace] - nspath = filepath.Join("/proc/self/ns", name) - ) - // TODO: perform validation on name and flag - - pid, err := fork() - if err != nil { - return err - } - - if pid == 0 { - if err := unshare(flag); err != nil { - writeError("unshare %s", err) - } - if err := mount(nspath, bindTo, "none", syscall.MS_BIND, ""); err != nil { - writeError("bind mount %s", err) - } - os.Exit(0) - } - exit, err := utils.WaitOnPid(pid) - if err != nil { - return err - } - if exit != 0 { - return fmt.Errorf("exit status %d", exit) - } - return err -} - // JoinExistingNamespace uses the fd of an existing linux namespace and // has the current process join that namespace or the spacespace specified by ns func JoinExistingNamespace(fd uintptr, ns libcontainer.Namespace) error { flag := namespaceMap[ns] - if err := setns(fd, uintptr(flag)); err != nil { + if err := Setns(fd, uintptr(flag)); err != nil { return err } return nil diff --git a/libcontainer/namespaces/nsinit/init.go b/libcontainer/namespaces/nsinit/init.go new file mode 100644 index 0000000..9a75636 --- /dev/null +++ b/libcontainer/namespaces/nsinit/init.go @@ -0,0 +1,140 @@ +package nsinit + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/capabilities" + "github.com/dotcloud/docker/pkg/libcontainer/namespaces" + "log" + "os" + "path/filepath" + "syscall" +) + +// InitNamespace should be run inside an existing namespace to setup +// common mounts, drop capabilities, and setup network interfaces +func InitNamespace(container *libcontainer.Container) error { + rootfs, err := resolveRootfs(container) + if err != nil { + return err + } + + // any errors encoutered inside the namespace we should write + // out to a log or a pipe to our parent and exit(1) + // because writing to stderr will not work after we close + if err := closeMasterAndStd(container.Master); err != nil { + log.Fatalf("close master and std %s", err) + return err + } + + slave, err := openTerminal(container.Console, syscall.O_RDWR) + if err != nil { + log.Fatalf("open terminal %s", err) + return err + } + if err := dupSlave(slave); err != nil { + log.Fatalf("dup2 slave %s", err) + return err + } + + /* + if container.NetNsFd > 0 { + if err := joinExistingNamespace(container.NetNsFd, libcontainer.CLONE_NEWNET); err != nil { + log.Fatalf("join existing net namespace %s", err) + } + } + */ + + if _, err := namespaces.Setsid(); err != nil { + log.Fatalf("setsid %s", err) + return err + } + if err := namespaces.Setctty(); err != nil { + log.Fatalf("setctty %s", err) + return err + } + if err := namespaces.ParentDeathSignal(); err != nil { + log.Fatalf("parent deth signal %s", err) + return err + } + if err := namespaces.SetupNewMountNamespace(rootfs, container.Console, container.ReadonlyFs); err != nil { + log.Fatalf("setup mount namespace %s", err) + return err + } + if err := namespaces.Sethostname(container.ID); err != nil { + log.Fatalf("sethostname %s", err) + return err + } + if err := capabilities.DropCapabilities(container); err != nil { + log.Fatalf("drop capabilities %s", err) + return err + } + if err := setupUser(container); err != nil { + log.Fatalf("setup user %s", err) + return err + } + if container.WorkingDir != "" { + if err := namespaces.Chdir(container.WorkingDir); err != nil { + log.Fatalf("chdir to %s %s", container.WorkingDir, err) + return err + } + } + if err := namespaces.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + log.Fatalf("exec %s", err) + return err + } + panic("unreachable") +} + +func resolveRootfs(container *libcontainer.Container) (string, error) { + rootfs, err := filepath.Abs(container.RootFs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} + +func closeMasterAndStd(master uintptr) error { + namespaces.Closefd(master) + namespaces.Closefd(0) + namespaces.Closefd(1) + namespaces.Closefd(2) + + return nil +} + +func setupUser(container *libcontainer.Container) error { + // TODO: honor user passed on container + if err := namespaces.Setgroups(nil); err != nil { + return err + } + if err := namespaces.Setresgid(0, 0, 0); err != nil { + return err + } + if err := namespaces.Setresuid(0, 0, 0); err != nil { + return err + } + return nil +} + +func dupSlave(slave *os.File) error { + // we close Stdin,etc so our pty slave should have fd 0 + if slave.Fd() != 0 { + return fmt.Errorf("slave fd not 0 %d", slave.Fd()) + } + if err := namespaces.Dup2(slave.Fd(), 1); err != nil { + return err + } + if err := namespaces.Dup2(slave.Fd(), 2); err != nil { + return err + } + return nil +} + +func openTerminal(name string, flag int) (*os.File, error) { + r, e := syscall.Open(name, flag, 0) + if e != nil { + return nil, &os.PathError{"open", name, e} + } + return os.NewFile(uintptr(r), name), nil +} diff --git a/libcontainer/namespaces/utils.go b/libcontainer/namespaces/utils.go index 438d896..fd195c0 100644 --- a/libcontainer/namespaces/utils.go +++ b/libcontainer/namespaces/utils.go @@ -26,12 +26,6 @@ func addEnvIfNotSet(container *libcontainer.Container, key, value string) { container.Command.Env = append(container.Command.Env, jv) } -// print and error to stderr and exit(1) -func writeError(format string, v ...interface{}) { - fmt.Fprintf(os.Stderr, format, v...) - os.Exit(1) -} - // getNsFds inspects the container's namespace configuration and opens the fds to // each of the namespaces. func getNsFds(container *libcontainer.Container) ([]uintptr, error) { @@ -79,27 +73,13 @@ func setupEnvironment(container *libcontainer.Container) { addEnvIfNotSet(container, "LOGNAME", "root") } -func setupUser(container *libcontainer.Container) error { - // TODO: honor user passed on container - if err := setgroups(nil); err != nil { - return err - } - if err := setresgid(0, 0, 0); err != nil { - return err - } - if err := setresuid(0, 0, 0); err != nil { - return err - } - return nil -} - func getMasterAndConsole(container *libcontainer.Container) (string, *os.File, error) { - master, err := openpmtx() + master, err := Openpmtx() if err != nil { return "", nil, err } - console, err := ptsname(master) + console, err := Ptsname(master) if err != nil { master.Close() return "", nil, err From c2777d46113666f683bfee89960bfefa4da39ff1 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 18 Feb 2014 18:15:41 -0800 Subject: [PATCH 03/64] WIP moving to nsini Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/cli/main.go | 78 +++++++++++++++++++-------------- libcontainer/namespaces/exec.go | 10 +++-- libcontainer/network/veth.go | 16 ------- 3 files changed, 52 insertions(+), 52 deletions(-) diff --git a/libcontainer/cli/main.go b/libcontainer/cli/main.go index 490135e..0430e29 100644 --- a/libcontainer/cli/main.go +++ b/libcontainer/cli/main.go @@ -6,6 +6,7 @@ import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/namespaces" + "github.com/dotcloud/docker/pkg/libcontainer/namespaces/nsinit" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/libcontainer/utils" "os" @@ -15,15 +16,26 @@ var ( displayPid bool newCommand string usrNet bool + masterFd int + console string ) func init() { flag.BoolVar(&displayPid, "pid", false, "display the pid before waiting") flag.StringVar(&newCommand, "cmd", "/bin/bash", "command to run in the existing namespace") flag.BoolVar(&usrNet, "net", false, "user a net namespace") + flag.IntVar(&masterFd, "master", 0, "master fd") + flag.StringVar(&console, "console", "", "console path") flag.Parse() } +func nsinitFunc(container *libcontainer.Container) error { + container.Master = uintptr(masterFd) + container.Console = console + + return nsinit.InitNamespace(container) +} + func exec(container *libcontainer.Container) error { var ( netFile *os.File @@ -39,7 +51,7 @@ func exec(container *libcontainer.Container) error { container.NetNsFd = netFile.Fd() } - pid, err := namespaces.Exec(container) + pid, err := namespaces.ExecContainer(container) if err != nil { return fmt.Errorf("error exec container %s", err) } @@ -87,39 +99,39 @@ func execIn(container *libcontainer.Container) error { } func createNet(config *libcontainer.Network) error { - root := "/root/nsroot" - if err := network.SetupNamespaceMountDir(root); err != nil { - return err - } - - nspath := root + "/test" - if err := network.CreateNetworkNamespace(nspath); err != nil { - return nil - } - if err := network.CreateVethPair("veth0", config.TempVethName); err != nil { - return err - } - if err := network.SetInterfaceMaster("veth0", config.Bridge); err != nil { - return err - } - if err := network.InterfaceUp("veth0"); err != nil { - return err - } - - f, err := os.Open(nspath) - if err != nil { - return err - } - defer f.Close() - - if err := network.SetInterfaceInNamespaceFd("veth1", int(f.Fd())); err != nil { - return err - } - /* - if err := network.SetupVethInsideNamespace(f.Fd(), config); err != nil { + root := "/root/nsroot" + if err := network.SetupNamespaceMountDir(root); err != nil { return err } + + nspath := root + "/test" + if err := network.CreateNetworkNamespace(nspath); err != nil { + return nil + } + if err := network.CreateVethPair("veth0", config.TempVethName); err != nil { + return err + } + if err := network.SetInterfaceMaster("veth0", config.Bridge); err != nil { + return err + } + if err := network.InterfaceUp("veth0"); err != nil { + return err + } + + f, err := os.Open(nspath) + if err != nil { + return err + } + defer f.Close() + + if err := network.SetInterfaceInNamespaceFd("veth1", int(f.Fd())); err != nil { + return err + } + + if err := network.SetupVethInsideNamespace(f.Fd(), config); err != nil { + return err + } */ return nil } @@ -133,7 +145,7 @@ func main() { var ( err error cliCmd = flag.Arg(0) - config = flag.Arg(1) + config = "/root/development/gocode/src/github.com/dotcloud/docker/pkg/libcontainer/container.json" //flag.Arg(1) ) f, err := os.Open(config) if err != nil { @@ -149,6 +161,8 @@ func main() { f.Close() switch cliCmd { + case "init": + err = nsinitFunc(container) case "exec": err = exec(container) case "execin": diff --git a/libcontainer/namespaces/exec.go b/libcontainer/namespaces/exec.go index 0077a0b..93b155b 100644 --- a/libcontainer/namespaces/exec.go +++ b/libcontainer/namespaces/exec.go @@ -12,6 +12,8 @@ import ( "log" "os" "os/exec" + "path/filepath" + "strconv" "syscall" ) @@ -37,16 +39,15 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { if err != nil { return -1, err } - container.Console = console - container.Master = master.Fd() + nsinit := filepath.Join(container.RootFs, ".nsinit") // we need CLONE_VFORK so we can wait on the child flag := uintptr(getNamespaceFlags(container.Namespaces) | CLONE_VFORK) - command := exec.Command("/.nsinit") + command := exec.Command(nsinit, "init", "-master", strconv.Itoa(int(master.Fd())), "-console", console) command.SysProcAttr = &syscall.SysProcAttr{} command.SysProcAttr.Cloneflags = flag - command.SysProcAttr.Setctty = true + // command.SysProcAttr.Setctty = true if err := command.Start(); err != nil { return -1, err @@ -63,6 +64,7 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { log.Println(err) } }() + command.Wait() return pid, nil } diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go index dc207b3..2ecce22 100644 --- a/libcontainer/network/veth.go +++ b/libcontainer/network/veth.go @@ -3,7 +3,6 @@ package network import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/libcontainer/namespaces" "os" "syscall" ) @@ -59,21 +58,6 @@ func SetupNamespaceMountDir(root string) error { return nil } -// CreateNetworkNamespace creates a new network namespace and binds it's fd -// at the binding path -func CreateNetworkNamespace(bindingPath string) error { - f, err := os.OpenFile(bindingPath, os.O_RDONLY|os.O_CREATE|os.O_EXCL, 0) - if err != nil { - return err - } - f.Close() - - if err := namespaces.CreateNewNamespace(libcontainer.CLONE_NEWNET, bindingPath); err != nil { - return err - } - return nil -} - // DeleteNetworkNamespace unmounts the binding path and removes the // file so that no references to the fd are present and the network // namespace is automatically cleaned up From 593219d1914c263f1efaef230fdc51c7996d1c86 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 10:44:29 -0800 Subject: [PATCH 04/64] Use nsinit for setting up namespace Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/cli/main.go | 1 + libcontainer/container.go | 1 + libcontainer/namespaces/exec.go | 4 ++-- libcontainer/namespaces/nsinit/init.go | 13 +++++++++++++ 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/libcontainer/cli/main.go b/libcontainer/cli/main.go index 0430e29..ac0ea29 100644 --- a/libcontainer/cli/main.go +++ b/libcontainer/cli/main.go @@ -32,6 +32,7 @@ func init() { func nsinitFunc(container *libcontainer.Container) error { container.Master = uintptr(masterFd) container.Console = console + container.LogFile = "/root/logs" return nsinit.InitNamespace(container) } diff --git a/libcontainer/container.go b/libcontainer/container.go index dd5e728..c9a3f2e 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -13,6 +13,7 @@ type Container struct { Capabilities Capabilities `json:"capabilities,omitempty"` Master uintptr `json:"master"` Console string `json:"console"` + LogFile string `json:"log_file"` } type Command struct { diff --git a/libcontainer/namespaces/exec.go b/libcontainer/namespaces/exec.go index 93b155b..7f4b4a6 100644 --- a/libcontainer/namespaces/exec.go +++ b/libcontainer/namespaces/exec.go @@ -44,9 +44,10 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { // we need CLONE_VFORK so we can wait on the child flag := uintptr(getNamespaceFlags(container.Namespaces) | CLONE_VFORK) - command := exec.Command(nsinit, "init", "-master", strconv.Itoa(int(master.Fd())), "-console", console) + command := exec.Command(nsinit, "-master", strconv.Itoa(int(master.Fd())), "-console", console, "init") command.SysProcAttr = &syscall.SysProcAttr{} command.SysProcAttr.Cloneflags = flag + command.ExtraFiles = []*os.File{master} // command.SysProcAttr.Setctty = true if err := command.Start(); err != nil { @@ -64,7 +65,6 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { log.Println(err) } }() - command.Wait() return pid, nil } diff --git a/libcontainer/namespaces/nsinit/init.go b/libcontainer/namespaces/nsinit/init.go index 9a75636..ae6159b 100644 --- a/libcontainer/namespaces/nsinit/init.go +++ b/libcontainer/namespaces/nsinit/init.go @@ -14,6 +14,10 @@ import ( // InitNamespace should be run inside an existing namespace to setup // common mounts, drop capabilities, and setup network interfaces func InitNamespace(container *libcontainer.Container) error { + if err := setLogFile(container); err != nil { + return err + } + rootfs, err := resolveRootfs(container) if err != nil { return err @@ -138,3 +142,12 @@ func openTerminal(name string, flag int) (*os.File, error) { } return os.NewFile(uintptr(r), name), nil } + +func setLogFile(container *libcontainer.Container) error { + f, err := os.OpenFile(container.LogFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0655) + if err != nil { + return err + } + log.SetOutput(f) + return nil +} From a304eab9d4da1a57b7592ca4ecf2b0bf51224b81 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Tue, 18 Feb 2014 23:13:36 -0800 Subject: [PATCH 05/64] Improve general quality of libcontainer Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/cli/main.go | 65 ++++++----- libcontainer/container.json | 2 +- libcontainer/namespaces/exec.go | 101 ++---------------- libcontainer/namespaces/mount.go | 44 ++++---- libcontainer/namespaces/namespaces.go | 32 ------ libcontainer/namespaces/ns_linux.go | 9 ++ libcontainer/namespaces/nsinit/init.go | 41 +++---- libcontainer/namespaces/utils.go | 14 --- .../namespaces => system}/calls_linux.go | 37 +------ system/pty_linux.go | 31 ++++++ system/setns_linux.go | 13 +++ system/setns_linux_amd64.go | 8 ++ 12 files changed, 159 insertions(+), 238 deletions(-) delete mode 100644 libcontainer/namespaces/namespaces.go rename {libcontainer/namespaces => system}/calls_linux.go (74%) create mode 100644 system/pty_linux.go create mode 100644 system/setns_linux.go create mode 100644 system/setns_linux_amd64.go diff --git a/libcontainer/cli/main.go b/libcontainer/cli/main.go index ac0ea29..93bb039 100644 --- a/libcontainer/cli/main.go +++ b/libcontainer/cli/main.go @@ -10,6 +10,9 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/libcontainer/utils" "os" + exec_ "os/exec" + "path" + "path/filepath" ) var ( @@ -52,6 +55,18 @@ func exec(container *libcontainer.Container) error { container.NetNsFd = netFile.Fd() } + self, err := exec_.LookPath(os.Args[0]) + if err != nil { + return err + } + if output, err := exec_.Command("cp", self, path.Join(container.RootFs, ".nsinit")).CombinedOutput(); err != nil { + return fmt.Errorf("Error exec cp: %s, (%s)", err, output) + } else { + println(self, container.RootFs) + fmt.Printf("-----> %s\n", output) + } + println("----") + pid, err := namespaces.ExecContainer(container) if err != nil { return fmt.Errorf("error exec container %s", err) @@ -77,25 +92,25 @@ func exec(container *libcontainer.Container) error { } func execIn(container *libcontainer.Container) error { - f, err := os.Open("/root/nsroot/test") - if err != nil { - return err - } - container.NetNsFd = f.Fd() - pid, err := namespaces.ExecIn(container, &libcontainer.Command{ - Env: container.Command.Env, - Args: []string{ - newCommand, - }, - }) - if err != nil { - return fmt.Errorf("error exexin container %s", err) - } - exitcode, err := utils.WaitOnPid(pid) - if err != nil { - return fmt.Errorf("error waiting on child %s", err) - } - os.Exit(exitcode) + // f, err := os.Open("/root/nsroot/test") + // if err != nil { + // return err + // } + // container.NetNsFd = f.Fd() + // pid, err := namespaces.ExecIn(container, &libcontainer.Command{ + // Env: container.Command.Env, + // Args: []string{ + // newCommand, + // }, + // }) + // if err != nil { + // return fmt.Errorf("error exexin container %s", err) + // } + // exitcode, err := utils.WaitOnPid(pid) + // if err != nil { + // return fmt.Errorf("error waiting on child %s", err) + // } + // os.Exit(exitcode) return nil } @@ -143,11 +158,13 @@ func printErr(err error) { } func main() { - var ( - err error - cliCmd = flag.Arg(0) - config = "/root/development/gocode/src/github.com/dotcloud/docker/pkg/libcontainer/container.json" //flag.Arg(1) - ) + cliCmd := flag.Arg(0) + + config, err := filepath.Abs(flag.Arg(1)) + if err != nil { + printErr(err) + } + println("cli:", cliCmd, "config:", config) f, err := os.Open(config) if err != nil { printErr(err) diff --git a/libcontainer/container.json b/libcontainer/container.json index ed8eb1b..6e4fda5 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -12,7 +12,7 @@ "TERM=xterm" ] }, - "rootfs": "/root/main/mycontainer", + "rootfs": "/var/lib/docker/containers/ee76122136d691d63e09d24168a91ddb2ef9fdcf210b4de5c50aa76354892f4b/root", "namespaces": [ "NEWIPC", "NEWNS", diff --git a/libcontainer/namespaces/exec.go b/libcontainer/namespaces/exec.go index 7f4b4a6..ea3d2ca 100644 --- a/libcontainer/namespaces/exec.go +++ b/libcontainer/namespaces/exec.go @@ -6,8 +6,8 @@ package namespaces import ( "errors" - "fmt" "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/system" "io" "log" "os" @@ -44,12 +44,15 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { // we need CLONE_VFORK so we can wait on the child flag := uintptr(getNamespaceFlags(container.Namespaces) | CLONE_VFORK) - command := exec.Command(nsinit, "-master", strconv.Itoa(int(master.Fd())), "-console", console, "init") + command := exec.Command(nsinit, "-master", strconv.Itoa(int(master.Fd())), "-console", console, "init", "container.json") + // command.Stdin = os.Stdin + // command.Stdout = os.Stdout + // command.Stderr = os.Stderr command.SysProcAttr = &syscall.SysProcAttr{} command.SysProcAttr.Cloneflags = flag - command.ExtraFiles = []*os.File{master} - // command.SysProcAttr.Setctty = true + //command.ExtraFiles = []*os.File{master} + println("vvvvvvvvv") if err := command.Start(); err != nil { return -1, err } @@ -68,104 +71,18 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { return pid, nil } -// ExecIn will spawn a new command inside an existing container's namespaces. The existing container's -// pid and namespace configuration is needed along with the specific capabilities that should -// be dropped once inside the namespace. -func ExecIn(container *libcontainer.Container, cmd *libcontainer.Command) (int, error) { - return -1, fmt.Errorf("not implemented") - /* - if container.NsPid <= 0 { - return -1, libcontainer.ErrInvalidPid - } - - fds, err := getNsFds(container) - if err != nil { - return -1, err - } - - if container.NetNsFd > 0 { - fds = append(fds, container.NetNsFd) - } - - pid, err := fork() - if err != nil { - for _, fd := range fds { - syscall.Close(int(fd)) - } - return -1, err - } - - if pid == 0 { - for _, fd := range fds { - if fd > 0 { - if err := JoinExistingNamespace(fd, ""); err != nil { - for _, fd := range fds { - syscall.Close(int(fd)) - } - writeError("join existing namespace for %d %s", fd, err) - } - } - syscall.Close(int(fd)) - } - - if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && - container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { - // important: - // - // we need to fork and unshare so that re can remount proc and sys within - // the namespace so the CLONE_NEWPID namespace will take effect - // if we don't fork we would end up unmounting proc and sys for the entire - // namespace - child, err := fork() - if err != nil { - writeError("fork child %s", err) - } - - if child == 0 { - if err := unshare(CLONE_NEWNS); err != nil { - writeError("unshare newns %s", err) - } - if err := remountProc(); err != nil { - writeError("remount proc %s", err) - } - if err := remountSys(); err != nil { - writeError("remount sys %s", err) - } - if err := capabilities.DropCapabilities(container); err != nil { - writeError("drop caps %s", err) - } - if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { - writeError("exec %s", err) - } - panic("unreachable") - } - exit, err := utils.WaitOnPid(child) - if err != nil { - writeError("wait on child %s", err) - } - os.Exit(exit) - } - if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { - writeError("exec %s", err) - } - panic("unreachable") - } - return pid, err - */ -} - func createMasterAndConsole() (*os.File, string, error) { master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) if err != nil { return nil, "", err } - console, err := Ptsname(master) + console, err := system.Ptsname(master) if err != nil { return nil, "", err } - if err := Unlockpt(master); err != nil { + if err := system.Unlockpt(master); err != nil { return nil, "", err } return master, console, nil diff --git a/libcontainer/namespaces/mount.go b/libcontainer/namespaces/mount.go index 8e7c54b..a9b981e 100644 --- a/libcontainer/namespaces/mount.go +++ b/libcontainer/namespaces/mount.go @@ -2,6 +2,7 @@ package namespaces import ( "fmt" + "github.com/dotcloud/docker/pkg/system" "log" "os" "path/filepath" @@ -14,16 +15,16 @@ var ( ) func SetupNewMountNamespace(rootfs, console string, readonly bool) error { - if err := Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting / as slave %s", err) } - if err := Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mouting %s as bind %s", rootfs, err) } if readonly { - if err := Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { + if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting %s as readonly %s", rootfs, err) } } @@ -52,29 +53,30 @@ func SetupNewMountNamespace(rootfs, console string, readonly bool) error { return err } - if err := Chdir(rootfs); err != nil { + if err := system.Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) } - if err := Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { + if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { return fmt.Errorf("mount move %s into / %s", rootfs, err) } - if err := Chroot("."); err != nil { + if err := system.Chroot("."); err != nil { return fmt.Errorf("chroot . %s", err) } - if err := Chdir("/"); err != nil { + if err := system.Chdir("/"); err != nil { return fmt.Errorf("chdir / %s", err) } - Umask(0022) + system.Umask(0022) return nil } func copyDevNodes(rootfs string) error { - Umask(0000) + oldMask := system.Umask(0000) + defer system.Umask(oldMask) for _, node := range []string{ "null", @@ -95,7 +97,7 @@ func copyDevNodes(rootfs string) error { ) log.Printf("copy %s to %s %d\n", node, dest, st.Rdev) - if err := Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { + if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { return fmt.Errorf("copy %s %s", node, err) } } @@ -125,7 +127,8 @@ func setupDev(rootfs string) error { } func setupConsole(rootfs, console string) error { - Umask(0000) + oldMask := system.Umask(0000) + defer system.Umask(oldMask) stat, err := os.Stat(console) if err != nil { @@ -145,11 +148,11 @@ func setupConsole(rootfs, console string) error { return err } - if err := Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { + if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { return fmt.Errorf("mknod %s %s", dest, err) } - if err := Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { + if err := system.Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { return fmt.Errorf("bind %s to %s %s", console, dest, err) } return nil @@ -158,7 +161,7 @@ func setupConsole(rootfs, console string) error { // mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts // inside the mount namespace func mountSystem(rootfs string) error { - mounts := []struct { + for _, m := range []struct { source string path string device string @@ -171,12 +174,11 @@ func mountSystem(rootfs string) error { {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaults, data: "mode=1777"}, {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"}, {source: "tmpfs", path: filepath.Join(rootfs, "run"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_STRICTATIME, data: "mode=755"}, - } - for _, m := range mounts { + } { if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { return fmt.Errorf("mkdirall %s %s", m.path, err) } - if err := Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { + if err := system.Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err) } } @@ -184,22 +186,22 @@ func mountSystem(rootfs string) error { } func remountProc() error { - if err := Unmount("/proc", syscall.MNT_DETACH); err != nil { + if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil { return err } - if err := Mount("proc", "/proc", "proc", uintptr(defaults), ""); err != nil { + if err := system.Mount("proc", "/proc", "proc", uintptr(defaults), ""); err != nil { return err } return nil } func remountSys() error { - if err := Unmount("/sys", syscall.MNT_DETACH); err != nil { + if err := system.Unmount("/sys", syscall.MNT_DETACH); err != nil { if err != syscall.EINVAL { return err } } else { - if err := Mount("sysfs", "/sys", "sysfs", uintptr(defaults), ""); err != nil { + if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaults), ""); err != nil { return err } } diff --git a/libcontainer/namespaces/namespaces.go b/libcontainer/namespaces/namespaces.go deleted file mode 100644 index 05ef0ac..0000000 --- a/libcontainer/namespaces/namespaces.go +++ /dev/null @@ -1,32 +0,0 @@ -/* - TODO - pivot root - cgroups - more mount stuff that I probably am forgetting - apparmor -*/ - -package namespaces - -import ( - "github.com/dotcloud/docker/pkg/libcontainer" -) - -// JoinExistingNamespace uses the fd of an existing linux namespace and -// has the current process join that namespace or the spacespace specified by ns -func JoinExistingNamespace(fd uintptr, ns libcontainer.Namespace) error { - flag := namespaceMap[ns] - if err := Setns(fd, uintptr(flag)); err != nil { - return err - } - return nil -} - -// getNamespaceFlags parses the container's Namespaces options to set the correct -// flags on clone, unshare, and setns -func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { - for _, ns := range namespaces { - flag |= namespaceMap[ns] - } - return -} diff --git a/libcontainer/namespaces/ns_linux.go b/libcontainer/namespaces/ns_linux.go index b0e5119..f612793 100644 --- a/libcontainer/namespaces/ns_linux.go +++ b/libcontainer/namespaces/ns_linux.go @@ -33,3 +33,12 @@ var namespaceFileMap = map[libcontainer.Namespace]string{ libcontainer.CLONE_NEWPID: "pid", libcontainer.CLONE_NEWNET: "net", } + +// getNamespaceFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare, and setns +func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { + for _, ns := range namespaces { + flag |= namespaceMap[ns] + } + return +} diff --git a/libcontainer/namespaces/nsinit/init.go b/libcontainer/namespaces/nsinit/init.go index ae6159b..7f85eba 100644 --- a/libcontainer/namespaces/nsinit/init.go +++ b/libcontainer/namespaces/nsinit/init.go @@ -5,6 +5,7 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/libcontainer/namespaces" + "github.com/dotcloud/docker/pkg/system" "log" "os" "path/filepath" @@ -14,10 +15,12 @@ import ( // InitNamespace should be run inside an existing namespace to setup // common mounts, drop capabilities, and setup network interfaces func InitNamespace(container *libcontainer.Container) error { + println("|||||||||||||") if err := setLogFile(container); err != nil { return err } - + println(container.LogFile) + log.Printf("--------->") rootfs, err := resolveRootfs(container) if err != nil { return err @@ -26,7 +29,7 @@ func InitNamespace(container *libcontainer.Container) error { // any errors encoutered inside the namespace we should write // out to a log or a pipe to our parent and exit(1) // because writing to stderr will not work after we close - if err := closeMasterAndStd(container.Master); err != nil { + if err := closeMasterAndStd(os.NewFile(container.Master, "/dev/ptmx")); err != nil { log.Fatalf("close master and std %s", err) return err } @@ -49,15 +52,15 @@ func InitNamespace(container *libcontainer.Container) error { } */ - if _, err := namespaces.Setsid(); err != nil { + if _, err := system.Setsid(); err != nil { log.Fatalf("setsid %s", err) return err } - if err := namespaces.Setctty(); err != nil { + if err := system.Setctty(); err != nil { log.Fatalf("setctty %s", err) return err } - if err := namespaces.ParentDeathSignal(); err != nil { + if err := system.ParentDeathSignal(); err != nil { log.Fatalf("parent deth signal %s", err) return err } @@ -65,7 +68,7 @@ func InitNamespace(container *libcontainer.Container) error { log.Fatalf("setup mount namespace %s", err) return err } - if err := namespaces.Sethostname(container.ID); err != nil { + if err := system.Sethostname(container.ID); err != nil { log.Fatalf("sethostname %s", err) return err } @@ -78,12 +81,12 @@ func InitNamespace(container *libcontainer.Container) error { return err } if container.WorkingDir != "" { - if err := namespaces.Chdir(container.WorkingDir); err != nil { + if err := system.Chdir(container.WorkingDir); err != nil { log.Fatalf("chdir to %s %s", container.WorkingDir, err) return err } } - if err := namespaces.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { log.Fatalf("exec %s", err) return err } @@ -98,24 +101,23 @@ func resolveRootfs(container *libcontainer.Container) (string, error) { return filepath.EvalSymlinks(rootfs) } -func closeMasterAndStd(master uintptr) error { - namespaces.Closefd(master) - namespaces.Closefd(0) - namespaces.Closefd(1) - namespaces.Closefd(2) - +func closeMasterAndStd(master *os.File) error { + master.Close() + os.Stdin.Close() + os.Stdout.Close() + os.Stderr.Close() return nil } func setupUser(container *libcontainer.Container) error { // TODO: honor user passed on container - if err := namespaces.Setgroups(nil); err != nil { + if err := system.Setgroups(nil); err != nil { return err } - if err := namespaces.Setresgid(0, 0, 0); err != nil { + if err := system.Setresgid(0, 0, 0); err != nil { return err } - if err := namespaces.Setresuid(0, 0, 0); err != nil { + if err := system.Setresuid(0, 0, 0); err != nil { return err } return nil @@ -126,15 +128,16 @@ func dupSlave(slave *os.File) error { if slave.Fd() != 0 { return fmt.Errorf("slave fd not 0 %d", slave.Fd()) } - if err := namespaces.Dup2(slave.Fd(), 1); err != nil { + if err := system.Dup2(slave.Fd(), 1); err != nil { return err } - if err := namespaces.Dup2(slave.Fd(), 2); err != nil { + if err := system.Dup2(slave.Fd(), 2); err != nil { return err } return nil } +// openTerminal is a clone of os.OpenFile without the O_CLOEXEC addition. func openTerminal(name string, flag int) (*os.File, error) { r, e := syscall.Open(name, flag, 0) if e != nil { diff --git a/libcontainer/namespaces/utils.go b/libcontainer/namespaces/utils.go index fd195c0..a5d677c 100644 --- a/libcontainer/namespaces/utils.go +++ b/libcontainer/namespaces/utils.go @@ -72,17 +72,3 @@ func setupEnvironment(container *libcontainer.Container) { addEnvIfNotSet(container, "USER", "root") addEnvIfNotSet(container, "LOGNAME", "root") } - -func getMasterAndConsole(container *libcontainer.Container) (string, *os.File, error) { - master, err := Openpmtx() - if err != nil { - return "", nil, err - } - - console, err := Ptsname(master) - if err != nil { - master.Close() - return "", nil, err - } - return console, master, nil -} diff --git a/libcontainer/namespaces/calls_linux.go b/system/calls_linux.go similarity index 74% rename from libcontainer/namespaces/calls_linux.go rename to system/calls_linux.go index f006d56..42afa34 100644 --- a/libcontainer/namespaces/calls_linux.go +++ b/system/calls_linux.go @@ -1,15 +1,7 @@ -package namespaces +package system import ( - "fmt" - "os" "syscall" - "unsafe" -) - -const ( - TIOCGPTN = 0x80045430 - TIOCSPTLCK = 0x40045431 ) func Chroot(dir string) error { @@ -60,14 +52,6 @@ func Clone(flags uintptr) (int, error) { return int(pid), nil } -func Setns(fd uintptr, flags uintptr) error { - _, _, err := syscall.RawSyscall(SYS_SETNS, fd, flags, 0) - if err != 0 { - return err - } - return nil -} - func UsetCloseOnExec(fd uintptr) error { if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, fd, syscall.F_SETFD, 0); err != 0 { return err @@ -95,11 +79,6 @@ func Setsid() (int, error) { return syscall.Setsid() } -func Unlockpt(f *os.File) error { - var u int - return Ioctl(f.Fd(), TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) -} - func Ioctl(fd uintptr, flag, data uintptr) error { if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 { return err @@ -107,18 +86,6 @@ func Ioctl(fd uintptr, flag, data uintptr) error { return nil } -func Ptsname(f *os.File) (string, error) { - var n int - if err := Ioctl(f.Fd(), TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { - return "", err - } - return fmt.Sprintf("/dev/pts/%d", n), nil -} - -func Openpmtx() (*os.File, error) { - return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) -} - func Closefd(fd uintptr) error { return syscall.Close(int(fd)) } @@ -132,7 +99,7 @@ func Mknod(path string, mode uint32, dev int) error { } func ParentDeathSignal() error { - if _, _, err := syscall.RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0, 0, 0, 0); err != 0 { + if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0); err != 0 { return err } return nil diff --git a/system/pty_linux.go b/system/pty_linux.go new file mode 100644 index 0000000..b281b71 --- /dev/null +++ b/system/pty_linux.go @@ -0,0 +1,31 @@ +package system + +import ( + "fmt" + "os" + "syscall" + "unsafe" +) + +// Unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f. +// Unlockpt should be called before opening the slave side of a pseudoterminal. +func Unlockpt(f *os.File) error { + var u int + return Ioctl(f.Fd(), syscall.TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) +} + +// Ptsname retrieves the name of the first available pts for the given master. +func Ptsname(f *os.File) (string, error) { + var n int + + if err := Ioctl(f.Fd(), syscall.TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { + return "", err + } + return fmt.Sprintf("/dev/pts/%d", n), nil +} + +// OpenPtmx opens /dev/ptmx, i.e. the PTY master. +func OpenPtmx() (*os.File, error) { + // O_NOCTTY and O_CLOEXEC are not present in os package so we use the syscall's one for all. + return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) +} diff --git a/system/setns_linux.go b/system/setns_linux.go new file mode 100644 index 0000000..be6f3ed --- /dev/null +++ b/system/setns_linux.go @@ -0,0 +1,13 @@ +package system + +import ( + "syscall" +) + +func Setns(fd uintptr, flags uintptr) error { + _, _, err := syscall.RawSyscall(SYS_SETNS, fd, flags, 0) + if err != 0 { + return err + } + return nil +} diff --git a/system/setns_linux_amd64.go b/system/setns_linux_amd64.go new file mode 100644 index 0000000..4e30625 --- /dev/null +++ b/system/setns_linux_amd64.go @@ -0,0 +1,8 @@ +// +build linux,amd64 + +package system + +// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 +const ( + SYS_SETNS = 308 +) From d62cc1cc661bcf621a262aa2229524f6001b99ab Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Wed, 19 Feb 2014 12:47:01 -0800 Subject: [PATCH 06/64] Fix ptmx issue on libcontainer Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/namespaces/exec.go | 3 ++- libcontainer/namespaces/mount.go | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/libcontainer/namespaces/exec.go b/libcontainer/namespaces/exec.go index ea3d2ca..77550d6 100644 --- a/libcontainer/namespaces/exec.go +++ b/libcontainer/namespaces/exec.go @@ -50,7 +50,8 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { // command.Stderr = os.Stderr command.SysProcAttr = &syscall.SysProcAttr{} command.SysProcAttr.Cloneflags = flag - //command.ExtraFiles = []*os.File{master} + + command.ExtraFiles = []*os.File{master} println("vvvvvvvvv") if err := command.Start(); err != nil { diff --git a/libcontainer/namespaces/mount.go b/libcontainer/namespaces/mount.go index a9b981e..5c0b8ea 100644 --- a/libcontainer/namespaces/mount.go +++ b/libcontainer/namespaces/mount.go @@ -41,7 +41,7 @@ func SetupNewMountNamespace(rootfs, console string, readonly bool) error { if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { return err } - if err := os.Symlink(filepath.Join(rootfs, "pts/ptmx"), ptmx); err != nil { + if err := os.Symlink("pts/ptmx", ptmx); err != nil { return fmt.Errorf("symlink dev ptmx %s", err) } From c1f8606d508ae40f721bc6c20e5b2afab1911b42 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 14:33:25 -0800 Subject: [PATCH 07/64] Use nsinit as app Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/container.go | 10 +- libcontainer/namespaces/exec.go | 40 ++----- libcontainer/namespaces/linux_x86_64.go | 7 -- libcontainer/namespaces/ns_linux.go | 2 +- libcontainer/namespaces/nsinit/init.go | 112 ++++++++++-------- libcontainer/namespaces/{ => nsinit}/mount.go | 4 +- libcontainer/namespaces/utils.go | 26 ---- 7 files changed, 82 insertions(+), 119 deletions(-) delete mode 100644 libcontainer/namespaces/linux_x86_64.go rename libcontainer/namespaces/{ => nsinit}/mount.go (98%) diff --git a/libcontainer/container.go b/libcontainer/container.go index c9a3f2e..c288544 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -2,18 +2,14 @@ package libcontainer type Container struct { ID string `json:"id,omitempty"` - NsPid int `json:"namespace_pid,omitempty"` Command *Command `json:"command,omitempty"` - RootFs string `json:"rootfs,omitempty"` ReadonlyFs bool `json:"readonly_fs,omitempty"` - NetNsFd uintptr `json:"network_namespace_fd,omitempty"` User string `json:"user,omitempty"` WorkingDir string `json:"working_dir,omitempty"` Namespaces Namespaces `json:"namespaces,omitempty"` Capabilities Capabilities `json:"capabilities,omitempty"` - Master uintptr `json:"master"` - Console string `json:"console"` - LogFile string `json:"log_file"` + LogFile string `json:"log_file,omitempty"` + Network *Network `json:"network,omitempty"` } type Command struct { @@ -22,9 +18,9 @@ type Command struct { } type Network struct { - TempVethName string `json:"temp_veth,omitempty"` IP string `json:"ip,omitempty"` Gateway string `json:"gateway,omitempty"` Bridge string `json:"bridge,omitempty"` Mtu int `json:"mtu,omitempty"` + TempVethName string `json:"temp_veth,omitempty"` } diff --git a/libcontainer/namespaces/exec.go b/libcontainer/namespaces/exec.go index 77550d6..8e5bf68 100644 --- a/libcontainer/namespaces/exec.go +++ b/libcontainer/namespaces/exec.go @@ -1,27 +1,17 @@ -/* - Higher level convience functions for setting up a container -*/ - package namespaces import ( - "errors" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/system" + "github.com/dotcloud/docker/pkg/term" "io" "log" "os" "os/exec" - "path/filepath" - "strconv" "syscall" ) -var ( - ErrExistingNetworkNamespace = errors.New("specified both CLONE_NEWNET and an existing network namespace") -) - -// Exec will spawn new namespaces with the specified Container configuration +// ExecContainer will spawn new namespaces with the specified Container configuration // in the RootFs path and return the pid of the new containerized process. // // If an existing network namespace is specified the container @@ -30,30 +20,19 @@ var ( // existing network namespace and the CLONE_NEWNET option in the container configuration will allow // the container to the the host's networking options and configuration. func ExecContainer(container *libcontainer.Container) (pid int, err error) { - // a user cannot pass CLONE_NEWNET and an existing net namespace fd to join - if container.NetNsFd > 0 && container.Namespaces.Contains(libcontainer.CLONE_NEWNET) { - return -1, ErrExistingNetworkNamespace - } - master, console, err := createMasterAndConsole() if err != nil { return -1, err } - nsinit := filepath.Join(container.RootFs, ".nsinit") // we need CLONE_VFORK so we can wait on the child flag := uintptr(getNamespaceFlags(container.Namespaces) | CLONE_VFORK) - command := exec.Command(nsinit, "-master", strconv.Itoa(int(master.Fd())), "-console", console, "init", "container.json") - // command.Stdin = os.Stdin - // command.Stdout = os.Stdout - // command.Stderr = os.Stderr - command.SysProcAttr = &syscall.SysProcAttr{} - command.SysProcAttr.Cloneflags = flag + command := exec.Command("nsinit", console) + command.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: flag, + } - command.ExtraFiles = []*os.File{master} - - println("vvvvvvvvv") if err := command.Start(); err != nil { return -1, err } @@ -64,11 +43,18 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { log.Println(err) } }() + go func() { if _, err := io.Copy(master, os.Stdin); err != nil { log.Println(err) } }() + + term.SetRawTerminal(os.Stdin.Fd()) + + if err := command.Wait(); err != nil { + return pid, err + } return pid, nil } diff --git a/libcontainer/namespaces/linux_x86_64.go b/libcontainer/namespaces/linux_x86_64.go deleted file mode 100644 index ac9a014..0000000 --- a/libcontainer/namespaces/linux_x86_64.go +++ /dev/null @@ -1,7 +0,0 @@ -// +build linux,x86_64 -package namespaces - -// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 -const ( - SYS_SETNS = 308 -) diff --git a/libcontainer/namespaces/ns_linux.go b/libcontainer/namespaces/ns_linux.go index f612793..2c73e08 100644 --- a/libcontainer/namespaces/ns_linux.go +++ b/libcontainer/namespaces/ns_linux.go @@ -40,5 +40,5 @@ func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { for _, ns := range namespaces { flag |= namespaceMap[ns] } - return + return flag } diff --git a/libcontainer/namespaces/nsinit/init.go b/libcontainer/namespaces/nsinit/init.go index 7f85eba..523854e 100644 --- a/libcontainer/namespaces/nsinit/init.go +++ b/libcontainer/namespaces/nsinit/init.go @@ -1,6 +1,7 @@ -package nsinit +package main import ( + "encoding/json" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" @@ -12,103 +13,112 @@ import ( "syscall" ) -// InitNamespace should be run inside an existing namespace to setup -// common mounts, drop capabilities, and setup network interfaces -func InitNamespace(container *libcontainer.Container) error { - println("|||||||||||||") - if err := setLogFile(container); err != nil { - return err - } - println(container.LogFile) - log.Printf("--------->") - rootfs, err := resolveRootfs(container) +func loadContainer() (*libcontainer.Container, error) { + f, err := os.Open("container.json") if err != nil { - return err + return nil, err + } + defer f.Close() + + var container *libcontainer.Container + if err := json.NewDecoder(f).Decode(&container); err != nil { + return nil, err + } + return container, nil +} + +func main() { + container, err := loadContainer() + if err != nil { + log.Fatal(err) } - // any errors encoutered inside the namespace we should write - // out to a log or a pipe to our parent and exit(1) - // because writing to stderr will not work after we close - if err := closeMasterAndStd(os.NewFile(container.Master, "/dev/ptmx")); err != nil { - log.Fatalf("close master and std %s", err) - return err + if os.Args[1] == "exec" { + _, err := namespaces.ExecContainer(container) + if err != nil { + log.Fatal(err) + } + os.Exit(0) + } + console := os.Args[1] + + if err := setLogFile(container); err != nil { + log.Fatal(err) } - slave, err := openTerminal(container.Console, syscall.O_RDWR) + rootfs, err := resolveRootfs() + if err != nil { + log.Fatal(err) + } + + // close pipes so that we can replace it with the pty + os.Stdin.Close() + os.Stdout.Close() + os.Stderr.Close() + + slave, err := openTerminal(console, syscall.O_RDWR) if err != nil { log.Fatalf("open terminal %s", err) - return err + } + if slave.Fd() != 0 { + log.Fatalf("slave fd should be 0") } if err := dupSlave(slave); err != nil { log.Fatalf("dup2 slave %s", err) - return err } - /* - if container.NetNsFd > 0 { - if err := joinExistingNamespace(container.NetNsFd, libcontainer.CLONE_NEWNET); err != nil { - log.Fatalf("join existing net namespace %s", err) - } - } - */ - if _, err := system.Setsid(); err != nil { log.Fatalf("setsid %s", err) - return err } if err := system.Setctty(); err != nil { log.Fatalf("setctty %s", err) - return err } if err := system.ParentDeathSignal(); err != nil { log.Fatalf("parent deth signal %s", err) - return err } - if err := namespaces.SetupNewMountNamespace(rootfs, container.Console, container.ReadonlyFs); err != nil { + + if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { log.Fatalf("setup mount namespace %s", err) - return err } + + if container.Network != nil { + if err := setupNetworking(container); err != nil { + log.Fatalf("setup networking %s", err) + } + } + if err := system.Sethostname(container.ID); err != nil { log.Fatalf("sethostname %s", err) - return err } if err := capabilities.DropCapabilities(container); err != nil { log.Fatalf("drop capabilities %s", err) - return err } if err := setupUser(container); err != nil { log.Fatalf("setup user %s", err) - return err } if container.WorkingDir != "" { if err := system.Chdir(container.WorkingDir); err != nil { log.Fatalf("chdir to %s %s", container.WorkingDir, err) - return err } } if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { log.Fatalf("exec %s", err) - return err } panic("unreachable") } -func resolveRootfs(container *libcontainer.Container) (string, error) { - rootfs, err := filepath.Abs(container.RootFs) +func resolveRootfs() (string, error) { + cwd, err := os.Getwd() + if err != nil { + return "", err + } + rootfs, err := filepath.Abs(cwd) if err != nil { return "", err } return filepath.EvalSymlinks(rootfs) } -func closeMasterAndStd(master *os.File) error { - master.Close() - os.Stdin.Close() - os.Stdout.Close() - os.Stderr.Close() - return nil -} - func setupUser(container *libcontainer.Container) error { // TODO: honor user passed on container if err := system.Setgroups(nil); err != nil { @@ -154,3 +164,7 @@ func setLogFile(container *libcontainer.Container) error { log.SetOutput(f) return nil } + +func setupNetworking(conatiner *libcontainer.Container) error { + return nil +} diff --git a/libcontainer/namespaces/mount.go b/libcontainer/namespaces/nsinit/mount.go similarity index 98% rename from libcontainer/namespaces/mount.go rename to libcontainer/namespaces/nsinit/mount.go index 5c0b8ea..f9ee969 100644 --- a/libcontainer/namespaces/mount.go +++ b/libcontainer/namespaces/nsinit/mount.go @@ -1,4 +1,4 @@ -package namespaces +package main import ( "fmt" @@ -14,7 +14,7 @@ var ( defaults = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV ) -func SetupNewMountNamespace(rootfs, console string, readonly bool) error { +func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting / as slave %s", err) } diff --git a/libcontainer/namespaces/utils.go b/libcontainer/namespaces/utils.go index a5d677c..edc3ab5 100644 --- a/libcontainer/namespaces/utils.go +++ b/libcontainer/namespaces/utils.go @@ -7,7 +7,6 @@ import ( "path/filepath" "strconv" "strings" - "syscall" ) func addEnvIfNotSet(container *libcontainer.Container, key, value string) { @@ -26,31 +25,6 @@ func addEnvIfNotSet(container *libcontainer.Container, key, value string) { container.Command.Env = append(container.Command.Env, jv) } -// getNsFds inspects the container's namespace configuration and opens the fds to -// each of the namespaces. -func getNsFds(container *libcontainer.Container) ([]uintptr, error) { - var ( - namespaces = []string{} - fds = []uintptr{} - ) - - for _, ns := range container.Namespaces { - namespaces = append(namespaces, namespaceFileMap[ns]) - } - - for _, ns := range namespaces { - fd, err := getNsFd(container.NsPid, ns) - if err != nil { - for _, fd = range fds { - syscall.Close(int(fd)) - } - return nil, err - } - fds = append(fds, fd) - } - return fds, nil -} - // getNsFd returns the fd for a specific pid and namespace option func getNsFd(pid int, ns string) (uintptr, error) { nspath := filepath.Join("/proc", strconv.Itoa(pid), "ns", ns) From e25ebdd06c3dcb901fa9df7f8042ed0abd041337 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 14:55:34 -0800 Subject: [PATCH 08/64] Simplify namespaces with only nsinit Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/cli/main.go | 203 ------------------ libcontainer/namespaces/utils.go | 48 ----- libcontainer/{namespaces => nsinit}/exec.go | 26 ++- libcontainer/{namespaces => }/nsinit/init.go | 13 +- libcontainer/{namespaces => }/nsinit/mount.go | 0 .../{namespaces => nsinit}/ns_linux.go | 11 +- 6 files changed, 23 insertions(+), 278 deletions(-) delete mode 100644 libcontainer/cli/main.go delete mode 100644 libcontainer/namespaces/utils.go rename libcontainer/{namespaces => nsinit}/exec.go (63%) rename libcontainer/{namespaces => }/nsinit/init.go (93%) rename libcontainer/{namespaces => }/nsinit/mount.go (100%) rename libcontainer/{namespaces => nsinit}/ns_linux.go (74%) diff --git a/libcontainer/cli/main.go b/libcontainer/cli/main.go deleted file mode 100644 index 93bb039..0000000 --- a/libcontainer/cli/main.go +++ /dev/null @@ -1,203 +0,0 @@ -package main - -import ( - "encoding/json" - "flag" - "fmt" - "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/libcontainer/namespaces" - "github.com/dotcloud/docker/pkg/libcontainer/namespaces/nsinit" - "github.com/dotcloud/docker/pkg/libcontainer/network" - "github.com/dotcloud/docker/pkg/libcontainer/utils" - "os" - exec_ "os/exec" - "path" - "path/filepath" -) - -var ( - displayPid bool - newCommand string - usrNet bool - masterFd int - console string -) - -func init() { - flag.BoolVar(&displayPid, "pid", false, "display the pid before waiting") - flag.StringVar(&newCommand, "cmd", "/bin/bash", "command to run in the existing namespace") - flag.BoolVar(&usrNet, "net", false, "user a net namespace") - flag.IntVar(&masterFd, "master", 0, "master fd") - flag.StringVar(&console, "console", "", "console path") - flag.Parse() -} - -func nsinitFunc(container *libcontainer.Container) error { - container.Master = uintptr(masterFd) - container.Console = console - container.LogFile = "/root/logs" - - return nsinit.InitNamespace(container) -} - -func exec(container *libcontainer.Container) error { - var ( - netFile *os.File - err error - ) - container.NetNsFd = 0 - - if usrNet { - netFile, err = os.Open("/root/nsroot/test") - if err != nil { - return err - } - container.NetNsFd = netFile.Fd() - } - - self, err := exec_.LookPath(os.Args[0]) - if err != nil { - return err - } - if output, err := exec_.Command("cp", self, path.Join(container.RootFs, ".nsinit")).CombinedOutput(); err != nil { - return fmt.Errorf("Error exec cp: %s, (%s)", err, output) - } else { - println(self, container.RootFs) - fmt.Printf("-----> %s\n", output) - } - println("----") - - pid, err := namespaces.ExecContainer(container) - if err != nil { - return fmt.Errorf("error exec container %s", err) - } - - if displayPid { - fmt.Println(pid) - } - - exitcode, err := utils.WaitOnPid(pid) - if err != nil { - return fmt.Errorf("error waiting on child %s", err) - } - fmt.Println(exitcode) - if usrNet { - netFile.Close() - if err := network.DeleteNetworkNamespace("/root/nsroot/test"); err != nil { - return err - } - } - os.Exit(exitcode) - return nil -} - -func execIn(container *libcontainer.Container) error { - // f, err := os.Open("/root/nsroot/test") - // if err != nil { - // return err - // } - // container.NetNsFd = f.Fd() - // pid, err := namespaces.ExecIn(container, &libcontainer.Command{ - // Env: container.Command.Env, - // Args: []string{ - // newCommand, - // }, - // }) - // if err != nil { - // return fmt.Errorf("error exexin container %s", err) - // } - // exitcode, err := utils.WaitOnPid(pid) - // if err != nil { - // return fmt.Errorf("error waiting on child %s", err) - // } - // os.Exit(exitcode) - return nil -} - -func createNet(config *libcontainer.Network) error { - /* - root := "/root/nsroot" - if err := network.SetupNamespaceMountDir(root); err != nil { - return err - } - - nspath := root + "/test" - if err := network.CreateNetworkNamespace(nspath); err != nil { - return nil - } - if err := network.CreateVethPair("veth0", config.TempVethName); err != nil { - return err - } - if err := network.SetInterfaceMaster("veth0", config.Bridge); err != nil { - return err - } - if err := network.InterfaceUp("veth0"); err != nil { - return err - } - - f, err := os.Open(nspath) - if err != nil { - return err - } - defer f.Close() - - if err := network.SetInterfaceInNamespaceFd("veth1", int(f.Fd())); err != nil { - return err - } - - if err := network.SetupVethInsideNamespace(f.Fd(), config); err != nil { - return err - } - */ - return nil -} - -func printErr(err error) { - fmt.Fprintln(os.Stderr, err) - os.Exit(1) -} - -func main() { - cliCmd := flag.Arg(0) - - config, err := filepath.Abs(flag.Arg(1)) - if err != nil { - printErr(err) - } - println("cli:", cliCmd, "config:", config) - f, err := os.Open(config) - if err != nil { - printErr(err) - } - - dec := json.NewDecoder(f) - var container *libcontainer.Container - - if err := dec.Decode(&container); err != nil { - printErr(err) - } - f.Close() - - switch cliCmd { - case "init": - err = nsinitFunc(container) - case "exec": - err = exec(container) - case "execin": - err = execIn(container) - case "net": - err = createNet(&libcontainer.Network{ - TempVethName: "veth1", - IP: "172.17.0.100/16", - Gateway: "172.17.42.1", - Mtu: 1500, - Bridge: "docker0", - }) - default: - err = fmt.Errorf("command not supported: %s", cliCmd) - } - - if err != nil { - printErr(err) - } -} diff --git a/libcontainer/namespaces/utils.go b/libcontainer/namespaces/utils.go deleted file mode 100644 index edc3ab5..0000000 --- a/libcontainer/namespaces/utils.go +++ /dev/null @@ -1,48 +0,0 @@ -package namespaces - -import ( - "fmt" - "github.com/dotcloud/docker/pkg/libcontainer" - "os" - "path/filepath" - "strconv" - "strings" -) - -func addEnvIfNotSet(container *libcontainer.Container, key, value string) { - jv := fmt.Sprintf("%s=%s", key, value) - if len(container.Command.Env) == 0 { - container.Command.Env = []string{jv} - return - } - - for _, v := range container.Command.Env { - parts := strings.Split(v, "=") - if parts[0] == key { - return - } - } - container.Command.Env = append(container.Command.Env, jv) -} - -// getNsFd returns the fd for a specific pid and namespace option -func getNsFd(pid int, ns string) (uintptr, error) { - nspath := filepath.Join("/proc", strconv.Itoa(pid), "ns", ns) - // OpenFile adds closOnExec - f, err := os.OpenFile(nspath, os.O_RDONLY, 0666) - if err != nil { - return 0, err - } - return f.Fd(), nil -} - -// setupEnvironment adds additional environment variables to the container's -// Command such as USER, LOGNAME, container, and TERM -func setupEnvironment(container *libcontainer.Container) { - addEnvIfNotSet(container, "container", "docker") - // TODO: check if pty - addEnvIfNotSet(container, "TERM", "xterm") - // TODO: get username from container - addEnvIfNotSet(container, "USER", "root") - addEnvIfNotSet(container, "LOGNAME", "root") -} diff --git a/libcontainer/namespaces/exec.go b/libcontainer/nsinit/exec.go similarity index 63% rename from libcontainer/namespaces/exec.go rename to libcontainer/nsinit/exec.go index 8e5bf68..ef81b0e 100644 --- a/libcontainer/namespaces/exec.go +++ b/libcontainer/nsinit/exec.go @@ -1,4 +1,4 @@ -package namespaces +package main import ( "github.com/dotcloud/docker/pkg/libcontainer" @@ -11,15 +11,7 @@ import ( "syscall" ) -// ExecContainer will spawn new namespaces with the specified Container configuration -// in the RootFs path and return the pid of the new containerized process. -// -// If an existing network namespace is specified the container -// will join that namespace. If an existing network namespace is not specified but CLONE_NEWNET is, -// the container will be spawned with a new network namespace with no configuration. Omiting an -// existing network namespace and the CLONE_NEWNET option in the container configuration will allow -// the container to the the host's networking options and configuration. -func ExecContainer(container *libcontainer.Container) (pid int, err error) { +func execCommand(container *libcontainer.Container) (pid int, err error) { master, console, err := createMasterAndConsole() if err != nil { return -1, err @@ -50,7 +42,19 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { } }() - term.SetRawTerminal(os.Stdin.Fd()) + ws, err := term.GetWinsize(os.Stdin.Fd()) + if err != nil { + return -1, err + } + if err := term.SetWinsize(master.Fd(), ws); err != nil { + return -1, err + } + state, err := term.SetRawTerminal(os.Stdin.Fd()) + if err != nil { + command.Process.Kill() + return -1, err + } + defer term.RestoreTerminal(os.Stdin.Fd(), state) if err := command.Wait(); err != nil { return pid, err diff --git a/libcontainer/namespaces/nsinit/init.go b/libcontainer/nsinit/init.go similarity index 93% rename from libcontainer/namespaces/nsinit/init.go rename to libcontainer/nsinit/init.go index 523854e..b4b7de4 100644 --- a/libcontainer/namespaces/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -5,7 +5,6 @@ import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" - "github.com/dotcloud/docker/pkg/libcontainer/namespaces" "github.com/dotcloud/docker/pkg/system" "log" "os" @@ -34,7 +33,7 @@ func main() { } if os.Args[1] == "exec" { - _, err := namespaces.ExecContainer(container) + _, err := execCommand(container) if err != nil { log.Fatal(err) } @@ -157,11 +156,13 @@ func openTerminal(name string, flag int) (*os.File, error) { } func setLogFile(container *libcontainer.Container) error { - f, err := os.OpenFile(container.LogFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0655) - if err != nil { - return err + if container.LogFile != "" { + f, err := os.OpenFile(container.LogFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0655) + if err != nil { + return err + } + log.SetOutput(f) } - log.SetOutput(f) return nil } diff --git a/libcontainer/namespaces/nsinit/mount.go b/libcontainer/nsinit/mount.go similarity index 100% rename from libcontainer/namespaces/nsinit/mount.go rename to libcontainer/nsinit/mount.go diff --git a/libcontainer/namespaces/ns_linux.go b/libcontainer/nsinit/ns_linux.go similarity index 74% rename from libcontainer/namespaces/ns_linux.go rename to libcontainer/nsinit/ns_linux.go index 2c73e08..b54bc2b 100644 --- a/libcontainer/namespaces/ns_linux.go +++ b/libcontainer/nsinit/ns_linux.go @@ -1,4 +1,4 @@ -package namespaces +package main import ( "github.com/dotcloud/docker/pkg/libcontainer" @@ -25,15 +25,6 @@ var namespaceMap = map[libcontainer.Namespace]int{ libcontainer.CLONE_NEWNET: CLONE_NEWNET, } -var namespaceFileMap = map[libcontainer.Namespace]string{ - libcontainer.CLONE_NEWNS: "mnt", - libcontainer.CLONE_NEWUTS: "uts", - libcontainer.CLONE_NEWIPC: "ipc", - libcontainer.CLONE_NEWUSER: "user", - libcontainer.CLONE_NEWPID: "pid", - libcontainer.CLONE_NEWNET: "net", -} - // getNamespaceFlags parses the container's Namespaces options to set the correct // flags on clone, unshare, and setns func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { From 8430fbf11e1a20464cdb48384f2c87a3c51af304 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 15:33:44 -0800 Subject: [PATCH 09/64] Implement init veth creation Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/container.go | 9 ++++----- libcontainer/container.json | 14 +++++++++---- libcontainer/network/veth.go | 38 +++++------------------------------- libcontainer/nsinit/exec.go | 33 +++++++++++++++++++++++++++++++ libcontainer/nsinit/init.go | 14 ++++++++++--- libcontainer/ubuntu.json | 22 --------------------- 6 files changed, 63 insertions(+), 67 deletions(-) delete mode 100644 libcontainer/ubuntu.json diff --git a/libcontainer/container.go b/libcontainer/container.go index c288544..3f3961d 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -18,9 +18,8 @@ type Command struct { } type Network struct { - IP string `json:"ip,omitempty"` - Gateway string `json:"gateway,omitempty"` - Bridge string `json:"bridge,omitempty"` - Mtu int `json:"mtu,omitempty"` - TempVethName string `json:"temp_veth,omitempty"` + IP string `json:"ip,omitempty"` + Gateway string `json:"gateway,omitempty"` + Bridge string `json:"bridge,omitempty"` + Mtu int `json:"mtu,omitempty"` } diff --git a/libcontainer/container.json b/libcontainer/container.json index 6e4fda5..8731170 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -1,6 +1,6 @@ { "id": "koye", - "namespace_pid": 3117, + "log_file": "/root/logs", "command": { "args": [ "/bin/bash" @@ -12,12 +12,12 @@ "TERM=xterm" ] }, - "rootfs": "/var/lib/docker/containers/ee76122136d691d63e09d24168a91ddb2ef9fdcf210b4de5c50aa76354892f4b/root", "namespaces": [ "NEWIPC", "NEWNS", "NEWPID", - "NEWUTS" + "NEWUTS", + "NEWNET" ], "capabilities": [ "SETPCAP", @@ -34,5 +34,11 @@ "AUDIT_CONTROL", "MAC_OVERRIDE", "MAC_ADMIN" - ] + ], + "network": { + "ip": "172.17.0.100/16", + "gateway": "172.17.42.1", + "bridge": "docker0", + "mtu": 1500 + } } diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go index 2ecce22..05512e6 100644 --- a/libcontainer/network/veth.go +++ b/libcontainer/network/veth.go @@ -3,18 +3,16 @@ package network import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" - "os" - "syscall" ) // SetupVeth sets up an existing network namespace with the specified // network configuration. -func SetupVeth(config *libcontainer.Network) error { - if err := InterfaceDown(config.TempVethName); err != nil { - return fmt.Errorf("interface down %s %s", config.TempVethName, err) +func SetupVeth(config *libcontainer.Network, tempVethName string) error { + if err := InterfaceDown(tempVethName); err != nil { + return fmt.Errorf("interface down %s %s", tempVethName, err) } - if err := ChangeInterfaceName(config.TempVethName, "eth0"); err != nil { - return fmt.Errorf("change %s to eth0 %s", config.TempVethName, err) + if err := ChangeInterfaceName(tempVethName, "eth0"); err != nil { + return fmt.Errorf("change %s to eth0 %s", tempVethName, err) } if err := SetInterfaceIp("eth0", config.IP); err != nil { return fmt.Errorf("set eth0 ip %s", err) @@ -41,29 +39,3 @@ func SetupVeth(config *libcontainer.Network) error { } return nil } - -// SetupNamespaceMountDir prepares a new root for use as a mount -// source for bind mounting namespace fd to an outside path -func SetupNamespaceMountDir(root string) error { - if err := os.MkdirAll(root, 0666); err != nil { - return err - } - // make sure mounts are not unmounted by other mnt namespaces - if err := syscall.Mount("", root, "none", syscall.MS_SHARED|syscall.MS_REC, ""); err != nil && err != syscall.EINVAL { - return err - } - if err := syscall.Mount(root, root, "none", syscall.MS_BIND, ""); err != nil { - return err - } - return nil -} - -// DeleteNetworkNamespace unmounts the binding path and removes the -// file so that no references to the fd are present and the network -// namespace is automatically cleaned up -func DeleteNetworkNamespace(bindingPath string) error { - if err := syscall.Unmount(bindingPath, 0); err != nil { - return err - } - return os.Remove(bindingPath) -} diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index ef81b0e..9cd1741 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -1,7 +1,9 @@ package main import ( + "fmt" "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/term" "io" @@ -25,11 +27,34 @@ func execCommand(container *libcontainer.Container) (pid int, err error) { Cloneflags: flag, } + inPipe, err := command.StdinPipe() + if err != nil { + return -1, err + } + if err := command.Start(); err != nil { return -1, err } pid = command.Process.Pid + if container.Network != nil { + name1, name2, err := createVethPair() + if err != nil { + log.Fatal(err) + } + if err := network.SetInterfaceMaster(name1, container.Network.Bridge); err != nil { + log.Fatal(err) + } + if err := network.InterfaceUp(name1); err != nil { + log.Fatal(err) + } + if err := network.SetInterfaceInNamespacePid(name2, pid); err != nil { + log.Fatal(err) + } + fmt.Fprint(inPipe, name2) + inPipe.Close() + } + go func() { if _, err := io.Copy(os.Stdout, master); err != nil { log.Println(err) @@ -78,3 +103,11 @@ func createMasterAndConsole() (*os.File, string, error) { } return master, console, nil } + +func createVethPair() (name1 string, name2 string, err error) { + name1, name2 = "veth001", "veth002" + if err = network.CreateVethPair(name1, name2); err != nil { + return + } + return +} diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index b4b7de4..2804f01 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -5,7 +5,9 @@ import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" + "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" + "io/ioutil" "log" "os" "path/filepath" @@ -50,6 +52,12 @@ func main() { log.Fatal(err) } + data, err := ioutil.ReadAll(os.Stdin) + if err != nil { + log.Fatalf("error reading from stdin %s", err) + } + tempVethName := string(data) + // close pipes so that we can replace it with the pty os.Stdin.Close() os.Stdout.Close() @@ -81,7 +89,7 @@ func main() { } if container.Network != nil { - if err := setupNetworking(container); err != nil { + if err := setupNetworking(container, tempVethName); err != nil { log.Fatalf("setup networking %s", err) } } @@ -166,6 +174,6 @@ func setLogFile(container *libcontainer.Container) error { return nil } -func setupNetworking(conatiner *libcontainer.Container) error { - return nil +func setupNetworking(container *libcontainer.Container, tempVethName string) error { + return network.SetupVeth(container.Network, tempVethName) } diff --git a/libcontainer/ubuntu.json b/libcontainer/ubuntu.json deleted file mode 100644 index 0a450ae..0000000 --- a/libcontainer/ubuntu.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "id": "koye", - "namespace_pid": 3745, - "command": { - "args": [ - "/sbin/init" - ], - "environment": [ - "HOME=/", - "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", - "container=docker", - "TERM=xterm" - ] - }, - "rootfs": "/var/lib/docker/btrfs/subvolumes/7c0f15df1ad2e2fe04d7a6e079aec17406e9465a6a37dd16cb0dd754fc0167b3", - "namespaces": [ - "NEWIPC", - "NEWNS", - "NEWPID", - "NEWUTS" - ] -} From ab6864d0c0e4ca7de8a2646f4cf4de4348c682ea Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 15:54:53 -0800 Subject: [PATCH 10/64] Add dynamic veth name Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/network/veth.go | 41 -------------------------------- libcontainer/nsinit/exec.go | 10 +++++++- libcontainer/nsinit/init.go | 46 ++++++++++++++++++++++++++++-------- libcontainer/utils/utils.go | 24 +++---------------- 4 files changed, 48 insertions(+), 73 deletions(-) delete mode 100644 libcontainer/network/veth.go diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go deleted file mode 100644 index 05512e6..0000000 --- a/libcontainer/network/veth.go +++ /dev/null @@ -1,41 +0,0 @@ -package network - -import ( - "fmt" - "github.com/dotcloud/docker/pkg/libcontainer" -) - -// SetupVeth sets up an existing network namespace with the specified -// network configuration. -func SetupVeth(config *libcontainer.Network, tempVethName string) error { - if err := InterfaceDown(tempVethName); err != nil { - return fmt.Errorf("interface down %s %s", tempVethName, err) - } - if err := ChangeInterfaceName(tempVethName, "eth0"); err != nil { - return fmt.Errorf("change %s to eth0 %s", tempVethName, err) - } - if err := SetInterfaceIp("eth0", config.IP); err != nil { - return fmt.Errorf("set eth0 ip %s", err) - } - - if err := SetMtu("eth0", config.Mtu); err != nil { - return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) - } - if err := InterfaceUp("eth0"); err != nil { - return fmt.Errorf("eth0 up %s", err) - } - - if err := SetMtu("lo", config.Mtu); err != nil { - return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) - } - if err := InterfaceUp("lo"); err != nil { - return fmt.Errorf("lo up %s", err) - } - - if config.Gateway != "" { - if err := SetDefaultGateway(config.Gateway); err != nil { - return fmt.Errorf("set gateway to %s %s", config.Gateway, err) - } - } - return nil -} diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 9cd1741..e032407 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -4,6 +4,7 @@ import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/network" + "github.com/dotcloud/docker/pkg/libcontainer/utils" "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/term" "io" @@ -105,7 +106,14 @@ func createMasterAndConsole() (*os.File, string, error) { } func createVethPair() (name1 string, name2 string, err error) { - name1, name2 = "veth001", "veth002" + name1, err = utils.GenerateRandomName("dock", 4) + if err != nil { + return + } + name2, err = utils.GenerateRandomName("dock", 4) + if err != nil { + return + } if err = network.CreateVethPair(name1, name2); err != nil { return } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 2804f01..fe8fd4b 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -52,11 +52,14 @@ func main() { log.Fatal(err) } - data, err := ioutil.ReadAll(os.Stdin) - if err != nil { - log.Fatalf("error reading from stdin %s", err) + var tempVethName string + if container.Network != nil { + data, err := ioutil.ReadAll(os.Stdin) + if err != nil { + log.Fatalf("error reading from stdin %s", err) + } + tempVethName = string(data) } - tempVethName := string(data) // close pipes so that we can replace it with the pty os.Stdin.Close() @@ -73,7 +76,6 @@ func main() { if err := dupSlave(slave); err != nil { log.Fatalf("dup2 slave %s", err) } - if _, err := system.Setsid(); err != nil { log.Fatalf("setsid %s", err) } @@ -83,13 +85,11 @@ func main() { if err := system.ParentDeathSignal(); err != nil { log.Fatalf("parent deth signal %s", err) } - if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { log.Fatalf("setup mount namespace %s", err) } - if container.Network != nil { - if err := setupNetworking(container, tempVethName); err != nil { + if err := setupNetworking(container.Network, tempVethName); err != nil { log.Fatalf("setup networking %s", err) } } @@ -174,6 +174,32 @@ func setLogFile(container *libcontainer.Container) error { return nil } -func setupNetworking(container *libcontainer.Container, tempVethName string) error { - return network.SetupVeth(container.Network, tempVethName) +func setupNetworking(config *libcontainer.Network, tempVethName string) error { + if err := network.InterfaceDown(tempVethName); err != nil { + return fmt.Errorf("interface down %s %s", tempVethName, err) + } + if err := network.ChangeInterfaceName(tempVethName, "eth0"); err != nil { + return fmt.Errorf("change %s to eth0 %s", tempVethName, err) + } + if err := network.SetInterfaceIp("eth0", config.IP); err != nil { + return fmt.Errorf("set eth0 ip %s", err) + } + if err := network.SetMtu("eth0", config.Mtu); err != nil { + return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) + } + if err := network.InterfaceUp("eth0"); err != nil { + return fmt.Errorf("eth0 up %s", err) + } + if err := network.SetMtu("lo", config.Mtu); err != nil { + return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) + } + if err := network.InterfaceUp("lo"); err != nil { + return fmt.Errorf("lo up %s", err) + } + if config.Gateway != "" { + if err := network.SetDefaultGateway(config.Gateway); err != nil { + return fmt.Errorf("set gateway to %s %s", config.Gateway, err) + } + } + return nil } diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go index 7289fec..d3223c3 100644 --- a/libcontainer/utils/utils.go +++ b/libcontainer/utils/utils.go @@ -4,30 +4,12 @@ import ( "crypto/rand" "encoding/hex" "io" - "os" - "syscall" ) -func WaitOnPid(pid int) (exitcode int, err error) { - child, err := os.FindProcess(pid) - if err != nil { - return -1, err - } - state, err := child.Wait() - if err != nil { - return -1, err - } - return getExitCode(state), nil -} - -func getExitCode(state *os.ProcessState) int { - return state.Sys().(syscall.WaitStatus).ExitStatus() -} - -func GenerateRandomName(size int) (string, error) { - id := make([]byte, size) +func GenerateRandomName(prefix string, size int) (string, error) { + id := make([]byte, 32) if _, err := io.ReadFull(rand.Reader, id); err != nil { return "", err } - return hex.EncodeToString(id), nil + return prefix + hex.EncodeToString(id)[:size], nil } From 84ba029e25539405212cb2b626c7f737e4bb0cfb Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 16:40:36 -0800 Subject: [PATCH 11/64] General cleanup of libcontainer Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/errors.go | 9 ----- libcontainer/network/network.go | 26 -------------- libcontainer/nsinit/exec.go | 51 ++++++++++++-------------- libcontainer/nsinit/init.go | 63 +++++++++------------------------ libcontainer/nsinit/main.go | 42 ++++++++++++++++++++++ libcontainer/nsinit/mount.go | 61 ++++++++++++------------------- libcontainer/nsinit/ns_linux.go | 25 ++++--------- 7 files changed, 111 insertions(+), 166 deletions(-) delete mode 100644 libcontainer/errors.go create mode 100644 libcontainer/nsinit/main.go diff --git a/libcontainer/errors.go b/libcontainer/errors.go deleted file mode 100644 index c6964ee..0000000 --- a/libcontainer/errors.go +++ /dev/null @@ -1,9 +0,0 @@ -package libcontainer - -import ( - "errors" -) - -var ( - ErrInvalidPid = errors.New("no ns pid found") -) diff --git a/libcontainer/network/network.go b/libcontainer/network/network.go index 31c5d32..8c7a4b6 100644 --- a/libcontainer/network/network.go +++ b/libcontainer/network/network.go @@ -1,15 +1,10 @@ package network import ( - "errors" "github.com/dotcloud/docker/pkg/netlink" "net" ) -var ( - ErrNoDefaultRoute = errors.New("no default network route found") -) - func InterfaceUp(name string) error { iface, err := net.InterfaceByName(name) if err != nil { @@ -46,14 +41,6 @@ func SetInterfaceInNamespacePid(name string, nsPid int) error { return netlink.NetworkSetNsPid(iface, nsPid) } -func SetInterfaceInNamespaceFd(name string, fd int) error { - iface, err := net.InterfaceByName(name) - if err != nil { - return err - } - return netlink.NetworkSetNsFd(iface, fd) -} - func SetInterfaceMaster(name, master string) error { iface, err := net.InterfaceByName(name) if err != nil { @@ -89,16 +76,3 @@ func SetMtu(name string, mtu int) error { } return netlink.NetworkSetMTU(iface, mtu) } - -func GetDefaultMtu() (int, error) { - routes, err := netlink.NetworkGetRoutes() - if err != nil { - return -1, err - } - for _, r := range routes { - if r.Default { - return r.Iface.MTU, nil - } - } - return -1, ErrNoDefaultRoute -} diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index e032407..4ac070d 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -8,65 +8,54 @@ import ( "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/term" "io" - "log" + "io/ioutil" "os" "os/exec" "syscall" ) -func execCommand(container *libcontainer.Container) (pid int, err error) { +func execCommand(container *libcontainer.Container) (int, error) { master, console, err := createMasterAndConsole() if err != nil { return -1, err } - // we need CLONE_VFORK so we can wait on the child - flag := uintptr(getNamespaceFlags(container.Namespaces) | CLONE_VFORK) - - command := exec.Command("nsinit", console) + command := exec.Command("nsinit", "init", console) command.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: flag, + Cloneflags: uintptr(getNamespaceFlags(container.Namespaces) | syscall.CLONE_VFORK), // we need CLONE_VFORK so we can wait on the child } inPipe, err := command.StdinPipe() if err != nil { return -1, err } - if err := command.Start(); err != nil { return -1, err } - pid = command.Process.Pid + if err := writePidFile(command); err != nil { + return -1, err + } if container.Network != nil { name1, name2, err := createVethPair() if err != nil { - log.Fatal(err) + return -1, err } if err := network.SetInterfaceMaster(name1, container.Network.Bridge); err != nil { - log.Fatal(err) + return -1, err } if err := network.InterfaceUp(name1); err != nil { - log.Fatal(err) + return -1, err } - if err := network.SetInterfaceInNamespacePid(name2, pid); err != nil { - log.Fatal(err) + if err := network.SetInterfaceInNamespacePid(name2, command.Process.Pid); err != nil { + return -1, err } fmt.Fprint(inPipe, name2) inPipe.Close() } - go func() { - if _, err := io.Copy(os.Stdout, master); err != nil { - log.Println(err) - } - }() - - go func() { - if _, err := io.Copy(master, os.Stdin); err != nil { - log.Println(err) - } - }() + go io.Copy(os.Stdout, master) + go io.Copy(master, os.Stdin) ws, err := term.GetWinsize(os.Stdin.Fd()) if err != nil { @@ -83,9 +72,11 @@ func execCommand(container *libcontainer.Container) (pid int, err error) { defer term.RestoreTerminal(os.Stdin.Fd(), state) if err := command.Wait(); err != nil { - return pid, err + if _, ok := err.(*exec.ExitError); !ok { + return -1, err + } } - return pid, nil + return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } func createMasterAndConsole() (*os.File, string, error) { @@ -93,12 +84,10 @@ func createMasterAndConsole() (*os.File, string, error) { if err != nil { return nil, "", err } - console, err := system.Ptsname(master) if err != nil { return nil, "", err } - if err := system.Unlockpt(master); err != nil { return nil, "", err } @@ -119,3 +108,7 @@ func createVethPair() (name1 string, name2 string, err error) { } return } + +func writePidFile(command *exec.Cmd) error { + return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(command.Process.Pid)), 0655) +} diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index fe8fd4b..16a3081 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -1,7 +1,6 @@ package main import ( - "encoding/json" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" @@ -14,49 +13,21 @@ import ( "syscall" ) -func loadContainer() (*libcontainer.Container, error) { - f, err := os.Open("container.json") - if err != nil { - return nil, err - } - defer f.Close() - - var container *libcontainer.Container - if err := json.NewDecoder(f).Decode(&container); err != nil { - return nil, err - } - return container, nil -} - -func main() { - container, err := loadContainer() - if err != nil { - log.Fatal(err) - } - - if os.Args[1] == "exec" { - _, err := execCommand(container) - if err != nil { - log.Fatal(err) - } - os.Exit(0) - } - console := os.Args[1] - +func initCommand(container *libcontainer.Container, console string) error { if err := setLogFile(container); err != nil { - log.Fatal(err) + return err } rootfs, err := resolveRootfs() if err != nil { - log.Fatal(err) + return err } var tempVethName string if container.Network != nil { data, err := ioutil.ReadAll(os.Stdin) if err != nil { - log.Fatalf("error reading from stdin %s", err) + return fmt.Errorf("error reading from stdin %s", err) } tempVethName = string(data) } @@ -68,48 +39,48 @@ func main() { slave, err := openTerminal(console, syscall.O_RDWR) if err != nil { - log.Fatalf("open terminal %s", err) + return fmt.Errorf("open terminal %s", err) } if slave.Fd() != 0 { - log.Fatalf("slave fd should be 0") + return fmt.Errorf("slave fd should be 0") } if err := dupSlave(slave); err != nil { - log.Fatalf("dup2 slave %s", err) + return fmt.Errorf("dup2 slave %s", err) } if _, err := system.Setsid(); err != nil { - log.Fatalf("setsid %s", err) + return fmt.Errorf("setsid %s", err) } if err := system.Setctty(); err != nil { - log.Fatalf("setctty %s", err) + return fmt.Errorf("setctty %s", err) } if err := system.ParentDeathSignal(); err != nil { - log.Fatalf("parent deth signal %s", err) + return fmt.Errorf("parent deth signal %s", err) } if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { - log.Fatalf("setup mount namespace %s", err) + return fmt.Errorf("setup mount namespace %s", err) } if container.Network != nil { if err := setupNetworking(container.Network, tempVethName); err != nil { - log.Fatalf("setup networking %s", err) + return fmt.Errorf("setup networking %s", err) } } if err := system.Sethostname(container.ID); err != nil { - log.Fatalf("sethostname %s", err) + return fmt.Errorf("sethostname %s", err) } if err := capabilities.DropCapabilities(container); err != nil { - log.Fatalf("drop capabilities %s", err) + return fmt.Errorf("drop capabilities %s", err) } if err := setupUser(container); err != nil { - log.Fatalf("setup user %s", err) + return fmt.Errorf("setup user %s", err) } if container.WorkingDir != "" { if err := system.Chdir(container.WorkingDir); err != nil { - log.Fatalf("chdir to %s %s", container.WorkingDir, err) + return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) } } if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { - log.Fatalf("exec %s", err) + return fmt.Errorf("exec %s", err) } panic("unreachable") } diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go new file mode 100644 index 0000000..47abcce --- /dev/null +++ b/libcontainer/nsinit/main.go @@ -0,0 +1,42 @@ +package main + +import ( + "encoding/json" + "github.com/dotcloud/docker/pkg/libcontainer" + "log" + "os" +) + +func main() { + container, err := loadContainer() + if err != nil { + log.Fatal(err) + } + + switch os.Args[1] { + case "exec": + exitCode, err := execCommand(container) + if err != nil { + log.Fatal(err) + } + os.Exit(exitCode) + case "init": + if err := initCommand(container, os.Args[2]); err != nil { + log.Fatal(err) + } + } +} + +func loadContainer() (*libcontainer.Container, error) { + f, err := os.Open("container.json") + if err != nil { + return nil, err + } + defer f.Close() + + var container *libcontainer.Container + if err := json.NewDecoder(f).Decode(&container); err != nil { + return nil, err + } + return container, nil +} diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index f9ee969..13ee13e 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -3,68 +3,47 @@ package main import ( "fmt" "github.com/dotcloud/docker/pkg/system" - "log" "os" "path/filepath" "syscall" ) -var ( - // default mount point options - defaults = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV -) +// default mount point options +const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting / as slave %s", err) } - if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mouting %s as bind %s", rootfs, err) } - if readonly { if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting %s as readonly %s", rootfs, err) } } - if err := mountSystem(rootfs); err != nil { return fmt.Errorf("mount system %s", err) } - if err := copyDevNodes(rootfs); err != nil { return fmt.Errorf("copy dev nodes %s", err) } - - ptmx := filepath.Join(rootfs, "dev/ptmx") - if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { - return err - } - if err := os.Symlink("pts/ptmx", ptmx); err != nil { - return fmt.Errorf("symlink dev ptmx %s", err) - } - if err := setupDev(rootfs); err != nil { return err } - - if err := setupConsole(rootfs, console); err != nil { + if err := setupPtmx(rootfs, console); err != nil { return err } - if err := system.Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) } - if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { return fmt.Errorf("mount move %s into / %s", rootfs, err) } - if err := system.Chroot("."); err != nil { return fmt.Errorf("chroot . %s", err) } - if err := system.Chdir("/"); err != nil { return fmt.Errorf("chdir / %s", err) } @@ -90,13 +69,10 @@ func copyDevNodes(rootfs string) error { if err != nil { return err } - var ( dest = filepath.Join(rootfs, "dev", node) st = stat.Sys().(*syscall.Stat_t) ) - - log.Printf("copy %s to %s %d\n", node, dest, st.Rdev) if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { return fmt.Errorf("copy %s %s", node, err) } @@ -134,24 +110,22 @@ func setupConsole(rootfs, console string) error { if err != nil { return fmt.Errorf("stat console %s %s", console, err) } - st := stat.Sys().(*syscall.Stat_t) - - dest := filepath.Join(rootfs, "dev/console") + var ( + st = stat.Sys().(*syscall.Stat_t) + dest = filepath.Join(rootfs, "dev/console") + ) if err := os.Remove(dest); err != nil && !os.IsNotExist(err) { return fmt.Errorf("remove %s %s", dest, err) } - if err := os.Chmod(console, 0600); err != nil { return err } if err := os.Chown(console, 0, 0); err != nil { return err } - if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { return fmt.Errorf("mknod %s %s", dest, err) } - if err := system.Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { return fmt.Errorf("bind %s to %s %s", console, dest, err) } @@ -168,10 +142,10 @@ func mountSystem(rootfs string) error { flags int data string }{ - {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaults}, - {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaults}, + {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, + {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}, {source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: "mode=755"}, - {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaults, data: "mode=1777"}, + {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: "mode=1777"}, {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"}, {source: "tmpfs", path: filepath.Join(rootfs, "run"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_STRICTATIME, data: "mode=755"}, } { @@ -189,7 +163,7 @@ func remountProc() error { if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil { return err } - if err := system.Mount("proc", "/proc", "proc", uintptr(defaults), ""); err != nil { + if err := system.Mount("proc", "/proc", "proc", uintptr(defaultMountFlags), ""); err != nil { return err } return nil @@ -201,9 +175,20 @@ func remountSys() error { return err } } else { - if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaults), ""); err != nil { + if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaultMountFlags), ""); err != nil { return err } } return nil } + +func setupPtmx(rootfs, console string) error { + ptmx := filepath.Join(rootfs, "dev/ptmx") + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Symlink("pts/ptmx", ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + return setupConsole(rootfs, console) +} diff --git a/libcontainer/nsinit/ns_linux.go b/libcontainer/nsinit/ns_linux.go index b54bc2b..2392ffd 100644 --- a/libcontainer/nsinit/ns_linux.go +++ b/libcontainer/nsinit/ns_linux.go @@ -2,27 +2,16 @@ package main import ( "github.com/dotcloud/docker/pkg/libcontainer" -) - -const ( - SIGCHLD = 0x14 - CLONE_VFORK = 0x00004000 - CLONE_NEWNS = 0x00020000 - CLONE_NEWUTS = 0x04000000 - CLONE_NEWIPC = 0x08000000 - CLONE_NEWUSER = 0x10000000 - CLONE_NEWPID = 0x20000000 - CLONE_NEWNET = 0x40000000 + "syscall" ) var namespaceMap = map[libcontainer.Namespace]int{ - "": 0, - libcontainer.CLONE_NEWNS: CLONE_NEWNS, - libcontainer.CLONE_NEWUTS: CLONE_NEWUTS, - libcontainer.CLONE_NEWIPC: CLONE_NEWIPC, - libcontainer.CLONE_NEWUSER: CLONE_NEWUSER, - libcontainer.CLONE_NEWPID: CLONE_NEWPID, - libcontainer.CLONE_NEWNET: CLONE_NEWNET, + libcontainer.CLONE_NEWNS: syscall.CLONE_NEWNS, + libcontainer.CLONE_NEWUTS: syscall.CLONE_NEWUTS, + libcontainer.CLONE_NEWIPC: syscall.CLONE_NEWIPC, + libcontainer.CLONE_NEWUSER: syscall.CLONE_NEWUSER, + libcontainer.CLONE_NEWPID: syscall.CLONE_NEWPID, + libcontainer.CLONE_NEWNET: syscall.CLONE_NEWNET, } // getNamespaceFlags parses the container's Namespaces options to set the correct From b48bc85967d473743192951d3e9ed2f85bf5c0a2 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Wed, 19 Feb 2014 16:50:10 -0800 Subject: [PATCH 12/64] OSX compilation Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/nsinit/exec.go | 2 ++ libcontainer/nsinit/init.go | 2 ++ libcontainer/nsinit/main.go | 13 +++++++++++++ libcontainer/nsinit/mount.go | 2 ++ 4 files changed, 19 insertions(+) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 4ac070d..5b53be2 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -1,3 +1,5 @@ +// +build linux + package main import ( diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 16a3081..1c90ecc 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -1,3 +1,5 @@ +// +build linux + package main import ( diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go index 47abcce..c9f9d7b 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/main.go @@ -2,17 +2,27 @@ package main import ( "encoding/json" + "errors" "github.com/dotcloud/docker/pkg/libcontainer" "log" "os" ) +var ( + ErrUnsupported = errors.New("Unsupported method") + ErrWrongArguments = errors.New("Wrong argument count") +) + func main() { container, err := loadContainer() if err != nil { log.Fatal(err) } + argc := len(os.Args) + if argc < 2 { + log.Fatal(ErrWrongArguments) + } switch os.Args[1] { case "exec": exitCode, err := execCommand(container) @@ -21,6 +31,9 @@ func main() { } os.Exit(exitCode) case "init": + if argc != 3 { + log.Fatal(ErrWrongArguments) + } if err := initCommand(container, os.Args[2]); err != nil { log.Fatal(err) } diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 13ee13e..baa850f 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -1,3 +1,5 @@ +// +build linux + package main import ( From e3d5adc9e2b4a6d7e6009ce3534a1cb2c4cbf363 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 19:14:31 -0800 Subject: [PATCH 13/64] Refactor large funcs Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/container.go | 5 +-- libcontainer/container.json | 3 +- libcontainer/nsinit/exec.go | 60 ++++++++++++++++--------- libcontainer/nsinit/init.go | 87 +++++++++++++++++-------------------- libcontainer/types.go | 48 ++++++++++---------- 5 files changed, 107 insertions(+), 96 deletions(-) diff --git a/libcontainer/container.go b/libcontainer/container.go index 3f3961d..c8dbdd6 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -1,14 +1,13 @@ package libcontainer type Container struct { - ID string `json:"id,omitempty"` - Command *Command `json:"command,omitempty"` + Hostname string `json:"hostname,omitempty"` ReadonlyFs bool `json:"readonly_fs,omitempty"` User string `json:"user,omitempty"` WorkingDir string `json:"working_dir,omitempty"` + Command *Command `json:"command,omitempty"` Namespaces Namespaces `json:"namespaces,omitempty"` Capabilities Capabilities `json:"capabilities,omitempty"` - LogFile string `json:"log_file,omitempty"` Network *Network `json:"network,omitempty"` } diff --git a/libcontainer/container.json b/libcontainer/container.json index 8731170..2abf01a 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -1,6 +1,5 @@ { "id": "koye", - "log_file": "/root/logs", "command": { "args": [ "/bin/bash" @@ -9,7 +8,7 @@ "HOME=/", "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", "container=docker", - "TERM=xterm" + "TERM=xterm-256color" ] }, "namespaces": [ diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 5b53be2..4abebd2 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -27,6 +27,8 @@ func execCommand(container *libcontainer.Container) (int, error) { Cloneflags: uintptr(getNamespaceFlags(container.Namespaces) | syscall.CLONE_VFORK), // we need CLONE_VFORK so we can wait on the child } + // create a pipe so that we can syncronize with the namespaced process and + // pass the veth name to the child inPipe, err := command.StdinPipe() if err != nil { return -1, err @@ -39,34 +41,17 @@ func execCommand(container *libcontainer.Container) (int, error) { } if container.Network != nil { - name1, name2, err := createVethPair() + vethPair, err := setupVeth(container.Network.Bridge, command.Process.Pid) if err != nil { return -1, err } - if err := network.SetInterfaceMaster(name1, container.Network.Bridge); err != nil { - return -1, err - } - if err := network.InterfaceUp(name1); err != nil { - return -1, err - } - if err := network.SetInterfaceInNamespacePid(name2, command.Process.Pid); err != nil { - return -1, err - } - fmt.Fprint(inPipe, name2) - inPipe.Close() + sendVethName(vethPair, inPipe) } go io.Copy(os.Stdout, master) go io.Copy(master, os.Stdin) - ws, err := term.GetWinsize(os.Stdin.Fd()) - if err != nil { - return -1, err - } - if err := term.SetWinsize(master.Fd(), ws); err != nil { - return -1, err - } - state, err := term.SetRawTerminal(os.Stdin.Fd()) + state, err := setupWindow(master) if err != nil { command.Process.Kill() return -1, err @@ -81,6 +66,41 @@ func execCommand(container *libcontainer.Container) (int, error) { return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } +func sendVethName(name string, pipe io.WriteCloser) { + // write the veth pair name to the child's stdin then close the + // pipe so that the child stops waiting + fmt.Fprint(pipe, name) + pipe.Close() +} + +func setupVeth(bridge string, nspid int) (string, error) { + name1, name2, err := createVethPair() + if err != nil { + return "", err + } + if err := network.SetInterfaceMaster(name1, bridge); err != nil { + return "", err + } + if err := network.InterfaceUp(name1); err != nil { + return "", err + } + if err := network.SetInterfaceInNamespacePid(name2, nspid); err != nil { + return "", err + } + return name2, nil +} + +func setupWindow(master *os.File) (*term.State, error) { + ws, err := term.GetWinsize(os.Stdin.Fd()) + if err != nil { + return nil, err + } + if err := term.SetWinsize(master.Fd(), ws); err != nil { + return nil, err + } + return term.SetRawTerminal(os.Stdin.Fd()) +} + func createMasterAndConsole() (*os.File, string, error) { master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) if err != nil { diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 1c90ecc..d853a32 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -9,17 +9,12 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" "io/ioutil" - "log" "os" "path/filepath" "syscall" ) func initCommand(container *libcontainer.Container, console string) error { - if err := setLogFile(container); err != nil { - return err - } - rootfs, err := resolveRootfs() if err != nil { return err @@ -27,11 +22,10 @@ func initCommand(container *libcontainer.Container, console string) error { var tempVethName string if container.Network != nil { - data, err := ioutil.ReadAll(os.Stdin) + tempVethName, err = getVethName() if err != nil { - return fmt.Errorf("error reading from stdin %s", err) + return err } - tempVethName = string(data) } // close pipes so that we can replace it with the pty @@ -61,13 +55,10 @@ func initCommand(container *libcontainer.Container, console string) error { if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { return fmt.Errorf("setup mount namespace %s", err) } - if container.Network != nil { - if err := setupNetworking(container.Network, tempVethName); err != nil { - return fmt.Errorf("setup networking %s", err) - } + if err := setupNetworking(container.Network, tempVethName); err != nil { + return fmt.Errorf("setup networking %s", err) } - - if err := system.Sethostname(container.ID); err != nil { + if err := system.Sethostname(container.Hostname); err != nil { return fmt.Errorf("sethostname %s", err) } if err := capabilities.DropCapabilities(container); err != nil { @@ -136,43 +127,45 @@ func openTerminal(name string, flag int) (*os.File, error) { return os.NewFile(uintptr(r), name), nil } -func setLogFile(container *libcontainer.Container) error { - if container.LogFile != "" { - f, err := os.OpenFile(container.LogFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0655) - if err != nil { - return err +func setupNetworking(config *libcontainer.Network, tempVethName string) error { + if config != nil { + if err := network.InterfaceDown(tempVethName); err != nil { + return fmt.Errorf("interface down %s %s", tempVethName, err) + } + if err := network.ChangeInterfaceName(tempVethName, "eth0"); err != nil { + return fmt.Errorf("change %s to eth0 %s", tempVethName, err) + } + if err := network.SetInterfaceIp("eth0", config.IP); err != nil { + return fmt.Errorf("set eth0 ip %s", err) + } + if err := network.SetMtu("eth0", config.Mtu); err != nil { + return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) + } + if err := network.InterfaceUp("eth0"); err != nil { + return fmt.Errorf("eth0 up %s", err) + } + if err := network.SetMtu("lo", config.Mtu); err != nil { + return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) + } + if err := network.InterfaceUp("lo"); err != nil { + return fmt.Errorf("lo up %s", err) + } + if config.Gateway != "" { + if err := network.SetDefaultGateway(config.Gateway); err != nil { + return fmt.Errorf("set gateway to %s %s", config.Gateway, err) + } } - log.SetOutput(f) } return nil } -func setupNetworking(config *libcontainer.Network, tempVethName string) error { - if err := network.InterfaceDown(tempVethName); err != nil { - return fmt.Errorf("interface down %s %s", tempVethName, err) +// getVethName reads from Stdin the temp veth name +// sent by the parent processes after the veth pair +// has been created and setup +func getVethName() (string, error) { + data, err := ioutil.ReadAll(os.Stdin) + if err != nil { + return "", fmt.Errorf("error reading from stdin %s", err) } - if err := network.ChangeInterfaceName(tempVethName, "eth0"); err != nil { - return fmt.Errorf("change %s to eth0 %s", tempVethName, err) - } - if err := network.SetInterfaceIp("eth0", config.IP); err != nil { - return fmt.Errorf("set eth0 ip %s", err) - } - if err := network.SetMtu("eth0", config.Mtu); err != nil { - return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) - } - if err := network.InterfaceUp("eth0"); err != nil { - return fmt.Errorf("eth0 up %s", err) - } - if err := network.SetMtu("lo", config.Mtu); err != nil { - return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) - } - if err := network.InterfaceUp("lo"); err != nil { - return fmt.Errorf("lo up %s", err) - } - if config.Gateway != "" { - if err := network.SetDefaultGateway(config.Gateway); err != nil { - return fmt.Errorf("set gateway to %s %s", config.Gateway, err) - } - } - return nil + return string(data), nil } diff --git a/libcontainer/types.go b/libcontainer/types.go index db1c3b9..b5d9932 100644 --- a/libcontainer/types.go +++ b/libcontainer/types.go @@ -1,29 +1,5 @@ package libcontainer -type Namespace string -type Namespaces []Namespace - -func (n Namespaces) Contains(ns Namespace) bool { - for _, nns := range n { - if nns == ns { - return true - } - } - return false -} - -type Capability string -type Capabilities []Capability - -func (c Capabilities) Contains(capp Capability) bool { - for _, cc := range c { - if cc == capp { - return true - } - } - return false -} - const ( CAP_SETPCAP Capability = "SETPCAP" CAP_SYS_MODULE Capability = "SYS_MODULE" @@ -47,3 +23,27 @@ const ( CLONE_NEWPID Namespace = "NEWPID" // pid CLONE_NEWNET Namespace = "NEWNET" // network ) + +type Namespace string +type Namespaces []Namespace + +func (n Namespaces) Contains(ns Namespace) bool { + for _, nns := range n { + if nns == ns { + return true + } + } + return false +} + +type Capability string +type Capabilities []Capability + +func (c Capabilities) Contains(capp Capability) bool { + for _, cc := range c { + if cc == capp { + return true + } + } + return false +} From 663518ba66f50b97144e8fb8da8a175b28f5372d Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 19:53:25 -0800 Subject: [PATCH 14/64] Add execin function to running a process in a namespace Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/container.json | 2 +- libcontainer/nsinit/execin.go | 115 ++++++++++++++++++++++++++++++++ libcontainer/nsinit/main.go | 8 +++ libcontainer/nsinit/ns_linux.go | 9 +++ 4 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 libcontainer/nsinit/execin.go diff --git a/libcontainer/container.json b/libcontainer/container.json index 2abf01a..c5807a7 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -1,5 +1,5 @@ { - "id": "koye", + "hostname": "koye", "command": { "args": [ "/bin/bash" diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go new file mode 100644 index 0000000..362cf5a --- /dev/null +++ b/libcontainer/nsinit/execin.go @@ -0,0 +1,115 @@ +package main + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/capabilities" + "github.com/dotcloud/docker/pkg/system" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "syscall" +) + +func execinCommand(container *libcontainer.Container) (int, error) { + nspid, err := readPid() + if err != nil { + return -1, err + } + + for _, ns := range container.Namespaces { + if err := system.Unshare(namespaceMap[ns]); err != nil { + return -1, err + } + } + fds, err := getNsFds(nspid, container) + closeFds := func() { + for _, f := range fds { + system.Closefd(f) + } + } + if err != nil { + closeFds() + return -1, err + } + + for _, fd := range fds { + if fd > 0 { + if err := system.Setns(fd, 0); err != nil { + closeFds() + return -1, fmt.Errorf("setns %s", err) + } + } + system.Closefd(fd) + } + + // if the container has a new pid and mount namespace we need to + // remount proc and sys to pick up the changes + if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && + container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { + + pid, err := system.Fork() + if err != nil { + return -1, err + } + if pid == 0 { + // TODO: make all raw syscalls to be fork safe + if err := system.Unshare(syscall.CLONE_NEWNS); err != nil { + return -1, err + } + if err := remountProc(); err != nil { + return -1, fmt.Errorf("remount proc %s", err) + } + if err := remountSys(); err != nil { + return -1, fmt.Errorf("remount sys %s", err) + } + if err := capabilities.DropCapabilities(container); err != nil { + return -1, fmt.Errorf("drop capabilities %s", err) + } + if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + return -1, err + } + } + proc, err := os.FindProcess(pid) + if err != nil { + return -1, err + } + state, err := proc.Wait() + if err != nil { + return -1, err + } + os.Exit(state.Sys().(syscall.WaitStatus).ExitStatus()) + } + if err := capabilities.DropCapabilities(container); err != nil { + return -1, fmt.Errorf("drop capabilities %s", err) + } + if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + return -1, err + } + panic("unreachable") +} + +func readPid() (int, error) { + data, err := ioutil.ReadFile(".nspid") + if err != nil { + return -1, err + } + pid, err := strconv.Atoi(string(data)) + if err != nil { + return -1, err + } + return pid, nil +} + +func getNsFds(pid int, container *libcontainer.Container) ([]uintptr, error) { + fds := make([]uintptr, len(container.Namespaces)) + for i, ns := range container.Namespaces { + f, err := os.OpenFile(filepath.Join("/proc/", strconv.Itoa(pid), "ns", namespaceFileMap[ns]), os.O_RDONLY, 0) + if err != nil { + return fds, err + } + fds[i] = f.Fd() + } + return fds, nil +} diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go index c9f9d7b..8fe700e 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/main.go @@ -37,6 +37,14 @@ func main() { if err := initCommand(container, os.Args[2]); err != nil { log.Fatal(err) } + case "execin": + exitCode, err := execinCommand(container) + if err != nil { + log.Fatal(err) + } + os.Exit(exitCode) + default: + log.Fatalf("command not supported for nsinit %s", os.Args[1]) } } diff --git a/libcontainer/nsinit/ns_linux.go b/libcontainer/nsinit/ns_linux.go index 2392ffd..a2809eb 100644 --- a/libcontainer/nsinit/ns_linux.go +++ b/libcontainer/nsinit/ns_linux.go @@ -14,6 +14,15 @@ var namespaceMap = map[libcontainer.Namespace]int{ libcontainer.CLONE_NEWNET: syscall.CLONE_NEWNET, } +var namespaceFileMap = map[libcontainer.Namespace]string{ + libcontainer.CLONE_NEWNS: "mnt", + libcontainer.CLONE_NEWUTS: "uts", + libcontainer.CLONE_NEWIPC: "ipc", + libcontainer.CLONE_NEWUSER: "user", + libcontainer.CLONE_NEWPID: "pid", + libcontainer.CLONE_NEWNET: "net", +} + // getNamespaceFlags parses the container's Namespaces options to set the correct // flags on clone, unshare, and setns func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { From bb59129b2f1190de245742bc12a7bed2c9c0b5b9 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 20:35:04 -0800 Subject: [PATCH 15/64] Refactor to remove cmd from container Pass the container's command via args Remove execin function and just look for an existing nspid file to join the namespace Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/container.go | 7 +------ libcontainer/container.json | 17 ++++++---------- libcontainer/nsinit/exec.go | 21 ++++++++++++++------ libcontainer/nsinit/execin.go | 24 +++-------------------- libcontainer/nsinit/init.go | 4 ++-- libcontainer/nsinit/main.go | 37 ++++++++++++++++++++++++++--------- 6 files changed, 55 insertions(+), 55 deletions(-) diff --git a/libcontainer/container.go b/libcontainer/container.go index c8dbdd6..763526f 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -5,17 +5,12 @@ type Container struct { ReadonlyFs bool `json:"readonly_fs,omitempty"` User string `json:"user,omitempty"` WorkingDir string `json:"working_dir,omitempty"` - Command *Command `json:"command,omitempty"` + Env []string `json:"environment,omitempty"` Namespaces Namespaces `json:"namespaces,omitempty"` Capabilities Capabilities `json:"capabilities,omitempty"` Network *Network `json:"network,omitempty"` } -type Command struct { - Args []string `json:"args,omitempty"` - Env []string `json:"environment,omitempty"` -} - type Network struct { IP string `json:"ip,omitempty"` Gateway string `json:"gateway,omitempty"` diff --git a/libcontainer/container.json b/libcontainer/container.json index c5807a7..ccc9abb 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -1,16 +1,11 @@ { "hostname": "koye", - "command": { - "args": [ - "/bin/bash" - ], - "environment": [ - "HOME=/", - "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", - "container=docker", - "TERM=xterm-256color" - ] - }, + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm-256color" + ], "namespaces": [ "NEWIPC", "NEWNS", diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 4abebd2..67f907a 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -16,17 +16,13 @@ import ( "syscall" ) -func execCommand(container *libcontainer.Container) (int, error) { +func execCommand(container *libcontainer.Container, args []string) (int, error) { master, console, err := createMasterAndConsole() if err != nil { return -1, err } - command := exec.Command("nsinit", "init", console) - command.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: uintptr(getNamespaceFlags(container.Namespaces) | syscall.CLONE_VFORK), // we need CLONE_VFORK so we can wait on the child - } - + command := createCommand(container, console, args) // create a pipe so that we can syncronize with the namespaced process and // pass the veth name to the child inPipe, err := command.StdinPipe() @@ -39,6 +35,7 @@ func execCommand(container *libcontainer.Container) (int, error) { if err := writePidFile(command); err != nil { return -1, err } + defer deletePidFile() if container.Network != nil { vethPair, err := setupVeth(container.Network.Bridge, command.Process.Pid) @@ -134,3 +131,15 @@ func createVethPair() (name1 string, name2 string, err error) { func writePidFile(command *exec.Cmd) error { return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(command.Process.Pid)), 0655) } + +func deletePidFile() error { + return os.Remove(".nspid") +} + +func createCommand(container *libcontainer.Container, console string, args []string) *exec.Cmd { + command := exec.Command("nsinit", append([]string{"init", console}, args...)...) + command.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: uintptr(getNamespaceFlags(container.Namespaces) | syscall.CLONE_VFORK), // we need CLONE_VFORK so we can wait on the child + } + return command +} diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 362cf5a..7f32620 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -5,19 +5,13 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/system" - "io/ioutil" "os" "path/filepath" "strconv" "syscall" ) -func execinCommand(container *libcontainer.Container) (int, error) { - nspid, err := readPid() - if err != nil { - return -1, err - } - +func execinCommand(container *libcontainer.Container, nspid int, args []string) (int, error) { for _, ns := range container.Namespaces { if err := system.Unshare(namespaceMap[ns]); err != nil { return -1, err @@ -67,7 +61,7 @@ func execinCommand(container *libcontainer.Container) (int, error) { if err := capabilities.DropCapabilities(container); err != nil { return -1, fmt.Errorf("drop capabilities %s", err) } - if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + if err := system.Exec(args[0], args[0:], container.Env); err != nil { return -1, err } } @@ -84,24 +78,12 @@ func execinCommand(container *libcontainer.Container) (int, error) { if err := capabilities.DropCapabilities(container); err != nil { return -1, fmt.Errorf("drop capabilities %s", err) } - if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + if err := system.Exec(args[0], args[0:], container.Env); err != nil { return -1, err } panic("unreachable") } -func readPid() (int, error) { - data, err := ioutil.ReadFile(".nspid") - if err != nil { - return -1, err - } - pid, err := strconv.Atoi(string(data)) - if err != nil { - return -1, err - } - return pid, nil -} - func getNsFds(pid int, container *libcontainer.Container) ([]uintptr, error) { fds := make([]uintptr, len(container.Namespaces)) for i, ns := range container.Namespaces { diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index d853a32..82706fd 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -14,7 +14,7 @@ import ( "syscall" ) -func initCommand(container *libcontainer.Container, console string) error { +func initCommand(container *libcontainer.Container, console string, args []string) error { rootfs, err := resolveRootfs() if err != nil { return err @@ -72,7 +72,7 @@ func initCommand(container *libcontainer.Container, console string) error { return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) } } - if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + if err := system.Exec(args[0], args[0:], container.Env); err != nil { return fmt.Errorf("exec %s", err) } panic("unreachable") diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go index 8fe700e..30c8b06 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/main.go @@ -4,8 +4,10 @@ import ( "encoding/json" "errors" "github.com/dotcloud/docker/pkg/libcontainer" + "io/ioutil" "log" "os" + "strconv" ) var ( @@ -25,24 +27,29 @@ func main() { } switch os.Args[1] { case "exec": - exitCode, err := execCommand(container) + var exitCode int + nspid, err := readPid() + if err != nil { + if !os.IsNotExist(err) { + log.Fatal(err) + } + } + if nspid > 0 { + exitCode, err = execinCommand(container, nspid, os.Args[2:]) + } else { + exitCode, err = execCommand(container, os.Args[2:]) + } if err != nil { log.Fatal(err) } os.Exit(exitCode) case "init": - if argc != 3 { + if argc < 3 { log.Fatal(ErrWrongArguments) } - if err := initCommand(container, os.Args[2]); err != nil { + if err := initCommand(container, os.Args[2], os.Args[3:]); err != nil { log.Fatal(err) } - case "execin": - exitCode, err := execinCommand(container) - if err != nil { - log.Fatal(err) - } - os.Exit(exitCode) default: log.Fatalf("command not supported for nsinit %s", os.Args[1]) } @@ -61,3 +68,15 @@ func loadContainer() (*libcontainer.Container, error) { } return container, nil } + +func readPid() (int, error) { + data, err := ioutil.ReadFile(".nspid") + if err != nil { + return -1, err + } + pid, err := strconv.Atoi(string(data)) + if err != nil { + return -1, err + } + return pid, nil +} From 3ded9b431a10ec555bc64bd313db56e0eb0766c3 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 21:15:44 -0800 Subject: [PATCH 16/64] Update readme and add TODO Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 75 +++++++++++++++++++++++++----------------- libcontainer/TODO.md | 17 ++++++++++ 2 files changed, 61 insertions(+), 31 deletions(-) create mode 100644 libcontainer/TODO.md diff --git a/libcontainer/README.md b/libcontainer/README.md index 91d7478..07fe4f7 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -1,39 +1,34 @@ ## libcontainer - reference implementation for containers -#### playground +#### background + +libcontainer specifies configuration options for what a container is. It provides a native Go implementation +for using linux namespaces with no external dependencies. libcontainer provides many convience functions for working with namespaces, networking, and management. -Use the cli package to test out functionality - -First setup a container configuration. You will need a root fs, better go the path to a -stopped docker container and use that. - +#### container +A container is a self contained directory that is able to run one or more processes inside without +affecting the host system. The directory is usually a full system tree. Inside the directory +a `container.json` file just be placed with the runtime configuration for how the process +should be contained and run. Environment, networking, and different capabilities for the +process are specified in this file. +Sample `container.json` file: ```json { - "id": "koye", - "namespace_pid": 12265, - "command": { - "args": [ - "/bin/bash" - ], - "environment": [ - "HOME=/", - "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", - "container=docker", - "TERM=xterm" - ] - }, - "rootfs": "/root/development/gocode/src/github.com/docker/libcontainer/namespaces/ubuntu", - "network": null, - "user": "", - "working_dir": "", + "hostname": "koye", + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm-256color" + ], "namespaces": [ - "NEWNET", "NEWIPC", "NEWNS", "NEWPID", - "NEWUTS" + "NEWUTS", + "NEWNET" ], "capabilities": [ "SETPCAP", @@ -50,14 +45,32 @@ stopped docker container and use that. "AUDIT_CONTROL", "MAC_OVERRIDE", "MAC_ADMIN" - ] + ], + "network": { + "ip": "172.17.0.100/16", + "gateway": "172.17.42.1", + "bridge": "docker0", + "mtu": 1500 + } } ``` -After you have a json file and a rootfs path to use just run: -`./cli exec container.json` +Using this configuration and the current directory holding the rootfs for a process to live, one can se libcontainer to exec the container. Running the life of the namespace a `.nspid` file +is written to the current directory with the pid of the namespace'd process to the external word. A client can use this pid to wait, kill, or perform other operation with the container. If a user tries to run an new process inside an existing container with a live namespace with namespace will be joined by the new process. -If you want to attach to an existing namespace just use the same json -file with the container still running and do: -`./cli execin container.json` +#### nsinit + +`nsinit` is a cli application used as the reference implementation of libcontainer. It is able to +spawn or join new containers giving the current directory. To use `nsinit` cd into a linux +rootfs and copy a `container.json` file into the directory with your specified configuration. + +To execution `/bin/bash` in the current directory as a container just run: +```bash +nsinit exec /bin/bash +``` + +If you wish to spawn another process inside the container while your current bash session is +running just run the exact same command again to get another bash shell or change the command. If the original process dies, PID 1, all other processes spawned inside the container will also be killed and the namespace will be removed. + +You can identify if a process is running in a container by looking to see if `.nspid` is in the root of the directory. diff --git a/libcontainer/TODO.md b/libcontainer/TODO.md new file mode 100644 index 0000000..f18c0b4 --- /dev/null +++ b/libcontainer/TODO.md @@ -0,0 +1,17 @@ +#### goals +* small and simple - line count is not everything but less code is better +* clean lines between what we do in the pkg +* provide primitives for working with namespaces not cater to every option +* extend via configuration not by features - host networking, no networking, veth network can be accomplished via adjusting the container.json, nothing to do with code + +#### tasks +* proper tty for a new process in an existing container +* use exec or raw syscalls for new process in existing container +* setup proper user in namespace if specified +* implement hook or clean interface for cgroups +* example configs for different setups (host networking, boot init) +* improve pkg documentation with comments +* testing - this is hard in a low level pkg but we could do some, maybe +* pivot root +* selinux +* apparmor From ca0e7f087cadb8e20759aee81668dcf1f936c8a6 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 21:21:49 -0800 Subject: [PATCH 17/64] Add CAP_NET_ADMIN Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/capabilities/capabilities.go | 1 + libcontainer/types.go | 1 + 2 files changed, 2 insertions(+) diff --git a/libcontainer/capabilities/capabilities.go b/libcontainer/capabilities/capabilities.go index 3301e10..c19b719 100644 --- a/libcontainer/capabilities/capabilities.go +++ b/libcontainer/capabilities/capabilities.go @@ -21,6 +21,7 @@ var capMap = map[libcontainer.Capability]capability.Cap{ libcontainer.CAP_AUDIT_CONTROL: capability.CAP_AUDIT_CONTROL, libcontainer.CAP_MAC_OVERRIDE: capability.CAP_MAC_OVERRIDE, libcontainer.CAP_MAC_ADMIN: capability.CAP_MAC_ADMIN, + libcontainer.CAP_NET_ADMIN: capability.CAP_NET_ADMIN, } // DropCapabilities drops capabilities for the current process based diff --git a/libcontainer/types.go b/libcontainer/types.go index b5d9932..fcd00fd 100644 --- a/libcontainer/types.go +++ b/libcontainer/types.go @@ -15,6 +15,7 @@ const ( CAP_AUDIT_CONTROL Capability = "AUDIT_CONTROL" CAP_MAC_OVERRIDE Capability = "MAC_OVERRIDE" CAP_MAC_ADMIN Capability = "MAC_ADMIN" + CAP_NET_ADMIN Capability = "NET_ADMIN" CLONE_NEWNS Namespace = "NEWNS" // mount CLONE_NEWUTS Namespace = "NEWUTS" // utsname From c20c1dfb04c6b47f2febe60dda05d85309884a07 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 22:43:40 -0800 Subject: [PATCH 18/64] Add comments to many functions Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/capabilities/capabilities.go | 1 + libcontainer/container.go | 22 +++++++----- libcontainer/nsinit/exec.go | 23 ++++++++++--- libcontainer/nsinit/execin.go | 10 ++---- libcontainer/nsinit/init.go | 18 ++++++---- libcontainer/nsinit/main.go | 4 +-- libcontainer/nsinit/mount.go | 41 ++++++++++++++++------- libcontainer/nsinit/ns_linux.go | 3 ++ libcontainer/types.go | 18 +++++++--- libcontainer/utils/utils.go | 2 ++ 10 files changed, 97 insertions(+), 45 deletions(-) diff --git a/libcontainer/capabilities/capabilities.go b/libcontainer/capabilities/capabilities.go index c19b719..65fd455 100644 --- a/libcontainer/capabilities/capabilities.go +++ b/libcontainer/capabilities/capabilities.go @@ -41,6 +41,7 @@ func DropCapabilities(container *libcontainer.Container) error { return nil } +// getCapabilities returns the specific cap values for the libcontainer types func getCapabilities(container *libcontainer.Container) []capability.Cap { drop := []capability.Cap{} for _, c := range container.Capabilities { diff --git a/libcontainer/container.go b/libcontainer/container.go index 763526f..a6a57da 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -1,16 +1,22 @@ package libcontainer +// Container defines configuration options for how a +// container is setup inside a directory and how a process should be executed type Container struct { - Hostname string `json:"hostname,omitempty"` - ReadonlyFs bool `json:"readonly_fs,omitempty"` - User string `json:"user,omitempty"` - WorkingDir string `json:"working_dir,omitempty"` - Env []string `json:"environment,omitempty"` - Namespaces Namespaces `json:"namespaces,omitempty"` - Capabilities Capabilities `json:"capabilities,omitempty"` - Network *Network `json:"network,omitempty"` + Hostname string `json:"hostname,omitempty"` // hostname + ReadonlyFs bool `json:"readonly_fs,omitempty"` // set the containers rootfs as readonly + User string `json:"user,omitempty"` // user to execute the process as + WorkingDir string `json:"working_dir,omitempty"` // current working directory + Env []string `json:"environment,omitempty"` // environment to set + Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply + Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop + Network *Network `json:"network,omitempty"` // nil for host's network stack } +// Network defines configuration for a container's networking stack +// +// The network configuration can be omited from a container causing the +// container to be setup with the host's networking stack type Network struct { IP string `json:"ip,omitempty"` Gateway string `json:"gateway,omitempty"` diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 67f907a..202cfca 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -38,7 +38,7 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) defer deletePidFile() if container.Network != nil { - vethPair, err := setupVeth(container.Network.Bridge, command.Process.Pid) + vethPair, err := initializeContainerVeth(container.Network.Bridge, command.Process.Pid) if err != nil { return -1, err } @@ -63,14 +63,21 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } +// sendVethName writes the veth pair name to the child's stdin then closes the +// pipe so that the child stops waiting for more data func sendVethName(name string, pipe io.WriteCloser) { - // write the veth pair name to the child's stdin then close the - // pipe so that the child stops waiting fmt.Fprint(pipe, name) pipe.Close() } -func setupVeth(bridge string, nspid int) (string, error) { +// initializeContainerVeth will create a veth pair and setup the host's +// side of the pair by setting the specified bridge as the master and bringing +// up the interface. +// +// Then will with set the other side of the veth pair into the container's namespaced +// using the pid and returns the veth's interface name to provide to the container to +// finish setting up the interface inside the namespace +func initializeContainerVeth(bridge string, nspid int) (string, error) { name1, name2, err := createVethPair() if err != nil { return "", err @@ -98,6 +105,8 @@ func setupWindow(master *os.File) (*term.State, error) { return term.SetRawTerminal(os.Stdin.Fd()) } +// createMasterAndConsole will open /dev/ptmx on the host and retreive the +// pts name for use as the pty slave inside the container func createMasterAndConsole() (*os.File, string, error) { master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) if err != nil { @@ -113,6 +122,8 @@ func createMasterAndConsole() (*os.File, string, error) { return master, console, nil } +// createVethPair will automatically generage two random names for +// the veth pair and ensure that they have been created func createVethPair() (name1 string, name2 string, err error) { name1, err = utils.GenerateRandomName("dock", 4) if err != nil { @@ -128,6 +139,7 @@ func createVethPair() (name1 string, name2 string, err error) { return } +// writePidFile writes the namespaced processes pid to .nspid in the rootfs for the container func writePidFile(command *exec.Cmd) error { return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(command.Process.Pid)), 0655) } @@ -136,6 +148,9 @@ func deletePidFile() error { return os.Remove(".nspid") } +// createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces +// defined on the container's configuration and use the current binary as the init with the +// args provided func createCommand(container *libcontainer.Container, console string, args []string) *exec.Cmd { command := exec.Command("nsinit", append([]string{"init", console}, args...)...) command.SysProcAttr = &syscall.SysProcAttr{ diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 7f32620..d6224f9 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -28,6 +28,7 @@ func execinCommand(container *libcontainer.Container, nspid int, args []string) return -1, err } + // foreach namespace fd, use setns to join an existing container's namespaces for _, fd := range fds { if fd > 0 { if err := system.Setns(fd, 0); err != nil { @@ -42,7 +43,6 @@ func execinCommand(container *libcontainer.Container, nspid int, args []string) // remount proc and sys to pick up the changes if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { - pid, err := system.Fork() if err != nil { return -1, err @@ -58,12 +58,7 @@ func execinCommand(container *libcontainer.Container, nspid int, args []string) if err := remountSys(); err != nil { return -1, fmt.Errorf("remount sys %s", err) } - if err := capabilities.DropCapabilities(container); err != nil { - return -1, fmt.Errorf("drop capabilities %s", err) - } - if err := system.Exec(args[0], args[0:], container.Env); err != nil { - return -1, err - } + goto dropAndExec } proc, err := os.FindProcess(pid) if err != nil { @@ -75,6 +70,7 @@ func execinCommand(container *libcontainer.Container, nspid int, args []string) } os.Exit(state.Sys().(syscall.WaitStatus).ExitStatus()) } +dropAndExec: if err := capabilities.DropCapabilities(container); err != nil { return -1, fmt.Errorf("drop capabilities %s", err) } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 82706fd..c77fd90 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -37,9 +37,6 @@ func initCommand(container *libcontainer.Container, console string, args []strin if err != nil { return fmt.Errorf("open terminal %s", err) } - if slave.Fd() != 0 { - return fmt.Errorf("slave fd should be 0") - } if err := dupSlave(slave); err != nil { return fmt.Errorf("dup2 slave %s", err) } @@ -55,7 +52,7 @@ func initCommand(container *libcontainer.Container, console string, args []strin if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { return fmt.Errorf("setup mount namespace %s", err) } - if err := setupNetworking(container.Network, tempVethName); err != nil { + if err := setupVethNetwork(container.Network, tempVethName); err != nil { return fmt.Errorf("setup networking %s", err) } if err := system.Sethostname(container.Hostname); err != nil { @@ -78,6 +75,8 @@ func initCommand(container *libcontainer.Container, console string, args []strin panic("unreachable") } +// resolveRootfs ensures that the current working directory is +// not a symlink and returns the absolute path to the rootfs func resolveRootfs() (string, error) { cwd, err := os.Getwd() if err != nil { @@ -104,8 +103,9 @@ func setupUser(container *libcontainer.Container) error { return nil } +// dupSlave dup2 the pty slave's fd into stdout and stdin and ensures that +// the slave's fd is 0, or stdin func dupSlave(slave *os.File) error { - // we close Stdin,etc so our pty slave should have fd 0 if slave.Fd() != 0 { return fmt.Errorf("slave fd not 0 %d", slave.Fd()) } @@ -118,7 +118,8 @@ func dupSlave(slave *os.File) error { return nil } -// openTerminal is a clone of os.OpenFile without the O_CLOEXEC addition. +// openTerminal is a clone of os.OpenFile without the O_CLOEXEC +// used to open the pty slave inside the container namespace func openTerminal(name string, flag int) (*os.File, error) { r, e := syscall.Open(name, flag, 0) if e != nil { @@ -127,7 +128,10 @@ func openTerminal(name string, flag int) (*os.File, error) { return os.NewFile(uintptr(r), name), nil } -func setupNetworking(config *libcontainer.Network, tempVethName string) error { +// setupVethNetwork uses the Network config if it is not nil to initialize +// the new veth interface inside the container for use by changing the name to eth0 +// setting the MTU and IP address along with the default gateway +func setupVethNetwork(config *libcontainer.Network, tempVethName string) error { if config != nil { if err := network.InterfaceDown(tempVethName); err != nil { return fmt.Errorf("interface down %s %s", tempVethName, err) diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go index 30c8b06..f45fe55 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/main.go @@ -26,7 +26,7 @@ func main() { log.Fatal(ErrWrongArguments) } switch os.Args[1] { - case "exec": + case "exec": // this is executed outside of the namespace in the cwd var exitCode int nspid, err := readPid() if err != nil { @@ -43,7 +43,7 @@ func main() { log.Fatal(err) } os.Exit(exitCode) - case "init": + case "init": // this is executed inside of the namespace to setup the container if argc < 3 { log.Fatal(ErrWrongArguments) } diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index baa850f..6eb2e09 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -10,10 +10,16 @@ import ( "syscall" ) -// default mount point options +// default mount point flags const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV +// setupNewMountNamespace is used to initialize a new mount namespace for an new +// container in the rootfs that is specified. +// +// There is no need to unmount the new mounts because as soon as the mount namespace +// is no longer in use, the mounts will be removed automatically func setupNewMountNamespace(rootfs, console string, readonly bool) error { + // mount as slave so that the new mounts do not propagate to the host if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting / as slave %s", err) } @@ -55,6 +61,7 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { return nil } +// copyDevNodes mknods the hosts devices so the new container has access to them func copyDevNodes(rootfs string) error { oldMask := system.Umask(0000) defer system.Umask(oldMask) @@ -82,6 +89,8 @@ func copyDevNodes(rootfs string) error { return nil } +// setupDev symlinks the current processes pipes into the +// appropriate destination on the containers rootfs func setupDev(rootfs string) error { for _, link := range []struct { from string @@ -104,6 +113,7 @@ func setupDev(rootfs string) error { return nil } +// setupConsole ensures that the container has a proper /dev/console setup func setupConsole(rootfs, console string) error { oldMask := system.Umask(0000) defer system.Umask(oldMask) @@ -161,6 +171,24 @@ func mountSystem(rootfs string) error { return nil } +// setupPtmx adds a symlink to pts/ptmx for /dev/ptmx and +// finishes setting up /dev/console +func setupPtmx(rootfs, console string) error { + ptmx := filepath.Join(rootfs, "dev/ptmx") + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Symlink("pts/ptmx", ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + if err := setupConsole(rootfs, console); err != nil { + return err + } + return nil +} + +// remountProc is used to detach and remount the proc filesystem +// commonly needed with running a new process inside an existing container func remountProc() error { if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil { return err @@ -183,14 +211,3 @@ func remountSys() error { } return nil } - -func setupPtmx(rootfs, console string) error { - ptmx := filepath.Join(rootfs, "dev/ptmx") - if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { - return err - } - if err := os.Symlink("pts/ptmx", ptmx); err != nil { - return fmt.Errorf("symlink dev ptmx %s", err) - } - return setupConsole(rootfs, console) -} diff --git a/libcontainer/nsinit/ns_linux.go b/libcontainer/nsinit/ns_linux.go index a2809eb..481bdf7 100644 --- a/libcontainer/nsinit/ns_linux.go +++ b/libcontainer/nsinit/ns_linux.go @@ -14,6 +14,9 @@ var namespaceMap = map[libcontainer.Namespace]int{ libcontainer.CLONE_NEWNET: syscall.CLONE_NEWNET, } +// namespaceFileMap is used to convert the libcontainer types +// into the names of the files located in /proc//ns/* for +// each namespace var namespaceFileMap = map[libcontainer.Namespace]string{ libcontainer.CLONE_NEWNS: "mnt", libcontainer.CLONE_NEWUTS: "uts", diff --git a/libcontainer/types.go b/libcontainer/types.go index fcd00fd..bb54ff5 100644 --- a/libcontainer/types.go +++ b/libcontainer/types.go @@ -1,5 +1,8 @@ package libcontainer +// These constants are defined as string types so that +// it is clear when adding the configuration in config files +// instead of using ints or other types const ( CAP_SETPCAP Capability = "SETPCAP" CAP_SYS_MODULE Capability = "SYS_MODULE" @@ -25,9 +28,15 @@ const ( CLONE_NEWNET Namespace = "NEWNET" // network ) -type Namespace string -type Namespaces []Namespace +type ( + Namespace string + Namespaces []Namespace + Capability string + Capabilities []Capability +) +// Contains returns true if the specified Namespace is +// in the slice func (n Namespaces) Contains(ns Namespace) bool { for _, nns := range n { if nns == ns { @@ -37,9 +46,8 @@ func (n Namespaces) Contains(ns Namespace) bool { return false } -type Capability string -type Capabilities []Capability - +// Contains returns true if the specified Capability is +// in the slice func (c Capabilities) Contains(capp Capability) bool { for _, cc := range c { if cc == capp { diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go index d3223c3..5050997 100644 --- a/libcontainer/utils/utils.go +++ b/libcontainer/utils/utils.go @@ -6,6 +6,8 @@ import ( "io" ) +// GenerateRandomName returns a new name joined with a prefix. This size +// specified is used to truncate the randomly generated value func GenerateRandomName(prefix string, size int) (string, error) { id := make([]byte, 32) if _, err := io.ReadFull(rand.Reader, id); err != nil { From d68ba7742fdcb5d0dea815e7bd27c2035301660d Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 22:46:02 -0800 Subject: [PATCH 19/64] Remove privileged.json config Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/privileged.json | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 libcontainer/privileged.json diff --git a/libcontainer/privileged.json b/libcontainer/privileged.json deleted file mode 100644 index be877ad..0000000 --- a/libcontainer/privileged.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "id": "koye", - "namespace_pid": 3745, - "command": { - "args": [ - "/usr/lib/systemd/systemd" - ], - "environment": [ - "HOME=/", - "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", - "container=docker", - "TERM=" - ] - }, - "rootfs": "/root/main/mycontainer", - "namespaces": [ - "NEWIPC", - "NEWNS", - "NEWPID", - "NEWUTS" - ] -} From 8590435fa025241ca28be9780026d6ca571c6816 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 12:00:54 -0800 Subject: [PATCH 20/64] WIP for setup kmsg Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/mount.go | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 6eb2e09..67f9020 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -43,6 +43,9 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := setupPtmx(rootfs, console); err != nil { return err } + if err := setupKmsg(rootfs); err != nil { + return err + } if err := system.Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) } @@ -211,3 +214,32 @@ func remountSys() error { } return nil } + +func setupKmsg(rootfs string) error { + oldMask := system.Umask(0000) + defer system.Umask(oldMask) + + var ( + source = filepath.Join(rootfs, "dev/kmsg") + dest = filepath.Join(rootfs, "proc/kmsg") + ) + + if err := system.Mkfifo(source, 0600); err != nil { + return err + } + + os.Chmod(source, 0600) + os.Chown(source, 0, 0) + + if err := system.Mount(source, dest, "bind", syscall.MS_BIND, ""); err != nil { + return err + } + _, err := os.OpenFile(source, syscall.O_RDWR|syscall.O_NDELAY|syscall.O_CLOEXEC, 0) + if err != nil { + return err + } + if err := syscall.Unlink(source); err != nil { + return err + } + return nil +} From 3de41b34a2c49cea867223a4269e1c543bd66ec3 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Thu, 20 Feb 2014 23:12:08 +0100 Subject: [PATCH 21/64] libcontainer: Initial version of cgroups support This is a minimal version of raw cgroup support for libcontainer. It has only enough for what docker needs, and it has no support for systemd yet. Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- cgroups/cgroups.go | 16 ++- libcontainer/cgroup/cgroup.go | 177 ++++++++++++++++++++++++++++++++++ libcontainer/container.go | 7 ++ libcontainer/container.json | 5 +- libcontainer/nsinit/exec.go | 13 ++- libcontainer/nsinit/init.go | 10 +- 6 files changed, 218 insertions(+), 10 deletions(-) create mode 100644 libcontainer/cgroup/cgroup.go diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index 91ac384..b9318f9 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -40,6 +40,16 @@ func GetThisCgroupDir(subsystem string) (string, error) { return parseCgroupFile(subsystem, f) } +func GetInitCgroupDir(subsystem string) (string, error) { + f, err := os.Open("/proc/1/cgroup") + if err != nil { + return "", err + } + defer f.Close() + + return parseCgroupFile(subsystem, f) +} + func parseCgroupFile(subsystem string, r io.Reader) (string, error) { s := bufio.NewScanner(r) @@ -49,8 +59,10 @@ func parseCgroupFile(subsystem string, r io.Reader) (string, error) { } text := s.Text() parts := strings.Split(text, ":") - if parts[1] == subsystem { - return parts[2], nil + for _, subs := range strings.Split(parts[1], ",") { + if subs == subsystem { + return parts[2], nil + } } } return "", fmt.Errorf("cgroup '%s' not found in /proc/self/cgroup", subsystem) diff --git a/libcontainer/cgroup/cgroup.go b/libcontainer/cgroup/cgroup.go new file mode 100644 index 0000000..e30262c --- /dev/null +++ b/libcontainer/cgroup/cgroup.go @@ -0,0 +1,177 @@ +package cgroup + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/cgroups" + "github.com/dotcloud/docker/pkg/libcontainer" + "io/ioutil" + "os" + "path/filepath" + "strconv" +) + +// We have two implementation of cgroups support, one is based on +// systemd and the dbus api, and one is based on raw cgroup fs operations +// following the pre-single-writer model docs at: +// http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ +const ( + cgroupRoot = "/sys/fs/cgroup" +) + +func useSystemd() bool { + return false +} + +func applyCgroupSystemd(container *libcontainer.Container, pid int) error { + return fmt.Errorf("not supported yet") +} + +func writeFile(dir, file, data string) error { + return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) +} + +func getCgroup(subsystem string, container *libcontainer.Container) (string, error) { + cgroup := container.CgroupName + if container.CgroupParent != "" { + cgroup = filepath.Join(container.CgroupParent, cgroup) + } + + initPath, err := cgroups.GetInitCgroupDir(subsystem) + if err != nil { + return "", err + } + + path := filepath.Join(cgroupRoot, subsystem, initPath, cgroup) + + return path, nil +} + +func joinCgroup(subsystem string, container *libcontainer.Container, pid int) (string, error) { + path, err := getCgroup(subsystem, container) + if err != nil { + return "", err + } + + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return "", err + } + + if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil { + return "", err + } + + return path, nil +} + +func applyCgroupRaw(container *libcontainer.Container, pid int) (retErr error) { + if _, err := os.Stat(cgroupRoot); err != nil { + return fmt.Errorf("cgroups fs not found") + } + + if !container.DeviceAccess { + dir, err := joinCgroup("devices", container, pid) + if err != nil { + return err + } + defer func() { + if retErr != nil { + os.RemoveAll(dir) + } + }() + + if err := writeFile(dir, "devices.deny", "a"); err != nil { + return err + } + + allow := []string{ + // /dev/null, zero, full + "c 1:3 rwm", + "c 1:5 rwm", + "c 1:7 rwm", + + // consoles + "c 5:1 rwm", + "c 5:0 rwm", + "c 4:0 rwm", + "c 4:1 rwm", + + // /dev/urandom,/dev/random + "c 1:9 rwm", + "c 1:8 rwm", + + // /dev/pts/ - pts namespaces are "coming soon" + "c 136:* rwm", + "c 5:2 rwm", + + // tuntap + "c 10:200 rwm", + } + + for _, val := range allow { + if err := writeFile(dir, "devices.allow", val); err != nil { + return err + } + } + } + + if container.Memory != 0 || container.MemorySwap != 0 { + dir, err := joinCgroup("memory", container, pid) + if err != nil { + return err + } + defer func() { + if retErr != nil { + os.RemoveAll(dir) + } + }() + + if container.Memory != 0 { + if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil { + return err + } + if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil { + return err + } + } + if container.MemorySwap != 0 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(container.MemorySwap, 10)); err != nil { + return err + } + } + } + + // We always want to join the cpu group, to allow fair cpu scheduling + // on a container basis + dir, err := joinCgroup("cpu", container, pid) + if err != nil { + return err + } + if container.CpuShares != 0 { + if err := writeFile(dir, "cpu.shares", strconv.FormatInt(container.CpuShares, 10)); err != nil { + return err + } + } + return nil +} + +func CleanupCgroup(container *libcontainer.Container) error { + path, _ := getCgroup("memory", container) + os.RemoveAll(path) + path, _ = getCgroup("devices", container) + os.RemoveAll(path) + path, _ = getCgroup("cpu", container) + os.RemoveAll(path) + return nil +} + +func ApplyCgroup(container *libcontainer.Container, pid int) error { + if container.CgroupName == "" { + return nil + } + + if useSystemd() { + return applyCgroupSystemd(container, pid) + } else { + return applyCgroupRaw(container, pid) + } +} diff --git a/libcontainer/container.go b/libcontainer/container.go index a6a57da..b34ac8b 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -11,6 +11,13 @@ type Container struct { Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop Network *Network `json:"network,omitempty"` // nil for host's network stack + + CgroupName string `json:"cgroup_name,omitempty"` // name of cgroup + CgroupParent string `json:"cgroup_parent,omitempty"` // name of parent cgroup or slice + DeviceAccess bool `json:"device_access,omitempty"` // name of parent cgroup or slice + Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) + MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap + CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) } // Network defines configuration for a container's networking stack diff --git a/libcontainer/container.json b/libcontainer/container.json index ccc9abb..3e23600 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -34,5 +34,8 @@ "gateway": "172.17.42.1", "bridge": "docker0", "mtu": 1500 - } + }, + "cgroup_name": "docker-koye", + "cgroup_parent": "docker", + "memory": 524800 } diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 202cfca..acff647 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -5,6 +5,7 @@ package main import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/cgroup" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/libcontainer/utils" "github.com/dotcloud/docker/pkg/system" @@ -33,10 +34,18 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) return -1, err } if err := writePidFile(command); err != nil { + command.Process.Kill() return -1, err } defer deletePidFile() + // Do this before syncing with child so that no children + // can escape the cgroup + if err := cgroup.ApplyCgroup(container, command.Process.Pid); err != nil { + command.Process.Kill() + return -1, err + } + if container.Network != nil { vethPair, err := initializeContainerVeth(container.Network.Bridge, command.Process.Pid) if err != nil { @@ -45,6 +54,9 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) sendVethName(vethPair, inPipe) } + // Sync with child + inPipe.Close() + go io.Copy(os.Stdout, master) go io.Copy(master, os.Stdin) @@ -67,7 +79,6 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) // pipe so that the child stops waiting for more data func sendVethName(name string, pipe io.WriteCloser) { fmt.Fprint(pipe, name) - pipe.Close() } // initializeContainerVeth will create a veth pair and setup the host's diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index c77fd90..f619276 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -20,12 +20,10 @@ func initCommand(container *libcontainer.Container, console string, args []strin return err } - var tempVethName string - if container.Network != nil { - tempVethName, err = getVethName() - if err != nil { - return err - } + // We always read this as it is a way to sync with the parent as well + tempVethName, err := getVethName() + if err != nil { + return err } // close pipes so that we can replace it with the pty From 848fd7638b91f46b908e4afd90e96fc2c0155826 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 14:40:00 -0800 Subject: [PATCH 22/64] Revert "WIP for setup kmsg" This reverts commit 80db9a918337c4ae80ffa9a001da13bd24e848c8. Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/mount.go | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 67f9020..6eb2e09 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -43,9 +43,6 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := setupPtmx(rootfs, console); err != nil { return err } - if err := setupKmsg(rootfs); err != nil { - return err - } if err := system.Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) } @@ -214,32 +211,3 @@ func remountSys() error { } return nil } - -func setupKmsg(rootfs string) error { - oldMask := system.Umask(0000) - defer system.Umask(oldMask) - - var ( - source = filepath.Join(rootfs, "dev/kmsg") - dest = filepath.Join(rootfs, "proc/kmsg") - ) - - if err := system.Mkfifo(source, 0600); err != nil { - return err - } - - os.Chmod(source, 0600) - os.Chown(source, 0, 0) - - if err := system.Mount(source, dest, "bind", syscall.MS_BIND, ""); err != nil { - return err - } - _, err := os.OpenFile(source, syscall.O_RDWR|syscall.O_NDELAY|syscall.O_CLOEXEC, 0) - if err != nil { - return err - } - if err := syscall.Unlink(source); err != nil { - return err - } - return nil -} From b90aaf6828eab4f2bbdba82c3e0e4cd0d356b6ad Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 14:40:36 -0800 Subject: [PATCH 23/64] Remove clone_vfork Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index acff647..f73ad32 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -165,7 +165,7 @@ func deletePidFile() error { func createCommand(container *libcontainer.Container, console string, args []string) *exec.Cmd { command := exec.Command("nsinit", append([]string{"init", console}, args...)...) command.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: uintptr(getNamespaceFlags(container.Namespaces) | syscall.CLONE_VFORK), // we need CLONE_VFORK so we can wait on the child + Cloneflags: uintptr(getNamespaceFlags(container.Namespaces)), } return command } From 1cecb000035a2d16a99a80a20871ea4d207d29b1 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 15:48:48 -0800 Subject: [PATCH 24/64] Refactory cgroups into general pkg Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- cgroups/cgroups.go | 61 +++++++++++++++- libcontainer/cgroup/cgroup.go | 131 ++++++++++++---------------------- libcontainer/container.go | 28 ++++---- libcontainer/container.json | 8 ++- 4 files changed, 124 insertions(+), 104 deletions(-) diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index b9318f9..1e96caa 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -5,10 +5,23 @@ import ( "fmt" "github.com/dotcloud/docker/pkg/mount" "io" + "io/ioutil" "os" + "path/filepath" + "strconv" "strings" ) +type Cgroup struct { + Name string `json:"name,omitempty"` + Parent string `json:"parent,omitempty"` + + DeviceAccess bool `json:"device_access,omitempty"` // name of parent cgroup or slice + Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) + MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap + CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) +} + // https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt func FindCgroupMountpoint(subsystem string) (string, error) { mounts, err := mount.GetMounts() @@ -25,7 +38,6 @@ func FindCgroupMountpoint(subsystem string) (string, error) { } } } - return "", fmt.Errorf("cgroup mountpoint not found for %s", subsystem) } @@ -50,9 +62,50 @@ func GetInitCgroupDir(subsystem string) (string, error) { return parseCgroupFile(subsystem, f) } +func (c *Cgroup) Path(root, subsystem string) (string, error) { + cgroup := c.Name + if c.Parent != "" { + cgroup = filepath.Join(c.Parent, cgroup) + } + initPath, err := GetInitCgroupDir(subsystem) + if err != nil { + return "", err + } + return filepath.Join(root, subsystem, initPath, cgroup), nil +} + +func (c *Cgroup) Join(root, subsystem string, pid int) (string, error) { + path, err := c.Path(root, subsystem) + if err != nil { + return "", err + } + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return "", err + } + if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil { + return "", err + } + return path, nil +} + +func (c *Cgroup) Cleanup(root string) error { + get := func(subsystem string) string { + path, _ := c.Path(root, subsystem) + return path + } + + for _, path := range []string{ + get("memory"), + get("devices"), + get("cpu"), + } { + os.RemoveAll(path) + } + return nil +} + func parseCgroupFile(subsystem string, r io.Reader) (string, error) { s := bufio.NewScanner(r) - for s.Scan() { if err := s.Err(); err != nil { return "", err @@ -67,3 +120,7 @@ func parseCgroupFile(subsystem string, r io.Reader) (string, error) { } return "", fmt.Errorf("cgroup '%s' not found in /proc/self/cgroup", subsystem) } + +func writeFile(dir, file, data string) error { + return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) +} diff --git a/libcontainer/cgroup/cgroup.go b/libcontainer/cgroup/cgroup.go index e30262c..5f27ac3 100644 --- a/libcontainer/cgroup/cgroup.go +++ b/libcontainer/cgroup/cgroup.go @@ -10,71 +10,46 @@ import ( "strconv" ) -// We have two implementation of cgroups support, one is based on -// systemd and the dbus api, and one is based on raw cgroup fs operations -// following the pre-single-writer model docs at: -// http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ -const ( - cgroupRoot = "/sys/fs/cgroup" -) - -func useSystemd() bool { - return false -} - -func applyCgroupSystemd(container *libcontainer.Container, pid int) error { - return fmt.Errorf("not supported yet") -} - -func writeFile(dir, file, data string) error { - return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) -} - -func getCgroup(subsystem string, container *libcontainer.Container) (string, error) { - cgroup := container.CgroupName - if container.CgroupParent != "" { - cgroup = filepath.Join(container.CgroupParent, cgroup) +func ApplyCgroup(container *libcontainer.Container, pid int) (err error) { + if container.Cgroups == nil { + return nil } - initPath, err := cgroups.GetInitCgroupDir(subsystem) + // We have two implementation of cgroups support, one is based on + // systemd and the dbus api, and one is based on raw cgroup fs operations + // following the pre-single-writer model docs at: + // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ + // + // we can pick any subsystem to find the root + cgroupRoot, err := cgroups.FindCgroupMountpoint("memory") if err != nil { - return "", err + return err } - - path := filepath.Join(cgroupRoot, subsystem, initPath, cgroup) - - return path, nil -} - -func joinCgroup(subsystem string, container *libcontainer.Container, pid int) (string, error) { - path, err := getCgroup(subsystem, container) - if err != nil { - return "", err - } - - if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { - return "", err - } - - if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil { - return "", err - } - - return path, nil -} - -func applyCgroupRaw(container *libcontainer.Container, pid int) (retErr error) { + cgroupRoot = filepath.Dir(cgroupRoot) if _, err := os.Stat(cgroupRoot); err != nil { return fmt.Errorf("cgroups fs not found") } + if err := setupDevices(container, cgroupRoot, pid); err != nil { + return err + } + if err := setupMemory(container, cgroupRoot, pid); err != nil { + return err + } + if err := setupCpu(container, cgroupRoot, pid); err != nil { + return err + } + return nil +} - if !container.DeviceAccess { - dir, err := joinCgroup("devices", container, pid) +func setupDevices(container *libcontainer.Container, cgroupRoot string, pid int) (err error) { + if !container.Cgroups.DeviceAccess { + dir, err := container.Cgroups.Join(cgroupRoot, "devices", pid) if err != nil { return err } + defer func() { - if retErr != nil { + if err != nil { os.RemoveAll(dir) } }() @@ -113,65 +88,53 @@ func applyCgroupRaw(container *libcontainer.Container, pid int) (retErr error) { } } } + return nil +} - if container.Memory != 0 || container.MemorySwap != 0 { - dir, err := joinCgroup("memory", container, pid) +func setupMemory(container *libcontainer.Container, cgroupRoot string, pid int) (err error) { + if container.Cgroups.Memory != 0 || container.Cgroups.MemorySwap != 0 { + dir, err := container.Cgroups.Join(cgroupRoot, "memory", pid) if err != nil { return err } defer func() { - if retErr != nil { + if err != nil { os.RemoveAll(dir) } }() - if container.Memory != 0 { - if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil { + if container.Cgroups.Memory != 0 { + if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(container.Cgroups.Memory, 10)); err != nil { return err } - if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil { + if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(container.Cgroups.Memory, 10)); err != nil { return err } } - if container.MemorySwap != 0 { - if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(container.MemorySwap, 10)); err != nil { + if container.Cgroups.MemorySwap != 0 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(container.Cgroups.MemorySwap, 10)); err != nil { return err } } } + return nil +} +func setupCpu(container *libcontainer.Container, cgroupRoot string, pid int) (err error) { // We always want to join the cpu group, to allow fair cpu scheduling // on a container basis - dir, err := joinCgroup("cpu", container, pid) + dir, err := container.Cgroups.Join(cgroupRoot, "cpu", pid) if err != nil { return err } - if container.CpuShares != 0 { - if err := writeFile(dir, "cpu.shares", strconv.FormatInt(container.CpuShares, 10)); err != nil { + if container.Cgroups.CpuShares != 0 { + if err := writeFile(dir, "cpu.shares", strconv.FormatInt(container.Cgroups.CpuShares, 10)); err != nil { return err } } return nil } -func CleanupCgroup(container *libcontainer.Container) error { - path, _ := getCgroup("memory", container) - os.RemoveAll(path) - path, _ = getCgroup("devices", container) - os.RemoveAll(path) - path, _ = getCgroup("cpu", container) - os.RemoveAll(path) - return nil -} - -func ApplyCgroup(container *libcontainer.Container, pid int) error { - if container.CgroupName == "" { - return nil - } - - if useSystemd() { - return applyCgroupSystemd(container, pid) - } else { - return applyCgroupRaw(container, pid) - } +func writeFile(dir, file, data string) error { + return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) } diff --git a/libcontainer/container.go b/libcontainer/container.go index b34ac8b..4c0e39a 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -1,23 +1,21 @@ package libcontainer +import ( + "github.com/dotcloud/docker/pkg/cgroups" +) + // Container defines configuration options for how a // container is setup inside a directory and how a process should be executed type Container struct { - Hostname string `json:"hostname,omitempty"` // hostname - ReadonlyFs bool `json:"readonly_fs,omitempty"` // set the containers rootfs as readonly - User string `json:"user,omitempty"` // user to execute the process as - WorkingDir string `json:"working_dir,omitempty"` // current working directory - Env []string `json:"environment,omitempty"` // environment to set - Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply - Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop - Network *Network `json:"network,omitempty"` // nil for host's network stack - - CgroupName string `json:"cgroup_name,omitempty"` // name of cgroup - CgroupParent string `json:"cgroup_parent,omitempty"` // name of parent cgroup or slice - DeviceAccess bool `json:"device_access,omitempty"` // name of parent cgroup or slice - Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) - MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap - CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) + Hostname string `json:"hostname,omitempty"` // hostname + ReadonlyFs bool `json:"readonly_fs,omitempty"` // set the containers rootfs as readonly + User string `json:"user,omitempty"` // user to execute the process as + WorkingDir string `json:"working_dir,omitempty"` // current working directory + Env []string `json:"environment,omitempty"` // environment to set + Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply + Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop + Network *Network `json:"network,omitempty"` // nil for host's network stack + Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"` } // Network defines configuration for a container's networking stack diff --git a/libcontainer/container.json b/libcontainer/container.json index 3e23600..2207543 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -35,7 +35,9 @@ "bridge": "docker0", "mtu": 1500 }, - "cgroup_name": "docker-koye", - "cgroup_parent": "docker", - "memory": 524800 + "cgroups": { + "name": "docker-koye", + "parent": "docker", + "memory": 524800 + } } From c4769ec624becd45e3dcc56c559f15b0db3a7a4c Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 15:50:55 -0800 Subject: [PATCH 25/64] Change IP to address because it includes the subnet Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 7 ++++++- libcontainer/container.go | 2 +- libcontainer/container.json | 2 +- libcontainer/nsinit/init.go | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/libcontainer/README.md b/libcontainer/README.md index 07fe4f7..163161c 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -47,10 +47,15 @@ Sample `container.json` file: "MAC_ADMIN" ], "network": { - "ip": "172.17.0.100/16", + "address": "172.17.0.100/16", "gateway": "172.17.42.1", "bridge": "docker0", "mtu": 1500 + }, + "cgroups": { + "name": "docker-koye", + "parent": "docker", + "memory": 524800 } } ``` diff --git a/libcontainer/container.go b/libcontainer/container.go index 4c0e39a..e6e4b47 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -23,7 +23,7 @@ type Container struct { // The network configuration can be omited from a container causing the // container to be setup with the host's networking stack type Network struct { - IP string `json:"ip,omitempty"` + Address string `json:"address,omitempty"` Gateway string `json:"gateway,omitempty"` Bridge string `json:"bridge,omitempty"` Mtu int `json:"mtu,omitempty"` diff --git a/libcontainer/container.json b/libcontainer/container.json index 2207543..c1a07dc 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -30,7 +30,7 @@ "MAC_ADMIN" ], "network": { - "ip": "172.17.0.100/16", + "address": "172.17.0.100/16", "gateway": "172.17.42.1", "bridge": "docker0", "mtu": 1500 diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index f619276..f89e539 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -137,7 +137,7 @@ func setupVethNetwork(config *libcontainer.Network, tempVethName string) error { if err := network.ChangeInterfaceName(tempVethName, "eth0"); err != nil { return fmt.Errorf("change %s to eth0 %s", tempVethName, err) } - if err := network.SetInterfaceIp("eth0", config.IP); err != nil { + if err := network.SetInterfaceIp("eth0", config.Address); err != nil { return fmt.Errorf("set eth0 ip %s", err) } if err := network.SetMtu("eth0", config.Mtu); err != nil { From ccc915b7b9ac0fcc4b3bc70e3c348357f6321249 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 16:11:22 -0800 Subject: [PATCH 26/64] Move rest of cgroups functions into cgroups pkg Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- cgroups/cgroups.go | 122 +++++++++++++++++++++++++++++ libcontainer/cgroup/cgroup.go | 140 ---------------------------------- libcontainer/nsinit/exec.go | 9 ++- 3 files changed, 127 insertions(+), 144 deletions(-) delete mode 100644 libcontainer/cgroup/cgroup.go diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index 1e96caa..96002f0 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -124,3 +124,125 @@ func parseCgroupFile(subsystem string, r io.Reader) (string, error) { func writeFile(dir, file, data string) error { return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) } + +func (c *Cgroup) Apply(pid int) error { + // We have two implementation of cgroups support, one is based on + // systemd and the dbus api, and one is based on raw cgroup fs operations + // following the pre-single-writer model docs at: + // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ + // + // we can pick any subsystem to find the root + cgroupRoot, err := FindCgroupMountpoint("memory") + if err != nil { + return err + } + cgroupRoot = filepath.Dir(cgroupRoot) + + if _, err := os.Stat(cgroupRoot); err != nil { + return fmt.Errorf("cgroups fs not found") + } + if err := c.setupDevices(cgroupRoot, pid); err != nil { + return err + } + if err := c.setupMemory(cgroupRoot, pid); err != nil { + return err + } + if err := c.setupCpu(cgroupRoot, pid); err != nil { + return err + } + return nil +} + +func (c *Cgroup) setupDevices(cgroupRoot string, pid int) (err error) { + if !c.DeviceAccess { + dir, err := c.Join(cgroupRoot, "devices", pid) + if err != nil { + return err + } + + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if err := writeFile(dir, "devices.deny", "a"); err != nil { + return err + } + + allow := []string{ + // /dev/null, zero, full + "c 1:3 rwm", + "c 1:5 rwm", + "c 1:7 rwm", + + // consoles + "c 5:1 rwm", + "c 5:0 rwm", + "c 4:0 rwm", + "c 4:1 rwm", + + // /dev/urandom,/dev/random + "c 1:9 rwm", + "c 1:8 rwm", + + // /dev/pts/ - pts namespaces are "coming soon" + "c 136:* rwm", + "c 5:2 rwm", + + // tuntap + "c 10:200 rwm", + } + + for _, val := range allow { + if err := writeFile(dir, "devices.allow", val); err != nil { + return err + } + } + } + return nil +} + +func (c *Cgroup) setupMemory(cgroupRoot string, pid int) (err error) { + if c.Memory != 0 || c.MemorySwap != 0 { + dir, err := c.Join(cgroupRoot, "memory", pid) + if err != nil { + return err + } + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if c.Memory != 0 { + if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { + return err + } + if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { + return err + } + } + if c.MemorySwap != 0 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.MemorySwap, 10)); err != nil { + return err + } + } + } + return nil +} + +func (c *Cgroup) setupCpu(cgroupRoot string, pid int) (err error) { + // We always want to join the cpu group, to allow fair cpu scheduling + // on a container basis + dir, err := c.Join(cgroupRoot, "cpu", pid) + if err != nil { + return err + } + if c.CpuShares != 0 { + if err := writeFile(dir, "cpu.shares", strconv.FormatInt(c.CpuShares, 10)); err != nil { + return err + } + } + return nil +} diff --git a/libcontainer/cgroup/cgroup.go b/libcontainer/cgroup/cgroup.go deleted file mode 100644 index 5f27ac3..0000000 --- a/libcontainer/cgroup/cgroup.go +++ /dev/null @@ -1,140 +0,0 @@ -package cgroup - -import ( - "fmt" - "github.com/dotcloud/docker/pkg/cgroups" - "github.com/dotcloud/docker/pkg/libcontainer" - "io/ioutil" - "os" - "path/filepath" - "strconv" -) - -func ApplyCgroup(container *libcontainer.Container, pid int) (err error) { - if container.Cgroups == nil { - return nil - } - - // We have two implementation of cgroups support, one is based on - // systemd and the dbus api, and one is based on raw cgroup fs operations - // following the pre-single-writer model docs at: - // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ - // - // we can pick any subsystem to find the root - cgroupRoot, err := cgroups.FindCgroupMountpoint("memory") - if err != nil { - return err - } - cgroupRoot = filepath.Dir(cgroupRoot) - if _, err := os.Stat(cgroupRoot); err != nil { - return fmt.Errorf("cgroups fs not found") - } - if err := setupDevices(container, cgroupRoot, pid); err != nil { - return err - } - if err := setupMemory(container, cgroupRoot, pid); err != nil { - return err - } - if err := setupCpu(container, cgroupRoot, pid); err != nil { - return err - } - return nil -} - -func setupDevices(container *libcontainer.Container, cgroupRoot string, pid int) (err error) { - if !container.Cgroups.DeviceAccess { - dir, err := container.Cgroups.Join(cgroupRoot, "devices", pid) - if err != nil { - return err - } - - defer func() { - if err != nil { - os.RemoveAll(dir) - } - }() - - if err := writeFile(dir, "devices.deny", "a"); err != nil { - return err - } - - allow := []string{ - // /dev/null, zero, full - "c 1:3 rwm", - "c 1:5 rwm", - "c 1:7 rwm", - - // consoles - "c 5:1 rwm", - "c 5:0 rwm", - "c 4:0 rwm", - "c 4:1 rwm", - - // /dev/urandom,/dev/random - "c 1:9 rwm", - "c 1:8 rwm", - - // /dev/pts/ - pts namespaces are "coming soon" - "c 136:* rwm", - "c 5:2 rwm", - - // tuntap - "c 10:200 rwm", - } - - for _, val := range allow { - if err := writeFile(dir, "devices.allow", val); err != nil { - return err - } - } - } - return nil -} - -func setupMemory(container *libcontainer.Container, cgroupRoot string, pid int) (err error) { - if container.Cgroups.Memory != 0 || container.Cgroups.MemorySwap != 0 { - dir, err := container.Cgroups.Join(cgroupRoot, "memory", pid) - if err != nil { - return err - } - defer func() { - if err != nil { - os.RemoveAll(dir) - } - }() - - if container.Cgroups.Memory != 0 { - if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(container.Cgroups.Memory, 10)); err != nil { - return err - } - if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(container.Cgroups.Memory, 10)); err != nil { - return err - } - } - if container.Cgroups.MemorySwap != 0 { - if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(container.Cgroups.MemorySwap, 10)); err != nil { - return err - } - } - } - return nil -} - -func setupCpu(container *libcontainer.Container, cgroupRoot string, pid int) (err error) { - // We always want to join the cpu group, to allow fair cpu scheduling - // on a container basis - dir, err := container.Cgroups.Join(cgroupRoot, "cpu", pid) - if err != nil { - return err - } - if container.Cgroups.CpuShares != 0 { - if err := writeFile(dir, "cpu.shares", strconv.FormatInt(container.Cgroups.CpuShares, 10)); err != nil { - return err - } - } - return nil -} - -func writeFile(dir, file, data string) error { - return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) -} diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index f73ad32..f04e9be 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -5,7 +5,6 @@ package main import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/libcontainer/cgroup" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/libcontainer/utils" "github.com/dotcloud/docker/pkg/system" @@ -41,9 +40,11 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) // Do this before syncing with child so that no children // can escape the cgroup - if err := cgroup.ApplyCgroup(container, command.Process.Pid); err != nil { - command.Process.Kill() - return -1, err + if container.Cgroups != nil { + if err := container.Cgroups.Apply(command.Process.Pid); err != nil { + command.Process.Kill() + return -1, err + } } if container.Network != nil { From 3677967f4e48fb39c6e915891188a85a374c6710 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Thu, 20 Feb 2014 17:53:50 -0800 Subject: [PATCH 27/64] Use flag for init Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/nsinit/exec.go | 2 +- libcontainer/nsinit/main.go | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index f04e9be..6d87f3b 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -164,7 +164,7 @@ func deletePidFile() error { // defined on the container's configuration and use the current binary as the init with the // args provided func createCommand(container *libcontainer.Container, console string, args []string) *exec.Cmd { - command := exec.Command("nsinit", append([]string{"init", console}, args...)...) + command := exec.Command("nsinit", append([]string{"-console", console, "init"}, args...)...) command.SysProcAttr = &syscall.SysProcAttr{ Cloneflags: uintptr(getNamespaceFlags(container.Namespaces)), } diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go index f45fe55..e7240df 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/main.go @@ -3,6 +3,7 @@ package main import ( "encoding/json" "errors" + "flag" "github.com/dotcloud/docker/pkg/libcontainer" "io/ioutil" "log" @@ -16,16 +17,18 @@ var ( ) func main() { + console := flag.String("console", "", "Console (pty slave) name") + flag.Parse() + container, err := loadContainer() if err != nil { log.Fatal(err) } - argc := len(os.Args) - if argc < 2 { + if flag.NArg() < 1 { log.Fatal(ErrWrongArguments) } - switch os.Args[1] { + switch flag.Arg(0) { case "exec": // this is executed outside of the namespace in the cwd var exitCode int nspid, err := readPid() @@ -35,23 +38,23 @@ func main() { } } if nspid > 0 { - exitCode, err = execinCommand(container, nspid, os.Args[2:]) + exitCode, err = execinCommand(container, nspid, flag.Args()[1:]) } else { - exitCode, err = execCommand(container, os.Args[2:]) + exitCode, err = execCommand(container, flag.Args()[1:]) } if err != nil { log.Fatal(err) } os.Exit(exitCode) case "init": // this is executed inside of the namespace to setup the container - if argc < 3 { + if flag.NArg() < 2 { log.Fatal(ErrWrongArguments) } - if err := initCommand(container, os.Args[2], os.Args[3:]); err != nil { + if err := initCommand(container, *console, flag.Args()[1:]); err != nil { log.Fatal(err) } default: - log.Fatalf("command not supported for nsinit %s", os.Args[1]) + log.Fatalf("command not supported for nsinit %s", flag.Arg(0)) } } From b2e01cbe8cd0ce3b541fd0cd6250e21995cd4236 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Thu, 20 Feb 2014 17:58:13 -0800 Subject: [PATCH 28/64] Use a custom pipe instead of stdin for sync net namespace Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/nsinit/exec.go | 16 ++++++++++------ libcontainer/nsinit/init.go | 11 +++++++---- libcontainer/nsinit/main.go | 3 ++- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 6d87f3b..8007ed4 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -22,16 +22,20 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) return -1, err } - command := createCommand(container, console, args) // create a pipe so that we can syncronize with the namespaced process and // pass the veth name to the child - inPipe, err := command.StdinPipe() + r, w, err := os.Pipe() if err != nil { return -1, err } + system.UsetCloseOnExec(r.Fd()) + + command := createCommand(container, console, r.Fd(), args) + if err := command.Start(); err != nil { return -1, err } + if err := writePidFile(command); err != nil { command.Process.Kill() return -1, err @@ -52,11 +56,11 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) if err != nil { return -1, err } - sendVethName(vethPair, inPipe) + sendVethName(vethPair, w) } // Sync with child - inPipe.Close() + w.Close() go io.Copy(os.Stdout, master) go io.Copy(master, os.Stdin) @@ -163,8 +167,8 @@ func deletePidFile() error { // createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces // defined on the container's configuration and use the current binary as the init with the // args provided -func createCommand(container *libcontainer.Container, console string, args []string) *exec.Cmd { - command := exec.Command("nsinit", append([]string{"-console", console, "init"}, args...)...) +func createCommand(container *libcontainer.Container, console string, pipe uintptr, args []string) *exec.Cmd { + command := exec.Command("nsinit", append([]string{"-console", console, "-pipe", fmt.Sprint(pipe), "init"}, args...)...) command.SysProcAttr = &syscall.SysProcAttr{ Cloneflags: uintptr(getNamespaceFlags(container.Namespaces)), } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index f89e539..a0815ee 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -8,20 +8,21 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" + "io" "io/ioutil" "os" "path/filepath" "syscall" ) -func initCommand(container *libcontainer.Container, console string, args []string) error { +func initCommand(container *libcontainer.Container, console string, pipe io.ReadCloser, args []string) error { rootfs, err := resolveRootfs() if err != nil { return err } // We always read this as it is a way to sync with the parent as well - tempVethName, err := getVethName() + tempVethName, err := getVethName(pipe) if err != nil { return err } @@ -164,8 +165,10 @@ func setupVethNetwork(config *libcontainer.Network, tempVethName string) error { // getVethName reads from Stdin the temp veth name // sent by the parent processes after the veth pair // has been created and setup -func getVethName() (string, error) { - data, err := ioutil.ReadAll(os.Stdin) +func getVethName(pipe io.ReadCloser) (string, error) { + defer pipe.Close() + + data, err := ioutil.ReadAll(pipe) if err != nil { return "", fmt.Errorf("error reading from stdin %s", err) } diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go index e7240df..6f2825b 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/main.go @@ -18,6 +18,7 @@ var ( func main() { console := flag.String("console", "", "Console (pty slave) name") + pipeFd := flag.Int("pipe", 0, "sync pipe fd") flag.Parse() container, err := loadContainer() @@ -50,7 +51,7 @@ func main() { if flag.NArg() < 2 { log.Fatal(ErrWrongArguments) } - if err := initCommand(container, *console, flag.Args()[1:]); err != nil { + if err := initCommand(container, *console, os.NewFile(uintptr(*pipeFd), "pipe"), flag.Args()[1:]); err != nil { log.Fatal(err) } default: From 41696722fa70acd53db889d13a0a05b2413d36a3 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Thu, 20 Feb 2014 17:59:08 -0800 Subject: [PATCH 29/64] Minor cleanup Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/nsinit/exec.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 8007ed4..7f552c2 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -56,11 +56,12 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) if err != nil { return -1, err } - sendVethName(vethPair, w) + sendVethName(w, vethPair) } // Sync with child w.Close() + r.Close() go io.Copy(os.Stdout, master) go io.Copy(master, os.Stdin) @@ -82,7 +83,7 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) // sendVethName writes the veth pair name to the child's stdin then closes the // pipe so that the child stops waiting for more data -func sendVethName(name string, pipe io.WriteCloser) { +func sendVethName(pipe io.Writer, name string) { fmt.Fprint(pipe, name) } From 97738ffed35855bfe5ad91767a8e6a7c222937a3 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Thu, 20 Feb 2014 18:05:40 -0800 Subject: [PATCH 30/64] Handle non-tty mode Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/nsinit/exec.go | 55 ++++++++++++++++++++++++++++-------- libcontainer/nsinit/init.go | 30 +++++++++++--------- libcontainer/nsinit/main.go | 9 ++++-- libcontainer/nsinit/mount.go | 6 ++-- 4 files changed, 70 insertions(+), 30 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 7f552c2..b290ace 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -16,10 +16,21 @@ import ( "syscall" ) -func execCommand(container *libcontainer.Container, args []string) (int, error) { - master, console, err := createMasterAndConsole() - if err != nil { - return -1, err +func execCommand(container *libcontainer.Container, tty bool, args []string) (int, error) { + var ( + master *os.File + console string + err error + + inPipe io.WriteCloser + outPipe, errPipe io.ReadCloser + ) + + if tty { + master, console, err = createMasterAndConsole() + if err != nil { + return -1, err + } } // create a pipe so that we can syncronize with the namespaced process and @@ -32,6 +43,21 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) command := createCommand(container, console, r.Fd(), args) + if !tty { + inPipe, err = command.StdinPipe() + if err != nil { + return -1, err + } + outPipe, err = command.StdoutPipe() + if err != nil { + return -1, err + } + errPipe, err = command.StderrPipe() + if err != nil { + return -1, err + } + } + if err := command.Start(); err != nil { return -1, err } @@ -63,15 +89,20 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) w.Close() r.Close() - go io.Copy(os.Stdout, master) - go io.Copy(master, os.Stdin) - - state, err := setupWindow(master) - if err != nil { - command.Process.Kill() - return -1, err + if tty { + go io.Copy(os.Stdout, master) + go io.Copy(master, os.Stdin) + state, err := setupWindow(master) + if err != nil { + command.Process.Kill() + return -1, err + } + defer term.RestoreTerminal(os.Stdin.Fd(), state) + } else { + go io.Copy(inPipe, os.Stdin) + go io.Copy(os.Stdout, outPipe) + go io.Copy(os.Stderr, errPipe) } - defer term.RestoreTerminal(os.Stdin.Fd(), state) if err := command.Wait(); err != nil { if _, ok := err.(*exec.ExitError); !ok { diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index a0815ee..ef7fc4e 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -27,23 +27,27 @@ func initCommand(container *libcontainer.Container, console string, pipe io.Read return err } - // close pipes so that we can replace it with the pty - os.Stdin.Close() - os.Stdout.Close() - os.Stderr.Close() + if console != "" { + // close pipes so that we can replace it with the pty + os.Stdin.Close() + os.Stdout.Close() + os.Stderr.Close() + slave, err := openTerminal(console, syscall.O_RDWR) + if err != nil { + return fmt.Errorf("open terminal %s", err) + } + if err := dupSlave(slave); err != nil { + return fmt.Errorf("dup2 slave %s", err) + } + } - slave, err := openTerminal(console, syscall.O_RDWR) - if err != nil { - return fmt.Errorf("open terminal %s", err) - } - if err := dupSlave(slave); err != nil { - return fmt.Errorf("dup2 slave %s", err) - } if _, err := system.Setsid(); err != nil { return fmt.Errorf("setsid %s", err) } - if err := system.Setctty(); err != nil { - return fmt.Errorf("setctty %s", err) + if console != "" { + if err := system.Setctty(); err != nil { + return fmt.Errorf("setctty %s", err) + } } if err := system.ParentDeathSignal(); err != nil { return fmt.Errorf("parent deth signal %s", err) diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go index 6f2825b..f66ff0d 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/main.go @@ -17,8 +17,11 @@ var ( ) func main() { - console := flag.String("console", "", "Console (pty slave) name") - pipeFd := flag.Int("pipe", 0, "sync pipe fd") + var ( + console = flag.String("console", "", "Console (pty slave) name") + tty = flag.Bool("tty", false, "Create a tty") + pipeFd = flag.Int("pipe", 0, "sync pipe fd") + ) flag.Parse() container, err := loadContainer() @@ -41,7 +44,7 @@ func main() { if nspid > 0 { exitCode, err = execinCommand(container, nspid, flag.Args()[1:]) } else { - exitCode, err = execCommand(container, flag.Args()[1:]) + exitCode, err = execCommand(container, *tty, flag.Args()[1:]) } if err != nil { log.Fatal(err) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 6eb2e09..9cf69f4 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -40,8 +40,10 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := setupDev(rootfs); err != nil { return err } - if err := setupPtmx(rootfs, console); err != nil { - return err + if console != "" { + if err := setupPtmx(rootfs, console); err != nil { + return err + } } if err := system.Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) From 52fa4de610a7e77da6b8afecebd09f759faa367d Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Thu, 20 Feb 2014 18:10:30 -0800 Subject: [PATCH 31/64] Make sure to close the pipe upon ctrl-d Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/nsinit/exec.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index b290ace..44d9aff 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -99,7 +99,10 @@ func execCommand(container *libcontainer.Container, tty bool, args []string) (in } defer term.RestoreTerminal(os.Stdin.Fd(), state) } else { - go io.Copy(inPipe, os.Stdin) + go func() { + defer inPipe.Close() + io.Copy(inPipe, os.Stdin) + }() go io.Copy(os.Stdout, outPipe) go io.Copy(os.Stderr, errPipe) } @@ -109,6 +112,7 @@ func execCommand(container *libcontainer.Container, tty bool, args []string) (in return -1, err } } + return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } From 5d71533d4eb75a96c9d163e6a2ec8181a91395fe Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 18:27:42 -0800 Subject: [PATCH 32/64] Make nsinit a proper go pkg and add the main in another dir Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 4 +++- libcontainer/nsinit/exec.go | 6 ++++-- libcontainer/nsinit/execin.go | 5 +++-- libcontainer/nsinit/init.go | 6 ++++-- libcontainer/nsinit/mount.go | 2 +- libcontainer/nsinit/ns_linux.go | 2 +- libcontainer/nsinit/{ => nsinit}/main.go | 7 ++++--- 7 files changed, 20 insertions(+), 12 deletions(-) rename libcontainer/nsinit/{ => nsinit}/main.go (87%) diff --git a/libcontainer/README.md b/libcontainer/README.md index 163161c..3a2a843 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -72,9 +72,11 @@ rootfs and copy a `container.json` file into the directory with your specified c To execution `/bin/bash` in the current directory as a container just run: ```bash -nsinit exec /bin/bash +nsinit -tty exec /bin/bash ``` +If you want a proper tty setup inside the new container you must use the `-tty` flag when running nsinit. + If you wish to spawn another process inside the container while your current bash session is running just run the exact same command again to get another bash shell or change the command. If the original process dies, PID 1, all other processes spawned inside the container will also be killed and the namespace will be removed. diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 44d9aff..9d0f7ff 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -1,6 +1,6 @@ // +build linux -package main +package nsinit import ( "fmt" @@ -16,7 +16,9 @@ import ( "syscall" ) -func execCommand(container *libcontainer.Container, tty bool, args []string) (int, error) { +// Exec performes setup outside of a namespace so that a container can be +// executed. Exec is a high level function for working with container namespaces. +func Exec(container *libcontainer.Container, tty bool, args []string) (int, error) { var ( master *os.File console string diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index d6224f9..85a8990 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -1,4 +1,4 @@ -package main +package nsinit import ( "fmt" @@ -11,7 +11,8 @@ import ( "syscall" ) -func execinCommand(container *libcontainer.Container, nspid int, args []string) (int, error) { +// ExecIn uses an existing pid and joins the pid's namespaces with the new command. +func ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) { for _, ns := range container.Namespaces { if err := system.Unshare(namespaceMap[ns]); err != nil { return -1, err diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index ef7fc4e..f80d785 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -1,6 +1,6 @@ // +build linux -package main +package nsinit import ( "fmt" @@ -15,7 +15,9 @@ import ( "syscall" ) -func initCommand(container *libcontainer.Container, console string, pipe io.ReadCloser, args []string) error { +// Init is the init process that first runs inside a new namespace to setup mounts, users, networking, +// and other options required for the new container. +func Init(container *libcontainer.Container, console string, pipe io.ReadCloser, args []string) error { rootfs, err := resolveRootfs() if err != nil { return err diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 9cf69f4..a73e97e 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -1,6 +1,6 @@ // +build linux -package main +package nsinit import ( "fmt" diff --git a/libcontainer/nsinit/ns_linux.go b/libcontainer/nsinit/ns_linux.go index 481bdf7..e42d4b8 100644 --- a/libcontainer/nsinit/ns_linux.go +++ b/libcontainer/nsinit/ns_linux.go @@ -1,4 +1,4 @@ -package main +package nsinit import ( "github.com/dotcloud/docker/pkg/libcontainer" diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go similarity index 87% rename from libcontainer/nsinit/main.go rename to libcontainer/nsinit/nsinit/main.go index f66ff0d..9d3c201 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -5,6 +5,7 @@ import ( "errors" "flag" "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/nsinit" "io/ioutil" "log" "os" @@ -42,9 +43,9 @@ func main() { } } if nspid > 0 { - exitCode, err = execinCommand(container, nspid, flag.Args()[1:]) + exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) } else { - exitCode, err = execCommand(container, *tty, flag.Args()[1:]) + exitCode, err = nsinit.Exec(container, *tty, flag.Args()[1:]) } if err != nil { log.Fatal(err) @@ -54,7 +55,7 @@ func main() { if flag.NArg() < 2 { log.Fatal(ErrWrongArguments) } - if err := initCommand(container, *console, os.NewFile(uintptr(*pipeFd), "pipe"), flag.Args()[1:]); err != nil { + if err := nsinit.Init(container, *console, os.NewFile(uintptr(*pipeFd), "pipe"), flag.Args()[1:]); err != nil { log.Fatal(err) } default: From 6054bda2b8a5c51d2663b3c7a563ad024ac12e80 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 18:38:28 -0800 Subject: [PATCH 33/64] Refactor the flag management for main Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 13 ++++--------- libcontainer/nsinit/init.go | 1 - libcontainer/nsinit/nsinit/main.go | 27 +++++++++++++++++---------- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 9d0f7ff..d2b87b6 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -44,18 +44,14 @@ func Exec(container *libcontainer.Container, tty bool, args []string) (int, erro system.UsetCloseOnExec(r.Fd()) command := createCommand(container, console, r.Fd(), args) - if !tty { - inPipe, err = command.StdinPipe() - if err != nil { + if inPipe, err = command.StdinPipe(); err != nil { return -1, err } - outPipe, err = command.StdoutPipe() - if err != nil { + if outPipe, err = command.StdoutPipe(); err != nil { return -1, err } - errPipe, err = command.StderrPipe() - if err != nil { + if errPipe, err = command.StderrPipe(); err != nil { return -1, err } } @@ -63,7 +59,6 @@ func Exec(container *libcontainer.Container, tty bool, args []string) (int, erro if err := command.Start(); err != nil { return -1, err } - if err := writePidFile(command); err != nil { command.Process.Kill() return -1, err @@ -94,6 +89,7 @@ func Exec(container *libcontainer.Container, tty bool, args []string) (int, erro if tty { go io.Copy(os.Stdout, master) go io.Copy(master, os.Stdin) + state, err := setupWindow(master) if err != nil { command.Process.Kill() @@ -114,7 +110,6 @@ func Exec(container *libcontainer.Container, tty bool, args []string) (int, erro return -1, err } } - return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index f80d785..88a5c3c 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -173,7 +173,6 @@ func setupVethNetwork(config *libcontainer.Network, tempVethName string) error { // has been created and setup func getVethName(pipe io.ReadCloser) (string, error) { defer pipe.Close() - data, err := ioutil.ReadAll(pipe) if err != nil { return "", fmt.Errorf("error reading from stdin %s", err) diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index 9d3c201..33a7747 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -12,27 +12,34 @@ import ( "strconv" ) +var ( + console string + tty bool + pipeFd int +) + var ( ErrUnsupported = errors.New("Unsupported method") ErrWrongArguments = errors.New("Wrong argument count") ) -func main() { - var ( - console = flag.String("console", "", "Console (pty slave) name") - tty = flag.Bool("tty", false, "Create a tty") - pipeFd = flag.Int("pipe", 0, "sync pipe fd") - ) - flag.Parse() +func init() { + flag.StringVar(&console, "console", "", "console (pty slave) path") + flag.BoolVar(&tty, "tty", false, "create a tty") + flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") + flag.Parse() +} + +func main() { container, err := loadContainer() if err != nil { log.Fatal(err) } - if flag.NArg() < 1 { log.Fatal(ErrWrongArguments) } + switch flag.Arg(0) { case "exec": // this is executed outside of the namespace in the cwd var exitCode int @@ -45,7 +52,7 @@ func main() { if nspid > 0 { exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) } else { - exitCode, err = nsinit.Exec(container, *tty, flag.Args()[1:]) + exitCode, err = nsinit.Exec(container, tty, flag.Args()[1:]) } if err != nil { log.Fatal(err) @@ -55,7 +62,7 @@ func main() { if flag.NArg() < 2 { log.Fatal(ErrWrongArguments) } - if err := nsinit.Init(container, *console, os.NewFile(uintptr(*pipeFd), "pipe"), flag.Args()[1:]); err != nil { + if err := nsinit.Init(container, console, os.NewFile(uintptr(pipeFd), "pipe"), flag.Args()[1:]); err != nil { log.Fatal(err) } default: From d67915851dc941c4dc59ea6375ec21a327bf48f4 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 13:53:11 -0800 Subject: [PATCH 34/64] Move tty into container.json Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 7 +++---- libcontainer/container.go | 1 + libcontainer/container.json | 3 ++- libcontainer/nsinit/exec.go | 8 ++++---- libcontainer/nsinit/nsinit/main.go | 4 +--- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/libcontainer/README.md b/libcontainer/README.md index 3a2a843..89a4ec0 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -17,6 +17,7 @@ Sample `container.json` file: ```json { "hostname": "koye", + "tty": true, "environment": [ "HOME=/", "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", @@ -55,7 +56,7 @@ Sample `container.json` file: "cgroups": { "name": "docker-koye", "parent": "docker", - "memory": 524800 + "memory": 5248000 } } ``` @@ -72,11 +73,9 @@ rootfs and copy a `container.json` file into the directory with your specified c To execution `/bin/bash` in the current directory as a container just run: ```bash -nsinit -tty exec /bin/bash +nsinit exec /bin/bash ``` -If you want a proper tty setup inside the new container you must use the `-tty` flag when running nsinit. - If you wish to spawn another process inside the container while your current bash session is running just run the exact same command again to get another bash shell or change the command. If the original process dies, PID 1, all other processes spawned inside the container will also be killed and the namespace will be removed. diff --git a/libcontainer/container.go b/libcontainer/container.go index e6e4b47..3c1b62b 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -12,6 +12,7 @@ type Container struct { User string `json:"user,omitempty"` // user to execute the process as WorkingDir string `json:"working_dir,omitempty"` // current working directory Env []string `json:"environment,omitempty"` // environment to set + Tty bool `json:"tty,omitempty"` // setup a proper tty or not Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop Network *Network `json:"network,omitempty"` // nil for host's network stack diff --git a/libcontainer/container.json b/libcontainer/container.json index c1a07dc..07e52df 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -1,5 +1,6 @@ { "hostname": "koye", + "tty": true, "environment": [ "HOME=/", "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", @@ -38,6 +39,6 @@ "cgroups": { "name": "docker-koye", "parent": "docker", - "memory": 524800 + "memory": 5248000 } } diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index d2b87b6..e2adf3d 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -18,7 +18,7 @@ import ( // Exec performes setup outside of a namespace so that a container can be // executed. Exec is a high level function for working with container namespaces. -func Exec(container *libcontainer.Container, tty bool, args []string) (int, error) { +func Exec(container *libcontainer.Container, args []string) (int, error) { var ( master *os.File console string @@ -28,7 +28,7 @@ func Exec(container *libcontainer.Container, tty bool, args []string) (int, erro outPipe, errPipe io.ReadCloser ) - if tty { + if container.Tty { master, console, err = createMasterAndConsole() if err != nil { return -1, err @@ -44,7 +44,7 @@ func Exec(container *libcontainer.Container, tty bool, args []string) (int, erro system.UsetCloseOnExec(r.Fd()) command := createCommand(container, console, r.Fd(), args) - if !tty { + if !container.Tty { if inPipe, err = command.StdinPipe(); err != nil { return -1, err } @@ -86,7 +86,7 @@ func Exec(container *libcontainer.Container, tty bool, args []string) (int, erro w.Close() r.Close() - if tty { + if container.Tty { go io.Copy(os.Stdout, master) go io.Copy(master, os.Stdin) diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index 33a7747..6508a3e 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -14,7 +14,6 @@ import ( var ( console string - tty bool pipeFd int ) @@ -25,7 +24,6 @@ var ( func init() { flag.StringVar(&console, "console", "", "console (pty slave) path") - flag.BoolVar(&tty, "tty", false, "create a tty") flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") flag.Parse() @@ -52,7 +50,7 @@ func main() { if nspid > 0 { exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) } else { - exitCode, err = nsinit.Exec(container, tty, flag.Args()[1:]) + exitCode, err = nsinit.Exec(container, flag.Args()[1:]) } if err != nil { log.Fatal(err) From d40fbbb69be7ee4e8aef1b2730a911f3fa93e36c Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 14:49:55 -0800 Subject: [PATCH 35/64] Add good logging support to both sides Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 35 ++++++++++++++++++++++----- libcontainer/nsinit/init.go | 23 ++++++++++-------- libcontainer/nsinit/nsinit/main.go | 39 ++++++++++++++++++++++++++---- 3 files changed, 76 insertions(+), 21 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index e2adf3d..24e722a 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -11,6 +11,7 @@ import ( "github.com/dotcloud/docker/pkg/term" "io" "io/ioutil" + "log" "os" "os/exec" "syscall" @@ -18,7 +19,7 @@ import ( // Exec performes setup outside of a namespace so that a container can be // executed. Exec is a high level function for working with container namespaces. -func Exec(container *libcontainer.Container, args []string) (int, error) { +func Exec(container *libcontainer.Container, logFile string, args []string) (int, error) { var ( master *os.File console string @@ -29,6 +30,7 @@ func Exec(container *libcontainer.Container, args []string) (int, error) { ) if container.Tty { + log.Printf("setting up master and console") master, console, err = createMasterAndConsole() if err != nil { return -1, err @@ -43,8 +45,9 @@ func Exec(container *libcontainer.Container, args []string) (int, error) { } system.UsetCloseOnExec(r.Fd()) - command := createCommand(container, console, r.Fd(), args) + command := createCommand(container, console, logFile, r.Fd(), args) if !container.Tty { + log.Printf("opening pipes on command") if inPipe, err = command.StdinPipe(); err != nil { return -1, err } @@ -56,9 +59,11 @@ func Exec(container *libcontainer.Container, args []string) (int, error) { } } + log.Printf("staring init") if err := command.Start(); err != nil { return -1, err } + log.Printf("writting state file") if err := writePidFile(command); err != nil { command.Process.Kill() return -1, err @@ -68,6 +73,7 @@ func Exec(container *libcontainer.Container, args []string) (int, error) { // Do this before syncing with child so that no children // can escape the cgroup if container.Cgroups != nil { + log.Printf("setting up cgroups") if err := container.Cgroups.Apply(command.Process.Pid); err != nil { command.Process.Kill() return -1, err @@ -75,18 +81,22 @@ func Exec(container *libcontainer.Container, args []string) (int, error) { } if container.Network != nil { - vethPair, err := initializeContainerVeth(container.Network.Bridge, command.Process.Pid) + log.Printf("creating veth pair") + vethPair, err := initializeContainerVeth(container.Network.Bridge, container.Network.Mtu, command.Process.Pid) if err != nil { return -1, err } + log.Printf("sending %s as veth pair name", vethPair) sendVethName(w, vethPair) } // Sync with child + log.Printf("closing sync pipes") w.Close() r.Close() if container.Tty { + log.Printf("starting copy for tty") go io.Copy(os.Stdout, master) go io.Copy(master, os.Stdin) @@ -97,6 +107,7 @@ func Exec(container *libcontainer.Container, args []string) (int, error) { } defer term.RestoreTerminal(os.Stdin.Fd(), state) } else { + log.Printf("starting copy for std pipes") go func() { defer inPipe.Close() io.Copy(inPipe, os.Stdin) @@ -105,11 +116,13 @@ func Exec(container *libcontainer.Container, args []string) (int, error) { go io.Copy(os.Stderr, errPipe) } + log.Printf("waiting on process") if err := command.Wait(); err != nil { if _, ok := err.(*exec.ExitError); !ok { return -1, err } } + log.Printf("process ended") return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } @@ -126,17 +139,22 @@ func sendVethName(pipe io.Writer, name string) { // Then will with set the other side of the veth pair into the container's namespaced // using the pid and returns the veth's interface name to provide to the container to // finish setting up the interface inside the namespace -func initializeContainerVeth(bridge string, nspid int) (string, error) { +func initializeContainerVeth(bridge string, mtu, nspid int) (string, error) { name1, name2, err := createVethPair() if err != nil { return "", err } + log.Printf("veth pair created %s <> %s", name1, name2) if err := network.SetInterfaceMaster(name1, bridge); err != nil { return "", err } + if err := network.SetMtu(name1, mtu); err != nil { + return "", err + } if err := network.InterfaceUp(name1); err != nil { return "", err } + log.Printf("setting %s inside %d namespace", name2, nspid) if err := network.SetInterfaceInNamespacePid(name2, nspid); err != nil { return "", err } @@ -200,8 +218,13 @@ func deletePidFile() error { // createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces // defined on the container's configuration and use the current binary as the init with the // args provided -func createCommand(container *libcontainer.Container, console string, pipe uintptr, args []string) *exec.Cmd { - command := exec.Command("nsinit", append([]string{"-console", console, "-pipe", fmt.Sprint(pipe), "init"}, args...)...) +func createCommand(container *libcontainer.Container, console, logFile string, pipe uintptr, args []string) *exec.Cmd { + command := exec.Command("nsinit", append([]string{ + "-console", console, + "-pipe", fmt.Sprint(pipe), + "-log", logFile, + "init"}, args...)...) + command.SysProcAttr = &syscall.SysProcAttr{ Cloneflags: uintptr(getNamespaceFlags(container.Namespaces)), } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 88a5c3c..8fc5f3d 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -10,6 +10,7 @@ import ( "github.com/dotcloud/docker/pkg/system" "io" "io/ioutil" + "log" "os" "path/filepath" "syscall" @@ -17,19 +18,23 @@ import ( // Init is the init process that first runs inside a new namespace to setup mounts, users, networking, // and other options required for the new container. -func Init(container *libcontainer.Container, console string, pipe io.ReadCloser, args []string) error { - rootfs, err := resolveRootfs() +func Init(container *libcontainer.Container, uncleanRootfs, console string, pipe io.ReadCloser, args []string) error { + rootfs, err := resolveRootfs(uncleanRootfs) if err != nil { return err } + log.Printf("initializing namespace at %s", rootfs) // We always read this as it is a way to sync with the parent as well tempVethName, err := getVethName(pipe) if err != nil { return err } - + if tempVethName != "" { + log.Printf("received veth name %s", tempVethName) + } if console != "" { + log.Printf("setting up console for %s", console) // close pipes so that we can replace it with the pty os.Stdin.Close() os.Stdout.Close() @@ -42,7 +47,6 @@ func Init(container *libcontainer.Container, console string, pipe io.ReadCloser, return fmt.Errorf("dup2 slave %s", err) } } - if _, err := system.Setsid(); err != nil { return fmt.Errorf("setsid %s", err) } @@ -63,9 +67,11 @@ func Init(container *libcontainer.Container, console string, pipe io.ReadCloser, if err := system.Sethostname(container.Hostname); err != nil { return fmt.Errorf("sethostname %s", err) } + log.Printf("dropping capabilities") if err := capabilities.DropCapabilities(container); err != nil { return fmt.Errorf("drop capabilities %s", err) } + log.Printf("setting user in namespace") if err := setupUser(container); err != nil { return fmt.Errorf("setup user %s", err) } @@ -74,6 +80,7 @@ func Init(container *libcontainer.Container, console string, pipe io.ReadCloser, return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) } } + log.Printf("execing %s goodbye", args[0]) if err := system.Exec(args[0], args[0:], container.Env); err != nil { return fmt.Errorf("exec %s", err) } @@ -82,12 +89,8 @@ func Init(container *libcontainer.Container, console string, pipe io.ReadCloser, // resolveRootfs ensures that the current working directory is // not a symlink and returns the absolute path to the rootfs -func resolveRootfs() (string, error) { - cwd, err := os.Getwd() - if err != nil { - return "", err - } - rootfs, err := filepath.Abs(cwd) +func resolveRootfs(uncleanRootfs string) (string, error) { + rootfs, err := filepath.Abs(uncleanRootfs) if err != nil { return "", err } diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index 6508a3e..0873c09 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -6,6 +6,7 @@ import ( "flag" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/nsinit" + "io" "io/ioutil" "log" "os" @@ -15,6 +16,7 @@ import ( var ( console string pipeFd int + logFile string ) var ( @@ -24,22 +26,27 @@ var ( func init() { flag.StringVar(&console, "console", "", "console (pty slave) path") + flag.StringVar(&logFile, "log", "none", "log options (none, stderr, or a file path)") flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") flag.Parse() } func main() { + if flag.NArg() < 1 { + log.Fatal(ErrWrongArguments) + } container, err := loadContainer() if err != nil { log.Fatal(err) } - if flag.NArg() < 1 { - log.Fatal(ErrWrongArguments) + if err := setupLogging(); err != nil { + log.Fatal(err) } - switch flag.Arg(0) { case "exec": // this is executed outside of the namespace in the cwd + log.SetPrefix("[nsinit exec] ") + var exitCode int nspid, err := readPid() if err != nil { @@ -50,17 +57,22 @@ func main() { if nspid > 0 { exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) } else { - exitCode, err = nsinit.Exec(container, flag.Args()[1:]) + exitCode, err = nsinit.Exec(container, logFile, flag.Args()[1:]) } if err != nil { log.Fatal(err) } os.Exit(exitCode) case "init": // this is executed inside of the namespace to setup the container + log.SetPrefix("[nsinit init] ") + cwd, err := os.Getwd() + if err != nil { + log.Fatal(err) + } if flag.NArg() < 2 { log.Fatal(ErrWrongArguments) } - if err := nsinit.Init(container, console, os.NewFile(uintptr(pipeFd), "pipe"), flag.Args()[1:]); err != nil { + if err := nsinit.Init(container, cwd, console, os.NewFile(uintptr(pipeFd), "pipe"), flag.Args()[1:]); err != nil { log.Fatal(err) } default: @@ -93,3 +105,20 @@ func readPid() (int, error) { } return pid, nil } + +func setupLogging() (err error) { + var writer io.Writer + switch logFile { + case "stderr": + writer = os.Stderr + case "none", "": + writer = ioutil.Discard + default: + writer, err = os.OpenFile(logFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0755) + if err != nil { + return err + } + } + log.SetOutput(writer) + return nil +} From e90b85bdc06ffaef70f801e00eb62158d016e2a7 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 15:32:50 -0800 Subject: [PATCH 36/64] User os.Args[0] as name to reexec Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 24e722a..ba548a2 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -219,7 +219,9 @@ func deletePidFile() error { // defined on the container's configuration and use the current binary as the init with the // args provided func createCommand(container *libcontainer.Container, console, logFile string, pipe uintptr, args []string) *exec.Cmd { - command := exec.Command("nsinit", append([]string{ + // get our binary name so we can always reexec ourself + name := os.Args[0] + command := exec.Command(name, append([]string{ "-console", console, "-pipe", fmt.Sprint(pipe), "-log", logFile, From b9bc36a8bb9cb928a0be940e22e77fb851175f87 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 16:17:18 -0800 Subject: [PATCH 37/64] Use lookup path for init Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 1 + libcontainer/nsinit/init.go | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index ba548a2..80fe849 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -230,5 +230,6 @@ func createCommand(container *libcontainer.Container, console, logFile string, p command.SysProcAttr = &syscall.SysProcAttr{ Cloneflags: uintptr(getNamespaceFlags(container.Namespaces)), } + command.Env = container.Env return command } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 8fc5f3d..04716ba 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -12,6 +12,7 @@ import ( "io/ioutil" "log" "os" + "os/exec" "path/filepath" "syscall" ) @@ -80,8 +81,13 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, pipe return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) } } - log.Printf("execing %s goodbye", args[0]) - if err := system.Exec(args[0], args[0:], container.Env); err != nil { + name, err := exec.LookPath(args[0]) + if err != nil { + return err + } + + log.Printf("execing %s goodbye", name) + if err := system.Exec(name, args[0:], container.Env); err != nil { return fmt.Errorf("exec %s", err) } panic("unreachable") From b3d2325c5fc91e3065715ef94aacabd619035e2d Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 16:28:43 -0800 Subject: [PATCH 38/64] Pass pipes into Exec function Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 14 +++++++------- libcontainer/nsinit/nsinit/main.go | 4 +++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 80fe849..98f5209 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -19,7 +19,7 @@ import ( // Exec performes setup outside of a namespace so that a container can be // executed. Exec is a high level function for working with container namespaces. -func Exec(container *libcontainer.Container, logFile string, args []string) (int, error) { +func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io.Writer, logFile string, args []string) (int, error) { var ( master *os.File console string @@ -97,23 +97,23 @@ func Exec(container *libcontainer.Container, logFile string, args []string) (int if container.Tty { log.Printf("starting copy for tty") - go io.Copy(os.Stdout, master) - go io.Copy(master, os.Stdin) + go io.Copy(stdout, master) + go io.Copy(master, stdin) state, err := setupWindow(master) if err != nil { command.Process.Kill() return -1, err } - defer term.RestoreTerminal(os.Stdin.Fd(), state) + defer term.RestoreTerminal(uintptr(syscall.Stdin), state) } else { log.Printf("starting copy for std pipes") go func() { defer inPipe.Close() - io.Copy(inPipe, os.Stdin) + io.Copy(inPipe, stdin) }() - go io.Copy(os.Stdout, outPipe) - go io.Copy(os.Stderr, errPipe) + go io.Copy(stdout, outPipe) + go io.Copy(stderr, errPipe) } log.Printf("waiting on process") diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index 0873c09..e6e3827 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -57,7 +57,9 @@ func main() { if nspid > 0 { exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) } else { - exitCode, err = nsinit.Exec(container, logFile, flag.Args()[1:]) + exitCode, err = nsinit.Exec(container, + os.Stdin, os.Stdout, os.Stderr, + logFile, flag.Args()[1:]) } if err != nil { log.Fatal(err) From fa64bff7151c8050d7e6cab5e6d965bc389ae6e6 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 16:40:32 -0800 Subject: [PATCH 39/64] Pass tty master to Exec Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 3 +-- libcontainer/nsinit/nsinit/main.go | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 98f5209..3622196 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -19,9 +19,8 @@ import ( // Exec performes setup outside of a namespace so that a container can be // executed. Exec is a high level function for working with container namespaces. -func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io.Writer, logFile string, args []string) (int, error) { +func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io.Writer, master *os.File, logFile string, args []string) (int, error) { var ( - master *os.File console string err error diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index e6e3827..28d42d4 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -58,7 +58,7 @@ func main() { exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) } else { exitCode, err = nsinit.Exec(container, - os.Stdin, os.Stdout, os.Stderr, + os.Stdin, os.Stdout, os.Stderr, nil, logFile, flag.Args()[1:]) } if err != nil { From d2fa488fa2a8cf1d5d1ccc323246fb138e64a1f2 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 17:11:57 -0800 Subject: [PATCH 40/64] Initial commit of libcontainer running docker Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 2 +- libcontainer/nsinit/ns_linux.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 3622196..6671ebe 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -227,7 +227,7 @@ func createCommand(container *libcontainer.Container, console, logFile string, p "init"}, args...)...) command.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: uintptr(getNamespaceFlags(container.Namespaces)), + Cloneflags: uintptr(GetNamespaceFlags(container.Namespaces)), } command.Env = container.Env return command diff --git a/libcontainer/nsinit/ns_linux.go b/libcontainer/nsinit/ns_linux.go index e42d4b8..58af247 100644 --- a/libcontainer/nsinit/ns_linux.go +++ b/libcontainer/nsinit/ns_linux.go @@ -28,7 +28,7 @@ var namespaceFileMap = map[libcontainer.Namespace]string{ // getNamespaceFlags parses the container's Namespaces options to set the correct // flags on clone, unshare, and setns -func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { +func GetNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { for _, ns := range namespaces { flag |= namespaceMap[ns] } From cda4f27f57919dcfd0d2cc380456551ea50bdaa2 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 21:14:21 -0800 Subject: [PATCH 41/64] Export functions of nsinit Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 6671ebe..b2eaa0b 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -30,7 +30,7 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. if container.Tty { log.Printf("setting up master and console") - master, console, err = createMasterAndConsole() + master, console, err = CreateMasterAndConsole() if err != nil { return -1, err } @@ -44,7 +44,7 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. } system.UsetCloseOnExec(r.Fd()) - command := createCommand(container, console, logFile, r.Fd(), args) + command := CreateCommand(container, console, logFile, r.Fd(), args) if !container.Tty { log.Printf("opening pipes on command") if inPipe, err = command.StdinPipe(); err != nil { @@ -81,12 +81,12 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. if container.Network != nil { log.Printf("creating veth pair") - vethPair, err := initializeContainerVeth(container.Network.Bridge, container.Network.Mtu, command.Process.Pid) + vethPair, err := InitializeContainerVeth(container.Network.Bridge, container.Network.Mtu, command.Process.Pid) if err != nil { return -1, err } log.Printf("sending %s as veth pair name", vethPair) - sendVethName(w, vethPair) + SendVethName(w, vethPair) } // Sync with child @@ -99,7 +99,7 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. go io.Copy(stdout, master) go io.Copy(master, stdin) - state, err := setupWindow(master) + state, err := SetupWindow(master, os.Stdin) if err != nil { command.Process.Kill() return -1, err @@ -125,9 +125,9 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } -// sendVethName writes the veth pair name to the child's stdin then closes the +// SendVethName writes the veth pair name to the child's stdin then closes the // pipe so that the child stops waiting for more data -func sendVethName(pipe io.Writer, name string) { +func SendVethName(pipe io.Writer, name string) { fmt.Fprint(pipe, name) } @@ -138,7 +138,7 @@ func sendVethName(pipe io.Writer, name string) { // Then will with set the other side of the veth pair into the container's namespaced // using the pid and returns the veth's interface name to provide to the container to // finish setting up the interface inside the namespace -func initializeContainerVeth(bridge string, mtu, nspid int) (string, error) { +func InitializeContainerVeth(bridge string, mtu, nspid int) (string, error) { name1, name2, err := createVethPair() if err != nil { return "", err @@ -160,20 +160,22 @@ func initializeContainerVeth(bridge string, mtu, nspid int) (string, error) { return name2, nil } -func setupWindow(master *os.File) (*term.State, error) { - ws, err := term.GetWinsize(os.Stdin.Fd()) +// SetupWindow gets the parent window size and sets the master +// pty to the current size and set the parents mode to RAW +func SetupWindow(master, parent *os.File) (*term.State, error) { + ws, err := term.GetWinsize(parent.Fd()) if err != nil { return nil, err } if err := term.SetWinsize(master.Fd(), ws); err != nil { return nil, err } - return term.SetRawTerminal(os.Stdin.Fd()) + return term.SetRawTerminal(parent.Fd()) } -// createMasterAndConsole will open /dev/ptmx on the host and retreive the +// CreateMasterAndConsole will open /dev/ptmx on the host and retreive the // pts name for use as the pty slave inside the container -func createMasterAndConsole() (*os.File, string, error) { +func CreateMasterAndConsole() (*os.File, string, error) { master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) if err != nil { return nil, "", err @@ -217,7 +219,7 @@ func deletePidFile() error { // createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces // defined on the container's configuration and use the current binary as the init with the // args provided -func createCommand(container *libcontainer.Container, console, logFile string, pipe uintptr, args []string) *exec.Cmd { +func CreateCommand(container *libcontainer.Container, console, logFile string, pipe uintptr, args []string) *exec.Cmd { // get our binary name so we can always reexec ourself name := os.Args[0] command := exec.Command(name, append([]string{ From 609c298810f3c22176493a425dae22f274bfe20b Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 22:20:15 -0800 Subject: [PATCH 42/64] Refactor network creation and initialization into strategies Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 9 ++- libcontainer/container.go | 13 ++-- libcontainer/container.json | 9 ++- libcontainer/network/strategy.go | 32 ++++++++++ libcontainer/network/veth.go | 103 +++++++++++++++++++++++++++++++ libcontainer/nsinit/exec.go | 97 +++++++++++------------------ libcontainer/nsinit/init.go | 55 ++++++----------- 7 files changed, 211 insertions(+), 107 deletions(-) create mode 100644 libcontainer/network/strategy.go create mode 100644 libcontainer/network/veth.go diff --git a/libcontainer/README.md b/libcontainer/README.md index 89a4ec0..36553af 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -45,12 +45,17 @@ Sample `container.json` file: "AUDIT_WRITE", "AUDIT_CONTROL", "MAC_OVERRIDE", - "MAC_ADMIN" + "MAC_ADMIN", + "NET_ADMIN" ], "network": { + "type": "veth", + "context": { + "bridge": "docker0", + "prefix": "dock" + }, "address": "172.17.0.100/16", "gateway": "172.17.42.1", - "bridge": "docker0", "mtu": 1500 }, "cgroups": { diff --git a/libcontainer/container.go b/libcontainer/container.go index 3c1b62b..4a47977 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -4,6 +4,10 @@ import ( "github.com/dotcloud/docker/pkg/cgroups" ) +// Context is a generic key value pair that allows +// arbatrary data to be sent +type Context map[string]string + // Container defines configuration options for how a // container is setup inside a directory and how a process should be executed type Container struct { @@ -24,8 +28,9 @@ type Container struct { // The network configuration can be omited from a container causing the // container to be setup with the host's networking stack type Network struct { - Address string `json:"address,omitempty"` - Gateway string `json:"gateway,omitempty"` - Bridge string `json:"bridge,omitempty"` - Mtu int `json:"mtu,omitempty"` + Type string `json:"type,omitempty"` // type of networking to setup i.e. veth, macvlan, etc + Context Context `json:"context,omitempty"` // generic context for type specific networking options + Address string `json:"address,omitempty"` + Gateway string `json:"gateway,omitempty"` + Mtu int `json:"mtu,omitempty"` } diff --git a/libcontainer/container.json b/libcontainer/container.json index 07e52df..c2b21f8 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -28,12 +28,17 @@ "AUDIT_WRITE", "AUDIT_CONTROL", "MAC_OVERRIDE", - "MAC_ADMIN" + "MAC_ADMIN", + "NET_ADMIN" ], "network": { + "type": "veth", + "context": { + "bridge": "docker0", + "prefix": "dock" + }, "address": "172.17.0.100/16", "gateway": "172.17.42.1", - "bridge": "docker0", "mtu": 1500 }, "cgroups": { diff --git a/libcontainer/network/strategy.go b/libcontainer/network/strategy.go new file mode 100644 index 0000000..8ecc11a --- /dev/null +++ b/libcontainer/network/strategy.go @@ -0,0 +1,32 @@ +package network + +import ( + "errors" + "github.com/dotcloud/docker/pkg/libcontainer" +) + +var ( + ErrNotValidStrategyType = errors.New("not a valid network strategy type") +) + +var strategies = map[string]NetworkStrategy{ + "veth": &Veth{}, +} + +// NetworkStrategy represends a specific network configuration for +// a containers networking stack +type NetworkStrategy interface { + Create(*libcontainer.Network, int) (libcontainer.Context, error) + Initialize(*libcontainer.Network, libcontainer.Context) error +} + +// GetStrategy returns the specific network strategy for the +// provided type. If no strategy is registered for the type an +// ErrNotValidStrategyType is returned. +func GetStrategy(tpe string) (NetworkStrategy, error) { + s, exists := strategies[tpe] + if !exists { + return nil, ErrNotValidStrategyType + } + return s, nil +} diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go new file mode 100644 index 0000000..61fec55 --- /dev/null +++ b/libcontainer/network/veth.go @@ -0,0 +1,103 @@ +package network + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/utils" + "log" +) + +type Veth struct { +} + +func (v *Veth) Create(n *libcontainer.Network, nspid int) (libcontainer.Context, error) { + log.Printf("creating veth network") + var ( + bridge string + prefix string + exists bool + ) + if bridge, exists = n.Context["bridge"]; !exists { + return nil, fmt.Errorf("bridge does not exist in network context") + } + if prefix, exists = n.Context["prefix"]; !exists { + return nil, fmt.Errorf("veth prefix does not exist in network context") + } + name1, name2, err := createVethPair(prefix) + if err != nil { + return nil, err + } + context := libcontainer.Context{ + "vethHost": name1, + "vethChild": name2, + } + log.Printf("veth pair created %s <> %s", name1, name2) + if err := SetInterfaceMaster(name1, bridge); err != nil { + return context, err + } + if err := SetMtu(name1, n.Mtu); err != nil { + return context, err + } + if err := InterfaceUp(name1); err != nil { + return context, err + } + log.Printf("setting %s inside %d namespace", name2, nspid) + if err := SetInterfaceInNamespacePid(name2, nspid); err != nil { + return context, err + } + return context, nil +} + +func (v *Veth) Initialize(config *libcontainer.Network, context libcontainer.Context) error { + var ( + vethChild string + exists bool + ) + if vethChild, exists = context["vethChild"]; !exists { + return fmt.Errorf("vethChild does not exist in network context") + } + if err := InterfaceDown(vethChild); err != nil { + return fmt.Errorf("interface down %s %s", vethChild, err) + } + if err := ChangeInterfaceName(vethChild, "eth0"); err != nil { + return fmt.Errorf("change %s to eth0 %s", vethChild, err) + } + if err := SetInterfaceIp("eth0", config.Address); err != nil { + return fmt.Errorf("set eth0 ip %s", err) + } + if err := SetMtu("eth0", config.Mtu); err != nil { + return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) + } + if err := InterfaceUp("eth0"); err != nil { + return fmt.Errorf("eth0 up %s", err) + } + if err := SetMtu("lo", config.Mtu); err != nil { + return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) + } + if err := InterfaceUp("lo"); err != nil { + return fmt.Errorf("lo up %s", err) + } + if config.Gateway != "" { + if err := SetDefaultGateway(config.Gateway); err != nil { + return fmt.Errorf("set gateway to %s %s", config.Gateway, err) + } + } + return nil +} + +// createVethPair will automatically generage two random names for +// the veth pair and ensure that they have been created +func createVethPair(prefix string) (name1 string, name2 string, err error) { + name1, err = utils.GenerateRandomName(prefix, 4) + if err != nil { + return + } + name2, err = utils.GenerateRandomName(prefix, 4) + if err != nil { + return + } + if err = CreateVethPair(name1, name2); err != nil { + return + } + return +} diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index b2eaa0b..6c4d766 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -3,10 +3,10 @@ package nsinit import ( + "encoding/json" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/network" - "github.com/dotcloud/docker/pkg/libcontainer/utils" "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/term" "io" @@ -19,11 +19,11 @@ import ( // Exec performes setup outside of a namespace so that a container can be // executed. Exec is a high level function for working with container namespaces. -func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io.Writer, master *os.File, logFile string, args []string) (int, error) { +func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io.Writer, + master *os.File, logFile string, args []string) (int, error) { var ( - console string - err error - + console string + err error inPipe io.WriteCloser outPipe, errPipe io.ReadCloser ) @@ -46,7 +46,7 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. command := CreateCommand(container, console, logFile, r.Fd(), args) if !container.Tty { - log.Printf("opening pipes on command") + log.Printf("opening std pipes") if inPipe, err = command.StdinPipe(); err != nil { return -1, err } @@ -78,15 +78,9 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. return -1, err } } - - if container.Network != nil { - log.Printf("creating veth pair") - vethPair, err := InitializeContainerVeth(container.Network.Bridge, container.Network.Mtu, command.Process.Pid) - if err != nil { - return -1, err - } - log.Printf("sending %s as veth pair name", vethPair) - SendVethName(w, vethPair) + if err := InitializeNetworking(container, command.Process.Pid, w); err != nil { + command.Process.Kill() + return -1, err } // Sync with child @@ -104,7 +98,7 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. command.Process.Kill() return -1, err } - defer term.RestoreTerminal(uintptr(syscall.Stdin), state) + defer term.RestoreTerminal(os.Stdin.Fd(), state) } else { log.Printf("starting copy for std pipes") go func() { @@ -125,39 +119,34 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } -// SendVethName writes the veth pair name to the child's stdin then closes the -// pipe so that the child stops waiting for more data -func SendVethName(pipe io.Writer, name string) { - fmt.Fprint(pipe, name) +func InitializeNetworking(container *libcontainer.Container, nspid int, pipe io.Writer) error { + if container.Network != nil { + log.Printf("creating host network configuration type %s", container.Network.Type) + strategy, err := network.GetStrategy(container.Network.Type) + if err != nil { + return err + } + networkContext, err := strategy.Create(container.Network, nspid) + if err != nil { + return err + } + log.Printf("sending %v as network context", networkContext) + if err := SendContext(pipe, networkContext); err != nil { + return err + } + } + return nil } -// initializeContainerVeth will create a veth pair and setup the host's -// side of the pair by setting the specified bridge as the master and bringing -// up the interface. -// -// Then will with set the other side of the veth pair into the container's namespaced -// using the pid and returns the veth's interface name to provide to the container to -// finish setting up the interface inside the namespace -func InitializeContainerVeth(bridge string, mtu, nspid int) (string, error) { - name1, name2, err := createVethPair() +// SendContext writes the veth pair name to the child's stdin then closes the +// pipe so that the child stops waiting for more data +func SendContext(pipe io.Writer, context libcontainer.Context) error { + data, err := json.Marshal(context) if err != nil { - return "", err + return err } - log.Printf("veth pair created %s <> %s", name1, name2) - if err := network.SetInterfaceMaster(name1, bridge); err != nil { - return "", err - } - if err := network.SetMtu(name1, mtu); err != nil { - return "", err - } - if err := network.InterfaceUp(name1); err != nil { - return "", err - } - log.Printf("setting %s inside %d namespace", name2, nspid) - if err := network.SetInterfaceInNamespacePid(name2, nspid); err != nil { - return "", err - } - return name2, nil + pipe.Write(data) + return nil } // SetupWindow gets the parent window size and sets the master @@ -190,29 +179,13 @@ func CreateMasterAndConsole() (*os.File, string, error) { return master, console, nil } -// createVethPair will automatically generage two random names for -// the veth pair and ensure that they have been created -func createVethPair() (name1 string, name2 string, err error) { - name1, err = utils.GenerateRandomName("dock", 4) - if err != nil { - return - } - name2, err = utils.GenerateRandomName("dock", 4) - if err != nil { - return - } - if err = network.CreateVethPair(name1, name2); err != nil { - return - } - return -} - // writePidFile writes the namespaced processes pid to .nspid in the rootfs for the container func writePidFile(command *exec.Cmd) error { return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(command.Process.Pid)), 0655) } func deletePidFile() error { + log.Printf("removing .nspid file") return os.Remove(".nspid") } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 04716ba..f530d4a 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -3,6 +3,7 @@ package nsinit import ( + "encoding/json" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" @@ -27,13 +28,10 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, pipe log.Printf("initializing namespace at %s", rootfs) // We always read this as it is a way to sync with the parent as well - tempVethName, err := getVethName(pipe) + context, err := GetContextFromParent(pipe) if err != nil { return err } - if tempVethName != "" { - log.Printf("received veth name %s", tempVethName) - } if console != "" { log.Printf("setting up console for %s", console) // close pipes so that we can replace it with the pty @@ -62,7 +60,7 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, pipe if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { return fmt.Errorf("setup mount namespace %s", err) } - if err := setupVethNetwork(container.Network, tempVethName); err != nil { + if err := setupNetwork(container.Network, context); err != nil { return fmt.Errorf("setup networking %s", err) } if err := system.Sethostname(container.Hostname); err != nil { @@ -145,46 +143,29 @@ func openTerminal(name string, flag int) (*os.File, error) { // setupVethNetwork uses the Network config if it is not nil to initialize // the new veth interface inside the container for use by changing the name to eth0 // setting the MTU and IP address along with the default gateway -func setupVethNetwork(config *libcontainer.Network, tempVethName string) error { +func setupNetwork(config *libcontainer.Network, context libcontainer.Context) error { if config != nil { - if err := network.InterfaceDown(tempVethName); err != nil { - return fmt.Errorf("interface down %s %s", tempVethName, err) - } - if err := network.ChangeInterfaceName(tempVethName, "eth0"); err != nil { - return fmt.Errorf("change %s to eth0 %s", tempVethName, err) - } - if err := network.SetInterfaceIp("eth0", config.Address); err != nil { - return fmt.Errorf("set eth0 ip %s", err) - } - if err := network.SetMtu("eth0", config.Mtu); err != nil { - return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) - } - if err := network.InterfaceUp("eth0"); err != nil { - return fmt.Errorf("eth0 up %s", err) - } - if err := network.SetMtu("lo", config.Mtu); err != nil { - return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) - } - if err := network.InterfaceUp("lo"); err != nil { - return fmt.Errorf("lo up %s", err) - } - if config.Gateway != "" { - if err := network.SetDefaultGateway(config.Gateway); err != nil { - return fmt.Errorf("set gateway to %s %s", config.Gateway, err) - } + strategy, err := network.GetStrategy(config.Type) + if err != nil { + return err } + return strategy.Initialize(config, context) } return nil } -// getVethName reads from Stdin the temp veth name -// sent by the parent processes after the veth pair -// has been created and setup -func getVethName(pipe io.ReadCloser) (string, error) { +func GetContextFromParent(pipe io.ReadCloser) (libcontainer.Context, error) { defer pipe.Close() data, err := ioutil.ReadAll(pipe) if err != nil { - return "", fmt.Errorf("error reading from stdin %s", err) + return nil, fmt.Errorf("error reading from stdin %s", err) } - return string(data), nil + var context libcontainer.Context + if len(data) > 0 { + if err := json.Unmarshal(data, &context); err != nil { + return nil, err + } + log.Printf("received context %v", context) + } + return context, nil } From c71bc032798742df70551a7b6cbbe8ca6dbd03f2 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 22:37:09 -0800 Subject: [PATCH 43/64] Refactor exec method Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 138 ++++++++++++++++++++---------------- 1 file changed, 77 insertions(+), 61 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 6c4d766..3cbe43a 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -22,20 +22,10 @@ import ( func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io.Writer, master *os.File, logFile string, args []string) (int, error) { var ( - console string - err error - inPipe io.WriteCloser - outPipe, errPipe io.ReadCloser + console string + err error ) - if container.Tty { - log.Printf("setting up master and console") - master, console, err = CreateMasterAndConsole() - if err != nil { - return -1, err - } - } - // create a pipe so that we can syncronize with the namespaced process and // pass the veth name to the child r, w, err := os.Pipe() @@ -44,49 +34,15 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. } system.UsetCloseOnExec(r.Fd()) + if container.Tty { + log.Printf("setting up master and console") + master, console, err = CreateMasterAndConsole() + if err != nil { + return -1, err + } + } + command := CreateCommand(container, console, logFile, r.Fd(), args) - if !container.Tty { - log.Printf("opening std pipes") - if inPipe, err = command.StdinPipe(); err != nil { - return -1, err - } - if outPipe, err = command.StdoutPipe(); err != nil { - return -1, err - } - if errPipe, err = command.StderrPipe(); err != nil { - return -1, err - } - } - - log.Printf("staring init") - if err := command.Start(); err != nil { - return -1, err - } - log.Printf("writting state file") - if err := writePidFile(command); err != nil { - command.Process.Kill() - return -1, err - } - defer deletePidFile() - - // Do this before syncing with child so that no children - // can escape the cgroup - if container.Cgroups != nil { - log.Printf("setting up cgroups") - if err := container.Cgroups.Apply(command.Process.Pid); err != nil { - command.Process.Kill() - return -1, err - } - } - if err := InitializeNetworking(container, command.Process.Pid, w); err != nil { - command.Process.Kill() - return -1, err - } - - // Sync with child - log.Printf("closing sync pipes") - w.Close() - r.Close() if container.Tty { log.Printf("starting copy for tty") @@ -100,15 +56,39 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. } defer term.RestoreTerminal(os.Stdin.Fd(), state) } else { - log.Printf("starting copy for std pipes") - go func() { - defer inPipe.Close() - io.Copy(inPipe, stdin) - }() - go io.Copy(stdout, outPipe) - go io.Copy(stderr, errPipe) + if err := startStdCopy(command, stdin, stdout, stderr); err != nil { + command.Process.Kill() + return -1, err + } } + log.Printf("staring init") + if err := command.Start(); err != nil { + return -1, err + } + log.Printf("writing state file") + if err := writePidFile(command); err != nil { + command.Process.Kill() + return -1, err + } + defer deletePidFile() + + // Do this before syncing with child so that no children + // can escape the cgroup + if err := SetupCgroups(container, command.Process.Pid); err != nil { + command.Process.Kill() + return -1, err + } + if err := InitializeNetworking(container, command.Process.Pid, w); err != nil { + command.Process.Kill() + return -1, err + } + + // Sync with child + log.Printf("closing sync pipes") + w.Close() + r.Close() + log.Printf("waiting on process") if err := command.Wait(); err != nil { if _, ok := err.(*exec.ExitError); !ok { @@ -119,6 +99,16 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } +func SetupCgroups(container *libcontainer.Container, nspid int) error { + if container.Cgroups != nil { + log.Printf("setting up cgroups") + if err := container.Cgroups.Apply(nspid); err != nil { + return err + } + } + return nil +} + func InitializeNetworking(container *libcontainer.Container, nspid int, pipe io.Writer) error { if container.Network != nil { log.Printf("creating host network configuration type %s", container.Network.Type) @@ -207,3 +197,29 @@ func CreateCommand(container *libcontainer.Container, console, logFile string, p command.Env = container.Env return command } + +func startStdCopy(command *exec.Cmd, stdin io.Reader, stdout, stderr io.Writer) error { + log.Printf("opening std pipes") + inPipe, err := command.StdinPipe() + if err != nil { + return err + } + outPipe, err := command.StdoutPipe() + if err != nil { + return err + } + errPipe, err := command.StderrPipe() + if err != nil { + return err + } + + log.Printf("starting copy for std pipes") + go func() { + defer inPipe.Close() + io.Copy(inPipe, stdin) + }() + go io.Copy(stdout, outPipe) + go io.Copy(stderr, errPipe) + + return nil +} From 118ca3ae64768042997def8f3ff0d389d2049567 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 22:58:30 -0800 Subject: [PATCH 44/64] Add syncpipe for passing context Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 27 +++-------- libcontainer/nsinit/init.go | 44 ++++++++---------- libcontainer/nsinit/nsinit/main.go | 6 ++- libcontainer/nsinit/sync_pipe.go | 73 ++++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 48 deletions(-) create mode 100644 libcontainer/nsinit/sync_pipe.go diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 3cbe43a..ec75e9c 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -3,7 +3,6 @@ package nsinit import ( - "encoding/json" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/network" @@ -28,11 +27,10 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. // create a pipe so that we can syncronize with the namespaced process and // pass the veth name to the child - r, w, err := os.Pipe() + syncPipe, err := NewSyncPipe() if err != nil { return -1, err } - system.UsetCloseOnExec(r.Fd()) if container.Tty { log.Printf("setting up master and console") @@ -42,8 +40,7 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. } } - command := CreateCommand(container, console, logFile, r.Fd(), args) - + command := CreateCommand(container, console, logFile, syncPipe.child.Fd(), args) if container.Tty { log.Printf("starting copy for tty") go io.Copy(stdout, master) @@ -79,15 +76,14 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. command.Process.Kill() return -1, err } - if err := InitializeNetworking(container, command.Process.Pid, w); err != nil { + if err := InitializeNetworking(container, command.Process.Pid, syncPipe); err != nil { command.Process.Kill() return -1, err } // Sync with child log.Printf("closing sync pipes") - w.Close() - r.Close() + syncPipe.Close() log.Printf("waiting on process") if err := command.Wait(); err != nil { @@ -109,7 +105,7 @@ func SetupCgroups(container *libcontainer.Container, nspid int) error { return nil } -func InitializeNetworking(container *libcontainer.Container, nspid int, pipe io.Writer) error { +func InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error { if container.Network != nil { log.Printf("creating host network configuration type %s", container.Network.Type) strategy, err := network.GetStrategy(container.Network.Type) @@ -121,24 +117,13 @@ func InitializeNetworking(container *libcontainer.Container, nspid int, pipe io. return err } log.Printf("sending %v as network context", networkContext) - if err := SendContext(pipe, networkContext); err != nil { + if err := pipe.SendToChild(networkContext); err != nil { return err } } return nil } -// SendContext writes the veth pair name to the child's stdin then closes the -// pipe so that the child stops waiting for more data -func SendContext(pipe io.Writer, context libcontainer.Context) error { - data, err := json.Marshal(context) - if err != nil { - return err - } - pipe.Write(data) - return nil -} - // SetupWindow gets the parent window size and sets the master // pty to the current size and set the parents mode to RAW func SetupWindow(master, parent *os.File) (*term.State, error) { diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index f530d4a..cdedc14 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -3,14 +3,11 @@ package nsinit import ( - "encoding/json" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" - "io" - "io/ioutil" "log" "os" "os/exec" @@ -20,7 +17,7 @@ import ( // Init is the init process that first runs inside a new namespace to setup mounts, users, networking, // and other options required for the new container. -func Init(container *libcontainer.Container, uncleanRootfs, console string, pipe io.ReadCloser, args []string) error { +func Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { rootfs, err := resolveRootfs(uncleanRootfs) if err != nil { return err @@ -28,16 +25,18 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, pipe log.Printf("initializing namespace at %s", rootfs) // We always read this as it is a way to sync with the parent as well - context, err := GetContextFromParent(pipe) + context, err := syncPipe.ReadFromParent() if err != nil { + syncPipe.Close() return err } + syncPipe.Close() + log.Printf("received context from parent %v", context) + if console != "" { log.Printf("setting up console for %s", console) // close pipes so that we can replace it with the pty - os.Stdin.Close() - os.Stdout.Close() - os.Stderr.Close() + closeStdPipes() slave, err := openTerminal(console, syscall.O_RDWR) if err != nil { return fmt.Errorf("open terminal %s", err) @@ -79,18 +78,27 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, pipe return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) } } + return execArgs(args, container.Env) +} + +func execArgs(args []string, env []string) error { name, err := exec.LookPath(args[0]) if err != nil { return err } - log.Printf("execing %s goodbye", name) - if err := system.Exec(name, args[0:], container.Env); err != nil { + if err := system.Exec(name, args[0:], env); err != nil { return fmt.Errorf("exec %s", err) } panic("unreachable") } +func closeStdPipes() { + os.Stdin.Close() + os.Stdout.Close() + os.Stderr.Close() +} + // resolveRootfs ensures that the current working directory is // not a symlink and returns the absolute path to the rootfs func resolveRootfs(uncleanRootfs string) (string, error) { @@ -153,19 +161,3 @@ func setupNetwork(config *libcontainer.Network, context libcontainer.Context) er } return nil } - -func GetContextFromParent(pipe io.ReadCloser) (libcontainer.Context, error) { - defer pipe.Close() - data, err := ioutil.ReadAll(pipe) - if err != nil { - return nil, fmt.Errorf("error reading from stdin %s", err) - } - var context libcontainer.Context - if len(data) > 0 { - if err := json.Unmarshal(data, &context); err != nil { - return nil, err - } - log.Printf("received context %v", context) - } - return context, nil -} diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index 28d42d4..2400ab6 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -74,7 +74,11 @@ func main() { if flag.NArg() < 2 { log.Fatal(ErrWrongArguments) } - if err := nsinit.Init(container, cwd, console, os.NewFile(uintptr(pipeFd), "pipe"), flag.Args()[1:]); err != nil { + syncPipe, err := nsinit.NewSyncPipeFromFd(0, uintptr(pipeFd)) + if err != nil { + log.Fatal(err) + } + if err := nsinit.Init(container, cwd, console, syncPipe, flag.Args()[1:]); err != nil { log.Fatal(err) } default: diff --git a/libcontainer/nsinit/sync_pipe.go b/libcontainer/nsinit/sync_pipe.go new file mode 100644 index 0000000..7b29e98 --- /dev/null +++ b/libcontainer/nsinit/sync_pipe.go @@ -0,0 +1,73 @@ +package nsinit + +import ( + "encoding/json" + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/system" + "io/ioutil" + "os" +) + +// SyncPipe allows communication to and from the child processes +// to it's parent and allows the two independent processes to +// syncronize their state. +type SyncPipe struct { + parent, child *os.File +} + +func NewSyncPipe() (s *SyncPipe, err error) { + s = &SyncPipe{} + s.child, s.parent, err = os.Pipe() + if err != nil { + return nil, err + } + system.UsetCloseOnExec(s.child.Fd()) + return s, nil +} + +func NewSyncPipeFromFd(parendFd, childFd uintptr) (*SyncPipe, error) { + s := &SyncPipe{} + if parendFd > 0 { + s.parent = os.NewFile(parendFd, "parendPipe") + } else if childFd > 0 { + s.child = os.NewFile(childFd, "childPipe") + } else { + return nil, fmt.Errorf("no valid sync pipe fd specified") + } + return s, nil +} + +func (s *SyncPipe) SendToChild(context libcontainer.Context) error { + data, err := json.Marshal(context) + if err != nil { + return err + } + s.parent.Write(data) + return nil +} + +func (s *SyncPipe) ReadFromParent() (libcontainer.Context, error) { + data, err := ioutil.ReadAll(s.child) + if err != nil { + return nil, fmt.Errorf("error reading from sync pipe %s", err) + } + var context libcontainer.Context + if len(data) > 0 { + if err := json.Unmarshal(data, &context); err != nil { + return nil, err + } + } + return context, nil + +} + +func (s *SyncPipe) Close() error { + if s.parent != nil { + s.parent.Close() + } + if s.child != nil { + s.child.Close() + } + return nil +} From 1271ddcd61f90b3c25654bdb5acf130b6e107189 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Sat, 22 Feb 2014 00:29:21 -0800 Subject: [PATCH 45/64] Abstract out diff implementations for importing Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/command.go | 34 +++++++++ libcontainer/nsinit/exec.go | 107 ++++------------------------ libcontainer/nsinit/nsinit/main.go | 4 +- libcontainer/nsinit/state.go | 24 +++++++ libcontainer/nsinit/term.go | 109 +++++++++++++++++++++++++++++ 5 files changed, 184 insertions(+), 94 deletions(-) create mode 100644 libcontainer/nsinit/command.go create mode 100644 libcontainer/nsinit/state.go create mode 100644 libcontainer/nsinit/term.go diff --git a/libcontainer/nsinit/command.go b/libcontainer/nsinit/command.go new file mode 100644 index 0000000..b1c5631 --- /dev/null +++ b/libcontainer/nsinit/command.go @@ -0,0 +1,34 @@ +package nsinit + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "os" + "os/exec" + "syscall" +) + +type CommandFactory interface { + Create(container *libcontainer.Container, console, logFile string, syncFd uintptr, args []string) *exec.Cmd +} + +type DefaultCommandFactory struct{} + +// Create will return an exec.Cmd with the Cloneflags set to the proper namespaces +// defined on the container's configuration and use the current binary as the init with the +// args provided +func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console, logFile string, pipe uintptr, args []string) *exec.Cmd { + // get our binary name so we can always reexec ourself + name := os.Args[0] + command := exec.Command(name, append([]string{ + "-console", console, + "-pipe", fmt.Sprint(pipe), + "-log", logFile, + "init"}, args...)...) + + command.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: uintptr(GetNamespaceFlags(container.Namespaces)), + } + command.Env = container.Env + return command +} diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index ec75e9c..ee83f4f 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -3,13 +3,9 @@ package nsinit import ( - "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" - "github.com/dotcloud/docker/pkg/term" - "io" - "io/ioutil" "log" "os" "os/exec" @@ -18,9 +14,11 @@ import ( // Exec performes setup outside of a namespace so that a container can be // executed. Exec is a high level function for working with container namespaces. -func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io.Writer, - master *os.File, logFile string, args []string) (int, error) { +func Exec(container *libcontainer.Container, + factory CommandFactory, state StateWriter, term Terminal, + logFile string, args []string) (int, error) { var ( + master *os.File console string err error ) @@ -38,37 +36,28 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. if err != nil { return -1, err } + term.SetMaster(master) } - command := CreateCommand(container, console, logFile, syncPipe.child.Fd(), args) - if container.Tty { - log.Printf("starting copy for tty") - go io.Copy(stdout, master) - go io.Copy(master, stdin) - - state, err := SetupWindow(master, os.Stdin) - if err != nil { - command.Process.Kill() - return -1, err - } - defer term.RestoreTerminal(os.Stdin.Fd(), state) - } else { - if err := startStdCopy(command, stdin, stdout, stderr); err != nil { - command.Process.Kill() - return -1, err - } + command := factory.Create(container, console, logFile, syncPipe.child.Fd(), args) + if err := term.Attach(command); err != nil { + return -1, err } + defer term.Close() log.Printf("staring init") if err := command.Start(); err != nil { return -1, err } log.Printf("writing state file") - if err := writePidFile(command); err != nil { + if err := state.WritePid(command.Process.Pid); err != nil { command.Process.Kill() return -1, err } - defer deletePidFile() + defer func() { + log.Printf("removing state file") + state.DeletePid() + }() // Do this before syncing with child so that no children // can escape the cgroup @@ -124,19 +113,6 @@ func InitializeNetworking(container *libcontainer.Container, nspid int, pipe *Sy return nil } -// SetupWindow gets the parent window size and sets the master -// pty to the current size and set the parents mode to RAW -func SetupWindow(master, parent *os.File) (*term.State, error) { - ws, err := term.GetWinsize(parent.Fd()) - if err != nil { - return nil, err - } - if err := term.SetWinsize(master.Fd(), ws); err != nil { - return nil, err - } - return term.SetRawTerminal(parent.Fd()) -} - // CreateMasterAndConsole will open /dev/ptmx on the host and retreive the // pts name for use as the pty slave inside the container func CreateMasterAndConsole() (*os.File, string, error) { @@ -153,58 +129,3 @@ func CreateMasterAndConsole() (*os.File, string, error) { } return master, console, nil } - -// writePidFile writes the namespaced processes pid to .nspid in the rootfs for the container -func writePidFile(command *exec.Cmd) error { - return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(command.Process.Pid)), 0655) -} - -func deletePidFile() error { - log.Printf("removing .nspid file") - return os.Remove(".nspid") -} - -// createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces -// defined on the container's configuration and use the current binary as the init with the -// args provided -func CreateCommand(container *libcontainer.Container, console, logFile string, pipe uintptr, args []string) *exec.Cmd { - // get our binary name so we can always reexec ourself - name := os.Args[0] - command := exec.Command(name, append([]string{ - "-console", console, - "-pipe", fmt.Sprint(pipe), - "-log", logFile, - "init"}, args...)...) - - command.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: uintptr(GetNamespaceFlags(container.Namespaces)), - } - command.Env = container.Env - return command -} - -func startStdCopy(command *exec.Cmd, stdin io.Reader, stdout, stderr io.Writer) error { - log.Printf("opening std pipes") - inPipe, err := command.StdinPipe() - if err != nil { - return err - } - outPipe, err := command.StdoutPipe() - if err != nil { - return err - } - errPipe, err := command.StderrPipe() - if err != nil { - return err - } - - log.Printf("starting copy for std pipes") - go func() { - defer inPipe.Close() - io.Copy(inPipe, stdin) - }() - go io.Copy(stdout, outPipe) - go io.Copy(stderr, errPipe) - - return nil -} diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index 2400ab6..c299412 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -57,8 +57,10 @@ func main() { if nspid > 0 { exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) } else { + term := nsinit.NewTerminal(os.Stdin, os.Stdout, os.Stderr, container.Tty) exitCode, err = nsinit.Exec(container, - os.Stdin, os.Stdout, os.Stderr, nil, + &nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{}, + term, logFile, flag.Args()[1:]) } if err != nil { diff --git a/libcontainer/nsinit/state.go b/libcontainer/nsinit/state.go new file mode 100644 index 0000000..1f0fedd --- /dev/null +++ b/libcontainer/nsinit/state.go @@ -0,0 +1,24 @@ +package nsinit + +import ( + "fmt" + "io/ioutil" + "os" +) + +type StateWriter interface { + WritePid(pid int) error + DeletePid() error +} + +type DefaultStateWriter struct { +} + +// writePidFile writes the namespaced processes pid to .nspid in the rootfs for the container +func (*DefaultStateWriter) WritePid(pid int) error { + return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(pid)), 0655) +} + +func (*DefaultStateWriter) DeletePid() error { + return os.Remove(".nspid") +} diff --git a/libcontainer/nsinit/term.go b/libcontainer/nsinit/term.go new file mode 100644 index 0000000..6492468 --- /dev/null +++ b/libcontainer/nsinit/term.go @@ -0,0 +1,109 @@ +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/term" + "io" + "os" + "os/exec" +) + +type Terminal interface { + io.Closer + SetMaster(*os.File) + Attach(*exec.Cmd) error +} + +func NewTerminal(stdin io.Reader, stdout, stderr io.Writer, tty bool) Terminal { + if tty { + return &TtyTerminal{ + stdin: stdin, + stdout: stdout, + stderr: stderr, + } + } + return &StdTerminal{ + stdin: stdin, + stdout: stdout, + stderr: stderr, + } +} + +type TtyTerminal struct { + stdin io.Reader + stdout, stderr io.Writer + master *os.File + state *term.State +} + +func (t *TtyTerminal) SetMaster(master *os.File) { + t.master = master +} + +func (t *TtyTerminal) Attach(command *exec.Cmd) error { + go io.Copy(t.stdout, t.master) + go io.Copy(t.master, t.stdin) + + state, err := t.setupWindow(t.master, os.Stdin) + if err != nil { + command.Process.Kill() + return err + } + t.state = state + return err +} + +// SetupWindow gets the parent window size and sets the master +// pty to the current size and set the parents mode to RAW +func (t *TtyTerminal) setupWindow(master, parent *os.File) (*term.State, error) { + ws, err := term.GetWinsize(parent.Fd()) + if err != nil { + return nil, err + } + if err := term.SetWinsize(master.Fd(), ws); err != nil { + return nil, err + } + return term.SetRawTerminal(parent.Fd()) +} + +func (t *TtyTerminal) Close() error { + term.RestoreTerminal(os.Stdin.Fd(), t.state) + return t.master.Close() +} + +type StdTerminal struct { + stdin io.Reader + stdout, stderr io.Writer +} + +func (s *StdTerminal) SetMaster(*os.File) { + // no need to set master on non tty +} + +func (s *StdTerminal) Close() error { + return nil +} + +func (s *StdTerminal) Attach(command *exec.Cmd) error { + inPipe, err := command.StdinPipe() + if err != nil { + return err + } + outPipe, err := command.StdoutPipe() + if err != nil { + return err + } + errPipe, err := command.StderrPipe() + if err != nil { + return err + } + + go func() { + defer inPipe.Close() + io.Copy(inPipe, s.stdin) + }() + + go io.Copy(s.stdout, outPipe) + go io.Copy(s.stderr, errPipe) + + return nil +} From a42c6fafbedef2718a149c8f2fe364203e9e133f Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Sat, 22 Feb 2014 01:21:26 -0800 Subject: [PATCH 46/64] Refactor driver to use Exec function from nsini Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/state.go | 10 ++++++---- libcontainer/nsinit/term.go | 9 +++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/libcontainer/nsinit/state.go b/libcontainer/nsinit/state.go index 1f0fedd..2dbaaa5 100644 --- a/libcontainer/nsinit/state.go +++ b/libcontainer/nsinit/state.go @@ -4,6 +4,7 @@ import ( "fmt" "io/ioutil" "os" + "path/filepath" ) type StateWriter interface { @@ -12,13 +13,14 @@ type StateWriter interface { } type DefaultStateWriter struct { + Root string } // writePidFile writes the namespaced processes pid to .nspid in the rootfs for the container -func (*DefaultStateWriter) WritePid(pid int) error { - return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(pid)), 0655) +func (d *DefaultStateWriter) WritePid(pid int) error { + return ioutil.WriteFile(filepath.Join(d.Root, ".nspid"), []byte(fmt.Sprint(pid)), 0655) } -func (*DefaultStateWriter) DeletePid() error { - return os.Remove(".nspid") +func (d *DefaultStateWriter) DeletePid() error { + return os.Remove(filepath.Join(d.Root, ".nspid")) } diff --git a/libcontainer/nsinit/term.go b/libcontainer/nsinit/term.go index 6492468..58dccab 100644 --- a/libcontainer/nsinit/term.go +++ b/libcontainer/nsinit/term.go @@ -11,6 +11,7 @@ type Terminal interface { io.Closer SetMaster(*os.File) Attach(*exec.Cmd) error + Resize(h, w int) error } func NewTerminal(stdin io.Reader, stdout, stderr io.Writer, tty bool) Terminal { @@ -35,6 +36,10 @@ type TtyTerminal struct { state *term.State } +func (t *TtyTerminal) Resize(h, w int) error { + return term.SetWinsize(t.master.Fd(), &term.Winsize{Height: uint16(h), Width: uint16(w)}) +} + func (t *TtyTerminal) SetMaster(master *os.File) { t.master = master } @@ -83,6 +88,10 @@ func (s *StdTerminal) Close() error { return nil } +func (s *StdTerminal) Resize(h, w int) error { + return nil +} + func (s *StdTerminal) Attach(command *exec.Cmd) error { inPipe, err := command.StdinPipe() if err != nil { From d388db815c75b5f34909167ead0683559c2ea074 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 24 Feb 2014 10:46:20 -0800 Subject: [PATCH 47/64] Look for cpu subsystem instead of memory Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- cgroups/cgroups.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index 96002f0..e260d67 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -132,7 +132,7 @@ func (c *Cgroup) Apply(pid int) error { // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ // // we can pick any subsystem to find the root - cgroupRoot, err := FindCgroupMountpoint("memory") + cgroupRoot, err := FindCgroupMountpoint("cpu") if err != nil { return err } From b899d9bc446dfa6a6bb5f1b62b0b7800bab5fc74 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 24 Feb 2014 13:40:17 -0800 Subject: [PATCH 48/64] Fix tests with dockerinit lookup path Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/nsinit/main.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index c299412..786c9c1 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -24,7 +24,7 @@ var ( ErrWrongArguments = errors.New("Wrong argument count") ) -func init() { +func registerFlags() { flag.StringVar(&console, "console", "", "console (pty slave) path") flag.StringVar(&logFile, "log", "none", "log options (none, stderr, or a file path)") flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") @@ -33,6 +33,8 @@ func init() { } func main() { + registerFlags() + if flag.NArg() < 1 { log.Fatal(ErrWrongArguments) } From d50dc3cb7e46ad36201bbe74a802d9a4f4817cab Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 24 Feb 2014 13:52:56 -0800 Subject: [PATCH 49/64] Honor user passed on container in nsinit Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/init.go | 34 +++++++++++++++++++++++++--------- system/calls_linux.go | 8 ++++++++ 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index cdedc14..23303cd 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -8,6 +8,7 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" + "github.com/dotcloud/docker/pkg/user" "log" "os" "os/exec" @@ -110,15 +111,30 @@ func resolveRootfs(uncleanRootfs string) (string, error) { } func setupUser(container *libcontainer.Container) error { - // TODO: honor user passed on container - if err := system.Setgroups(nil); err != nil { - return err - } - if err := system.Setresgid(0, 0, 0); err != nil { - return err - } - if err := system.Setresuid(0, 0, 0); err != nil { - return err + if container.User != "" { + uid, gid, suppGids, err := user.GetUserGroupSupplementary(container.User, syscall.Getuid(), syscall.Getgid()) + if err != nil { + return err + } + if err := system.Setgroups(suppGids); err != nil { + return err + } + if err := system.Setgid(gid); err != nil { + return err + } + if err := system.Setuid(uid); err != nil { + return err + } + } else { + if err := system.Setgroups(nil); err != nil { + return err + } + if err := system.Setresgid(0, 0, 0); err != nil { + return err + } + if err := system.Setresuid(0, 0, 0); err != nil { + return err + } } return nil } diff --git a/system/calls_linux.go b/system/calls_linux.go index 42afa34..0bf42e3 100644 --- a/system/calls_linux.go +++ b/system/calls_linux.go @@ -71,6 +71,14 @@ func Setresuid(ruid, euid, suid int) error { return syscall.Setresuid(ruid, euid, suid) } +func Setgid(gid int) error { + return syscall.Setgid(gid) +} + +func Setuid(uid int) error { + return syscall.Setuid(uid) +} + func Sethostname(name string) error { return syscall.Sethostname([]byte(name)) } From c8ad8184ec75d3abfcb9654e62827a60cc3fd79f Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 24 Feb 2014 15:47:23 -0800 Subject: [PATCH 50/64] Cgroups allow devices for privileged containers Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/init.go | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 23303cd..d6d7dc3 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -9,7 +9,6 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/user" - "log" "os" "os/exec" "path/filepath" @@ -23,7 +22,6 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, sync if err != nil { return err } - log.Printf("initializing namespace at %s", rootfs) // We always read this as it is a way to sync with the parent as well context, err := syncPipe.ReadFromParent() @@ -32,10 +30,8 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, sync return err } syncPipe.Close() - log.Printf("received context from parent %v", context) if console != "" { - log.Printf("setting up console for %s", console) // close pipes so that we can replace it with the pty closeStdPipes() slave, err := openTerminal(console, syscall.O_RDWR) @@ -66,11 +62,9 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, sync if err := system.Sethostname(container.Hostname); err != nil { return fmt.Errorf("sethostname %s", err) } - log.Printf("dropping capabilities") if err := capabilities.DropCapabilities(container); err != nil { return fmt.Errorf("drop capabilities %s", err) } - log.Printf("setting user in namespace") if err := setupUser(container); err != nil { return fmt.Errorf("setup user %s", err) } @@ -87,7 +81,6 @@ func execArgs(args []string, env []string) error { if err != nil { return err } - log.Printf("execing %s goodbye", name) if err := system.Exec(name, args[0:], env); err != nil { return fmt.Errorf("exec %s", err) } @@ -111,7 +104,7 @@ func resolveRootfs(uncleanRootfs string) (string, error) { } func setupUser(container *libcontainer.Container) error { - if container.User != "" { + if container.User != "" && container.User != "root" { uid, gid, suppGids, err := user.GetUserGroupSupplementary(container.User, syscall.Getuid(), syscall.Getgid()) if err != nil { return err From 0e4d946dc4a5ed1c689cf2a57b3da38bf99ba1b1 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 24 Feb 2014 18:38:24 -0800 Subject: [PATCH 51/64] Improve logging for nsinit Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/network/veth.go | 4 --- libcontainer/nsinit/exec.go | 43 +++++++++++++++--------------- libcontainer/nsinit/execin.go | 2 +- libcontainer/nsinit/init.go | 2 +- libcontainer/nsinit/nsinit.go | 29 ++++++++++++++++++++ libcontainer/nsinit/nsinit/main.go | 36 ++++++++++++++----------- 6 files changed, 72 insertions(+), 44 deletions(-) create mode 100644 libcontainer/nsinit/nsinit.go diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go index 61fec55..321c68e 100644 --- a/libcontainer/network/veth.go +++ b/libcontainer/network/veth.go @@ -4,14 +4,12 @@ import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/utils" - "log" ) type Veth struct { } func (v *Veth) Create(n *libcontainer.Network, nspid int) (libcontainer.Context, error) { - log.Printf("creating veth network") var ( bridge string prefix string @@ -31,7 +29,6 @@ func (v *Veth) Create(n *libcontainer.Network, nspid int) (libcontainer.Context, "vethHost": name1, "vethChild": name2, } - log.Printf("veth pair created %s <> %s", name1, name2) if err := SetInterfaceMaster(name1, bridge); err != nil { return context, err } @@ -41,7 +38,6 @@ func (v *Veth) Create(n *libcontainer.Network, nspid int) (libcontainer.Context, if err := InterfaceUp(name1); err != nil { return context, err } - log.Printf("setting %s inside %d namespace", name2, nspid) if err := SetInterfaceInNamespacePid(name2, nspid); err != nil { return context, err } diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index ee83f4f..c407323 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -6,7 +6,6 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" - "log" "os" "os/exec" "syscall" @@ -14,9 +13,7 @@ import ( // Exec performes setup outside of a namespace so that a container can be // executed. Exec is a high level function for working with container namespaces. -func Exec(container *libcontainer.Container, - factory CommandFactory, state StateWriter, term Terminal, - logFile string, args []string) (int, error) { +func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) { var ( master *os.File console string @@ -31,7 +28,7 @@ func Exec(container *libcontainer.Container, } if container.Tty { - log.Printf("setting up master and console") + ns.logger.Printf("setting up master and console") master, console, err = CreateMasterAndConsole() if err != nil { return -1, err @@ -39,54 +36,56 @@ func Exec(container *libcontainer.Container, term.SetMaster(master) } - command := factory.Create(container, console, logFile, syncPipe.child.Fd(), args) + command := ns.commandFactory.Create(container, console, ns.logFile, syncPipe.child.Fd(), args) if err := term.Attach(command); err != nil { return -1, err } defer term.Close() - log.Printf("staring init") + ns.logger.Printf("staring init") if err := command.Start(); err != nil { return -1, err } - log.Printf("writing state file") - if err := state.WritePid(command.Process.Pid); err != nil { + ns.logger.Printf("writing state file") + if err := ns.stateWriter.WritePid(command.Process.Pid); err != nil { command.Process.Kill() return -1, err } defer func() { - log.Printf("removing state file") - state.DeletePid() + ns.logger.Printf("removing state file") + ns.stateWriter.DeletePid() }() // Do this before syncing with child so that no children // can escape the cgroup - if err := SetupCgroups(container, command.Process.Pid); err != nil { + if err := ns.SetupCgroups(container, command.Process.Pid); err != nil { command.Process.Kill() return -1, err } - if err := InitializeNetworking(container, command.Process.Pid, syncPipe); err != nil { + if err := ns.InitializeNetworking(container, command.Process.Pid, syncPipe); err != nil { command.Process.Kill() return -1, err } // Sync with child - log.Printf("closing sync pipes") + ns.logger.Printf("closing sync pipes") syncPipe.Close() - log.Printf("waiting on process") + ns.logger.Printf("waiting on process") if err := command.Wait(); err != nil { if _, ok := err.(*exec.ExitError); !ok { return -1, err } } - log.Printf("process ended") - return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil + + exitCode := command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus() + ns.logger.Printf("process ended with exit code %d", exitCode) + return exitCode, nil } -func SetupCgroups(container *libcontainer.Container, nspid int) error { +func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) error { if container.Cgroups != nil { - log.Printf("setting up cgroups") + ns.logger.Printf("setting up cgroups") if err := container.Cgroups.Apply(nspid); err != nil { return err } @@ -94,9 +93,9 @@ func SetupCgroups(container *libcontainer.Container, nspid int) error { return nil } -func InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error { +func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error { if container.Network != nil { - log.Printf("creating host network configuration type %s", container.Network.Type) + ns.logger.Printf("creating host network configuration type %s", container.Network.Type) strategy, err := network.GetStrategy(container.Network.Type) if err != nil { return err @@ -105,7 +104,7 @@ func InitializeNetworking(container *libcontainer.Container, nspid int, pipe *Sy if err != nil { return err } - log.Printf("sending %v as network context", networkContext) + ns.logger.Printf("sending %v as network context", networkContext) if err := pipe.SendToChild(networkContext); err != nil { return err } diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 85a8990..9c33f69 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -12,7 +12,7 @@ import ( ) // ExecIn uses an existing pid and joins the pid's namespaces with the new command. -func ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) { +func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) { for _, ns := range container.Namespaces { if err := system.Unshare(namespaceMap[ns]); err != nil { return -1, err diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index d6d7dc3..5e33169 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -17,7 +17,7 @@ import ( // Init is the init process that first runs inside a new namespace to setup mounts, users, networking, // and other options required for the new container. -func Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { +func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { rootfs, err := resolveRootfs(uncleanRootfs) if err != nil { return err diff --git a/libcontainer/nsinit/nsinit.go b/libcontainer/nsinit/nsinit.go new file mode 100644 index 0000000..599461e --- /dev/null +++ b/libcontainer/nsinit/nsinit.go @@ -0,0 +1,29 @@ +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" + "log" +) + +type NsInit interface { + Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) + ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) + Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error +} + +type linuxNs struct { + root string + logFile string + logger *log.Logger + commandFactory CommandFactory + stateWriter StateWriter +} + +func NewNsInit(logger *log.Logger, logFile string, command CommandFactory, state StateWriter) NsInit { + return &linuxNs{ + logger: logger, + commandFactory: command, + stateWriter: state, + logFile: logFile, + } +} diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index 786c9c1..c25037f 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -42,13 +42,13 @@ func main() { if err != nil { log.Fatal(err) } - if err := setupLogging(); err != nil { + ns, err := newNsInit() + if err != nil { log.Fatal(err) } + switch flag.Arg(0) { case "exec": // this is executed outside of the namespace in the cwd - log.SetPrefix("[nsinit exec] ") - var exitCode int nspid, err := readPid() if err != nil { @@ -57,20 +57,16 @@ func main() { } } if nspid > 0 { - exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) + exitCode, err = ns.ExecIn(container, nspid, flag.Args()[1:]) } else { term := nsinit.NewTerminal(os.Stdin, os.Stdout, os.Stderr, container.Tty) - exitCode, err = nsinit.Exec(container, - &nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{}, - term, - logFile, flag.Args()[1:]) + exitCode, err = ns.Exec(container, term, flag.Args()[1:]) } if err != nil { log.Fatal(err) } os.Exit(exitCode) case "init": // this is executed inside of the namespace to setup the container - log.SetPrefix("[nsinit init] ") cwd, err := os.Getwd() if err != nil { log.Fatal(err) @@ -82,7 +78,7 @@ func main() { if err != nil { log.Fatal(err) } - if err := nsinit.Init(container, cwd, console, syncPipe, flag.Args()[1:]); err != nil { + if err := ns.Init(container, cwd, console, syncPipe, flag.Args()[1:]); err != nil { log.Fatal(err) } default: @@ -116,19 +112,27 @@ func readPid() (int, error) { return pid, nil } -func setupLogging() (err error) { +func newNsInit() (nsinit.NsInit, error) { + logger, err := setupLogging() + if err != nil { + return nil, err + } + return nsinit.NewNsInit(logger, logFile, &nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{}), nil +} + +func setupLogging() (logger *log.Logger, err error) { var writer io.Writer + switch logFile { case "stderr": writer = os.Stderr case "none", "": writer = ioutil.Discard default: - writer, err = os.OpenFile(logFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0755) - if err != nil { - return err + if writer, err = os.OpenFile(logFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0755); err != nil { + return } } - log.SetOutput(writer) - return nil + logger = log.New(writer, "", log.LstdFlags) + return } From 6daf56799fc9a6c024bcdfe15957d91b02437bd0 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 24 Feb 2014 21:11:52 -0800 Subject: [PATCH 52/64] Refactor and improve libcontainer and driver Remove logging for now because it is complicating things Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/network/veth.go | 3 ++ libcontainer/nsinit/command.go | 8 ++-- libcontainer/nsinit/exec.go | 35 ++-------------- libcontainer/nsinit/execin.go | 6 +-- libcontainer/nsinit/init.go | 64 ++++++++---------------------- libcontainer/nsinit/nsinit.go | 9 ++--- libcontainer/nsinit/nsinit/main.go | 26 +----------- libcontainer/nsinit/state.go | 2 + libcontainer/utils/utils.go | 11 +++++ system/calls_linux.go | 9 +++++ system/pty_linux.go | 27 +++++++++++++ 11 files changed, 84 insertions(+), 116 deletions(-) diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go index 321c68e..49e63f0 100644 --- a/libcontainer/network/veth.go +++ b/libcontainer/network/veth.go @@ -6,6 +6,9 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer/utils" ) +// Veth is a network strategy that uses a bridge and creates +// a veth pair, one that stays outside on the host and the other +// is placed inside the container's namespace type Veth struct { } diff --git a/libcontainer/nsinit/command.go b/libcontainer/nsinit/command.go index b1c5631..5eb378a 100644 --- a/libcontainer/nsinit/command.go +++ b/libcontainer/nsinit/command.go @@ -8,8 +8,11 @@ import ( "syscall" ) +// CommandFactory takes the container's configuration and options passed by the +// parent processes and creates an *exec.Cmd that will be used to fork/exec the +// namespaced init process type CommandFactory interface { - Create(container *libcontainer.Container, console, logFile string, syncFd uintptr, args []string) *exec.Cmd + Create(container *libcontainer.Container, console string, syncFd uintptr, args []string) *exec.Cmd } type DefaultCommandFactory struct{} @@ -17,13 +20,12 @@ type DefaultCommandFactory struct{} // Create will return an exec.Cmd with the Cloneflags set to the proper namespaces // defined on the container's configuration and use the current binary as the init with the // args provided -func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console, logFile string, pipe uintptr, args []string) *exec.Cmd { +func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console string, pipe uintptr, args []string) *exec.Cmd { // get our binary name so we can always reexec ourself name := os.Args[0] command := exec.Command(name, append([]string{ "-console", console, "-pipe", fmt.Sprint(pipe), - "-log", logFile, "init"}, args...)...) command.SysProcAttr = &syscall.SysProcAttr{ diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index c407323..b13326b 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -28,31 +28,27 @@ func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args [ } if container.Tty { - ns.logger.Printf("setting up master and console") - master, console, err = CreateMasterAndConsole() + master, console, err = system.CreateMasterAndConsole() if err != nil { return -1, err } term.SetMaster(master) } - command := ns.commandFactory.Create(container, console, ns.logFile, syncPipe.child.Fd(), args) + command := ns.commandFactory.Create(container, console, syncPipe.child.Fd(), args) if err := term.Attach(command); err != nil { return -1, err } defer term.Close() - ns.logger.Printf("staring init") if err := command.Start(); err != nil { return -1, err } - ns.logger.Printf("writing state file") if err := ns.stateWriter.WritePid(command.Process.Pid); err != nil { command.Process.Kill() return -1, err } defer func() { - ns.logger.Printf("removing state file") ns.stateWriter.DeletePid() }() @@ -68,24 +64,18 @@ func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args [ } // Sync with child - ns.logger.Printf("closing sync pipes") syncPipe.Close() - ns.logger.Printf("waiting on process") if err := command.Wait(); err != nil { if _, ok := err.(*exec.ExitError); !ok { return -1, err } } - - exitCode := command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus() - ns.logger.Printf("process ended with exit code %d", exitCode) - return exitCode, nil + return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) error { if container.Cgroups != nil { - ns.logger.Printf("setting up cgroups") if err := container.Cgroups.Apply(nspid); err != nil { return err } @@ -95,7 +85,6 @@ func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) er func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error { if container.Network != nil { - ns.logger.Printf("creating host network configuration type %s", container.Network.Type) strategy, err := network.GetStrategy(container.Network.Type) if err != nil { return err @@ -104,27 +93,9 @@ func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid if err != nil { return err } - ns.logger.Printf("sending %v as network context", networkContext) if err := pipe.SendToChild(networkContext); err != nil { return err } } return nil } - -// CreateMasterAndConsole will open /dev/ptmx on the host and retreive the -// pts name for use as the pty slave inside the container -func CreateMasterAndConsole() (*os.File, string, error) { - master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) - if err != nil { - return nil, "", err - } - console, err := system.Ptsname(master) - if err != nil { - return nil, "", err - } - if err := system.Unlockpt(master); err != nil { - return nil, "", err - } - return master, console, nil -} diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 9c33f69..463196c 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -18,7 +18,7 @@ func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []s return -1, err } } - fds, err := getNsFds(nspid, container) + fds, err := ns.getNsFds(nspid, container) closeFds := func() { for _, f := range fds { system.Closefd(f) @@ -75,13 +75,13 @@ dropAndExec: if err := capabilities.DropCapabilities(container); err != nil { return -1, fmt.Errorf("drop capabilities %s", err) } - if err := system.Exec(args[0], args[0:], container.Env); err != nil { + if err := system.Execv(args[0], args[0:], container.Env); err != nil { return -1, err } panic("unreachable") } -func getNsFds(pid int, container *libcontainer.Container) ([]uintptr, error) { +func (ns *linuxNs) getNsFds(pid int, container *libcontainer.Container) ([]uintptr, error) { fds := make([]uintptr, len(container.Namespaces)) for i, ns := range container.Namespaces { f, err := os.OpenFile(filepath.Join("/proc/", strconv.Itoa(pid), "ns", namespaceFileMap[ns]), os.O_RDONLY, 0) diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 5e33169..1229560 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -7,18 +7,17 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/libcontainer/network" + "github.com/dotcloud/docker/pkg/libcontainer/utils" "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/user" "os" - "os/exec" - "path/filepath" "syscall" ) // Init is the init process that first runs inside a new namespace to setup mounts, users, networking, // and other options required for the new container. func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { - rootfs, err := resolveRootfs(uncleanRootfs) + rootfs, err := utils.ResolveRootfs(uncleanRootfs) if err != nil { return err } @@ -34,7 +33,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol if console != "" { // close pipes so that we can replace it with the pty closeStdPipes() - slave, err := openTerminal(console, syscall.O_RDWR) + slave, err := system.OpenTerminal(console, syscall.O_RDWR) if err != nil { return fmt.Errorf("open terminal %s", err) } @@ -50,6 +49,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol return fmt.Errorf("setctty %s", err) } } + if err := system.ParentDeathSignal(); err != nil { return fmt.Errorf("parent deth signal %s", err) } @@ -73,18 +73,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) } } - return execArgs(args, container.Env) -} - -func execArgs(args []string, env []string) error { - name, err := exec.LookPath(args[0]) - if err != nil { - return err - } - if err := system.Exec(name, args[0:], env); err != nil { - return fmt.Errorf("exec %s", err) - } - panic("unreachable") + return system.Execv(args[0], args[0:], container.Env) } func closeStdPipes() { @@ -93,18 +82,19 @@ func closeStdPipes() { os.Stderr.Close() } -// resolveRootfs ensures that the current working directory is -// not a symlink and returns the absolute path to the rootfs -func resolveRootfs(uncleanRootfs string) (string, error) { - rootfs, err := filepath.Abs(uncleanRootfs) - if err != nil { - return "", err - } - return filepath.EvalSymlinks(rootfs) -} - func setupUser(container *libcontainer.Container) error { - if container.User != "" && container.User != "root" { + switch container.User { + case "root", "": + if err := system.Setgroups(nil); err != nil { + return err + } + if err := system.Setresgid(0, 0, 0); err != nil { + return err + } + if err := system.Setresuid(0, 0, 0); err != nil { + return err + } + default: uid, gid, suppGids, err := user.GetUserGroupSupplementary(container.User, syscall.Getuid(), syscall.Getgid()) if err != nil { return err @@ -118,16 +108,6 @@ func setupUser(container *libcontainer.Container) error { if err := system.Setuid(uid); err != nil { return err } - } else { - if err := system.Setgroups(nil); err != nil { - return err - } - if err := system.Setresgid(0, 0, 0); err != nil { - return err - } - if err := system.Setresuid(0, 0, 0); err != nil { - return err - } } return nil } @@ -147,16 +127,6 @@ func dupSlave(slave *os.File) error { return nil } -// openTerminal is a clone of os.OpenFile without the O_CLOEXEC -// used to open the pty slave inside the container namespace -func openTerminal(name string, flag int) (*os.File, error) { - r, e := syscall.Open(name, flag, 0) - if e != nil { - return nil, &os.PathError{"open", name, e} - } - return os.NewFile(uintptr(r), name), nil -} - // setupVethNetwork uses the Network config if it is not nil to initialize // the new veth interface inside the container for use by changing the name to eth0 // setting the MTU and IP address along with the default gateway diff --git a/libcontainer/nsinit/nsinit.go b/libcontainer/nsinit/nsinit.go index 599461e..f09a130 100644 --- a/libcontainer/nsinit/nsinit.go +++ b/libcontainer/nsinit/nsinit.go @@ -2,9 +2,10 @@ package nsinit import ( "github.com/dotcloud/docker/pkg/libcontainer" - "log" ) +// NsInit is an interface with the public facing methods to provide high level +// exec operations on a container type NsInit interface { Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) @@ -13,17 +14,13 @@ type NsInit interface { type linuxNs struct { root string - logFile string - logger *log.Logger commandFactory CommandFactory stateWriter StateWriter } -func NewNsInit(logger *log.Logger, logFile string, command CommandFactory, state StateWriter) NsInit { +func NewNsInit(command CommandFactory, state StateWriter) NsInit { return &linuxNs{ - logger: logger, commandFactory: command, stateWriter: state, - logFile: logFile, } } diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index c25037f..e385e7f 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -6,7 +6,6 @@ import ( "flag" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/nsinit" - "io" "io/ioutil" "log" "os" @@ -16,7 +15,6 @@ import ( var ( console string pipeFd int - logFile string ) var ( @@ -26,7 +24,6 @@ var ( func registerFlags() { flag.StringVar(&console, "console", "", "console (pty slave) path") - flag.StringVar(&logFile, "log", "none", "log options (none, stderr, or a file path)") flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") flag.Parse() @@ -113,26 +110,5 @@ func readPid() (int, error) { } func newNsInit() (nsinit.NsInit, error) { - logger, err := setupLogging() - if err != nil { - return nil, err - } - return nsinit.NewNsInit(logger, logFile, &nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{}), nil -} - -func setupLogging() (logger *log.Logger, err error) { - var writer io.Writer - - switch logFile { - case "stderr": - writer = os.Stderr - case "none", "": - writer = ioutil.Discard - default: - if writer, err = os.OpenFile(logFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0755); err != nil { - return - } - } - logger = log.New(writer, "", log.LstdFlags) - return + return nsinit.NewNsInit(&nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{}), nil } diff --git a/libcontainer/nsinit/state.go b/libcontainer/nsinit/state.go index 2dbaaa5..5c719e1 100644 --- a/libcontainer/nsinit/state.go +++ b/libcontainer/nsinit/state.go @@ -7,6 +7,8 @@ import ( "path/filepath" ) +// StateWriter handles writing and deleting the pid file +// on disk type StateWriter interface { WritePid(pid int) error DeletePid() error diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go index 5050997..0d919bc 100644 --- a/libcontainer/utils/utils.go +++ b/libcontainer/utils/utils.go @@ -4,6 +4,7 @@ import ( "crypto/rand" "encoding/hex" "io" + "path/filepath" ) // GenerateRandomName returns a new name joined with a prefix. This size @@ -15,3 +16,13 @@ func GenerateRandomName(prefix string, size int) (string, error) { } return prefix + hex.EncodeToString(id)[:size], nil } + +// ResolveRootfs ensures that the current working directory is +// not a symlink and returns the absolute path to the rootfs +func ResolveRootfs(uncleanRootfs string) (string, error) { + rootfs, err := filepath.Abs(uncleanRootfs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} diff --git a/system/calls_linux.go b/system/calls_linux.go index 0bf42e3..b7a8f14 100644 --- a/system/calls_linux.go +++ b/system/calls_linux.go @@ -1,6 +1,7 @@ package system import ( + "os/exec" "syscall" ) @@ -16,6 +17,14 @@ func Exec(cmd string, args []string, env []string) error { return syscall.Exec(cmd, args, env) } +func Execv(cmd string, args []string, env []string) error { + name, err := exec.LookPath(cmd) + if err != nil { + return err + } + return Exec(name, args, env) +} + func Fork() (int, error) { syscall.ForkLock.Lock() pid, _, err := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) diff --git a/system/pty_linux.go b/system/pty_linux.go index b281b71..ca588d8 100644 --- a/system/pty_linux.go +++ b/system/pty_linux.go @@ -24,8 +24,35 @@ func Ptsname(f *os.File) (string, error) { return fmt.Sprintf("/dev/pts/%d", n), nil } +// CreateMasterAndConsole will open /dev/ptmx on the host and retreive the +// pts name for use as the pty slave inside the container +func CreateMasterAndConsole() (*os.File, string, error) { + master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) + if err != nil { + return nil, "", err + } + console, err := Ptsname(master) + if err != nil { + return nil, "", err + } + if err := Unlockpt(master); err != nil { + return nil, "", err + } + return master, console, nil +} + // OpenPtmx opens /dev/ptmx, i.e. the PTY master. func OpenPtmx() (*os.File, error) { // O_NOCTTY and O_CLOEXEC are not present in os package so we use the syscall's one for all. return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) } + +// OpenTerminal is a clone of os.OpenFile without the O_CLOEXEC +// used to open the pty slave inside the container namespace +func OpenTerminal(name string, flag int) (*os.File, error) { + r, e := syscall.Open(name, flag, 0) + if e != nil { + return nil, &os.PathError{"open", name, e} + } + return os.NewFile(uintptr(r), name), nil +} From 357ca32831c94462f68cba3dafa78a6b739f07b7 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Mon, 24 Feb 2014 21:52:29 -0800 Subject: [PATCH 53/64] Better capability/namespace management Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/capabilities/capabilities.go | 20 +-- libcontainer/nsinit/execin.go | 7 +- libcontainer/nsinit/ns_linux.go | 24 +--- libcontainer/types.go | 155 ++++++++++++++++------ 4 files changed, 119 insertions(+), 87 deletions(-) diff --git a/libcontainer/capabilities/capabilities.go b/libcontainer/capabilities/capabilities.go index 65fd455..3c6d752 100644 --- a/libcontainer/capabilities/capabilities.go +++ b/libcontainer/capabilities/capabilities.go @@ -6,24 +6,6 @@ import ( "os" ) -var capMap = map[libcontainer.Capability]capability.Cap{ - libcontainer.CAP_SETPCAP: capability.CAP_SETPCAP, - libcontainer.CAP_SYS_MODULE: capability.CAP_SYS_MODULE, - libcontainer.CAP_SYS_RAWIO: capability.CAP_SYS_RAWIO, - libcontainer.CAP_SYS_PACCT: capability.CAP_SYS_PACCT, - libcontainer.CAP_SYS_ADMIN: capability.CAP_SYS_ADMIN, - libcontainer.CAP_SYS_NICE: capability.CAP_SYS_NICE, - libcontainer.CAP_SYS_RESOURCE: capability.CAP_SYS_RESOURCE, - libcontainer.CAP_SYS_TIME: capability.CAP_SYS_TIME, - libcontainer.CAP_SYS_TTY_CONFIG: capability.CAP_SYS_TTY_CONFIG, - libcontainer.CAP_MKNOD: capability.CAP_MKNOD, - libcontainer.CAP_AUDIT_WRITE: capability.CAP_AUDIT_WRITE, - libcontainer.CAP_AUDIT_CONTROL: capability.CAP_AUDIT_CONTROL, - libcontainer.CAP_MAC_OVERRIDE: capability.CAP_MAC_OVERRIDE, - libcontainer.CAP_MAC_ADMIN: capability.CAP_MAC_ADMIN, - libcontainer.CAP_NET_ADMIN: capability.CAP_NET_ADMIN, -} - // DropCapabilities drops capabilities for the current process based // on the container's configuration. func DropCapabilities(container *libcontainer.Container) error { @@ -45,7 +27,7 @@ func DropCapabilities(container *libcontainer.Container) error { func getCapabilities(container *libcontainer.Container) []capability.Cap { drop := []capability.Cap{} for _, c := range container.Capabilities { - drop = append(drop, capMap[c]) + drop = append(drop, c.Value) } return drop } diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 463196c..306250c 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -14,7 +14,7 @@ import ( // ExecIn uses an existing pid and joins the pid's namespaces with the new command. func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) { for _, ns := range container.Namespaces { - if err := system.Unshare(namespaceMap[ns]); err != nil { + if err := system.Unshare(ns.Value); err != nil { return -1, err } } @@ -42,8 +42,7 @@ func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []s // if the container has a new pid and mount namespace we need to // remount proc and sys to pick up the changes - if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && - container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { + if container.Namespaces.Contains("CLONE_NEWNS") && container.Namespaces.Contains("CLONE_NEWPID") { pid, err := system.Fork() if err != nil { return -1, err @@ -84,7 +83,7 @@ dropAndExec: func (ns *linuxNs) getNsFds(pid int, container *libcontainer.Container) ([]uintptr, error) { fds := make([]uintptr, len(container.Namespaces)) for i, ns := range container.Namespaces { - f, err := os.OpenFile(filepath.Join("/proc/", strconv.Itoa(pid), "ns", namespaceFileMap[ns]), os.O_RDONLY, 0) + f, err := os.OpenFile(filepath.Join("/proc/", strconv.Itoa(pid), "ns", ns.File), os.O_RDONLY, 0) if err != nil { return fds, err } diff --git a/libcontainer/nsinit/ns_linux.go b/libcontainer/nsinit/ns_linux.go index 58af247..ab6322e 100644 --- a/libcontainer/nsinit/ns_linux.go +++ b/libcontainer/nsinit/ns_linux.go @@ -2,35 +2,13 @@ package nsinit import ( "github.com/dotcloud/docker/pkg/libcontainer" - "syscall" ) -var namespaceMap = map[libcontainer.Namespace]int{ - libcontainer.CLONE_NEWNS: syscall.CLONE_NEWNS, - libcontainer.CLONE_NEWUTS: syscall.CLONE_NEWUTS, - libcontainer.CLONE_NEWIPC: syscall.CLONE_NEWIPC, - libcontainer.CLONE_NEWUSER: syscall.CLONE_NEWUSER, - libcontainer.CLONE_NEWPID: syscall.CLONE_NEWPID, - libcontainer.CLONE_NEWNET: syscall.CLONE_NEWNET, -} - -// namespaceFileMap is used to convert the libcontainer types -// into the names of the files located in /proc//ns/* for -// each namespace -var namespaceFileMap = map[libcontainer.Namespace]string{ - libcontainer.CLONE_NEWNS: "mnt", - libcontainer.CLONE_NEWUTS: "uts", - libcontainer.CLONE_NEWIPC: "ipc", - libcontainer.CLONE_NEWUSER: "user", - libcontainer.CLONE_NEWPID: "pid", - libcontainer.CLONE_NEWNET: "net", -} - // getNamespaceFlags parses the container's Namespaces options to set the correct // flags on clone, unshare, and setns func GetNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { for _, ns := range namespaces { - flag |= namespaceMap[ns] + flag |= ns.Value } return flag } diff --git a/libcontainer/types.go b/libcontainer/types.go index bb54ff5..cb64db1 100644 --- a/libcontainer/types.go +++ b/libcontainer/types.go @@ -1,58 +1,131 @@ package libcontainer -// These constants are defined as string types so that -// it is clear when adding the configuration in config files -// instead of using ints or other types -const ( - CAP_SETPCAP Capability = "SETPCAP" - CAP_SYS_MODULE Capability = "SYS_MODULE" - CAP_SYS_RAWIO Capability = "SYS_RAWIO" - CAP_SYS_PACCT Capability = "SYS_PACCT" - CAP_SYS_ADMIN Capability = "SYS_ADMIN" - CAP_SYS_NICE Capability = "SYS_NICE" - CAP_SYS_RESOURCE Capability = "SYS_RESOURCE" - CAP_SYS_TIME Capability = "SYS_TIME" - CAP_SYS_TTY_CONFIG Capability = "SYS_TTY_CONFIG" - CAP_MKNOD Capability = "MKNOD" - CAP_AUDIT_WRITE Capability = "AUDIT_WRITE" - CAP_AUDIT_CONTROL Capability = "AUDIT_CONTROL" - CAP_MAC_OVERRIDE Capability = "MAC_OVERRIDE" - CAP_MAC_ADMIN Capability = "MAC_ADMIN" - CAP_NET_ADMIN Capability = "NET_ADMIN" +import ( + "encoding/json" + "errors" + "github.com/syndtr/gocapability/capability" + "os" + "syscall" +) - CLONE_NEWNS Namespace = "NEWNS" // mount - CLONE_NEWUTS Namespace = "NEWUTS" // utsname - CLONE_NEWIPC Namespace = "NEWIPC" // ipc - CLONE_NEWUSER Namespace = "NEWUSER" // user - CLONE_NEWPID Namespace = "NEWPID" // pid - CLONE_NEWNET Namespace = "NEWNET" // network +var ( + ErrUnkownNamespace error = errors.New("Unkown namespace") +) + +// namespaceList is used to convert the libcontainer types +// into the names of the files located in /proc//ns/* for +// each namespace +var ( + namespaceList = Namespaces{ + {Key: "NEWNS", Value: syscall.CLONE_NEWNS, File: "mnt"}, + {Key: "NEWUTS", Value: syscall.CLONE_NEWUTS, File: "uts"}, + {Key: "NEWIPC", Value: syscall.CLONE_NEWIPC, File: "ipc"}, + {Key: "NEWUSER", Value: syscall.CLONE_NEWUSER, File: "user"}, + {Key: "NEWPID", Value: syscall.CLONE_NEWPID, File: "pid"}, + {Key: "NEWNET", Value: syscall.CLONE_NEWNET, File: "net"}, + } + capabilityList = Capabilities{ + {Key: "SETPCAP", Value: capability.CAP_SETPCAP}, + {Key: "SYS_MODULE", Value: capability.CAP_SYS_MODULE}, + {Key: "SYS_RAWIO", Value: capability.CAP_SYS_RAWIO}, + {Key: "SYS_PACCT", Value: capability.CAP_SYS_PACCT}, + {Key: "SYS_ADMIN", Value: capability.CAP_SYS_ADMIN}, + {Key: "SYS_NICE", Value: capability.CAP_SYS_NICE}, + {Key: "SYS_RESOURCE", Value: capability.CAP_SYS_RESOURCE}, + {Key: "SYS_TIME", Value: capability.CAP_SYS_TIME}, + {Key: "SYS_TTY_CONFIG", Value: capability.CAP_SYS_TTY_CONFIG}, + {Key: "MKNOD", Value: capability.CAP_MKNOD}, + {Key: "AUDIT_WRITE", Value: capability.CAP_AUDIT_WRITE}, + {Key: "AUDIT_CONTROL", Value: capability.CAP_AUDIT_CONTROL}, + {Key: "MAC_OVERRIDE", Value: capability.CAP_MAC_OVERRIDE}, + {Key: "MAC_ADMIN", Value: capability.CAP_MAC_ADMIN}, + {Key: "NET_ADMIN", Value: capability.CAP_NET_ADMIN}, + } ) type ( - Namespace string - Namespaces []Namespace - Capability string - Capabilities []Capability + Namespace struct { + Key string + Value int + File string + } + Namespaces []*Namespace ) +func (ns *Namespace) MarshalJSON() ([]byte, error) { + return json.Marshal(ns.Key) +} + +func (ns *Namespace) UnmarshalJSON(src []byte) error { + var nsName string + if err := json.Unmarshal(src, &nsName); err != nil { + return err + } + ret := GetNamespace(nsName) + if ret == nil { + return ErrUnkownNamespace + } + *ns = *ret + return nil +} + +func GetNamespace(key string) *Namespace { + for _, ns := range namespaceList { + if ns.Key == key { + return ns + } + } + if os.Getenv("DEBUG") != "" { + panic("Unreachable: Namespace not found") + } + return nil +} + // Contains returns true if the specified Namespace is // in the slice -func (n Namespaces) Contains(ns Namespace) bool { - for _, nns := range n { - if nns == ns { - return true +func (n Namespaces) Contains(ns string) bool { + return GetNamespace(ns) != nil +} + +type ( + Capability struct { + Key string + Value capability.Cap + } + Capabilities []*Capability +) + +func (ns *Capability) MarshalJSON() ([]byte, error) { + return json.Marshal(ns.Key) +} + +func (ns *Capability) UnmarshalJSON(src []byte) error { + var capName string + if err := json.Unmarshal(src, &capName); err != nil { + return err + } + ret := GetCapability(capName) + if ret == nil { + return ErrUnkownNamespace + } + *ns = *ret + return nil +} + +func GetCapability(key string) *Capability { + for _, capp := range capabilityList { + if capp.Key == key { + return capp } } - return false + if os.Getenv("DEBUG") != "" { + panic("Unreachable: Namespace not found") + } + return nil } // Contains returns true if the specified Capability is // in the slice -func (c Capabilities) Contains(capp Capability) bool { - for _, cc := range c { - if cc == capp { - return true - } - } - return false +func (c Capabilities) Contains(capp string) bool { + return GetCapability(capp) != nil } From 5982af496947d5049b7b20caf6ebfd5071d1ef6d Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 25 Feb 2014 10:54:41 -0800 Subject: [PATCH 54/64] Address initial feedback from pr Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- system/setns_linux.go | 21 ++++++++++++++++++++- system/setns_linux_amd64.go | 8 -------- 2 files changed, 20 insertions(+), 9 deletions(-) delete mode 100644 system/setns_linux_amd64.go diff --git a/system/setns_linux.go b/system/setns_linux.go index be6f3ed..07b1c93 100644 --- a/system/setns_linux.go +++ b/system/setns_linux.go @@ -1,11 +1,30 @@ package system import ( + "errors" + "fmt" + "runtime" "syscall" ) +var ( + ErrNotSupportedPlatform = errors.New("platform and architecture is not supported") +) + +// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 +// +// We need different setns values for the different platforms and arch +// We are declaring the macro here because the SETNS syscall does not exist in th stdlib +var setNsMap = map[string]uintptr{ + "linux/amd64": 308, +} + func Setns(fd uintptr, flags uintptr) error { - _, _, err := syscall.RawSyscall(SYS_SETNS, fd, flags, 0) + ns, exists := setNsMap[fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH)] + if !exists { + return ErrNotSupportedPlatform + } + _, _, err := syscall.RawSyscall(ns, fd, flags, 0) if err != 0 { return err } diff --git a/system/setns_linux_amd64.go b/system/setns_linux_amd64.go deleted file mode 100644 index 4e30625..0000000 --- a/system/setns_linux_amd64.go +++ /dev/null @@ -1,8 +0,0 @@ -// +build linux,amd64 - -package system - -// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 -const ( - SYS_SETNS = 308 -) From 2acaf7ca82a249db5bcef6f67f702c591b7413d3 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 25 Feb 2014 12:41:31 -0800 Subject: [PATCH 55/64] Move container.json and pid file into a root specific driver dir Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/execin.go | 2 +- libcontainer/nsinit/nsinit/main.go | 12 +++++++----- libcontainer/nsinit/state.go | 6 +++--- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 306250c..253fbdc 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -42,7 +42,7 @@ func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []s // if the container has a new pid and mount namespace we need to // remount proc and sys to pick up the changes - if container.Namespaces.Contains("CLONE_NEWNS") && container.Namespaces.Contains("CLONE_NEWPID") { + if container.Namespaces.Contains("NEWNS") && container.Namespaces.Contains("NEWPID") { pid, err := system.Fork() if err != nil { return -1, err diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index e385e7f..e6b020b 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -9,12 +9,13 @@ import ( "io/ioutil" "log" "os" + "path/filepath" "strconv" ) var ( - console string - pipeFd int + root, console string + pipeFd int ) var ( @@ -25,6 +26,7 @@ var ( func registerFlags() { flag.StringVar(&console, "console", "", "console (pty slave) path") flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") + flag.StringVar(&root, "root", ".", "root for storing configuration data") flag.Parse() } @@ -84,7 +86,7 @@ func main() { } func loadContainer() (*libcontainer.Container, error) { - f, err := os.Open("container.json") + f, err := os.Open(filepath.Join(root, "container.json")) if err != nil { return nil, err } @@ -98,7 +100,7 @@ func loadContainer() (*libcontainer.Container, error) { } func readPid() (int, error) { - data, err := ioutil.ReadFile(".nspid") + data, err := ioutil.ReadFile(filepath.Join(root, "pid")) if err != nil { return -1, err } @@ -110,5 +112,5 @@ func readPid() (int, error) { } func newNsInit() (nsinit.NsInit, error) { - return nsinit.NewNsInit(&nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{}), nil + return nsinit.NewNsInit(&nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{root}), nil } diff --git a/libcontainer/nsinit/state.go b/libcontainer/nsinit/state.go index 5c719e1..af38008 100644 --- a/libcontainer/nsinit/state.go +++ b/libcontainer/nsinit/state.go @@ -18,11 +18,11 @@ type DefaultStateWriter struct { Root string } -// writePidFile writes the namespaced processes pid to .nspid in the rootfs for the container +// writePidFile writes the namespaced processes pid to pid in the rootfs for the container func (d *DefaultStateWriter) WritePid(pid int) error { - return ioutil.WriteFile(filepath.Join(d.Root, ".nspid"), []byte(fmt.Sprint(pid)), 0655) + return ioutil.WriteFile(filepath.Join(d.Root, "pid"), []byte(fmt.Sprint(pid)), 0655) } func (d *DefaultStateWriter) DeletePid() error { - return os.Remove(filepath.Join(d.Root, ".nspid")) + return os.Remove(filepath.Join(d.Root, "pid")) } From f85823b53de774a6d690832b8f27bc1bde0f01bc Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 25 Feb 2014 15:19:13 -0800 Subject: [PATCH 56/64] Fix cross compile for make cross Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/command.go | 25 ++++++++++++++------- libcontainer/nsinit/execin.go | 2 ++ libcontainer/nsinit/ns_linux.go | 14 ------------ libcontainer/nsinit/nsinit/main.go | 12 +++------- libcontainer/nsinit/unsupported.go | 19 ++++++++++++++++ libcontainer/types.go | 35 ++++++++++++++++-------------- libcontainer/types_linux.go | 16 ++++++++++++++ system/calls_linux.go | 7 ++++++ system/errors.go | 9 ++++++++ system/setns_linux.go | 5 ----- system/unsupported.go | 15 +++++++++++++ 11 files changed, 107 insertions(+), 52 deletions(-) delete mode 100644 libcontainer/nsinit/ns_linux.go create mode 100644 libcontainer/nsinit/unsupported.go create mode 100644 libcontainer/types_linux.go create mode 100644 system/errors.go create mode 100644 system/unsupported.go diff --git a/libcontainer/nsinit/command.go b/libcontainer/nsinit/command.go index 5eb378a..8ddf1e7 100644 --- a/libcontainer/nsinit/command.go +++ b/libcontainer/nsinit/command.go @@ -3,9 +3,9 @@ package nsinit import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/system" "os" "os/exec" - "syscall" ) // CommandFactory takes the container's configuration and options passed by the @@ -15,22 +15,31 @@ type CommandFactory interface { Create(container *libcontainer.Container, console string, syncFd uintptr, args []string) *exec.Cmd } -type DefaultCommandFactory struct{} +type DefaultCommandFactory struct { + Root string +} // Create will return an exec.Cmd with the Cloneflags set to the proper namespaces // defined on the container's configuration and use the current binary as the init with the // args provided func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console string, pipe uintptr, args []string) *exec.Cmd { - // get our binary name so we can always reexec ourself - name := os.Args[0] - command := exec.Command(name, append([]string{ + // get our binary name from arg0 so we can always reexec ourself + command := exec.Command(os.Args[0], append([]string{ "-console", console, "-pipe", fmt.Sprint(pipe), + "-root", c.Root, "init"}, args...)...) - command.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: uintptr(GetNamespaceFlags(container.Namespaces)), - } + system.SetCloneFlags(command, uintptr(GetNamespaceFlags(container.Namespaces))) command.Env = container.Env return command } + +// GetNamespaceFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare, and setns +func GetNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { + for _, ns := range namespaces { + flag |= ns.Value + } + return flag +} diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 253fbdc..55f7b96 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -1,3 +1,5 @@ +// +build linux + package nsinit import ( diff --git a/libcontainer/nsinit/ns_linux.go b/libcontainer/nsinit/ns_linux.go deleted file mode 100644 index ab6322e..0000000 --- a/libcontainer/nsinit/ns_linux.go +++ /dev/null @@ -1,14 +0,0 @@ -package nsinit - -import ( - "github.com/dotcloud/docker/pkg/libcontainer" -) - -// getNamespaceFlags parses the container's Namespaces options to set the correct -// flags on clone, unshare, and setns -func GetNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { - for _, ns := range namespaces { - flag |= ns.Value - } - return flag -} diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index e6b020b..61921c5 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -2,7 +2,6 @@ package main import ( "encoding/json" - "errors" "flag" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/nsinit" @@ -18,11 +17,6 @@ var ( pipeFd int ) -var ( - ErrUnsupported = errors.New("Unsupported method") - ErrWrongArguments = errors.New("Wrong argument count") -) - func registerFlags() { flag.StringVar(&console, "console", "", "console (pty slave) path") flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") @@ -35,7 +29,7 @@ func main() { registerFlags() if flag.NArg() < 1 { - log.Fatal(ErrWrongArguments) + log.Fatalf("wrong number of argments %d", flag.NArg()) } container, err := loadContainer() if err != nil { @@ -71,7 +65,7 @@ func main() { log.Fatal(err) } if flag.NArg() < 2 { - log.Fatal(ErrWrongArguments) + log.Fatalf("wrong number of argments %d", flag.NArg()) } syncPipe, err := nsinit.NewSyncPipeFromFd(0, uintptr(pipeFd)) if err != nil { @@ -112,5 +106,5 @@ func readPid() (int, error) { } func newNsInit() (nsinit.NsInit, error) { - return nsinit.NewNsInit(&nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{root}), nil + return nsinit.NewNsInit(&nsinit.DefaultCommandFactory{root}, &nsinit.DefaultStateWriter{root}), nil } diff --git a/libcontainer/nsinit/unsupported.go b/libcontainer/nsinit/unsupported.go new file mode 100644 index 0000000..2412223 --- /dev/null +++ b/libcontainer/nsinit/unsupported.go @@ -0,0 +1,19 @@ +// +build !linux + +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" +) + +func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) { + return -1, libcontainer.ErrUnsupported +} + +func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) { + return -1, libcontainer.ErrUnsupported +} + +func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { + return libcontainer.ErrUnsupported +} diff --git a/libcontainer/types.go b/libcontainer/types.go index cb64db1..8c28530 100644 --- a/libcontainer/types.go +++ b/libcontainer/types.go @@ -5,25 +5,20 @@ import ( "errors" "github.com/syndtr/gocapability/capability" "os" - "syscall" ) var ( - ErrUnkownNamespace error = errors.New("Unkown namespace") + ErrUnkownNamespace = errors.New("Unknown namespace") + ErrUnkownCapability = errors.New("Unknown capability") + ErrUnsupported = errors.New("Unsupported method") ) // namespaceList is used to convert the libcontainer types // into the names of the files located in /proc//ns/* for // each namespace var ( - namespaceList = Namespaces{ - {Key: "NEWNS", Value: syscall.CLONE_NEWNS, File: "mnt"}, - {Key: "NEWUTS", Value: syscall.CLONE_NEWUTS, File: "uts"}, - {Key: "NEWIPC", Value: syscall.CLONE_NEWIPC, File: "ipc"}, - {Key: "NEWUSER", Value: syscall.CLONE_NEWUSER, File: "user"}, - {Key: "NEWPID", Value: syscall.CLONE_NEWPID, File: "pid"}, - {Key: "NEWNET", Value: syscall.CLONE_NEWNET, File: "net"}, - } + namespaceList = Namespaces{} + capabilityList = Capabilities{ {Key: "SETPCAP", Value: capability.CAP_SETPCAP}, {Key: "SYS_MODULE", Value: capability.CAP_SYS_MODULE}, @@ -52,6 +47,10 @@ type ( Namespaces []*Namespace ) +func (ns *Namespace) String() string { + return ns.Key +} + func (ns *Namespace) MarshalJSON() ([]byte, error) { return json.Marshal(ns.Key) } @@ -95,20 +94,24 @@ type ( Capabilities []*Capability ) -func (ns *Capability) MarshalJSON() ([]byte, error) { - return json.Marshal(ns.Key) +func (c *Capability) String() string { + return c.Key } -func (ns *Capability) UnmarshalJSON(src []byte) error { +func (c *Capability) MarshalJSON() ([]byte, error) { + return json.Marshal(c.Key) +} + +func (c *Capability) UnmarshalJSON(src []byte) error { var capName string if err := json.Unmarshal(src, &capName); err != nil { return err } ret := GetCapability(capName) if ret == nil { - return ErrUnkownNamespace + return ErrUnkownCapability } - *ns = *ret + *c = *ret return nil } @@ -119,7 +122,7 @@ func GetCapability(key string) *Capability { } } if os.Getenv("DEBUG") != "" { - panic("Unreachable: Namespace not found") + panic("Unreachable: Capability not found") } return nil } diff --git a/libcontainer/types_linux.go b/libcontainer/types_linux.go new file mode 100644 index 0000000..c14531d --- /dev/null +++ b/libcontainer/types_linux.go @@ -0,0 +1,16 @@ +package libcontainer + +import ( + "syscall" +) + +func init() { + namespaceList = Namespaces{ + {Key: "NEWNS", Value: syscall.CLONE_NEWNS, File: "mnt"}, + {Key: "NEWUTS", Value: syscall.CLONE_NEWUTS, File: "uts"}, + {Key: "NEWIPC", Value: syscall.CLONE_NEWIPC, File: "ipc"}, + {Key: "NEWUSER", Value: syscall.CLONE_NEWUSER, File: "user"}, + {Key: "NEWPID", Value: syscall.CLONE_NEWPID, File: "pid"}, + {Key: "NEWNET", Value: syscall.CLONE_NEWNET, File: "net"}, + } +} diff --git a/system/calls_linux.go b/system/calls_linux.go index b7a8f14..bf667c5 100644 --- a/system/calls_linux.go +++ b/system/calls_linux.go @@ -136,3 +136,10 @@ func Mkfifo(name string, mode uint32) error { func Umask(mask int) int { return syscall.Umask(mask) } + +func SetCloneFlags(cmd *exec.Cmd, flag uintptr) { + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + cmd.SysProcAttr.Cloneflags = flag +} diff --git a/system/errors.go b/system/errors.go new file mode 100644 index 0000000..6304518 --- /dev/null +++ b/system/errors.go @@ -0,0 +1,9 @@ +package system + +import ( + "errors" +) + +var ( + ErrNotSupportedPlatform = errors.New("platform and architecture is not supported") +) diff --git a/system/setns_linux.go b/system/setns_linux.go index 07b1c93..2b6f9e7 100644 --- a/system/setns_linux.go +++ b/system/setns_linux.go @@ -1,16 +1,11 @@ package system import ( - "errors" "fmt" "runtime" "syscall" ) -var ( - ErrNotSupportedPlatform = errors.New("platform and architecture is not supported") -) - // Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 // // We need different setns values for the different platforms and arch diff --git a/system/unsupported.go b/system/unsupported.go new file mode 100644 index 0000000..eb3ec7e --- /dev/null +++ b/system/unsupported.go @@ -0,0 +1,15 @@ +// +build !linux + +package system + +import ( + "os/exec" +) + +func SetCloneFlags(cmd *exec.Cmd, flag uintptr) { + +} + +func UsetCloseOnExec(fd uintptr) error { + return ErrNotSupportedPlatform +} From d8025d106638fba6f18ee177bb15dffaf08483a2 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 25 Feb 2014 19:45:57 -0800 Subject: [PATCH 57/64] Fix cgroups swap issue when it is not supported Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- cgroups/cgroups.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index e260d67..b40e1a3 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -223,8 +223,10 @@ func (c *Cgroup) setupMemory(cgroupRoot string, pid int) (err error) { return err } } - if c.MemorySwap != 0 { - if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.MemorySwap, 10)); err != nil { + // By default, MemorySwap is set to twice the size of RAM. + // If you want to omit MemorySwap, set it to `-1'. + if c.MemorySwap != -1 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil { return err } } From 4f6cdc6f08c4762e7cd377124e734c907cad3f9c Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 26 Feb 2014 14:19:39 -0800 Subject: [PATCH 58/64] Make network a slice to support multiple types Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 21 +++++++++++---------- libcontainer/container.go | 2 +- libcontainer/container.json | 21 +++++++++++---------- libcontainer/network/strategy.go | 2 +- libcontainer/network/veth.go | 26 ++++++++++++-------------- libcontainer/nsinit/exec.go | 13 +++++-------- libcontainer/nsinit/init.go | 6 +++--- 7 files changed, 44 insertions(+), 47 deletions(-) diff --git a/libcontainer/README.md b/libcontainer/README.md index 36553af..4c8da8e 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -48,16 +48,17 @@ Sample `container.json` file: "MAC_ADMIN", "NET_ADMIN" ], - "network": { - "type": "veth", - "context": { - "bridge": "docker0", - "prefix": "dock" - }, - "address": "172.17.0.100/16", - "gateway": "172.17.42.1", - "mtu": 1500 - }, + "networks": [{ + "type": "veth", + "context": { + "bridge": "docker0", + "prefix": "dock" + }, + "address": "172.17.0.100/16", + "gateway": "172.17.42.1", + "mtu": 1500 + } + ], "cgroups": { "name": "docker-koye", "parent": "docker", diff --git a/libcontainer/container.go b/libcontainer/container.go index 4a47977..12a3d7b 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -19,7 +19,7 @@ type Container struct { Tty bool `json:"tty,omitempty"` // setup a proper tty or not Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop - Network *Network `json:"network,omitempty"` // nil for host's network stack + Networks []*Network `json:"networks,omitempty"` // nil for host's network stack Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"` } diff --git a/libcontainer/container.json b/libcontainer/container.json index c2b21f8..83e4074 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -31,16 +31,17 @@ "MAC_ADMIN", "NET_ADMIN" ], - "network": { - "type": "veth", - "context": { - "bridge": "docker0", - "prefix": "dock" - }, - "address": "172.17.0.100/16", - "gateway": "172.17.42.1", - "mtu": 1500 - }, + "networks": [{ + "type": "veth", + "context": { + "bridge": "docker0", + "prefix": "dock" + }, + "address": "172.17.0.100/16", + "gateway": "172.17.42.1", + "mtu": 1500 + } + ], "cgroups": { "name": "docker-koye", "parent": "docker", diff --git a/libcontainer/network/strategy.go b/libcontainer/network/strategy.go index 8ecc11a..a2f4f8f 100644 --- a/libcontainer/network/strategy.go +++ b/libcontainer/network/strategy.go @@ -16,7 +16,7 @@ var strategies = map[string]NetworkStrategy{ // NetworkStrategy represends a specific network configuration for // a containers networking stack type NetworkStrategy interface { - Create(*libcontainer.Network, int) (libcontainer.Context, error) + Create(*libcontainer.Network, int, libcontainer.Context) error Initialize(*libcontainer.Network, libcontainer.Context) error } diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go index 49e63f0..3ab1b23 100644 --- a/libcontainer/network/veth.go +++ b/libcontainer/network/veth.go @@ -12,39 +12,37 @@ import ( type Veth struct { } -func (v *Veth) Create(n *libcontainer.Network, nspid int) (libcontainer.Context, error) { +func (v *Veth) Create(n *libcontainer.Network, nspid int, context libcontainer.Context) error { var ( bridge string prefix string exists bool ) if bridge, exists = n.Context["bridge"]; !exists { - return nil, fmt.Errorf("bridge does not exist in network context") + return fmt.Errorf("bridge does not exist in network context") } if prefix, exists = n.Context["prefix"]; !exists { - return nil, fmt.Errorf("veth prefix does not exist in network context") + return fmt.Errorf("veth prefix does not exist in network context") } name1, name2, err := createVethPair(prefix) if err != nil { - return nil, err - } - context := libcontainer.Context{ - "vethHost": name1, - "vethChild": name2, + return err } + context["veth-host"] = name1 + context["veth-child"] = name2 if err := SetInterfaceMaster(name1, bridge); err != nil { - return context, err + return err } if err := SetMtu(name1, n.Mtu); err != nil { - return context, err + return err } if err := InterfaceUp(name1); err != nil { - return context, err + return err } if err := SetInterfaceInNamespacePid(name2, nspid); err != nil { - return context, err + return err } - return context, nil + return nil } func (v *Veth) Initialize(config *libcontainer.Network, context libcontainer.Context) error { @@ -52,7 +50,7 @@ func (v *Veth) Initialize(config *libcontainer.Network, context libcontainer.Con vethChild string exists bool ) - if vethChild, exists = context["vethChild"]; !exists { + if vethChild, exists = context["veth-child"]; !exists { return fmt.Errorf("vethChild does not exist in network context") } if err := InterfaceDown(vethChild); err != nil { diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index b13326b..f7a9c17 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -84,18 +84,15 @@ func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) er } func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error { - if container.Network != nil { - strategy, err := network.GetStrategy(container.Network.Type) + context := libcontainer.Context{} + for _, config := range container.Networks { + strategy, err := network.GetStrategy(config.Type) if err != nil { return err } - networkContext, err := strategy.Create(container.Network, nspid) - if err != nil { - return err - } - if err := pipe.SendToChild(networkContext); err != nil { + if err := strategy.Create(config, nspid, context); err != nil { return err } } - return nil + return pipe.SendToChild(context) } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 1229560..cfc5058 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -56,7 +56,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { return fmt.Errorf("setup mount namespace %s", err) } - if err := setupNetwork(container.Network, context); err != nil { + if err := setupNetwork(container, context); err != nil { return fmt.Errorf("setup networking %s", err) } if err := system.Sethostname(container.Hostname); err != nil { @@ -130,8 +130,8 @@ func dupSlave(slave *os.File) error { // setupVethNetwork uses the Network config if it is not nil to initialize // the new veth interface inside the container for use by changing the name to eth0 // setting the MTU and IP address along with the default gateway -func setupNetwork(config *libcontainer.Network, context libcontainer.Context) error { - if config != nil { +func setupNetwork(container *libcontainer.Container, context libcontainer.Context) error { + for _, config := range container.Networks { strategy, err := network.GetStrategy(config.Type) if err != nil { return err From f8262b5748caa42763b2b33ff182da159fb08b8a Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 26 Feb 2014 17:21:09 -0800 Subject: [PATCH 59/64] Ensure that loopback devices are mounted inside the conatiner Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/mount.go | 53 +++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index a73e97e..55c2655 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -37,6 +37,9 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := copyDevNodes(rootfs); err != nil { return fmt.Errorf("copy dev nodes %s", err) } + if err := setupLoopbackDevices(rootfs); err != nil { + return fmt.Errorf("setup loopback devices %s", err) + } if err := setupDev(rootfs); err != nil { return err } @@ -76,21 +79,57 @@ func copyDevNodes(rootfs string) error { "urandom", "tty", } { - stat, err := os.Stat(filepath.Join("/dev", node)) + if err := copyDevNode(rootfs, node); err != nil { + return err + } + } + return nil +} + +func setupLoopbackDevices(rootfs string) error { + for i := 0; ; i++ { + var ( + device = fmt.Sprintf("loop%d", i) + source = filepath.Join("/dev", device) + dest = filepath.Join(rootfs, "dev", device) + ) + + if _, err := os.Stat(source); err != nil { + if !os.IsNotExist(err) { + return err + } + return nil + } + if _, err := os.Stat(dest); err == nil { + os.Remove(dest) + } + f, err := os.Create(dest) if err != nil { return err } - var ( - dest = filepath.Join(rootfs, "dev", node) - st = stat.Sys().(*syscall.Stat_t) - ) - if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { - return fmt.Errorf("copy %s %s", node, err) + f.Close() + if err := system.Mount(source, dest, "none", syscall.MS_BIND, ""); err != nil { + return err } } return nil } +func copyDevNode(rootfs, node string) error { + stat, err := os.Stat(filepath.Join("/dev", node)) + if err != nil { + return err + } + var ( + dest = filepath.Join(rootfs, "dev", node) + st = stat.Sys().(*syscall.Stat_t) + ) + if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { + return fmt.Errorf("copy %s %s", node, err) + } + return nil +} + // setupDev symlinks the current processes pipes into the // appropriate destination on the containers rootfs func setupDev(rootfs string) error { From 34301be2008095904c586d5fec9a81d165347526 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 26 Feb 2014 19:19:14 -0800 Subject: [PATCH 60/64] Code review updates Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 12 +++++++----- libcontainer/network/strategy.go | 4 ++-- libcontainer/nsinit/exec.go | 4 +--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/libcontainer/README.md b/libcontainer/README.md index 4c8da8e..b81401c 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -9,9 +9,9 @@ for using linux namespaces with no external dependencies. libcontainer provides #### container A container is a self contained directory that is able to run one or more processes inside without affecting the host system. The directory is usually a full system tree. Inside the directory -a `container.json` file just be placed with the runtime configuration for how the process -should be contained and run. Environment, networking, and different capabilities for the -process are specified in this file. +a `container.json` file is placed with the runtime configuration for how the processes +should be contained and ran. Environment, networking, and different capabilities for the +process are specified in this file. The configuration is used for each process executed inside the container. Sample `container.json` file: ```json @@ -67,10 +67,12 @@ Sample `container.json` file: } ``` -Using this configuration and the current directory holding the rootfs for a process to live, one can se libcontainer to exec the container. Running the life of the namespace a `.nspid` file -is written to the current directory with the pid of the namespace'd process to the external word. A client can use this pid to wait, kill, or perform other operation with the container. If a user tries to run an new process inside an existing container with a live namespace with namespace will be joined by the new process. +Using this configuration and the current directory holding the rootfs for a process to live, one can use libcontainer to exec the container. Running the life of the namespace a `pid` file +is written to the current directory with the pid of the namespace'd process to the external world. A client can use this pid to wait, kill, or perform other operation with the container. If a user tries to run an new process inside an existing container with a live namespace with namespace will be joined by the new process. +You may also specify an alternate root to to place the `container.json` file is read and where the `pid` file will be saved. + #### nsinit `nsinit` is a cli application used as the reference implementation of libcontainer. It is able to diff --git a/libcontainer/network/strategy.go b/libcontainer/network/strategy.go index a2f4f8f..234fcc0 100644 --- a/libcontainer/network/strategy.go +++ b/libcontainer/network/strategy.go @@ -13,8 +13,8 @@ var strategies = map[string]NetworkStrategy{ "veth": &Veth{}, } -// NetworkStrategy represends a specific network configuration for -// a containers networking stack +// NetworkStrategy represents a specific network configuration for +// a container's networking stack type NetworkStrategy interface { Create(*libcontainer.Network, int, libcontainer.Context) error Initialize(*libcontainer.Network, libcontainer.Context) error diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index f7a9c17..f1a4e24 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -48,9 +48,7 @@ func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args [ command.Process.Kill() return -1, err } - defer func() { - ns.stateWriter.DeletePid() - }() + defer ns.stateWriter.DeletePid() // Do this before syncing with child so that no children // can escape the cgroup From 85696fdb679e368030818a0ddc90783e713b46f0 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 27 Feb 2014 09:28:26 -0800 Subject: [PATCH 61/64] Allow child process to live if daemon dies Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/init.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index cfc5058..cc481e2 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -50,9 +50,11 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol } } - if err := system.ParentDeathSignal(); err != nil { - return fmt.Errorf("parent deth signal %s", err) - } + /* + if err := system.ParentDeathSignal(); err != nil { + return fmt.Errorf("parent death signal %s", err) + } + */ if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { return fmt.Errorf("setup mount namespace %s", err) } From ab952e250272af790ae95e134a840d26e51be1bd Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 3 Mar 2014 11:31:37 -0800 Subject: [PATCH 62/64] Update readme to remove .nspid Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcontainer/README.md b/libcontainer/README.md index b81401c..1032531 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -87,4 +87,4 @@ nsinit exec /bin/bash If you wish to spawn another process inside the container while your current bash session is running just run the exact same command again to get another bash shell or change the command. If the original process dies, PID 1, all other processes spawned inside the container will also be killed and the namespace will be removed. -You can identify if a process is running in a container by looking to see if `.nspid` is in the root of the directory. +You can identify if a process is running in a container by looking to see if `pid` is in the root of the directory. From 7dc071dca54e9c939f8b2376406cc5b2a4d824f8 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 3 Mar 2014 12:15:47 -0800 Subject: [PATCH 63/64] Factor out finalize namespace Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/execin.go | 5 ++--- libcontainer/nsinit/init.go | 29 +++++++++++++++++++---------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 55f7b96..488fe0e 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -5,7 +5,6 @@ package nsinit import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/system" "os" "path/filepath" @@ -73,8 +72,8 @@ func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []s os.Exit(state.Sys().(syscall.WaitStatus).ExitStatus()) } dropAndExec: - if err := capabilities.DropCapabilities(container); err != nil { - return -1, fmt.Errorf("drop capabilities %s", err) + if err := finalizeNamespace(container); err != nil { + return -1, err } if err := system.Execv(args[0], args[0:], container.Env); err != nil { return -1, err diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index cc481e2..565030f 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -64,16 +64,8 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol if err := system.Sethostname(container.Hostname); err != nil { return fmt.Errorf("sethostname %s", err) } - if err := capabilities.DropCapabilities(container); err != nil { - return fmt.Errorf("drop capabilities %s", err) - } - if err := setupUser(container); err != nil { - return fmt.Errorf("setup user %s", err) - } - if container.WorkingDir != "" { - if err := system.Chdir(container.WorkingDir); err != nil { - return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) - } + if err := finalizeNamespace(container); err != nil { + return fmt.Errorf("finalize namespace %s", err) } return system.Execv(args[0], args[0:], container.Env) } @@ -142,3 +134,20 @@ func setupNetwork(container *libcontainer.Container, context libcontainer.Contex } return nil } + +// finalizeNamespace drops the caps and sets the correct user +// and working dir before execing the command inside the namespace +func finalizeNamespace(container *libcontainer.Container) error { + if err := capabilities.DropCapabilities(container); err != nil { + return fmt.Errorf("drop capabilities %s", err) + } + if err := setupUser(container); err != nil { + return fmt.Errorf("setup user %s", err) + } + if container.WorkingDir != "" { + if err := system.Chdir(container.WorkingDir); err != nil { + return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) + } + } + return nil +} From 313d6a9e13f800bc963f6ea67fe5666ba55f3a6d Mon Sep 17 00:00:00 2001 From: Sven Dowideit Date: Thu, 27 Feb 2014 23:36:19 -0800 Subject: [PATCH 64/64] very minor spelling Docker-DCO-1.1-Signed-off-by: Sven Dowideit (github: SvenDowideit) --- libcontainer/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libcontainer/README.md b/libcontainer/README.md index 1032531..d6e4ded 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -7,7 +7,7 @@ for using linux namespaces with no external dependencies. libcontainer provides #### container -A container is a self contained directory that is able to run one or more processes inside without +A container is a self contained directory that is able to run one or more processes without affecting the host system. The directory is usually a full system tree. Inside the directory a `container.json` file is placed with the runtime configuration for how the processes should be contained and ran. Environment, networking, and different capabilities for the @@ -67,11 +67,11 @@ Sample `container.json` file: } ``` -Using this configuration and the current directory holding the rootfs for a process to live, one can use libcontainer to exec the container. Running the life of the namespace a `pid` file -is written to the current directory with the pid of the namespace'd process to the external world. A client can use this pid to wait, kill, or perform other operation with the container. If a user tries to run an new process inside an existing container with a live namespace with namespace will be joined by the new process. +Using this configuration and the current directory holding the rootfs for a process, one can use libcontainer to exec the container. Running the life of the namespace, a `pid` file +is written to the current directory with the pid of the namespaced process to the external world. A client can use this pid to wait, kill, or perform other operation with the container. If a user tries to run an new process inside an existing container with a live namespace the namespace will be joined by the new process. -You may also specify an alternate root to to place the `container.json` file is read and where the `pid` file will be saved. +You may also specify an alternate root place where the `container.json` file is read and where the `pid` file will be saved. #### nsinit @@ -79,7 +79,7 @@ You may also specify an alternate root to to place the `container.json` file is spawn or join new containers giving the current directory. To use `nsinit` cd into a linux rootfs and copy a `container.json` file into the directory with your specified configuration. -To execution `/bin/bash` in the current directory as a container just run: +To execute `/bin/bash` in the current directory as a container just run: ```bash nsinit exec /bin/bash ```