Initial commit of libcontainer

Docker-DCO-1.1-Signed-off-by: Michael Crosby <michael@crosbymichael.com> (github: crosbymichael)
This commit is contained in:
Michael Crosby 2014-02-18 16:56:11 -08:00
parent f8923d8060
commit 81d2c67492
20 changed files with 1531 additions and 0 deletions

View file

@ -0,0 +1,164 @@
package namespaces
import (
"fmt"
"os"
"syscall"
"unsafe"
)
// ioctl request codes handed to ioctl by the pty helpers in this file.
// NOTE(review): these look like the Linux x86 encodings — confirm before
// building for another architecture.
const (
	// TIOCGPTN is the request ptsname issues to read the pty number.
	TIOCGPTN = 0x80045430
	// TIOCSPTLCK is the request unlockpt issues on the pty master.
	TIOCSPTLCK = 0x40045431
)
// chroot forwards dir to syscall.Chroot and returns its error unchanged.
func chroot(dir string) error {
	return syscall.Chroot(dir)
}
// chdir forwards dir to syscall.Chdir and returns its error unchanged.
func chdir(dir string) error {
	return syscall.Chdir(dir)
}
// exec forwards cmd, args and env to syscall.Exec.
//
// Callers in this package treat any return from exec as a failure and
// place an "unreachable" panic after the call.
func exec(cmd string, args []string, env []string) error {
	return syscall.Exec(cmd, args, env)
}
// fork issues a raw SYS_FORK system call while holding syscall.ForkLock.
//
// It returns the first result of the system call converted to int (call
// sites in this package treat 0 as "running in the child"), or -1 together
// with the errno when the call reports a failure.
func fork() (int, error) {
	// ForkLock is held only for the duration of the system call itself
	syscall.ForkLock.Lock()
	pid, _, err := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0)
	syscall.ForkLock.Unlock()
	if err != 0 {
		return -1, err
	}
	return int(pid), nil
}
// vfork issues a raw SYS_VFORK system call while holding syscall.ForkLock.
//
// It returns the first result of the system call converted to int, or -1
// together with the errno when the call reports a failure.
func vfork() (int, error) {
	// ForkLock is held only for the duration of the system call itself
	syscall.ForkLock.Lock()
	pid, _, err := syscall.Syscall(syscall.SYS_VFORK, 0, 0, 0)
	syscall.ForkLock.Unlock()
	if err != 0 {
		return -1, err
	}
	return int(pid), nil
}
// mount forwards its arguments, in the same order, to syscall.Mount and
// returns its error unchanged.
func mount(source, target, fstype string, flags uintptr, data string) error {
	return syscall.Mount(source, target, fstype, flags, data)
}
// unmount forwards target and flags to syscall.Unmount and returns its
// error unchanged.
func unmount(target string, flags int) error {
	return syscall.Unmount(target, flags)
}
// pivotroot forwards newroot and putold to syscall.PivotRoot and returns
// its error unchanged.
func pivotroot(newroot, putold string) error {
	return syscall.PivotRoot(newroot, putold)
}
// unshare forwards flags to syscall.Unshare and returns its error unchanged.
func unshare(flags int) error {
	return syscall.Unshare(flags)
}
// clone issues a raw SYS_CLONE system call with flags as its first argument
// (the remaining arguments are zero) while holding syscall.ForkLock.
//
// It returns the first result of the system call converted to int (Exec
// treats 0 as "running in the child"), or -1 together with the errno when
// the call reports a failure.
func clone(flags uintptr) (int, error) {
	// ForkLock is held only for the duration of the system call itself
	syscall.ForkLock.Lock()
	pid, _, err := syscall.RawSyscall(syscall.SYS_CLONE, flags, 0, 0)
	syscall.ForkLock.Unlock()
	if err != 0 {
		return -1, err
	}
	return int(pid), nil
}
// setns invokes the SYS_SETNS system call with fd and flags and reports a
// non-zero errno as the error.
func setns(fd uintptr, flags uintptr) error {
	_, _, errno := syscall.RawSyscall(SYS_SETNS, fd, flags, 0)
	if errno == 0 {
		return nil
	}
	return errno
}
// usetCloseOnExec issues fcntl(fd, F_SETFD, 0), i.e. it writes a zero
// descriptor-flag word for fd, and reports a non-zero errno as the error.
func usetCloseOnExec(fd uintptr) error {
	_, _, errno := syscall.Syscall(syscall.SYS_FCNTL, fd, syscall.F_SETFD, 0)
	if errno == 0 {
		return nil
	}
	return errno
}
// setgroups forwards gids to syscall.Setgroups and returns its error unchanged.
func setgroups(gids []int) error {
	return syscall.Setgroups(gids)
}
// setresgid forwards rgid, egid and sgid to syscall.Setresgid and returns
// its error unchanged.
func setresgid(rgid, egid, sgid int) error {
	return syscall.Setresgid(rgid, egid, sgid)
}
// setresuid forwards ruid, euid and suid to syscall.Setresuid and returns
// its error unchanged.
func setresuid(ruid, euid, suid int) error {
	return syscall.Setresuid(ruid, euid, suid)
}
// sethostname converts name to a byte slice and passes it to
// syscall.Sethostname, returning its error unchanged.
func sethostname(name string) error {
	return syscall.Sethostname([]byte(name))
}
// setsid forwards to syscall.Setsid and returns both of its results unchanged.
func setsid() (int, error) {
	return syscall.Setsid()
}
// ioctl issues the SYS_IOCTL system call for fd with the request code flag
// and the argument word data, reporting a non-zero errno as the error.
func ioctl(fd uintptr, flag, data uintptr) error {
	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data)
	if errno == 0 {
		return nil
	}
	return errno
}
// openpmtx opens the pty multiplexer /dev/ptmx and returns the resulting
// master file.
//
// The master is opened read-write so that data can be written to it as well
// as read from it (a read-only master cannot carry input), matching how
// createMasterAndConsole opens the same device. O_NOCTTY and O_CLOEXEC are
// passed through to the open call.
func openpmtx() (*os.File, error) {
	return os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
}
// unlockpt issues the TIOCSPTLCK ioctl on the pty master f, passing a
// pointer to a zero value.
//
// The argument is declared as int32 so that its size matches the C int the
// kernel reads, independent of the width of Go's int.
func unlockpt(f *os.File) error {
	var u int32
	return ioctl(f.Fd(), TIOCSPTLCK, uintptr(unsafe.Pointer(&u)))
}
// ptsname asks the pty master f for its pty number via the TIOCGPTN ioctl
// and returns the matching path under /dev/pts.
//
// The kernel stores a 32-bit unsigned int through the pointer; receiving it
// in a uint32 (rather than Go's int, which is 8 bytes on 64-bit targets)
// keeps the result correct regardless of word size and byte order.
func ptsname(f *os.File) (string, error) {
	var n uint32
	if err := ioctl(f.Fd(), TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil {
		return "", err
	}
	return fmt.Sprintf("/dev/pts/%d", n), nil
}
// closefd converts fd to int and passes it to syscall.Close, returning its
// error unchanged.
func closefd(fd uintptr) error {
	return syscall.Close(int(fd))
}
// dup2 converts fd1 and fd2 to int and passes them, in that order, to
// syscall.Dup2, returning its error unchanged.
func dup2(fd1, fd2 uintptr) error {
	return syscall.Dup2(int(fd1), int(fd2))
}
// mknod forwards path, mode and dev to syscall.Mknod and returns its error
// unchanged.
func mknod(path string, mode uint32, dev int) error {
	return syscall.Mknod(path, mode, dev)
}
// parentDeathSignal calls prctl(PR_SET_PDEATHSIG, SIGKILL) for the calling
// process and reports a non-zero errno as the error.
func parentDeathSignal() error {
	_, _, errno := syscall.RawSyscall6(
		syscall.SYS_PRCTL,
		syscall.PR_SET_PDEATHSIG,
		uintptr(syscall.SIGKILL),
		0, 0, 0, 0,
	)
	if errno == 0 {
		return nil
	}
	return errno
}
// setctty issues the TIOCSCTTY ioctl on file descriptor 0 with a zero
// argument and reports a non-zero errno as the error.
func setctty() error {
	_, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, 0, uintptr(syscall.TIOCSCTTY), 0)
	if errno == 0 {
		return nil
	}
	return errno
}
// mkfifo forwards name and mode to syscall.Mkfifo and returns its error
// unchanged.
func mkfifo(name string, mode uint32) error {
	return syscall.Mkfifo(name, mode)
}
// umask forwards mask to syscall.Umask and returns its result unchanged.
func umask(mask int) int {
	return syscall.Umask(mask)
}

View file

@ -0,0 +1,266 @@
/*
Higher-level convenience functions for setting up a container
*/
package namespaces
import (
"errors"
"fmt"
"github.com/dotcloud/docker/pkg/libcontainer"
"github.com/dotcloud/docker/pkg/libcontainer/capabilities"
"github.com/dotcloud/docker/pkg/libcontainer/utils"
"io"
"log"
"os"
"path/filepath"
"syscall"
)
var (
	// ErrExistingNetworkNamespace is returned by Exec when the container
	// requests CLONE_NEWNET while also carrying an existing network
	// namespace fd (NetNsFd > 0).
	ErrExistingNetworkNamespace = errors.New("specified both CLONE_NEWNET and an existing network namespace")
)
// Exec will spawn new namespaces with the specified Container configuration
// in the RootFs path and return the pid of the new containerized process.
//
// If an existing network namespace is specified the container
// will join that namespace. If an existing network namespace is not specified but CLONE_NEWNET is,
// the container will be spawned with a new network namespace with no configuration. Omitting an
// existing network namespace and the CLONE_NEWNET option in the container configuration will allow
// the container to use the host's networking options and configuration.
//
// On failure in the calling process Exec returns -1 and an error; the pty
// master created for the container is closed on those paths so that it does
// not leak. Failures inside the cloned child are written with writeError,
// which terminates the child with exit status 1.
//
// On success two goroutines are started that copy between the pty master and
// the calling process's os.Stdout / os.Stdin; they run until io.Copy returns.
func Exec(container *libcontainer.Container) (pid int, err error) {
	// a user cannot pass CLONE_NEWNET and an existing net namespace fd to join
	if container.NetNsFd > 0 && container.Namespaces.Contains(libcontainer.CLONE_NEWNET) {
		return -1, ErrExistingNetworkNamespace
	}

	rootfs, err := resolveRootfs(container)
	if err != nil {
		return -1, err
	}

	master, console, err := createMasterAndConsole()
	if err != nil {
		return -1, err
	}

	logger, err := os.OpenFile("/root/logs", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0755)
	if err != nil {
		// nothing will ever use the pty if we bail out here
		master.Close()
		return -1, err
	}
	log.SetOutput(logger)

	// we need CLONE_VFORK so we can wait on the child
	flag := getNamespaceFlags(container.Namespaces) | CLONE_VFORK

	if pid, err = clone(uintptr(flag | SIGCHLD)); err != nil {
		// no child was created, so the pty master has no peer
		master.Close()
		return -1, fmt.Errorf("error cloning process: %s", err)
	}

	if pid == 0 {
		// welcome to your new namespace ;)
		//
		// any errors encountered inside the namespace we should write
		// out to a log or a pipe to our parent and exit(1)
		// because writing to stderr will not work after we close
		if err := closeMasterAndStd(master); err != nil {
			writeError("close master and std %s", err)
		}
		// fds 0-2 were just closed, so dupSlave expects the slave to land on fd 0
		slave, err := openTerminal(console, syscall.O_RDWR)
		if err != nil {
			writeError("open terminal %s", err)
		}
		if err := dupSlave(slave); err != nil {
			writeError("dup2 slave %s", err)
		}

		if container.NetNsFd > 0 {
			if err := JoinExistingNamespace(container.NetNsFd, libcontainer.CLONE_NEWNET); err != nil {
				writeError("join existing net namespace %s", err)
			}
		}

		if _, err := setsid(); err != nil {
			writeError("setsid %s", err)
		}
		if err := setctty(); err != nil {
			writeError("setctty %s", err)
		}
		if err := parentDeathSignal(); err != nil {
			writeError("parent deth signal %s", err)
		}
		if err := SetupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil {
			writeError("setup mount namespace %s", err)
		}
		if err := sethostname(container.ID); err != nil {
			writeError("sethostname %s", err)
		}
		if err := capabilities.DropCapabilities(container); err != nil {
			writeError("drop capabilities %s", err)
		}
		if err := setupUser(container); err != nil {
			writeError("setup user %s", err)
		}
		if container.WorkingDir != "" {
			if err := chdir(container.WorkingDir); err != nil {
				writeError("chdir to %s %s", container.WorkingDir, err)
			}
		}
		if err := exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil {
			writeError("exec %s", err)
		}
		panic("unreachable")
	}

	// parent: relay the container's terminal to and from our own stdio
	go func() {
		if _, err := io.Copy(os.Stdout, master); err != nil {
			log.Println(err)
		}
	}()
	go func() {
		if _, err := io.Copy(master, os.Stdin); err != nil {
			log.Println(err)
		}
	}()
	return pid, nil
}
// ExecIn will spawn a new command inside an existing container's namespaces. The existing container's
// pid and namespace configuration is needed along with the specific capabilities that should
// be dropped once inside the namespace.
//
// It returns the pid reported by fork to the calling process, or -1 and an
// error when container.NsPid is not positive, the namespace fds cannot be
// opened, or fork fails.
//
// NOTE(review): on the success path the calling process returns without
// closing fds — confirm whether that is intended.
func ExecIn(container *libcontainer.Container, cmd *libcontainer.Command) (int, error) {
	if container.NsPid <= 0 {
		return -1, libcontainer.ErrInvalidPid
	}

	// one fd per namespace configured on the container, opened from NsPid
	fds, err := getNsFds(container)
	if err != nil {
		return -1, err
	}

	// an explicitly supplied network namespace fd is joined like the others
	if container.NetNsFd > 0 {
		fds = append(fds, container.NetNsFd)
	}

	pid, err := fork()
	if err != nil {
		for _, fd := range fds {
			syscall.Close(int(fd))
		}
		return -1, err
	}

	if pid == 0 {
		// child: join each namespace, closing its fd once joined
		for _, fd := range fds {
			if fd > 0 {
				if err := JoinExistingNamespace(fd, ""); err != nil {
					for _, fd := range fds {
						syscall.Close(int(fd))
					}
					writeError("join existing namespace for %d %s", fd, err)
				}
			}
			syscall.Close(int(fd))
		}

		if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) &&
			container.Namespaces.Contains(libcontainer.CLONE_NEWPID) {
			// important:
			//
			// we need to fork and unshare so that we can remount proc and sys within
			// the namespace so the CLONE_NEWPID namespace will take effect
			// if we don't fork we would end up unmounting proc and sys for the entire
			// namespace
			child, err := fork()
			if err != nil {
				writeError("fork child %s", err)
			}

			if child == 0 {
				if err := unshare(CLONE_NEWNS); err != nil {
					writeError("unshare newns %s", err)
				}
				if err := remountProc(); err != nil {
					writeError("remount proc %s", err)
				}
				if err := remountSys(); err != nil {
					writeError("remount sys %s", err)
				}
				if err := capabilities.DropCapabilities(container); err != nil {
					writeError("drop caps %s", err)
				}
				if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil {
					writeError("exec %s", err)
				}
				panic("unreachable")
			}
			// intermediate process: mirror the grandchild's exit status
			exit, err := utils.WaitOnPid(child)
			if err != nil {
				writeError("wait on child %s", err)
			}
			os.Exit(exit)
		}
		// no mount+pid namespace pair configured: exec directly in this child
		if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil {
			writeError("exec %s", err)
		}
		panic("unreachable")
	}
	return pid, err
}
// resolveRootfs turns container.RootFs into an absolute path with
// filepath.Abs and then resolves symbolic links in it with
// filepath.EvalSymlinks. Any error from either step is returned with an
// empty path from the first step, or as returned by EvalSymlinks.
func resolveRootfs(container *libcontainer.Container) (string, error) {
	rootfs, err := filepath.Abs(container.RootFs)
	if err != nil {
		return "", err
	}
	return filepath.EvalSymlinks(rootfs)
}
// createMasterAndConsole opens a new pty master on /dev/ptmx, looks up the
// path of its slave with ptsname and unlocks it with unlockpt.
//
// It returns the open master and the slave path. If any step after the open
// fails, the master is closed before the error is returned so the descriptor
// does not leak.
func createMasterAndConsole() (*os.File, string, error) {
	master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
	if err != nil {
		return nil, "", err
	}
	console, err := ptsname(master)
	if err != nil {
		master.Close()
		return nil, "", err
	}
	if err := unlockpt(master); err != nil {
		master.Close()
		return nil, "", err
	}
	return master, console, nil
}
// closeMasterAndStd closes the descriptor behind master followed by
// descriptors 0, 1 and 2. The results of the individual closes are
// discarded and the function always returns nil.
func closeMasterAndStd(master *os.File) error {
	for _, fd := range []uintptr{master.Fd(), 0, 1, 2} {
		closefd(fd)
	}
	return nil
}
// dupSlave duplicates the pty slave onto descriptors 1 and 2.
//
// The slave itself must already be descriptor 0 (the caller closes the
// standard descriptors before opening it); any other descriptor number is
// reported as an error.
func dupSlave(slave *os.File) error {
	fd := slave.Fd()
	if fd != 0 {
		return fmt.Errorf("slave fd not 0 %d", fd)
	}
	for _, target := range []uintptr{1, 2} {
		if err := dup2(fd, target); err != nil {
			return err
		}
	}
	return nil
}
// openTerminal opens name with syscall.Open using flag and a zero mode and
// wraps the resulting descriptor in an *os.File named name.
//
// A failed open is reported as an *os.PathError whose Op is "open" and whose
// Path is name.
func openTerminal(name string, flag int) (*os.File, error) {
	fd, err := syscall.Open(name, flag, 0)
	if err != nil {
		// keyed fields keep this literal valid should os.PathError ever change
		return nil, &os.PathError{Op: "open", Path: name, Err: err}
	}
	return os.NewFile(uintptr(fd), name), nil
}

View file

@ -0,0 +1,7 @@
// +build linux,x86_64
package namespaces
// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092
const (
	// SYS_SETNS is the system call number that setns hands to
	// syscall.RawSyscall.
	SYS_SETNS = 308
)

View file

@ -0,0 +1,207 @@
package namespaces
import (
"fmt"
"log"
"os"
"path/filepath"
"syscall"
)
var (
	// default mount point options
	// (used for the proc, sysfs and shm mounts in mountSystem and when
	// remountProc / remountSys mount /proc and /sys again)
	defaults = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
)
// SetupNewMountNamespace prepares the mounts of the calling process so that
// rootfs becomes its root filesystem.
//
// In order it: marks / as a recursive slave mount, bind mounts rootfs onto
// itself (remounting it read-only when readonly is set), mounts the system
// filesystems via mountSystem, copies the device nodes, links dev/ptmx to the
// container's own devpts instance, creates the /dev symlinks, sets up
// dev/console from console, then moves rootfs onto / and chroots into it.
// The umask is set to 0022 before a successful return.
//
// The first failing step aborts the sequence and its error is returned.
func SetupNewMountNamespace(rootfs, console string, readonly bool) error {
	if err := mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
		return fmt.Errorf("mounting / as slave %s", err)
	}
	if err := mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
		return fmt.Errorf("mouting %s as bind %s", rootfs, err)
	}
	if readonly {
		if err := mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil {
			return fmt.Errorf("mounting %s as readonly %s", rootfs, err)
		}
	}
	if err := mountSystem(rootfs); err != nil {
		return fmt.Errorf("mount system %s", err)
	}
	if err := copyDevNodes(rootfs); err != nil {
		return fmt.Errorf("copy dev nodes %s", err)
	}

	ptmx := filepath.Join(rootfs, "dev/ptmx")
	if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
		return err
	}
	// A relative target is resolved from the directory that holds the link,
	// so "pts/ptmx" names <root>/dev/pts/ptmx both before and after the
	// chroot. An absolute host path built from rootfs would dangle once the
	// root changes.
	if err := os.Symlink("pts/ptmx", ptmx); err != nil {
		return fmt.Errorf("symlink dev ptmx %s", err)
	}

	if err := setupDev(rootfs); err != nil {
		return err
	}
	if err := setupConsole(rootfs, console); err != nil {
		return err
	}

	// switch roots: move the prepared tree onto / and chroot into it
	if err := chdir(rootfs); err != nil {
		return fmt.Errorf("chdir into %s %s", rootfs, err)
	}
	if err := mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil {
		return fmt.Errorf("mount move %s into / %s", rootfs, err)
	}
	if err := chroot("."); err != nil {
		return fmt.Errorf("chroot . %s", err)
	}
	if err := chdir("/"); err != nil {
		return fmt.Errorf("chdir / %s", err)
	}

	// copyDevNodes and setupConsole cleared the umask; set it to 0022
	umask(0022)
	return nil
}
// copyDevNodes recreates a fixed list of host device nodes from /dev under
// rootfs/dev, reusing each host node's mode and device number.
//
// The umask is cleared first. A node that already exists at the destination
// is not treated as an error; any other failure stops the copy.
func copyDevNodes(rootfs string) error {
	umask(0000)

	names := []string{"null", "zero", "full", "random", "urandom", "tty"}
	for _, name := range names {
		info, err := os.Stat(filepath.Join("/dev", name))
		if err != nil {
			return err
		}
		target := filepath.Join(rootfs, "dev", name)
		sys := info.Sys().(*syscall.Stat_t)

		log.Printf("copy %s to %s %d\n", name, target, sys.Rdev)
		err = mknod(target, sys.Mode, int(sys.Rdev))
		if err != nil && !os.IsExist(err) {
			return fmt.Errorf("copy %s %s", name, err)
		}
	}
	return nil
}
// setupDev (re)creates a fixed set of symlinks below rootfs/dev that point
// into /proc: core, fd, stdin, stdout and stderr.
//
// An existing entry at a link's location is removed first; a missing entry is
// fine. The first removal or symlink failure is returned.
func setupDev(rootfs string) error {
	// each pair is {symlink target, link location relative to rootfs}
	pairs := [][2]string{
		{"/proc/kcore", "/dev/core"},
		{"/proc/self/fd", "/dev/fd"},
		{"/proc/self/fd/0", "/dev/stdin"},
		{"/proc/self/fd/1", "/dev/stdout"},
		{"/proc/self/fd/2", "/dev/stderr"},
	}
	for _, pair := range pairs {
		target, location := pair[0], filepath.Join(rootfs, pair[1])

		err := os.Remove(location)
		if err != nil && !os.IsNotExist(err) {
			return fmt.Errorf("remove %s %s", location, err)
		}
		if err = os.Symlink(target, location); err != nil {
			return fmt.Errorf("symlink %s %s", location, err)
		}
	}
	return nil
}
// setupConsole exposes the pty slave at path console as rootfs/dev/console.
//
// After clearing the umask it stats console, removes any existing
// dev/console, restricts console to mode 0600 owned by 0:0, creates a node at
// dev/console carrying console's device number with permission bits 0600, and
// finally bind mounts console over that node.
func setupConsole(rootfs, console string) error {
	umask(0000)

	info, err := os.Stat(console)
	if err != nil {
		return fmt.Errorf("stat console %s %s", console, err)
	}
	var (
		sys    = info.Sys().(*syscall.Stat_t)
		target = filepath.Join(rootfs, "dev/console")
	)

	if err = os.Remove(target); err != nil && !os.IsNotExist(err) {
		return fmt.Errorf("remove %s %s", target, err)
	}
	if err = os.Chmod(console, 0600); err != nil {
		return err
	}
	if err = os.Chown(console, 0, 0); err != nil {
		return err
	}

	// keep the file-type bits of the original node, replace the permissions
	nodeMode := (sys.Mode &^ 07777) | 0600
	if err = mknod(target, nodeMode, int(sys.Rdev)); err != nil {
		return fmt.Errorf("mknod %s %s", target, err)
	}
	if err = mount(console, target, "bind", syscall.MS_BIND, ""); err != nil {
		return fmt.Errorf("bind %s to %s %s", console, target, err)
	}
	return nil
}
// mountSystem performs the system mounts a container expects below rootfs:
// proc, sys, a tmpfs on dev, dev/shm, a new devpts instance on dev/pts and a
// tmpfs on run. Each mount point is created with MkdirAll before mounting,
// and the first failure aborts the sequence.
func mountSystem(rootfs string) error {
	type systemMount struct {
		source string
		path   string
		device string
		flags  int
		data   string
	}
	under := func(elem ...string) string {
		return filepath.Join(append([]string{rootfs}, elem...)...)
	}

	table := []systemMount{
		{"proc", under("proc"), "proc", defaults, ""},
		{"sysfs", under("sys"), "sysfs", defaults, ""},
		{"tmpfs", under("dev"), "tmpfs", syscall.MS_NOSUID | syscall.MS_STRICTATIME, "mode=755"},
		{"shm", under("dev", "shm"), "tmpfs", defaults, "mode=1777"},
		{"devpts", under("dev", "pts"), "devpts", syscall.MS_NOSUID | syscall.MS_NOEXEC, "newinstance,ptmxmode=0666,mode=620,gid=5"},
		{"tmpfs", under("run"), "tmpfs", syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_STRICTATIME, "mode=755"},
	}

	for i := range table {
		entry := &table[i]
		if err := os.MkdirAll(entry.path, 0755); err != nil && !os.IsExist(err) {
			return fmt.Errorf("mkdirall %s %s", entry.path, err)
		}
		err := mount(entry.source, entry.path, entry.device, uintptr(entry.flags), entry.data)
		if err != nil {
			return fmt.Errorf("mounting %s into %s %s", entry.source, entry.path, err)
		}
	}
	return nil
}
// remountProc detaches whatever is mounted on /proc and mounts a fresh proc
// filesystem there with the default mount options. The first error stops the
// sequence and is returned.
func remountProc() error {
	err := unmount("/proc", syscall.MNT_DETACH)
	if err == nil {
		err = mount("proc", "/proc", "proc", uintptr(defaults), "")
	}
	return err
}
// remountSys detaches /sys and, when that succeeds, mounts a fresh sysfs
// there with the default mount options.
//
// An EINVAL from the unmount is tolerated: nothing is mounted and nil is
// returned. Any other unmount error is returned as is.
func remountSys() error {
	err := unmount("/sys", syscall.MNT_DETACH)
	if err == nil {
		return mount("sysfs", "/sys", "sysfs", uintptr(defaults), "")
	}
	if err != syscall.EINVAL {
		return err
	}
	return nil
}

View file

@ -0,0 +1,70 @@
/*
TODO
pivot root
cgroups
more mount stuff that I probably am forgetting
apparmor
*/
package namespaces
import (
"fmt"
"github.com/dotcloud/docker/pkg/libcontainer"
"github.com/dotcloud/docker/pkg/libcontainer/utils"
"os"
"path/filepath"
"syscall"
)
// CreateNewNamespace creates a new namespace and binds its fd to the specified path
//
// The work happens in a forked child: it unshares the flag that namespaceMap
// holds for namespace, bind mounts its own /proc/self/ns/<name> entry onto
// bindTo and exits with status 0. The calling process waits for the child
// and turns a non-zero exit status into an error.
func CreateNewNamespace(namespace libcontainer.Namespace, bindTo string) error {
	var (
		flag   = namespaceMap[namespace]
		name   = namespaceFileMap[namespace]
		nspath = filepath.Join("/proc/self/ns", name)
	)
	// TODO: perform validation on name and flag

	pid, err := fork()
	if err != nil {
		return err
	}

	if pid == 0 {
		// child: failures are written with writeError, which exits with status 1
		if err := unshare(flag); err != nil {
			writeError("unshare %s", err)
		}
		if err := mount(nspath, bindTo, "none", syscall.MS_BIND, ""); err != nil {
			writeError("bind mount %s", err)
		}
		os.Exit(0)
	}
	exit, err := utils.WaitOnPid(pid)
	if err != nil {
		return err
	}
	if exit != 0 {
		return fmt.Errorf("exit status %d", exit)
	}
	// err is nil here: the non-nil case returned above
	return err
}
// JoinExistingNamespace makes the current process join the namespace behind
// fd. The flag that namespaceMap holds for ns is passed to setns as the
// namespace type; the empty Namespace maps to 0.
func JoinExistingNamespace(fd uintptr, ns libcontainer.Namespace) error {
	return setns(fd, uintptr(namespaceMap[ns]))
}
// getNamespaceFlags ORs together the namespaceMap flag of every entry in
// namespaces, producing the bit mask handed to clone, unshare and setns.
// An empty list yields 0.
func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) {
	combined := 0
	for _, entry := range namespaces {
		combined = combined | namespaceMap[entry]
	}
	return combined
}

View file

@ -0,0 +1,35 @@
package namespaces
import (
"github.com/dotcloud/docker/pkg/libcontainer"
)
// Flag values combined into the argument of the clone and unshare system
// calls by this package.
const (
	// SIGCHLD is ORed into the clone flags by Exec as the child's
	// termination signal. On Linux x86_64 (the architecture the setns syscall
	// number in this package is defined for) SIGCHLD is 17; 0x14 (20) is
	// SIGTSTP there.
	SIGCHLD = 0x11
	// CLONE_VFORK is added by Exec to the clone flags.
	CLONE_VFORK = 0x00004000
	// The CLONE_NEW* values are the per-namespace flags stored in namespaceMap.
	CLONE_NEWNS   = 0x00020000
	CLONE_NEWUTS  = 0x04000000
	CLONE_NEWIPC  = 0x08000000
	CLONE_NEWUSER = 0x10000000
	CLONE_NEWPID  = 0x20000000
	CLONE_NEWNET  = 0x40000000
)
// namespaceMap translates a libcontainer.Namespace into the numeric flag
// defined in this package. The empty Namespace maps to 0, which is what
// JoinExistingNamespace passes to setns when no specific type is given.
var namespaceMap = map[libcontainer.Namespace]int{
	"":                         0,
	libcontainer.CLONE_NEWNS:   CLONE_NEWNS,
	libcontainer.CLONE_NEWUTS:  CLONE_NEWUTS,
	libcontainer.CLONE_NEWIPC:  CLONE_NEWIPC,
	libcontainer.CLONE_NEWUSER: CLONE_NEWUSER,
	libcontainer.CLONE_NEWPID:  CLONE_NEWPID,
	libcontainer.CLONE_NEWNET:  CLONE_NEWNET,
}
// namespaceFileMap translates a libcontainer.Namespace into the file name
// this package joins onto a /proc/<pid>/ns (or /proc/self/ns) directory to
// reach that namespace.
var namespaceFileMap = map[libcontainer.Namespace]string{
	libcontainer.CLONE_NEWNS:   "mnt",
	libcontainer.CLONE_NEWUTS:  "uts",
	libcontainer.CLONE_NEWIPC:  "ipc",
	libcontainer.CLONE_NEWUSER: "user",
	libcontainer.CLONE_NEWPID:  "pid",
	libcontainer.CLONE_NEWNET:  "net",
}

View file

@ -0,0 +1,108 @@
package namespaces
import (
"fmt"
"github.com/dotcloud/docker/pkg/libcontainer"
"os"
"path/filepath"
"strconv"
"strings"
"syscall"
)
// addEnvIfNotSet appends "key=value" to the container command's environment
// unless an entry for key is already present.
//
// An existing entry matches when the text before its first "=" (or the whole
// entry when it has none) equals key. An empty environment is replaced by a
// new single-element slice.
func addEnvIfNotSet(container *libcontainer.Container, key, value string) {
	entry := fmt.Sprintf("%s=%s", key, value)
	if len(container.Command.Env) == 0 {
		container.Command.Env = []string{entry}
		return
	}

	for _, existing := range container.Command.Env {
		name := existing
		if cut := strings.Index(existing, "="); cut >= 0 {
			name = existing[:cut]
		}
		if name == key {
			return
		}
	}
	container.Command.Env = append(container.Command.Env, entry)
}
// writeError prints an error to stderr and exits with status 1.
//
// format and v are passed to fmt.Fprintf as is; no newline is appended.
// The function never returns to its caller.
func writeError(format string, v ...interface{}) {
	fmt.Fprintf(os.Stderr, format, v...)
	os.Exit(1)
}
// getNsFds opens one descriptor per namespace configured on the container,
// reading them from the ns directory of the process container.NsPid.
//
// The descriptors are returned in configuration order. If any open fails,
// the descriptors opened so far are closed and the error is returned with a
// nil slice.
func getNsFds(container *libcontainer.Container) ([]uintptr, error) {
	opened := []uintptr{}
	for _, ns := range container.Namespaces {
		fd, err := getNsFd(container.NsPid, namespaceFileMap[ns])
		if err == nil {
			opened = append(opened, fd)
			continue
		}
		// undo the opens that already succeeded before reporting the failure
		for _, earlier := range opened {
			syscall.Close(int(earlier))
		}
		return nil, err
	}
	return opened, nil
}
// getNsFd returns the fd for a specific pid and namespace option.
//
// The namespace file /proc/<pid>/ns/<ns> is opened read-only with O_CLOEXEC
// directly through syscall.Open. The descriptor is deliberately not wrapped
// in an *os.File: such a wrapper would be unreachable as soon as this
// function returns, and its finalizer could close the descriptor while the
// caller is still using it.
//
// On failure it returns 0 and an *os.PathError with Op "open".
func getNsFd(pid int, ns string) (uintptr, error) {
	nspath := filepath.Join("/proc", strconv.Itoa(pid), "ns", ns)
	for {
		fd, err := syscall.Open(nspath, syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
		if err == syscall.EINTR {
			// interrupted before the open completed; try again
			continue
		}
		if err != nil {
			return 0, &os.PathError{Op: "open", Path: nspath, Err: err}
		}
		return uintptr(fd), nil
	}
}
// setupEnvironment fills in default environment variables on the
// container's Command: container, TERM, USER and LOGNAME. A variable that
// is already present is left alone.
func setupEnvironment(container *libcontainer.Container) {
	fallbacks := [][2]string{
		{"container", "docker"},
		// TODO: check if pty
		{"TERM", "xterm"},
		// TODO: get username from container
		{"USER", "root"},
		{"LOGNAME", "root"},
	}
	for _, kv := range fallbacks {
		addEnvIfNotSet(container, kv[0], kv[1])
	}
}
// setupUser drops all supplementary groups and then sets the real,
// effective and saved group and user ids to 0, in that order. The first
// failing call ends the sequence and its error is returned.
func setupUser(container *libcontainer.Container) error {
	// TODO: honor user passed on container
	err := setgroups(nil)
	if err == nil {
		err = setresgid(0, 0, 0)
	}
	if err == nil {
		err = setresuid(0, 0, 0)
	}
	return err
}
// getMasterAndConsole opens a pty master through openpmtx and looks up the
// path of its slave with ptsname.
//
// It returns the slave path followed by the open master. When the lookup
// fails the master is closed before the error is returned. The container
// argument is not consulted.
func getMasterAndConsole(container *libcontainer.Container) (string, *os.File, error) {
	ptm, err := openpmtx()
	if err != nil {
		return "", nil, err
	}
	slavePath, err := ptsname(ptm)
	if err == nil {
		return slavePath, ptm, nil
	}
	ptm.Close()
	return "", nil, err
}