diff --git a/vendor/github.com/opencontainers/runc/libcontainer/capabilities_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/capabilities_linux.go new file mode 100644 index 00000000..8981b2a2 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/capabilities_linux.go @@ -0,0 +1,114 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/syndtr/gocapability/capability" +) + +const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS + +var capabilityMap map[string]capability.Cap + +func init() { + capabilityMap = make(map[string]capability.Cap) + last := capability.CAP_LAST_CAP + // workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap + if last == capability.Cap(63) { + last = capability.CAP_BLOCK_SUSPEND + } + for _, cap := range capability.List() { + if cap > last { + continue + } + capKey := fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String())) + capabilityMap[capKey] = cap + } +} + +func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilities, error) { + bounding := []capability.Cap{} + for _, c := range capConfig.Bounding { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + bounding = append(bounding, v) + } + effective := []capability.Cap{} + for _, c := range capConfig.Effective { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + effective = append(effective, v) + } + inheritable := []capability.Cap{} + for _, c := range capConfig.Inheritable { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + inheritable = append(inheritable, v) + } + permitted := []capability.Cap{} + for _, c := range capConfig.Permitted { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + permitted = append(permitted, v) + } + ambient := []capability.Cap{} + for _, c := range capConfig.Ambient { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + ambient = append(ambient, v) + } + pid, err := capability.NewPid(os.Getpid()) + if err != nil { + return nil, err + } + return &containerCapabilities{ + bounding: bounding, + effective: effective, + inheritable: inheritable, + permitted: permitted, + ambient: ambient, + pid: pid, + }, nil +} + +type containerCapabilities struct { + pid capability.Capabilities + bounding []capability.Cap + effective []capability.Cap + inheritable []capability.Cap + permitted []capability.Cap + ambient []capability.Cap +} + +// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist. +func (c *containerCapabilities) ApplyBoundingSet() error { + c.pid.Clear(capability.BOUNDS) + c.pid.Set(capability.BOUNDS, c.bounding...) + return c.pid.Apply(capability.BOUNDS) +} + +// Apply sets all the capabilities for the current process in the config. +func (c *containerCapabilities) ApplyCaps() error { + c.pid.Clear(allCapabilityTypes) + c.pid.Set(capability.BOUNDS, c.bounding...) + c.pid.Set(capability.PERMITTED, c.permitted...) + c.pid.Set(capability.INHERITABLE, c.inheritable...) + c.pid.Set(capability.EFFECTIVE, c.effective...) + c.pid.Set(capability.AMBIENT, c.ambient...) + return c.pid.Apply(allCapabilityTypes) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/rootless/rootless.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/rootless/rootless.go new file mode 100644 index 00000000..b1efbfd9 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/rootless/rootless.go @@ -0,0 +1,128 @@ +// +build linux + +package rootless + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/configs/validate" +) + +// TODO: This is copied from libcontainer/cgroups/fs, which duplicates this code +// needlessly. We should probably export this list. + +var subsystems = []subsystem{ + &fs.CpusetGroup{}, + &fs.DevicesGroup{}, + &fs.MemoryGroup{}, + &fs.CpuGroup{}, + &fs.CpuacctGroup{}, + &fs.PidsGroup{}, + &fs.BlkioGroup{}, + &fs.HugetlbGroup{}, + &fs.NetClsGroup{}, + &fs.NetPrioGroup{}, + &fs.PerfEventGroup{}, + &fs.FreezerGroup{}, + &fs.NameGroup{GroupName: "name=systemd"}, +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + + // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error +} + +// The noop cgroup manager is used for rootless containers, because we currently +// cannot manage cgroups if we are in a rootless setup. This manager is chosen +// by factory if we are in rootless mode. We error out if any cgroup options are +// set in the config -- this may change in the future with upcoming kernel features +// like the cgroup namespace. + +type Manager struct { + Cgroups *configs.Cgroup + Paths map[string]string +} + +func (m *Manager) Apply(pid int) error { + // If there are no cgroup settings, there's nothing to do. + if m.Cgroups == nil { + return nil + } + + // We can't set paths. + // TODO(cyphar): Implement the case where the runner of a rootless container + // owns their own cgroup, which would allow us to set up a + // cgroup for each path. + if m.Cgroups.Paths != nil { + return fmt.Errorf("cannot change cgroup path in rootless container") + } + + // We load the paths into the manager. + paths := make(map[string]string) + for _, sys := range subsystems { + name := sys.Name() + + path, err := cgroups.GetOwnCgroupPath(name) + if err != nil { + // Ignore paths we couldn't resolve. + continue + } + + paths[name] = path + } + + m.Paths = paths + return nil +} + +func (m *Manager) GetPaths() map[string]string { + return m.Paths +} + +func (m *Manager) Set(container *configs.Config) error { + // We have to re-do the validation here, since someone might decide to + // update a rootless container. + return validate.New().Validate(container) +} + +func (m *Manager) GetPids() ([]int, error) { + dir, err := cgroups.GetOwnCgroupPath("devices") + if err != nil { + return nil, err + } + return cgroups.GetPids(dir) +} + +func (m *Manager) GetAllPids() ([]int, error) { + dir, err := cgroups.GetOwnCgroupPath("devices") + if err != nil { + return nil, err + } + return cgroups.GetAllPids(dir) +} + +func (m *Manager) GetStats() (*cgroups.Stats, error) { + // TODO(cyphar): We can make this work if we figure out a way to allow usage + // of cgroups with a rootless container. While this doesn't + // actually require write access to a cgroup directory, the + // statistics are not useful if they can be affected by + // non-container processes. + return nil, fmt.Errorf("cannot get cgroup stats in rootless container") +} + +func (m *Manager) Freeze(state configs.FreezerState) error { + // TODO(cyphar): We can make this work if we figure out a way to allow usage + // of cgroups with a rootless container. + return fmt.Errorf("cannot use freezer cgroup in rootless container") +} + +func (m *Manager) Destroy() error { + // We don't have to do anything here because we didn't do any setup. + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/compat_1.5_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/compat_1.5_linux.go new file mode 100644 index 00000000..c7bdf1f6 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/compat_1.5_linux.go @@ -0,0 +1,10 @@ +// +build linux,!go1.5 + +package libcontainer + +import "syscall" + +// GidMappingsEnableSetgroups was added in Go 1.5, so do nothing when building +// with earlier versions +func enableSetgroups(sys *syscall.SysProcAttr) { +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go new file mode 100644 index 00000000..0cebfaf8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go @@ -0,0 +1,117 @@ +package validate + +import ( + "fmt" + "os" + "reflect" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var ( + geteuid = os.Geteuid + getegid = os.Getegid +) + +func (v *ConfigValidator) rootless(config *configs.Config) error { + if err := rootlessMappings(config); err != nil { + return err + } + if err := rootlessMount(config); err != nil { + return err + } + // Currently, cgroups cannot effectively be used in rootless containers. + // The new cgroup namespace doesn't really help us either because it doesn't + // have nice interactions with the user namespace (we're working with upstream + // to fix this). + if err := rootlessCgroup(config); err != nil { + return err + } + + // XXX: We currently can't verify the user config at all, because + // configs.Config doesn't store the user-related configs. So this + // has to be verified by setupUser() in init_linux.go. + + return nil +} + +func rootlessMappings(config *configs.Config) error { + rootuid, err := config.HostRootUID() + if err != nil { + return fmt.Errorf("failed to get root uid from uidMappings: %v", err) + } + if euid := geteuid(); euid != 0 { + if !config.Namespaces.Contains(configs.NEWUSER) { + return fmt.Errorf("rootless containers require user namespaces") + } + if rootuid != euid { + return fmt.Errorf("rootless containers cannot map container root to a different host user") + } + } + + rootgid, err := config.HostRootGID() + if err != nil { + return fmt.Errorf("failed to get root gid from gidMappings: %v", err) + } + + // Similar to the above test, we need to make sure that we aren't trying to + // map to a group ID that we don't have the right to be. + if rootgid != getegid() { + return fmt.Errorf("rootless containers cannot map container root to a different host group") + } + + // We can only map one user and group inside a container (our own). + if len(config.UidMappings) != 1 || config.UidMappings[0].Size != 1 { + return fmt.Errorf("rootless containers cannot map more than one user") + } + if len(config.GidMappings) != 1 || config.GidMappings[0].Size != 1 { + return fmt.Errorf("rootless containers cannot map more than one group") + } + + return nil +} + +// cgroup verifies that the user isn't trying to set any cgroup limits or paths. +func rootlessCgroup(config *configs.Config) error { + // Nothing set at all. + if config.Cgroups == nil || config.Cgroups.Resources == nil { + return nil + } + + // Used for comparing to the zero value. + left := reflect.ValueOf(*config.Cgroups.Resources) + right := reflect.Zero(left.Type()) + + // This is all we need to do, since specconv won't add cgroup options in + // rootless mode. + if !reflect.DeepEqual(left.Interface(), right.Interface()) { + return fmt.Errorf("cannot specify resource limits in rootless container") + } + + return nil +} + +// mount verifies that the user isn't trying to set up any mounts they don't have +// the rights to do. In addition, it makes sure that no mount has a `uid=` or +// `gid=` option that doesn't resolve to root. +func rootlessMount(config *configs.Config) error { + // XXX: We could whitelist allowed devices at this point, but I'm not + // convinced that's a good idea. The kernel is the best arbiter of + // access control. + + for _, mount := range config.Mounts { + // Check that the options list doesn't contain any uid= or gid= entries + // that don't resolve to root. + for _, opt := range strings.Split(mount.Data, ",") { + if strings.HasPrefix(opt, "uid=") && opt != "uid=0" { + return fmt.Errorf("cannot specify uid= mount options in rootless containers where argument isn't 0") + } + if strings.HasPrefix(opt, "gid=") && opt != "gid=0" { + return fmt.Errorf("cannot specify gid= mount options in rootless containers where argument isn't 0") + } + } + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go new file mode 100644 index 00000000..82843454 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go @@ -0,0 +1,195 @@ +package validate + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + selinux "github.com/opencontainers/selinux/go-selinux" +) + +type Validator interface { + Validate(*configs.Config) error +} + +func New() Validator { + return &ConfigValidator{} +} + +type ConfigValidator struct { +} + +func (v *ConfigValidator) Validate(config *configs.Config) error { + if err := v.rootfs(config); err != nil { + return err + } + if err := v.network(config); err != nil { + return err + } + if err := v.hostname(config); err != nil { + return err + } + if err := v.security(config); err != nil { + return err + } + if err := v.usernamespace(config); err != nil { + return err + } + if err := v.sysctl(config); err != nil { + return err + } + if config.Rootless { + if err := v.rootless(config); err != nil { + return err + } + } + return nil +} + +// rootfs validates if the rootfs is an absolute path and is not a symlink +// to the container's root filesystem. +func (v *ConfigValidator) rootfs(config *configs.Config) error { + if _, err := os.Stat(config.Rootfs); err != nil { + if os.IsNotExist(err) { + return fmt.Errorf("rootfs (%s) does not exist", config.Rootfs) + } + return err + } + cleaned, err := filepath.Abs(config.Rootfs) + if err != nil { + return err + } + if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil { + return err + } + if filepath.Clean(config.Rootfs) != cleaned { + return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs) + } + return nil +} + +func (v *ConfigValidator) network(config *configs.Config) error { + if !config.Namespaces.Contains(configs.NEWNET) { + if len(config.Networks) > 0 || len(config.Routes) > 0 { + return fmt.Errorf("unable to apply network settings without a private NET namespace") + } + } + return nil +} + +func (v *ConfigValidator) hostname(config *configs.Config) error { + if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) { + return fmt.Errorf("unable to set hostname without a private UTS namespace") + } + return nil +} + +func (v *ConfigValidator) security(config *configs.Config) error { + // restrict sys without mount namespace + if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) && + !config.Namespaces.Contains(configs.NEWNS) { + return fmt.Errorf("unable to restrict sys entries without a private MNT namespace") + } + if config.ProcessLabel != "" && !selinux.GetEnabled() { + return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported") + } + + return nil +} + +func (v *ConfigValidator) usernamespace(config *configs.Config) error { + if config.Namespaces.Contains(configs.NEWUSER) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + return fmt.Errorf("USER namespaces aren't enabled in the kernel") + } + } else { + if config.UidMappings != nil || config.GidMappings != nil { + return fmt.Errorf("User namespace mappings specified, but USER namespace isn't enabled in the config") + } + } + return nil +} + +// sysctl validates that the specified sysctl keys are valid or not. +// /proc/sys isn't completely namespaced and depending on which namespaces +// are specified, a subset of sysctls are permitted. +func (v *ConfigValidator) sysctl(config *configs.Config) error { + validSysctlMap := map[string]bool{ + "kernel.msgmax": true, + "kernel.msgmnb": true, + "kernel.msgmni": true, + "kernel.sem": true, + "kernel.shmall": true, + "kernel.shmmax": true, + "kernel.shmmni": true, + "kernel.shm_rmid_forced": true, + } + + for s := range config.Sysctl { + if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") { + if config.Namespaces.Contains(configs.NEWIPC) { + continue + } else { + return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s) + } + } + if strings.HasPrefix(s, "net.") { + if config.Namespaces.Contains(configs.NEWNET) { + if path := config.Namespaces.PathOf(configs.NEWNET); path != "" { + if err := checkHostNs(s, path); err != nil { + return err + } + } + continue + } else { + return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s) + } + } + return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s) + } + + return nil +} + +func isSymbolicLink(path string) (bool, error) { + fi, err := os.Lstat(path) + if err != nil { + return false, err + } + + return fi.Mode()&os.ModeSymlink == os.ModeSymlink, nil +} + +// checkHostNs checks whether network sysctl is used in host namespace. +func checkHostNs(sysctlConfig string, path string) error { + var currentProcessNetns = "/proc/self/ns/net" + // readlink on the current processes network namespace + destOfCurrentProcess, err := os.Readlink(currentProcessNetns) + if err != nil { + return fmt.Errorf("read soft link %q error", currentProcessNetns) + } + + // First check if the provided path is a symbolic link + symLink, err := isSymbolicLink(path) + if err != nil { + return fmt.Errorf("could not check that %q is a symlink: %v", path, err) + } + + if symLink == false { + // The provided namespace is not a symbolic link, + // it is not the host namespace. + return nil + } + + // readlink on the path provided in the struct + destOfContainer, err := os.Readlink(path) + if err != nil { + return fmt.Errorf("read soft link %q error", path) + } + if destOfContainer == destOfCurrentProcess { + return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", sysctlConfig) + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console.go b/vendor/github.com/opencontainers/runc/libcontainer/console.go new file mode 100644 index 00000000..917acc70 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/console.go @@ -0,0 +1,17 @@ +package libcontainer + +import ( + "io" + "os" +) + +// Console represents a pseudo TTY. +type Console interface { + io.ReadWriteCloser + + // Path returns the filesystem path to the slave side of the pty. + Path() string + + // Fd returns the fd for the master of the pty. + File() *os.File +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_freebsd.go b/vendor/github.com/opencontainers/runc/libcontainer/console_freebsd.go new file mode 100644 index 00000000..b7166a31 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/console_freebsd.go @@ -0,0 +1,13 @@ +// +build freebsd + +package libcontainer + +import ( + "errors" +) + +// newConsole returns an initialized console that can be used within a container by copying bytes +// from the master side to the slave that is attached as the tty for the container's init process. +func newConsole() (Console, error) { + return nil, errors.New("libcontainer console is not supported on FreeBSD") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go new file mode 100644 index 00000000..f70de384 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go @@ -0,0 +1,152 @@ +package libcontainer + +import ( + "fmt" + "os" + "unsafe" + + "golang.org/x/sys/unix" +) + +func ConsoleFromFile(f *os.File) Console { + return &linuxConsole{ + master: f, + } +} + +// newConsole returns an initialized console that can be used within a container by copying bytes +// from the master side to the slave that is attached as the tty for the container's init process. +func newConsole() (Console, error) { + master, err := os.OpenFile("/dev/ptmx", unix.O_RDWR|unix.O_NOCTTY|unix.O_CLOEXEC, 0) + if err != nil { + return nil, err + } + console, err := ptsname(master) + if err != nil { + return nil, err + } + if err := unlockpt(master); err != nil { + return nil, err + } + return &linuxConsole{ + slavePath: console, + master: master, + }, nil +} + +// linuxConsole is a linux pseudo TTY for use within a container. +type linuxConsole struct { + master *os.File + slavePath string +} + +func (c *linuxConsole) File() *os.File { + return c.master +} + +func (c *linuxConsole) Path() string { + return c.slavePath +} + +func (c *linuxConsole) Read(b []byte) (int, error) { + return c.master.Read(b) +} + +func (c *linuxConsole) Write(b []byte) (int, error) { + return c.master.Write(b) +} + +func (c *linuxConsole) Close() error { + if m := c.master; m != nil { + return m.Close() + } + return nil +} + +// mount initializes the console inside the rootfs mounting with the specified mount label +// and applying the correct ownership of the console. +func (c *linuxConsole) mount() error { + oldMask := unix.Umask(0000) + defer unix.Umask(oldMask) + f, err := os.Create("/dev/console") + if err != nil && !os.IsExist(err) { + return err + } + if f != nil { + f.Close() + } + return unix.Mount(c.slavePath, "/dev/console", "bind", unix.MS_BIND, "") +} + +// dupStdio opens the slavePath for the console and dups the fds to the current +// processes stdio, fd 0,1,2. +func (c *linuxConsole) dupStdio() error { + slave, err := c.open(unix.O_RDWR) + if err != nil { + return err + } + fd := int(slave.Fd()) + for _, i := range []int{0, 1, 2} { + if err := unix.Dup3(fd, i, 0); err != nil { + return err + } + } + return nil +} + +// open is a clone of os.OpenFile without the O_CLOEXEC used to open the pty slave. +func (c *linuxConsole) open(flag int) (*os.File, error) { + r, e := unix.Open(c.slavePath, flag, 0) + if e != nil { + return nil, &os.PathError{ + Op: "open", + Path: c.slavePath, + Err: e, + } + } + return os.NewFile(uintptr(r), c.slavePath), nil +} + +func ioctl(fd uintptr, flag, data uintptr) error { + if _, _, err := unix.Syscall(unix.SYS_IOCTL, fd, flag, data); err != 0 { + return err + } + return nil +} + +// unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f. +// unlockpt should be called before opening the slave side of a pty. +func unlockpt(f *os.File) error { + var u int32 + return ioctl(f.Fd(), unix.TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) +} + +// ptsname retrieves the name of the first available pts for the given master. +func ptsname(f *os.File) (string, error) { + n, err := unix.IoctlGetInt(int(f.Fd()), unix.TIOCGPTN) + if err != nil { + return "", err + } + return fmt.Sprintf("/dev/pts/%d", n), nil +} + +// SaneTerminal sets the necessary tty_ioctl(4)s to ensure that a pty pair +// created by us acts normally. In particular, a not-very-well-known default of +// Linux unix98 ptys is that they have +onlcr by default. While this isn't a +// problem for terminal emulators, because we relay data from the terminal we +// also relay that funky line discipline. +func SaneTerminal(terminal *os.File) error { + termios, err := unix.IoctlGetTermios(int(terminal.Fd()), unix.TCGETS) + if err != nil { + return fmt.Errorf("ioctl(tty, tcgets): %s", err.Error()) + } + + // Set -onlcr so we don't have to deal with \r. + termios.Oflag &^= unix.ONLCR + + if err := unix.IoctlSetTermios(int(terminal.Fd()), unix.TCSETS, termios); err != nil { + return fmt.Errorf("ioctl(tty, tcsets): %s", err.Error()) + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go new file mode 100644 index 00000000..e5ca5459 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go @@ -0,0 +1,11 @@ +package libcontainer + +import ( + "errors" +) + +// newConsole returns an initialized console that can be used within a container by copying bytes +// from the master side to the slave that is attached as the tty for the container's init process. +func newConsole() (Console, error) { + return nil, errors.New("libcontainer console is not supported on Solaris") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/console_windows.go new file mode 100644 index 00000000..c61e866a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/console_windows.go @@ -0,0 +1,30 @@ +package libcontainer + +// newConsole returns an initialized console that can be used within a container +func newConsole() (Console, error) { + return &windowsConsole{}, nil +} + +// windowsConsole is a Windows pseudo TTY for use within a container. +type windowsConsole struct { +} + +func (c *windowsConsole) Fd() uintptr { + return 0 +} + +func (c *windowsConsole) Path() string { + return "" +} + +func (c *windowsConsole) Read(b []byte) (int, error) { + return 0, nil +} + +func (c *windowsConsole) Write(b []byte) (int, error) { + return 0, nil +} + +func (c *windowsConsole) Close() error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container.go b/vendor/github.com/opencontainers/runc/libcontainer/container.go new file mode 100644 index 00000000..2e31b4d4 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/container.go @@ -0,0 +1,166 @@ +// Package libcontainer provides a native Go implementation for creating containers +// with namespaces, cgroups, capabilities, and filesystem access controls. +// It allows you to manage the lifecycle of the container performing additional operations +// after the container is created. +package libcontainer + +import ( + "os" + "time" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +// Status is the status of a container. +type Status int + +const ( + // Created is the status that denotes the container exists but has not been run yet. + Created Status = iota + // Running is the status that denotes the container exists and is running. + Running + // Pausing is the status that denotes the container exists, it is in the process of being paused. + Pausing + // Paused is the status that denotes the container exists, but all its processes are paused. + Paused + // Stopped is the status that denotes the container does not have a created or running process. + Stopped +) + +func (s Status) String() string { + switch s { + case Created: + return "created" + case Running: + return "running" + case Pausing: + return "pausing" + case Paused: + return "paused" + case Stopped: + return "stopped" + default: + return "unknown" + } +} + +// BaseState represents the platform agnostic pieces relating to a +// running container's state +type BaseState struct { + // ID is the container ID. + ID string `json:"id"` + + // InitProcessPid is the init process id in the parent namespace. + InitProcessPid int `json:"init_process_pid"` + + // InitProcessStartTime is the init process start time in clock cycles since boot time. + InitProcessStartTime uint64 `json:"init_process_start"` + + // Created is the unix timestamp for the creation time of the container in UTC + Created time.Time `json:"created"` + + // Config is the container's configuration. + Config configs.Config `json:"config"` +} + +// BaseContainer is a libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. BaseContainer includes methods that are platform agnostic. +type BaseContainer interface { + // Returns the ID of the container + ID() string + + // Returns the current status of the container. + // + // errors: + // ContainerNotExists - Container no longer exists, + // Systemerror - System error. + Status() (Status, error) + + // State returns the current container's state information. + // + // errors: + // SystemError - System error. + State() (*State, error) + + // Returns the current config of the container. + Config() configs.Config + + // Returns the PIDs inside this container. The PIDs are in the namespace of the calling process. + // + // errors: + // ContainerNotExists - Container no longer exists, + // Systemerror - System error. + // + // Some of the returned PIDs may no longer refer to processes in the Container, unless + // the Container state is PAUSED in which case every PID in the slice is valid. + Processes() ([]int, error) + + // Returns statistics for the container. + // + // errors: + // ContainerNotExists - Container no longer exists, + // Systemerror - System error. + Stats() (*Stats, error) + + // Set resources of container as configured + // + // We can use this to change resources when containers are running. + // + // errors: + // SystemError - System error. + Set(config configs.Config) error + + // Start a process inside the container. Returns error if process fails to + // start. You can track process lifecycle with passed Process structure. + // + // errors: + // ContainerNotExists - Container no longer exists, + // ConfigInvalid - config is invalid, + // ContainerPaused - Container is paused, + // SystemError - System error. + Start(process *Process) (err error) + + // Run immediately starts the process inside the container. Returns error if process + // fails to start. It does not block waiting for the exec fifo after start returns but + // opens the fifo after start returns. + // + // errors: + // ContainerNotExists - Container no longer exists, + // ConfigInvalid - config is invalid, + // ContainerPaused - Container is paused, + // SystemError - System error. + Run(process *Process) (err error) + + // Destroys the container, if its in a valid state, after killing any + // remaining running processes. + // + // Any event registrations are removed before the container is destroyed. + // No error is returned if the container is already destroyed. + // + // Running containers must first be stopped using Signal(..). + // Paused containers must first be resumed using Resume(..). + // + // errors: + // ContainerNotStopped - Container is still running, + // ContainerPaused - Container is paused, + // SystemError - System error. + Destroy() error + + // Signal sends the provided signal code to the container's initial process. + // + // If all is specified the signal is sent to all processes in the container + // including the initial process. + // + // errors: + // SystemError - System error. + Signal(s os.Signal, all bool) error + + // Exec signals the container to exec the users process at the end of the init. + // + // errors: + // SystemError - System error. + Exec() error +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go new file mode 100644 index 00000000..7a319ef8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go @@ -0,0 +1,1574 @@ +// +build linux + +package libcontainer + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "net" + "os" + "os/exec" + "path/filepath" + "reflect" + "strings" + "sync" + "syscall" // only for SysProcAttr and Signal + "time" + + "golang.org/x/sys/unix" + + "github.com/Sirupsen/logrus" + "github.com/golang/protobuf/proto" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/criurpc" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/syndtr/gocapability/capability" + "github.com/vishvananda/netlink/nl" +) + +const stdioFdCount = 3 + +type linuxContainer struct { + id string + root string + config *configs.Config + cgroupManager cgroups.Manager + initArgs []string + initProcess parentProcess + initProcessStartTime uint64 + criuPath string + m sync.Mutex + criuVersion int + state containerState + created time.Time +} + +// State represents a running container's state +type State struct { + BaseState + + // Platform specific fields below here + + // Specifies if the container was started under the rootless mode. + Rootless bool `json:"rootless"` + + // Path to all the cgroups setup for a container. Key is cgroup subsystem name + // with the value as the path. + CgroupPaths map[string]string `json:"cgroup_paths"` + + // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type + // with the value as the path. + NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"` + + // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore + ExternalDescriptors []string `json:"external_descriptors,omitempty"` +} + +// Container is a libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. +type Container interface { + BaseContainer + + // Methods below here are platform specific + + // Checkpoint checkpoints the running container's state to disk using the criu(8) utility. + // + // errors: + // Systemerror - System error. + Checkpoint(criuOpts *CriuOpts) error + + // Restore restores the checkpointed container to a running state using the criu(8) utility. + // + // errors: + // Systemerror - System error. + Restore(process *Process, criuOpts *CriuOpts) error + + // If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses + // the execution of any user processes. Asynchronously, when the container finished being paused the + // state is changed to PAUSED. + // If the Container state is PAUSED, do nothing. + // + // errors: + // ContainerNotExists - Container no longer exists, + // ContainerNotRunning - Container not running or created, + // Systemerror - System error. + Pause() error + + // If the Container state is PAUSED, resumes the execution of any user processes in the + // Container before setting the Container state to RUNNING. + // If the Container state is RUNNING, do nothing. + // + // errors: + // ContainerNotExists - Container no longer exists, + // ContainerNotPaused - Container is not paused, + // Systemerror - System error. + Resume() error + + // NotifyOOM returns a read-only channel signaling when the container receives an OOM notification. + // + // errors: + // Systemerror - System error. + NotifyOOM() (<-chan struct{}, error) + + // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level + // + // errors: + // Systemerror - System error. + NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) +} + +// ID returns the container's unique ID +func (c *linuxContainer) ID() string { + return c.id +} + +// Config returns the container's configuration +func (c *linuxContainer) Config() configs.Config { + return *c.config +} + +func (c *linuxContainer) Status() (Status, error) { + c.m.Lock() + defer c.m.Unlock() + return c.currentStatus() +} + +func (c *linuxContainer) State() (*State, error) { + c.m.Lock() + defer c.m.Unlock() + return c.currentState() +} + +func (c *linuxContainer) Processes() ([]int, error) { + pids, err := c.cgroupManager.GetAllPids() + if err != nil { + return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups") + } + return pids, nil +} + +func (c *linuxContainer) Stats() (*Stats, error) { + var ( + err error + stats = &Stats{} + ) + if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { + return stats, newSystemErrorWithCause(err, "getting container stats from cgroups") + } + for _, iface := range c.config.Networks { + switch iface.Type { + case "veth": + istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) + if err != nil { + return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName) + } + stats.Interfaces = append(stats.Interfaces, istats) + } + } + return stats, nil +} + +func (c *linuxContainer) Set(config configs.Config) error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + if status == Stopped { + return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) + } + c.config = &config + return c.cgroupManager.Set(c.config) +} + +func (c *linuxContainer) Start(process *Process) error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + if status == Stopped { + if err := c.createExecFifo(); err != nil { + return err + } + } + if err := c.start(process, status == Stopped); err != nil { + if status == Stopped { + c.deleteExecFifo() + } + return err + } + return nil +} + +func (c *linuxContainer) Run(process *Process) error { + c.m.Lock() + status, err := c.currentStatus() + if err != nil { + c.m.Unlock() + return err + } + c.m.Unlock() + if err := c.Start(process); err != nil { + return err + } + if status == Stopped { + return c.exec() + } + return nil +} + +func (c *linuxContainer) Exec() error { + c.m.Lock() + defer c.m.Unlock() + return c.exec() +} + +func (c *linuxContainer) exec() error { + path := filepath.Join(c.root, execFifoFilename) + f, err := os.OpenFile(path, os.O_RDONLY, 0) + if err != nil { + return newSystemErrorWithCause(err, "open exec fifo for reading") + } + defer f.Close() + data, err := ioutil.ReadAll(f) + if err != nil { + return err + } + if len(data) > 0 { + os.Remove(path) + return nil + } + return fmt.Errorf("cannot start an already running container") +} + +func (c *linuxContainer) start(process *Process, isInit bool) error { + parent, err := c.newParentProcess(process, isInit) + if err != nil { + return newSystemErrorWithCause(err, "creating new parent process") + } + if err := parent.start(); err != nil { + // terminate the process to ensure that it properly is reaped. + if err := parent.terminate(); err != nil { + logrus.Warn(err) + } + return newSystemErrorWithCause(err, "starting container process") + } + // generate a timestamp indicating when the container was started + c.created = time.Now().UTC() + if isInit { + c.state = &createdState{ + c: c, + } + state, err := c.updateState(parent) + if err != nil { + return err + } + c.initProcessStartTime = state.InitProcessStartTime + + if c.config.Hooks != nil { + s := configs.HookState{ + Version: c.config.Version, + ID: c.id, + Pid: parent.pid(), + Bundle: utils.SearchLabels(c.config.Labels, "bundle"), + } + for i, hook := range c.config.Hooks.Poststart { + if err := hook.Run(s); err != nil { + if err := parent.terminate(); err != nil { + logrus.Warn(err) + } + return newSystemErrorWithCausef(err, "running poststart hook %d", i) + } + } + } + } else { + c.state = &runningState{ + c: c, + } + } + return nil +} + +func (c *linuxContainer) Signal(s os.Signal, all bool) error { + if all { + return signalAllProcesses(c.cgroupManager, s) + } + if err := c.initProcess.signal(s); err != nil { + return newSystemErrorWithCause(err, "signaling init process") + } + return nil +} + +func (c *linuxContainer) createExecFifo() error { + rootuid, err := c.Config().HostRootUID() + if err != nil { + return err + } + rootgid, err := c.Config().HostRootGID() + if err != nil { + return err + } + + fifoName := filepath.Join(c.root, execFifoFilename) + if _, err := os.Stat(fifoName); err == nil { + return fmt.Errorf("exec fifo %s already exists", fifoName) + } + oldMask := unix.Umask(0000) + if err := unix.Mkfifo(fifoName, 0622); err != nil { + unix.Umask(oldMask) + return err + } + unix.Umask(oldMask) + if err := os.Chown(fifoName, rootuid, rootgid); err != nil { + return err + } + return nil +} + +func (c *linuxContainer) deleteExecFifo() { + fifoName := filepath.Join(c.root, execFifoFilename) + os.Remove(fifoName) +} + +func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) { + parentPipe, childPipe, err := utils.NewSockPair("init") + if err != nil { + return nil, newSystemErrorWithCause(err, "creating new init pipe") + } + cmd, err := c.commandTemplate(p, childPipe) + if err != nil { + return nil, newSystemErrorWithCause(err, "creating new command template") + } + if !doInit { + return c.newSetnsProcess(p, cmd, parentPipe, childPipe) + } + + // We only set up rootDir if we're not doing a `runc exec`. The reason for + // this is to avoid cases where a racing, unprivileged process inside the + // container can get access to the statedir file descriptor (which would + // allow for container rootfs escape). + rootDir, err := os.Open(c.root) + if err != nil { + return nil, err + } + cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) + return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) +} + +func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { + cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...) + cmd.Stdin = p.Stdin + cmd.Stdout = p.Stdout + cmd.Stderr = p.Stderr + cmd.Dir = c.config.Rootfs + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...) + if p.ConsoleSocket != nil { + cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), + ) + } + cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), + ) + // NOTE: when running a container with no PID namespace and the parent process spawning the container is + // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason + // even with the parent still running. + if c.config.ParentDeathSignal > 0 { + cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal) + } + return cmd, nil +} + +func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) + nsMaps := make(map[configs.NamespaceType]string) + for _, ns := range c.config.Namespaces { + if ns.Path != "" { + nsMaps[ns.Type] = ns.Path + } + } + _, sharePidns := nsMaps[configs.NEWPID] + data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps) + if err != nil { + return nil, err + } + return &initProcess{ + cmd: cmd, + childPipe: childPipe, + parentPipe: parentPipe, + manager: c.cgroupManager, + config: c.newInitConfig(p), + container: c, + process: p, + bootstrapData: data, + sharePidns: sharePidns, + rootDir: rootDir, + }, nil +} + +func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) + state, err := c.currentState() + if err != nil { + return nil, newSystemErrorWithCause(err, "getting container's current state") + } + // for setns process, we don't have to set cloneflags as the process namespaces + // will only be set via setns syscall + data, err := c.bootstrapData(0, state.NamespacePaths) + if err != nil { + return nil, err + } + return &setnsProcess{ + cmd: cmd, + cgroupPaths: c.cgroupManager.GetPaths(), + childPipe: childPipe, + parentPipe: parentPipe, + config: c.newInitConfig(p), + process: p, + bootstrapData: data, + }, nil +} + +func (c *linuxContainer) newInitConfig(process *Process) *initConfig { + cfg := &initConfig{ + Config: c.config, + Args: process.Args, + Env: process.Env, + User: process.User, + AdditionalGroups: process.AdditionalGroups, + Cwd: process.Cwd, + Capabilities: process.Capabilities, + PassedFilesCount: len(process.ExtraFiles), + ContainerId: c.ID(), + NoNewPrivileges: c.config.NoNewPrivileges, + Rootless: c.config.Rootless, + AppArmorProfile: c.config.AppArmorProfile, + ProcessLabel: c.config.ProcessLabel, + Rlimits: c.config.Rlimits, + } + if process.NoNewPrivileges != nil { + cfg.NoNewPrivileges = *process.NoNewPrivileges + } + if process.AppArmorProfile != "" { + cfg.AppArmorProfile = process.AppArmorProfile + } + if process.Label != "" { + cfg.ProcessLabel = process.Label + } + if len(process.Rlimits) > 0 { + cfg.Rlimits = process.Rlimits + } + cfg.CreateConsole = process.ConsoleSocket != nil + return cfg +} + +func (c *linuxContainer) Destroy() error { + c.m.Lock() + defer c.m.Unlock() + return c.state.destroy() +} + +func (c *linuxContainer) Pause() error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + switch status { + case Running, Created: + if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { + return err + } + return c.state.transition(&pausedState{ + c: c, + }) + } + return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning) +} + +func (c *linuxContainer) Resume() error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + if status != Paused { + return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused) + } + if err := c.cgroupManager.Freeze(configs.Thawed); err != nil { + return err + } + return c.state.transition(&runningState{ + c: c, + }) +} + +func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { + // XXX(cyphar): This requires cgroups. + if c.config.Rootless { + return nil, fmt.Errorf("cannot get OOM notifications from rootless container") + } + return notifyOnOOM(c.cgroupManager.GetPaths()) +} + +func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { + // XXX(cyphar): This requires cgroups. + if c.config.Rootless { + return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container") + } + return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) +} + +var criuFeatures *criurpc.CriuFeatures + +func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error { + + var t criurpc.CriuReqType + t = criurpc.CriuReqType_FEATURE_CHECK + + if err := c.checkCriuVersion("1.8"); err != nil { + // Feature checking was introduced with CRIU 1.8. + // Ignore the feature check if an older CRIU version is used + // and just act as before. + // As all automated PR testing is done using CRIU 1.7 this + // code will not be tested by automated PR testing. + return nil + } + + // make sure the features we are looking for are really not from + // some previous check + criuFeatures = nil + + req := &criurpc.CriuReq{ + Type: &t, + // Theoretically this should not be necessary but CRIU + // segfaults if Opts is empty. + // Fixed in CRIU 2.12 + Opts: rpcOpts, + Features: criuFeat, + } + + err := c.criuSwrk(nil, req, criuOpts, false) + if err != nil { + logrus.Debugf("%s", err) + return fmt.Errorf("CRIU feature check failed") + } + + logrus.Debugf("Feature check says: %s", criuFeatures) + missingFeatures := false + + if *criuFeat.MemTrack && !*criuFeatures.MemTrack { + missingFeatures = true + logrus.Debugf("CRIU does not support MemTrack") + } + + if missingFeatures { + return fmt.Errorf("CRIU is missing features") + } + + return nil +} + +// checkCriuVersion checks Criu version greater than or equal to minVersion +func (c *linuxContainer) checkCriuVersion(minVersion string) error { + var x, y, z, versionReq int + + _, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2 + if err != nil { + _, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6 + } + versionReq = x*10000 + y*100 + z + + out, err := exec.Command(c.criuPath, "-V").Output() + if err != nil { + return fmt.Errorf("Unable to execute CRIU command: %s", c.criuPath) + } + + x = 0 + y = 0 + z = 0 + if ep := strings.Index(string(out), "-"); ep >= 0 { + // criu Git version format + var version string + if sp := strings.Index(string(out), "GitID"); sp > 0 { + version = string(out)[sp:ep] + } else { + return fmt.Errorf("Unable to parse the CRIU version: %s", c.criuPath) + } + + n, err := fmt.Sscanf(string(version), "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2 + if err != nil { + n, err = fmt.Sscanf(string(version), "GitID: v%d.%d", &x, &y) // 1.6 + y++ + } else { + z++ + } + if n < 2 || err != nil { + return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err) + } + } else { + // criu release version format + n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2 + if err != nil { + n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6 + } + if n < 2 || err != nil { + return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err) + } + } + + c.criuVersion = x*10000 + y*100 + z + + if c.criuVersion < versionReq { + return fmt.Errorf("CRIU version %d must be %d or higher", c.criuVersion, versionReq) + } + + return nil +} + +const descriptorsFilename = "descriptors.json" + +func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) { + mountDest := m.Destination + if strings.HasPrefix(mountDest, c.config.Rootfs) { + mountDest = mountDest[len(c.config.Rootfs):] + } + + extMnt := &criurpc.ExtMountMap{ + Key: proto.String(mountDest), + Val: proto.String(mountDest), + } + req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) +} + +func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error { + for _, path := range c.config.MaskPaths { + fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path)) + if err != nil { + if os.IsNotExist(err) { + continue + } + return err + } + if fi.IsDir() { + continue + } + + extMnt := &criurpc.ExtMountMap{ + Key: proto.String(path), + Val: proto.String("/dev/null"), + } + req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) + } + + return nil +} + +func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { + c.m.Lock() + defer c.m.Unlock() + + // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has + // support for doing unprivileged dumps, but the setup of + // rootless containers might make this complicated. + if c.config.Rootless { + return fmt.Errorf("cannot checkpoint a rootless container") + } + + if err := c.checkCriuVersion("1.5.2"); err != nil { + return err + } + + if criuOpts.ImagesDirectory == "" { + return fmt.Errorf("invalid directory to save checkpoint") + } + + // Since a container can be C/R'ed multiple times, + // the checkpoint directory may already exist. + if err := os.Mkdir(criuOpts.ImagesDirectory, 0755); err != nil && !os.IsExist(err) { + return err + } + + if criuOpts.WorkDirectory == "" { + criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") + } + + if err := os.Mkdir(criuOpts.WorkDirectory, 0755); err != nil && !os.IsExist(err) { + return err + } + + workDir, err := os.Open(criuOpts.WorkDirectory) + if err != nil { + return err + } + defer workDir.Close() + + imageDir, err := os.Open(criuOpts.ImagesDirectory) + if err != nil { + return err + } + defer imageDir.Close() + + rpcOpts := criurpc.CriuOpts{ + ImagesDirFd: proto.Int32(int32(imageDir.Fd())), + WorkDirFd: proto.Int32(int32(workDir.Fd())), + LogLevel: proto.Int32(4), + LogFile: proto.String("dump.log"), + Root: proto.String(c.config.Rootfs), + ManageCgroups: proto.Bool(true), + NotifyScripts: proto.Bool(true), + Pid: proto.Int32(int32(c.initProcess.pid())), + ShellJob: proto.Bool(criuOpts.ShellJob), + LeaveRunning: proto.Bool(criuOpts.LeaveRunning), + TcpEstablished: proto.Bool(criuOpts.TcpEstablished), + ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), + FileLocks: proto.Bool(criuOpts.FileLocks), + EmptyNs: proto.Uint32(criuOpts.EmptyNs), + OrphanPtsMaster: proto.Bool(true), + } + + fcg := c.cgroupManager.GetPaths()["freezer"] + if fcg != "" { + rpcOpts.FreezeCgroup = proto.String(fcg) + } + + // append optional criu opts, e.g., page-server and port + if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 { + rpcOpts.Ps = &criurpc.CriuPageServerInfo{ + Address: proto.String(criuOpts.PageServer.Address), + Port: proto.Int32(criuOpts.PageServer.Port), + } + } + + //pre-dump may need parentImage param to complete iterative migration + if criuOpts.ParentImage != "" { + rpcOpts.ParentImg = proto.String(criuOpts.ParentImage) + rpcOpts.TrackMem = proto.Bool(true) + } + + // append optional manage cgroups mode + if criuOpts.ManageCgroupsMode != 0 { + if err := c.checkCriuVersion("1.7"); err != nil { + return err + } + mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) + rpcOpts.ManageCgroupsMode = &mode + } + + var t criurpc.CriuReqType + if criuOpts.PreDump { + feat := criurpc.CriuFeatures{ + MemTrack: proto.Bool(true), + } + + if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil { + return err + } + + t = criurpc.CriuReqType_PRE_DUMP + } else { + t = criurpc.CriuReqType_DUMP + } + req := &criurpc.CriuReq{ + Type: &t, + Opts: &rpcOpts, + } + + //no need to dump these information in pre-dump + if !criuOpts.PreDump { + for _, m := range c.config.Mounts { + switch m.Device { + case "bind": + c.addCriuDumpMount(req, m) + break + case "cgroup": + binds, err := getCgroupMounts(m) + if err != nil { + return err + } + for _, b := range binds { + c.addCriuDumpMount(req, b) + } + break + } + } + + if err := c.addMaskPaths(req); err != nil { + return err + } + + for _, node := range c.config.Devices { + m := &configs.Mount{Destination: node.Path, Source: node.Path} + c.addCriuDumpMount(req, m) + } + + // Write the FD info to a file in the image directory + fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors()) + if err != nil { + return err + } + + err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655) + if err != nil { + return err + } + } + + err = c.criuSwrk(nil, req, criuOpts, false) + if err != nil { + return err + } + return nil +} + +func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) { + mountDest := m.Destination + if strings.HasPrefix(mountDest, c.config.Rootfs) { + mountDest = mountDest[len(c.config.Rootfs):] + } + + extMnt := &criurpc.ExtMountMap{ + Key: proto.String(mountDest), + Val: proto.String(m.Source), + } + req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) +} + +func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) { + for _, iface := range c.config.Networks { + switch iface.Type { + case "veth": + veth := new(criurpc.CriuVethPair) + veth.IfOut = proto.String(iface.HostInterfaceName) + veth.IfIn = proto.String(iface.Name) + req.Opts.Veths = append(req.Opts.Veths, veth) + break + case "loopback": + break + } + } + for _, i := range criuOpts.VethPairs { + veth := new(criurpc.CriuVethPair) + veth.IfOut = proto.String(i.HostInterfaceName) + veth.IfIn = proto.String(i.ContainerInterfaceName) + req.Opts.Veths = append(req.Opts.Veths, veth) + } +} + +func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { + c.m.Lock() + defer c.m.Unlock() + + // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have + // support for unprivileged restore at the moment. + if c.config.Rootless { + return fmt.Errorf("cannot restore a rootless container") + } + + if err := c.checkCriuVersion("1.5.2"); err != nil { + return err + } + if criuOpts.WorkDirectory == "" { + criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") + } + // Since a container can be C/R'ed multiple times, + // the work directory may already exist. + if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) { + return err + } + workDir, err := os.Open(criuOpts.WorkDirectory) + if err != nil { + return err + } + defer workDir.Close() + if criuOpts.ImagesDirectory == "" { + return fmt.Errorf("invalid directory to restore checkpoint") + } + imageDir, err := os.Open(criuOpts.ImagesDirectory) + if err != nil { + return err + } + defer imageDir.Close() + // CRIU has a few requirements for a root directory: + // * it must be a mount point + // * its parent must not be overmounted + // c.config.Rootfs is bind-mounted to a temporary directory + // to satisfy these requirements. + root := filepath.Join(c.root, "criu-root") + if err := os.Mkdir(root, 0755); err != nil { + return err + } + defer os.Remove(root) + root, err = filepath.EvalSymlinks(root) + if err != nil { + return err + } + err = unix.Mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "") + if err != nil { + return err + } + defer unix.Unmount(root, unix.MNT_DETACH) + t := criurpc.CriuReqType_RESTORE + req := &criurpc.CriuReq{ + Type: &t, + Opts: &criurpc.CriuOpts{ + ImagesDirFd: proto.Int32(int32(imageDir.Fd())), + WorkDirFd: proto.Int32(int32(workDir.Fd())), + EvasiveDevices: proto.Bool(true), + LogLevel: proto.Int32(4), + LogFile: proto.String("restore.log"), + RstSibling: proto.Bool(true), + Root: proto.String(root), + ManageCgroups: proto.Bool(true), + NotifyScripts: proto.Bool(true), + ShellJob: proto.Bool(criuOpts.ShellJob), + ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), + TcpEstablished: proto.Bool(criuOpts.TcpEstablished), + FileLocks: proto.Bool(criuOpts.FileLocks), + EmptyNs: proto.Uint32(criuOpts.EmptyNs), + OrphanPtsMaster: proto.Bool(true), + }, + } + + for _, m := range c.config.Mounts { + switch m.Device { + case "bind": + c.addCriuRestoreMount(req, m) + break + case "cgroup": + binds, err := getCgroupMounts(m) + if err != nil { + return err + } + for _, b := range binds { + c.addCriuRestoreMount(req, b) + } + break + } + } + + if len(c.config.MaskPaths) > 0 { + m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"} + c.addCriuRestoreMount(req, m) + } + + for _, node := range c.config.Devices { + m := &configs.Mount{Destination: node.Path, Source: node.Path} + c.addCriuRestoreMount(req, m) + } + + if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 { + c.restoreNetwork(req, criuOpts) + } + + // append optional manage cgroups mode + if criuOpts.ManageCgroupsMode != 0 { + if err := c.checkCriuVersion("1.7"); err != nil { + return err + } + mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) + req.Opts.ManageCgroupsMode = &mode + } + + var ( + fds []string + fdJSON []byte + ) + if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { + return err + } + + if err := json.Unmarshal(fdJSON, &fds); err != nil { + return err + } + for i := range fds { + if s := fds[i]; strings.Contains(s, "pipe:") { + inheritFd := new(criurpc.InheritFd) + inheritFd.Key = proto.String(s) + inheritFd.Fd = proto.Int32(int32(i)) + req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) + } + } + return c.criuSwrk(process, req, criuOpts, true) +} + +func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { + // XXX: Do we need to deal with this case? AFAIK criu still requires root. + if err := c.cgroupManager.Apply(pid); err != nil { + return err + } + + if err := c.cgroupManager.Set(c.config); err != nil { + return newSystemError(err) + } + + path := fmt.Sprintf("/proc/%d/cgroup", pid) + cgroupsPaths, err := cgroups.ParseCgroupFile(path) + if err != nil { + return err + } + + for c, p := range cgroupsPaths { + cgroupRoot := &criurpc.CgroupRoot{ + Ctrl: proto.String(c), + Path: proto.String(p), + } + req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot) + } + + return nil +} + +func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) + if err != nil { + return err + } + + logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile()) + criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client") + criuClientFileCon, err := net.FileConn(criuClient) + criuClient.Close() + if err != nil { + return err + } + + criuClientCon := criuClientFileCon.(*net.UnixConn) + defer criuClientCon.Close() + + criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server") + defer criuServer.Close() + + args := []string{"swrk", "3"} + logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath) + logrus.Debugf("Using CRIU with following args: %s", args) + cmd := exec.Command(c.criuPath, args...) + if process != nil { + cmd.Stdin = process.Stdin + cmd.Stdout = process.Stdout + cmd.Stderr = process.Stderr + } + cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) + + if err := cmd.Start(); err != nil { + return err + } + criuServer.Close() + + defer func() { + criuClientCon.Close() + _, err := cmd.Process.Wait() + if err != nil { + return + } + }() + + if applyCgroups { + err := c.criuApplyCgroups(cmd.Process.Pid, req) + if err != nil { + return err + } + } + + var extFds []string + if process != nil { + extFds, err = getPipeFds(cmd.Process.Pid) + if err != nil { + return err + } + } + + logrus.Debugf("Using CRIU in %s mode", req.GetType().String()) + // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts() + // should be empty. For older CRIU versions it still will be + // available but empty. + if req.GetType() != criurpc.CriuReqType_FEATURE_CHECK { + val := reflect.ValueOf(req.GetOpts()) + v := reflect.Indirect(val) + for i := 0; i < v.NumField(); i++ { + st := v.Type() + name := st.Field(i).Name + if strings.HasPrefix(name, "XXX_") { + continue + } + value := val.MethodByName("Get" + name).Call([]reflect.Value{}) + logrus.Debugf("CRIU option %s with value %v", name, value[0]) + } + } + data, err := proto.Marshal(req) + if err != nil { + return err + } + _, err = criuClientCon.Write(data) + if err != nil { + return err + } + + buf := make([]byte, 10*4096) + oob := make([]byte, 4096) + for true { + n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob) + if err != nil { + return err + } + if n == 0 { + return fmt.Errorf("unexpected EOF") + } + if n == len(buf) { + return fmt.Errorf("buffer is too small") + } + + resp := new(criurpc.CriuResp) + err = proto.Unmarshal(buf[:n], resp) + if err != nil { + return err + } + if !resp.GetSuccess() { + typeString := req.GetType().String() + return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath) + } + + t := resp.GetType() + switch { + case t == criurpc.CriuReqType_FEATURE_CHECK: + logrus.Debugf("Feature check says: %s", resp) + criuFeatures = resp.GetFeatures() + break + case t == criurpc.CriuReqType_NOTIFY: + if err := c.criuNotifications(resp, process, opts, extFds, oob[:oobn]); err != nil { + return err + } + t = criurpc.CriuReqType_NOTIFY + req = &criurpc.CriuReq{ + Type: &t, + NotifySuccess: proto.Bool(true), + } + data, err = proto.Marshal(req) + if err != nil { + return err + } + _, err = criuClientCon.Write(data) + if err != nil { + return err + } + continue + case t == criurpc.CriuReqType_RESTORE: + case t == criurpc.CriuReqType_DUMP: + case t == criurpc.CriuReqType_PRE_DUMP: + default: + return fmt.Errorf("unable to parse the response %s", resp.String()) + } + + break + } + + criuClientCon.CloseWrite() + // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors. + // Here we want to wait only the CRIU process. + st, err := cmd.Process.Wait() + if err != nil { + return err + } + + // In pre-dump mode CRIU is in a loop and waits for + // the final DUMP command. + // The current runc pre-dump approach, however, is + // start criu in PRE_DUMP once for a single pre-dump + // and not the whole series of pre-dump, pre-dump, ...m, dump + // If we got the message CriuReqType_PRE_DUMP it means + // CRIU was successful and we need to forcefully stop CRIU + if !st.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP { + return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath) + } + return nil +} + +// block any external network activity +func lockNetwork(config *configs.Config) error { + for _, config := range config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + + if err := strategy.detach(config); err != nil { + return err + } + } + return nil +} + +func unlockNetwork(config *configs.Config) error { + for _, config := range config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + if err = strategy.attach(config); err != nil { + return err + } + } + return nil +} + +func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string, oob []byte) error { + notify := resp.GetNotify() + if notify == nil { + return fmt.Errorf("invalid response: %s", resp.String()) + } + logrus.Debugf("notify: %s\n", notify.GetScript()) + switch { + case notify.GetScript() == "post-dump": + f, err := os.Create(filepath.Join(c.root, "checkpoint")) + if err != nil { + return err + } + f.Close() + case notify.GetScript() == "network-unlock": + if err := unlockNetwork(c.config); err != nil { + return err + } + case notify.GetScript() == "network-lock": + if err := lockNetwork(c.config); err != nil { + return err + } + case notify.GetScript() == "setup-namespaces": + if c.config.Hooks != nil { + s := configs.HookState{ + Version: c.config.Version, + ID: c.id, + Pid: int(notify.GetPid()), + Bundle: utils.SearchLabels(c.config.Labels, "bundle"), + } + for i, hook := range c.config.Hooks.Prestart { + if err := hook.Run(s); err != nil { + return newSystemErrorWithCausef(err, "running prestart hook %d", i) + } + } + } + case notify.GetScript() == "post-restore": + pid := notify.GetPid() + r, err := newRestoredProcess(int(pid), fds) + if err != nil { + return err + } + process.ops = r + if err := c.state.transition(&restoredState{ + imageDir: opts.ImagesDirectory, + c: c, + }); err != nil { + return err + } + // create a timestamp indicating when the restored checkpoint was started + c.created = time.Now().UTC() + if _, err := c.updateState(r); err != nil { + return err + } + if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil { + if !os.IsNotExist(err) { + logrus.Error(err) + } + } + case notify.GetScript() == "orphan-pts-master": + scm, err := unix.ParseSocketControlMessage(oob) + if err != nil { + return err + } + fds, err := unix.ParseUnixRights(&scm[0]) + + master := os.NewFile(uintptr(fds[0]), "orphan-pts-master") + defer master.Close() + + // While we can access console.master, using the API is a good idea. + if err := utils.SendFd(process.ConsoleSocket, master); err != nil { + return err + } + } + return nil +} + +func (c *linuxContainer) updateState(process parentProcess) (*State, error) { + c.initProcess = process + state, err := c.currentState() + if err != nil { + return nil, err + } + err = c.saveState(state) + if err != nil { + return nil, err + } + return state, nil +} + +func (c *linuxContainer) saveState(s *State) error { + f, err := os.Create(filepath.Join(c.root, stateFilename)) + if err != nil { + return err + } + defer f.Close() + return utils.WriteJSON(f, s) +} + +func (c *linuxContainer) deleteState() error { + return os.Remove(filepath.Join(c.root, stateFilename)) +} + +func (c *linuxContainer) currentStatus() (Status, error) { + if err := c.refreshState(); err != nil { + return -1, err + } + return c.state.status(), nil +} + +// refreshState needs to be called to verify that the current state on the +// container is what is true. Because consumers of libcontainer can use it +// out of process we need to verify the container's status based on runtime +// information and not rely on our in process info. +func (c *linuxContainer) refreshState() error { + paused, err := c.isPaused() + if err != nil { + return err + } + if paused { + return c.state.transition(&pausedState{c: c}) + } + t, err := c.runType() + if err != nil { + return err + } + switch t { + case Created: + return c.state.transition(&createdState{c: c}) + case Running: + return c.state.transition(&runningState{c: c}) + } + return c.state.transition(&stoppedState{c: c}) +} + +func (c *linuxContainer) runType() (Status, error) { + if c.initProcess == nil { + return Stopped, nil + } + pid := c.initProcess.pid() + stat, err := system.Stat(pid) + if err != nil { + return Stopped, nil + } + if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead { + return Stopped, nil + } + // We'll create exec fifo and blocking on it after container is created, + // and delete it after start container. + if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil { + return Created, nil + } + return Running, nil +} + +func (c *linuxContainer) isPaused() (bool, error) { + fcg := c.cgroupManager.GetPaths()["freezer"] + if fcg == "" { + // A container doesn't have a freezer cgroup + return false, nil + } + data, err := ioutil.ReadFile(filepath.Join(fcg, "freezer.state")) + if err != nil { + // If freezer cgroup is not mounted, the container would just be not paused. + if os.IsNotExist(err) { + return false, nil + } + return false, newSystemErrorWithCause(err, "checking if container is paused") + } + return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil +} + +func (c *linuxContainer) currentState() (*State, error) { + var ( + startTime uint64 + externalDescriptors []string + pid = -1 + ) + if c.initProcess != nil { + pid = c.initProcess.pid() + startTime, _ = c.initProcess.startTime() + externalDescriptors = c.initProcess.externalDescriptors() + } + state := &State{ + BaseState: BaseState{ + ID: c.ID(), + Config: *c.config, + InitProcessPid: pid, + InitProcessStartTime: startTime, + Created: c.created, + }, + Rootless: c.config.Rootless, + CgroupPaths: c.cgroupManager.GetPaths(), + NamespacePaths: make(map[configs.NamespaceType]string), + ExternalDescriptors: externalDescriptors, + } + if pid > 0 { + for _, ns := range c.config.Namespaces { + state.NamespacePaths[ns.Type] = ns.GetPath(pid) + } + for _, nsType := range configs.NamespaceTypes() { + if !configs.IsNamespaceSupported(nsType) { + continue + } + if _, ok := state.NamespacePaths[nsType]; !ok { + ns := configs.Namespace{Type: nsType} + state.NamespacePaths[ns.Type] = ns.GetPath(pid) + } + } + } + return state, nil +} + +// orderNamespacePaths sorts namespace paths into a list of paths that we +// can setns in order. +func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { + paths := []string{} + + for _, ns := range configs.NamespaceTypes() { + + // Remove namespaces that we don't need to join. + if !c.config.Namespaces.Contains(ns) { + continue + } + + if p, ok := namespaces[ns]; ok && p != "" { + // check if the requested namespace is supported + if !configs.IsNamespaceSupported(ns) { + return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns)) + } + // only set to join this namespace if it exists + if _, err := os.Lstat(p); err != nil { + return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p) + } + // do not allow namespace path with comma as we use it to separate + // the namespace paths + if strings.ContainsRune(p, ',') { + return nil, newSystemError(fmt.Errorf("invalid path %s", p)) + } + paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p)) + } + + } + + return paths, nil +} + +func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { + data := bytes.NewBuffer(nil) + for _, im := range idMap { + line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) + if _, err := data.WriteString(line); err != nil { + return nil, err + } + } + return data.Bytes(), nil +} + +// bootstrapData encodes the necessary data in netlink binary format +// as a io.Reader. +// Consumer can write the data to a bootstrap program +// such as one that uses nsenter package to bootstrap the container's +// init process correctly, i.e. with correct namespaces, uid/gid +// mapping etc. +func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) { + // create the netlink message + r := nl.NewNetlinkRequest(int(InitMsg), 0) + + // write cloneFlags + r.AddData(&Int32msg{ + Type: CloneFlagsAttr, + Value: uint32(cloneFlags), + }) + + // write custom namespace paths + if len(nsMaps) > 0 { + nsPaths, err := c.orderNamespacePaths(nsMaps) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: NsPathsAttr, + Value: []byte(strings.Join(nsPaths, ",")), + }) + } + + // write namespace paths only when we are not joining an existing user ns + _, joinExistingUser := nsMaps[configs.NEWUSER] + if !joinExistingUser { + // write uid mappings + if len(c.config.UidMappings) > 0 { + b, err := encodeIDMapping(c.config.UidMappings) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: UidmapAttr, + Value: b, + }) + } + + // write gid mappings + if len(c.config.GidMappings) > 0 { + b, err := encodeIDMapping(c.config.GidMappings) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: GidmapAttr, + Value: b, + }) + // The following only applies if we are root. + if !c.config.Rootless { + // check if we have CAP_SETGID to setgroup properly + pid, err := capability.NewPid(os.Getpid()) + if err != nil { + return nil, err + } + if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { + r.AddData(&Boolmsg{ + Type: SetgroupAttr, + Value: true, + }) + } + } + } + } + + // write oom_score_adj + r.AddData(&Bytemsg{ + Type: OomScoreAdjAttr, + Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)), + }) + + // write rootless + r.AddData(&Boolmsg{ + Type: RootlessAttr, + Value: c.config.Rootless, + }) + + return bytes.NewReader(r.Serialize()), nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go new file mode 100644 index 00000000..bb84ff74 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go @@ -0,0 +1,20 @@ +package libcontainer + +// State represents a running container's state +type State struct { + BaseState + + // Platform specific fields below here +} + +// A libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. +type Container interface { + BaseContainer + + // Methods below here are platform specific + +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/container_windows.go new file mode 100644 index 00000000..bb84ff74 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/container_windows.go @@ -0,0 +1,20 @@ +package libcontainer + +// State represents a running container's state +type State struct { + BaseState + + // Platform specific fields below here +} + +// A libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. +type Container interface { + BaseContainer + + // Methods below here are platform specific + +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_linux.go new file mode 100644 index 00000000..9423d246 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_linux.go @@ -0,0 +1,37 @@ +package libcontainer + +// cgroup restoring strategy provided by criu +type cgMode uint32 + +const ( + CRIU_CG_MODE_SOFT cgMode = 3 + iota // restore cgroup properties if only dir created by criu + CRIU_CG_MODE_FULL // always restore all cgroups and their properties + CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system + CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT +) + +type CriuPageServerInfo struct { + Address string // IP address of CRIU page server + Port int32 // port number of CRIU page server +} + +type VethPairName struct { + ContainerInterfaceName string + HostInterfaceName string +} + +type CriuOpts struct { + ImagesDirectory string // directory for storing image files + WorkDirectory string // directory to cd and write logs/pidfiles/stats to + ParentImage string // direcotry for storing parent image files in pre-dump and dump + LeaveRunning bool // leave container in running state after checkpoint + TcpEstablished bool // checkpoint/restore established TCP connections + ExternalUnixConnections bool // allow external unix connections + ShellJob bool // allow to dump and restore shell jobs + FileLocks bool // handle file locks, for safety + PreDump bool // call criu predump to perform iterative checkpoint + PageServer CriuPageServerInfo // allow to dump to criu page server + VethPairs []VethPairName // pass the veth to criu when restore + ManageCgroupsMode cgMode // dump or restore cgroup mode + EmptyNs uint32 // don't c/r properties for namespace from this mask +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_windows.go new file mode 100644 index 00000000..bc920770 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_windows.go @@ -0,0 +1,6 @@ +package libcontainer + +// TODO Windows: This can ultimately be entirely factored out as criu is +// a Unix concept not relevant on Windows. +type CriuOpts struct { +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go b/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go new file mode 100644 index 00000000..1e5311bf --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go @@ -0,0 +1,1069 @@ +// Code generated by protoc-gen-go. +// source: criurpc.proto +// DO NOT EDIT! + +/* +Package criurpc is a generated protocol buffer package. + +It is generated from these files: + criurpc.proto + +It has these top-level messages: + CriuPageServerInfo + CriuVethPair + ExtMountMap + JoinNamespace + InheritFd + CgroupRoot + UnixSk + CriuOpts + CriuDumpResp + CriuRestoreResp + CriuNotify + CriuFeatures + CriuReq + CriuResp +*/ +package criurpc + +import proto "github.com/golang/protobuf/proto" +import fmt "fmt" +import math "math" + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. +const _ = proto.ProtoPackageIsVersion2 // please upgrade the proto package + +type CriuCgMode int32 + +const ( + CriuCgMode_IGNORE CriuCgMode = 0 + CriuCgMode_CG_NONE CriuCgMode = 1 + CriuCgMode_PROPS CriuCgMode = 2 + CriuCgMode_SOFT CriuCgMode = 3 + CriuCgMode_FULL CriuCgMode = 4 + CriuCgMode_STRICT CriuCgMode = 5 + CriuCgMode_DEFAULT CriuCgMode = 6 +) + +var CriuCgMode_name = map[int32]string{ + 0: "IGNORE", + 1: "CG_NONE", + 2: "PROPS", + 3: "SOFT", + 4: "FULL", + 5: "STRICT", + 6: "DEFAULT", +} +var CriuCgMode_value = map[string]int32{ + "IGNORE": 0, + "CG_NONE": 1, + "PROPS": 2, + "SOFT": 3, + "FULL": 4, + "STRICT": 5, + "DEFAULT": 6, +} + +func (x CriuCgMode) Enum() *CriuCgMode { + p := new(CriuCgMode) + *p = x + return p +} +func (x CriuCgMode) String() string { + return proto.EnumName(CriuCgMode_name, int32(x)) +} +func (x *CriuCgMode) UnmarshalJSON(data []byte) error { + value, err := proto.UnmarshalJSONEnum(CriuCgMode_value, data, "CriuCgMode") + if err != nil { + return err + } + *x = CriuCgMode(value) + return nil +} +func (CriuCgMode) EnumDescriptor() ([]byte, []int) { return fileDescriptor0, []int{0} } + +type CriuReqType int32 + +const ( + CriuReqType_EMPTY CriuReqType = 0 + CriuReqType_DUMP CriuReqType = 1 + CriuReqType_RESTORE CriuReqType = 2 + CriuReqType_CHECK CriuReqType = 3 + CriuReqType_PRE_DUMP CriuReqType = 4 + CriuReqType_PAGE_SERVER CriuReqType = 5 + CriuReqType_NOTIFY CriuReqType = 6 + CriuReqType_CPUINFO_DUMP CriuReqType = 7 + CriuReqType_CPUINFO_CHECK CriuReqType = 8 + CriuReqType_FEATURE_CHECK CriuReqType = 9 +) + +var CriuReqType_name = map[int32]string{ + 0: "EMPTY", + 1: "DUMP", + 2: "RESTORE", + 3: "CHECK", + 4: "PRE_DUMP", + 5: "PAGE_SERVER", + 6: "NOTIFY", + 7: "CPUINFO_DUMP", + 8: "CPUINFO_CHECK", + 9: "FEATURE_CHECK", +} +var CriuReqType_value = map[string]int32{ + "EMPTY": 0, + "DUMP": 1, + "RESTORE": 2, + "CHECK": 3, + "PRE_DUMP": 4, + "PAGE_SERVER": 5, + "NOTIFY": 6, + "CPUINFO_DUMP": 7, + "CPUINFO_CHECK": 8, + "FEATURE_CHECK": 9, +} + +func (x CriuReqType) Enum() *CriuReqType { + p := new(CriuReqType) + *p = x + return p +} +func (x CriuReqType) String() string { + return proto.EnumName(CriuReqType_name, int32(x)) +} +func (x *CriuReqType) UnmarshalJSON(data []byte) error { + value, err := proto.UnmarshalJSONEnum(CriuReqType_value, data, "CriuReqType") + if err != nil { + return err + } + *x = CriuReqType(value) + return nil +} +func (CriuReqType) EnumDescriptor() ([]byte, []int) { return fileDescriptor0, []int{1} } + +type CriuPageServerInfo struct { + Address *string `protobuf:"bytes,1,opt,name=address" json:"address,omitempty"` + Port *int32 `protobuf:"varint,2,opt,name=port" json:"port,omitempty"` + Pid *int32 `protobuf:"varint,3,opt,name=pid" json:"pid,omitempty"` + Fd *int32 `protobuf:"varint,4,opt,name=fd" json:"fd,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuPageServerInfo) Reset() { *m = CriuPageServerInfo{} } +func (m *CriuPageServerInfo) String() string { return proto.CompactTextString(m) } +func (*CriuPageServerInfo) ProtoMessage() {} +func (*CriuPageServerInfo) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{0} } + +func (m *CriuPageServerInfo) GetAddress() string { + if m != nil && m.Address != nil { + return *m.Address + } + return "" +} + +func (m *CriuPageServerInfo) GetPort() int32 { + if m != nil && m.Port != nil { + return *m.Port + } + return 0 +} + +func (m *CriuPageServerInfo) GetPid() int32 { + if m != nil && m.Pid != nil { + return *m.Pid + } + return 0 +} + +func (m *CriuPageServerInfo) GetFd() int32 { + if m != nil && m.Fd != nil { + return *m.Fd + } + return 0 +} + +type CriuVethPair struct { + IfIn *string `protobuf:"bytes,1,req,name=if_in" json:"if_in,omitempty"` + IfOut *string `protobuf:"bytes,2,req,name=if_out" json:"if_out,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuVethPair) Reset() { *m = CriuVethPair{} } +func (m *CriuVethPair) String() string { return proto.CompactTextString(m) } +func (*CriuVethPair) ProtoMessage() {} +func (*CriuVethPair) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{1} } + +func (m *CriuVethPair) GetIfIn() string { + if m != nil && m.IfIn != nil { + return *m.IfIn + } + return "" +} + +func (m *CriuVethPair) GetIfOut() string { + if m != nil && m.IfOut != nil { + return *m.IfOut + } + return "" +} + +type ExtMountMap struct { + Key *string `protobuf:"bytes,1,req,name=key" json:"key,omitempty"` + Val *string `protobuf:"bytes,2,req,name=val" json:"val,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *ExtMountMap) Reset() { *m = ExtMountMap{} } +func (m *ExtMountMap) String() string { return proto.CompactTextString(m) } +func (*ExtMountMap) ProtoMessage() {} +func (*ExtMountMap) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{2} } + +func (m *ExtMountMap) GetKey() string { + if m != nil && m.Key != nil { + return *m.Key + } + return "" +} + +func (m *ExtMountMap) GetVal() string { + if m != nil && m.Val != nil { + return *m.Val + } + return "" +} + +type JoinNamespace struct { + Ns *string `protobuf:"bytes,1,req,name=ns" json:"ns,omitempty"` + NsFile *string `protobuf:"bytes,2,req,name=ns_file" json:"ns_file,omitempty"` + ExtraOpt *string `protobuf:"bytes,3,opt,name=extra_opt" json:"extra_opt,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *JoinNamespace) Reset() { *m = JoinNamespace{} } +func (m *JoinNamespace) String() string { return proto.CompactTextString(m) } +func (*JoinNamespace) ProtoMessage() {} +func (*JoinNamespace) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{3} } + +func (m *JoinNamespace) GetNs() string { + if m != nil && m.Ns != nil { + return *m.Ns + } + return "" +} + +func (m *JoinNamespace) GetNsFile() string { + if m != nil && m.NsFile != nil { + return *m.NsFile + } + return "" +} + +func (m *JoinNamespace) GetExtraOpt() string { + if m != nil && m.ExtraOpt != nil { + return *m.ExtraOpt + } + return "" +} + +type InheritFd struct { + Key *string `protobuf:"bytes,1,req,name=key" json:"key,omitempty"` + Fd *int32 `protobuf:"varint,2,req,name=fd" json:"fd,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *InheritFd) Reset() { *m = InheritFd{} } +func (m *InheritFd) String() string { return proto.CompactTextString(m) } +func (*InheritFd) ProtoMessage() {} +func (*InheritFd) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{4} } + +func (m *InheritFd) GetKey() string { + if m != nil && m.Key != nil { + return *m.Key + } + return "" +} + +func (m *InheritFd) GetFd() int32 { + if m != nil && m.Fd != nil { + return *m.Fd + } + return 0 +} + +type CgroupRoot struct { + Ctrl *string `protobuf:"bytes,1,opt,name=ctrl" json:"ctrl,omitempty"` + Path *string `protobuf:"bytes,2,req,name=path" json:"path,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CgroupRoot) Reset() { *m = CgroupRoot{} } +func (m *CgroupRoot) String() string { return proto.CompactTextString(m) } +func (*CgroupRoot) ProtoMessage() {} +func (*CgroupRoot) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{5} } + +func (m *CgroupRoot) GetCtrl() string { + if m != nil && m.Ctrl != nil { + return *m.Ctrl + } + return "" +} + +func (m *CgroupRoot) GetPath() string { + if m != nil && m.Path != nil { + return *m.Path + } + return "" +} + +type UnixSk struct { + Inode *uint32 `protobuf:"varint,1,req,name=inode" json:"inode,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *UnixSk) Reset() { *m = UnixSk{} } +func (m *UnixSk) String() string { return proto.CompactTextString(m) } +func (*UnixSk) ProtoMessage() {} +func (*UnixSk) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{6} } + +func (m *UnixSk) GetInode() uint32 { + if m != nil && m.Inode != nil { + return *m.Inode + } + return 0 +} + +type CriuOpts struct { + ImagesDirFd *int32 `protobuf:"varint,1,req,name=images_dir_fd" json:"images_dir_fd,omitempty"` + Pid *int32 `protobuf:"varint,2,opt,name=pid" json:"pid,omitempty"` + LeaveRunning *bool `protobuf:"varint,3,opt,name=leave_running" json:"leave_running,omitempty"` + ExtUnixSk *bool `protobuf:"varint,4,opt,name=ext_unix_sk" json:"ext_unix_sk,omitempty"` + TcpEstablished *bool `protobuf:"varint,5,opt,name=tcp_established" json:"tcp_established,omitempty"` + EvasiveDevices *bool `protobuf:"varint,6,opt,name=evasive_devices" json:"evasive_devices,omitempty"` + ShellJob *bool `protobuf:"varint,7,opt,name=shell_job" json:"shell_job,omitempty"` + FileLocks *bool `protobuf:"varint,8,opt,name=file_locks" json:"file_locks,omitempty"` + LogLevel *int32 `protobuf:"varint,9,opt,name=log_level,def=2" json:"log_level,omitempty"` + LogFile *string `protobuf:"bytes,10,opt,name=log_file" json:"log_file,omitempty"` + Ps *CriuPageServerInfo `protobuf:"bytes,11,opt,name=ps" json:"ps,omitempty"` + NotifyScripts *bool `protobuf:"varint,12,opt,name=notify_scripts" json:"notify_scripts,omitempty"` + Root *string `protobuf:"bytes,13,opt,name=root" json:"root,omitempty"` + ParentImg *string `protobuf:"bytes,14,opt,name=parent_img" json:"parent_img,omitempty"` + TrackMem *bool `protobuf:"varint,15,opt,name=track_mem" json:"track_mem,omitempty"` + AutoDedup *bool `protobuf:"varint,16,opt,name=auto_dedup" json:"auto_dedup,omitempty"` + WorkDirFd *int32 `protobuf:"varint,17,opt,name=work_dir_fd" json:"work_dir_fd,omitempty"` + LinkRemap *bool `protobuf:"varint,18,opt,name=link_remap" json:"link_remap,omitempty"` + Veths []*CriuVethPair `protobuf:"bytes,19,rep,name=veths" json:"veths,omitempty"` + CpuCap *uint32 `protobuf:"varint,20,opt,name=cpu_cap,def=4294967295" json:"cpu_cap,omitempty"` + ForceIrmap *bool `protobuf:"varint,21,opt,name=force_irmap" json:"force_irmap,omitempty"` + ExecCmd []string `protobuf:"bytes,22,rep,name=exec_cmd" json:"exec_cmd,omitempty"` + ExtMnt []*ExtMountMap `protobuf:"bytes,23,rep,name=ext_mnt" json:"ext_mnt,omitempty"` + ManageCgroups *bool `protobuf:"varint,24,opt,name=manage_cgroups" json:"manage_cgroups,omitempty"` + CgRoot []*CgroupRoot `protobuf:"bytes,25,rep,name=cg_root" json:"cg_root,omitempty"` + RstSibling *bool `protobuf:"varint,26,opt,name=rst_sibling" json:"rst_sibling,omitempty"` + InheritFd []*InheritFd `protobuf:"bytes,27,rep,name=inherit_fd" json:"inherit_fd,omitempty"` + AutoExtMnt *bool `protobuf:"varint,28,opt,name=auto_ext_mnt" json:"auto_ext_mnt,omitempty"` + ExtSharing *bool `protobuf:"varint,29,opt,name=ext_sharing" json:"ext_sharing,omitempty"` + ExtMasters *bool `protobuf:"varint,30,opt,name=ext_masters" json:"ext_masters,omitempty"` + SkipMnt []string `protobuf:"bytes,31,rep,name=skip_mnt" json:"skip_mnt,omitempty"` + EnableFs []string `protobuf:"bytes,32,rep,name=enable_fs" json:"enable_fs,omitempty"` + UnixSkIno []*UnixSk `protobuf:"bytes,33,rep,name=unix_sk_ino" json:"unix_sk_ino,omitempty"` + ManageCgroupsMode *CriuCgMode `protobuf:"varint,34,opt,name=manage_cgroups_mode,enum=CriuCgMode" json:"manage_cgroups_mode,omitempty"` + GhostLimit *uint32 `protobuf:"varint,35,opt,name=ghost_limit,def=1048576" json:"ghost_limit,omitempty"` + IrmapScanPaths []string `protobuf:"bytes,36,rep,name=irmap_scan_paths" json:"irmap_scan_paths,omitempty"` + External []string `protobuf:"bytes,37,rep,name=external" json:"external,omitempty"` + EmptyNs *uint32 `protobuf:"varint,38,opt,name=empty_ns" json:"empty_ns,omitempty"` + JoinNs []*JoinNamespace `protobuf:"bytes,39,rep,name=join_ns" json:"join_ns,omitempty"` + CgroupProps *string `protobuf:"bytes,41,opt,name=cgroup_props" json:"cgroup_props,omitempty"` + CgroupPropsFile *string `protobuf:"bytes,42,opt,name=cgroup_props_file" json:"cgroup_props_file,omitempty"` + CgroupDumpController []string `protobuf:"bytes,43,rep,name=cgroup_dump_controller" json:"cgroup_dump_controller,omitempty"` + FreezeCgroup *string `protobuf:"bytes,44,opt,name=freeze_cgroup" json:"freeze_cgroup,omitempty"` + Timeout *uint32 `protobuf:"varint,45,opt,name=timeout" json:"timeout,omitempty"` + TcpSkipInFlight *bool `protobuf:"varint,46,opt,name=tcp_skip_in_flight" json:"tcp_skip_in_flight,omitempty"` + WeakSysctls *bool `protobuf:"varint,47,opt,name=weak_sysctls" json:"weak_sysctls,omitempty"` + LazyPages *bool `protobuf:"varint,48,opt,name=lazy_pages" json:"lazy_pages,omitempty"` + StatusFd *int32 `protobuf:"varint,49,opt,name=status_fd" json:"status_fd,omitempty"` + OrphanPtsMaster *bool `protobuf:"varint,50,opt,name=orphan_pts_master" json:"orphan_pts_master,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuOpts) Reset() { *m = CriuOpts{} } +func (m *CriuOpts) String() string { return proto.CompactTextString(m) } +func (*CriuOpts) ProtoMessage() {} +func (*CriuOpts) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{7} } + +const Default_CriuOpts_LogLevel int32 = 2 +const Default_CriuOpts_CpuCap uint32 = 4294967295 +const Default_CriuOpts_GhostLimit uint32 = 1048576 + +func (m *CriuOpts) GetImagesDirFd() int32 { + if m != nil && m.ImagesDirFd != nil { + return *m.ImagesDirFd + } + return 0 +} + +func (m *CriuOpts) GetPid() int32 { + if m != nil && m.Pid != nil { + return *m.Pid + } + return 0 +} + +func (m *CriuOpts) GetLeaveRunning() bool { + if m != nil && m.LeaveRunning != nil { + return *m.LeaveRunning + } + return false +} + +func (m *CriuOpts) GetExtUnixSk() bool { + if m != nil && m.ExtUnixSk != nil { + return *m.ExtUnixSk + } + return false +} + +func (m *CriuOpts) GetTcpEstablished() bool { + if m != nil && m.TcpEstablished != nil { + return *m.TcpEstablished + } + return false +} + +func (m *CriuOpts) GetEvasiveDevices() bool { + if m != nil && m.EvasiveDevices != nil { + return *m.EvasiveDevices + } + return false +} + +func (m *CriuOpts) GetShellJob() bool { + if m != nil && m.ShellJob != nil { + return *m.ShellJob + } + return false +} + +func (m *CriuOpts) GetFileLocks() bool { + if m != nil && m.FileLocks != nil { + return *m.FileLocks + } + return false +} + +func (m *CriuOpts) GetLogLevel() int32 { + if m != nil && m.LogLevel != nil { + return *m.LogLevel + } + return Default_CriuOpts_LogLevel +} + +func (m *CriuOpts) GetLogFile() string { + if m != nil && m.LogFile != nil { + return *m.LogFile + } + return "" +} + +func (m *CriuOpts) GetPs() *CriuPageServerInfo { + if m != nil { + return m.Ps + } + return nil +} + +func (m *CriuOpts) GetNotifyScripts() bool { + if m != nil && m.NotifyScripts != nil { + return *m.NotifyScripts + } + return false +} + +func (m *CriuOpts) GetRoot() string { + if m != nil && m.Root != nil { + return *m.Root + } + return "" +} + +func (m *CriuOpts) GetParentImg() string { + if m != nil && m.ParentImg != nil { + return *m.ParentImg + } + return "" +} + +func (m *CriuOpts) GetTrackMem() bool { + if m != nil && m.TrackMem != nil { + return *m.TrackMem + } + return false +} + +func (m *CriuOpts) GetAutoDedup() bool { + if m != nil && m.AutoDedup != nil { + return *m.AutoDedup + } + return false +} + +func (m *CriuOpts) GetWorkDirFd() int32 { + if m != nil && m.WorkDirFd != nil { + return *m.WorkDirFd + } + return 0 +} + +func (m *CriuOpts) GetLinkRemap() bool { + if m != nil && m.LinkRemap != nil { + return *m.LinkRemap + } + return false +} + +func (m *CriuOpts) GetVeths() []*CriuVethPair { + if m != nil { + return m.Veths + } + return nil +} + +func (m *CriuOpts) GetCpuCap() uint32 { + if m != nil && m.CpuCap != nil { + return *m.CpuCap + } + return Default_CriuOpts_CpuCap +} + +func (m *CriuOpts) GetForceIrmap() bool { + if m != nil && m.ForceIrmap != nil { + return *m.ForceIrmap + } + return false +} + +func (m *CriuOpts) GetExecCmd() []string { + if m != nil { + return m.ExecCmd + } + return nil +} + +func (m *CriuOpts) GetExtMnt() []*ExtMountMap { + if m != nil { + return m.ExtMnt + } + return nil +} + +func (m *CriuOpts) GetManageCgroups() bool { + if m != nil && m.ManageCgroups != nil { + return *m.ManageCgroups + } + return false +} + +func (m *CriuOpts) GetCgRoot() []*CgroupRoot { + if m != nil { + return m.CgRoot + } + return nil +} + +func (m *CriuOpts) GetRstSibling() bool { + if m != nil && m.RstSibling != nil { + return *m.RstSibling + } + return false +} + +func (m *CriuOpts) GetInheritFd() []*InheritFd { + if m != nil { + return m.InheritFd + } + return nil +} + +func (m *CriuOpts) GetAutoExtMnt() bool { + if m != nil && m.AutoExtMnt != nil { + return *m.AutoExtMnt + } + return false +} + +func (m *CriuOpts) GetExtSharing() bool { + if m != nil && m.ExtSharing != nil { + return *m.ExtSharing + } + return false +} + +func (m *CriuOpts) GetExtMasters() bool { + if m != nil && m.ExtMasters != nil { + return *m.ExtMasters + } + return false +} + +func (m *CriuOpts) GetSkipMnt() []string { + if m != nil { + return m.SkipMnt + } + return nil +} + +func (m *CriuOpts) GetEnableFs() []string { + if m != nil { + return m.EnableFs + } + return nil +} + +func (m *CriuOpts) GetUnixSkIno() []*UnixSk { + if m != nil { + return m.UnixSkIno + } + return nil +} + +func (m *CriuOpts) GetManageCgroupsMode() CriuCgMode { + if m != nil && m.ManageCgroupsMode != nil { + return *m.ManageCgroupsMode + } + return CriuCgMode_IGNORE +} + +func (m *CriuOpts) GetGhostLimit() uint32 { + if m != nil && m.GhostLimit != nil { + return *m.GhostLimit + } + return Default_CriuOpts_GhostLimit +} + +func (m *CriuOpts) GetIrmapScanPaths() []string { + if m != nil { + return m.IrmapScanPaths + } + return nil +} + +func (m *CriuOpts) GetExternal() []string { + if m != nil { + return m.External + } + return nil +} + +func (m *CriuOpts) GetEmptyNs() uint32 { + if m != nil && m.EmptyNs != nil { + return *m.EmptyNs + } + return 0 +} + +func (m *CriuOpts) GetJoinNs() []*JoinNamespace { + if m != nil { + return m.JoinNs + } + return nil +} + +func (m *CriuOpts) GetCgroupProps() string { + if m != nil && m.CgroupProps != nil { + return *m.CgroupProps + } + return "" +} + +func (m *CriuOpts) GetCgroupPropsFile() string { + if m != nil && m.CgroupPropsFile != nil { + return *m.CgroupPropsFile + } + return "" +} + +func (m *CriuOpts) GetCgroupDumpController() []string { + if m != nil { + return m.CgroupDumpController + } + return nil +} + +func (m *CriuOpts) GetFreezeCgroup() string { + if m != nil && m.FreezeCgroup != nil { + return *m.FreezeCgroup + } + return "" +} + +func (m *CriuOpts) GetTimeout() uint32 { + if m != nil && m.Timeout != nil { + return *m.Timeout + } + return 0 +} + +func (m *CriuOpts) GetTcpSkipInFlight() bool { + if m != nil && m.TcpSkipInFlight != nil { + return *m.TcpSkipInFlight + } + return false +} + +func (m *CriuOpts) GetWeakSysctls() bool { + if m != nil && m.WeakSysctls != nil { + return *m.WeakSysctls + } + return false +} + +func (m *CriuOpts) GetLazyPages() bool { + if m != nil && m.LazyPages != nil { + return *m.LazyPages + } + return false +} + +func (m *CriuOpts) GetStatusFd() int32 { + if m != nil && m.StatusFd != nil { + return *m.StatusFd + } + return 0 +} + +func (m *CriuOpts) GetOrphanPtsMaster() bool { + if m != nil && m.OrphanPtsMaster != nil { + return *m.OrphanPtsMaster + } + return false +} + +type CriuDumpResp struct { + Restored *bool `protobuf:"varint,1,opt,name=restored" json:"restored,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuDumpResp) Reset() { *m = CriuDumpResp{} } +func (m *CriuDumpResp) String() string { return proto.CompactTextString(m) } +func (*CriuDumpResp) ProtoMessage() {} +func (*CriuDumpResp) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{8} } + +func (m *CriuDumpResp) GetRestored() bool { + if m != nil && m.Restored != nil { + return *m.Restored + } + return false +} + +type CriuRestoreResp struct { + Pid *int32 `protobuf:"varint,1,req,name=pid" json:"pid,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuRestoreResp) Reset() { *m = CriuRestoreResp{} } +func (m *CriuRestoreResp) String() string { return proto.CompactTextString(m) } +func (*CriuRestoreResp) ProtoMessage() {} +func (*CriuRestoreResp) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{9} } + +func (m *CriuRestoreResp) GetPid() int32 { + if m != nil && m.Pid != nil { + return *m.Pid + } + return 0 +} + +type CriuNotify struct { + Script *string `protobuf:"bytes,1,opt,name=script" json:"script,omitempty"` + Pid *int32 `protobuf:"varint,2,opt,name=pid" json:"pid,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuNotify) Reset() { *m = CriuNotify{} } +func (m *CriuNotify) String() string { return proto.CompactTextString(m) } +func (*CriuNotify) ProtoMessage() {} +func (*CriuNotify) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{10} } + +func (m *CriuNotify) GetScript() string { + if m != nil && m.Script != nil { + return *m.Script + } + return "" +} + +func (m *CriuNotify) GetPid() int32 { + if m != nil && m.Pid != nil { + return *m.Pid + } + return 0 +} + +// +// List of features which can queried via +// CRIU_REQ_TYPE__FEATURE_CHECK +type CriuFeatures struct { + MemTrack *bool `protobuf:"varint,1,opt,name=mem_track" json:"mem_track,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuFeatures) Reset() { *m = CriuFeatures{} } +func (m *CriuFeatures) String() string { return proto.CompactTextString(m) } +func (*CriuFeatures) ProtoMessage() {} +func (*CriuFeatures) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{11} } + +func (m *CriuFeatures) GetMemTrack() bool { + if m != nil && m.MemTrack != nil { + return *m.MemTrack + } + return false +} + +type CriuReq struct { + Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"` + Opts *CriuOpts `protobuf:"bytes,2,opt,name=opts" json:"opts,omitempty"` + NotifySuccess *bool `protobuf:"varint,3,opt,name=notify_success" json:"notify_success,omitempty"` + // + // When set service won't close the connection but + // will wait for more req-s to appear. Works not + // for all request types. + KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open" json:"keep_open,omitempty"` + // + // 'features' can be used to query which features + // are supported by the installed criu/kernel + // via RPC. + Features *CriuFeatures `protobuf:"bytes,5,opt,name=features" json:"features,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuReq) Reset() { *m = CriuReq{} } +func (m *CriuReq) String() string { return proto.CompactTextString(m) } +func (*CriuReq) ProtoMessage() {} +func (*CriuReq) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{12} } + +func (m *CriuReq) GetType() CriuReqType { + if m != nil && m.Type != nil { + return *m.Type + } + return CriuReqType_EMPTY +} + +func (m *CriuReq) GetOpts() *CriuOpts { + if m != nil { + return m.Opts + } + return nil +} + +func (m *CriuReq) GetNotifySuccess() bool { + if m != nil && m.NotifySuccess != nil { + return *m.NotifySuccess + } + return false +} + +func (m *CriuReq) GetKeepOpen() bool { + if m != nil && m.KeepOpen != nil { + return *m.KeepOpen + } + return false +} + +func (m *CriuReq) GetFeatures() *CriuFeatures { + if m != nil { + return m.Features + } + return nil +} + +type CriuResp struct { + Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"` + Success *bool `protobuf:"varint,2,req,name=success" json:"success,omitempty"` + Dump *CriuDumpResp `protobuf:"bytes,3,opt,name=dump" json:"dump,omitempty"` + Restore *CriuRestoreResp `protobuf:"bytes,4,opt,name=restore" json:"restore,omitempty"` + Notify *CriuNotify `protobuf:"bytes,5,opt,name=notify" json:"notify,omitempty"` + Ps *CriuPageServerInfo `protobuf:"bytes,6,opt,name=ps" json:"ps,omitempty"` + CrErrno *int32 `protobuf:"varint,7,opt,name=cr_errno" json:"cr_errno,omitempty"` + Features *CriuFeatures `protobuf:"bytes,8,opt,name=features" json:"features,omitempty"` + CrErrmsg *string `protobuf:"bytes,9,opt,name=cr_errmsg" json:"cr_errmsg,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuResp) Reset() { *m = CriuResp{} } +func (m *CriuResp) String() string { return proto.CompactTextString(m) } +func (*CriuResp) ProtoMessage() {} +func (*CriuResp) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{13} } + +func (m *CriuResp) GetType() CriuReqType { + if m != nil && m.Type != nil { + return *m.Type + } + return CriuReqType_EMPTY +} + +func (m *CriuResp) GetSuccess() bool { + if m != nil && m.Success != nil { + return *m.Success + } + return false +} + +func (m *CriuResp) GetDump() *CriuDumpResp { + if m != nil { + return m.Dump + } + return nil +} + +func (m *CriuResp) GetRestore() *CriuRestoreResp { + if m != nil { + return m.Restore + } + return nil +} + +func (m *CriuResp) GetNotify() *CriuNotify { + if m != nil { + return m.Notify + } + return nil +} + +func (m *CriuResp) GetPs() *CriuPageServerInfo { + if m != nil { + return m.Ps + } + return nil +} + +func (m *CriuResp) GetCrErrno() int32 { + if m != nil && m.CrErrno != nil { + return *m.CrErrno + } + return 0 +} + +func (m *CriuResp) GetFeatures() *CriuFeatures { + if m != nil { + return m.Features + } + return nil +} + +func (m *CriuResp) GetCrErrmsg() string { + if m != nil && m.CrErrmsg != nil { + return *m.CrErrmsg + } + return "" +} + +func init() { + proto.RegisterType((*CriuPageServerInfo)(nil), "criu_page_server_info") + proto.RegisterType((*CriuVethPair)(nil), "criu_veth_pair") + proto.RegisterType((*ExtMountMap)(nil), "ext_mount_map") + proto.RegisterType((*JoinNamespace)(nil), "join_namespace") + proto.RegisterType((*InheritFd)(nil), "inherit_fd") + proto.RegisterType((*CgroupRoot)(nil), "cgroup_root") + proto.RegisterType((*UnixSk)(nil), "unix_sk") + proto.RegisterType((*CriuOpts)(nil), "criu_opts") + proto.RegisterType((*CriuDumpResp)(nil), "criu_dump_resp") + proto.RegisterType((*CriuRestoreResp)(nil), "criu_restore_resp") + proto.RegisterType((*CriuNotify)(nil), "criu_notify") + proto.RegisterType((*CriuFeatures)(nil), "criu_features") + proto.RegisterType((*CriuReq)(nil), "criu_req") + proto.RegisterType((*CriuResp)(nil), "criu_resp") + proto.RegisterEnum("CriuCgMode", CriuCgMode_name, CriuCgMode_value) + proto.RegisterEnum("CriuReqType", CriuReqType_name, CriuReqType_value) +} + +func init() { proto.RegisterFile("criurpc.proto", fileDescriptor0) } + +var fileDescriptor0 = []byte{ + // 1297 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x09, 0x6e, 0x88, 0x02, 0xff, 0x7c, 0x56, 0xdb, 0x77, 0xd3, 0xc6, + 0x13, 0xfe, 0xc5, 0x71, 0x7c, 0x59, 0x5f, 0x22, 0x04, 0x84, 0xe5, 0x9e, 0x9f, 0x28, 0x6d, 0x49, + 0x5b, 0x03, 0x3e, 0x5c, 0x0a, 0x4f, 0xe5, 0x04, 0x87, 0xe6, 0x14, 0x62, 0x1f, 0xc7, 0xe9, 0x39, + 0x3c, 0xed, 0x11, 0xd2, 0xda, 0x5e, 0x2c, 0x69, 0xd5, 0xdd, 0x95, 0x21, 0xfc, 0x03, 0x7d, 0xec, + 0x63, 0x1f, 0xfb, 0xaf, 0x76, 0x76, 0x24, 0x05, 0x41, 0x39, 0xbc, 0x80, 0x35, 0x9a, 0xcb, 0x37, + 0xdf, 0xcc, 0x7c, 0x0a, 0xe9, 0x05, 0x4a, 0x64, 0x2a, 0x0d, 0x06, 0xa9, 0x92, 0x46, 0x7a, 0x63, + 0x72, 0xd1, 0x1a, 0x58, 0xea, 0x2f, 0x38, 0xd3, 0x5c, 0xad, 0xb9, 0x62, 0x22, 0x99, 0x4b, 0x77, + 0x9b, 0x34, 0xfd, 0x30, 0x54, 0x5c, 0x6b, 0xba, 0xb1, 0xbb, 0xf1, 0x7d, 0xdb, 0xed, 0x92, 0x7a, + 0x2a, 0x95, 0xa1, 0x35, 0x78, 0xda, 0x72, 0x3b, 0x64, 0x33, 0x15, 0x21, 0xdd, 0xc4, 0x07, 0x42, + 0x6a, 0xf3, 0x90, 0xd6, 0xed, 0x6f, 0xef, 0x2e, 0xe9, 0x63, 0xc2, 0x35, 0x37, 0x4b, 0xc8, 0x2a, + 0x94, 0xdb, 0x23, 0x5b, 0x62, 0x0e, 0x49, 0x21, 0x4f, 0x0d, 0xf2, 0xf4, 0x49, 0x03, 0x1e, 0x65, + 0x66, 0x33, 0xc1, 0xb3, 0x77, 0x87, 0xf4, 0xf8, 0x7b, 0xc3, 0x62, 0x99, 0x25, 0xf0, 0xaf, 0x9f, + 0xda, 0xd4, 0x2b, 0x7e, 0x5a, 0x78, 0xc3, 0xc3, 0xda, 0x8f, 0x0a, 0xd7, 0x5f, 0x48, 0xff, 0xad, + 0x14, 0x09, 0x4b, 0xfc, 0x98, 0xeb, 0xd4, 0x0f, 0xb8, 0xad, 0x9c, 0xe8, 0xc2, 0x15, 0x10, 0x27, + 0x9a, 0xcd, 0x45, 0xc4, 0x73, 0x77, 0xf7, 0x1c, 0x69, 0x43, 0x66, 0xe5, 0x33, 0x99, 0x1a, 0x44, + 0xda, 0xf6, 0x6e, 0x13, 0x22, 0x92, 0x25, 0x57, 0xc2, 0xb0, 0x79, 0xf8, 0x69, 0xa5, 0xbc, 0x09, + 0x1b, 0xb9, 0x05, 0x98, 0x3a, 0xc1, 0x42, 0xc9, 0x2c, 0x65, 0x4a, 0x4a, 0x63, 0x5b, 0x0f, 0x8c, + 0x8a, 0x2a, 0x44, 0xf8, 0x66, 0x59, 0x60, 0xa2, 0xa4, 0x99, 0x25, 0xe2, 0x3d, 0xd3, 0x2b, 0x6c, + 0x34, 0x91, 0x21, 0xc7, 0x84, 0x3d, 0xef, 0xef, 0x36, 0x69, 0x23, 0x15, 0x50, 0x5e, 0xbb, 0x17, + 0x49, 0x4f, 0xc4, 0x40, 0xb2, 0x66, 0xa1, 0x50, 0x50, 0x1c, 0x9d, 0xce, 0x78, 0xcc, 0x49, 0x05, + 0x9f, 0x88, 0xfb, 0x6b, 0xce, 0x54, 0x96, 0x24, 0x22, 0x59, 0x20, 0xe8, 0x96, 0x7b, 0x9e, 0x74, + 0x2c, 0x43, 0x45, 0x19, 0xe4, 0xb9, 0xe5, 0x5e, 0x22, 0xdb, 0x26, 0x48, 0x19, 0xd7, 0xc6, 0x7f, + 0x13, 0x09, 0xbd, 0xe4, 0x21, 0xdd, 0x2a, 0x5f, 0xf0, 0xb5, 0xaf, 0x05, 0xa4, 0x09, 0xf9, 0x5a, + 0x04, 0x5c, 0xd3, 0x06, 0xbe, 0x00, 0x3a, 0xc0, 0x2d, 0x8a, 0xd8, 0x5b, 0xf9, 0x86, 0x36, 0xd1, + 0x04, 0x4d, 0x5b, 0xbe, 0x58, 0x24, 0x83, 0x95, 0xa6, 0x2d, 0xb4, 0x5d, 0x20, 0xed, 0x48, 0x2e, + 0x58, 0xc4, 0xd7, 0x3c, 0xa2, 0x6d, 0x8b, 0xeb, 0xe9, 0xc6, 0xd0, 0x75, 0x48, 0xcb, 0x5a, 0x91, + 0x5d, 0x82, 0x34, 0x78, 0xa4, 0x96, 0x6a, 0xda, 0x81, 0xdf, 0x9d, 0xe1, 0xce, 0xe0, 0xcb, 0x4b, + 0xb4, 0x43, 0xfa, 0x89, 0x34, 0x62, 0x7e, 0xca, 0x34, 0x38, 0x00, 0x0d, 0xb4, 0x8b, 0x35, 0x80, + 0x42, 0x4b, 0x2c, 0xed, 0x61, 0x26, 0x40, 0x91, 0xfa, 0x8a, 0xc3, 0xf8, 0x45, 0xbc, 0xa0, 0x7d, + 0xb4, 0x01, 0x58, 0x98, 0x5c, 0xb0, 0x62, 0x31, 0x8f, 0xe9, 0x76, 0x09, 0xd6, 0xcf, 0x8c, 0x84, + 0xae, 0xc2, 0x2c, 0xa5, 0x4e, 0x49, 0xcd, 0x3b, 0xa9, 0x56, 0x25, 0xa7, 0xe7, 0x90, 0x46, 0x70, + 0x8c, 0x44, 0xb2, 0x62, 0x8a, 0xc3, 0x3a, 0x51, 0x17, 0x1d, 0x6f, 0x90, 0x2d, 0xbb, 0x91, 0x9a, + 0x9e, 0xdf, 0xdd, 0x04, 0xc0, 0xdb, 0x83, 0xcf, 0x96, 0xf4, 0x2a, 0x69, 0x06, 0x69, 0xc6, 0x02, + 0x08, 0xb8, 0x00, 0x01, 0xbd, 0xa7, 0xe4, 0xc1, 0xf0, 0xc9, 0x83, 0x27, 0x8f, 0x1e, 0x0f, 0x9f, + 0x3c, 0xb4, 0x55, 0xe6, 0x52, 0x05, 0x9c, 0x09, 0x65, 0x33, 0x5e, 0xc4, 0x8c, 0xc0, 0x08, 0x7f, + 0xcf, 0x03, 0x16, 0xc4, 0x21, 0xdd, 0x81, 0xa4, 0x6d, 0xf7, 0x26, 0x69, 0xe2, 0x26, 0x27, 0x86, + 0x5e, 0xc2, 0x2a, 0xfd, 0xc1, 0xa7, 0x9b, 0x0d, 0x74, 0xc4, 0x7e, 0x62, 0x49, 0xca, 0xb7, 0x4b, + 0x53, 0x8a, 0xa9, 0xae, 0x43, 0xf1, 0x05, 0xae, 0x1a, 0xbd, 0x8c, 0x81, 0xdd, 0x41, 0x75, 0xfd, + 0xa0, 0xbc, 0xd2, 0x86, 0x69, 0x01, 0x83, 0x86, 0xa5, 0xb8, 0x82, 0x31, 0x37, 0xab, 0x9b, 0x4c, + 0xaf, 0x62, 0x58, 0x67, 0x50, 0x59, 0xee, 0x0b, 0xa4, 0x8b, 0x74, 0x95, 0x90, 0xae, 0x55, 0x77, + 0x49, 0x2f, 0x7d, 0x65, 0x73, 0x5d, 0xaf, 0x1a, 0x63, 0x5f, 0x1b, 0xae, 0x34, 0xbd, 0x51, 0xf6, + 0xa7, 0x57, 0x22, 0xc5, 0xd8, 0x9b, 0xd8, 0x9f, 0xbd, 0xa7, 0x04, 0xb6, 0x8d, 0xb3, 0xb9, 0xa6, + 0xbb, 0x68, 0xba, 0x4e, 0x3a, 0xc5, 0x5a, 0xc2, 0xc0, 0x25, 0xfd, 0x3f, 0xc2, 0x68, 0x0d, 0xca, + 0x8b, 0xd8, 0x23, 0xe7, 0x3f, 0x6d, 0x18, 0xc8, 0x80, 0xfb, 0xf0, 0xa0, 0x40, 0x7f, 0xd8, 0xcb, + 0x67, 0x00, 0x9d, 0x5b, 0xa3, 0x7b, 0x8d, 0x74, 0x16, 0x4b, 0x09, 0x7d, 0x46, 0x22, 0x16, 0x86, + 0xde, 0xc2, 0x29, 0x34, 0xef, 0xdf, 0x7b, 0xf0, 0xf3, 0xc3, 0xc7, 0x8f, 0x5c, 0x4a, 0x1c, 0x24, + 0x1f, 0x16, 0xc9, 0x4f, 0x98, 0xbd, 0x3f, 0x4d, 0xbf, 0x41, 0x08, 0x38, 0x07, 0xc0, 0x9d, 0x80, + 0x4c, 0xdc, 0x3e, 0xb3, 0xc4, 0xa9, 0x39, 0x65, 0x20, 0x0d, 0xdf, 0xda, 0x34, 0xee, 0x2e, 0x69, + 0xe6, 0xc2, 0xa1, 0xe9, 0x77, 0xc5, 0xfc, 0x3f, 0x13, 0x12, 0x60, 0xab, 0xa0, 0x1c, 0x74, 0x11, + 0x06, 0x73, 0x07, 0xb7, 0xf0, 0x32, 0x39, 0x57, 0xb5, 0xe6, 0xeb, 0xbf, 0x87, 0xaf, 0x6e, 0x90, + 0x9d, 0xe2, 0x55, 0x98, 0xc5, 0x29, 0x0b, 0x64, 0x62, 0x94, 0x8c, 0x22, 0xae, 0xe8, 0x0f, 0x08, + 0x02, 0x6e, 0x79, 0xae, 0x38, 0xff, 0x50, 0xb6, 0x4e, 0x7f, 0xc4, 0x30, 0x10, 0x29, 0x23, 0x62, + 0x6e, 0xe5, 0xef, 0x27, 0x84, 0x76, 0x85, 0xb8, 0xf6, 0x8e, 0x91, 0x6a, 0x40, 0x34, 0x8f, 0xc4, + 0x62, 0x69, 0xe8, 0xa0, 0x38, 0xc5, 0xee, 0x3b, 0xee, 0xaf, 0x98, 0x3e, 0xd5, 0x81, 0x89, 0x34, + 0xbd, 0x5b, 0xde, 0x41, 0xe4, 0x7f, 0x38, 0xc5, 0x6b, 0xd3, 0xf4, 0xde, 0xd9, 0x6d, 0x1b, 0xdf, + 0x64, 0xda, 0x2e, 0xc3, 0x7d, 0xbc, 0x02, 0xc0, 0x2e, 0x55, 0xba, 0xb4, 0x6c, 0x19, 0x5d, 0xcc, + 0x96, 0x0e, 0xad, 0xb7, 0xe7, 0x15, 0x1a, 0x8d, 0xc8, 0x41, 0xe2, 0x53, 0x4b, 0x19, 0xfc, 0x6f, + 0xa4, 0xe2, 0x21, 0xaa, 0x5c, 0xcb, 0xdb, 0x85, 0xd6, 0xad, 0x4f, 0x61, 0xce, 0xdd, 0x0a, 0xb5, + 0x42, 0xe9, 0xf2, 0xf6, 0x40, 0x24, 0xad, 0x47, 0x7e, 0xe1, 0x56, 0xd7, 0xf3, 0x23, 0x2f, 0x64, + 0xb2, 0xaa, 0x6c, 0x50, 0x11, 0xbf, 0x3b, 0x6c, 0xce, 0x01, 0x24, 0xa4, 0xb2, 0x80, 0xe1, 0xb2, + 0x19, 0xde, 0x78, 0x51, 0xf1, 0xaf, 0x0d, 0xd2, 0x2a, 0x4a, 0xfe, 0x01, 0xdb, 0x50, 0x37, 0xa7, + 0x69, 0x2e, 0xa5, 0x7d, 0x38, 0xa4, 0xf2, 0x05, 0xb3, 0x56, 0xd8, 0x86, 0xba, 0x15, 0x55, 0x4c, + 0xde, 0x19, 0x92, 0xc1, 0x47, 0x99, 0xad, 0x28, 0x4e, 0x16, 0x04, 0xf6, 0xeb, 0xb5, 0x59, 0x12, + 0xb4, 0xe2, 0x3c, 0x05, 0x27, 0x9e, 0x14, 0x0a, 0xba, 0x4b, 0x5a, 0x25, 0x1c, 0x94, 0xce, 0x4e, + 0x59, 0xa6, 0xb4, 0x7a, 0x7f, 0xd6, 0x0a, 0x05, 0xc7, 0xe6, 0xbf, 0x0e, 0x09, 0x06, 0x5b, 0x56, + 0xb4, 0x1f, 0x06, 0x7b, 0xd4, 0x75, 0xcb, 0x2f, 0xd6, 0x3f, 0x13, 0x9c, 0x8f, 0x8c, 0xdf, 0x22, + 0xcd, 0x82, 0x5a, 0x84, 0xd3, 0x19, 0xba, 0x83, 0xff, 0xf2, 0x7d, 0x8d, 0x34, 0xf2, 0x6e, 0x0a, + 0x80, 0xdd, 0x41, 0x95, 0xf1, 0x5c, 0x81, 0x1b, 0x5f, 0x55, 0x60, 0xc7, 0x72, 0xca, 0xb8, 0x52, + 0x70, 0x9d, 0x4d, 0xdc, 0x8b, 0x6a, 0xdb, 0xad, 0x2f, 0xb5, 0x6d, 0xb9, 0xca, 0x63, 0x62, 0xbd, + 0xc0, 0x2f, 0x40, 0x7b, 0x8f, 0xc1, 0x79, 0x54, 0x8f, 0x95, 0x90, 0xc6, 0xe1, 0x8b, 0xa3, 0xf1, + 0x74, 0xe4, 0xfc, 0x0f, 0x06, 0xdd, 0xdc, 0x7f, 0xc1, 0x8e, 0xc6, 0x47, 0x23, 0x67, 0xc3, 0x6d, + 0x93, 0xad, 0xc9, 0x74, 0x3c, 0x39, 0x76, 0x6a, 0x6e, 0x8b, 0xd4, 0x8f, 0xc7, 0x07, 0x33, 0x67, + 0xd3, 0xfe, 0x3a, 0x38, 0x79, 0xf9, 0xd2, 0xa9, 0xdb, 0xb8, 0xe3, 0xd9, 0xf4, 0x70, 0x7f, 0xe6, + 0xd8, 0x4f, 0x5f, 0xf3, 0xf9, 0xe8, 0xe0, 0xd9, 0xc9, 0xcb, 0x99, 0xd3, 0xd8, 0xfb, 0x67, 0xa3, + 0xd8, 0x90, 0x33, 0x42, 0x21, 0xd3, 0xe8, 0xd5, 0x64, 0xf6, 0x1a, 0x2a, 0x40, 0xfc, 0xf3, 0x93, + 0x57, 0x13, 0x48, 0x0f, 0x31, 0xd3, 0xd1, 0xf1, 0xcc, 0x16, 0xae, 0x59, 0x8f, 0xfd, 0x5f, 0x47, + 0xfb, 0xbf, 0x41, 0x85, 0x2e, 0x69, 0x4d, 0xa6, 0x23, 0x86, 0x5e, 0x75, 0x98, 0x45, 0x67, 0xf2, + 0xec, 0xc5, 0x88, 0x1d, 0x8f, 0xa6, 0xbf, 0x8f, 0xa6, 0x8e, 0xfd, 0x03, 0xa5, 0x71, 0x34, 0x9e, + 0x1d, 0x1e, 0xbc, 0x76, 0x1a, 0xc0, 0x48, 0x77, 0x7f, 0x72, 0x72, 0x78, 0x74, 0x30, 0xce, 0xdd, + 0x9b, 0xd0, 0x6f, 0xaf, 0xb4, 0xe4, 0xf9, 0xec, 0xba, 0xf4, 0x0e, 0x46, 0xcf, 0x66, 0x27, 0x90, + 0x33, 0x37, 0xb5, 0xff, 0x0d, 0x00, 0x00, 0xff, 0xff, 0xed, 0xcb, 0xc7, 0x95, 0x39, 0x09, 0x00, + 0x00, +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.proto b/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.proto new file mode 100644 index 00000000..f924b735 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.proto @@ -0,0 +1,195 @@ +syntax = "proto2"; + +message criu_page_server_info { + optional string address = 1; + optional int32 port = 2; + optional int32 pid = 3; + optional int32 fd = 4; +} + +message criu_veth_pair { + required string if_in = 1; + required string if_out = 2; +}; + +message ext_mount_map { + required string key = 1; + required string val = 2; +}; + +message join_namespace { + required string ns = 1; + required string ns_file = 2; + optional string extra_opt = 3; +} + +message inherit_fd { + required string key = 1; + required int32 fd = 2; +}; + +message cgroup_root { + optional string ctrl = 1; + required string path = 2; +}; + +message unix_sk { + required uint32 inode = 1; +}; + +enum criu_cg_mode { + IGNORE = 0; + CG_NONE = 1; + PROPS = 2; + SOFT = 3; + FULL = 4; + STRICT = 5; + DEFAULT = 6; +}; + +message criu_opts { + required int32 images_dir_fd = 1; + optional int32 pid = 2; /* if not set on dump, will dump requesting process */ + + optional bool leave_running = 3; + optional bool ext_unix_sk = 4; + optional bool tcp_established = 5; + optional bool evasive_devices = 6; + optional bool shell_job = 7; + optional bool file_locks = 8; + optional int32 log_level = 9 [default = 2]; + optional string log_file = 10; /* No subdirs are allowed. Consider using work-dir */ + + optional criu_page_server_info ps = 11; + + optional bool notify_scripts = 12; + + optional string root = 13; + optional string parent_img = 14; + optional bool track_mem = 15; + optional bool auto_dedup = 16; + + optional int32 work_dir_fd = 17; + optional bool link_remap = 18; + repeated criu_veth_pair veths = 19; /* DEPRECATED, use external instead */ + + optional uint32 cpu_cap = 20 [default = 0xffffffff]; + optional bool force_irmap = 21; + repeated string exec_cmd = 22; + + repeated ext_mount_map ext_mnt = 23; /* DEPRECATED, use external instead */ + optional bool manage_cgroups = 24; /* backward compatibility */ + repeated cgroup_root cg_root = 25; + + optional bool rst_sibling = 26; /* swrk only */ + repeated inherit_fd inherit_fd = 27; /* swrk only */ + + optional bool auto_ext_mnt = 28; + optional bool ext_sharing = 29; + optional bool ext_masters = 30; + + repeated string skip_mnt = 31; + repeated string enable_fs = 32; + + repeated unix_sk unix_sk_ino = 33; /* DEPRECATED, use external instead */ + + optional criu_cg_mode manage_cgroups_mode = 34; + optional uint32 ghost_limit = 35 [default = 0x100000]; + repeated string irmap_scan_paths = 36; + repeated string external = 37; + optional uint32 empty_ns = 38; + repeated join_namespace join_ns = 39; + + optional string cgroup_props = 41; + optional string cgroup_props_file = 42; + repeated string cgroup_dump_controller = 43; + + optional string freeze_cgroup = 44; + optional uint32 timeout = 45; + optional bool tcp_skip_in_flight = 46; + optional bool weak_sysctls = 47; + optional bool lazy_pages = 48; + optional int32 status_fd = 49; + optional bool orphan_pts_master = 50; +} + +message criu_dump_resp { + optional bool restored = 1; +} + +message criu_restore_resp { + required int32 pid = 1; +} + +message criu_notify { + optional string script = 1; + optional int32 pid = 2; +} + +enum criu_req_type { + EMPTY = 0; + DUMP = 1; + RESTORE = 2; + CHECK = 3; + PRE_DUMP = 4; + PAGE_SERVER = 5; + + NOTIFY = 6; + + CPUINFO_DUMP = 7; + CPUINFO_CHECK = 8; + + FEATURE_CHECK = 9; +} + +/* + * List of features which can queried via + * CRIU_REQ_TYPE__FEATURE_CHECK + */ +message criu_features { + optional bool mem_track = 1; +} + +/* + * Request -- each type corresponds to must-be-there + * request arguments of respective type + */ + +message criu_req { + required criu_req_type type = 1; + + optional criu_opts opts = 2; + optional bool notify_success = 3; + + /* + * When set service won't close the connection but + * will wait for more req-s to appear. Works not + * for all request types. + */ + optional bool keep_open = 4; + /* + * 'features' can be used to query which features + * are supported by the installed criu/kernel + * via RPC. + */ + optional criu_features features = 5; +} + +/* + * Response -- it states whether the request was served + * and additional request-specific information + */ + +message criu_resp { + required criu_req_type type = 1; + required bool success = 2; + + optional criu_dump_resp dump = 3; + optional criu_restore_resp restore = 4; + optional criu_notify notify = 5; + optional criu_page_server_info ps = 6; + + optional int32 cr_errno = 7; + optional criu_features features = 8; + optional string cr_errmsg = 9; +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/error.go b/vendor/github.com/opencontainers/runc/libcontainer/error.go new file mode 100644 index 00000000..21a3789b --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/error.go @@ -0,0 +1,70 @@ +package libcontainer + +import "io" + +// ErrorCode is the API error code type. +type ErrorCode int + +// API error codes. +const ( + // Factory errors + IdInUse ErrorCode = iota + InvalidIdFormat + + // Container errors + ContainerNotExists + ContainerPaused + ContainerNotStopped + ContainerNotRunning + ContainerNotPaused + + // Process errors + NoProcessOps + + // Common errors + ConfigInvalid + ConsoleExists + SystemError +) + +func (c ErrorCode) String() string { + switch c { + case IdInUse: + return "Id already in use" + case InvalidIdFormat: + return "Invalid format" + case ContainerPaused: + return "Container paused" + case ConfigInvalid: + return "Invalid configuration" + case SystemError: + return "System error" + case ContainerNotExists: + return "Container does not exist" + case ContainerNotStopped: + return "Container is not stopped" + case ContainerNotRunning: + return "Container is not running" + case ConsoleExists: + return "Console exists for process" + case ContainerNotPaused: + return "Container is not paused" + case NoProcessOps: + return "No process operations" + default: + return "Unknown error" + } +} + +// Error is the API error type. +type Error interface { + error + + // Returns an error if it failed to write the detail of the Error to w. + // The detail of the Error may include the error message and a + // representation of the stack trace. + Detail(w io.Writer) error + + // Returns the error code for this error. + Code() ErrorCode +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/factory.go b/vendor/github.com/opencontainers/runc/libcontainer/factory.go new file mode 100644 index 00000000..0986cd77 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/factory.go @@ -0,0 +1,44 @@ +package libcontainer + +import ( + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Factory interface { + // Creates a new container with the given id and starts the initial process inside it. + // id must be a string containing only letters, digits and underscores and must contain + // between 1 and 1024 characters, inclusive. + // + // The id must not already be in use by an existing container. Containers created using + // a factory with the same path (and filesystem) must have distinct ids. + // + // Returns the new container with a running process. + // + // errors: + // IdInUse - id is already in use by a container + // InvalidIdFormat - id has incorrect format + // ConfigInvalid - config is invalid + // Systemerror - System error + // + // On error, any partially created container parts are cleaned up (the operation is atomic). + Create(id string, config *configs.Config) (Container, error) + + // Load takes an ID for an existing container and returns the container information + // from the state. This presents a read only view of the container. + // + // errors: + // Path does not exist + // System error + Load(id string) (Container, error) + + // StartInitialization is an internal API to libcontainer used during the reexec of the + // container. + // + // Errors: + // Pipe connection error + // System error + StartInitialization() error + + // Type returns info string about factory type (e.g. lxc, libcontainer...) + Type() string +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go new file mode 100644 index 00000000..42b6f5a0 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go @@ -0,0 +1,325 @@ +// +build linux + +package libcontainer + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "regexp" + "runtime/debug" + "strconv" + + "github.com/docker/docker/pkg/mount" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/cgroups/rootless" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/configs/validate" + "github.com/opencontainers/runc/libcontainer/utils" + + "golang.org/x/sys/unix" +) + +const ( + stateFilename = "state.json" + execFifoFilename = "exec.fifo" +) + +var idRegex = regexp.MustCompile(`^[\w+-\.]+$`) + +// InitArgs returns an options func to configure a LinuxFactory with the +// provided init binary path and arguments. +func InitArgs(args ...string) func(*LinuxFactory) error { + return func(l *LinuxFactory) (err error) { + if len(args) > 0 { + // Resolve relative paths to ensure that its available + // after directory changes. + if args[0], err = filepath.Abs(args[0]); err != nil { + return newGenericError(err, ConfigInvalid) + } + } + + l.InitArgs = args + return nil + } +} + +// SystemdCgroups is an options func to configure a LinuxFactory to return +// containers that use systemd to create and manage cgroups. +func SystemdCgroups(l *LinuxFactory) error { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &systemd.Manager{ + Cgroups: config, + Paths: paths, + } + } + return nil +} + +// Cgroupfs is an options func to configure a LinuxFactory to return +// containers that use the native cgroups filesystem implementation to +// create and manage cgroups. +func Cgroupfs(l *LinuxFactory) error { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &fs.Manager{ + Cgroups: config, + Paths: paths, + } + } + return nil +} + +// RootlessCgroups is an options func to configure a LinuxFactory to +// return containers that use the "rootless" cgroup manager, which will +// fail to do any operations not possible to do with an unprivileged user. +// It should only be used in conjunction with rootless containers. +func RootlessCgroups(l *LinuxFactory) error { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &rootless.Manager{ + Cgroups: config, + Paths: paths, + } + } + return nil +} + +// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs. +func TmpfsRoot(l *LinuxFactory) error { + mounted, err := mount.Mounted(l.Root) + if err != nil { + return err + } + if !mounted { + if err := unix.Mount("tmpfs", l.Root, "tmpfs", 0, ""); err != nil { + return err + } + } + return nil +} + +// CriuPath returns an option func to configure a LinuxFactory with the +// provided criupath +func CriuPath(criupath string) func(*LinuxFactory) error { + return func(l *LinuxFactory) error { + l.CriuPath = criupath + return nil + } +} + +// New returns a linux based container factory based in the root directory and +// configures the factory with the provided option funcs. +func New(root string, options ...func(*LinuxFactory) error) (Factory, error) { + if root != "" { + if err := os.MkdirAll(root, 0700); err != nil { + return nil, newGenericError(err, SystemError) + } + } + l := &LinuxFactory{ + Root: root, + InitArgs: []string{"/proc/self/exe", "init"}, + Validator: validate.New(), + CriuPath: "criu", + } + Cgroupfs(l) + for _, opt := range options { + if err := opt(l); err != nil { + return nil, err + } + } + return l, nil +} + +// LinuxFactory implements the default factory interface for linux based systems. +type LinuxFactory struct { + // Root directory for the factory to store state. + Root string + + // InitArgs are arguments for calling the init responsibilities for spawning + // a container. + InitArgs []string + + // CriuPath is the path to the criu binary used for checkpoint and restore of + // containers. + CriuPath string + + // Validator provides validation to container configurations. + Validator validate.Validator + + // NewCgroupsManager returns an initialized cgroups manager for a single container. + NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager +} + +func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) { + if l.Root == "" { + return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid) + } + if err := l.validateID(id); err != nil { + return nil, err + } + if err := l.Validator.Validate(config); err != nil { + return nil, newGenericError(err, ConfigInvalid) + } + containerRoot := filepath.Join(l.Root, id) + if _, err := os.Stat(containerRoot); err == nil { + return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse) + } else if !os.IsNotExist(err) { + return nil, newGenericError(err, SystemError) + } + if err := os.MkdirAll(containerRoot, 0711); err != nil { + return nil, newGenericError(err, SystemError) + } + if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil { + return nil, newGenericError(err, SystemError) + } + if config.Rootless { + RootlessCgroups(l) + } + c := &linuxContainer{ + id: id, + root: containerRoot, + config: config, + initArgs: l.InitArgs, + criuPath: l.CriuPath, + cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), + } + c.state = &stoppedState{c: c} + return c, nil +} + +func (l *LinuxFactory) Load(id string) (Container, error) { + if l.Root == "" { + return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid) + } + containerRoot := filepath.Join(l.Root, id) + state, err := l.loadState(containerRoot, id) + if err != nil { + return nil, err + } + r := &nonChildProcess{ + processPid: state.InitProcessPid, + processStartTime: state.InitProcessStartTime, + fds: state.ExternalDescriptors, + } + // We have to use the RootlessManager. + if state.Rootless { + RootlessCgroups(l) + } + c := &linuxContainer{ + initProcess: r, + initProcessStartTime: state.InitProcessStartTime, + id: id, + config: &state.Config, + initArgs: l.InitArgs, + criuPath: l.CriuPath, + cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), + root: containerRoot, + created: state.Created, + } + c.state = &loadedState{c: c} + if err := c.refreshState(); err != nil { + return nil, err + } + return c, nil +} + +func (l *LinuxFactory) Type() string { + return "libcontainer" +} + +// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state +// This is a low level implementation detail of the reexec and should not be consumed externally +func (l *LinuxFactory) StartInitialization() (err error) { + var ( + pipefd, rootfd int + consoleSocket *os.File + envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE") + envStateDir = os.Getenv("_LIBCONTAINER_STATEDIR") + envConsole = os.Getenv("_LIBCONTAINER_CONSOLE") + ) + + // Get the INITPIPE. + pipefd, err = strconv.Atoi(envInitPipe) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err) + } + + var ( + pipe = os.NewFile(uintptr(pipefd), "pipe") + it = initType(os.Getenv("_LIBCONTAINER_INITTYPE")) + ) + defer pipe.Close() + + // Only init processes have STATEDIR. + rootfd = -1 + if it == initStandard { + if rootfd, err = strconv.Atoi(envStateDir); err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err) + } + } + + if envConsole != "" { + console, err := strconv.Atoi(envConsole) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err) + } + consoleSocket = os.NewFile(uintptr(console), "console-socket") + defer consoleSocket.Close() + } + + // clear the current process's environment to clean any libcontainer + // specific env vars. + os.Clearenv() + + defer func() { + // We have an error during the initialization of the container's init, + // send it back to the parent process in the form of an initError. + if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil { + fmt.Fprintln(os.Stderr, err) + return + } + if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil { + fmt.Fprintln(os.Stderr, err) + return + } + }() + defer func() { + if e := recover(); e != nil { + err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack())) + } + }() + + i, err := newContainerInit(it, pipe, consoleSocket, rootfd) + if err != nil { + return err + } + + // If Init succeeds, syscall.Exec will not return, hence none of the defers will be called. + return i.Init() +} + +func (l *LinuxFactory) loadState(root, id string) (*State, error) { + f, err := os.Open(filepath.Join(root, stateFilename)) + if err != nil { + if os.IsNotExist(err) { + return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists) + } + return nil, newGenericError(err, SystemError) + } + defer f.Close() + var state *State + if err := json.NewDecoder(f).Decode(&state); err != nil { + return nil, newGenericError(err, SystemError) + } + return state, nil +} + +func (l *LinuxFactory) validateID(id string) error { + if !idRegex.MatchString(id) { + return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat) + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go b/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go new file mode 100644 index 00000000..6e7de2fe --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go @@ -0,0 +1,92 @@ +package libcontainer + +import ( + "fmt" + "io" + "text/template" + "time" + + "github.com/opencontainers/runc/libcontainer/stacktrace" +) + +var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}} +Code: {{.ECode}} +{{if .Message }} +Message: {{.Message}} +{{end}} +Frames:{{range $i, $frame := .Stack.Frames}} +--- +{{$i}}: {{$frame.Function}} +Package: {{$frame.Package}} +File: {{$frame.File}}@{{$frame.Line}}{{end}} +`)) + +func newGenericError(err error, c ErrorCode) Error { + if le, ok := err.(Error); ok { + return le + } + gerr := &genericError{ + Timestamp: time.Now(), + Err: err, + ECode: c, + Stack: stacktrace.Capture(1), + } + if err != nil { + gerr.Message = err.Error() + } + return gerr +} + +func newSystemError(err error) Error { + return createSystemError(err, "") +} + +func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error { + return createSystemError(err, fmt.Sprintf(cause, v...)) +} + +func newSystemErrorWithCause(err error, cause string) Error { + return createSystemError(err, cause) +} + +// createSystemError creates the specified error with the correct number of +// stack frames skipped. This is only to be called by the other functions for +// formatting the error. +func createSystemError(err error, cause string) Error { + gerr := &genericError{ + Timestamp: time.Now(), + Err: err, + ECode: SystemError, + Cause: cause, + Stack: stacktrace.Capture(2), + } + if err != nil { + gerr.Message = err.Error() + } + return gerr +} + +type genericError struct { + Timestamp time.Time + ECode ErrorCode + Err error `json:"-"` + Cause string + Message string + Stack stacktrace.Stacktrace +} + +func (e *genericError) Error() string { + if e.Cause == "" { + return e.Message + } + frame := e.Stack.Frames[0] + return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message) +} + +func (e *genericError) Code() ErrorCode { + return e.ECode +} + +func (e *genericError) Detail(w io.Writer) error { + return errorTemplate.Execute(w, e) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go new file mode 100644 index 00000000..65bee0be --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go @@ -0,0 +1,502 @@ +// +build linux + +package libcontainer + +import ( + "encoding/json" + "fmt" + "io" + "net" + "os" + "strings" + "syscall" // only for Errno + "unsafe" + + "github.com/Sirupsen/logrus" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/user" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/vishvananda/netlink" + + "golang.org/x/sys/unix" +) + +type initType string + +const ( + initSetns initType = "setns" + initStandard initType = "standard" +) + +type pid struct { + Pid int `json:"pid"` +} + +// network is an internal struct used to setup container networks. +type network struct { + configs.Network + + // TempVethPeerName is a unique temporary veth peer name that was placed into + // the container's namespace. + TempVethPeerName string `json:"temp_veth_peer_name"` +} + +// initConfig is used for transferring parameters from Exec() to Init() +type initConfig struct { + Args []string `json:"args"` + Env []string `json:"env"` + Cwd string `json:"cwd"` + Capabilities *configs.Capabilities `json:"capabilities"` + ProcessLabel string `json:"process_label"` + AppArmorProfile string `json:"apparmor_profile"` + NoNewPrivileges bool `json:"no_new_privileges"` + User string `json:"user"` + AdditionalGroups []string `json:"additional_groups"` + Config *configs.Config `json:"config"` + Networks []*network `json:"network"` + PassedFilesCount int `json:"passed_files_count"` + ContainerId string `json:"containerid"` + Rlimits []configs.Rlimit `json:"rlimits"` + CreateConsole bool `json:"create_console"` + Rootless bool `json:"rootless"` +} + +type initer interface { + Init() error +} + +func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD int) (initer, error) { + var config *initConfig + if err := json.NewDecoder(pipe).Decode(&config); err != nil { + return nil, err + } + if err := populateProcessEnvironment(config.Env); err != nil { + return nil, err + } + switch t { + case initSetns: + return &linuxSetnsInit{ + pipe: pipe, + consoleSocket: consoleSocket, + config: config, + }, nil + case initStandard: + return &linuxStandardInit{ + pipe: pipe, + consoleSocket: consoleSocket, + parentPid: unix.Getppid(), + config: config, + stateDirFD: stateDirFD, + }, nil + } + return nil, fmt.Errorf("unknown init type %q", t) +} + +// populateProcessEnvironment loads the provided environment variables into the +// current processes's environment. +func populateProcessEnvironment(env []string) error { + for _, pair := range env { + p := strings.SplitN(pair, "=", 2) + if len(p) < 2 { + return fmt.Errorf("invalid environment '%v'", pair) + } + if err := os.Setenv(p[0], p[1]); err != nil { + return err + } + } + return nil +} + +// finalizeNamespace drops the caps, sets the correct user +// and working dir, and closes any leaked file descriptors +// before executing the command inside the namespace +func finalizeNamespace(config *initConfig) error { + // Ensure that all unwanted fds we may have accidentally + // inherited are marked close-on-exec so they stay out of the + // container + if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil { + return err + } + + capabilities := &configs.Capabilities{} + if config.Capabilities != nil { + capabilities = config.Capabilities + } else if config.Config.Capabilities != nil { + capabilities = config.Config.Capabilities + } + w, err := newContainerCapList(capabilities) + if err != nil { + return err + } + // drop capabilities in bounding set before changing user + if err := w.ApplyBoundingSet(); err != nil { + return err + } + // preserve existing capabilities while we change users + if err := system.SetKeepCaps(); err != nil { + return err + } + if err := setupUser(config); err != nil { + return err + } + if err := system.ClearKeepCaps(); err != nil { + return err + } + if err := w.ApplyCaps(); err != nil { + return err + } + if config.Cwd != "" { + if err := unix.Chdir(config.Cwd); err != nil { + return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err) + } + } + return nil +} + +// setupConsole sets up the console from inside the container, and sends the +// master pty fd to the config.Pipe (using cmsg). This is done to ensure that +// consoles are scoped to a container properly (see runc#814 and the many +// issues related to that). This has to be run *after* we've pivoted to the new +// rootfs (and the users' configuration is entirely set up). +func setupConsole(socket *os.File, config *initConfig, mount bool) error { + defer socket.Close() + // At this point, /dev/ptmx points to something that we would expect. We + // used to change the owner of the slave path, but since the /dev/pts mount + // can have gid=X set (at the users' option). So touching the owner of the + // slave PTY is not necessary, as the kernel will handle that for us. Note + // however, that setupUser (specifically fixStdioPermissions) *will* change + // the UID owner of the console to be the user the process will run as (so + // they can actually control their console). + console, err := newConsole() + if err != nil { + return err + } + // After we return from here, we don't need the console anymore. + defer console.Close() + + linuxConsole, ok := console.(*linuxConsole) + if !ok { + return fmt.Errorf("failed to cast console to *linuxConsole") + } + // Mount the console inside our rootfs. + if mount { + if err := linuxConsole.mount(); err != nil { + return err + } + } + // While we can access console.master, using the API is a good idea. + if err := utils.SendFd(socket, linuxConsole.File()); err != nil { + return err + } + // Now, dup over all the things. + return linuxConsole.dupStdio() +} + +// syncParentReady sends to the given pipe a JSON payload which indicates that +// the init is ready to Exec the child process. It then waits for the parent to +// indicate that it is cleared to Exec. +func syncParentReady(pipe io.ReadWriter) error { + // Tell parent. + if err := writeSync(pipe, procReady); err != nil { + return err + } + + // Wait for parent to give the all-clear. + if err := readSync(pipe, procRun); err != nil { + return err + } + + return nil +} + +// syncParentHooks sends to the given pipe a JSON payload which indicates that +// the parent should execute pre-start hooks. It then waits for the parent to +// indicate that it is cleared to resume. +func syncParentHooks(pipe io.ReadWriter) error { + // Tell parent. + if err := writeSync(pipe, procHooks); err != nil { + return err + } + + // Wait for parent to give the all-clear. + if err := readSync(pipe, procResume); err != nil { + return err + } + + return nil +} + +// setupUser changes the groups, gid, and uid for the user inside the container +func setupUser(config *initConfig) error { + // Set up defaults. + defaultExecUser := user.ExecUser{ + Uid: 0, + Gid: 0, + Home: "/", + } + + passwdPath, err := user.GetPasswdPath() + if err != nil { + return err + } + + groupPath, err := user.GetGroupPath() + if err != nil { + return err + } + + execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath) + if err != nil { + return err + } + + var addGroups []int + if len(config.AdditionalGroups) > 0 { + addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath) + if err != nil { + return err + } + } + + if config.Rootless { + if execUser.Uid != 0 { + return fmt.Errorf("cannot run as a non-root user in a rootless container") + } + + if execUser.Gid != 0 { + return fmt.Errorf("cannot run as a non-root group in a rootless container") + } + + // We cannot set any additional groups in a rootless container and thus we + // bail if the user asked us to do so. TODO: We currently can't do this + // earlier, but if libcontainer.Process.User was typesafe this might work. + if len(addGroups) > 0 { + return fmt.Errorf("cannot set any additional groups in a rootless container") + } + } + + // before we change to the container's user make sure that the processes STDIO + // is correctly owned by the user that we are switching to. + if err := fixStdioPermissions(config, execUser); err != nil { + return err + } + + // This isn't allowed in an unprivileged user namespace since Linux 3.19. + // There's nothing we can do about /etc/group entries, so we silently + // ignore setting groups here (since the user didn't explicitly ask us to + // set the group). + if !config.Rootless { + suppGroups := append(execUser.Sgids, addGroups...) + if err := unix.Setgroups(suppGroups); err != nil { + return err + } + } + + if err := system.Setgid(execUser.Gid); err != nil { + return err + } + + if err := system.Setuid(execUser.Uid); err != nil { + return err + } + + // if we didn't get HOME already, set it based on the user's HOME + if envHome := os.Getenv("HOME"); envHome == "" { + if err := os.Setenv("HOME", execUser.Home); err != nil { + return err + } + } + return nil +} + +// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user. +// The ownership needs to match because it is created outside of the container and needs to be +// localized. +func fixStdioPermissions(config *initConfig, u *user.ExecUser) error { + var null unix.Stat_t + if err := unix.Stat("/dev/null", &null); err != nil { + return err + } + for _, fd := range []uintptr{ + os.Stdin.Fd(), + os.Stderr.Fd(), + os.Stdout.Fd(), + } { + var s unix.Stat_t + if err := unix.Fstat(int(fd), &s); err != nil { + return err + } + + // Skip chown of /dev/null if it was used as one of the STDIO fds. + if s.Rdev == null.Rdev { + continue + } + + // Skip chown if s.Gid is actually an unmapped gid in the host. While + // this is a bit dodgy if it just so happens that the console _is_ + // owned by overflow_gid, there's no way for us to disambiguate this as + // a userspace program. + if _, err := config.Config.HostGID(int(s.Gid)); err != nil { + continue + } + + // We only change the uid owner (as it is possible for the mount to + // prefer a different gid, and there's no reason for us to change it). + // The reason why we don't just leave the default uid=X mount setup is + // that users expect to be able to actually use their console. Without + // this code, you couldn't effectively run as a non-root user inside a + // container and also have a console set up. + if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil { + return err + } + } + return nil +} + +// setupNetwork sets up and initializes any network interface inside the container. +func setupNetwork(config *initConfig) error { + for _, config := range config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + if err := strategy.initialize(config); err != nil { + return err + } + } + return nil +} + +func setupRoute(config *configs.Config) error { + for _, config := range config.Routes { + _, dst, err := net.ParseCIDR(config.Destination) + if err != nil { + return err + } + src := net.ParseIP(config.Source) + if src == nil { + return fmt.Errorf("Invalid source for route: %s", config.Source) + } + gw := net.ParseIP(config.Gateway) + if gw == nil { + return fmt.Errorf("Invalid gateway for route: %s", config.Gateway) + } + l, err := netlink.LinkByName(config.InterfaceName) + if err != nil { + return err + } + route := &netlink.Route{ + Scope: netlink.SCOPE_UNIVERSE, + Dst: dst, + Src: src, + Gw: gw, + LinkIndex: l.Attrs().Index, + } + if err := netlink.RouteAdd(route); err != nil { + return err + } + } + return nil +} + +func setupRlimits(limits []configs.Rlimit, pid int) error { + for _, rlimit := range limits { + if err := system.Prlimit(pid, rlimit.Type, unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil { + return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err) + } + } + return nil +} + +const _P_PID = 1 + +type siginfo struct { + si_signo int32 + si_errno int32 + si_code int32 + // below here is a union; si_pid is the only field we use + si_pid int32 + // Pad to 128 bytes as detailed in blockUntilWaitable + pad [96]byte +} + +// isWaitable returns true if the process has exited false otherwise. +// Its based off blockUntilWaitable in src/os/wait_waitid.go +func isWaitable(pid int) (bool, error) { + si := &siginfo{} + _, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0) + if e != 0 { + return false, os.NewSyscallError("waitid", e) + } + + return si.si_pid != 0, nil +} + +// isNoChildren returns true if err represents a unix.ECHILD (formerly syscall.ECHILD) false otherwise +func isNoChildren(err error) bool { + switch err := err.(type) { + case syscall.Errno: + if err == unix.ECHILD { + return true + } + case *os.SyscallError: + if err.Err == unix.ECHILD { + return true + } + } + return false +} + +// signalAllProcesses freezes then iterates over all the processes inside the +// manager's cgroups sending the signal s to them. +// If s is SIGKILL then it will wait for each process to exit. +// For all other signals it will check if the process is ready to report its +// exit status and only if it is will a wait be performed. +func signalAllProcesses(m cgroups.Manager, s os.Signal) error { + var procs []*os.Process + if err := m.Freeze(configs.Frozen); err != nil { + logrus.Warn(err) + } + pids, err := m.GetAllPids() + if err != nil { + m.Freeze(configs.Thawed) + return err + } + for _, pid := range pids { + p, err := os.FindProcess(pid) + if err != nil { + logrus.Warn(err) + continue + } + procs = append(procs, p) + if err := p.Signal(s); err != nil { + logrus.Warn(err) + } + } + if err := m.Freeze(configs.Thawed); err != nil { + logrus.Warn(err) + } + + for _, p := range procs { + if s != unix.SIGKILL { + if ok, err := isWaitable(p.Pid); err != nil { + if !isNoChildren(err) { + logrus.Warn("signalAllProcesses: ", p.Pid, err) + } + continue + } else if !ok { + // Not ready to report so don't wait + continue + } + } + + if _, err := p.Wait(); err != nil { + if !isNoChildren(err) { + logrus.Warn("wait: ", err) + } + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go new file mode 100644 index 00000000..82ffa7a8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go @@ -0,0 +1,50 @@ +// +build linux + +package keys + +import ( + "fmt" + "strconv" + "strings" + + "golang.org/x/sys/unix" +) + +type KeySerial uint32 + +func JoinSessionKeyring(name string) (KeySerial, error) { + sessKeyId, err := unix.KeyctlJoinSessionKeyring(name) + if err != nil { + return 0, fmt.Errorf("could not create session key: %v", err) + } + return KeySerial(sessKeyId), nil +} + +// ModKeyringPerm modifies permissions on a keyring by reading the current permissions, +// anding the bits with the given mask (clearing permissions) and setting +// additional permission bits +func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error { + dest, err := unix.KeyctlString(unix.KEYCTL_DESCRIBE, int(ringId)) + if err != nil { + return err + } + + res := strings.Split(string(dest), ";") + if len(res) < 5 { + return fmt.Errorf("Destination buffer for key description is too small") + } + + // parse permissions + perm64, err := strconv.ParseUint(res[3], 16, 32) + if err != nil { + return err + } + + perm := (uint32(perm64) & mask) | setbits + + if err := unix.KeyctlSetperm(int(ringId), perm); err != nil { + return err + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go new file mode 100644 index 00000000..8829b71a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go @@ -0,0 +1,87 @@ +// +build linux + +package libcontainer + +import ( + "github.com/vishvananda/netlink/nl" + "golang.org/x/sys/unix" +) + +// list of known message types we want to send to bootstrap program +// The number is randomly chosen to not conflict with known netlink types +const ( + InitMsg uint16 = 62000 + CloneFlagsAttr uint16 = 27281 + NsPathsAttr uint16 = 27282 + UidmapAttr uint16 = 27283 + GidmapAttr uint16 = 27284 + SetgroupAttr uint16 = 27285 + OomScoreAdjAttr uint16 = 27286 + RootlessAttr uint16 = 27287 +) + +type Int32msg struct { + Type uint16 + Value uint32 +} + +// Serialize serializes the message. +// Int32msg has the following representation +// | nlattr len | nlattr type | +// | uint32 value | +func (msg *Int32msg) Serialize() []byte { + buf := make([]byte, msg.Len()) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(msg.Len())) + native.PutUint16(buf[2:4], msg.Type) + native.PutUint32(buf[4:8], msg.Value) + return buf +} + +func (msg *Int32msg) Len() int { + return unix.NLA_HDRLEN + 4 +} + +// Bytemsg has the following representation +// | nlattr len | nlattr type | +// | value | pad | +type Bytemsg struct { + Type uint16 + Value []byte +} + +func (msg *Bytemsg) Serialize() []byte { + l := msg.Len() + buf := make([]byte, (l+unix.NLA_ALIGNTO-1) & ^(unix.NLA_ALIGNTO-1)) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(l)) + native.PutUint16(buf[2:4], msg.Type) + copy(buf[4:], msg.Value) + return buf +} + +func (msg *Bytemsg) Len() int { + return unix.NLA_HDRLEN + len(msg.Value) + 1 // null-terminated +} + +type Boolmsg struct { + Type uint16 + Value bool +} + +func (msg *Boolmsg) Serialize() []byte { + buf := make([]byte, msg.Len()) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(msg.Len())) + native.PutUint16(buf[2:4], msg.Type) + if msg.Value { + buf[4] = 1 + } else { + buf[4] = 0 + } + return buf +} + +func (msg *Boolmsg) Len() int { + return unix.NLA_HDRLEN + 1 +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go new file mode 100644 index 00000000..5075bee4 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go @@ -0,0 +1,259 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "io/ioutil" + "net" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/vishvananda/netlink" +) + +var strategies = map[string]networkStrategy{ + "veth": &veth{}, + "loopback": &loopback{}, +} + +// networkStrategy represents a specific network configuration for +// a container's networking stack +type networkStrategy interface { + create(*network, int) error + initialize(*network) error + detach(*configs.Network) error + attach(*configs.Network) error +} + +// getStrategy returns the specific network strategy for the +// provided type. +func getStrategy(tpe string) (networkStrategy, error) { + s, exists := strategies[tpe] + if !exists { + return nil, fmt.Errorf("unknown strategy type %q", tpe) + } + return s, nil +} + +// Returns the network statistics for the network interfaces represented by the NetworkRuntimeInfo. +func getNetworkInterfaceStats(interfaceName string) (*NetworkInterface, error) { + out := &NetworkInterface{Name: interfaceName} + // This can happen if the network runtime information is missing - possible if the + // container was created by an old version of libcontainer. + if interfaceName == "" { + return out, nil + } + type netStatsPair struct { + // Where to write the output. + Out *uint64 + // The network stats file to read. + File string + } + // Ingress for host veth is from the container. Hence tx_bytes stat on the host veth is actually number of bytes received by the container. + netStats := []netStatsPair{ + {Out: &out.RxBytes, File: "tx_bytes"}, + {Out: &out.RxPackets, File: "tx_packets"}, + {Out: &out.RxErrors, File: "tx_errors"}, + {Out: &out.RxDropped, File: "tx_dropped"}, + + {Out: &out.TxBytes, File: "rx_bytes"}, + {Out: &out.TxPackets, File: "rx_packets"}, + {Out: &out.TxErrors, File: "rx_errors"}, + {Out: &out.TxDropped, File: "rx_dropped"}, + } + for _, netStat := range netStats { + data, err := readSysfsNetworkStats(interfaceName, netStat.File) + if err != nil { + return nil, err + } + *(netStat.Out) = data + } + return out, nil +} + +// Reads the specified statistics available under /sys/class/net//statistics +func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) { + data, err := ioutil.ReadFile(filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile)) + if err != nil { + return 0, err + } + return strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64) +} + +// loopback is a network strategy that provides a basic loopback device +type loopback struct { +} + +func (l *loopback) create(n *network, nspid int) error { + return nil +} + +func (l *loopback) initialize(config *network) error { + return netlink.LinkSetUp(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "lo"}}) +} + +func (l *loopback) attach(n *configs.Network) (err error) { + return nil +} + +func (l *loopback) detach(n *configs.Network) (err error) { + return nil +} + +// veth is a network strategy that uses a bridge and creates +// a veth pair, one that is attached to the bridge on the host and the other +// is placed inside the container's namespace +type veth struct { +} + +func (v *veth) detach(n *configs.Network) (err error) { + return netlink.LinkSetMaster(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil) +} + +// attach a container network interface to an external network +func (v *veth) attach(n *configs.Network) (err error) { + brl, err := netlink.LinkByName(n.Bridge) + if err != nil { + return err + } + br, ok := brl.(*netlink.Bridge) + if !ok { + return fmt.Errorf("Wrong device type %T", brl) + } + host, err := netlink.LinkByName(n.HostInterfaceName) + if err != nil { + return err + } + + if err := netlink.LinkSetMaster(host, br); err != nil { + return err + } + if err := netlink.LinkSetMTU(host, n.Mtu); err != nil { + return err + } + if n.HairpinMode { + if err := netlink.LinkSetHairpin(host, true); err != nil { + return err + } + } + if err := netlink.LinkSetUp(host); err != nil { + return err + } + + return nil +} + +func (v *veth) create(n *network, nspid int) (err error) { + tmpName, err := v.generateTempPeerName() + if err != nil { + return err + } + n.TempVethPeerName = tmpName + if n.Bridge == "" { + return fmt.Errorf("bridge is not specified") + } + veth := &netlink.Veth{ + LinkAttrs: netlink.LinkAttrs{ + Name: n.HostInterfaceName, + TxQLen: n.TxQueueLen, + }, + PeerName: n.TempVethPeerName, + } + if err := netlink.LinkAdd(veth); err != nil { + return err + } + defer func() { + if err != nil { + netlink.LinkDel(veth) + } + }() + if err := v.attach(&n.Network); err != nil { + return err + } + child, err := netlink.LinkByName(n.TempVethPeerName) + if err != nil { + return err + } + return netlink.LinkSetNsPid(child, nspid) +} + +func (v *veth) generateTempPeerName() (string, error) { + return utils.GenerateRandomName("veth", 7) +} + +func (v *veth) initialize(config *network) error { + peer := config.TempVethPeerName + if peer == "" { + return fmt.Errorf("peer is not specified") + } + child, err := netlink.LinkByName(peer) + if err != nil { + return err + } + if err := netlink.LinkSetDown(child); err != nil { + return err + } + if err := netlink.LinkSetName(child, config.Name); err != nil { + return err + } + // get the interface again after we changed the name as the index also changes. + if child, err = netlink.LinkByName(config.Name); err != nil { + return err + } + if config.MacAddress != "" { + mac, err := net.ParseMAC(config.MacAddress) + if err != nil { + return err + } + if err := netlink.LinkSetHardwareAddr(child, mac); err != nil { + return err + } + } + ip, err := netlink.ParseAddr(config.Address) + if err != nil { + return err + } + if err := netlink.AddrAdd(child, ip); err != nil { + return err + } + if config.IPv6Address != "" { + ip6, err := netlink.ParseAddr(config.IPv6Address) + if err != nil { + return err + } + if err := netlink.AddrAdd(child, ip6); err != nil { + return err + } + } + if err := netlink.LinkSetMTU(child, config.Mtu); err != nil { + return err + } + if err := netlink.LinkSetUp(child); err != nil { + return err + } + if config.Gateway != "" { + gw := net.ParseIP(config.Gateway) + if err := netlink.RouteAdd(&netlink.Route{ + Scope: netlink.SCOPE_UNIVERSE, + LinkIndex: child.Attrs().Index, + Gw: gw, + }); err != nil { + return err + } + } + if config.IPv6Gateway != "" { + gw := net.ParseIP(config.IPv6Gateway) + if err := netlink.RouteAdd(&netlink.Route{ + Scope: netlink.SCOPE_UNIVERSE, + LinkIndex: child.Attrs().Index, + Gw: gw, + }); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/notify_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/notify_linux.go new file mode 100644 index 00000000..81587b9b --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/notify_linux.go @@ -0,0 +1,90 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + + "golang.org/x/sys/unix" +) + +const oomCgroupName = "memory" + +type PressureLevel uint + +const ( + LowPressure PressureLevel = iota + MediumPressure + CriticalPressure +) + +func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) { + evFile, err := os.Open(filepath.Join(cgDir, evName)) + if err != nil { + return nil, err + } + fd, err := unix.Eventfd(0, unix.EFD_CLOEXEC) + if err != nil { + evFile.Close() + return nil, err + } + + eventfd := os.NewFile(uintptr(fd), "eventfd") + + eventControlPath := filepath.Join(cgDir, "cgroup.event_control") + data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg) + if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil { + eventfd.Close() + evFile.Close() + return nil, err + } + ch := make(chan struct{}) + go func() { + defer func() { + close(ch) + eventfd.Close() + evFile.Close() + }() + buf := make([]byte, 8) + for { + if _, err := eventfd.Read(buf); err != nil { + return + } + // When a cgroup is destroyed, an event is sent to eventfd. + // So if the control path is gone, return instead of notifying. + if _, err := os.Lstat(eventControlPath); os.IsNotExist(err) { + return + } + ch <- struct{}{} + } + }() + return ch, nil +} + +// notifyOnOOM returns channel on which you can expect event about OOM, +// if process died without OOM this channel will be closed. +func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) { + dir := paths[oomCgroupName] + if dir == "" { + return nil, fmt.Errorf("path %q missing", oomCgroupName) + } + + return registerMemoryEvent(dir, "memory.oom_control", "") +} + +func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) { + dir := paths[oomCgroupName] + if dir == "" { + return nil, fmt.Errorf("path %q missing", oomCgroupName) + } + + if level > CriticalPressure { + return nil, fmt.Errorf("invalid pressure level %d", level) + } + + levelStr := []string{"low", "medium", "critical"}[level] + return registerMemoryEvent(dir, "memory.pressure_level", levelStr) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process.go b/vendor/github.com/opencontainers/runc/libcontainer/process.go new file mode 100644 index 00000000..f1ad0814 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/process.go @@ -0,0 +1,106 @@ +package libcontainer + +import ( + "fmt" + "io" + "math" + "os" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +type processOperations interface { + wait() (*os.ProcessState, error) + signal(sig os.Signal) error + pid() int +} + +// Process specifies the configuration and IO for a process inside +// a container. +type Process struct { + // The command to be run followed by any arguments. + Args []string + + // Env specifies the environment variables for the process. + Env []string + + // User will set the uid and gid of the executing process running inside the container + // local to the container's user and group configuration. + User string + + // AdditionalGroups specifies the gids that should be added to supplementary groups + // in addition to those that the user belongs to. + AdditionalGroups []string + + // Cwd will change the processes current working directory inside the container's rootfs. + Cwd string + + // Stdin is a pointer to a reader which provides the standard input stream. + Stdin io.Reader + + // Stdout is a pointer to a writer which receives the standard output stream. + Stdout io.Writer + + // Stderr is a pointer to a writer which receives the standard error stream. + Stderr io.Writer + + // ExtraFiles specifies additional open files to be inherited by the container + ExtraFiles []*os.File + + // Capabilities specify the capabilities to keep when executing the process inside the container + // All capabilities not specified will be dropped from the processes capability mask + Capabilities *configs.Capabilities + + // AppArmorProfile specifies the profile to apply to the process and is + // changed at the time the process is execed + AppArmorProfile string + + // Label specifies the label to apply to the process. It is commonly used by selinux + Label string + + // NoNewPrivileges controls whether processes can gain additional privileges. + NoNewPrivileges *bool + + // Rlimits specifies the resource limits, such as max open files, to set in the container + // If Rlimits are not set, the container will inherit rlimits from the parent process + Rlimits []configs.Rlimit + + // ConsoleSocket provides the masterfd console. + ConsoleSocket *os.File + + ops processOperations +} + +// Wait waits for the process to exit. +// Wait releases any resources associated with the Process +func (p Process) Wait() (*os.ProcessState, error) { + if p.ops == nil { + return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + } + return p.ops.wait() +} + +// Pid returns the process ID +func (p Process) Pid() (int, error) { + // math.MinInt32 is returned here, because it's invalid value + // for the kill() system call. + if p.ops == nil { + return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + } + return p.ops.pid(), nil +} + +// Signal sends a signal to the Process. +func (p Process) Signal(sig os.Signal) error { + if p.ops == nil { + return newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + } + return p.ops.signal(sig) +} + +// IO holds the process's STDIO +type IO struct { + Stdin io.WriteCloser + Stdout io.ReadCloser + Stderr io.ReadCloser +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go new file mode 100644 index 00000000..171685cc --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go @@ -0,0 +1,493 @@ +// +build linux + +package libcontainer + +import ( + "encoding/json" + "errors" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "strconv" + "syscall" // only for Signal + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" + + "golang.org/x/sys/unix" +) + +type parentProcess interface { + // pid returns the pid for the running process. + pid() int + + // start starts the process execution. + start() error + + // send a SIGKILL to the process and wait for the exit. + terminate() error + + // wait waits on the process returning the process state. + wait() (*os.ProcessState, error) + + // startTime returns the process start time. + startTime() (uint64, error) + + signal(os.Signal) error + + externalDescriptors() []string + + setExternalDescriptors(fds []string) +} + +type setnsProcess struct { + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + cgroupPaths map[string]string + config *initConfig + fds []string + process *Process + bootstrapData io.Reader +} + +func (p *setnsProcess) startTime() (uint64, error) { + stat, err := system.Stat(p.pid()) + return stat.StartTime, err +} + +func (p *setnsProcess) signal(sig os.Signal) error { + s, ok := sig.(syscall.Signal) + if !ok { + return errors.New("os: unsupported signal type") + } + return unix.Kill(p.pid(), s) +} + +func (p *setnsProcess) start() (err error) { + defer p.parentPipe.Close() + err = p.cmd.Start() + p.childPipe.Close() + if err != nil { + return newSystemErrorWithCause(err, "starting setns process") + } + if p.bootstrapData != nil { + if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { + return newSystemErrorWithCause(err, "copying bootstrap data to pipe") + } + } + if err = p.execSetns(); err != nil { + return newSystemErrorWithCause(err, "executing setns process") + } + // We can't join cgroups if we're in a rootless container. + if !p.config.Rootless && len(p.cgroupPaths) > 0 { + if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { + return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) + } + } + // set rlimits, this has to be done here because we lose permissions + // to raise the limits once we enter a user-namespace + if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { + return newSystemErrorWithCause(err, "setting rlimits for process") + } + if err := utils.WriteJSON(p.parentPipe, p.config); err != nil { + return newSystemErrorWithCause(err, "writing config to pipe") + } + + ierr := parseSync(p.parentPipe, func(sync *syncT) error { + switch sync.Type { + case procReady: + // This shouldn't happen. + panic("unexpected procReady in setns") + case procHooks: + // This shouldn't happen. + panic("unexpected procHooks in setns") + default: + return newSystemError(fmt.Errorf("invalid JSON payload from child")) + } + }) + + if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil { + return newSystemErrorWithCause(err, "calling shutdown on init pipe") + } + // Must be done after Shutdown so the child will exit and we can wait for it. + if ierr != nil { + p.wait() + return ierr + } + return nil +} + +// execSetns runs the process that executes C code to perform the setns calls +// because setns support requires the C process to fork off a child and perform the setns +// before the go runtime boots, we wait on the process to die and receive the child's pid +// over the provided pipe. +func (p *setnsProcess) execSetns() error { + status, err := p.cmd.Process.Wait() + if err != nil { + p.cmd.Wait() + return newSystemErrorWithCause(err, "waiting on setns process to finish") + } + if !status.Success() { + p.cmd.Wait() + return newSystemError(&exec.ExitError{ProcessState: status}) + } + var pid *pid + if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { + p.cmd.Wait() + return newSystemErrorWithCause(err, "reading pid from init pipe") + } + process, err := os.FindProcess(pid.Pid) + if err != nil { + return err + } + p.cmd.Process = process + p.process.ops = p + return nil +} + +// terminate sends a SIGKILL to the forked process for the setns routine then waits to +// avoid the process becoming a zombie. +func (p *setnsProcess) terminate() error { + if p.cmd.Process == nil { + return nil + } + err := p.cmd.Process.Kill() + if _, werr := p.wait(); err == nil { + err = werr + } + return err +} + +func (p *setnsProcess) wait() (*os.ProcessState, error) { + err := p.cmd.Wait() + + // Return actual ProcessState even on Wait error + return p.cmd.ProcessState, err +} + +func (p *setnsProcess) pid() int { + return p.cmd.Process.Pid +} + +func (p *setnsProcess) externalDescriptors() []string { + return p.fds +} + +func (p *setnsProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} + +type initProcess struct { + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + config *initConfig + manager cgroups.Manager + container *linuxContainer + fds []string + process *Process + bootstrapData io.Reader + sharePidns bool + rootDir *os.File +} + +func (p *initProcess) pid() int { + return p.cmd.Process.Pid +} + +func (p *initProcess) externalDescriptors() []string { + return p.fds +} + +// execSetns runs the process that executes C code to perform the setns calls +// because setns support requires the C process to fork off a child and perform the setns +// before the go runtime boots, we wait on the process to die and receive the child's pid +// over the provided pipe. +// This is called by initProcess.start function +func (p *initProcess) execSetns() error { + status, err := p.cmd.Process.Wait() + if err != nil { + p.cmd.Wait() + return err + } + if !status.Success() { + p.cmd.Wait() + return &exec.ExitError{ProcessState: status} + } + var pid *pid + if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { + p.cmd.Wait() + return err + } + process, err := os.FindProcess(pid.Pid) + if err != nil { + return err + } + p.cmd.Process = process + p.process.ops = p + return nil +} + +func (p *initProcess) start() error { + defer p.parentPipe.Close() + err := p.cmd.Start() + p.process.ops = p + p.childPipe.Close() + p.rootDir.Close() + if err != nil { + p.process.ops = nil + return newSystemErrorWithCause(err, "starting init process command") + } + if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { + return newSystemErrorWithCause(err, "copying bootstrap data to pipe") + } + if err := p.execSetns(); err != nil { + return newSystemErrorWithCause(err, "running exec setns process for init") + } + // Save the standard descriptor names before the container process + // can potentially move them (e.g., via dup2()). If we don't do this now, + // we won't know at checkpoint time which file descriptor to look up. + fds, err := getPipeFds(p.pid()) + if err != nil { + return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid()) + } + p.setExternalDescriptors(fds) + // Do this before syncing with child so that no children can escape the + // cgroup. We don't need to worry about not doing this and not being root + // because we'd be using the rootless cgroup manager in that case. + if err := p.manager.Apply(p.pid()); err != nil { + return newSystemErrorWithCause(err, "applying cgroup configuration for process") + } + defer func() { + if err != nil { + // TODO: should not be the responsibility to call here + p.manager.Destroy() + } + }() + if err := p.createNetworkInterfaces(); err != nil { + return newSystemErrorWithCause(err, "creating network interfaces") + } + if err := p.sendConfig(); err != nil { + return newSystemErrorWithCause(err, "sending config to init process") + } + var ( + sentRun bool + sentResume bool + ) + + ierr := parseSync(p.parentPipe, func(sync *syncT) error { + switch sync.Type { + case procReady: + // set rlimits, this has to be done here because we lose permissions + // to raise the limits once we enter a user-namespace + if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { + return newSystemErrorWithCause(err, "setting rlimits for ready process") + } + // call prestart hooks + if !p.config.Config.Namespaces.Contains(configs.NEWNS) { + // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions. + if err := p.manager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting cgroup config for ready process") + } + + if p.config.Config.Hooks != nil { + s := configs.HookState{ + Version: p.container.config.Version, + ID: p.container.id, + Pid: p.pid(), + Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"), + } + for i, hook := range p.config.Config.Hooks.Prestart { + if err := hook.Run(s); err != nil { + return newSystemErrorWithCausef(err, "running prestart hook %d", i) + } + } + } + } + // Sync with child. + if err := writeSync(p.parentPipe, procRun); err != nil { + return newSystemErrorWithCause(err, "writing syncT 'run'") + } + sentRun = true + case procHooks: + // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions. + if err := p.manager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting cgroup config for procHooks process") + } + if p.config.Config.Hooks != nil { + s := configs.HookState{ + Version: p.container.config.Version, + ID: p.container.id, + Pid: p.pid(), + Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"), + } + for i, hook := range p.config.Config.Hooks.Prestart { + if err := hook.Run(s); err != nil { + return newSystemErrorWithCausef(err, "running prestart hook %d", i) + } + } + } + // Sync with child. + if err := writeSync(p.parentPipe, procResume); err != nil { + return newSystemErrorWithCause(err, "writing syncT 'resume'") + } + sentResume = true + default: + return newSystemError(fmt.Errorf("invalid JSON payload from child")) + } + + return nil + }) + + if !sentRun { + return newSystemErrorWithCause(ierr, "container init") + } + if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume { + return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process")) + } + if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil { + return newSystemErrorWithCause(err, "shutting down init pipe") + } + + // Must be done after Shutdown so the child will exit and we can wait for it. + if ierr != nil { + p.wait() + return ierr + } + return nil +} + +func (p *initProcess) wait() (*os.ProcessState, error) { + err := p.cmd.Wait() + if err != nil { + return p.cmd.ProcessState, err + } + // we should kill all processes in cgroup when init is died if we use host PID namespace + if p.sharePidns { + signalAllProcesses(p.manager, unix.SIGKILL) + } + return p.cmd.ProcessState, nil +} + +func (p *initProcess) terminate() error { + if p.cmd.Process == nil { + return nil + } + err := p.cmd.Process.Kill() + if _, werr := p.wait(); err == nil { + err = werr + } + return err +} + +func (p *initProcess) startTime() (uint64, error) { + stat, err := system.Stat(p.pid()) + return stat.StartTime, err +} + +func (p *initProcess) sendConfig() error { + // send the config to the container's init process, we don't use JSON Encode + // here because there might be a problem in JSON decoder in some cases, see: + // https://github.com/docker/docker/issues/14203#issuecomment-174177790 + return utils.WriteJSON(p.parentPipe, p.config) +} + +func (p *initProcess) createNetworkInterfaces() error { + for _, config := range p.config.Config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + n := &network{ + Network: *config, + } + if err := strategy.create(n, p.pid()); err != nil { + return err + } + p.config.Networks = append(p.config.Networks, n) + } + return nil +} + +func (p *initProcess) signal(sig os.Signal) error { + s, ok := sig.(syscall.Signal) + if !ok { + return errors.New("os: unsupported signal type") + } + return unix.Kill(p.pid(), s) +} + +func (p *initProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} + +func getPipeFds(pid int) ([]string, error) { + fds := make([]string, 3) + + dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd") + for i := 0; i < 3; i++ { + // XXX: This breaks if the path is not a valid symlink (which can + // happen in certain particularly unlucky mount namespace setups). + f := filepath.Join(dirPath, strconv.Itoa(i)) + target, err := os.Readlink(f) + if err != nil { + // Ignore permission errors, for rootless containers and other + // non-dumpable processes. if we can't get the fd for a particular + // file, there's not much we can do. + if os.IsPermission(err) { + continue + } + return fds, err + } + fds[i] = target + } + return fds, nil +} + +// InitializeIO creates pipes for use with the process's stdio and returns the +// opposite side for each. Do not use this if you want to have a pseudoterminal +// set up for you by libcontainer (TODO: fix that too). +// TODO: This is mostly unnecessary, and should be handled by clients. +func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { + var fds []uintptr + i = &IO{} + // cleanup in case of an error + defer func() { + if err != nil { + for _, fd := range fds { + unix.Close(int(fd)) + } + } + }() + // STDIN + r, w, err := os.Pipe() + if err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stdin, i.Stdin = r, w + // STDOUT + if r, w, err = os.Pipe(); err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stdout, i.Stdout = w, r + // STDERR + if r, w, err = os.Pipe(); err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stderr, i.Stderr = w, r + // change ownership of the pipes incase we are in a user namespace + for _, fd := range fds { + if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil { + return nil, err + } + } + return i, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/restored_process.go b/vendor/github.com/opencontainers/runc/libcontainer/restored_process.go new file mode 100644 index 00000000..408916ad --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/restored_process.go @@ -0,0 +1,122 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + + "github.com/opencontainers/runc/libcontainer/system" +) + +func newRestoredProcess(pid int, fds []string) (*restoredProcess, error) { + var ( + err error + ) + proc, err := os.FindProcess(pid) + if err != nil { + return nil, err + } + stat, err := system.Stat(pid) + if err != nil { + return nil, err + } + return &restoredProcess{ + proc: proc, + processStartTime: stat.StartTime, + fds: fds, + }, nil +} + +type restoredProcess struct { + proc *os.Process + processStartTime uint64 + fds []string +} + +func (p *restoredProcess) start() error { + return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError) +} + +func (p *restoredProcess) pid() int { + return p.proc.Pid +} + +func (p *restoredProcess) terminate() error { + err := p.proc.Kill() + if _, werr := p.wait(); err == nil { + err = werr + } + return err +} + +func (p *restoredProcess) wait() (*os.ProcessState, error) { + // TODO: how do we wait on the actual process? + // maybe use --exec-cmd in criu + st, err := p.proc.Wait() + if err != nil { + return nil, err + } + return st, nil +} + +func (p *restoredProcess) startTime() (uint64, error) { + return p.processStartTime, nil +} + +func (p *restoredProcess) signal(s os.Signal) error { + return p.proc.Signal(s) +} + +func (p *restoredProcess) externalDescriptors() []string { + return p.fds +} + +func (p *restoredProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} + +// nonChildProcess represents a process where the calling process is not +// the parent process. This process is created when a factory loads a container from +// a persisted state. +type nonChildProcess struct { + processPid int + processStartTime uint64 + fds []string +} + +func (p *nonChildProcess) start() error { + return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError) +} + +func (p *nonChildProcess) pid() int { + return p.processPid +} + +func (p *nonChildProcess) terminate() error { + return newGenericError(fmt.Errorf("restored process cannot be terminated"), SystemError) +} + +func (p *nonChildProcess) wait() (*os.ProcessState, error) { + return nil, newGenericError(fmt.Errorf("restored process cannot be waited on"), SystemError) +} + +func (p *nonChildProcess) startTime() (uint64, error) { + return p.processStartTime, nil +} + +func (p *nonChildProcess) signal(s os.Signal) error { + proc, err := os.FindProcess(p.processPid) + if err != nil { + return err + } + return proc.Signal(s) +} + +func (p *nonChildProcess) externalDescriptors() []string { + return p.fds +} + +func (p *nonChildProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go new file mode 100644 index 00000000..e2e734a8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go @@ -0,0 +1,812 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "io" + "io/ioutil" + "os" + "os/exec" + "path" + "path/filepath" + "strings" + "time" + + "github.com/docker/docker/pkg/mount" + "github.com/docker/docker/pkg/symlink" + "github.com/mrunalp/fileutils" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/selinux/go-selinux/label" + + "golang.org/x/sys/unix" +) + +const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV + +// needsSetupDev returns true if /dev needs to be set up. +func needsSetupDev(config *configs.Config) bool { + for _, m := range config.Mounts { + if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" { + return false + } + } + return true +} + +// prepareRootfs sets up the devices, mount points, and filesystems for use +// inside a new mount namespace. It doesn't set anything as ro. You must call +// finalizeRootfs after this function to finish setting up the rootfs. +func prepareRootfs(pipe io.ReadWriter, config *configs.Config) (err error) { + if err := prepareRoot(config); err != nil { + return newSystemErrorWithCause(err, "preparing rootfs") + } + + setupDev := needsSetupDev(config) + for _, m := range config.Mounts { + for _, precmd := range m.PremountCmds { + if err := mountCmd(precmd); err != nil { + return newSystemErrorWithCause(err, "running premount command") + } + } + + if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil { + return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination) + } + + for _, postcmd := range m.PostmountCmds { + if err := mountCmd(postcmd); err != nil { + return newSystemErrorWithCause(err, "running postmount command") + } + } + } + + if setupDev { + if err := createDevices(config); err != nil { + return newSystemErrorWithCause(err, "creating device nodes") + } + if err := setupPtmx(config); err != nil { + return newSystemErrorWithCause(err, "setting up ptmx") + } + if err := setupDevSymlinks(config.Rootfs); err != nil { + return newSystemErrorWithCause(err, "setting up /dev symlinks") + } + } + + // Signal the parent to run the pre-start hooks. + // The hooks are run after the mounts are setup, but before we switch to the new + // root, so that the old root is still available in the hooks for any mount + // manipulations. + if err := syncParentHooks(pipe); err != nil { + return err + } + + // The reason these operations are done here rather than in finalizeRootfs + // is because the console-handling code gets quite sticky if we have to set + // up the console before doing the pivot_root(2). This is because the + // Console API has to also work with the ExecIn case, which means that the + // API must be able to deal with being inside as well as outside the + // container. It's just cleaner to do this here (at the expense of the + // operation not being perfectly split). + + if err := unix.Chdir(config.Rootfs); err != nil { + return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs) + } + + if config.NoPivotRoot { + err = msMoveRoot(config.Rootfs) + } else { + err = pivotRoot(config.Rootfs) + } + if err != nil { + return newSystemErrorWithCause(err, "jailing process inside rootfs") + } + + if setupDev { + if err := reOpenDevNull(); err != nil { + return newSystemErrorWithCause(err, "reopening /dev/null inside container") + } + } + + return nil +} + +// finalizeRootfs sets anything to ro if necessary. You must call +// prepareRootfs first. +func finalizeRootfs(config *configs.Config) (err error) { + // remount dev as ro if specified + for _, m := range config.Mounts { + if libcontainerUtils.CleanPath(m.Destination) == "/dev" { + if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY { + if err := remountReadonly(m); err != nil { + return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination) + } + } + break + } + } + + // set rootfs ( / ) as readonly + if config.Readonlyfs { + if err := setReadonly(); err != nil { + return newSystemErrorWithCause(err, "setting rootfs as readonly") + } + } + + unix.Umask(0022) + return nil +} + +func mountCmd(cmd configs.Command) error { + command := exec.Command(cmd.Path, cmd.Args[:]...) + command.Env = cmd.Env + command.Dir = cmd.Dir + if out, err := command.CombinedOutput(); err != nil { + return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err) + } + return nil +} + +func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { + var ( + dest = m.Destination + ) + if !strings.HasPrefix(dest, rootfs) { + dest = filepath.Join(rootfs, dest) + } + + switch m.Device { + case "proc", "sysfs": + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + // Selinux kernels do not support labeling of /proc or /sys + return mountPropagate(m, rootfs, "") + case "mqueue": + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + if err := mountPropagate(m, rootfs, mountLabel); err != nil { + // older kernels do not support labeling of /dev/mqueue + if err := mountPropagate(m, rootfs, ""); err != nil { + return err + } + return label.SetFileLabel(dest, mountLabel) + } + return nil + case "tmpfs": + copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP + tmpDir := "" + stat, err := os.Stat(dest) + if err != nil { + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + } + if copyUp { + tmpDir, err = ioutil.TempDir("/tmp", "runctmpdir") + if err != nil { + return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir") + } + defer os.RemoveAll(tmpDir) + m.Destination = tmpDir + } + if err := mountPropagate(m, rootfs, mountLabel); err != nil { + return err + } + if copyUp { + if err := fileutils.CopyDirectory(dest, tmpDir); err != nil { + errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err) + if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil { + return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg) + } + return errMsg + } + if err := unix.Mount(tmpDir, dest, "", unix.MS_MOVE, ""); err != nil { + errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err) + if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil { + return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg) + } + return errMsg + } + } + if stat != nil { + if err = os.Chmod(dest, stat.Mode()); err != nil { + return err + } + } + return nil + case "bind": + stat, err := os.Stat(m.Source) + if err != nil { + // error out if the source of a bind mount does not exist as we will be + // unable to bind anything to it. + return err + } + // ensure that the destination of the bind mount is resolved of symlinks at mount time because + // any previous mounts can invalidate the next mount's destination. + // this can happen when a user specifies mounts within other mounts to cause breakouts or other + // evil stuff to try to escape the container's rootfs. + if dest, err = symlink.FollowSymlinkInScope(dest, rootfs); err != nil { + return err + } + if err := checkMountDestination(rootfs, dest); err != nil { + return err + } + // update the mount with the correct dest after symlinks are resolved. + m.Destination = dest + if err := createIfNotExists(dest, stat.IsDir()); err != nil { + return err + } + if err := mountPropagate(m, rootfs, mountLabel); err != nil { + return err + } + // bind mount won't change mount options, we need remount to make mount options effective. + // first check that we have non-default options required before attempting a remount + if m.Flags&^(unix.MS_REC|unix.MS_REMOUNT|unix.MS_BIND) != 0 { + // only remount if unique mount options are set + if err := remount(m, rootfs); err != nil { + return err + } + } + + if m.Relabel != "" { + if err := label.Validate(m.Relabel); err != nil { + return err + } + shared := label.IsShared(m.Relabel) + if err := label.Relabel(m.Source, mountLabel, shared); err != nil { + return err + } + } + case "cgroup": + binds, err := getCgroupMounts(m) + if err != nil { + return err + } + var merged []string + for _, b := range binds { + ss := filepath.Base(b.Destination) + if strings.Contains(ss, ",") { + merged = append(merged, ss) + } + } + tmpfs := &configs.Mount{ + Source: "tmpfs", + Device: "tmpfs", + Destination: m.Destination, + Flags: defaultMountFlags, + Data: "mode=755", + PropagationFlags: m.PropagationFlags, + } + if err := mountToRootfs(tmpfs, rootfs, mountLabel); err != nil { + return err + } + for _, b := range binds { + if err := mountToRootfs(b, rootfs, mountLabel); err != nil { + return err + } + } + for _, mc := range merged { + for _, ss := range strings.Split(mc, ",") { + // symlink(2) is very dumb, it will just shove the path into + // the link and doesn't do any checks or relative path + // conversion. Also, don't error out if the cgroup already exists. + if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) { + return err + } + } + } + if m.Flags&unix.MS_RDONLY != 0 { + // remount cgroup root as readonly + mcgrouproot := &configs.Mount{ + Source: m.Destination, + Device: "bind", + Destination: m.Destination, + Flags: defaultMountFlags | unix.MS_RDONLY | unix.MS_BIND, + } + if err := remount(mcgrouproot, rootfs); err != nil { + return err + } + } + default: + // ensure that the destination of the mount is resolved of symlinks at mount time because + // any previous mounts can invalidate the next mount's destination. + // this can happen when a user specifies mounts within other mounts to cause breakouts or other + // evil stuff to try to escape the container's rootfs. + var err error + if dest, err = symlink.FollowSymlinkInScope(dest, rootfs); err != nil { + return err + } + if err := checkMountDestination(rootfs, dest); err != nil { + return err + } + // update the mount with the correct dest after symlinks are resolved. + m.Destination = dest + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + return mountPropagate(m, rootfs, mountLabel) + } + return nil +} + +func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { + mounts, err := cgroups.GetCgroupMounts(false) + if err != nil { + return nil, err + } + + cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return nil, err + } + + var binds []*configs.Mount + + for _, mm := range mounts { + dir, err := mm.GetOwnCgroup(cgroupPaths) + if err != nil { + return nil, err + } + relDir, err := filepath.Rel(mm.Root, dir) + if err != nil { + return nil, err + } + binds = append(binds, &configs.Mount{ + Device: "bind", + Source: filepath.Join(mm.Mountpoint, relDir), + Destination: filepath.Join(m.Destination, filepath.Base(mm.Mountpoint)), + Flags: unix.MS_BIND | unix.MS_REC | m.Flags, + PropagationFlags: m.PropagationFlags, + }) + } + + return binds, nil +} + +// checkMountDestination checks to ensure that the mount destination is not over the top of /proc. +// dest is required to be an abs path and have any symlinks resolved before calling this function. +func checkMountDestination(rootfs, dest string) error { + invalidDestinations := []string{ + "/proc", + } + // White list, it should be sub directories of invalid destinations + validDestinations := []string{ + // These entries can be bind mounted by files emulated by fuse, + // so commands like top, free displays stats in container. + "/proc/cpuinfo", + "/proc/diskstats", + "/proc/meminfo", + "/proc/stat", + "/proc/swaps", + "/proc/uptime", + "/proc/net/dev", + } + for _, valid := range validDestinations { + path, err := filepath.Rel(filepath.Join(rootfs, valid), dest) + if err != nil { + return err + } + if path == "." { + return nil + } + } + for _, invalid := range invalidDestinations { + path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest) + if err != nil { + return err + } + if path == "." || !strings.HasPrefix(path, "..") { + return fmt.Errorf("%q cannot be mounted because it is located inside %q", dest, invalid) + } + } + return nil +} + +func setupDevSymlinks(rootfs string) error { + var links = [][2]string{ + {"/proc/self/fd", "/dev/fd"}, + {"/proc/self/fd/0", "/dev/stdin"}, + {"/proc/self/fd/1", "/dev/stdout"}, + {"/proc/self/fd/2", "/dev/stderr"}, + } + // kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink + // in /dev if it exists in /proc. + if _, err := os.Stat("/proc/kcore"); err == nil { + links = append(links, [2]string{"/proc/kcore", "/dev/core"}) + } + for _, link := range links { + var ( + src = link[0] + dst = filepath.Join(rootfs, link[1]) + ) + if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) { + return fmt.Errorf("symlink %s %s %s", src, dst, err) + } + } + return nil +} + +// If stdin, stdout, and/or stderr are pointing to `/dev/null` in the parent's rootfs +// this method will make them point to `/dev/null` in this container's rootfs. This +// needs to be called after we chroot/pivot into the container's rootfs so that any +// symlinks are resolved locally. +func reOpenDevNull() error { + var stat, devNullStat unix.Stat_t + file, err := os.OpenFile("/dev/null", os.O_RDWR, 0) + if err != nil { + return fmt.Errorf("Failed to open /dev/null - %s", err) + } + defer file.Close() + if err := unix.Fstat(int(file.Fd()), &devNullStat); err != nil { + return err + } + for fd := 0; fd < 3; fd++ { + if err := unix.Fstat(fd, &stat); err != nil { + return err + } + if stat.Rdev == devNullStat.Rdev { + // Close and re-open the fd. + if err := unix.Dup3(int(file.Fd()), fd, 0); err != nil { + return err + } + } + } + return nil +} + +// Create the device nodes in the container. +func createDevices(config *configs.Config) error { + useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER) + oldMask := unix.Umask(0000) + for _, node := range config.Devices { + // containers running in a user namespace are not allowed to mknod + // devices so we can just bind mount it from the host. + if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil { + unix.Umask(oldMask) + return err + } + } + unix.Umask(oldMask) + return nil +} + +func bindMountDeviceNode(dest string, node *configs.Device) error { + f, err := os.Create(dest) + if err != nil && !os.IsExist(err) { + return err + } + if f != nil { + f.Close() + } + return unix.Mount(node.Path, dest, "bind", unix.MS_BIND, "") +} + +// Creates the device node in the rootfs of the container. +func createDeviceNode(rootfs string, node *configs.Device, bind bool) error { + dest := filepath.Join(rootfs, node.Path) + if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil { + return err + } + + if bind { + return bindMountDeviceNode(dest, node) + } + if err := mknodDevice(dest, node); err != nil { + if os.IsExist(err) { + return nil + } else if os.IsPermission(err) { + return bindMountDeviceNode(dest, node) + } + return err + } + return nil +} + +func mknodDevice(dest string, node *configs.Device) error { + fileMode := node.FileMode + switch node.Type { + case 'c', 'u': + fileMode |= unix.S_IFCHR + case 'b': + fileMode |= unix.S_IFBLK + case 'p': + fileMode |= unix.S_IFIFO + default: + return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path) + } + if err := unix.Mknod(dest, uint32(fileMode), node.Mkdev()); err != nil { + return err + } + return unix.Chown(dest, int(node.Uid), int(node.Gid)) +} + +func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info { + for _, m := range mountinfo { + if m.Mountpoint == dir { + return m + } + } + return nil +} + +// Get the parent mount point of directory passed in as argument. Also return +// optional fields. +func getParentMount(rootfs string) (string, string, error) { + var path string + + mountinfos, err := mount.GetMounts() + if err != nil { + return "", "", err + } + + mountinfo := getMountInfo(mountinfos, rootfs) + if mountinfo != nil { + return rootfs, mountinfo.Optional, nil + } + + path = rootfs + for { + path = filepath.Dir(path) + + mountinfo = getMountInfo(mountinfos, path) + if mountinfo != nil { + return path, mountinfo.Optional, nil + } + + if path == "/" { + break + } + } + + // If we are here, we did not find parent mount. Something is wrong. + return "", "", fmt.Errorf("Could not find parent mount of %s", rootfs) +} + +// Make parent mount private if it was shared +func rootfsParentMountPrivate(rootfs string) error { + sharedMount := false + + parentMount, optionalOpts, err := getParentMount(rootfs) + if err != nil { + return err + } + + optsSplit := strings.Split(optionalOpts, " ") + for _, opt := range optsSplit { + if strings.HasPrefix(opt, "shared:") { + sharedMount = true + break + } + } + + // Make parent mount PRIVATE if it was shared. It is needed for two + // reasons. First of all pivot_root() will fail if parent mount is + // shared. Secondly when we bind mount rootfs it will propagate to + // parent namespace and we don't want that to happen. + if sharedMount { + return unix.Mount("", parentMount, "", unix.MS_PRIVATE, "") + } + + return nil +} + +func prepareRoot(config *configs.Config) error { + flag := unix.MS_SLAVE | unix.MS_REC + if config.RootPropagation != 0 { + flag = config.RootPropagation + } + if err := unix.Mount("", "/", "", uintptr(flag), ""); err != nil { + return err + } + + // Make parent mount private to make sure following bind mount does + // not propagate in other namespaces. Also it will help with kernel + // check pass in pivot_root. (IS_SHARED(new_mnt->mnt_parent)) + if err := rootfsParentMountPrivate(config.Rootfs); err != nil { + return err + } + + return unix.Mount(config.Rootfs, config.Rootfs, "bind", unix.MS_BIND|unix.MS_REC, "") +} + +func setReadonly() error { + return unix.Mount("/", "/", "bind", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "") +} + +func setupPtmx(config *configs.Config) error { + ptmx := filepath.Join(config.Rootfs, "dev/ptmx") + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Symlink("pts/ptmx", ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + return nil +} + +// pivotRoot will call pivot_root such that rootfs becomes the new root +// filesystem, and everything else is cleaned up. +func pivotRoot(rootfs string) error { + // While the documentation may claim otherwise, pivot_root(".", ".") is + // actually valid. What this results in is / being the new root but + // /proc/self/cwd being the old root. Since we can play around with the cwd + // with pivot_root this allows us to pivot without creating directories in + // the rootfs. Shout-outs to the LXC developers for giving us this idea. + + oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0) + if err != nil { + return err + } + defer unix.Close(oldroot) + + newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0) + if err != nil { + return err + } + defer unix.Close(newroot) + + // Change to the new root so that the pivot_root actually acts on it. + if err := unix.Fchdir(newroot); err != nil { + return err + } + + if err := unix.PivotRoot(".", "."); err != nil { + return fmt.Errorf("pivot_root %s", err) + } + + // Currently our "." is oldroot (according to the current kernel code). + // However, purely for safety, we will fchdir(oldroot) since there isn't + // really any guarantee from the kernel what /proc/self/cwd will be after a + // pivot_root(2). + + if err := unix.Fchdir(oldroot); err != nil { + return err + } + + // Make oldroot rprivate to make sure our unmounts don't propagate to the + // host (and thus bork the machine). + if err := unix.Mount("", ".", "", unix.MS_PRIVATE|unix.MS_REC, ""); err != nil { + return err + } + // Preform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd. + if err := unix.Unmount(".", unix.MNT_DETACH); err != nil { + return err + } + + // Switch back to our shiny new root. + if err := unix.Chdir("/"); err != nil { + return fmt.Errorf("chdir / %s", err) + } + return nil +} + +func msMoveRoot(rootfs string) error { + if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil { + return err + } + if err := unix.Chroot("."); err != nil { + return err + } + return unix.Chdir("/") +} + +// createIfNotExists creates a file or a directory only if it does not already exist. +func createIfNotExists(path string, isDir bool) error { + if _, err := os.Stat(path); err != nil { + if os.IsNotExist(err) { + if isDir { + return os.MkdirAll(path, 0755) + } + if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + return err + } + f, err := os.OpenFile(path, os.O_CREATE, 0755) + if err != nil { + return err + } + f.Close() + } + } + return nil +} + +// readonlyPath will make a path read only. +func readonlyPath(path string) error { + if err := unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REC, ""); err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + return unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "") +} + +// remountReadonly will remount an existing mount point and ensure that it is read-only. +func remountReadonly(m *configs.Mount) error { + var ( + dest = m.Destination + flags = m.Flags + ) + for i := 0; i < 5; i++ { + if err := unix.Mount("", dest, "", uintptr(flags|unix.MS_REMOUNT|unix.MS_RDONLY), ""); err != nil { + switch err { + case unix.EBUSY: + time.Sleep(100 * time.Millisecond) + continue + default: + return err + } + } + return nil + } + return fmt.Errorf("unable to mount %s as readonly max retries reached", dest) +} + +// maskPath masks the top of the specified path inside a container to avoid +// security issues from processes reading information from non-namespace aware +// mounts ( proc/kcore ). +// For files, maskPath bind mounts /dev/null over the top of the specified path. +// For directories, maskPath mounts read-only tmpfs over the top of the specified path. +func maskPath(path string) error { + if err := unix.Mount("/dev/null", path, "", unix.MS_BIND, ""); err != nil && !os.IsNotExist(err) { + if err == unix.ENOTDIR { + return unix.Mount("tmpfs", path, "tmpfs", unix.MS_RDONLY, "") + } + return err + } + return nil +} + +// writeSystemProperty writes the value to a path under /proc/sys as determined from the key. +// For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward. +func writeSystemProperty(key, value string) error { + keyPath := strings.Replace(key, ".", "/", -1) + return ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0644) +} + +func remount(m *configs.Mount, rootfs string) error { + var ( + dest = m.Destination + ) + if !strings.HasPrefix(dest, rootfs) { + dest = filepath.Join(rootfs, dest) + } + if err := unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), ""); err != nil { + return err + } + return nil +} + +// Do the mount operation followed by additional mounts required to take care +// of propagation flags. +func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error { + var ( + dest = m.Destination + data = label.FormatMountLabel(m.Data, mountLabel) + flags = m.Flags + ) + if libcontainerUtils.CleanPath(dest) == "/dev" { + flags &= ^unix.MS_RDONLY + } + + copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP + if !(copyUp || strings.HasPrefix(dest, rootfs)) { + dest = filepath.Join(rootfs, dest) + } + + if err := unix.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil { + return err + } + + for _, pflag := range m.PropagationFlags { + if err := unix.Mount("", dest, "", uintptr(pflag), ""); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go new file mode 100644 index 00000000..ded5a6bb --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go @@ -0,0 +1,76 @@ +package seccomp + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var operators = map[string]configs.Operator{ + "SCMP_CMP_NE": configs.NotEqualTo, + "SCMP_CMP_LT": configs.LessThan, + "SCMP_CMP_LE": configs.LessThanOrEqualTo, + "SCMP_CMP_EQ": configs.EqualTo, + "SCMP_CMP_GE": configs.GreaterThanOrEqualTo, + "SCMP_CMP_GT": configs.GreaterThan, + "SCMP_CMP_MASKED_EQ": configs.MaskEqualTo, +} + +var actions = map[string]configs.Action{ + "SCMP_ACT_KILL": configs.Kill, + "SCMP_ACT_ERRNO": configs.Errno, + "SCMP_ACT_TRAP": configs.Trap, + "SCMP_ACT_ALLOW": configs.Allow, + "SCMP_ACT_TRACE": configs.Trace, +} + +var archs = map[string]string{ + "SCMP_ARCH_X86": "x86", + "SCMP_ARCH_X86_64": "amd64", + "SCMP_ARCH_X32": "x32", + "SCMP_ARCH_ARM": "arm", + "SCMP_ARCH_AARCH64": "arm64", + "SCMP_ARCH_MIPS": "mips", + "SCMP_ARCH_MIPS64": "mips64", + "SCMP_ARCH_MIPS64N32": "mips64n32", + "SCMP_ARCH_MIPSEL": "mipsel", + "SCMP_ARCH_MIPSEL64": "mipsel64", + "SCMP_ARCH_MIPSEL64N32": "mipsel64n32", + "SCMP_ARCH_PPC": "ppc", + "SCMP_ARCH_PPC64": "ppc64", + "SCMP_ARCH_PPC64LE": "ppc64le", + "SCMP_ARCH_S390": "s390", + "SCMP_ARCH_S390X": "s390x", +} + +// ConvertStringToOperator converts a string into a Seccomp comparison operator. +// Comparison operators use the names they are assigned by Libseccomp's header. +// Attempting to convert a string that is not a valid operator results in an +// error. +func ConvertStringToOperator(in string) (configs.Operator, error) { + if op, ok := operators[in]; ok == true { + return op, nil + } + return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in) +} + +// ConvertStringToAction converts a string into a Seccomp rule match action. +// Actions use the names they are assigned in Libseccomp's header, though some +// (notable, SCMP_ACT_TRACE) are not available in this implementation and will +// return errors. +// Attempting to convert a string that is not a valid action results in an +// error. +func ConvertStringToAction(in string) (configs.Action, error) { + if act, ok := actions[in]; ok == true { + return act, nil + } + return 0, fmt.Errorf("string %s is not a valid action for seccomp", in) +} + +// ConvertStringToArch converts a string into a Seccomp comparison arch. +func ConvertStringToArch(in string) (string, error) { + if arch, ok := archs[in]; ok == true { + return arch, nil + } + return "", fmt.Errorf("string %s is not a valid arch for seccomp", in) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go new file mode 100644 index 00000000..2523cbf9 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go @@ -0,0 +1,227 @@ +// +build linux,cgo,seccomp + +package seccomp + +import ( + "bufio" + "fmt" + "os" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + libseccomp "github.com/seccomp/libseccomp-golang" + + "golang.org/x/sys/unix" +) + +var ( + actAllow = libseccomp.ActAllow + actTrap = libseccomp.ActTrap + actKill = libseccomp.ActKill + actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM)) + actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM)) +) + +// Filters given syscalls in a container, preventing them from being used +// Started in the container init process, and carried over to all child processes +// Setns calls, however, require a separate invocation, as they are not children +// of the init until they join the namespace +func InitSeccomp(config *configs.Seccomp) error { + if config == nil { + return fmt.Errorf("cannot initialize Seccomp - nil config passed") + } + + defaultAction, err := getAction(config.DefaultAction) + if err != nil { + return fmt.Errorf("error initializing seccomp - invalid default action") + } + + filter, err := libseccomp.NewFilter(defaultAction) + if err != nil { + return fmt.Errorf("error creating filter: %s", err) + } + + // Add extra architectures + for _, arch := range config.Architectures { + scmpArch, err := libseccomp.GetArchFromString(arch) + if err != nil { + return err + } + + if err := filter.AddArch(scmpArch); err != nil { + return err + } + } + + // Unset no new privs bit + if err := filter.SetNoNewPrivsBit(false); err != nil { + return fmt.Errorf("error setting no new privileges: %s", err) + } + + // Add a rule for each syscall + for _, call := range config.Syscalls { + if call == nil { + return fmt.Errorf("encountered nil syscall while initializing Seccomp") + } + + if err = matchCall(filter, call); err != nil { + return err + } + } + + if err = filter.Load(); err != nil { + return fmt.Errorf("error loading seccomp filter into kernel: %s", err) + } + + return nil +} + +// IsEnabled returns if the kernel has been configured to support seccomp. +func IsEnabled() bool { + // Try to read from /proc/self/status for kernels > 3.8 + s, err := parseStatusFile("/proc/self/status") + if err != nil { + // Check if Seccomp is supported, via CONFIG_SECCOMP. + if err := unix.Prctl(unix.PR_GET_SECCOMP, 0, 0, 0, 0); err != unix.EINVAL { + // Make sure the kernel has CONFIG_SECCOMP_FILTER. + if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, 0, 0, 0); err != unix.EINVAL { + return true + } + } + return false + } + _, ok := s["Seccomp"] + return ok +} + +// Convert Libcontainer Action to Libseccomp ScmpAction +func getAction(act configs.Action) (libseccomp.ScmpAction, error) { + switch act { + case configs.Kill: + return actKill, nil + case configs.Errno: + return actErrno, nil + case configs.Trap: + return actTrap, nil + case configs.Allow: + return actAllow, nil + case configs.Trace: + return actTrace, nil + default: + return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule") + } +} + +// Convert Libcontainer Operator to Libseccomp ScmpCompareOp +func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) { + switch op { + case configs.EqualTo: + return libseccomp.CompareEqual, nil + case configs.NotEqualTo: + return libseccomp.CompareNotEqual, nil + case configs.GreaterThan: + return libseccomp.CompareGreater, nil + case configs.GreaterThanOrEqualTo: + return libseccomp.CompareGreaterEqual, nil + case configs.LessThan: + return libseccomp.CompareLess, nil + case configs.LessThanOrEqualTo: + return libseccomp.CompareLessOrEqual, nil + case configs.MaskEqualTo: + return libseccomp.CompareMaskedEqual, nil + default: + return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule") + } +} + +// Convert Libcontainer Arg to Libseccomp ScmpCondition +func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) { + cond := libseccomp.ScmpCondition{} + + if arg == nil { + return cond, fmt.Errorf("cannot convert nil to syscall condition") + } + + op, err := getOperator(arg.Op) + if err != nil { + return cond, err + } + + return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo) +} + +// Add a rule to match a single syscall +func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { + if call == nil || filter == nil { + return fmt.Errorf("cannot use nil as syscall to block") + } + + if len(call.Name) == 0 { + return fmt.Errorf("empty string is not a valid syscall") + } + + // If we can't resolve the syscall, assume it's not supported on this kernel + // Ignore it, don't error out + callNum, err := libseccomp.GetSyscallFromName(call.Name) + if err != nil { + return nil + } + + // Convert the call's action to the libseccomp equivalent + callAct, err := getAction(call.Action) + if err != nil { + return err + } + + // Unconditional match - just add the rule + if len(call.Args) == 0 { + if err = filter.AddRule(callNum, callAct); err != nil { + return err + } + } else { + // Conditional match - convert the per-arg rules into library format + conditions := []libseccomp.ScmpCondition{} + + for _, cond := range call.Args { + newCond, err := getCondition(cond) + if err != nil { + return err + } + + conditions = append(conditions, newCond) + } + + if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil { + return err + } + } + + return nil +} + +func parseStatusFile(path string) (map[string]string, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + s := bufio.NewScanner(f) + status := make(map[string]string) + + for s.Scan() { + text := s.Text() + parts := strings.Split(text, ":") + + if len(parts) <= 1 { + continue + } + + status[parts[0]] = parts[1] + } + if err := s.Err(); err != nil { + return nil, err + } + + return status, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go new file mode 100644 index 00000000..44df1ad4 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go @@ -0,0 +1,24 @@ +// +build !linux !cgo !seccomp + +package seccomp + +import ( + "errors" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") + +// InitSeccomp does nothing because seccomp is not supported. +func InitSeccomp(config *configs.Seccomp) error { + if config != nil { + return ErrSeccompNotEnabled + } + return nil +} + +// IsEnabled returns false, because it is not supported. +func IsEnabled() bool { + return false +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/setgroups_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/setgroups_linux.go new file mode 100644 index 00000000..c7bdb605 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/setgroups_linux.go @@ -0,0 +1,11 @@ +// +build linux,go1.5 + +package libcontainer + +import "syscall" + +// Set the GidMappingsEnableSetgroups member to true, so the process's +// setgroups proc entry wont be set to 'deny' if GidMappings are set +func enableSetgroups(sys *syscall.SysProcAttr) { + sys.GidMappingsEnableSetgroups = true +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go new file mode 100644 index 00000000..35b84219 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go @@ -0,0 +1,65 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + + "github.com/opencontainers/runc/libcontainer/apparmor" + "github.com/opencontainers/runc/libcontainer/keys" + "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/selinux/go-selinux/label" + + "golang.org/x/sys/unix" +) + +// linuxSetnsInit performs the container's initialization for running a new process +// inside an existing container. +type linuxSetnsInit struct { + pipe *os.File + consoleSocket *os.File + config *initConfig +} + +func (l *linuxSetnsInit) getSessionRingName() string { + return fmt.Sprintf("_ses.%s", l.config.ContainerId) +} + +func (l *linuxSetnsInit) Init() error { + if !l.config.Config.NoNewKeyring { + // do not inherit the parent's session keyring + if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil { + return err + } + } + if l.config.CreateConsole { + if err := setupConsole(l.consoleSocket, l.config, false); err != nil { + return err + } + if err := system.Setctty(); err != nil { + return err + } + } + if l.config.NoNewPrivileges { + if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { + return err + } + } + if l.config.Config.Seccomp != nil { + if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return err + } + } + if err := finalizeNamespace(l.config); err != nil { + return err + } + if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { + return err + } + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + return err + } + return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go b/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go new file mode 100644 index 00000000..0bbe1495 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go @@ -0,0 +1,27 @@ +package stacktrace + +import "runtime" + +// Capture captures a stacktrace for the current calling go program +// +// skip is the number of frames to skip +func Capture(userSkip int) Stacktrace { + var ( + skip = userSkip + 1 // add one for our own function + frames []Frame + prevPc uintptr + ) + for i := skip; ; i++ { + pc, file, line, ok := runtime.Caller(i) + //detect if caller is repeated to avoid loop, gccgo + //currently runs into a loop without this check + if !ok || pc == prevPc { + break + } + frames = append(frames, NewFrame(pc, file, line)) + prevPc = pc + } + return Stacktrace{ + Frames: frames, + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/frame.go b/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/frame.go new file mode 100644 index 00000000..0d590d9a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/frame.go @@ -0,0 +1,38 @@ +package stacktrace + +import ( + "path/filepath" + "runtime" + "strings" +) + +// NewFrame returns a new stack frame for the provided information +func NewFrame(pc uintptr, file string, line int) Frame { + fn := runtime.FuncForPC(pc) + if fn == nil { + return Frame{} + } + pack, name := parseFunctionName(fn.Name()) + return Frame{ + Line: line, + File: filepath.Base(file), + Package: pack, + Function: name, + } +} + +func parseFunctionName(name string) (string, string) { + i := strings.LastIndex(name, ".") + if i == -1 { + return "", name + } + return name[:i], name[i+1:] +} + +// Frame contains all the information for a stack frame within a go program +type Frame struct { + File string + Function string + Package string + Line int +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/stacktrace.go b/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/stacktrace.go new file mode 100644 index 00000000..5e8b58d2 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/stacktrace.go @@ -0,0 +1,5 @@ +package stacktrace + +type Stacktrace struct { + Frames []Frame +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go new file mode 100644 index 00000000..580b3fe4 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go @@ -0,0 +1,188 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + "os/exec" + "syscall" //only for Exec + + "github.com/opencontainers/runc/libcontainer/apparmor" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/keys" + "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/selinux/go-selinux/label" + + "golang.org/x/sys/unix" +) + +type linuxStandardInit struct { + pipe *os.File + consoleSocket *os.File + parentPid int + stateDirFD int + config *initConfig +} + +func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { + var newperms uint32 + + if l.config.Config.Namespaces.Contains(configs.NEWUSER) { + // with user ns we need 'other' search permissions + newperms = 0x8 + } else { + // without user ns we need 'UID' search permissions + newperms = 0x80000 + } + + // create a unique per session container name that we can + // join in setns; however, other containers can also join it + return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms +} + +func (l *linuxStandardInit) Init() error { + if !l.config.Config.NoNewKeyring { + ringname, keepperms, newperms := l.getSessionRingParams() + + // do not inherit the parent's session keyring + sessKeyId, err := keys.JoinSessionKeyring(ringname) + if err != nil { + return err + } + // make session keyring searcheable + if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { + return err + } + } + + if err := setupNetwork(l.config); err != nil { + return err + } + if err := setupRoute(l.config.Config); err != nil { + return err + } + + label.Init() + + // prepareRootfs() can be executed only for a new mount namespace. + if l.config.Config.Namespaces.Contains(configs.NEWNS) { + if err := prepareRootfs(l.pipe, l.config.Config); err != nil { + return err + } + } + + // Set up the console. This has to be done *before* we finalize the rootfs, + // but *after* we've given the user the chance to set up all of the mounts + // they wanted. + if l.config.CreateConsole { + if err := setupConsole(l.consoleSocket, l.config, true); err != nil { + return err + } + if err := system.Setctty(); err != nil { + return err + } + } + + // Finish the rootfs setup. + if l.config.Config.Namespaces.Contains(configs.NEWNS) { + if err := finalizeRootfs(l.config.Config); err != nil { + return err + } + } + + if hostname := l.config.Config.Hostname; hostname != "" { + if err := unix.Sethostname([]byte(hostname)); err != nil { + return err + } + } + if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { + return err + } + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + return err + } + + for key, value := range l.config.Config.Sysctl { + if err := writeSystemProperty(key, value); err != nil { + return err + } + } + for _, path := range l.config.Config.ReadonlyPaths { + if err := readonlyPath(path); err != nil { + return err + } + } + for _, path := range l.config.Config.MaskPaths { + if err := maskPath(path); err != nil { + return err + } + } + pdeath, err := system.GetParentDeathSignal() + if err != nil { + return err + } + if l.config.NoNewPrivileges { + if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { + return err + } + } + // Tell our parent that we're ready to Execv. This must be done before the + // Seccomp rules have been applied, because we need to be able to read and + // write to a socket. + if err := syncParentReady(l.pipe); err != nil { + return err + } + // Without NoNewPrivileges seccomp is a privileged operation, so we need to + // do this before dropping capabilities; otherwise do it as late as possible + // just before execve so as few syscalls take place after it as possible. + if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { + if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return err + } + } + if err := finalizeNamespace(l.config); err != nil { + return err + } + // finalizeNamespace can change user/group which clears the parent death + // signal, so we restore it here. + if err := pdeath.Restore(); err != nil { + return err + } + // compare the parent from the initial start of the init process and make sure that it did not change. + // if the parent changes that means it died and we were reparented to something else so we should + // just kill ourself and not cause problems for someone else. + if unix.Getppid() != l.parentPid { + return unix.Kill(unix.Getpid(), unix.SIGKILL) + } + // check for the arg before waiting to make sure it exists and it is returned + // as a create time error. + name, err := exec.LookPath(l.config.Args[0]) + if err != nil { + return err + } + // close the pipe to signal that we have completed our init. + l.pipe.Close() + // wait for the fifo to be opened on the other side before + // exec'ing the users process. + fd, err := unix.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|unix.O_CLOEXEC, 0) + if err != nil { + return newSystemErrorWithCause(err, "openat exec fifo") + } + if _, err := unix.Write(fd, []byte("0")); err != nil { + return newSystemErrorWithCause(err, "write 0 exec fifo") + } + if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { + if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return newSystemErrorWithCause(err, "init seccomp") + } + } + // close the statedir fd before exec because the kernel resets dumpable in the wrong order + // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 + unix.Close(l.stateDirFD) + if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { + return newSystemErrorWithCause(err, "exec user process") + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go new file mode 100644 index 00000000..07055695 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go @@ -0,0 +1,248 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/Sirupsen/logrus" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" + + "golang.org/x/sys/unix" +) + +func newStateTransitionError(from, to containerState) error { + return &stateTransitionError{ + From: from.status().String(), + To: to.status().String(), + } +} + +// stateTransitionError is returned when an invalid state transition happens from one +// state to another. +type stateTransitionError struct { + From string + To string +} + +func (s *stateTransitionError) Error() string { + return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To) +} + +type containerState interface { + transition(containerState) error + destroy() error + status() Status +} + +func destroy(c *linuxContainer) error { + if !c.config.Namespaces.Contains(configs.NEWPID) { + if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil { + logrus.Warn(err) + } + } + err := c.cgroupManager.Destroy() + if rerr := os.RemoveAll(c.root); err == nil { + err = rerr + } + c.initProcess = nil + if herr := runPoststopHooks(c); err == nil { + err = herr + } + c.state = &stoppedState{c: c} + return err +} + +func runPoststopHooks(c *linuxContainer) error { + if c.config.Hooks != nil { + s := configs.HookState{ + Version: c.config.Version, + ID: c.id, + Bundle: utils.SearchLabels(c.config.Labels, "bundle"), + } + for _, hook := range c.config.Hooks.Poststop { + if err := hook.Run(s); err != nil { + return err + } + } + } + return nil +} + +// stoppedState represents a container is a stopped/destroyed state. +type stoppedState struct { + c *linuxContainer +} + +func (b *stoppedState) status() Status { + return Stopped +} + +func (b *stoppedState) transition(s containerState) error { + switch s.(type) { + case *runningState, *restoredState: + b.c.state = s + return nil + case *stoppedState: + return nil + } + return newStateTransitionError(b, s) +} + +func (b *stoppedState) destroy() error { + return destroy(b.c) +} + +// runningState represents a container that is currently running. +type runningState struct { + c *linuxContainer +} + +func (r *runningState) status() Status { + return Running +} + +func (r *runningState) transition(s containerState) error { + switch s.(type) { + case *stoppedState: + t, err := r.c.runType() + if err != nil { + return err + } + if t == Running { + return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped) + } + r.c.state = s + return nil + case *pausedState: + r.c.state = s + return nil + case *runningState: + return nil + } + return newStateTransitionError(r, s) +} + +func (r *runningState) destroy() error { + t, err := r.c.runType() + if err != nil { + return err + } + if t == Running { + return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped) + } + return destroy(r.c) +} + +type createdState struct { + c *linuxContainer +} + +func (i *createdState) status() Status { + return Created +} + +func (i *createdState) transition(s containerState) error { + switch s.(type) { + case *runningState, *pausedState, *stoppedState: + i.c.state = s + return nil + case *createdState: + return nil + } + return newStateTransitionError(i, s) +} + +func (i *createdState) destroy() error { + i.c.initProcess.signal(unix.SIGKILL) + return destroy(i.c) +} + +// pausedState represents a container that is currently pause. It cannot be destroyed in a +// paused state and must transition back to running first. +type pausedState struct { + c *linuxContainer +} + +func (p *pausedState) status() Status { + return Paused +} + +func (p *pausedState) transition(s containerState) error { + switch s.(type) { + case *runningState, *stoppedState: + p.c.state = s + return nil + case *pausedState: + return nil + } + return newStateTransitionError(p, s) +} + +func (p *pausedState) destroy() error { + t, err := p.c.runType() + if err != nil { + return err + } + if t != Running && t != Created { + if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil { + return err + } + return destroy(p.c) + } + return newGenericError(fmt.Errorf("container is paused"), ContainerPaused) +} + +// restoredState is the same as the running state but also has associated checkpoint +// information that maybe need destroyed when the container is stopped and destroy is called. +type restoredState struct { + imageDir string + c *linuxContainer +} + +func (r *restoredState) status() Status { + return Running +} + +func (r *restoredState) transition(s containerState) error { + switch s.(type) { + case *stoppedState, *runningState: + return nil + } + return newStateTransitionError(r, s) +} + +func (r *restoredState) destroy() error { + if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil { + if !os.IsNotExist(err) { + return err + } + } + return destroy(r.c) +} + +// loadedState is used whenever a container is restored, loaded, or setting additional +// processes inside and it should not be destroyed when it is exiting. +type loadedState struct { + c *linuxContainer + s Status +} + +func (n *loadedState) status() Status { + return n.s +} + +func (n *loadedState) transition(s containerState) error { + n.c.state = s + return nil +} + +func (n *loadedState) destroy() error { + if err := n.c.refreshState(); err != nil { + return err + } + return n.c.state.destroy() +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/stats.go new file mode 100644 index 00000000..303e4b94 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/stats.go @@ -0,0 +1,15 @@ +package libcontainer + +type NetworkInterface struct { + // Name is the name of the network interface. + Name string + + RxBytes uint64 + RxPackets uint64 + RxErrors uint64 + RxDropped uint64 + TxBytes uint64 + TxPackets uint64 + TxErrors uint64 + TxDropped uint64 +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats_freebsd.go b/vendor/github.com/opencontainers/runc/libcontainer/stats_freebsd.go new file mode 100644 index 00000000..f8d1d689 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/stats_freebsd.go @@ -0,0 +1,5 @@ +package libcontainer + +type Stats struct { + Interfaces []*NetworkInterface +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/stats_linux.go new file mode 100644 index 00000000..c629dc67 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/stats_linux.go @@ -0,0 +1,8 @@ +package libcontainer + +import "github.com/opencontainers/runc/libcontainer/cgroups" + +type Stats struct { + Interfaces []*NetworkInterface + CgroupStats *cgroups.Stats +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go new file mode 100644 index 00000000..da78c1c2 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go @@ -0,0 +1,7 @@ +package libcontainer + +// Solaris - TODO + +type Stats struct { + Interfaces []*NetworkInterface +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/stats_windows.go new file mode 100644 index 00000000..f8d1d689 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/stats_windows.go @@ -0,0 +1,5 @@ +package libcontainer + +type Stats struct { + Interfaces []*NetworkInterface +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/sync.go b/vendor/github.com/opencontainers/runc/libcontainer/sync.go new file mode 100644 index 00000000..cf7b45bc --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/sync.go @@ -0,0 +1,107 @@ +package libcontainer + +import ( + "encoding/json" + "fmt" + "io" + + "github.com/opencontainers/runc/libcontainer/utils" +) + +type syncType string + +// Constants that are used for synchronisation between the parent and child +// during container setup. They come in pairs (with procError being a generic +// response which is followed by a &genericError). +// +// [ child ] <-> [ parent ] +// +// procHooks --> [run hooks] +// <-- procResume +// +// procConsole --> +// <-- procConsoleReq +// [send(fd)] --> [recv(fd)] +// <-- procConsoleAck +// +// procReady --> [final setup] +// <-- procRun +const ( + procError syncType = "procError" + procReady syncType = "procReady" + procRun syncType = "procRun" + procHooks syncType = "procHooks" + procResume syncType = "procResume" +) + +type syncT struct { + Type syncType `json:"type"` +} + +// writeSync is used to write to a synchronisation pipe. An error is returned +// if there was a problem writing the payload. +func writeSync(pipe io.Writer, sync syncType) error { + if err := utils.WriteJSON(pipe, syncT{sync}); err != nil { + return err + } + return nil +} + +// readSync is used to read from a synchronisation pipe. An error is returned +// if we got a genericError, the pipe was closed, or we got an unexpected flag. +func readSync(pipe io.Reader, expected syncType) error { + var procSync syncT + if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { + if err == io.EOF { + return fmt.Errorf("parent closed synchronisation channel") + } + + if procSync.Type == procError { + var ierr genericError + + if err := json.NewDecoder(pipe).Decode(&ierr); err != nil { + return fmt.Errorf("failed reading error from parent: %v", err) + } + + return &ierr + } + + if procSync.Type != expected { + return fmt.Errorf("invalid synchronisation flag from parent") + } + } + return nil +} + +// parseSync runs the given callback function on each syncT received from the +// child. It will return once io.EOF is returned from the given pipe. +func parseSync(pipe io.Reader, fn func(*syncT) error) error { + dec := json.NewDecoder(pipe) + for { + var sync syncT + if err := dec.Decode(&sync); err != nil { + if err == io.EOF { + break + } + return err + } + + // We handle this case outside fn for cleanliness reasons. + var ierr *genericError + if sync.Type == procError { + if err := dec.Decode(&ierr); err != nil && err != io.EOF { + return newSystemErrorWithCause(err, "decoding proc error from init") + } + if ierr != nil { + return ierr + } + // Programmer error. + panic("No error following JSON procError payload.") + } + + if err := fn(&sync); err != nil { + return err + } + } + return nil +}