diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/README.md b/vendor/src/github.com/opencontainers/runc/libcontainer/README.md new file mode 100644 index 00000000..6331010f --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/README.md @@ -0,0 +1,244 @@ +Libcontainer provides a native Go implementation for creating containers +with namespaces, cgroups, capabilities, and filesystem access controls. +It allows you to manage the lifecycle of the container performing additional operations +after the container is created. + + +#### Container +A container is a self contained execution environment that shares the kernel of the +host system and which is (optionally) isolated from other containers in the system. + +#### Using libcontainer + +Because containers are spawned in a two step process you will need a binary that +will be executed as the init process for the container. In libcontainer, we use +the current binary (/proc/self/exe) to be executed as the init process, and use +arg "init", we call the first step process "bootstrap", so you always need a "init" +function as the entry of "bootstrap". + +```go +func init() { + if len(os.Args) > 1 && os.Args[1] == "init" { + runtime.GOMAXPROCS(1) + runtime.LockOSThread() + factory, _ := libcontainer.New("") + if err := factory.StartInitialization(); err != nil { + logrus.Fatal(err) + } + panic("--this line should have never been executed, congratulations--") + } +} +``` + +Then to create a container you first have to initialize an instance of a factory +that will handle the creation and initialization for a container. + +```go +factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init")) +if err != nil { + logrus.Fatal(err) + return +} +``` + +Once you have an instance of the factory created we can create a configuration +struct describing how the container is to be created. A sample would look similar to this: + +```go +defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV +config := &configs.Config{ + Rootfs: "/your/path/to/rootfs", + Capabilities: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Namespaces: configs.Namespaces([]configs.Namespace{ + {Type: configs.NEWNS}, + {Type: configs.NEWUTS}, + {Type: configs.NEWIPC}, + {Type: configs.NEWPID}, + {Type: configs.NEWUSER}, + {Type: configs.NEWNET}, + }), + Cgroups: &configs.Cgroup{ + Name: "test-container", + Parent: "system", + Resources: &configs.Resources{ + MemorySwappiness: nil, + AllowAllDevices: false, + AllowedDevices: configs.DefaultAllowedDevices, + }, + }, + MaskPaths: []string{ + "/proc/kcore", + }, + ReadonlyPaths: []string{ + "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", + }, + Devices: configs.DefaultAutoCreatedDevices, + Hostname: "testing", + Mounts: []*configs.Mount{ + { + Source: "proc", + Destination: "/proc", + Device: "proc", + Flags: defaultMountFlags, + }, + { + Source: "tmpfs", + Destination: "/dev", + Device: "tmpfs", + Flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, + Data: "mode=755", + }, + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + Flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, + Data: "newinstance,ptmxmode=0666,mode=0620,gid=5", + }, + { + Device: "tmpfs", + Source: "shm", + Destination: "/dev/shm", + Data: "mode=1777,size=65536k", + Flags: defaultMountFlags, + }, + { + Source: "mqueue", + Destination: "/dev/mqueue", + Device: "mqueue", + Flags: defaultMountFlags, + }, + { + Source: "sysfs", + Destination: "/sys", + Device: "sysfs", + Flags: defaultMountFlags | syscall.MS_RDONLY, + }, + }, + UidMappings: []configs.IDMap{ + { + ContainerID: 0, + HostID: 1000, + Size: 65536, + }, + }, + GidMappings: []configs.IDMap{ + { + ContainerID: 0, + HostID: 1000, + Size: 65536, + }, + }, + Networks: []*configs.Network{ + { + Type: "loopback", + Address: "127.0.0.1/0", + Gateway: "localhost", + }, + }, + Rlimits: []configs.Rlimit{ + { + Type: syscall.RLIMIT_NOFILE, + Hard: uint64(1025), + Soft: uint64(1025), + }, + }, +} +``` + +Once you have the configuration populated you can create a container: + +```go +container, err := factory.Create("container-id", config) +if err != nil { + logrus.Fatal(err) + return +} +``` + +To spawn bash as the initial process inside the container and have the +processes pid returned in order to wait, signal, or kill the process: + +```go +process := &libcontainer.Process{ + Args: []string{"/bin/bash"}, + Env: []string{"PATH=/bin"}, + User: "daemon", + Stdin: os.Stdin, + Stdout: os.Stdout, + Stderr: os.Stderr, +} + +err := container.Start(process) +if err != nil { + container.Destroy() + logrus.Fatal(err) + return +} + +// wait for the process to finish. +_, err := process.Wait() +if err != nil { + logrus.Fatal(err) +} + +// destroy the container. +container.Destroy() +``` + +Additional ways to interact with a running container are: + +```go +// return all the pids for all processes running inside the container. +processes, err := container.Processes() + +// get detailed cpu, memory, io, and network statistics for the container and +// it's processes. +stats, err := container.Stats() + +// pause all processes inside the container. +container.Pause() + +// resume all paused processes. +container.Resume() + +// send signal to container's init process. +container.Signal(signal) + +// update container resource constraints. +container.Set(config) +``` + + +#### Checkpoint & Restore + +libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers. +This let's you save the state of a process running inside a container to disk, and then restore +that state into a new process, on the same machine or on another machine. + +`criu` version 1.5.2 or higher is required to use checkpoint and restore. +If you don't already have `criu` installed, you can build it from source, following the +[online instructions](http://criu.org/Installation). `criu` is also installed in the docker image +generated when building libcontainer with docker. + + +## Copyright and license + +Code and documentation copyright 2014 Docker, inc. Code released under the Apache 2.0 license. +Docs released under Creative commons. + diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/SPEC.md b/vendor/src/github.com/opencontainers/runc/libcontainer/SPEC.md new file mode 100644 index 00000000..32578f01 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/SPEC.md @@ -0,0 +1,335 @@ +## Container Specification - v1 + +This is the standard configuration for version 1 containers. It includes +namespaces, standard filesystem setup, a default Linux capability set, and +information about resource reservations. It also has information about any +populated environment settings for the processes running inside a container. + +Along with the configuration of how a container is created the standard also +discusses actions that can be performed on a container to manage and inspect +information about the processes running inside. + +The v1 profile is meant to be able to accommodate the majority of applications +with a strong security configuration. + +### System Requirements and Compatibility + +Minimum requirements: +* Kernel version - 3.10 recommended 2.6.2x minimum(with backported patches) +* Mounted cgroups with each subsystem in its own hierarchy + + +### Namespaces + +| Flag | Enabled | +| ------------ | ------- | +| CLONE_NEWPID | 1 | +| CLONE_NEWUTS | 1 | +| CLONE_NEWIPC | 1 | +| CLONE_NEWNET | 1 | +| CLONE_NEWNS | 1 | +| CLONE_NEWUSER | 1 | + +Namespaces are created for the container via the `clone` syscall. + + +### Filesystem + +A root filesystem must be provided to a container for execution. The container +will use this root filesystem (rootfs) to jail and spawn processes inside where +the binaries and system libraries are local to that directory. Any binaries +to be executed must be contained within this rootfs. + +Mounts that happen inside the container are automatically cleaned up when the +container exits as the mount namespace is destroyed and the kernel will +unmount all the mounts that were setup within that namespace. + +For a container to execute properly there are certain filesystems that +are required to be mounted within the rootfs that the runtime will setup. + +| Path | Type | Flags | Data | +| ----------- | ------ | -------------------------------------- | ---------------------------------------- | +| /proc | proc | MS_NOEXEC,MS_NOSUID,MS_NODEV | | +| /dev | tmpfs | MS_NOEXEC,MS_STRICTATIME | mode=755 | +| /dev/shm | tmpfs | MS_NOEXEC,MS_NOSUID,MS_NODEV | mode=1777,size=65536k | +| /dev/mqueue | mqueue | MS_NOEXEC,MS_NOSUID,MS_NODEV | | +| /dev/pts | devpts | MS_NOEXEC,MS_NOSUID | newinstance,ptmxmode=0666,mode=620,gid=5 | +| /sys | sysfs | MS_NOEXEC,MS_NOSUID,MS_NODEV,MS_RDONLY | | + + +After a container's filesystems are mounted within the newly created +mount namespace `/dev` will need to be populated with a set of device nodes. +It is expected that a rootfs does not need to have any device nodes specified +for `/dev` within the rootfs as the container will setup the correct devices +that are required for executing a container's process. + +| Path | Mode | Access | +| ------------ | ---- | ---------- | +| /dev/null | 0666 | rwm | +| /dev/zero | 0666 | rwm | +| /dev/full | 0666 | rwm | +| /dev/tty | 0666 | rwm | +| /dev/random | 0666 | rwm | +| /dev/urandom | 0666 | rwm | +| /dev/fuse | 0666 | rwm | + + +**ptmx** +`/dev/ptmx` will need to be a symlink to the host's `/dev/ptmx` within +the container. + +The use of a pseudo TTY is optional within a container and it should support both. +If a pseudo is provided to the container `/dev/console` will need to be +setup by binding the console in `/dev/` after it has been populated and mounted +in tmpfs. + +| Source | Destination | UID GID | Mode | Type | +| --------------- | ------------ | ------- | ---- | ---- | +| *pty host path* | /dev/console | 0 0 | 0600 | bind | + + +After `/dev/null` has been setup we check for any external links between +the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing +to `/dev/null` outside the container we close and `dup2` the `/dev/null` +that is local to the container's rootfs. + + +After the container has `/proc` mounted a few standard symlinks are setup +within `/dev/` for the io. + +| Source | Destination | +| --------------- | ----------- | +| /proc/self/fd | /dev/fd | +| /proc/self/fd/0 | /dev/stdin | +| /proc/self/fd/1 | /dev/stdout | +| /proc/self/fd/2 | /dev/stderr | + +A `pivot_root` is used to change the root for the process, effectively +jailing the process inside the rootfs. + +```c +put_old = mkdir(...); +pivot_root(rootfs, put_old); +chdir("/"); +unmount(put_old, MS_DETACH); +rmdir(put_old); +``` + +For container's running with a rootfs inside `ramfs` a `MS_MOVE` combined +with a `chroot` is required as `pivot_root` is not supported in `ramfs`. + +```c +mount(rootfs, "/", NULL, MS_MOVE, NULL); +chroot("."); +chdir("/"); +``` + +The `umask` is set back to `0022` after the filesystem setup has been completed. + +### Resources + +Cgroups are used to handle resource allocation for containers. This includes +system resources like cpu, memory, and device access. + +| Subsystem | Enabled | +| ---------- | ------- | +| devices | 1 | +| memory | 1 | +| cpu | 1 | +| cpuacct | 1 | +| cpuset | 1 | +| blkio | 1 | +| perf_event | 1 | +| freezer | 1 | +| hugetlb | 1 | +| pids | 1 | + + +All cgroup subsystem are joined so that statistics can be collected from +each of the subsystems. Freezer does not expose any stats but is joined +so that containers can be paused and resumed. + +The parent process of the container's init must place the init pid inside +the correct cgroups before the initialization begins. This is done so +that no processes or threads escape the cgroups. This sync is +done via a pipe ( specified in the runtime section below ) that the container's +init process will block waiting for the parent to finish setup. + +### Security + +The standard set of Linux capabilities that are set in a container +provide a good default for security and flexibility for the applications. + + +| Capability | Enabled | +| -------------------- | ------- | +| CAP_NET_RAW | 1 | +| CAP_NET_BIND_SERVICE | 1 | +| CAP_AUDIT_READ | 1 | +| CAP_AUDIT_WRITE | 1 | +| CAP_DAC_OVERRIDE | 1 | +| CAP_SETFCAP | 1 | +| CAP_SETPCAP | 1 | +| CAP_SETGID | 1 | +| CAP_SETUID | 1 | +| CAP_MKNOD | 1 | +| CAP_CHOWN | 1 | +| CAP_FOWNER | 1 | +| CAP_FSETID | 1 | +| CAP_KILL | 1 | +| CAP_SYS_CHROOT | 1 | +| CAP_NET_BROADCAST | 0 | +| CAP_SYS_MODULE | 0 | +| CAP_SYS_RAWIO | 0 | +| CAP_SYS_PACCT | 0 | +| CAP_SYS_ADMIN | 0 | +| CAP_SYS_NICE | 0 | +| CAP_SYS_RESOURCE | 0 | +| CAP_SYS_TIME | 0 | +| CAP_SYS_TTY_CONFIG | 0 | +| CAP_AUDIT_CONTROL | 0 | +| CAP_MAC_OVERRIDE | 0 | +| CAP_MAC_ADMIN | 0 | +| CAP_NET_ADMIN | 0 | +| CAP_SYSLOG | 0 | +| CAP_DAC_READ_SEARCH | 0 | +| CAP_LINUX_IMMUTABLE | 0 | +| CAP_IPC_LOCK | 0 | +| CAP_IPC_OWNER | 0 | +| CAP_SYS_PTRACE | 0 | +| CAP_SYS_BOOT | 0 | +| CAP_LEASE | 0 | +| CAP_WAKE_ALARM | 0 | +| CAP_BLOCK_SUSPEND | 0 | + + +Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor) +and [selinux](http://selinuxproject.org/page/Main_Page) can be used with +the containers. A container should support setting an apparmor profile or +selinux process and mount labels if provided in the configuration. + +Standard apparmor profile: +```c +#include +profile flags=(attach_disconnected,mediate_deleted) { + #include + network, + capability, + file, + umount, + + deny @{PROC}/sys/fs/** wklx, + deny @{PROC}/sysrq-trigger rwklx, + deny @{PROC}/mem rwklx, + deny @{PROC}/kmem rwklx, + deny @{PROC}/sys/kernel/[^s][^h][^m]* wklx, + deny @{PROC}/sys/kernel/*/** wklx, + + deny mount, + + deny /sys/[^f]*/** wklx, + deny /sys/f[^s]*/** wklx, + deny /sys/fs/[^c]*/** wklx, + deny /sys/fs/c[^g]*/** wklx, + deny /sys/fs/cg[^r]*/** wklx, + deny /sys/firmware/efi/efivars/** rwklx, + deny /sys/kernel/security/** rwklx, +} +``` + +*TODO: seccomp work is being done to find a good default config* + +### Runtime and Init Process + +During container creation the parent process needs to talk to the container's init +process and have a form of synchronization. This is accomplished by creating +a pipe that is passed to the container's init. When the init process first spawns +it will block on its side of the pipe until the parent closes its side. This +allows the parent to have time to set the new process inside a cgroup hierarchy +and/or write any uid/gid mappings required for user namespaces. +The pipe is passed to the init process via FD 3. + +The application consuming libcontainer should be compiled statically. libcontainer +does not define any init process and the arguments provided are used to `exec` the +process inside the application. There should be no long running init within the +container spec. + +If a pseudo tty is provided to a container it will open and `dup2` the console +as the container's STDIN, STDOUT, STDERR as well as mounting the console +as `/dev/console`. + +An extra set of mounts are provided to a container and setup for use. A container's +rootfs can contain some non portable files inside that can cause side effects during +execution of a process. These files are usually created and populated with the container +specific information via the runtime. + +**Extra runtime files:** +* /etc/hosts +* /etc/resolv.conf +* /etc/hostname +* /etc/localtime + + +#### Defaults + +There are a few defaults that can be overridden by users, but in their omission +these apply to processes within a container. + +| Type | Value | +| ------------------- | ------------------------------ | +| Parent Death Signal | SIGKILL | +| UID | 0 | +| GID | 0 | +| GROUPS | 0, NULL | +| CWD | "/" | +| $HOME | Current user's home dir or "/" | +| Readonly rootfs | false | +| Pseudo TTY | false | + + +## Actions + +After a container is created there is a standard set of actions that can +be done to the container. These actions are part of the public API for +a container. + +| Action | Description | +| -------------- | ------------------------------------------------------------------ | +| Get processes | Return all the pids for processes running inside a container | +| Get Stats | Return resource statistics for the container as a whole | +| Wait | Waits on the container's init process ( pid 1 ) | +| Wait Process | Wait on any of the container's processes returning the exit status | +| Destroy | Kill the container's init process and remove any filesystem state | +| Signal | Send a signal to the container's init process | +| Signal Process | Send a signal to any of the container's processes | +| Pause | Pause all processes inside the container | +| Resume | Resume all processes inside the container if paused | +| Exec | Execute a new process inside of the container ( requires setns ) | +| Set | Setup configs of the container after it's created | + +### Execute a new process inside of a running container. + +User can execute a new process inside of a running container. Any binaries to be +executed must be accessible within the container's rootfs. + +The started process will run inside the container's rootfs. Any changes +made by the process to the container's filesystem will persist after the +process finished executing. + +The started process will join all the container's existing namespaces. When the +container is paused, the process will also be paused and will resume when +the container is unpaused. The started process will only run when the container's +primary process (PID 1) is running, and will not be restarted when the container +is restarted. + +#### Planned additions + +The started process will have its own cgroups nested inside the container's +cgroups. This is used for process tracking and optionally resource allocation +handling for the new process. Freezer cgroup is required, the rest of the cgroups +are optional. The process executor must place its pid inside the correct +cgroups before starting the process. This is done so that no child processes or +threads can escape the cgroups. + +When the process is stopped, the process executor will try (in a best-effort way) +to stop all its children and remove the sub-cgroups. diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go b/vendor/src/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go new file mode 100644 index 00000000..82ed1a68 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go @@ -0,0 +1,39 @@ +// +build apparmor,linux + +package apparmor + +// #cgo LDFLAGS: -lapparmor +// #include +// #include +import "C" +import ( + "fmt" + "io/ioutil" + "os" + "unsafe" +) + +// IsEnabled returns true if apparmor is enabled for the host. +func IsEnabled() bool { + if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil && os.Getenv("container") == "" { + if _, err = os.Stat("/sbin/apparmor_parser"); err == nil { + buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled") + return err == nil && len(buf) > 1 && buf[0] == 'Y' + } + } + return false +} + +// ApplyProfile will apply the profile with the specified name to the process after +// the next exec. +func ApplyProfile(name string) error { + if name == "" { + return nil + } + cName := C.CString(name) + defer C.free(unsafe.Pointer(cName)) + if _, err := C.aa_change_onexec(cName); err != nil { + return fmt.Errorf("apparmor failed to apply profile: %s", err) + } + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/apparmor/apparmor_disabled.go b/vendor/src/github.com/opencontainers/runc/libcontainer/apparmor/apparmor_disabled.go new file mode 100644 index 00000000..d4110cf0 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/apparmor/apparmor_disabled.go @@ -0,0 +1,20 @@ +// +build !apparmor !linux + +package apparmor + +import ( + "errors" +) + +var ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported") + +func IsEnabled() bool { + return false +} + +func ApplyProfile(name string) error { + if name != "" { + return ErrApparmorNotEnabled + } + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/capabilities_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/capabilities_linux.go new file mode 100644 index 00000000..4eda56d1 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/capabilities_linux.go @@ -0,0 +1,69 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + "strings" + + "github.com/syndtr/gocapability/capability" +) + +const allCapabilityTypes = capability.CAPS | capability.BOUNDS + +var capabilityMap map[string]capability.Cap + +func init() { + capabilityMap = make(map[string]capability.Cap) + last := capability.CAP_LAST_CAP + // workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap + if last == capability.Cap(63) { + last = capability.CAP_BLOCK_SUSPEND + } + for _, cap := range capability.List() { + if cap > last { + continue + } + capKey := fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String())) + capabilityMap[capKey] = cap + } +} + +func newCapWhitelist(caps []string) (*whitelist, error) { + l := []capability.Cap{} + for _, c := range caps { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + l = append(l, v) + } + pid, err := capability.NewPid(os.Getpid()) + if err != nil { + return nil, err + } + return &whitelist{ + keep: l, + pid: pid, + }, nil +} + +type whitelist struct { + pid capability.Capabilities + keep []capability.Cap +} + +// dropBoundingSet drops the capability bounding set to those specified in the whitelist. +func (w *whitelist) dropBoundingSet() error { + w.pid.Clear(capability.BOUNDS) + w.pid.Set(capability.BOUNDS, w.keep...) + return w.pid.Apply(capability.BOUNDS) +} + +// drop drops all capabilities for the current process except those specified in the whitelist. +func (w *whitelist) drop() error { + w.pid.Clear(allCapabilityTypes) + w.pid.Set(allCapabilityTypes, w.keep...) + return w.pid.Apply(allCapabilityTypes) +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go new file mode 100644 index 00000000..274ab47d --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go @@ -0,0 +1,64 @@ +// +build linux + +package cgroups + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Manager interface { + // Applies cgroup configuration to the process with the specified pid + Apply(pid int) error + + // Returns the PIDs inside the cgroup set + GetPids() ([]int, error) + + // Returns the PIDs inside the cgroup set & all sub-cgroups + GetAllPids() ([]int, error) + + // Returns statistics for the cgroup set + GetStats() (*Stats, error) + + // Toggles the freezer cgroup according with specified state + Freeze(state configs.FreezerState) error + + // Destroys the cgroup set + Destroy() error + + // NewCgroupManager() and LoadCgroupManager() require following attributes: + // Paths map[string]string + // Cgroups *cgroups.Cgroup + // Paths maps cgroup subsystem to path at which it is mounted. + // Cgroups specifies specific cgroup settings for the various subsystems + + // Returns cgroup paths to save in a state file and to be able to + // restore the object later. + GetPaths() map[string]string + + // Set the cgroup as configured. + Set(container *configs.Config) error +} + +type NotFoundError struct { + Subsystem string +} + +func (e *NotFoundError) Error() string { + return fmt.Sprintf("mountpoint for %s not found", e.Subsystem) +} + +func NewNotFoundError(sub string) error { + return &NotFoundError{ + Subsystem: sub, + } +} + +func IsNotFound(err error) bool { + if err == nil { + return false + } + _, ok := err.(*NotFoundError) + return ok +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/cgroups_unsupported.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/cgroups_unsupported.go new file mode 100644 index 00000000..278d507e --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/cgroups_unsupported.go @@ -0,0 +1,3 @@ +// +build !linux + +package cgroups diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go new file mode 100644 index 00000000..633ab042 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go @@ -0,0 +1,402 @@ +// +build linux + +package fs + +import ( + "errors" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "sync" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" +) + +var ( + subsystems = subsystemSet{ + &CpusetGroup{}, + &DevicesGroup{}, + &MemoryGroup{}, + &CpuGroup{}, + &CpuacctGroup{}, + &PidsGroup{}, + &BlkioGroup{}, + &HugetlbGroup{}, + &NetClsGroup{}, + &NetPrioGroup{}, + &PerfEventGroup{}, + &FreezerGroup{}, + &NameGroup{GroupName: "name=systemd", Join: true}, + } + CgroupProcesses = "cgroup.procs" + HugePageSizes, _ = cgroups.GetHugePageSize() +) + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +type subsystemSet []subsystem + +func (s subsystemSet) Get(name string) (subsystem, error) { + for _, ss := range s { + if ss.Name() == name { + return ss, nil + } + } + return nil, errSubsystemDoesNotExist +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error + // Removes the cgroup represented by 'cgroupData'. + Remove(*cgroupData) error + // Creates and joins the cgroup represented by 'cgroupData'. + Apply(*cgroupData) error + // Set the cgroup represented by cgroup. + Set(path string, cgroup *configs.Cgroup) error +} + +type Manager struct { + mu sync.Mutex + Cgroups *configs.Cgroup + Paths map[string]string +} + +// The absolute path to the root of the cgroup hierarchies. +var cgroupRootLock sync.Mutex +var cgroupRoot string + +// Gets the cgroupRoot. +func getCgroupRoot() (string, error) { + cgroupRootLock.Lock() + defer cgroupRootLock.Unlock() + + if cgroupRoot != "" { + return cgroupRoot, nil + } + + root, err := cgroups.FindCgroupMountpointDir() + if err != nil { + return "", err + } + + if _, err := os.Stat(root); err != nil { + return "", err + } + + cgroupRoot = root + return cgroupRoot, nil +} + +type cgroupData struct { + root string + innerPath string + config *configs.Cgroup + pid int +} + +func (m *Manager) Apply(pid int) (err error) { + if m.Cgroups == nil { + return nil + } + + var c = m.Cgroups + + d, err := getCgroupData(m.Cgroups, pid) + if err != nil { + return err + } + + if c.Paths != nil { + paths := make(map[string]string) + for name, path := range c.Paths { + _, err := d.path(name) + if err != nil { + if cgroups.IsNotFound(err) { + continue + } + return err + } + paths[name] = path + } + m.Paths = paths + return cgroups.EnterPid(m.Paths, pid) + } + + m.mu.Lock() + defer m.mu.Unlock() + paths := make(map[string]string) + for _, sys := range subsystems { + if err := sys.Apply(d); err != nil { + return err + } + // TODO: Apply should, ideally, be reentrant or be broken up into a separate + // create and join phase so that the cgroup hierarchy for a container can be + // created then join consists of writing the process pids to cgroup.procs + p, err := d.path(sys.Name()) + if err != nil { + // The non-presence of the devices subsystem is + // considered fatal for security reasons. + if cgroups.IsNotFound(err) && sys.Name() != "devices" { + continue + } + return err + } + paths[sys.Name()] = p + } + m.Paths = paths + return nil +} + +func (m *Manager) Destroy() error { + if m.Cgroups.Paths != nil { + return nil + } + m.mu.Lock() + defer m.mu.Unlock() + if err := cgroups.RemovePaths(m.Paths); err != nil { + return err + } + m.Paths = make(map[string]string) + return nil +} + +func (m *Manager) GetPaths() map[string]string { + m.mu.Lock() + paths := m.Paths + m.mu.Unlock() + return paths +} + +func (m *Manager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for name, path := range m.Paths { + sys, err := subsystems.Get(name) + if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + return stats, nil +} + +func (m *Manager) Set(container *configs.Config) error { + for _, sys := range subsystems { + // Generate fake cgroup data. + d, err := getCgroupData(container.Cgroups, -1) + if err != nil { + return err + } + // Get the path, but don't error out if the cgroup wasn't found. + path, err := d.path(sys.Name()) + if err != nil && !cgroups.IsNotFound(err) { + return err + } + + if err := sys.Set(path, container.Cgroups); err != nil { + return err + } + } + + if m.Paths["cpu"] != "" { + if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { + return err + } + } + return nil +} + +// Freeze toggles the container's freezer cgroup depending on the state +// provided +func (m *Manager) Freeze(state configs.FreezerState) error { + d, err := getCgroupData(m.Cgroups, 0) + if err != nil { + return err + } + dir, err := d.path("freezer") + if err != nil { + return err + } + prevState := m.Cgroups.Resources.Freezer + m.Cgroups.Resources.Freezer = state + freezer, err := subsystems.Get("freezer") + if err != nil { + return err + } + err = freezer.Set(dir, m.Cgroups) + if err != nil { + m.Cgroups.Resources.Freezer = prevState + return err + } + return nil +} + +func (m *Manager) GetPids() ([]int, error) { + dir, err := getCgroupPath(m.Cgroups) + if err != nil { + return nil, err + } + return cgroups.GetPids(dir) +} + +func (m *Manager) GetAllPids() ([]int, error) { + dir, err := getCgroupPath(m.Cgroups) + if err != nil { + return nil, err + } + return cgroups.GetAllPids(dir) +} + +func getCgroupPath(c *configs.Cgroup) (string, error) { + d, err := getCgroupData(c, 0) + if err != nil { + return "", err + } + + return d.path("devices") +} + +func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { + root, err := getCgroupRoot() + if err != nil { + return nil, err + } + + if (c.Name != "" || c.Parent != "") && c.Path != "" { + return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used") + } + + // XXX: Do not remove this code. Path safety is important! -- cyphar + cgPath := libcontainerUtils.CleanPath(c.Path) + cgParent := libcontainerUtils.CleanPath(c.Parent) + cgName := libcontainerUtils.CleanPath(c.Name) + + innerPath := cgPath + if innerPath == "" { + innerPath = filepath.Join(cgParent, cgName) + } + + return &cgroupData{ + root: root, + innerPath: innerPath, + config: c, + pid: pid, + }, nil +} + +func (raw *cgroupData) parentPath(subsystem, mountpoint, root string) (string, error) { + // Use GetThisCgroupDir instead of GetInitCgroupDir, because the creating + // process could in container and shared pid namespace with host, and + // /proc/1/cgroup could point to whole other world of cgroups. + initPath, err := cgroups.GetThisCgroupDir(subsystem) + if err != nil { + return "", err + } + // This is needed for nested containers, because in /proc/self/cgroup we + // see pathes from host, which don't exist in container. + relDir, err := filepath.Rel(root, initPath) + if err != nil { + return "", err + } + return filepath.Join(mountpoint, relDir), nil +} + +func (raw *cgroupData) path(subsystem string) (string, error) { + mnt, root, err := cgroups.FindCgroupMountpointAndRoot(subsystem) + // If we didn't mount the subsystem, there is no point we make the path. + if err != nil { + return "", err + } + + // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. + if filepath.IsAbs(raw.innerPath) { + // Sometimes subsystems can be mounted togethger as 'cpu,cpuacct'. + return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil + } + + parentPath, err := raw.parentPath(subsystem, mnt, root) + if err != nil { + return "", err + } + + return filepath.Join(parentPath, raw.innerPath), nil +} + +func (raw *cgroupData) join(subsystem string) (string, error) { + path, err := raw.path(subsystem) + if err != nil { + return "", err + } + if err := os.MkdirAll(path, 0755); err != nil { + return "", err + } + if err := writeFile(path, CgroupProcesses, strconv.Itoa(raw.pid)); err != nil { + return "", err + } + return path, nil +} + +func writeFile(dir, file, data string) error { + // Normally dir should not be empty, one case is that cgroup subsystem + // is not mounted, we will get empty dir, and we want it fail here. + if dir == "" { + return fmt.Errorf("no such directory for %s", file) + } + if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", data, file, err) + } + return nil +} + +func readFile(dir, file string) (string, error) { + data, err := ioutil.ReadFile(filepath.Join(dir, file)) + return string(data), err +} + +func removePath(p string, err error) error { + if err != nil { + return err + } + if p != "" { + return os.RemoveAll(p) + } + return nil +} + +func CheckCpushares(path string, c int64) error { + var cpuShares int64 + + if c == 0 { + return nil + } + + fd, err := os.Open(filepath.Join(path, "cpu.shares")) + if err != nil { + return err + } + defer fd.Close() + + _, err = fmt.Fscanf(fd, "%d", &cpuShares) + if err != nil && err != io.EOF { + return err + } + + if c > cpuShares { + return fmt.Errorf("The maximum allowed cpu-shares is %d", cpuShares) + } else if c < cpuShares { + return fmt.Errorf("The minimum allowed cpu-shares is %d", cpuShares) + } + + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go new file mode 100644 index 00000000..a142cb99 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go @@ -0,0 +1,237 @@ +// +build linux + +package fs + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type BlkioGroup struct { +} + +func (s *BlkioGroup) Name() string { + return "blkio" +} + +func (s *BlkioGroup) Apply(d *cgroupData) error { + _, err := d.join("blkio") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.BlkioWeight != 0 { + if err := writeFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil { + return err + } + } + + if cgroup.Resources.BlkioLeafWeight != 0 { + if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil { + return err + } + } + for _, wd := range cgroup.Resources.BlkioWeightDevice { + if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil { + return err + } + if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice { + if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice { + if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice { + if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice { + if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { + return err + } + } + + return nil +} + +func (s *BlkioGroup) Remove(d *cgroupData) error { + return removePath(d.path("blkio")) +} + +/* +examples: + + blkio.sectors + 8:0 6792 + + blkio.io_service_bytes + 8:0 Read 1282048 + 8:0 Write 2195456 + 8:0 Sync 2195456 + 8:0 Async 1282048 + 8:0 Total 3477504 + Total 3477504 + + blkio.io_serviced + 8:0 Read 124 + 8:0 Write 104 + 8:0 Sync 104 + 8:0 Async 124 + 8:0 Total 228 + Total 228 + + blkio.io_queued + 8:0 Read 0 + 8:0 Write 0 + 8:0 Sync 0 + 8:0 Async 0 + 8:0 Total 0 + Total 0 +*/ + +func splitBlkioStatLine(r rune) bool { + return r == ' ' || r == ':' +} + +func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) { + var blkioStats []cgroups.BlkioStatEntry + f, err := os.Open(path) + if err != nil { + if os.IsNotExist(err) { + return blkioStats, nil + } + return nil, err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + // format: dev type amount + fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine) + if len(fields) < 3 { + if len(fields) == 2 && fields[0] == "Total" { + // skip total line + continue + } else { + return nil, fmt.Errorf("Invalid line found while parsing %s: %s", path, sc.Text()) + } + } + + v, err := strconv.ParseUint(fields[0], 10, 64) + if err != nil { + return nil, err + } + major := v + + v, err = strconv.ParseUint(fields[1], 10, 64) + if err != nil { + return nil, err + } + minor := v + + op := "" + valueField := 2 + if len(fields) == 4 { + op = fields[2] + valueField = 3 + } + v, err = strconv.ParseUint(fields[valueField], 10, 64) + if err != nil { + return nil, err + } + blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v}) + } + + return blkioStats, nil +} + +func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error { + // Try to read CFQ stats available on all CFQ enabled kernels first + if blkioStats, err := getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err == nil && blkioStats != nil { + return getCFQStats(path, stats) + } + return getStats(path, stats) // Use generic stats as fallback +} + +func getCFQStats(path string, stats *cgroups.Stats) error { + var blkioStats []cgroups.BlkioStatEntry + var err error + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.sectors_recursive")); err != nil { + return err + } + stats.BlkioStats.SectorsRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_bytes_recursive")); err != nil { + return err + } + stats.BlkioStats.IoServiceBytesRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err != nil { + return err + } + stats.BlkioStats.IoServicedRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_queued_recursive")); err != nil { + return err + } + stats.BlkioStats.IoQueuedRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_time_recursive")); err != nil { + return err + } + stats.BlkioStats.IoServiceTimeRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_wait_time_recursive")); err != nil { + return err + } + stats.BlkioStats.IoWaitTimeRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_merged_recursive")); err != nil { + return err + } + stats.BlkioStats.IoMergedRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.time_recursive")); err != nil { + return err + } + stats.BlkioStats.IoTimeRecursive = blkioStats + + return nil +} + +func getStats(path string, stats *cgroups.Stats) error { + var blkioStats []cgroups.BlkioStatEntry + var err error + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_service_bytes")); err != nil { + return err + } + stats.BlkioStats.IoServiceBytesRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_serviced")); err != nil { + return err + } + stats.BlkioStats.IoServicedRecursive = blkioStats + + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go new file mode 100644 index 00000000..a4ef28a6 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go @@ -0,0 +1,94 @@ +// +build linux + +package fs + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type CpuGroup struct { +} + +func (s *CpuGroup) Name() string { + return "cpu" +} + +func (s *CpuGroup) Apply(d *cgroupData) error { + // We always want to join the cpu group, to allow fair cpu scheduling + // on a container basis + _, err := d.join("cpu") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.CpuShares != 0 { + if err := writeFile(path, "cpu.shares", strconv.FormatInt(cgroup.Resources.CpuShares, 10)); err != nil { + return err + } + } + if cgroup.Resources.CpuPeriod != 0 { + if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(cgroup.Resources.CpuPeriod, 10)); err != nil { + return err + } + } + if cgroup.Resources.CpuQuota != 0 { + if err := writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.Resources.CpuQuota, 10)); err != nil { + return err + } + } + if cgroup.Resources.CpuRtPeriod != 0 { + if err := writeFile(path, "cpu.rt_period_us", strconv.FormatInt(cgroup.Resources.CpuRtPeriod, 10)); err != nil { + return err + } + } + if cgroup.Resources.CpuRtRuntime != 0 { + if err := writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil { + return err + } + } + + return nil +} + +func (s *CpuGroup) Remove(d *cgroupData) error { + return removePath(d.path("cpu")) +} + +func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error { + f, err := os.Open(filepath.Join(path, "cpu.stat")) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := getCgroupParamKeyValue(sc.Text()) + if err != nil { + return err + } + switch t { + case "nr_periods": + stats.CpuStats.ThrottlingData.Periods = v + + case "nr_throttled": + stats.CpuStats.ThrottlingData.ThrottledPeriods = v + + case "throttled_time": + stats.CpuStats.ThrottlingData.ThrottledTime = v + } + } + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go new file mode 100644 index 00000000..53afbadd --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go @@ -0,0 +1,121 @@ +// +build linux + +package fs + +import ( + "fmt" + "io/ioutil" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" +) + +const ( + cgroupCpuacctStat = "cpuacct.stat" + nanosecondsInSecond = 1000000000 +) + +var clockTicks = uint64(system.GetClockTicks()) + +type CpuacctGroup struct { +} + +func (s *CpuacctGroup) Name() string { + return "cpuacct" +} + +func (s *CpuacctGroup) Apply(d *cgroupData) error { + // we just want to join this group even though we don't set anything + if _, err := d.join("cpuacct"); err != nil && !cgroups.IsNotFound(err) { + return err + } + + return nil +} + +func (s *CpuacctGroup) Set(path string, cgroup *configs.Cgroup) error { + return nil +} + +func (s *CpuacctGroup) Remove(d *cgroupData) error { + return removePath(d.path("cpuacct")) +} + +func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error { + userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path) + if err != nil { + return err + } + + totalUsage, err := getCgroupParamUint(path, "cpuacct.usage") + if err != nil { + return err + } + + percpuUsage, err := getPercpuUsage(path) + if err != nil { + return err + } + + stats.CpuStats.CpuUsage.TotalUsage = totalUsage + stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage + stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage + stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage + return nil +} + +// Returns user and kernel usage breakdown in nanoseconds. +func getCpuUsageBreakdown(path string) (uint64, uint64, error) { + userModeUsage := uint64(0) + kernelModeUsage := uint64(0) + const ( + userField = "user" + systemField = "system" + ) + + // Expected format: + // user + // system + data, err := ioutil.ReadFile(filepath.Join(path, cgroupCpuacctStat)) + if err != nil { + return 0, 0, err + } + fields := strings.Fields(string(data)) + if len(fields) != 4 { + return 0, 0, fmt.Errorf("failure - %s is expected to have 4 fields", filepath.Join(path, cgroupCpuacctStat)) + } + if fields[0] != userField { + return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[0], cgroupCpuacctStat, userField) + } + if fields[2] != systemField { + return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[2], cgroupCpuacctStat, systemField) + } + if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil { + return 0, 0, err + } + if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil { + return 0, 0, err + } + + return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil +} + +func getPercpuUsage(path string) ([]uint64, error) { + percpuUsage := []uint64{} + data, err := ioutil.ReadFile(filepath.Join(path, "cpuacct.usage_percpu")) + if err != nil { + return percpuUsage, err + } + for _, value := range strings.Fields(string(data)) { + value, err := strconv.ParseUint(value, 10, 64) + if err != nil { + return percpuUsage, fmt.Errorf("Unable to convert param value to uint64: %s", err) + } + percpuUsage = append(percpuUsage, value) + } + return percpuUsage, nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go new file mode 100644 index 00000000..cbe62bd9 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go @@ -0,0 +1,139 @@ +// +build linux + +package fs + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" +) + +type CpusetGroup struct { +} + +func (s *CpusetGroup) Name() string { + return "cpuset" +} + +func (s *CpusetGroup) Apply(d *cgroupData) error { + dir, err := d.path("cpuset") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return s.ApplyDir(dir, d.config, d.pid) +} + +func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.CpusetCpus != "" { + if err := writeFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil { + return err + } + } + if cgroup.Resources.CpusetMems != "" { + if err := writeFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil { + return err + } + } + return nil +} + +func (s *CpusetGroup) Remove(d *cgroupData) error { + return removePath(d.path("cpuset")) +} + +func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error { + // This might happen if we have no cpuset cgroup mounted. + // Just do nothing and don't fail. + if dir == "" { + return nil + } + root, err := getCgroupRoot() + if err != nil { + return err + } + if err := s.ensureParent(dir, root); err != nil { + return err + } + // because we are not using d.join we need to place the pid into the procs file + // unlike the other subsystems + if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil { + return err + } + + return nil +} + +func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) { + if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus")); err != nil { + return + } + if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems")); err != nil { + return + } + return cpus, mems, nil +} + +// ensureParent makes sure that the parent directory of current is created +// and populated with the proper cpus and mems files copied from +// it's parent. +func (s *CpusetGroup) ensureParent(current, root string) error { + parent := filepath.Dir(current) + if libcontainerUtils.CleanPath(parent) == root { + return nil + } + // Avoid infinite recursion. + if parent == current { + return fmt.Errorf("cpuset: cgroup parent path outside cgroup root") + } + if err := s.ensureParent(parent, root); err != nil { + return err + } + if err := os.MkdirAll(current, 0755); err != nil { + return err + } + return s.copyIfNeeded(current, parent) +} + +// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func (s *CpusetGroup) copyIfNeeded(current, parent string) error { + var ( + err error + currentCpus, currentMems []byte + parentCpus, parentMems []byte + ) + + if currentCpus, currentMems, err = s.getSubsystemSettings(current); err != nil { + return err + } + if parentCpus, parentMems, err = s.getSubsystemSettings(parent); err != nil { + return err + } + + if s.isEmpty(currentCpus) { + if err := writeFile(current, "cpuset.cpus", string(parentCpus)); err != nil { + return err + } + } + if s.isEmpty(currentMems) { + if err := writeFile(current, "cpuset.mems", string(parentMems)); err != nil { + return err + } + } + return nil +} + +func (s *CpusetGroup) isEmpty(b []byte) bool { + return len(bytes.Trim(b, "\n")) == 0 +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go new file mode 100644 index 00000000..5f783310 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go @@ -0,0 +1,78 @@ +// +build linux + +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" +) + +type DevicesGroup struct { +} + +func (s *DevicesGroup) Name() string { + return "devices" +} + +func (s *DevicesGroup) Apply(d *cgroupData) error { + _, err := d.join("devices") + if err != nil { + // We will return error even it's `not found` error, devices + // cgroup is hard requirement for container's security. + return err + } + return nil +} + +func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { + if system.RunningInUserNS() { + return nil + } + + devices := cgroup.Resources.Devices + if len(devices) > 0 { + for _, dev := range devices { + file := "devices.deny" + if dev.Allow { + file = "devices.allow" + } + if err := writeFile(path, file, dev.CgroupString()); err != nil { + return err + } + } + return nil + } + if !cgroup.Resources.AllowAllDevices { + if err := writeFile(path, "devices.deny", "a"); err != nil { + return err + } + + for _, dev := range cgroup.Resources.AllowedDevices { + if err := writeFile(path, "devices.allow", dev.CgroupString()); err != nil { + return err + } + } + return nil + } + + if err := writeFile(path, "devices.allow", "a"); err != nil { + return err + } + + for _, dev := range cgroup.Resources.DeniedDevices { + if err := writeFile(path, "devices.deny", dev.CgroupString()); err != nil { + return err + } + } + + return nil +} + +func (s *DevicesGroup) Remove(d *cgroupData) error { + return removePath(d.path("devices")) +} + +func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go new file mode 100644 index 00000000..e70dfe3b --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go @@ -0,0 +1,61 @@ +// +build linux + +package fs + +import ( + "fmt" + "strings" + "time" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type FreezerGroup struct { +} + +func (s *FreezerGroup) Name() string { + return "freezer" +} + +func (s *FreezerGroup) Apply(d *cgroupData) error { + _, err := d.join("freezer") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error { + switch cgroup.Resources.Freezer { + case configs.Frozen, configs.Thawed: + if err := writeFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil { + return err + } + + for { + state, err := readFile(path, "freezer.state") + if err != nil { + return err + } + if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) { + break + } + time.Sleep(1 * time.Millisecond) + } + case configs.Undefined: + return nil + default: + return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer)) + } + + return nil +} + +func (s *FreezerGroup) Remove(d *cgroupData) error { + return removePath(d.path("freezer")) +} + +func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs_unsupported.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs_unsupported.go new file mode 100644 index 00000000..3ef9e031 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs_unsupported.go @@ -0,0 +1,3 @@ +// +build !linux + +package fs diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go new file mode 100644 index 00000000..2f972771 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go @@ -0,0 +1,71 @@ +// +build linux + +package fs + +import ( + "fmt" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type HugetlbGroup struct { +} + +func (s *HugetlbGroup) Name() string { + return "hugetlb" +} + +func (s *HugetlbGroup) Apply(d *cgroupData) error { + _, err := d.join("hugetlb") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error { + for _, hugetlb := range cgroup.Resources.HugetlbLimit { + if err := writeFile(path, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "limit_in_bytes"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil { + return err + } + } + + return nil +} + +func (s *HugetlbGroup) Remove(d *cgroupData) error { + return removePath(d.path("hugetlb")) +} + +func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error { + hugetlbStats := cgroups.HugetlbStats{} + for _, pageSize := range HugePageSizes { + usage := strings.Join([]string{"hugetlb", pageSize, "usage_in_bytes"}, ".") + value, err := getCgroupParamUint(path, usage) + if err != nil { + return fmt.Errorf("failed to parse %s - %v", usage, err) + } + hugetlbStats.Usage = value + + maxUsage := strings.Join([]string{"hugetlb", pageSize, "max_usage_in_bytes"}, ".") + value, err = getCgroupParamUint(path, maxUsage) + if err != nil { + return fmt.Errorf("failed to parse %s - %v", maxUsage, err) + } + hugetlbStats.MaxUsage = value + + failcnt := strings.Join([]string{"hugetlb", pageSize, "failcnt"}, ".") + value, err = getCgroupParamUint(path, failcnt) + if err != nil { + return fmt.Errorf("failed to parse %s - %v", failcnt, err) + } + hugetlbStats.Failcnt = value + + stats.HugetlbStats[pageSize] = hugetlbStats + } + + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go new file mode 100644 index 00000000..01c25eff --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go @@ -0,0 +1,281 @@ +// +build linux + +package fs + +import ( + "bufio" + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" +) + +type MemoryGroup struct { +} + +func (s *MemoryGroup) Name() string { + return "memory" +} + +func (s *MemoryGroup) Apply(d *cgroupData) (err error) { + path, err := d.path("memory") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + if memoryAssigned(d.config) { + if path != "" { + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + } + // We have to set kernel memory here, as we can't change it once + // processes have been attached to the cgroup. + if err := s.SetKernelMemory(path, d.config); err != nil { + return err + } + } + + defer func() { + if err != nil { + os.RemoveAll(path) + } + }() + + // We need to join memory cgroup after set memory limits, because + // kmem.limit_in_bytes can only be set when the cgroup is empty. + _, err = d.join("memory") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error { + // This has to be done separately because it has special + // constraints (it can only be initialized before setting up a + // hierarchy or adding a task to the cgroups. However, if + // sucessfully initialized, it can be updated anytime afterwards) + if cgroup.Resources.KernelMemory != 0 { + kmemInitialized := false + // Is kmem.limit_in_bytes already set? + kmemValue, err := getCgroupParamUint(path, "memory.kmem.limit_in_bytes") + if err != nil { + return err + } + switch system.GetLongBit() { + case 32: + kmemInitialized = uint32(kmemValue) != uint32(math.MaxUint32) + case 64: + kmemInitialized = kmemValue != uint64(math.MaxUint64) + } + + if !kmemInitialized { + // If there's already tasks in the cgroup, we can't change the limit either + tasks, err := getCgroupParamString(path, "tasks") + if err != nil { + return err + } + if tasks != "" { + return fmt.Errorf("cannot set kmem.limit_in_bytes after task have joined this cgroup") + } + } + + if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil { + return err + } + } + return nil +} + +func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { + // When memory and swap memory are both set, we need to handle the cases + // for updating container. + if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap > 0 { + memoryUsage, err := getMemoryData(path, "") + if err != nil { + return err + } + + // When update memory limit, we should adapt the write sequence + // for memory and swap memory, so it won't fail because the new + // value and the old value don't fit kernel's validation. + if memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) { + if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + } else { + if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + } + } else { + if cgroup.Resources.Memory != 0 { + if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + } + if cgroup.Resources.MemorySwap > 0 { + if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + } + } + + return nil +} + +func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { + if err := setMemoryAndSwap(path, cgroup); err != nil { + return err + } + + if err := s.SetKernelMemory(path, cgroup); err != nil { + return err + } + + if cgroup.Resources.MemoryReservation != 0 { + if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { + return err + } + } + if cgroup.Resources.KernelMemoryTCP != 0 { + if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil { + return err + } + } + if cgroup.Resources.OomKillDisable { + if err := writeFile(path, "memory.oom_control", "1"); err != nil { + return err + } + } + if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 { + return nil + } else if int64(*cgroup.Resources.MemorySwappiness) >= 0 && int64(*cgroup.Resources.MemorySwappiness) <= 100 { + if err := writeFile(path, "memory.swappiness", strconv.FormatInt(*cgroup.Resources.MemorySwappiness, 10)); err != nil { + return err + } + } else { + return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", int64(*cgroup.Resources.MemorySwappiness)) + } + + return nil +} + +func (s *MemoryGroup) Remove(d *cgroupData) error { + return removePath(d.path("memory")) +} + +func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { + // Set stats from memory.stat. + statsFile, err := os.Open(filepath.Join(path, "memory.stat")) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := getCgroupParamKeyValue(sc.Text()) + if err != nil { + return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err) + } + stats.MemoryStats.Stats[t] = v + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] + + memoryUsage, err := getMemoryData(path, "") + if err != nil { + return err + } + stats.MemoryStats.Usage = memoryUsage + swapUsage, err := getMemoryData(path, "memsw") + if err != nil { + return err + } + stats.MemoryStats.SwapUsage = swapUsage + kernelUsage, err := getMemoryData(path, "kmem") + if err != nil { + return err + } + stats.MemoryStats.KernelUsage = kernelUsage + kernelTCPUsage, err := getMemoryData(path, "kmem.tcp") + if err != nil { + return err + } + stats.MemoryStats.KernelTCPUsage = kernelTCPUsage + + return nil +} + +func memoryAssigned(cgroup *configs.Cgroup) bool { + return cgroup.Resources.Memory != 0 || + cgroup.Resources.MemoryReservation != 0 || + cgroup.Resources.MemorySwap > 0 || + cgroup.Resources.KernelMemory > 0 || + cgroup.Resources.KernelMemoryTCP > 0 || + cgroup.Resources.OomKillDisable || + (cgroup.Resources.MemorySwappiness != nil && *cgroup.Resources.MemorySwappiness != -1) +} + +func getMemoryData(path, name string) (cgroups.MemoryData, error) { + memoryData := cgroups.MemoryData{} + + moduleName := "memory" + if name != "" { + moduleName = strings.Join([]string{"memory", name}, ".") + } + usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".") + maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".") + failcnt := strings.Join([]string{moduleName, "failcnt"}, ".") + limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".") + + value, err := getCgroupParamUint(path, usage) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err) + } + memoryData.Usage = value + value, err = getCgroupParamUint(path, maxUsage) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", maxUsage, err) + } + memoryData.MaxUsage = value + value, err = getCgroupParamUint(path, failcnt) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err) + } + memoryData.Failcnt = value + value, err = getCgroupParamUint(path, limit) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err) + } + memoryData.Limit = value + + return memoryData, nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go new file mode 100644 index 00000000..d8cf1d87 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go @@ -0,0 +1,40 @@ +// +build linux + +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NameGroup struct { + GroupName string + Join bool +} + +func (s *NameGroup) Name() string { + return s.GroupName +} + +func (s *NameGroup) Apply(d *cgroupData) error { + if s.Join { + // ignore errors if the named cgroup does not exist + d.join(s.GroupName) + } + return nil +} + +func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error { + return nil +} + +func (s *NameGroup) Remove(d *cgroupData) error { + if s.Join { + removePath(d.path(s.GroupName)) + } + return nil +} + +func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go new file mode 100644 index 00000000..8a4054ba --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go @@ -0,0 +1,41 @@ +// +build linux + +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NetClsGroup struct { +} + +func (s *NetClsGroup) Name() string { + return "net_cls" +} + +func (s *NetClsGroup) Apply(d *cgroupData) error { + _, err := d.join("net_cls") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.NetClsClassid != "" { + if err := writeFile(path, "net_cls.classid", cgroup.Resources.NetClsClassid); err != nil { + return err + } + } + + return nil +} + +func (s *NetClsGroup) Remove(d *cgroupData) error { + return removePath(d.path("net_cls")) +} + +func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go new file mode 100644 index 00000000..d0ab2af8 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go @@ -0,0 +1,41 @@ +// +build linux + +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NetPrioGroup struct { +} + +func (s *NetPrioGroup) Name() string { + return "net_prio" +} + +func (s *NetPrioGroup) Apply(d *cgroupData) error { + _, err := d.join("net_prio") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error { + for _, prioMap := range cgroup.Resources.NetPrioIfpriomap { + if err := writeFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil { + return err + } + } + + return nil +} + +func (s *NetPrioGroup) Remove(d *cgroupData) error { + return removePath(d.path("net_prio")) +} + +func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go new file mode 100644 index 00000000..5693676d --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go @@ -0,0 +1,35 @@ +// +build linux + +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PerfEventGroup struct { +} + +func (s *PerfEventGroup) Name() string { + return "perf_event" +} + +func (s *PerfEventGroup) Apply(d *cgroupData) error { + // we just want to join this group even though we don't set anything + if _, err := d.join("perf_event"); err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *PerfEventGroup) Set(path string, cgroup *configs.Cgroup) error { + return nil +} + +func (s *PerfEventGroup) Remove(d *cgroupData) error { + return removePath(d.path("perf_event")) +} + +func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go new file mode 100644 index 00000000..f1e37205 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go @@ -0,0 +1,73 @@ +// +build linux + +package fs + +import ( + "fmt" + "path/filepath" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PidsGroup struct { +} + +func (s *PidsGroup) Name() string { + return "pids" +} + +func (s *PidsGroup) Apply(d *cgroupData) error { + _, err := d.join("pids") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.PidsLimit != 0 { + // "max" is the fallback value. + limit := "max" + + if cgroup.Resources.PidsLimit > 0 { + limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10) + } + + if err := writeFile(path, "pids.max", limit); err != nil { + return err + } + } + + return nil +} + +func (s *PidsGroup) Remove(d *cgroupData) error { + return removePath(d.path("pids")) +} + +func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { + current, err := getCgroupParamUint(path, "pids.current") + if err != nil { + return fmt.Errorf("failed to parse pids.current - %s", err) + } + + maxString, err := getCgroupParamString(path, "pids.max") + if err != nil { + return fmt.Errorf("failed to parse pids.max - %s", err) + } + + // Default if pids.max == "max" is 0 -- which represents "no limit". + var max uint64 + if maxString != "max" { + max, err = parseUint(maxString, 10, 64) + if err != nil { + return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max")) + } + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go new file mode 100644 index 00000000..5ff0a161 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go @@ -0,0 +1,78 @@ +// +build linux + +package fs + +import ( + "errors" + "fmt" + "io/ioutil" + "path/filepath" + "strconv" + "strings" +) + +var ( + ErrNotValidFormat = errors.New("line is not a valid key value format") +) + +// Saturates negative values at zero and returns a uint64. +// Due to kernel bugs, some of the memory cgroup stats can be negative. +func parseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// Parses a cgroup param and returns as name, value +// i.e. "io_service_bytes 1234" will return as io_service_bytes, 1234 +func getCgroupParamKeyValue(t string) (string, uint64, error) { + parts := strings.Fields(t) + switch len(parts) { + case 2: + value, err := parseUint(parts[1], 10, 64) + if err != nil { + return "", 0, fmt.Errorf("unable to convert param value (%q) to uint64: %v", parts[1], err) + } + + return parts[0], value, nil + default: + return "", 0, ErrNotValidFormat + } +} + +// Gets a single uint64 value from the specified cgroup file. +func getCgroupParamUint(cgroupPath, cgroupFile string) (uint64, error) { + fileName := filepath.Join(cgroupPath, cgroupFile) + contents, err := ioutil.ReadFile(fileName) + if err != nil { + return 0, err + } + + res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64) + if err != nil { + return res, fmt.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), fileName) + } + return res, nil +} + +// Gets a string value from the specified cgroup file +func getCgroupParamString(cgroupPath, cgroupFile string) (string, error) { + contents, err := ioutil.ReadFile(filepath.Join(cgroupPath, cgroupFile)) + if err != nil { + return "", err + } + + return strings.TrimSpace(string(contents)), nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/stats.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/stats.go new file mode 100644 index 00000000..b483f1bf --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/stats.go @@ -0,0 +1,106 @@ +// +build linux + +package cgroups + +type ThrottlingData struct { + // Number of periods with throttling active + Periods uint64 `json:"periods,omitempty"` + // Number of periods when the container hit its throttling limit. + ThrottledPeriods uint64 `json:"throttled_periods,omitempty"` + // Aggregate time the container was throttled for in nanoseconds. + ThrottledTime uint64 `json:"throttled_time,omitempty"` +} + +// CpuUsage denotes the usage of a CPU. +// All CPU stats are aggregate since container inception. +type CpuUsage struct { + // Total CPU time consumed. + // Units: nanoseconds. + TotalUsage uint64 `json:"total_usage,omitempty"` + // Total CPU time consumed per core. + // Units: nanoseconds. + PercpuUsage []uint64 `json:"percpu_usage,omitempty"` + // Time spent by tasks of the cgroup in kernel mode. + // Units: nanoseconds. + UsageInKernelmode uint64 `json:"usage_in_kernelmode"` + // Time spent by tasks of the cgroup in user mode. + // Units: nanoseconds. + UsageInUsermode uint64 `json:"usage_in_usermode"` +} + +type CpuStats struct { + CpuUsage CpuUsage `json:"cpu_usage,omitempty"` + ThrottlingData ThrottlingData `json:"throttling_data,omitempty"` +} + +type MemoryData struct { + Usage uint64 `json:"usage,omitempty"` + MaxUsage uint64 `json:"max_usage,omitempty"` + Failcnt uint64 `json:"failcnt"` + Limit uint64 `json:"limit"` +} + +type MemoryStats struct { + // memory used for cache + Cache uint64 `json:"cache,omitempty"` + // usage of memory + Usage MemoryData `json:"usage,omitempty"` + // usage of memory + swap + SwapUsage MemoryData `json:"swap_usage,omitempty"` + // usage of kernel memory + KernelUsage MemoryData `json:"kernel_usage,omitempty"` + // usage of kernel TCP memory + KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"` + + Stats map[string]uint64 `json:"stats,omitempty"` +} + +type PidsStats struct { + // number of pids in the cgroup + Current uint64 `json:"current,omitempty"` + // active pids hard limit + Limit uint64 `json:"limit,omitempty"` +} + +type BlkioStatEntry struct { + Major uint64 `json:"major,omitempty"` + Minor uint64 `json:"minor,omitempty"` + Op string `json:"op,omitempty"` + Value uint64 `json:"value,omitempty"` +} + +type BlkioStats struct { + // number of bytes tranferred to and from the block device + IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"` + IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recursive,omitempty"` + IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"` + IoServiceTimeRecursive []BlkioStatEntry `json:"io_service_time_recursive,omitempty"` + IoWaitTimeRecursive []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"` + IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"` + IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"` + SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"` +} + +type HugetlbStats struct { + // current res_counter usage for hugetlb + Usage uint64 `json:"usage,omitempty"` + // maximum usage ever recorded. + MaxUsage uint64 `json:"max_usage,omitempty"` + // number of times hugetlb usage allocation failure. + Failcnt uint64 `json:"failcnt"` +} + +type Stats struct { + CpuStats CpuStats `json:"cpu_stats,omitempty"` + MemoryStats MemoryStats `json:"memory_stats,omitempty"` + PidsStats PidsStats `json:"pids_stats,omitempty"` + BlkioStats BlkioStats `json:"blkio_stats,omitempty"` + // the map is in the format "size of hugepage: stats of the hugepage" + HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` +} + +func NewStats() *Stats { + memoryStats := MemoryStats{Stats: make(map[string]uint64)} + hugetlbStats := make(map[string]HugetlbStats) + return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats} +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go new file mode 100644 index 00000000..7de9ae60 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go @@ -0,0 +1,55 @@ +// +build !linux + +package systemd + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Manager struct { + Cgroups *configs.Cgroup + Paths map[string]string +} + +func UseSystemd() bool { + return false +} + +func (m *Manager) Apply(pid int) error { + return fmt.Errorf("Systemd not supported") +} + +func (m *Manager) GetPids() ([]int, error) { + return nil, fmt.Errorf("Systemd not supported") +} + +func (m *Manager) GetAllPids() ([]int, error) { + return nil, fmt.Errorf("Systemd not supported") +} + +func (m *Manager) Destroy() error { + return fmt.Errorf("Systemd not supported") +} + +func (m *Manager) GetPaths() map[string]string { + return nil +} + +func (m *Manager) GetStats() (*cgroups.Stats, error) { + return nil, fmt.Errorf("Systemd not supported") +} + +func (m *Manager) Set(container *configs.Config) error { + return nil, fmt.Errorf("Systemd not supported") +} + +func (m *Manager) Freeze(state configs.FreezerState) error { + return fmt.Errorf("Systemd not supported") +} + +func Freeze(c *configs.Cgroup, state configs.FreezerState) error { + return fmt.Errorf("Systemd not supported") +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go new file mode 100644 index 00000000..11e5ec04 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go @@ -0,0 +1,489 @@ +// +build linux + +package systemd + +import ( + "errors" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "time" + + systemdDbus "github.com/coreos/go-systemd/dbus" + systemdUtil "github.com/coreos/go-systemd/util" + "github.com/godbus/dbus" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Manager struct { + mu sync.Mutex + Cgroups *configs.Cgroup + Paths map[string]string +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error + // Set the cgroup represented by cgroup. + Set(path string, cgroup *configs.Cgroup) error +} + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +type subsystemSet []subsystem + +func (s subsystemSet) Get(name string) (subsystem, error) { + for _, ss := range s { + if ss.Name() == name { + return ss, nil + } + } + return nil, errSubsystemDoesNotExist +} + +var subsystems = subsystemSet{ + &fs.CpusetGroup{}, + &fs.DevicesGroup{}, + &fs.MemoryGroup{}, + &fs.CpuGroup{}, + &fs.CpuacctGroup{}, + &fs.PidsGroup{}, + &fs.BlkioGroup{}, + &fs.HugetlbGroup{}, + &fs.PerfEventGroup{}, + &fs.FreezerGroup{}, + &fs.NetPrioGroup{}, + &fs.NetClsGroup{}, + &fs.NameGroup{GroupName: "name=systemd"}, +} + +const ( + testScopeWait = 4 +) + +var ( + connLock sync.Mutex + theConn *systemdDbus.Conn + hasStartTransientUnit bool + hasTransientDefaultDependencies bool + hasDelegate bool +) + +func newProp(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} + +func UseSystemd() bool { + if !systemdUtil.IsRunningSystemd() { + return false + } + + connLock.Lock() + defer connLock.Unlock() + + if theConn == nil { + var err error + theConn, err = systemdDbus.New() + if err != nil { + return false + } + + // Assume we have StartTransientUnit + hasStartTransientUnit = true + + // But if we get UnknownMethod error we don't + if _, err := theConn.StartTransientUnit("test.scope", "invalid", nil, nil); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" { + hasStartTransientUnit = false + return hasStartTransientUnit + } + } + } + + // Ensure the scope name we use doesn't exist. Use the Pid to + // avoid collisions between multiple libcontainer users on a + // single host. + scope := fmt.Sprintf("libcontainer-%d-systemd-test-default-dependencies.scope", os.Getpid()) + testScopeExists := true + for i := 0; i <= testScopeWait; i++ { + if _, err := theConn.StopUnit(scope, "replace", nil); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") { + testScopeExists = false + break + } + } + } + time.Sleep(time.Millisecond) + } + + // Bail out if we can't kill this scope without testing for DefaultDependencies + if testScopeExists { + return hasStartTransientUnit + } + + // Assume StartTransientUnit on a scope allows DefaultDependencies + hasTransientDefaultDependencies = true + ddf := newProp("DefaultDependencies", false) + if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{ddf}, nil); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") { + hasTransientDefaultDependencies = false + } + } + } + + // Not critical because of the stop unit logic above. + theConn.StopUnit(scope, "replace", nil) + + // Assume StartTransientUnit on a scope allows Delegate + hasDelegate = true + dl := newProp("Delegate", true) + if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") { + hasDelegate = false + } + } + } + + // Not critical because of the stop unit logic above. + theConn.StopUnit(scope, "replace", nil) + } + return hasStartTransientUnit +} + +func (m *Manager) Apply(pid int) error { + var ( + c = m.Cgroups + unitName = getUnitName(c) + slice = "system.slice" + properties []systemdDbus.Property + ) + + if c.Paths != nil { + paths := make(map[string]string) + for name, path := range c.Paths { + _, err := getSubsystemPath(m.Cgroups, name) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return err + } + paths[name] = path + } + m.Paths = paths + return cgroups.EnterPid(m.Paths, pid) + } + + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, + systemdDbus.PropSlice(slice), + systemdDbus.PropDescription("docker container "+c.Name), + newProp("PIDs", []uint32{uint32(pid)}), + ) + + if hasDelegate { + // This is only supported on systemd versions 218 and above. + properties = append(properties, newProp("Delegate", true)) + } + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("BlockIOAccounting", true)) + + if hasTransientDefaultDependencies { + properties = append(properties, + newProp("DefaultDependencies", false)) + } + + if c.Resources.Memory != 0 { + properties = append(properties, + newProp("MemoryLimit", uint64(c.Resources.Memory))) + } + + if c.Resources.CpuShares != 0 { + properties = append(properties, + newProp("CPUShares", uint64(c.Resources.CpuShares))) + } + + if c.Resources.BlkioWeight != 0 { + properties = append(properties, + newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) + } + + // We have to set kernel memory here, as we can't change it once + // processes have been attached to the cgroup. + if c.Resources.KernelMemory != 0 { + if err := setKernelMemory(c); err != nil { + return err + } + } + + if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil { + return err + } + + if err := joinCgroups(c, pid); err != nil { + return err + } + + paths := make(map[string]string) + for _, s := range subsystems { + subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name()) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return err + } + paths[s.Name()] = subsystemPath + } + m.Paths = paths + return nil +} + +func (m *Manager) Destroy() error { + if m.Cgroups.Paths != nil { + return nil + } + m.mu.Lock() + defer m.mu.Unlock() + theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil) + if err := cgroups.RemovePaths(m.Paths); err != nil { + return err + } + m.Paths = make(map[string]string) + return nil +} + +func (m *Manager) GetPaths() map[string]string { + m.mu.Lock() + paths := m.Paths + m.mu.Unlock() + return paths +} + +func writeFile(dir, file, data string) error { + // Normally dir should not be empty, one case is that cgroup subsystem + // is not mounted, we will get empty dir, and we want it fail here. + if dir == "" { + return fmt.Errorf("no such directory for %s", file) + } + return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) +} + +func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { + path, err := getSubsystemPath(c, subsystem) + if err != nil { + return "", err + } + if err := os.MkdirAll(path, 0755); err != nil { + return "", err + } + if err := writeFile(path, "cgroup.procs", strconv.Itoa(pid)); err != nil { + return "", err + } + + return path, nil +} + +func joinCgroups(c *configs.Cgroup, pid int) error { + for _, sys := range subsystems { + name := sys.Name() + switch name { + case "name=systemd": + // let systemd handle this + break + case "cpuset": + path, err := getSubsystemPath(c, name) + if err != nil && !cgroups.IsNotFound(err) { + return err + } + s := &fs.CpusetGroup{} + if err := s.ApplyDir(path, c, pid); err != nil { + return err + } + break + default: + _, err := join(c, name, pid) + if err != nil { + // Even if it's `not found` error, we'll return err + // because devices cgroup is hard requirement for + // container security. + if name == "devices" { + return err + } + // For other subsystems, omit the `not found` error + // because they are optional. + if !cgroups.IsNotFound(err) { + return err + } + } + } + } + + return nil +} + +// systemd represents slice heirarchy using `-`, so we need to follow suit when +// generating the path of slice. Essentially, test-a-b.slice becomes +// test.slice/test-a.slice/test-a-b.slice. +func expandSlice(slice string) (string, error) { + suffix := ".slice" + // Name has to end with ".slice", but can't be just ".slice". + if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Path-separators are not allowed. + if strings.Contains(slice, "/") { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + var path, prefix string + sliceName := strings.TrimSuffix(slice, suffix) + for _, component := range strings.Split(sliceName, "-") { + // test--a.slice isn't permitted, nor is -test.slice. + if component == "" { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Append the component to the path and to the prefix. + path += prefix + component + suffix + "/" + prefix += component + "-" + } + + return path, nil +} + +func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { + mountpoint, err := cgroups.FindCgroupMountpoint(subsystem) + if err != nil { + return "", err + } + + initPath, err := cgroups.GetInitCgroupDir(subsystem) + if err != nil { + return "", err + } + + slice := "system.slice" + if c.Parent != "" { + slice = c.Parent + } + + slice, err = expandSlice(slice) + if err != nil { + return "", err + } + + return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil +} + +func (m *Manager) Freeze(state configs.FreezerState) error { + path, err := getSubsystemPath(m.Cgroups, "freezer") + if err != nil { + return err + } + prevState := m.Cgroups.Resources.Freezer + m.Cgroups.Resources.Freezer = state + freezer, err := subsystems.Get("freezer") + if err != nil { + return err + } + err = freezer.Set(path, m.Cgroups) + if err != nil { + m.Cgroups.Resources.Freezer = prevState + return err + } + return nil +} + +func (m *Manager) GetPids() ([]int, error) { + path, err := getSubsystemPath(m.Cgroups, "devices") + if err != nil { + return nil, err + } + return cgroups.GetPids(path) +} + +func (m *Manager) GetAllPids() ([]int, error) { + path, err := getSubsystemPath(m.Cgroups, "devices") + if err != nil { + return nil, err + } + return cgroups.GetAllPids(path) +} + +func (m *Manager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for name, path := range m.Paths { + sys, err := subsystems.Get(name) + if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + + return stats, nil +} + +func (m *Manager) Set(container *configs.Config) error { + for _, sys := range subsystems { + // Get the subsystem path, but don't error out for not found cgroups. + path, err := getSubsystemPath(container.Cgroups, sys.Name()) + if err != nil && !cgroups.IsNotFound(err) { + return err + } + + if err := sys.Set(path, container.Cgroups); err != nil { + return err + } + } + + if m.Paths["cpu"] != "" { + if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { + return err + } + } + return nil +} + +func getUnitName(c *configs.Cgroup) string { + return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name) +} + +func setKernelMemory(c *configs.Cgroup) error { + path, err := getSubsystemPath(c, "memory") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + + return os.MkdirAll(path, 0755) +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/utils.go new file mode 100644 index 00000000..1a7c4e1a --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -0,0 +1,413 @@ +// +build linux + +package cgroups + +import ( + "bufio" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/docker/go-units" +) + +const cgroupNamePrefix = "name=" + +// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt +func FindCgroupMountpoint(subsystem string) (string, error) { + // We are not using mount.GetMounts() because it's super-inefficient, + // parsing it directly sped up x10 times because of not using Sscanf. + // It was one of two major performance drawbacks in container start. + if !isSubsystemAvailable(subsystem) { + return "", NewNotFoundError(subsystem) + } + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + txt := scanner.Text() + fields := strings.Split(txt, " ") + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if opt == subsystem { + return fields[4], nil + } + } + } + if err := scanner.Err(); err != nil { + return "", err + } + + return "", NewNotFoundError(subsystem) +} + +func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) { + if !isSubsystemAvailable(subsystem) { + return "", "", NewNotFoundError(subsystem) + } + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", "", err + } + defer f.Close() + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + txt := scanner.Text() + fields := strings.Split(txt, " ") + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if opt == subsystem { + return fields[4], fields[3], nil + } + } + } + if err := scanner.Err(); err != nil { + return "", "", err + } + + return "", "", NewNotFoundError(subsystem) +} + +func isSubsystemAvailable(subsystem string) bool { + cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return false + } + _, avail := cgroups[subsystem] + return avail +} + +func FindCgroupMountpointDir() (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + text := scanner.Text() + fields := strings.Split(text, " ") + // Safe as mountinfo encodes mountpoints with spaces as \040. + index := strings.Index(text, " - ") + postSeparatorFields := strings.Fields(text[index+3:]) + numPostFields := len(postSeparatorFields) + + // This is an error as we can't detect if the mount is for "cgroup" + if numPostFields == 0 { + return "", fmt.Errorf("Found no fields post '-' in %q", text) + } + + if postSeparatorFields[0] == "cgroup" { + // Check that the mount is properly formated. + if numPostFields < 3 { + return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + } + + return filepath.Dir(fields[4]), nil + } + } + if err := scanner.Err(); err != nil { + return "", err + } + + return "", NewNotFoundError("cgroup") +} + +type Mount struct { + Mountpoint string + Root string + Subsystems []string +} + +func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) { + if len(m.Subsystems) == 0 { + return "", fmt.Errorf("no subsystem for mount") + } + + return getControllerPath(m.Subsystems[0], cgroups) +} + +func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) { + res := make([]Mount, 0, len(ss)) + scanner := bufio.NewScanner(mi) + numFound := 0 + for scanner.Scan() && numFound < len(ss) { + txt := scanner.Text() + sepIdx := strings.Index(txt, " - ") + if sepIdx == -1 { + return nil, fmt.Errorf("invalid mountinfo format") + } + if txt[sepIdx+3:sepIdx+9] != "cgroup" { + continue + } + fields := strings.Split(txt, " ") + m := Mount{ + Mountpoint: fields[4], + Root: fields[3], + } + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if !ss[opt] { + continue + } + if strings.HasPrefix(opt, cgroupNamePrefix) { + m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):]) + } else { + m.Subsystems = append(m.Subsystems, opt) + } + numFound++ + } + res = append(res, m) + } + if err := scanner.Err(); err != nil { + return nil, err + } + return res, nil +} + +func GetCgroupMounts() ([]Mount, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return nil, err + } + defer f.Close() + + all, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return nil, err + } + + allMap := make(map[string]bool) + for s := range all { + allMap[s] = true + } + return getCgroupMountsHelper(allMap, f) +} + +// GetAllSubsystems returns all the cgroup subsystems supported by the kernel +func GetAllSubsystems() ([]string, error) { + f, err := os.Open("/proc/cgroups") + if err != nil { + return nil, err + } + defer f.Close() + + subsystems := []string{} + + s := bufio.NewScanner(f) + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + text := s.Text() + if text[0] != '#' { + parts := strings.Fields(text) + if len(parts) >= 4 && parts[3] != "0" { + subsystems = append(subsystems, parts[0]) + } + } + } + return subsystems, nil +} + +// GetThisCgroupDir returns the relative path to the cgroup docker is running in. +func GetThisCgroupDir(subsystem string) (string, error) { + cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return "", err + } + + return getControllerPath(subsystem, cgroups) +} + +func GetInitCgroupDir(subsystem string) (string, error) { + + cgroups, err := ParseCgroupFile("/proc/1/cgroup") + if err != nil { + return "", err + } + + return getControllerPath(subsystem, cgroups) +} + +func readProcsFile(dir string) ([]int, error) { + f, err := os.Open(filepath.Join(dir, "cgroup.procs")) + if err != nil { + return nil, err + } + defer f.Close() + + var ( + s = bufio.NewScanner(f) + out = []int{} + ) + + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, pid) + } + } + return out, nil +} + +// ParseCgroupFile parses the given cgroup file, typically from +// /proc//cgroup, into a map of subgroups to cgroup names. +func ParseCgroupFile(path string) (map[string]string, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + return parseCgroupFromReader(f) +} + +// helper function for ParseCgroupFile to make testing easier +func parseCgroupFromReader(r io.Reader) (map[string]string, error) { + s := bufio.NewScanner(r) + cgroups := make(map[string]string) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + text := s.Text() + // from cgroups(7): + // /proc/[pid]/cgroup + // ... + // For each cgroup hierarchy ... there is one entry + // containing three colon-separated fields of the form: + // hierarchy-ID:subsystem-list:cgroup-path + parts := strings.SplitN(text, ":", 3) + if len(parts) < 3 { + return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text) + } + + for _, subs := range strings.Split(parts[1], ",") { + cgroups[subs] = parts[2] + } + } + return cgroups, nil +} + +func getControllerPath(subsystem string, cgroups map[string]string) (string, error) { + + if p, ok := cgroups[subsystem]; ok { + return p, nil + } + + if p, ok := cgroups[cgroupNamePrefix+subsystem]; ok { + return p, nil + } + + return "", NewNotFoundError(subsystem) +} + +func PathExists(path string) bool { + if _, err := os.Stat(path); err != nil { + return false + } + return true +} + +func EnterPid(cgroupPaths map[string]string, pid int) error { + for _, path := range cgroupPaths { + if PathExists(path) { + if err := ioutil.WriteFile(filepath.Join(path, "cgroup.procs"), + []byte(strconv.Itoa(pid)), 0700); err != nil { + return err + } + } + } + return nil +} + +// RemovePaths iterates over the provided paths removing them. +// We trying to remove all paths five times with increasing delay between tries. +// If after all there are not removed cgroups - appropriate error will be +// returned. +func RemovePaths(paths map[string]string) (err error) { + delay := 10 * time.Millisecond + for i := 0; i < 5; i++ { + if i != 0 { + time.Sleep(delay) + delay *= 2 + } + for s, p := range paths { + os.RemoveAll(p) + // TODO: here probably should be logging + _, err := os.Stat(p) + // We need this strange way of checking cgroups existence because + // RemoveAll almost always returns error, even on already removed + // cgroups + if os.IsNotExist(err) { + delete(paths, s) + } + } + if len(paths) == 0 { + return nil + } + } + return fmt.Errorf("Failed to remove paths: %v", paths) +} + +func GetHugePageSize() ([]string, error) { + var pageSizes []string + sizeList := []string{"B", "kB", "MB", "GB", "TB", "PB"} + files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages") + if err != nil { + return pageSizes, err + } + for _, st := range files { + nameArray := strings.Split(st.Name(), "-") + pageSize, err := units.RAMInBytes(nameArray[1]) + if err != nil { + return []string{}, err + } + sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, sizeList) + pageSizes = append(pageSizes, sizeString) + } + + return pageSizes, nil +} + +// GetPids returns all pids, that were added to cgroup at path. +func GetPids(path string) ([]int, error) { + return readProcsFile(path) +} + +// GetAllPids returns all pids, that were added to cgroup at path and to all its +// subcgroups. +func GetAllPids(path string) ([]int, error) { + var pids []int + // collect pids from all sub-cgroups + err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error { + dir, file := filepath.Split(p) + if file != "cgroup.procs" { + return nil + } + if iErr != nil { + return iErr + } + cPids, err := readProcsFile(dir) + if err != nil { + return err + } + pids = append(pids, cPids...) + return nil + }) + return pids, err +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/compat_1.5_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/compat_1.5_linux.go new file mode 100644 index 00000000..c7bdf1f6 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/compat_1.5_linux.go @@ -0,0 +1,10 @@ +// +build linux,!go1.5 + +package libcontainer + +import "syscall" + +// GidMappingsEnableSetgroups was added in Go 1.5, so do nothing when building +// with earlier versions +func enableSetgroups(sys *syscall.SysProcAttr) { +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go new file mode 100644 index 00000000..e0f3ca16 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go @@ -0,0 +1,61 @@ +package configs + +import "fmt" + +// blockIODevice holds major:minor format supported in blkio cgroup +type blockIODevice struct { + // Major is the device's major number + Major int64 `json:"major"` + // Minor is the device's minor number + Minor int64 `json:"minor"` +} + +// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair +type WeightDevice struct { + blockIODevice + // Weight is the bandwidth rate for the device, range is from 10 to 1000 + Weight uint16 `json:"weight"` + // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + LeafWeight uint16 `json:"leafWeight"` +} + +// NewWeightDevice returns a configured WeightDevice pointer +func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice { + wd := &WeightDevice{} + wd.Major = major + wd.Minor = minor + wd.Weight = weight + wd.LeafWeight = leafWeight + return wd +} + +// WeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) WeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight) +} + +// LeafWeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) LeafWeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight) +} + +// ThrottleDevice struct holds a `major:minor rate_per_second` pair +type ThrottleDevice struct { + blockIODevice + // Rate is the IO rate limit per cgroup per device + Rate uint64 `json:"rate"` +} + +// NewThrottleDevice returns a configured ThrottleDevice pointer +func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice { + td := &ThrottleDevice{} + td.Major = major + td.Minor = minor + td.Rate = rate + return td +} + +// String formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) String() string { + return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate) +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go new file mode 100644 index 00000000..f2eff91c --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go @@ -0,0 +1,124 @@ +// +build linux freebsd + +package configs + +type FreezerState string + +const ( + Undefined FreezerState = "" + Frozen FreezerState = "FROZEN" + Thawed FreezerState = "THAWED" +) + +type Cgroup struct { + // Deprecated, use Path instead + Name string `json:"name,omitempty"` + + // name of parent of cgroup or slice + // Deprecated, use Path instead + Parent string `json:"parent,omitempty"` + + // Path specifies the path to cgroups that are created and/or joined by the container. + // The path is assumed to be relative to the host system cgroup mountpoint. + Path string `json:"path"` + + // ScopePrefix decribes prefix for the scope name + ScopePrefix string `json:"scope_prefix"` + + // Paths represent the absolute cgroups paths to join. + // This takes precedence over Path. + Paths map[string]string + + // Resources contains various cgroups settings to apply + *Resources +} + +type Resources struct { + // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. + // Deprecated + AllowAllDevices bool `json:"allow_all_devices,omitempty"` + // Deprecated + AllowedDevices []*Device `json:"allowed_devices,omitempty"` + // Deprecated + DeniedDevices []*Device `json:"denied_devices,omitempty"` + + Devices []*Device `json:"devices"` + + // Memory limit (in bytes) + Memory int64 `json:"memory"` + + // Memory reservation or soft_limit (in bytes) + MemoryReservation int64 `json:"memory_reservation"` + + // Total memory usage (memory + swap); set `-1` to enable unlimited swap + MemorySwap int64 `json:"memory_swap"` + + // Kernel memory limit (in bytes) + KernelMemory int64 `json:"kernel_memory"` + + // Kernel memory limit for TCP use (in bytes) + KernelMemoryTCP int64 `json:"kernel_memory_tcp"` + + // CPU shares (relative weight vs. other containers) + CpuShares int64 `json:"cpu_shares"` + + // CPU hardcap limit (in usecs). Allowed cpu time in a given period. + CpuQuota int64 `json:"cpu_quota"` + + // CPU period to be used for hardcapping (in usecs). 0 to use system default. + CpuPeriod int64 `json:"cpu_period"` + + // How many time CPU will use in realtime scheduling (in usecs). + CpuRtRuntime int64 `json:"cpu_quota"` + + // CPU period to be used for realtime scheduling (in usecs). + CpuRtPeriod int64 `json:"cpu_period"` + + // CPU to use + CpusetCpus string `json:"cpuset_cpus"` + + // MEM to use + CpusetMems string `json:"cpuset_mems"` + + // Process limit; set <= `0' to disable limit. + PidsLimit int64 `json:"pids_limit"` + + // Specifies per cgroup weight, range is from 10 to 1000. + BlkioWeight uint16 `json:"blkio_weight"` + + // Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + BlkioLeafWeight uint16 `json:"blkio_leaf_weight"` + + // Weight per cgroup per device, can override BlkioWeight. + BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"` + + // IO read rate limit per cgroup per device, bytes per second. + BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"` + + // IO write rate limit per cgroup per divice, bytes per second. + BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"` + + // IO read rate limit per cgroup per device, IO per second. + BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"` + + // IO write rate limit per cgroup per device, IO per second. + BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"` + + // set the freeze value for the process + Freezer FreezerState `json:"freezer"` + + // Hugetlb limit (in bytes) + HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"` + + // Whether to disable OOM Killer + OomKillDisable bool `json:"oom_kill_disable"` + + // Tuning swappiness behaviour per cgroup + MemorySwappiness *int64 `json:"memory_swappiness"` + + // Set priority of network traffic for container + NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` + + // Set class identifier for container's network packets + NetClsClassid string `json:"net_cls_classid"` +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go new file mode 100644 index 00000000..95e2830a --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go @@ -0,0 +1,6 @@ +// +build !windows,!linux,!freebsd + +package configs + +type Cgroup struct { +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go new file mode 100644 index 00000000..d74847b0 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go @@ -0,0 +1,6 @@ +package configs + +// TODO Windows: This can ultimately be entirely factored out on Windows as +// cgroups are a Unix-specific construct. +type Cgroup struct { +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/config.go new file mode 100644 index 00000000..806e0be9 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -0,0 +1,332 @@ +package configs + +import ( + "bytes" + "encoding/json" + "fmt" + "os/exec" + "time" + + "github.com/Sirupsen/logrus" +) + +type Rlimit struct { + Type int `json:"type"` + Hard uint64 `json:"hard"` + Soft uint64 `json:"soft"` +} + +// IDMap represents UID/GID Mappings for User Namespaces. +type IDMap struct { + ContainerID int `json:"container_id"` + HostID int `json:"host_id"` + Size int `json:"size"` +} + +// Seccomp represents syscall restrictions +// By default, only the native architecture of the kernel is allowed to be used +// for syscalls. Additional architectures can be added by specifying them in +// Architectures. +type Seccomp struct { + DefaultAction Action `json:"default_action"` + Architectures []string `json:"architectures"` + Syscalls []*Syscall `json:"syscalls"` +} + +// Action is taken upon rule match in Seccomp +type Action int + +const ( + Kill Action = iota + 1 + Errno + Trap + Allow + Trace +) + +// Operator is a comparison operator to be used when matching syscall arguments in Seccomp +type Operator int + +const ( + EqualTo Operator = iota + 1 + NotEqualTo + GreaterThan + GreaterThanOrEqualTo + LessThan + LessThanOrEqualTo + MaskEqualTo +) + +// Arg is a rule to match a specific syscall argument in Seccomp +type Arg struct { + Index uint `json:"index"` + Value uint64 `json:"value"` + ValueTwo uint64 `json:"value_two"` + Op Operator `json:"op"` +} + +// Syscall is a rule to match a syscall in Seccomp +type Syscall struct { + Name string `json:"name"` + Action Action `json:"action"` + Args []*Arg `json:"args"` +} + +// TODO Windows. Many of these fields should be factored out into those parts +// which are common across platforms, and those which are platform specific. + +// Config defines configuration options for executing a process inside a contained environment. +type Config struct { + // NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs + // This is a common option when the container is running in ramdisk + NoPivotRoot bool `json:"no_pivot_root"` + + // ParentDeathSignal specifies the signal that is sent to the container's process in the case + // that the parent process dies. + ParentDeathSignal int `json:"parent_death_signal"` + + // PivotDir allows a custom directory inside the container's root filesystem to be used as pivot, when NoPivotRoot is not set. + // When a custom PivotDir not set, a temporary dir inside the root filesystem will be used. The pivot dir needs to be writeable. + // This is required when using read only root filesystems. In these cases, a read/writeable path can be (bind) mounted somewhere inside the root filesystem to act as pivot. + PivotDir string `json:"pivot_dir"` + + // Path to a directory containing the container's root filesystem. + Rootfs string `json:"rootfs"` + + // Readonlyfs will remount the container's rootfs as readonly where only externally mounted + // bind mounts are writtable. + Readonlyfs bool `json:"readonlyfs"` + + // Specifies the mount propagation flags to be applied to /. + RootPropagation int `json:"rootPropagation"` + + // Mounts specify additional source and destination paths that will be mounted inside the container's + // rootfs and mount namespace if specified + Mounts []*Mount `json:"mounts"` + + // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well! + Devices []*Device `json:"devices"` + + MountLabel string `json:"mount_label"` + + // Hostname optionally sets the container's hostname if provided + Hostname string `json:"hostname"` + + // Namespaces specifies the container's namespaces that it should setup when cloning the init process + // If a namespace is not provided that namespace is shared from the container's parent process + Namespaces Namespaces `json:"namespaces"` + + // Capabilities specify the capabilities to keep when executing the process inside the container + // All capbilities not specified will be dropped from the processes capability mask + Capabilities []string `json:"capabilities"` + + // Networks specifies the container's network setup to be created + Networks []*Network `json:"networks"` + + // Routes can be specified to create entries in the route table as the container is started + Routes []*Route `json:"routes"` + + // Cgroups specifies specific cgroup settings for the various subsystems that the container is + // placed into to limit the resources the container has available + Cgroups *Cgroup `json:"cgroups"` + + // AppArmorProfile specifies the profile to apply to the process running in the container and is + // change at the time the process is execed + AppArmorProfile string `json:"apparmor_profile,omitempty"` + + // ProcessLabel specifies the label to apply to the process running in the container. It is + // commonly used by selinux + ProcessLabel string `json:"process_label,omitempty"` + + // Rlimits specifies the resource limits, such as max open files, to set in the container + // If Rlimits are not set, the container will inherit rlimits from the parent process + Rlimits []Rlimit `json:"rlimits,omitempty"` + + // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores + // for a process. Valid values are between the range [-1000, '1000'], where processes with + // higher scores are preferred for being killed. + // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ + OomScoreAdj int `json:"oom_score_adj"` + + // AdditionalGroups specifies the gids that should be added to supplementary groups + // in addition to those that the user belongs to. + AdditionalGroups []string `json:"additional_groups"` + + // UidMappings is an array of User ID mappings for User Namespaces + UidMappings []IDMap `json:"uid_mappings"` + + // GidMappings is an array of Group ID mappings for User Namespaces + GidMappings []IDMap `json:"gid_mappings"` + + // MaskPaths specifies paths within the container's rootfs to mask over with a bind + // mount pointing to /dev/null as to prevent reads of the file. + MaskPaths []string `json:"mask_paths"` + + // ReadonlyPaths specifies paths within the container's rootfs to remount as read-only + // so that these files prevent any writes. + ReadonlyPaths []string `json:"readonly_paths"` + + // Sysctl is a map of properties and their values. It is the equivalent of using + // sysctl -w my.property.name value in Linux. + Sysctl map[string]string `json:"sysctl"` + + // Seccomp allows actions to be taken whenever a syscall is made within the container. + // A number of rules are given, each having an action to be taken if a syscall matches it. + // A default action to be taken if no rules match is also given. + Seccomp *Seccomp `json:"seccomp"` + + // NoNewPrivileges controls whether processes in the container can gain additional privileges. + NoNewPrivileges bool `json:"no_new_privileges,omitempty"` + + // Hooks are a collection of actions to perform at various container lifecycle events. + // CommandHooks are serialized to JSON, but other hooks are not. + Hooks *Hooks + + // Version is the version of opencontainer specification that is supported. + Version string `json:"version"` + + // Labels are user defined metadata that is stored in the config and populated on the state + Labels []string `json:"labels"` + + // NoNewKeyring will not allocated a new session keyring for the container. It will use the + // callers keyring in this case. + NoNewKeyring bool `json:"no_new_keyring"` +} + +type Hooks struct { + // Prestart commands are executed after the container namespaces are created, + // but before the user supplied command is executed from init. + Prestart []Hook + + // Poststart commands are executed after the container init process starts. + Poststart []Hook + + // Poststop commands are executed after the container init process exits. + Poststop []Hook +} + +func (hooks *Hooks) UnmarshalJSON(b []byte) error { + var state struct { + Prestart []CommandHook + Poststart []CommandHook + Poststop []CommandHook + } + + if err := json.Unmarshal(b, &state); err != nil { + return err + } + + deserialize := func(shooks []CommandHook) (hooks []Hook) { + for _, shook := range shooks { + hooks = append(hooks, shook) + } + + return hooks + } + + hooks.Prestart = deserialize(state.Prestart) + hooks.Poststart = deserialize(state.Poststart) + hooks.Poststop = deserialize(state.Poststop) + return nil +} + +func (hooks Hooks) MarshalJSON() ([]byte, error) { + serialize := func(hooks []Hook) (serializableHooks []CommandHook) { + for _, hook := range hooks { + switch chook := hook.(type) { + case CommandHook: + serializableHooks = append(serializableHooks, chook) + default: + logrus.Warnf("cannot serialize hook of type %T, skipping", hook) + } + } + + return serializableHooks + } + + return json.Marshal(map[string]interface{}{ + "prestart": serialize(hooks.Prestart), + "poststart": serialize(hooks.Poststart), + "poststop": serialize(hooks.Poststop), + }) +} + +// HookState is the payload provided to a hook on execution. +type HookState struct { + Version string `json:"ociVersion"` + ID string `json:"id"` + Pid int `json:"pid"` + Root string `json:"root"` + BundlePath string `json:"bundlePath"` +} + +type Hook interface { + // Run executes the hook with the provided state. + Run(HookState) error +} + +// NewFunctionHook will call the provided function when the hook is run. +func NewFunctionHook(f func(HookState) error) FuncHook { + return FuncHook{ + run: f, + } +} + +type FuncHook struct { + run func(HookState) error +} + +func (f FuncHook) Run(s HookState) error { + return f.run(s) +} + +type Command struct { + Path string `json:"path"` + Args []string `json:"args"` + Env []string `json:"env"` + Dir string `json:"dir"` + Timeout *time.Duration `json:"timeout"` +} + +// NewCommandHook will execute the provided command when the hook is run. +func NewCommandHook(cmd Command) CommandHook { + return CommandHook{ + Command: cmd, + } +} + +type CommandHook struct { + Command +} + +func (c Command) Run(s HookState) error { + b, err := json.Marshal(s) + if err != nil { + return err + } + cmd := exec.Cmd{ + Path: c.Path, + Args: c.Args, + Env: c.Env, + Stdin: bytes.NewReader(b), + } + errC := make(chan error, 1) + go func() { + out, err := cmd.CombinedOutput() + if err != nil { + err = fmt.Errorf("%s: %s", err, out) + } + errC <- err + }() + if c.Timeout != nil { + select { + case err := <-errC: + return err + case <-time.After(*c.Timeout): + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds()) + } + } + return <-errC +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/config_unix.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/config_unix.go new file mode 100644 index 00000000..a60554a7 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/config_unix.go @@ -0,0 +1,51 @@ +// +build freebsd linux + +package configs + +import "fmt" + +// HostUID gets the root uid for the process on host which could be non-zero +// when user namespaces are enabled. +func (c Config) HostUID() (int, error) { + if c.Namespaces.Contains(NEWUSER) { + if c.UidMappings == nil { + return -1, fmt.Errorf("User namespaces enabled, but no user mappings found.") + } + id, found := c.hostIDFromMapping(0, c.UidMappings) + if !found { + return -1, fmt.Errorf("User namespaces enabled, but no root user mapping found.") + } + return id, nil + } + // Return default root uid 0 + return 0, nil +} + +// HostGID gets the root gid for the process on host which could be non-zero +// when user namespaces are enabled. +func (c Config) HostGID() (int, error) { + if c.Namespaces.Contains(NEWUSER) { + if c.GidMappings == nil { + return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.") + } + id, found := c.hostIDFromMapping(0, c.GidMappings) + if !found { + return -1, fmt.Errorf("User namespaces enabled, but no root group mapping found.") + } + return id, nil + } + // Return default root gid 0 + return 0, nil +} + +// Utility function that gets a host ID for a container ID from user namespace map +// if that ID is present in the map. +func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) { + for _, m := range uMap { + if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) { + hostID := m.HostID + (containerID - m.ContainerID) + return hostID, true + } + } + return -1, false +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/device.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/device.go new file mode 100644 index 00000000..8701bb21 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/device.go @@ -0,0 +1,57 @@ +package configs + +import ( + "fmt" + "os" +) + +const ( + Wildcard = -1 +) + +// TODO Windows: This can be factored out in the future + +type Device struct { + // Device type, block, char, etc. + Type rune `json:"type"` + + // Path to the device. + Path string `json:"path"` + + // Major is the device's major number. + Major int64 `json:"major"` + + // Minor is the device's minor number. + Minor int64 `json:"minor"` + + // Cgroup permissions format, rwm. + Permissions string `json:"permissions"` + + // FileMode permission bits for the device. + FileMode os.FileMode `json:"file_mode"` + + // Uid of the device. + Uid uint32 `json:"uid"` + + // Gid of the device. + Gid uint32 `json:"gid"` + + // Write the file to the allowed list + Allow bool `json:"allow"` +} + +func (d *Device) CgroupString() string { + return fmt.Sprintf("%c %s:%s %s", d.Type, deviceNumberString(d.Major), deviceNumberString(d.Minor), d.Permissions) +} + +func (d *Device) Mkdev() int { + return int((d.Major << 8) | (d.Minor & 0xff) | ((d.Minor & 0xfff00) << 12)) +} + +// deviceNumberString converts the device number to a string return result. +func deviceNumberString(number int64) string { + if number == Wildcard { + return "*" + } + return fmt.Sprint(number) +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go new file mode 100644 index 00000000..ba1f437f --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go @@ -0,0 +1,125 @@ +// +build linux freebsd + +package configs + +var ( + // DefaultSimpleDevices are devices that are to be both allowed and created. + DefaultSimpleDevices = []*Device{ + // /dev/null and zero + { + Path: "/dev/null", + Type: 'c', + Major: 1, + Minor: 3, + Permissions: "rwm", + FileMode: 0666, + }, + { + Path: "/dev/zero", + Type: 'c', + Major: 1, + Minor: 5, + Permissions: "rwm", + FileMode: 0666, + }, + + { + Path: "/dev/full", + Type: 'c', + Major: 1, + Minor: 7, + Permissions: "rwm", + FileMode: 0666, + }, + + // consoles and ttys + { + Path: "/dev/tty", + Type: 'c', + Major: 5, + Minor: 0, + Permissions: "rwm", + FileMode: 0666, + }, + + // /dev/urandom,/dev/random + { + Path: "/dev/urandom", + Type: 'c', + Major: 1, + Minor: 9, + Permissions: "rwm", + FileMode: 0666, + }, + { + Path: "/dev/random", + Type: 'c', + Major: 1, + Minor: 8, + Permissions: "rwm", + FileMode: 0666, + }, + } + DefaultAllowedDevices = append([]*Device{ + // allow mknod for any device + { + Type: 'c', + Major: Wildcard, + Minor: Wildcard, + Permissions: "m", + }, + { + Type: 'b', + Major: Wildcard, + Minor: Wildcard, + Permissions: "m", + }, + + { + Path: "/dev/console", + Type: 'c', + Major: 5, + Minor: 1, + Permissions: "rwm", + }, + // /dev/pts/ - pts namespaces are "coming soon" + { + Path: "", + Type: 'c', + Major: 136, + Minor: Wildcard, + Permissions: "rwm", + }, + { + Path: "", + Type: 'c', + Major: 5, + Minor: 2, + Permissions: "rwm", + }, + + // tuntap + { + Path: "", + Type: 'c', + Major: 10, + Minor: 200, + Permissions: "rwm", + }, + }, DefaultSimpleDevices...) + DefaultAutoCreatedDevices = append([]*Device{ + { + // /dev/fuse is created but not allowed. + // This is to allow java to work. Because java + // Insists on there being a /dev/fuse + // https://github.com/docker/docker/issues/514 + // https://github.com/docker/docker/issues/2393 + // + Path: "/dev/fuse", + Type: 'c', + Major: 10, + Minor: 229, + Permissions: "rwm", + }, + }, DefaultSimpleDevices...) +) diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go new file mode 100644 index 00000000..d3021638 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go @@ -0,0 +1,9 @@ +package configs + +type HugepageLimit struct { + // which type of hugepage to limit. + Pagesize string `json:"page_size"` + + // usage limit for hugepage. + Limit uint64 `json:"limit"` +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go new file mode 100644 index 00000000..9a0395ea --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go @@ -0,0 +1,14 @@ +package configs + +import ( + "fmt" +) + +type IfPrioMap struct { + Interface string `json:"interface"` + Priority int64 `json:"priority"` +} + +func (i *IfPrioMap) CgroupString() string { + return fmt.Sprintf("%s %d", i.Interface, i.Priority) +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/mount.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/mount.go new file mode 100644 index 00000000..cc770c91 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/mount.go @@ -0,0 +1,30 @@ +package configs + +type Mount struct { + // Source path for the mount. + Source string `json:"source"` + + // Destination path for the mount inside the container. + Destination string `json:"destination"` + + // Device the mount is for. + Device string `json:"device"` + + // Mount flags. + Flags int `json:"flags"` + + // Propagation Flags + PropagationFlags []int `json:"propagation_flags"` + + // Mount data applied to the mount. + Data string `json:"data"` + + // Relabel source if set, "z" indicates shared, "Z" indicates unshared. + Relabel string `json:"relabel"` + + // Optional Command to be run before Source is mounted. + PremountCmds []Command `json:"premount_cmds"` + + // Optional Command to be run after Source is mounted. + PostmountCmds []Command `json:"postmount_cmds"` +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces.go new file mode 100644 index 00000000..a3329a31 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces.go @@ -0,0 +1,5 @@ +package configs + +type NamespaceType string + +type Namespaces []Namespace diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go new file mode 100644 index 00000000..fb4b8522 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go @@ -0,0 +1,31 @@ +// +build linux + +package configs + +import "syscall" + +func (n *Namespace) Syscall() int { + return namespaceInfo[n.Type] +} + +var namespaceInfo = map[NamespaceType]int{ + NEWNET: syscall.CLONE_NEWNET, + NEWNS: syscall.CLONE_NEWNS, + NEWUSER: syscall.CLONE_NEWUSER, + NEWIPC: syscall.CLONE_NEWIPC, + NEWUTS: syscall.CLONE_NEWUTS, + NEWPID: syscall.CLONE_NEWPID, +} + +// CloneFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare. This function returns flags only for new namespaces. +func (n *Namespaces) CloneFlags() uintptr { + var flag int + for _, v := range *n { + if v.Path != "" { + continue + } + flag |= namespaceInfo[v.Type] + } + return uintptr(flag) +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go new file mode 100644 index 00000000..0547223a --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go @@ -0,0 +1,15 @@ +// +build !linux,!windows + +package configs + +func (n *Namespace) Syscall() int { + panic("No namespace syscall support") + return 0 +} + +// CloneFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare. This function returns flags only for new namespaces. +func (n *Namespaces) CloneFlags() uintptr { + panic("No namespace syscall support") + return uintptr(0) +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go new file mode 100644 index 00000000..b9c820d0 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go @@ -0,0 +1,127 @@ +// +build linux freebsd + +package configs + +import ( + "fmt" + "os" + "sync" +) + +const ( + NEWNET NamespaceType = "NEWNET" + NEWPID NamespaceType = "NEWPID" + NEWNS NamespaceType = "NEWNS" + NEWUTS NamespaceType = "NEWUTS" + NEWIPC NamespaceType = "NEWIPC" + NEWUSER NamespaceType = "NEWUSER" +) + +var ( + nsLock sync.Mutex + supportedNamespaces = make(map[NamespaceType]bool) +) + +// nsToFile converts the namespace type to its filename +func nsToFile(ns NamespaceType) string { + switch ns { + case NEWNET: + return "net" + case NEWNS: + return "mnt" + case NEWPID: + return "pid" + case NEWIPC: + return "ipc" + case NEWUSER: + return "user" + case NEWUTS: + return "uts" + } + return "" +} + +// IsNamespaceSupported returns whether a namespace is available or +// not +func IsNamespaceSupported(ns NamespaceType) bool { + nsLock.Lock() + defer nsLock.Unlock() + supported, ok := supportedNamespaces[ns] + if ok { + return supported + } + nsFile := nsToFile(ns) + // if the namespace type is unknown, just return false + if nsFile == "" { + return false + } + _, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile)) + // a namespace is supported if it exists and we have permissions to read it + supported = err == nil + supportedNamespaces[ns] = supported + return supported +} + +func NamespaceTypes() []NamespaceType { + return []NamespaceType{ + NEWNET, + NEWPID, + NEWNS, + NEWUTS, + NEWIPC, + NEWUSER, + } +} + +// Namespace defines configuration for each namespace. It specifies an +// alternate path that is able to be joined via setns. +type Namespace struct { + Type NamespaceType `json:"type"` + Path string `json:"path"` +} + +func (n *Namespace) GetPath(pid int) string { + if n.Path != "" { + return n.Path + } + return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type)) +} + +func (n *Namespaces) Remove(t NamespaceType) bool { + i := n.index(t) + if i == -1 { + return false + } + *n = append((*n)[:i], (*n)[i+1:]...) + return true +} + +func (n *Namespaces) Add(t NamespaceType, path string) { + i := n.index(t) + if i == -1 { + *n = append(*n, Namespace{Type: t, Path: path}) + return + } + (*n)[i].Path = path +} + +func (n *Namespaces) index(t NamespaceType) int { + for i, ns := range *n { + if ns.Type == t { + return i + } + } + return -1 +} + +func (n *Namespaces) Contains(t NamespaceType) bool { + return n.index(t) != -1 +} + +func (n *Namespaces) PathOf(t NamespaceType) string { + i := n.index(t) + if i == -1 { + return "" + } + return (*n)[i].Path +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go new file mode 100644 index 00000000..9a74033c --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go @@ -0,0 +1,8 @@ +// +build !linux,!freebsd + +package configs + +// Namespace defines configuration for each namespace. It specifies an +// alternate path that is able to be joined via setns. +type Namespace struct { +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/network.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/network.go new file mode 100644 index 00000000..ccdb228e --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/network.go @@ -0,0 +1,72 @@ +package configs + +// Network defines configuration for a container's networking stack +// +// The network configuration can be omitted from a container causing the +// container to be setup with the host's networking stack +type Network struct { + // Type sets the networks type, commonly veth and loopback + Type string `json:"type"` + + // Name of the network interface + Name string `json:"name"` + + // The bridge to use. + Bridge string `json:"bridge"` + + // MacAddress contains the MAC address to set on the network interface + MacAddress string `json:"mac_address"` + + // Address contains the IPv4 and mask to set on the network interface + Address string `json:"address"` + + // Gateway sets the gateway address that is used as the default for the interface + Gateway string `json:"gateway"` + + // IPv6Address contains the IPv6 and mask to set on the network interface + IPv6Address string `json:"ipv6_address"` + + // IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface + IPv6Gateway string `json:"ipv6_gateway"` + + // Mtu sets the mtu value for the interface and will be mirrored on both the host and + // container's interfaces if a pair is created, specifically in the case of type veth + // Note: This does not apply to loopback interfaces. + Mtu int `json:"mtu"` + + // TxQueueLen sets the tx_queuelen value for the interface and will be mirrored on both the host and + // container's interfaces if a pair is created, specifically in the case of type veth + // Note: This does not apply to loopback interfaces. + TxQueueLen int `json:"txqueuelen"` + + // HostInterfaceName is a unique name of a veth pair that resides on in the host interface of the + // container. + HostInterfaceName string `json:"host_interface_name"` + + // HairpinMode specifies if hairpin NAT should be enabled on the virtual interface + // bridge port in the case of type veth + // Note: This is unsupported on some systems. + // Note: This does not apply to loopback interfaces. + HairpinMode bool `json:"hairpin_mode"` +} + +// Routes can be specified to create entries in the route table as the container is started +// +// All of destination, source, and gateway should be either IPv4 or IPv6. +// One of the three options must be present, and omitted entries will use their +// IP family default for the route table. For IPv4 for example, setting the +// gateway to 1.2.3.4 and the interface to eth0 will set up a standard +// destination of 0.0.0.0(or *) when viewed in the route table. +type Route struct { + // Sets the destination and mask, should be a CIDR. Accepts IPv4 and IPv6 + Destination string `json:"destination"` + + // Sets the source and mask, should be a CIDR. Accepts IPv4 and IPv6 + Source string `json:"source"` + + // Sets the gateway. Accepts IPv4 and IPv6 + Gateway string `json:"gateway"` + + // The device to set this route up for, for example: eth0 + InterfaceName string `json:"interface_name"` +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go new file mode 100644 index 00000000..448cde27 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go @@ -0,0 +1,138 @@ +package validate + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/selinux" +) + +type Validator interface { + Validate(*configs.Config) error +} + +func New() Validator { + return &ConfigValidator{} +} + +type ConfigValidator struct { +} + +func (v *ConfigValidator) Validate(config *configs.Config) error { + if err := v.rootfs(config); err != nil { + return err + } + if err := v.network(config); err != nil { + return err + } + if err := v.hostname(config); err != nil { + return err + } + if err := v.security(config); err != nil { + return err + } + if err := v.usernamespace(config); err != nil { + return err + } + if err := v.sysctl(config); err != nil { + return err + } + return nil +} + +// rootfs validates if the rootfs is an absolute path and is not a symlink +// to the container's root filesystem. +func (v *ConfigValidator) rootfs(config *configs.Config) error { + cleaned, err := filepath.Abs(config.Rootfs) + if err != nil { + return err + } + if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil { + return err + } + if filepath.Clean(config.Rootfs) != cleaned { + return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs) + } + return nil +} + +func (v *ConfigValidator) network(config *configs.Config) error { + if !config.Namespaces.Contains(configs.NEWNET) { + if len(config.Networks) > 0 || len(config.Routes) > 0 { + return fmt.Errorf("unable to apply network settings without a private NET namespace") + } + } + return nil +} + +func (v *ConfigValidator) hostname(config *configs.Config) error { + if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) { + return fmt.Errorf("unable to set hostname without a private UTS namespace") + } + return nil +} + +func (v *ConfigValidator) security(config *configs.Config) error { + // restrict sys without mount namespace + if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) && + !config.Namespaces.Contains(configs.NEWNS) { + return fmt.Errorf("unable to restrict sys entries without a private MNT namespace") + } + if config.ProcessLabel != "" && !selinux.SelinuxEnabled() { + return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported") + } + + return nil +} + +func (v *ConfigValidator) usernamespace(config *configs.Config) error { + if config.Namespaces.Contains(configs.NEWUSER) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + return fmt.Errorf("USER namespaces aren't enabled in the kernel") + } + } else { + if config.UidMappings != nil || config.GidMappings != nil { + return fmt.Errorf("User namespace mappings specified, but USER namespace isn't enabled in the config") + } + } + return nil +} + +// sysctl validates that the specified sysctl keys are valid or not. +// /proc/sys isn't completely namespaced and depending on which namespaces +// are specified, a subset of sysctls are permitted. +func (v *ConfigValidator) sysctl(config *configs.Config) error { + validSysctlMap := map[string]bool{ + "kernel.msgmax": true, + "kernel.msgmnb": true, + "kernel.msgmni": true, + "kernel.sem": true, + "kernel.shmall": true, + "kernel.shmmax": true, + "kernel.shmmni": true, + "kernel.shm_rmid_forced": true, + } + + for s := range config.Sysctl { + if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") { + if config.Namespaces.Contains(configs.NEWIPC) { + continue + } else { + return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s) + } + } + if strings.HasPrefix(s, "net.") { + if config.Namespaces.Contains(configs.NEWNET) { + continue + } else { + return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s) + } + } + return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s) + } + + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/console.go b/vendor/src/github.com/opencontainers/runc/libcontainer/console.go new file mode 100644 index 00000000..042a2a2e --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/console.go @@ -0,0 +1,15 @@ +package libcontainer + +import "io" + +// Console represents a pseudo TTY. +type Console interface { + io.ReadWriter + io.Closer + + // Path returns the filesystem path to the slave side of the pty. + Path() string + + // Fd returns the fd for the master of the pty. + Fd() uintptr +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/console_freebsd.go b/vendor/src/github.com/opencontainers/runc/libcontainer/console_freebsd.go new file mode 100644 index 00000000..3c89eda0 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/console_freebsd.go @@ -0,0 +1,13 @@ +// +build freebsd + +package libcontainer + +import ( + "errors" +) + +// NewConsole returns an initalized console that can be used within a container by copying bytes +// from the master side to the slave that is attached as the tty for the container's init process. +func NewConsole(uid, gid int) (Console, error) { + return nil, errors.New("libcontainer console is not supported on FreeBSD") +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/console_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/console_linux.go new file mode 100644 index 00000000..7af771b6 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/console_linux.go @@ -0,0 +1,145 @@ +package libcontainer + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + "unsafe" + + "github.com/opencontainers/runc/libcontainer/label" +) + +// NewConsole returns an initalized console that can be used within a container by copying bytes +// from the master side to the slave that is attached as the tty for the container's init process. +func NewConsole(uid, gid int) (Console, error) { + master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) + if err != nil { + return nil, err + } + console, err := ptsname(master) + if err != nil { + return nil, err + } + if err := unlockpt(master); err != nil { + return nil, err + } + if err := os.Chmod(console, 0600); err != nil { + return nil, err + } + if err := os.Chown(console, uid, gid); err != nil { + return nil, err + } + return &linuxConsole{ + slavePath: console, + master: master, + }, nil +} + +// newConsoleFromPath is an internal function returning an initialized console for use inside +// a container's MNT namespace. +func newConsoleFromPath(slavePath string) *linuxConsole { + return &linuxConsole{ + slavePath: slavePath, + } +} + +// linuxConsole is a linux psuedo TTY for use within a container. +type linuxConsole struct { + master *os.File + slavePath string +} + +func (c *linuxConsole) Fd() uintptr { + return c.master.Fd() +} + +func (c *linuxConsole) Path() string { + return c.slavePath +} + +func (c *linuxConsole) Read(b []byte) (int, error) { + return c.master.Read(b) +} + +func (c *linuxConsole) Write(b []byte) (int, error) { + return c.master.Write(b) +} + +func (c *linuxConsole) Close() error { + if m := c.master; m != nil { + return m.Close() + } + return nil +} + +// mount initializes the console inside the rootfs mounting with the specified mount label +// and applying the correct ownership of the console. +func (c *linuxConsole) mount(rootfs, mountLabel string) error { + oldMask := syscall.Umask(0000) + defer syscall.Umask(oldMask) + if err := label.SetFileLabel(c.slavePath, mountLabel); err != nil { + return err + } + dest := filepath.Join(rootfs, "/dev/console") + f, err := os.Create(dest) + if err != nil && !os.IsExist(err) { + return err + } + if f != nil { + f.Close() + } + return syscall.Mount(c.slavePath, dest, "bind", syscall.MS_BIND, "") +} + +// dupStdio opens the slavePath for the console and dups the fds to the current +// processes stdio, fd 0,1,2. +func (c *linuxConsole) dupStdio() error { + slave, err := c.open(syscall.O_RDWR) + if err != nil { + return err + } + fd := int(slave.Fd()) + for _, i := range []int{0, 1, 2} { + if err := syscall.Dup3(fd, i, 0); err != nil { + return err + } + } + return nil +} + +// open is a clone of os.OpenFile without the O_CLOEXEC used to open the pty slave. +func (c *linuxConsole) open(flag int) (*os.File, error) { + r, e := syscall.Open(c.slavePath, flag, 0) + if e != nil { + return nil, &os.PathError{ + Op: "open", + Path: c.slavePath, + Err: e, + } + } + return os.NewFile(uintptr(r), c.slavePath), nil +} + +func ioctl(fd uintptr, flag, data uintptr) error { + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 { + return err + } + return nil +} + +// unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f. +// unlockpt should be called before opening the slave side of a pty. +func unlockpt(f *os.File) error { + var u int32 + return ioctl(f.Fd(), syscall.TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) +} + +// ptsname retrieves the name of the first available pts for the given master. +func ptsname(f *os.File) (string, error) { + var n int32 + if err := ioctl(f.Fd(), syscall.TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { + return "", err + } + return fmt.Sprintf("/dev/pts/%d", n), nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/console_solaris.go b/vendor/src/github.com/opencontainers/runc/libcontainer/console_solaris.go new file mode 100644 index 00000000..9e89f505 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/console_solaris.go @@ -0,0 +1,11 @@ +package libcontainer + +import ( + "errors" +) + +// NewConsole returns an initalized console that can be used within a container by copying bytes +// from the master side to the slave that is attached as the tty for the container's init process. +func NewConsole(uid, gid int) (Console, error) { + return nil, errors.New("libcontainer console is not supported on Solaris") +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/console_windows.go b/vendor/src/github.com/opencontainers/runc/libcontainer/console_windows.go new file mode 100644 index 00000000..a68c02f6 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/console_windows.go @@ -0,0 +1,30 @@ +package libcontainer + +// NewConsole returns an initalized console that can be used within a container +func NewConsole(uid, gid int) (Console, error) { + return &windowsConsole{}, nil +} + +// windowsConsole is a Windows psuedo TTY for use within a container. +type windowsConsole struct { +} + +func (c *windowsConsole) Fd() uintptr { + return 0 +} + +func (c *windowsConsole) Path() string { + return "" +} + +func (c *windowsConsole) Read(b []byte) (int, error) { + return 0, nil +} + +func (c *windowsConsole) Write(b []byte) (int, error) { + return 0, nil +} + +func (c *windowsConsole) Close() error { + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/container.go b/vendor/src/github.com/opencontainers/runc/libcontainer/container.go new file mode 100644 index 00000000..1a71179c --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/container.go @@ -0,0 +1,157 @@ +// Package libcontainer provides a native Go implementation for creating containers +// with namespaces, cgroups, capabilities, and filesystem access controls. +// It allows you to manage the lifecycle of the container performing additional operations +// after the container is created. +package libcontainer + +import ( + "os" + "time" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +// Status is the status of a container. +type Status int + +const ( + // Created is the status that denotes the container exists but has not been run yet. + Created Status = iota + // Running is the status that denotes the container exists and is running. + Running + // Pausing is the status that denotes the container exists, it is in the process of being paused. + Pausing + // Paused is the status that denotes the container exists, but all its processes are paused. + Paused + // Stopped is the status that denotes the container does not have a created or running process. + Stopped +) + +func (s Status) String() string { + switch s { + case Created: + return "created" + case Running: + return "running" + case Pausing: + return "pausing" + case Paused: + return "paused" + case Stopped: + return "stopped" + default: + return "unknown" + } +} + +// BaseState represents the platform agnostic pieces relating to a +// running container's state +type BaseState struct { + // ID is the container ID. + ID string `json:"id"` + + // InitProcessPid is the init process id in the parent namespace. + InitProcessPid int `json:"init_process_pid"` + + // InitProcessStartTime is the init process start time in clock cycles since boot time. + InitProcessStartTime string `json:"init_process_start"` + + // Created is the unix timestamp for the creation time of the container in UTC + Created time.Time `json:"created"` + + // Config is the container's configuration. + Config configs.Config `json:"config"` +} + +// BaseContainer is a libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. BaseContainer includes methods that are platform agnostic. +type BaseContainer interface { + // Returns the ID of the container + ID() string + + // Returns the current status of the container. + // + // errors: + // ContainerDestroyed - Container no longer exists, + // SystemError - System error. + Status() (Status, error) + + // State returns the current container's state information. + // + // errors: + // SystemError - System error. + State() (*State, error) + + // Returns the current config of the container. + Config() configs.Config + + // Returns the PIDs inside this container. The PIDs are in the namespace of the calling process. + // + // errors: + // ContainerDestroyed - Container no longer exists, + // SystemError - System error. + // + // Some of the returned PIDs may no longer refer to processes in the Container, unless + // the Container state is PAUSED in which case every PID in the slice is valid. + Processes() ([]int, error) + + // Returns statistics for the container. + // + // errors: + // ContainerDestroyed - Container no longer exists, + // SystemError - System error. + Stats() (*Stats, error) + + // Set resources of container as configured + // + // We can use this to change resources when containers are running. + // + // errors: + // SystemError - System error. + Set(config configs.Config) error + + // Start a process inside the container. Returns error if process fails to + // start. You can track process lifecycle with passed Process structure. + // + // errors: + // ContainerDestroyed - Container no longer exists, + // ConfigInvalid - config is invalid, + // ContainerPaused - Container is paused, + // SystemError - System error. + Start(process *Process) (err error) + + // Run immediatly starts the process inside the conatiner. Returns error if process + // fails to start. It does not block waiting for the exec fifo after start returns but + // opens the fifo after start returns. + // + // errors: + // ContainerDestroyed - Container no longer exists, + // ConfigInvalid - config is invalid, + // ContainerPaused - Container is paused, + // SystemError - System error. + Run(process *Process) (err error) + + // Destroys the container after killing all running processes. + // + // Any event registrations are removed before the container is destroyed. + // No error is returned if the container is already destroyed. + // + // errors: + // SystemError - System error. + Destroy() error + + // Signal sends the provided signal code to the container's initial process. + // + // errors: + // SystemError - System error. + Signal(s os.Signal) error + + // Exec signals the container to exec the users process at the end of the init. + // + // errors: + // SystemError - System error. + Exec() error +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/container_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/container_linux.go new file mode 100644 index 00000000..294109cd --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/container_linux.go @@ -0,0 +1,1315 @@ +// +build linux + +package libcontainer + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "reflect" + "strings" + "sync" + "syscall" + "time" + + "github.com/Sirupsen/logrus" + "github.com/golang/protobuf/proto" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/criurpc" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/syndtr/gocapability/capability" + "github.com/vishvananda/netlink/nl" +) + +const stdioFdCount = 3 + +type linuxContainer struct { + id string + root string + config *configs.Config + cgroupManager cgroups.Manager + initPath string + initArgs []string + initProcess parentProcess + criuPath string + m sync.Mutex + criuVersion int + state containerState + created time.Time +} + +// State represents a running container's state +type State struct { + BaseState + + // Platform specific fields below here + + // Path to all the cgroups setup for a container. Key is cgroup subsystem name + // with the value as the path. + CgroupPaths map[string]string `json:"cgroup_paths"` + + // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type + // with the value as the path. + NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"` + + // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore + ExternalDescriptors []string `json:"external_descriptors,omitempty"` +} + +// Container is a libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. +type Container interface { + BaseContainer + + // Methods below here are platform specific + + // Checkpoint checkpoints the running container's state to disk using the criu(8) utility. + // + // errors: + // Systemerror - System error. + Checkpoint(criuOpts *CriuOpts) error + + // Restore restores the checkpointed container to a running state using the criu(8) utility. + // + // errors: + // Systemerror - System error. + Restore(process *Process, criuOpts *CriuOpts) error + + // If the Container state is RUNNING, sets the Container state to PAUSING and pauses + // the execution of any user processes. Asynchronously, when the container finished being paused the + // state is changed to PAUSED. + // If the Container state is PAUSED, do nothing. + // + // errors: + // ContainerDestroyed - Container no longer exists, + // Systemerror - System error. + Pause() error + + // If the Container state is PAUSED, resumes the execution of any user processes in the + // Container before setting the Container state to RUNNING. + // If the Container state is RUNNING, do nothing. + // + // errors: + // ContainerDestroyed - Container no longer exists, + // Systemerror - System error. + Resume() error + + // NotifyOOM returns a read-only channel signaling when the container receives an OOM notification. + // + // errors: + // Systemerror - System error. + NotifyOOM() (<-chan struct{}, error) + + // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level + // + // errors: + // Systemerror - System error. + NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) +} + +// ID returns the container's unique ID +func (c *linuxContainer) ID() string { + return c.id +} + +// Config returns the container's configuration +func (c *linuxContainer) Config() configs.Config { + return *c.config +} + +func (c *linuxContainer) Status() (Status, error) { + c.m.Lock() + defer c.m.Unlock() + return c.currentStatus() +} + +func (c *linuxContainer) State() (*State, error) { + c.m.Lock() + defer c.m.Unlock() + return c.currentState() +} + +func (c *linuxContainer) Processes() ([]int, error) { + pids, err := c.cgroupManager.GetAllPids() + if err != nil { + return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups") + } + return pids, nil +} + +func (c *linuxContainer) Stats() (*Stats, error) { + var ( + err error + stats = &Stats{} + ) + if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { + return stats, newSystemErrorWithCause(err, "getting container stats from cgroups") + } + for _, iface := range c.config.Networks { + switch iface.Type { + case "veth": + istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) + if err != nil { + return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName) + } + stats.Interfaces = append(stats.Interfaces, istats) + } + } + return stats, nil +} + +func (c *linuxContainer) Set(config configs.Config) error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + if status == Stopped { + return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) + } + c.config = &config + return c.cgroupManager.Set(c.config) +} + +func (c *linuxContainer) Start(process *Process) error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + return c.start(process, status == Stopped) +} + +func (c *linuxContainer) Run(process *Process) error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + if err := c.start(process, status == Stopped); err != nil { + return err + } + if status == Stopped { + return c.exec() + } + return nil +} + +func (c *linuxContainer) Exec() error { + c.m.Lock() + defer c.m.Unlock() + return c.exec() +} + +func (c *linuxContainer) exec() error { + path := filepath.Join(c.root, execFifoFilename) + f, err := os.OpenFile(path, os.O_RDONLY, 0) + if err != nil { + return newSystemErrorWithCause(err, "open exec fifo for reading") + } + defer f.Close() + data, err := ioutil.ReadAll(f) + if err != nil { + return err + } + if len(data) > 0 { + os.Remove(path) + return nil + } + return fmt.Errorf("cannot start an already running container") +} + +func (c *linuxContainer) start(process *Process, isInit bool) error { + parent, err := c.newParentProcess(process, isInit) + if err != nil { + return newSystemErrorWithCause(err, "creating new parent process") + } + if err := parent.start(); err != nil { + // terminate the process to ensure that it properly is reaped. + if err := parent.terminate(); err != nil { + logrus.Warn(err) + } + return newSystemErrorWithCause(err, "starting container process") + } + // generate a timestamp indicating when the container was started + c.created = time.Now().UTC() + c.state = &runningState{ + c: c, + } + if isInit { + c.state = &createdState{ + c: c, + } + if err := c.updateState(parent); err != nil { + return err + } + if c.config.Hooks != nil { + s := configs.HookState{ + Version: c.config.Version, + ID: c.id, + Pid: parent.pid(), + Root: c.config.Rootfs, + BundlePath: utils.SearchLabels(c.config.Labels, "bundle"), + } + for i, hook := range c.config.Hooks.Poststart { + if err := hook.Run(s); err != nil { + if err := parent.terminate(); err != nil { + logrus.Warn(err) + } + return newSystemErrorWithCausef(err, "running poststart hook %d", i) + } + } + } + } + return nil +} + +func (c *linuxContainer) Signal(s os.Signal) error { + if err := c.initProcess.signal(s); err != nil { + return newSystemErrorWithCause(err, "signaling init process") + } + return nil +} + +func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) { + parentPipe, childPipe, err := newPipe() + if err != nil { + return nil, newSystemErrorWithCause(err, "creating new init pipe") + } + rootDir, err := os.Open(c.root) + if err != nil { + return nil, err + } + cmd, err := c.commandTemplate(p, childPipe, rootDir) + if err != nil { + return nil, newSystemErrorWithCause(err, "creating new command template") + } + if !doInit { + return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir) + } + return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) +} + +func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) { + cmd := &exec.Cmd{ + Path: c.initPath, + Args: c.initArgs, + } + cmd.Stdin = p.Stdin + cmd.Stdout = p.Stdout + cmd.Stderr = p.Stderr + cmd.Dir = c.config.Rootfs + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2), + fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) + // NOTE: when running a container with no PID namespace and the parent process spawning the container is + // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason + // even with the parent still running. + if c.config.ParentDeathSignal > 0 { + cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal) + } + return cmd, nil +} + +func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) + nsMaps := make(map[configs.NamespaceType]string) + for _, ns := range c.config.Namespaces { + if ns.Path != "" { + nsMaps[ns.Type] = ns.Path + } + } + _, sharePidns := nsMaps[configs.NEWPID] + data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "") + if err != nil { + return nil, err + } + return &initProcess{ + cmd: cmd, + childPipe: childPipe, + parentPipe: parentPipe, + manager: c.cgroupManager, + config: c.newInitConfig(p), + container: c, + process: p, + bootstrapData: data, + sharePidns: sharePidns, + rootDir: rootDir, + }, nil +} + +func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) + state, err := c.currentState() + if err != nil { + return nil, newSystemErrorWithCause(err, "getting container's current state") + } + // for setns process, we dont have to set cloneflags as the process namespaces + // will only be set via setns syscall + data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath) + if err != nil { + return nil, err + } + // TODO: set on container for process management + return &setnsProcess{ + cmd: cmd, + cgroupPaths: c.cgroupManager.GetPaths(), + childPipe: childPipe, + parentPipe: parentPipe, + config: c.newInitConfig(p), + process: p, + bootstrapData: data, + rootDir: rootDir, + }, nil +} + +func (c *linuxContainer) newInitConfig(process *Process) *initConfig { + cfg := &initConfig{ + Config: c.config, + Args: process.Args, + Env: process.Env, + User: process.User, + Cwd: process.Cwd, + Console: process.consolePath, + Capabilities: process.Capabilities, + PassedFilesCount: len(process.ExtraFiles), + ContainerId: c.ID(), + NoNewPrivileges: c.config.NoNewPrivileges, + AppArmorProfile: c.config.AppArmorProfile, + ProcessLabel: c.config.ProcessLabel, + Rlimits: c.config.Rlimits, + ExecFifoPath: filepath.Join(c.root, execFifoFilename), + } + if process.NoNewPrivileges != nil { + cfg.NoNewPrivileges = *process.NoNewPrivileges + } + if process.AppArmorProfile != "" { + cfg.AppArmorProfile = process.AppArmorProfile + } + if process.Label != "" { + cfg.ProcessLabel = process.Label + } + if len(process.Rlimits) > 0 { + cfg.Rlimits = process.Rlimits + } + return cfg +} + +func newPipe() (parent *os.File, child *os.File, err error) { + fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil +} + +func (c *linuxContainer) Destroy() error { + c.m.Lock() + defer c.m.Unlock() + return c.state.destroy() +} + +func (c *linuxContainer) Pause() error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + switch status { + case Running, Created: + if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { + return err + } + return c.state.transition(&pausedState{ + c: c, + }) + } + return newGenericError(fmt.Errorf("container not running: %s", status), ContainerNotRunning) +} + +func (c *linuxContainer) Resume() error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + if status != Paused { + return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused) + } + if err := c.cgroupManager.Freeze(configs.Thawed); err != nil { + return err + } + return c.state.transition(&runningState{ + c: c, + }) +} + +func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { + return notifyOnOOM(c.cgroupManager.GetPaths()) +} + +func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { + return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) +} + +// checkCriuVersion checks Criu version greater than or equal to minVersion +func (c *linuxContainer) checkCriuVersion(minVersion string) error { + var x, y, z, versionReq int + + _, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2 + if err != nil { + _, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6 + } + versionReq = x*10000 + y*100 + z + + out, err := exec.Command(c.criuPath, "-V").Output() + if err != nil { + return fmt.Errorf("Unable to execute CRIU command: %s", c.criuPath) + } + + x = 0 + y = 0 + z = 0 + if ep := strings.Index(string(out), "-"); ep >= 0 { + // criu Git version format + var version string + if sp := strings.Index(string(out), "GitID"); sp > 0 { + version = string(out)[sp:ep] + } else { + return fmt.Errorf("Unable to parse the CRIU version: %s", c.criuPath) + } + + n, err := fmt.Sscanf(string(version), "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2 + if err != nil { + n, err = fmt.Sscanf(string(version), "GitID: v%d.%d", &x, &y) // 1.6 + y++ + } else { + z++ + } + if n < 2 || err != nil { + return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err) + } + } else { + // criu release version format + n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2 + if err != nil { + n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6 + } + if n < 2 || err != nil { + return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err) + } + } + + c.criuVersion = x*10000 + y*100 + z + + if c.criuVersion < versionReq { + return fmt.Errorf("CRIU version must be %s or higher", minVersion) + } + + return nil +} + +const descriptorsFilename = "descriptors.json" + +func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) { + mountDest := m.Destination + if strings.HasPrefix(mountDest, c.config.Rootfs) { + mountDest = mountDest[len(c.config.Rootfs):] + } + + extMnt := &criurpc.ExtMountMap{ + Key: proto.String(mountDest), + Val: proto.String(mountDest), + } + req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) +} + +func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { + c.m.Lock() + defer c.m.Unlock() + + if err := c.checkCriuVersion("1.5.2"); err != nil { + return err + } + + if criuOpts.ImagesDirectory == "" { + return fmt.Errorf("invalid directory to save checkpoint") + } + + // Since a container can be C/R'ed multiple times, + // the checkpoint directory may already exist. + if err := os.Mkdir(criuOpts.ImagesDirectory, 0755); err != nil && !os.IsExist(err) { + return err + } + + if criuOpts.WorkDirectory == "" { + criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") + } + + if err := os.Mkdir(criuOpts.WorkDirectory, 0755); err != nil && !os.IsExist(err) { + return err + } + + workDir, err := os.Open(criuOpts.WorkDirectory) + if err != nil { + return err + } + defer workDir.Close() + + imageDir, err := os.Open(criuOpts.ImagesDirectory) + if err != nil { + return err + } + defer imageDir.Close() + + rpcOpts := criurpc.CriuOpts{ + ImagesDirFd: proto.Int32(int32(imageDir.Fd())), + WorkDirFd: proto.Int32(int32(workDir.Fd())), + LogLevel: proto.Int32(4), + LogFile: proto.String("dump.log"), + Root: proto.String(c.config.Rootfs), + ManageCgroups: proto.Bool(true), + NotifyScripts: proto.Bool(true), + Pid: proto.Int32(int32(c.initProcess.pid())), + ShellJob: proto.Bool(criuOpts.ShellJob), + LeaveRunning: proto.Bool(criuOpts.LeaveRunning), + TcpEstablished: proto.Bool(criuOpts.TcpEstablished), + ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), + FileLocks: proto.Bool(criuOpts.FileLocks), + EmptyNs: proto.Uint32(criuOpts.EmptyNs), + } + + // append optional criu opts, e.g., page-server and port + if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 { + rpcOpts.Ps = &criurpc.CriuPageServerInfo{ + Address: proto.String(criuOpts.PageServer.Address), + Port: proto.Int32(criuOpts.PageServer.Port), + } + } + + // append optional manage cgroups mode + if criuOpts.ManageCgroupsMode != 0 { + if err := c.checkCriuVersion("1.7"); err != nil { + return err + } + mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) + rpcOpts.ManageCgroupsMode = &mode + } + + t := criurpc.CriuReqType_DUMP + req := &criurpc.CriuReq{ + Type: &t, + Opts: &rpcOpts, + } + + for _, m := range c.config.Mounts { + switch m.Device { + case "bind": + c.addCriuDumpMount(req, m) + break + case "cgroup": + binds, err := getCgroupMounts(m) + if err != nil { + return err + } + for _, b := range binds { + c.addCriuDumpMount(req, b) + } + break + } + } + + // Write the FD info to a file in the image directory + + fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors()) + if err != nil { + return err + } + + err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655) + if err != nil { + return err + } + + err = c.criuSwrk(nil, req, criuOpts, false) + if err != nil { + return err + } + return nil +} + +func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) { + mountDest := m.Destination + if strings.HasPrefix(mountDest, c.config.Rootfs) { + mountDest = mountDest[len(c.config.Rootfs):] + } + + extMnt := &criurpc.ExtMountMap{ + Key: proto.String(mountDest), + Val: proto.String(m.Source), + } + req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) +} + +func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) { + for _, iface := range c.config.Networks { + switch iface.Type { + case "veth": + veth := new(criurpc.CriuVethPair) + veth.IfOut = proto.String(iface.HostInterfaceName) + veth.IfIn = proto.String(iface.Name) + req.Opts.Veths = append(req.Opts.Veths, veth) + break + case "loopback": + break + } + } + for _, i := range criuOpts.VethPairs { + veth := new(criurpc.CriuVethPair) + veth.IfOut = proto.String(i.HostInterfaceName) + veth.IfIn = proto.String(i.ContainerInterfaceName) + req.Opts.Veths = append(req.Opts.Veths, veth) + } +} + +func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { + c.m.Lock() + defer c.m.Unlock() + if err := c.checkCriuVersion("1.5.2"); err != nil { + return err + } + if criuOpts.WorkDirectory == "" { + criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") + } + // Since a container can be C/R'ed multiple times, + // the work directory may already exist. + if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) { + return err + } + workDir, err := os.Open(criuOpts.WorkDirectory) + if err != nil { + return err + } + defer workDir.Close() + if criuOpts.ImagesDirectory == "" { + return fmt.Errorf("invalid directory to restore checkpoint") + } + imageDir, err := os.Open(criuOpts.ImagesDirectory) + if err != nil { + return err + } + defer imageDir.Close() + // CRIU has a few requirements for a root directory: + // * it must be a mount point + // * its parent must not be overmounted + // c.config.Rootfs is bind-mounted to a temporary directory + // to satisfy these requirements. + root := filepath.Join(c.root, "criu-root") + if err := os.Mkdir(root, 0755); err != nil { + return err + } + defer os.Remove(root) + root, err = filepath.EvalSymlinks(root) + if err != nil { + return err + } + err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "") + if err != nil { + return err + } + defer syscall.Unmount(root, syscall.MNT_DETACH) + t := criurpc.CriuReqType_RESTORE + req := &criurpc.CriuReq{ + Type: &t, + Opts: &criurpc.CriuOpts{ + ImagesDirFd: proto.Int32(int32(imageDir.Fd())), + WorkDirFd: proto.Int32(int32(workDir.Fd())), + EvasiveDevices: proto.Bool(true), + LogLevel: proto.Int32(4), + LogFile: proto.String("restore.log"), + RstSibling: proto.Bool(true), + Root: proto.String(root), + ManageCgroups: proto.Bool(true), + NotifyScripts: proto.Bool(true), + ShellJob: proto.Bool(criuOpts.ShellJob), + ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), + TcpEstablished: proto.Bool(criuOpts.TcpEstablished), + FileLocks: proto.Bool(criuOpts.FileLocks), + EmptyNs: proto.Uint32(criuOpts.EmptyNs), + }, + } + + for _, m := range c.config.Mounts { + switch m.Device { + case "bind": + c.addCriuRestoreMount(req, m) + break + case "cgroup": + binds, err := getCgroupMounts(m) + if err != nil { + return err + } + for _, b := range binds { + c.addCriuRestoreMount(req, b) + } + break + } + } + + if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 { + c.restoreNetwork(req, criuOpts) + } + + // append optional manage cgroups mode + if criuOpts.ManageCgroupsMode != 0 { + if err := c.checkCriuVersion("1.7"); err != nil { + return err + } + mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) + req.Opts.ManageCgroupsMode = &mode + } + + var ( + fds []string + fdJSON []byte + ) + if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { + return err + } + + if err := json.Unmarshal(fdJSON, &fds); err != nil { + return err + } + for i := range fds { + if s := fds[i]; strings.Contains(s, "pipe:") { + inheritFd := new(criurpc.InheritFd) + inheritFd.Key = proto.String(s) + inheritFd.Fd = proto.Int32(int32(i)) + req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) + } + } + return c.criuSwrk(process, req, criuOpts, true) +} + +func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { + if err := c.cgroupManager.Apply(pid); err != nil { + return err + } + + path := fmt.Sprintf("/proc/%d/cgroup", pid) + cgroupsPaths, err := cgroups.ParseCgroupFile(path) + if err != nil { + return err + } + + for c, p := range cgroupsPaths { + cgroupRoot := &criurpc.CgroupRoot{ + Ctrl: proto.String(c), + Path: proto.String(p), + } + req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot) + } + + return nil +} + +func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error { + fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return err + } + + logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile()) + criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client") + criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server") + defer criuClient.Close() + defer criuServer.Close() + + args := []string{"swrk", "3"} + logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath) + logrus.Debugf("Using CRIU with following args: %s", args) + cmd := exec.Command(c.criuPath, args...) + if process != nil { + cmd.Stdin = process.Stdin + cmd.Stdout = process.Stdout + cmd.Stderr = process.Stderr + } + cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) + + if err := cmd.Start(); err != nil { + return err + } + criuServer.Close() + + defer func() { + criuClient.Close() + _, err := cmd.Process.Wait() + if err != nil { + return + } + }() + + if applyCgroups { + err := c.criuApplyCgroups(cmd.Process.Pid, req) + if err != nil { + return err + } + } + + var extFds []string + if process != nil { + extFds, err = getPipeFds(cmd.Process.Pid) + if err != nil { + return err + } + } + + logrus.Debugf("Using CRIU in %s mode", req.GetType().String()) + val := reflect.ValueOf(req.GetOpts()) + v := reflect.Indirect(val) + for i := 0; i < v.NumField(); i++ { + st := v.Type() + name := st.Field(i).Name + if strings.HasPrefix(name, "XXX_") { + continue + } + value := val.MethodByName("Get" + name).Call([]reflect.Value{}) + logrus.Debugf("CRIU option %s with value %v", name, value[0]) + } + data, err := proto.Marshal(req) + if err != nil { + return err + } + _, err = criuClient.Write(data) + if err != nil { + return err + } + + buf := make([]byte, 10*4096) + for true { + n, err := criuClient.Read(buf) + if err != nil { + return err + } + if n == 0 { + return fmt.Errorf("unexpected EOF") + } + if n == len(buf) { + return fmt.Errorf("buffer is too small") + } + + resp := new(criurpc.CriuResp) + err = proto.Unmarshal(buf[:n], resp) + if err != nil { + return err + } + if !resp.GetSuccess() { + typeString := req.GetType().String() + return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath) + } + + t := resp.GetType() + switch { + case t == criurpc.CriuReqType_NOTIFY: + if err := c.criuNotifications(resp, process, opts, extFds); err != nil { + return err + } + t = criurpc.CriuReqType_NOTIFY + req = &criurpc.CriuReq{ + Type: &t, + NotifySuccess: proto.Bool(true), + } + data, err = proto.Marshal(req) + if err != nil { + return err + } + _, err = criuClient.Write(data) + if err != nil { + return err + } + continue + case t == criurpc.CriuReqType_RESTORE: + case t == criurpc.CriuReqType_DUMP: + break + default: + return fmt.Errorf("unable to parse the response %s", resp.String()) + } + + break + } + + // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors. + // Here we want to wait only the CRIU process. + st, err := cmd.Process.Wait() + if err != nil { + return err + } + if !st.Success() { + return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath) + } + return nil +} + +// block any external network activity +func lockNetwork(config *configs.Config) error { + for _, config := range config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + + if err := strategy.detach(config); err != nil { + return err + } + } + return nil +} + +func unlockNetwork(config *configs.Config) error { + for _, config := range config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + if err = strategy.attach(config); err != nil { + return err + } + } + return nil +} + +func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string) error { + notify := resp.GetNotify() + if notify == nil { + return fmt.Errorf("invalid response: %s", resp.String()) + } + switch { + case notify.GetScript() == "post-dump": + f, err := os.Create(filepath.Join(c.root, "checkpoint")) + if err != nil { + return err + } + f.Close() + case notify.GetScript() == "network-unlock": + if err := unlockNetwork(c.config); err != nil { + return err + } + case notify.GetScript() == "network-lock": + if err := lockNetwork(c.config); err != nil { + return err + } + case notify.GetScript() == "setup-namespaces": + if c.config.Hooks != nil { + s := configs.HookState{ + Version: c.config.Version, + ID: c.id, + Pid: int(notify.GetPid()), + Root: c.config.Rootfs, + } + for i, hook := range c.config.Hooks.Prestart { + if err := hook.Run(s); err != nil { + return newSystemErrorWithCausef(err, "running prestart hook %d", i) + } + } + } + case notify.GetScript() == "post-restore": + pid := notify.GetPid() + r, err := newRestoredProcess(int(pid), fds) + if err != nil { + return err + } + process.ops = r + if err := c.state.transition(&restoredState{ + imageDir: opts.ImagesDirectory, + c: c, + }); err != nil { + return err + } + if err := c.updateState(r); err != nil { + return err + } + if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil { + if !os.IsNotExist(err) { + logrus.Error(err) + } + } + } + return nil +} + +func (c *linuxContainer) updateState(process parentProcess) error { + c.initProcess = process + state, err := c.currentState() + if err != nil { + return err + } + return c.saveState(state) +} + +func (c *linuxContainer) saveState(s *State) error { + f, err := os.Create(filepath.Join(c.root, stateFilename)) + if err != nil { + return err + } + defer f.Close() + return utils.WriteJSON(f, s) +} + +func (c *linuxContainer) deleteState() error { + return os.Remove(filepath.Join(c.root, stateFilename)) +} + +func (c *linuxContainer) currentStatus() (Status, error) { + if err := c.refreshState(); err != nil { + return -1, err + } + return c.state.status(), nil +} + +// refreshState needs to be called to verify that the current state on the +// container is what is true. Because consumers of libcontainer can use it +// out of process we need to verify the container's status based on runtime +// information and not rely on our in process info. +func (c *linuxContainer) refreshState() error { + paused, err := c.isPaused() + if err != nil { + return err + } + if paused { + return c.state.transition(&pausedState{c: c}) + } + t, err := c.runType() + if err != nil { + return err + } + switch t { + case Created: + return c.state.transition(&createdState{c: c}) + case Running: + return c.state.transition(&runningState{c: c}) + } + return c.state.transition(&stoppedState{c: c}) +} + +func (c *linuxContainer) runType() (Status, error) { + if c.initProcess == nil { + return Stopped, nil + } + pid := c.initProcess.pid() + // return Running if the init process is alive + if err := syscall.Kill(pid, 0); err != nil { + if err == syscall.ESRCH { + // It means the process does not exist anymore, could happen when the + // process exited just when we call the function, we should not return + // error in this case. + return Stopped, nil + } + return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid) + } + // check if the process that is running is the init process or the user's process. + // this is the difference between the container Running and Created. + environ, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/environ", pid)) + if err != nil { + return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", pid) + } + check := []byte("_LIBCONTAINER") + if bytes.Contains(environ, check) { + return Created, nil + } + return Running, nil +} + +func (c *linuxContainer) isPaused() (bool, error) { + data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state")) + if err != nil { + // If freezer cgroup is not mounted, the container would just be not paused. + if os.IsNotExist(err) { + return false, nil + } + return false, newSystemErrorWithCause(err, "checking if container is paused") + } + return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil +} + +func (c *linuxContainer) currentState() (*State, error) { + var ( + startTime string + externalDescriptors []string + pid = -1 + ) + if c.initProcess != nil { + pid = c.initProcess.pid() + startTime, _ = c.initProcess.startTime() + externalDescriptors = c.initProcess.externalDescriptors() + } + state := &State{ + BaseState: BaseState{ + ID: c.ID(), + Config: *c.config, + InitProcessPid: pid, + InitProcessStartTime: startTime, + Created: c.created, + }, + CgroupPaths: c.cgroupManager.GetPaths(), + NamespacePaths: make(map[configs.NamespaceType]string), + ExternalDescriptors: externalDescriptors, + } + if pid > 0 { + for _, ns := range c.config.Namespaces { + state.NamespacePaths[ns.Type] = ns.GetPath(pid) + } + for _, nsType := range configs.NamespaceTypes() { + if !configs.IsNamespaceSupported(nsType) { + continue + } + if _, ok := state.NamespacePaths[nsType]; !ok { + ns := configs.Namespace{Type: nsType} + state.NamespacePaths[ns.Type] = ns.GetPath(pid) + } + } + } + return state, nil +} + +// orderNamespacePaths sorts namespace paths into a list of paths that we +// can setns in order. +func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { + paths := []string{} + nsTypes := []configs.NamespaceType{ + configs.NEWIPC, + configs.NEWUTS, + configs.NEWNET, + configs.NEWPID, + configs.NEWNS, + } + // join userns if the init process explicitly requires NEWUSER + if c.config.Namespaces.Contains(configs.NEWUSER) { + nsTypes = append(nsTypes, configs.NEWUSER) + } + for _, nsType := range nsTypes { + if p, ok := namespaces[nsType]; ok && p != "" { + // check if the requested namespace is supported + if !configs.IsNamespaceSupported(nsType) { + return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType)) + } + // only set to join this namespace if it exists + if _, err := os.Lstat(p); err != nil { + return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p) + } + // do not allow namespace path with comma as we use it to separate + // the namespace paths + if strings.ContainsRune(p, ',') { + return nil, newSystemError(fmt.Errorf("invalid path %s", p)) + } + paths = append(paths, p) + } + } + return paths, nil +} + +func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { + data := bytes.NewBuffer(nil) + for _, im := range idMap { + line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) + if _, err := data.WriteString(line); err != nil { + return nil, err + } + } + return data.Bytes(), nil +} + +// bootstrapData encodes the necessary data in netlink binary format +// as a io.Reader. +// Consumer can write the data to a bootstrap program +// such as one that uses nsenter package to bootstrap the container's +// init process correctly, i.e. with correct namespaces, uid/gid +// mapping etc. +func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) { + // create the netlink message + r := nl.NewNetlinkRequest(int(InitMsg), 0) + + // write cloneFlags + r.AddData(&Int32msg{ + Type: CloneFlagsAttr, + Value: uint32(cloneFlags), + }) + + // write console path + if consolePath != "" { + r.AddData(&Bytemsg{ + Type: ConsolePathAttr, + Value: []byte(consolePath), + }) + } + + // write custom namespace paths + if len(nsMaps) > 0 { + nsPaths, err := c.orderNamespacePaths(nsMaps) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: NsPathsAttr, + Value: []byte(strings.Join(nsPaths, ",")), + }) + } + + // write namespace paths only when we are not joining an existing user ns + _, joinExistingUser := nsMaps[configs.NEWUSER] + if !joinExistingUser { + // write uid mappings + if len(c.config.UidMappings) > 0 { + b, err := encodeIDMapping(c.config.UidMappings) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: UidmapAttr, + Value: b, + }) + } + + // write gid mappings + if len(c.config.GidMappings) > 0 { + b, err := encodeIDMapping(c.config.GidMappings) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: GidmapAttr, + Value: b, + }) + // check if we have CAP_SETGID to setgroup properly + pid, err := capability.NewPid(os.Getpid()) + if err != nil { + return nil, err + } + if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { + r.AddData(&Boolmsg{ + Type: SetgroupAttr, + Value: true, + }) + } + } + } + + return bytes.NewReader(r.Serialize()), nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/container_solaris.go b/vendor/src/github.com/opencontainers/runc/libcontainer/container_solaris.go new file mode 100644 index 00000000..bb84ff74 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/container_solaris.go @@ -0,0 +1,20 @@ +package libcontainer + +// State represents a running container's state +type State struct { + BaseState + + // Platform specific fields below here +} + +// A libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. +type Container interface { + BaseContainer + + // Methods below here are platform specific + +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/container_windows.go b/vendor/src/github.com/opencontainers/runc/libcontainer/container_windows.go new file mode 100644 index 00000000..bb84ff74 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/container_windows.go @@ -0,0 +1,20 @@ +package libcontainer + +// State represents a running container's state +type State struct { + BaseState + + // Platform specific fields below here +} + +// A libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. +type Container interface { + BaseContainer + + // Methods below here are platform specific + +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go b/vendor/src/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go new file mode 100644 index 00000000..b163fbbb --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go @@ -0,0 +1,37 @@ +// +build linux freebsd + +package libcontainer + +// cgroup restoring strategy provided by criu +type cgMode uint32 + +const ( + CRIU_CG_MODE_SOFT cgMode = 3 + iota // restore cgroup properties if only dir created by criu + CRIU_CG_MODE_FULL // always restore all cgroups and their properties + CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system + CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT +) + +type CriuPageServerInfo struct { + Address string // IP address of CRIU page server + Port int32 // port number of CRIU page server +} + +type VethPairName struct { + ContainerInterfaceName string + HostInterfaceName string +} + +type CriuOpts struct { + ImagesDirectory string // directory for storing image files + WorkDirectory string // directory to cd and write logs/pidfiles/stats to + LeaveRunning bool // leave container in running state after checkpoint + TcpEstablished bool // checkpoint/restore established TCP connections + ExternalUnixConnections bool // allow external unix connections + ShellJob bool // allow to dump and restore shell jobs + FileLocks bool // handle file locks, for safety + PageServer CriuPageServerInfo // allow to dump to criu page server + VethPairs []VethPairName // pass the veth to criu when restore + ManageCgroupsMode cgMode // dump or restore cgroup mode + EmptyNs uint32 // don't c/r properties for namespace from this mask +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/criu_opts_windows.go b/vendor/src/github.com/opencontainers/runc/libcontainer/criu_opts_windows.go new file mode 100644 index 00000000..bc920770 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/criu_opts_windows.go @@ -0,0 +1,6 @@ +package libcontainer + +// TODO Windows: This can ultimately be entirely factored out as criu is +// a Unix concept not relevant on Windows. +type CriuOpts struct { +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/criurpc/Makefile b/vendor/src/github.com/opencontainers/runc/libcontainer/criurpc/Makefile new file mode 100644 index 00000000..3e5346a3 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/criurpc/Makefile @@ -0,0 +1,2 @@ +gen: criurpc.proto + protoc --go_out=. criurpc.proto diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go b/vendor/src/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go new file mode 100644 index 00000000..3c4fb773 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go @@ -0,0 +1,822 @@ +// Code generated by protoc-gen-go. +// source: criurpc.proto +// DO NOT EDIT! + +/* +Package criurpc is a generated protocol buffer package. + +It is generated from these files: + criurpc.proto + +It has these top-level messages: + CriuPageServerInfo + CriuVethPair + ExtMountMap + InheritFd + CgroupRoot + UnixSk + CriuOpts + CriuDumpResp + CriuRestoreResp + CriuNotify + CriuFeatures + CriuReq + CriuResp +*/ +package criurpc + +import proto "github.com/golang/protobuf/proto" +import math "math" + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = math.Inf + +type CriuCgMode int32 + +const ( + CriuCgMode_IGNORE CriuCgMode = 0 + CriuCgMode_NONE CriuCgMode = 1 + CriuCgMode_PROPS CriuCgMode = 2 + CriuCgMode_SOFT CriuCgMode = 3 + CriuCgMode_FULL CriuCgMode = 4 + CriuCgMode_STRICT CriuCgMode = 5 + CriuCgMode_DEFAULT CriuCgMode = 6 +) + +var CriuCgMode_name = map[int32]string{ + 0: "IGNORE", + 1: "NONE", + 2: "PROPS", + 3: "SOFT", + 4: "FULL", + 5: "STRICT", + 6: "DEFAULT", +} +var CriuCgMode_value = map[string]int32{ + "IGNORE": 0, + "NONE": 1, + "PROPS": 2, + "SOFT": 3, + "FULL": 4, + "STRICT": 5, + "DEFAULT": 6, +} + +func (x CriuCgMode) Enum() *CriuCgMode { + p := new(CriuCgMode) + *p = x + return p +} +func (x CriuCgMode) String() string { + return proto.EnumName(CriuCgMode_name, int32(x)) +} +func (x *CriuCgMode) UnmarshalJSON(data []byte) error { + value, err := proto.UnmarshalJSONEnum(CriuCgMode_value, data, "CriuCgMode") + if err != nil { + return err + } + *x = CriuCgMode(value) + return nil +} + +type CriuReqType int32 + +const ( + CriuReqType_EMPTY CriuReqType = 0 + CriuReqType_DUMP CriuReqType = 1 + CriuReqType_RESTORE CriuReqType = 2 + CriuReqType_CHECK CriuReqType = 3 + CriuReqType_PRE_DUMP CriuReqType = 4 + CriuReqType_PAGE_SERVER CriuReqType = 5 + CriuReqType_NOTIFY CriuReqType = 6 + CriuReqType_CPUINFO_DUMP CriuReqType = 7 + CriuReqType_CPUINFO_CHECK CriuReqType = 8 + CriuReqType_FEATURE_CHECK CriuReqType = 9 +) + +var CriuReqType_name = map[int32]string{ + 0: "EMPTY", + 1: "DUMP", + 2: "RESTORE", + 3: "CHECK", + 4: "PRE_DUMP", + 5: "PAGE_SERVER", + 6: "NOTIFY", + 7: "CPUINFO_DUMP", + 8: "CPUINFO_CHECK", + 9: "FEATURE_CHECK", +} +var CriuReqType_value = map[string]int32{ + "EMPTY": 0, + "DUMP": 1, + "RESTORE": 2, + "CHECK": 3, + "PRE_DUMP": 4, + "PAGE_SERVER": 5, + "NOTIFY": 6, + "CPUINFO_DUMP": 7, + "CPUINFO_CHECK": 8, + "FEATURE_CHECK": 9, +} + +func (x CriuReqType) Enum() *CriuReqType { + p := new(CriuReqType) + *p = x + return p +} +func (x CriuReqType) String() string { + return proto.EnumName(CriuReqType_name, int32(x)) +} +func (x *CriuReqType) UnmarshalJSON(data []byte) error { + value, err := proto.UnmarshalJSONEnum(CriuReqType_value, data, "CriuReqType") + if err != nil { + return err + } + *x = CriuReqType(value) + return nil +} + +type CriuPageServerInfo struct { + Address *string `protobuf:"bytes,1,opt,name=address" json:"address,omitempty"` + Port *int32 `protobuf:"varint,2,opt,name=port" json:"port,omitempty"` + Pid *int32 `protobuf:"varint,3,opt,name=pid" json:"pid,omitempty"` + Fd *int32 `protobuf:"varint,4,opt,name=fd" json:"fd,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuPageServerInfo) Reset() { *m = CriuPageServerInfo{} } +func (m *CriuPageServerInfo) String() string { return proto.CompactTextString(m) } +func (*CriuPageServerInfo) ProtoMessage() {} + +func (m *CriuPageServerInfo) GetAddress() string { + if m != nil && m.Address != nil { + return *m.Address + } + return "" +} + +func (m *CriuPageServerInfo) GetPort() int32 { + if m != nil && m.Port != nil { + return *m.Port + } + return 0 +} + +func (m *CriuPageServerInfo) GetPid() int32 { + if m != nil && m.Pid != nil { + return *m.Pid + } + return 0 +} + +func (m *CriuPageServerInfo) GetFd() int32 { + if m != nil && m.Fd != nil { + return *m.Fd + } + return 0 +} + +type CriuVethPair struct { + IfIn *string `protobuf:"bytes,1,req,name=if_in" json:"if_in,omitempty"` + IfOut *string `protobuf:"bytes,2,req,name=if_out" json:"if_out,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuVethPair) Reset() { *m = CriuVethPair{} } +func (m *CriuVethPair) String() string { return proto.CompactTextString(m) } +func (*CriuVethPair) ProtoMessage() {} + +func (m *CriuVethPair) GetIfIn() string { + if m != nil && m.IfIn != nil { + return *m.IfIn + } + return "" +} + +func (m *CriuVethPair) GetIfOut() string { + if m != nil && m.IfOut != nil { + return *m.IfOut + } + return "" +} + +type ExtMountMap struct { + Key *string `protobuf:"bytes,1,req,name=key" json:"key,omitempty"` + Val *string `protobuf:"bytes,2,req,name=val" json:"val,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *ExtMountMap) Reset() { *m = ExtMountMap{} } +func (m *ExtMountMap) String() string { return proto.CompactTextString(m) } +func (*ExtMountMap) ProtoMessage() {} + +func (m *ExtMountMap) GetKey() string { + if m != nil && m.Key != nil { + return *m.Key + } + return "" +} + +func (m *ExtMountMap) GetVal() string { + if m != nil && m.Val != nil { + return *m.Val + } + return "" +} + +type InheritFd struct { + Key *string `protobuf:"bytes,1,req,name=key" json:"key,omitempty"` + Fd *int32 `protobuf:"varint,2,req,name=fd" json:"fd,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *InheritFd) Reset() { *m = InheritFd{} } +func (m *InheritFd) String() string { return proto.CompactTextString(m) } +func (*InheritFd) ProtoMessage() {} + +func (m *InheritFd) GetKey() string { + if m != nil && m.Key != nil { + return *m.Key + } + return "" +} + +func (m *InheritFd) GetFd() int32 { + if m != nil && m.Fd != nil { + return *m.Fd + } + return 0 +} + +type CgroupRoot struct { + Ctrl *string `protobuf:"bytes,1,opt,name=ctrl" json:"ctrl,omitempty"` + Path *string `protobuf:"bytes,2,req,name=path" json:"path,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CgroupRoot) Reset() { *m = CgroupRoot{} } +func (m *CgroupRoot) String() string { return proto.CompactTextString(m) } +func (*CgroupRoot) ProtoMessage() {} + +func (m *CgroupRoot) GetCtrl() string { + if m != nil && m.Ctrl != nil { + return *m.Ctrl + } + return "" +} + +func (m *CgroupRoot) GetPath() string { + if m != nil && m.Path != nil { + return *m.Path + } + return "" +} + +type UnixSk struct { + Inode *uint32 `protobuf:"varint,1,req,name=inode" json:"inode,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *UnixSk) Reset() { *m = UnixSk{} } +func (m *UnixSk) String() string { return proto.CompactTextString(m) } +func (*UnixSk) ProtoMessage() {} + +func (m *UnixSk) GetInode() uint32 { + if m != nil && m.Inode != nil { + return *m.Inode + } + return 0 +} + +type CriuOpts struct { + ImagesDirFd *int32 `protobuf:"varint,1,req,name=images_dir_fd" json:"images_dir_fd,omitempty"` + Pid *int32 `protobuf:"varint,2,opt,name=pid" json:"pid,omitempty"` + LeaveRunning *bool `protobuf:"varint,3,opt,name=leave_running" json:"leave_running,omitempty"` + ExtUnixSk *bool `protobuf:"varint,4,opt,name=ext_unix_sk" json:"ext_unix_sk,omitempty"` + TcpEstablished *bool `protobuf:"varint,5,opt,name=tcp_established" json:"tcp_established,omitempty"` + EvasiveDevices *bool `protobuf:"varint,6,opt,name=evasive_devices" json:"evasive_devices,omitempty"` + ShellJob *bool `protobuf:"varint,7,opt,name=shell_job" json:"shell_job,omitempty"` + FileLocks *bool `protobuf:"varint,8,opt,name=file_locks" json:"file_locks,omitempty"` + LogLevel *int32 `protobuf:"varint,9,opt,name=log_level,def=2" json:"log_level,omitempty"` + LogFile *string `protobuf:"bytes,10,opt,name=log_file" json:"log_file,omitempty"` + Ps *CriuPageServerInfo `protobuf:"bytes,11,opt,name=ps" json:"ps,omitempty"` + NotifyScripts *bool `protobuf:"varint,12,opt,name=notify_scripts" json:"notify_scripts,omitempty"` + Root *string `protobuf:"bytes,13,opt,name=root" json:"root,omitempty"` + ParentImg *string `protobuf:"bytes,14,opt,name=parent_img" json:"parent_img,omitempty"` + TrackMem *bool `protobuf:"varint,15,opt,name=track_mem" json:"track_mem,omitempty"` + AutoDedup *bool `protobuf:"varint,16,opt,name=auto_dedup" json:"auto_dedup,omitempty"` + WorkDirFd *int32 `protobuf:"varint,17,opt,name=work_dir_fd" json:"work_dir_fd,omitempty"` + LinkRemap *bool `protobuf:"varint,18,opt,name=link_remap" json:"link_remap,omitempty"` + Veths []*CriuVethPair `protobuf:"bytes,19,rep,name=veths" json:"veths,omitempty"` + CpuCap *uint32 `protobuf:"varint,20,opt,name=cpu_cap,def=4294967295" json:"cpu_cap,omitempty"` + ForceIrmap *bool `protobuf:"varint,21,opt,name=force_irmap" json:"force_irmap,omitempty"` + ExecCmd []string `protobuf:"bytes,22,rep,name=exec_cmd" json:"exec_cmd,omitempty"` + ExtMnt []*ExtMountMap `protobuf:"bytes,23,rep,name=ext_mnt" json:"ext_mnt,omitempty"` + ManageCgroups *bool `protobuf:"varint,24,opt,name=manage_cgroups" json:"manage_cgroups,omitempty"` + CgRoot []*CgroupRoot `protobuf:"bytes,25,rep,name=cg_root" json:"cg_root,omitempty"` + RstSibling *bool `protobuf:"varint,26,opt,name=rst_sibling" json:"rst_sibling,omitempty"` + InheritFd []*InheritFd `protobuf:"bytes,27,rep,name=inherit_fd" json:"inherit_fd,omitempty"` + AutoExtMnt *bool `protobuf:"varint,28,opt,name=auto_ext_mnt" json:"auto_ext_mnt,omitempty"` + ExtSharing *bool `protobuf:"varint,29,opt,name=ext_sharing" json:"ext_sharing,omitempty"` + ExtMasters *bool `protobuf:"varint,30,opt,name=ext_masters" json:"ext_masters,omitempty"` + SkipMnt []string `protobuf:"bytes,31,rep,name=skip_mnt" json:"skip_mnt,omitempty"` + EnableFs []string `protobuf:"bytes,32,rep,name=enable_fs" json:"enable_fs,omitempty"` + UnixSkIno []*UnixSk `protobuf:"bytes,33,rep,name=unix_sk_ino" json:"unix_sk_ino,omitempty"` + ManageCgroupsMode *CriuCgMode `protobuf:"varint,34,opt,name=manage_cgroups_mode,enum=CriuCgMode" json:"manage_cgroups_mode,omitempty"` + GhostLimit *uint32 `protobuf:"varint,35,opt,name=ghost_limit,def=1048576" json:"ghost_limit,omitempty"` + IrmapScanPaths []string `protobuf:"bytes,36,rep,name=irmap_scan_paths" json:"irmap_scan_paths,omitempty"` + External []string `protobuf:"bytes,37,rep,name=external" json:"external,omitempty"` + EmptyNs *uint32 `protobuf:"varint,38,opt,name=empty_ns" json:"empty_ns,omitempty"` + NoSeccomp *bool `protobuf:"varint,39,opt,name=no_seccomp" json:"no_seccomp,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuOpts) Reset() { *m = CriuOpts{} } +func (m *CriuOpts) String() string { return proto.CompactTextString(m) } +func (*CriuOpts) ProtoMessage() {} + +const Default_CriuOpts_LogLevel int32 = 2 +const Default_CriuOpts_CpuCap uint32 = 4294967295 +const Default_CriuOpts_GhostLimit uint32 = 1048576 + +func (m *CriuOpts) GetImagesDirFd() int32 { + if m != nil && m.ImagesDirFd != nil { + return *m.ImagesDirFd + } + return 0 +} + +func (m *CriuOpts) GetPid() int32 { + if m != nil && m.Pid != nil { + return *m.Pid + } + return 0 +} + +func (m *CriuOpts) GetLeaveRunning() bool { + if m != nil && m.LeaveRunning != nil { + return *m.LeaveRunning + } + return false +} + +func (m *CriuOpts) GetExtUnixSk() bool { + if m != nil && m.ExtUnixSk != nil { + return *m.ExtUnixSk + } + return false +} + +func (m *CriuOpts) GetTcpEstablished() bool { + if m != nil && m.TcpEstablished != nil { + return *m.TcpEstablished + } + return false +} + +func (m *CriuOpts) GetEvasiveDevices() bool { + if m != nil && m.EvasiveDevices != nil { + return *m.EvasiveDevices + } + return false +} + +func (m *CriuOpts) GetShellJob() bool { + if m != nil && m.ShellJob != nil { + return *m.ShellJob + } + return false +} + +func (m *CriuOpts) GetFileLocks() bool { + if m != nil && m.FileLocks != nil { + return *m.FileLocks + } + return false +} + +func (m *CriuOpts) GetLogLevel() int32 { + if m != nil && m.LogLevel != nil { + return *m.LogLevel + } + return Default_CriuOpts_LogLevel +} + +func (m *CriuOpts) GetLogFile() string { + if m != nil && m.LogFile != nil { + return *m.LogFile + } + return "" +} + +func (m *CriuOpts) GetPs() *CriuPageServerInfo { + if m != nil { + return m.Ps + } + return nil +} + +func (m *CriuOpts) GetNotifyScripts() bool { + if m != nil && m.NotifyScripts != nil { + return *m.NotifyScripts + } + return false +} + +func (m *CriuOpts) GetRoot() string { + if m != nil && m.Root != nil { + return *m.Root + } + return "" +} + +func (m *CriuOpts) GetParentImg() string { + if m != nil && m.ParentImg != nil { + return *m.ParentImg + } + return "" +} + +func (m *CriuOpts) GetTrackMem() bool { + if m != nil && m.TrackMem != nil { + return *m.TrackMem + } + return false +} + +func (m *CriuOpts) GetAutoDedup() bool { + if m != nil && m.AutoDedup != nil { + return *m.AutoDedup + } + return false +} + +func (m *CriuOpts) GetWorkDirFd() int32 { + if m != nil && m.WorkDirFd != nil { + return *m.WorkDirFd + } + return 0 +} + +func (m *CriuOpts) GetLinkRemap() bool { + if m != nil && m.LinkRemap != nil { + return *m.LinkRemap + } + return false +} + +func (m *CriuOpts) GetVeths() []*CriuVethPair { + if m != nil { + return m.Veths + } + return nil +} + +func (m *CriuOpts) GetCpuCap() uint32 { + if m != nil && m.CpuCap != nil { + return *m.CpuCap + } + return Default_CriuOpts_CpuCap +} + +func (m *CriuOpts) GetForceIrmap() bool { + if m != nil && m.ForceIrmap != nil { + return *m.ForceIrmap + } + return false +} + +func (m *CriuOpts) GetExecCmd() []string { + if m != nil { + return m.ExecCmd + } + return nil +} + +func (m *CriuOpts) GetExtMnt() []*ExtMountMap { + if m != nil { + return m.ExtMnt + } + return nil +} + +func (m *CriuOpts) GetManageCgroups() bool { + if m != nil && m.ManageCgroups != nil { + return *m.ManageCgroups + } + return false +} + +func (m *CriuOpts) GetCgRoot() []*CgroupRoot { + if m != nil { + return m.CgRoot + } + return nil +} + +func (m *CriuOpts) GetRstSibling() bool { + if m != nil && m.RstSibling != nil { + return *m.RstSibling + } + return false +} + +func (m *CriuOpts) GetInheritFd() []*InheritFd { + if m != nil { + return m.InheritFd + } + return nil +} + +func (m *CriuOpts) GetAutoExtMnt() bool { + if m != nil && m.AutoExtMnt != nil { + return *m.AutoExtMnt + } + return false +} + +func (m *CriuOpts) GetExtSharing() bool { + if m != nil && m.ExtSharing != nil { + return *m.ExtSharing + } + return false +} + +func (m *CriuOpts) GetExtMasters() bool { + if m != nil && m.ExtMasters != nil { + return *m.ExtMasters + } + return false +} + +func (m *CriuOpts) GetSkipMnt() []string { + if m != nil { + return m.SkipMnt + } + return nil +} + +func (m *CriuOpts) GetEnableFs() []string { + if m != nil { + return m.EnableFs + } + return nil +} + +func (m *CriuOpts) GetUnixSkIno() []*UnixSk { + if m != nil { + return m.UnixSkIno + } + return nil +} + +func (m *CriuOpts) GetManageCgroupsMode() CriuCgMode { + if m != nil && m.ManageCgroupsMode != nil { + return *m.ManageCgroupsMode + } + return CriuCgMode_IGNORE +} + +func (m *CriuOpts) GetGhostLimit() uint32 { + if m != nil && m.GhostLimit != nil { + return *m.GhostLimit + } + return Default_CriuOpts_GhostLimit +} + +func (m *CriuOpts) GetIrmapScanPaths() []string { + if m != nil { + return m.IrmapScanPaths + } + return nil +} + +func (m *CriuOpts) GetExternal() []string { + if m != nil { + return m.External + } + return nil +} + +func (m *CriuOpts) GetEmptyNs() uint32 { + if m != nil && m.EmptyNs != nil { + return *m.EmptyNs + } + return 0 +} + +func (m *CriuOpts) GetNoSeccomp() bool { + if m != nil && m.NoSeccomp != nil { + return *m.NoSeccomp + } + return false +} + +type CriuDumpResp struct { + Restored *bool `protobuf:"varint,1,opt,name=restored" json:"restored,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuDumpResp) Reset() { *m = CriuDumpResp{} } +func (m *CriuDumpResp) String() string { return proto.CompactTextString(m) } +func (*CriuDumpResp) ProtoMessage() {} + +func (m *CriuDumpResp) GetRestored() bool { + if m != nil && m.Restored != nil { + return *m.Restored + } + return false +} + +type CriuRestoreResp struct { + Pid *int32 `protobuf:"varint,1,req,name=pid" json:"pid,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuRestoreResp) Reset() { *m = CriuRestoreResp{} } +func (m *CriuRestoreResp) String() string { return proto.CompactTextString(m) } +func (*CriuRestoreResp) ProtoMessage() {} + +func (m *CriuRestoreResp) GetPid() int32 { + if m != nil && m.Pid != nil { + return *m.Pid + } + return 0 +} + +type CriuNotify struct { + Script *string `protobuf:"bytes,1,opt,name=script" json:"script,omitempty"` + Pid *int32 `protobuf:"varint,2,opt,name=pid" json:"pid,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuNotify) Reset() { *m = CriuNotify{} } +func (m *CriuNotify) String() string { return proto.CompactTextString(m) } +func (*CriuNotify) ProtoMessage() {} + +func (m *CriuNotify) GetScript() string { + if m != nil && m.Script != nil { + return *m.Script + } + return "" +} + +func (m *CriuNotify) GetPid() int32 { + if m != nil && m.Pid != nil { + return *m.Pid + } + return 0 +} + +// +// List of features which can queried via +// CRIU_REQ_TYPE__FEATURE_CHECK +type CriuFeatures struct { + MemTrack *bool `protobuf:"varint,1,opt,name=mem_track" json:"mem_track,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuFeatures) Reset() { *m = CriuFeatures{} } +func (m *CriuFeatures) String() string { return proto.CompactTextString(m) } +func (*CriuFeatures) ProtoMessage() {} + +func (m *CriuFeatures) GetMemTrack() bool { + if m != nil && m.MemTrack != nil { + return *m.MemTrack + } + return false +} + +type CriuReq struct { + Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"` + Opts *CriuOpts `protobuf:"bytes,2,opt,name=opts" json:"opts,omitempty"` + NotifySuccess *bool `protobuf:"varint,3,opt,name=notify_success" json:"notify_success,omitempty"` + // + // When set service won't close the connection but + // will wait for more req-s to appear. Works not + // for all request types. + KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open" json:"keep_open,omitempty"` + // + // 'features' can be used to query which features + // are supported by the installed criu/kernel + // via RPC. + Features *CriuFeatures `protobuf:"bytes,5,opt,name=features" json:"features,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuReq) Reset() { *m = CriuReq{} } +func (m *CriuReq) String() string { return proto.CompactTextString(m) } +func (*CriuReq) ProtoMessage() {} + +func (m *CriuReq) GetType() CriuReqType { + if m != nil && m.Type != nil { + return *m.Type + } + return CriuReqType_EMPTY +} + +func (m *CriuReq) GetOpts() *CriuOpts { + if m != nil { + return m.Opts + } + return nil +} + +func (m *CriuReq) GetNotifySuccess() bool { + if m != nil && m.NotifySuccess != nil { + return *m.NotifySuccess + } + return false +} + +func (m *CriuReq) GetKeepOpen() bool { + if m != nil && m.KeepOpen != nil { + return *m.KeepOpen + } + return false +} + +func (m *CriuReq) GetFeatures() *CriuFeatures { + if m != nil { + return m.Features + } + return nil +} + +type CriuResp struct { + Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"` + Success *bool `protobuf:"varint,2,req,name=success" json:"success,omitempty"` + Dump *CriuDumpResp `protobuf:"bytes,3,opt,name=dump" json:"dump,omitempty"` + Restore *CriuRestoreResp `protobuf:"bytes,4,opt,name=restore" json:"restore,omitempty"` + Notify *CriuNotify `protobuf:"bytes,5,opt,name=notify" json:"notify,omitempty"` + Ps *CriuPageServerInfo `protobuf:"bytes,6,opt,name=ps" json:"ps,omitempty"` + CrErrno *int32 `protobuf:"varint,7,opt,name=cr_errno" json:"cr_errno,omitempty"` + Features *CriuFeatures `protobuf:"bytes,8,opt,name=features" json:"features,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuResp) Reset() { *m = CriuResp{} } +func (m *CriuResp) String() string { return proto.CompactTextString(m) } +func (*CriuResp) ProtoMessage() {} + +func (m *CriuResp) GetType() CriuReqType { + if m != nil && m.Type != nil { + return *m.Type + } + return CriuReqType_EMPTY +} + +func (m *CriuResp) GetSuccess() bool { + if m != nil && m.Success != nil { + return *m.Success + } + return false +} + +func (m *CriuResp) GetDump() *CriuDumpResp { + if m != nil { + return m.Dump + } + return nil +} + +func (m *CriuResp) GetRestore() *CriuRestoreResp { + if m != nil { + return m.Restore + } + return nil +} + +func (m *CriuResp) GetNotify() *CriuNotify { + if m != nil { + return m.Notify + } + return nil +} + +func (m *CriuResp) GetPs() *CriuPageServerInfo { + if m != nil { + return m.Ps + } + return nil +} + +func (m *CriuResp) GetCrErrno() int32 { + if m != nil && m.CrErrno != nil { + return *m.CrErrno + } + return 0 +} + +func (m *CriuResp) GetFeatures() *CriuFeatures { + if m != nil { + return m.Features + } + return nil +} + +func init() { + proto.RegisterEnum("CriuCgMode", CriuCgMode_name, CriuCgMode_value) + proto.RegisterEnum("CriuReqType", CriuReqType_name, CriuReqType_value) +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.proto b/vendor/src/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.proto new file mode 100644 index 00000000..34fa9884 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.proto @@ -0,0 +1,174 @@ +message criu_page_server_info { + optional string address = 1; + optional int32 port = 2; + optional int32 pid = 3; + optional int32 fd = 4; +} + +message criu_veth_pair { + required string if_in = 1; + required string if_out = 2; +}; + +message ext_mount_map { + required string key = 1; + required string val = 2; +}; + +message inherit_fd { + required string key = 1; + required int32 fd = 2; +}; + +message cgroup_root { + optional string ctrl = 1; + required string path = 2; +}; + +message unix_sk { + required uint32 inode = 1; +}; + +enum criu_cg_mode { + IGNORE = 0; + NONE = 1; + PROPS = 2; + SOFT = 3; + FULL = 4; + STRICT = 5; + DEFAULT = 6; +}; + +message criu_opts { + required int32 images_dir_fd = 1; + optional int32 pid = 2; /* if not set on dump, will dump requesting process */ + + optional bool leave_running = 3; + optional bool ext_unix_sk = 4; + optional bool tcp_established = 5; + optional bool evasive_devices = 6; + optional bool shell_job = 7; + optional bool file_locks = 8; + optional int32 log_level = 9 [default = 2]; + optional string log_file = 10; /* No subdirs are allowed. Consider using work-dir */ + + optional criu_page_server_info ps = 11; + + optional bool notify_scripts = 12; + + optional string root = 13; + optional string parent_img = 14; + optional bool track_mem = 15; + optional bool auto_dedup = 16; + + optional int32 work_dir_fd = 17; + optional bool link_remap = 18; + repeated criu_veth_pair veths = 19; + + optional uint32 cpu_cap = 20 [default = 0xffffffff]; + optional bool force_irmap = 21; + repeated string exec_cmd = 22; + + repeated ext_mount_map ext_mnt = 23; + optional bool manage_cgroups = 24; /* backward compatibility */ + repeated cgroup_root cg_root = 25; + + optional bool rst_sibling = 26; /* swrk only */ + repeated inherit_fd inherit_fd = 27; /* swrk only */ + + optional bool auto_ext_mnt = 28; + optional bool ext_sharing = 29; + optional bool ext_masters = 30; + + repeated string skip_mnt = 31; + repeated string enable_fs = 32; + + repeated unix_sk unix_sk_ino = 33; + + optional criu_cg_mode manage_cgroups_mode = 34; + optional uint32 ghost_limit = 35 [default = 0x100000]; + repeated string irmap_scan_paths = 36; + repeated string external = 37; + optional uint32 empty_ns = 38; + optional bool no_seccomp = 39; +} + +message criu_dump_resp { + optional bool restored = 1; +} + +message criu_restore_resp { + required int32 pid = 1; +} + +message criu_notify { + optional string script = 1; + optional int32 pid = 2; +} + +enum criu_req_type { + EMPTY = 0; + DUMP = 1; + RESTORE = 2; + CHECK = 3; + PRE_DUMP = 4; + PAGE_SERVER = 5; + + NOTIFY = 6; + + CPUINFO_DUMP = 7; + CPUINFO_CHECK = 8; + + FEATURE_CHECK = 9; +} + +/* + * List of features which can queried via + * CRIU_REQ_TYPE__FEATURE_CHECK + */ +message criu_features { + optional bool mem_track = 1; +} + +/* + * Request -- each type corresponds to must-be-there + * request arguments of respective type + */ + +message criu_req { + required criu_req_type type = 1; + + optional criu_opts opts = 2; + optional bool notify_success = 3; + + /* + * When set service won't close the connection but + * will wait for more req-s to appear. Works not + * for all request types. + */ + optional bool keep_open = 4; + /* + * 'features' can be used to query which features + * are supported by the installed criu/kernel + * via RPC. + */ + optional criu_features features = 5; +} + +/* + * Response -- it states whether the request was served + * and additional request-specific information + */ + +message criu_resp { + required criu_req_type type = 1; + required bool success = 2; + + optional criu_dump_resp dump = 3; + optional criu_restore_resp restore = 4; + optional criu_notify notify = 5; + optional criu_page_server_info ps = 6; + + optional int32 cr_errno = 7; + optional criu_features features = 8; +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/error.go b/vendor/src/github.com/opencontainers/runc/libcontainer/error.go new file mode 100644 index 00000000..b0639270 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/error.go @@ -0,0 +1,70 @@ +package libcontainer + +import "io" + +// ErrorCode is the API error code type. +type ErrorCode int + +// API error codes. +const ( + // Factory errors + IdInUse ErrorCode = iota + InvalidIdFormat + + // Container errors + ContainerNotExists + ContainerPaused + ContainerNotStopped + ContainerNotRunning + ContainerNotPaused + + // Process errors + NoProcessOps + + // Common errors + ConfigInvalid + ConsoleExists + SystemError +) + +func (c ErrorCode) String() string { + switch c { + case IdInUse: + return "Id already in use" + case InvalidIdFormat: + return "Invalid format" + case ContainerPaused: + return "Container paused" + case ConfigInvalid: + return "Invalid configuration" + case SystemError: + return "System error" + case ContainerNotExists: + return "Container does not exist" + case ContainerNotStopped: + return "Container is not stopped" + case ContainerNotRunning: + return "Container is not running" + case ConsoleExists: + return "Console exists for process" + case ContainerNotPaused: + return "Container is not paused" + case NoProcessOps: + return "No process operations" + default: + return "Unknown error" + } +} + +// Error is the API error type. +type Error interface { + error + + // Returns a verbose string including the error message + // and a representation of the stack trace suitable for + // printing. + Detail(w io.Writer) error + + // Returns the error code for this error. + Code() ErrorCode +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/factory.go b/vendor/src/github.com/opencontainers/runc/libcontainer/factory.go new file mode 100644 index 00000000..f0ccb52e --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/factory.go @@ -0,0 +1,45 @@ +package libcontainer + +import ( + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Factory interface { + // Creates a new container with the given id and starts the initial process inside it. + // id must be a string containing only letters, digits and underscores and must contain + // between 1 and 1024 characters, inclusive. + // + // The id must not already be in use by an existing container. Containers created using + // a factory with the same path (and file system) must have distinct ids. + // + // Returns the new container with a running process. + // + // errors: + // IdInUse - id is already in use by a container + // InvalidIdFormat - id has incorrect format + // ConfigInvalid - config is invalid + // Systemerror - System error + // + // On error, any partially created container parts are cleaned up (the operation is atomic). + Create(id string, config *configs.Config) (Container, error) + + // Load takes an ID for an existing container and returns the container information + // from the state. This presents a read only view of the container. + // + // errors: + // Path does not exist + // Container is stopped + // System error + Load(id string) (Container, error) + + // StartInitialization is an internal API to libcontainer used during the reexec of the + // container. + // + // Errors: + // Pipe connection error + // System error + StartInitialization() error + + // Type returns info string about factory type (e.g. lxc, libcontainer...) + Type() string +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/factory_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/factory_linux.go new file mode 100644 index 00000000..5f7b0064 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/factory_linux.go @@ -0,0 +1,319 @@ +// +build linux + +package libcontainer + +import ( + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "regexp" + "runtime/debug" + "strconv" + "syscall" + + "github.com/docker/docker/pkg/mount" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/configs/validate" + "github.com/opencontainers/runc/libcontainer/utils" +) + +const ( + stateFilename = "state.json" + execFifoFilename = "exec.fifo" +) + +var ( + idRegex = regexp.MustCompile(`^[\w+-\.]+$`) + maxIdLen = 1024 +) + +// InitArgs returns an options func to configure a LinuxFactory with the +// provided init arguments. +func InitArgs(args ...string) func(*LinuxFactory) error { + return func(l *LinuxFactory) error { + name := args[0] + if filepath.Base(name) == name { + if lp, err := exec.LookPath(name); err == nil { + name = lp + } + } else { + abs, err := filepath.Abs(name) + if err != nil { + return err + } + name = abs + } + l.InitPath = "/proc/self/exe" + l.InitArgs = append([]string{name}, args[1:]...) + return nil + } +} + +// InitPath returns an options func to configure a LinuxFactory with the +// provided absolute path to the init binary and arguements. +func InitPath(path string, args ...string) func(*LinuxFactory) error { + return func(l *LinuxFactory) error { + l.InitPath = path + l.InitArgs = args + return nil + } +} + +// SystemdCgroups is an options func to configure a LinuxFactory to return +// containers that use systemd to create and manage cgroups. +func SystemdCgroups(l *LinuxFactory) error { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &systemd.Manager{ + Cgroups: config, + Paths: paths, + } + } + return nil +} + +// Cgroupfs is an options func to configure a LinuxFactory to return +// containers that use the native cgroups filesystem implementation to +// create and manage cgroups. +func Cgroupfs(l *LinuxFactory) error { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &fs.Manager{ + Cgroups: config, + Paths: paths, + } + } + return nil +} + +// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs. +func TmpfsRoot(l *LinuxFactory) error { + mounted, err := mount.Mounted(l.Root) + if err != nil { + return err + } + if !mounted { + if err := syscall.Mount("tmpfs", l.Root, "tmpfs", 0, ""); err != nil { + return err + } + } + return nil +} + +// New returns a linux based container factory based in the root directory and +// configures the factory with the provided option funcs. +func New(root string, options ...func(*LinuxFactory) error) (Factory, error) { + if root != "" { + if err := os.MkdirAll(root, 0700); err != nil { + return nil, newGenericError(err, SystemError) + } + } + l := &LinuxFactory{ + Root: root, + Validator: validate.New(), + CriuPath: "criu", + } + InitArgs(os.Args[0], "init")(l) + Cgroupfs(l) + for _, opt := range options { + if err := opt(l); err != nil { + return nil, err + } + } + return l, nil +} + +// LinuxFactory implements the default factory interface for linux based systems. +type LinuxFactory struct { + // Root directory for the factory to store state. + Root string + + // InitPath is the absolute path to the init binary. + InitPath string + + // InitArgs are arguments for calling the init responsibilities for spawning + // a container. + InitArgs []string + + // CriuPath is the path to the criu binary used for checkpoint and restore of + // containers. + CriuPath string + + // Validator provides validation to container configurations. + Validator validate.Validator + + // NewCgroupsManager returns an initialized cgroups manager for a single container. + NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager +} + +func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) { + if l.Root == "" { + return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid) + } + if err := l.validateID(id); err != nil { + return nil, err + } + if err := l.Validator.Validate(config); err != nil { + return nil, newGenericError(err, ConfigInvalid) + } + uid, err := config.HostUID() + if err != nil { + return nil, newGenericError(err, SystemError) + } + gid, err := config.HostGID() + if err != nil { + return nil, newGenericError(err, SystemError) + } + containerRoot := filepath.Join(l.Root, id) + if _, err := os.Stat(containerRoot); err == nil { + return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse) + } else if !os.IsNotExist(err) { + return nil, newGenericError(err, SystemError) + } + if err := os.MkdirAll(containerRoot, 0711); err != nil { + return nil, newGenericError(err, SystemError) + } + if err := os.Chown(containerRoot, uid, gid); err != nil { + return nil, newGenericError(err, SystemError) + } + fifoName := filepath.Join(containerRoot, execFifoFilename) + oldMask := syscall.Umask(0000) + if err := syscall.Mkfifo(fifoName, 0622); err != nil { + syscall.Umask(oldMask) + return nil, newGenericError(err, SystemError) + } + syscall.Umask(oldMask) + if err := os.Chown(fifoName, uid, gid); err != nil { + return nil, newGenericError(err, SystemError) + } + c := &linuxContainer{ + id: id, + root: containerRoot, + config: config, + initPath: l.InitPath, + initArgs: l.InitArgs, + criuPath: l.CriuPath, + cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), + } + c.state = &stoppedState{c: c} + return c, nil +} + +func (l *LinuxFactory) Load(id string) (Container, error) { + if l.Root == "" { + return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid) + } + containerRoot := filepath.Join(l.Root, id) + state, err := l.loadState(containerRoot) + if err != nil { + return nil, err + } + r := &nonChildProcess{ + processPid: state.InitProcessPid, + processStartTime: state.InitProcessStartTime, + fds: state.ExternalDescriptors, + } + c := &linuxContainer{ + initProcess: r, + id: id, + config: &state.Config, + initPath: l.InitPath, + initArgs: l.InitArgs, + criuPath: l.CriuPath, + cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), + root: containerRoot, + created: state.Created, + } + c.state = &loadedState{c: c} + if err := c.refreshState(); err != nil { + return nil, err + } + return c, nil +} + +func (l *LinuxFactory) Type() string { + return "libcontainer" +} + +// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state +// This is a low level implementation detail of the reexec and should not be consumed externally +func (l *LinuxFactory) StartInitialization() (err error) { + var pipefd, rootfd int + for k, v := range map[string]*int{ + "_LIBCONTAINER_INITPIPE": &pipefd, + "_LIBCONTAINER_STATEDIR": &rootfd, + } { + s := os.Getenv(k) + + i, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("unable to convert %s=%s to int", k, s) + } + *v = i + } + var ( + pipe = os.NewFile(uintptr(pipefd), "pipe") + it = initType(os.Getenv("_LIBCONTAINER_INITTYPE")) + ) + // clear the current process's environment to clean any libcontainer + // specific env vars. + os.Clearenv() + + var i initer + defer func() { + // We have an error during the initialization of the container's init, + // send it back to the parent process in the form of an initError. + // If container's init successed, syscall.Exec will not return, hence + // this defer function will never be called. + if _, ok := i.(*linuxStandardInit); ok { + // Synchronisation only necessary for standard init. + if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil { + panic(err) + } + } + if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil { + panic(err) + } + // ensure that this pipe is always closed + pipe.Close() + }() + defer func() { + if e := recover(); e != nil { + err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack())) + } + }() + i, err = newContainerInit(it, pipe, rootfd) + if err != nil { + return err + } + return i.Init() +} + +func (l *LinuxFactory) loadState(root string) (*State, error) { + f, err := os.Open(filepath.Join(root, stateFilename)) + if err != nil { + if os.IsNotExist(err) { + return nil, newGenericError(err, ContainerNotExists) + } + return nil, newGenericError(err, SystemError) + } + defer f.Close() + var state *State + if err := json.NewDecoder(f).Decode(&state); err != nil { + return nil, newGenericError(err, SystemError) + } + return state, nil +} + +func (l *LinuxFactory) validateID(id string) error { + if !idRegex.MatchString(id) { + return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat) + } + if len(id) > maxIdLen { + return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat) + } + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/generic_error.go b/vendor/src/github.com/opencontainers/runc/libcontainer/generic_error.go new file mode 100644 index 00000000..9c3d3249 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/generic_error.go @@ -0,0 +1,109 @@ +package libcontainer + +import ( + "fmt" + "io" + "text/template" + "time" + + "github.com/opencontainers/runc/libcontainer/stacktrace" +) + +type syncType uint8 + +const ( + procReady syncType = iota + procError + procRun + procHooks + procResume +) + +type syncT struct { + Type syncType `json:"type"` +} + +var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}} +Code: {{.ECode}} +{{if .Message }} +Message: {{.Message}} +{{end}} +Frames:{{range $i, $frame := .Stack.Frames}} +--- +{{$i}}: {{$frame.Function}} +Package: {{$frame.Package}} +File: {{$frame.File}}@{{$frame.Line}}{{end}} +`)) + +func newGenericError(err error, c ErrorCode) Error { + if le, ok := err.(Error); ok { + return le + } + gerr := &genericError{ + Timestamp: time.Now(), + Err: err, + ECode: c, + Stack: stacktrace.Capture(1), + } + if err != nil { + gerr.Message = err.Error() + } + return gerr +} + +func newSystemError(err error) Error { + return createSystemError(err, "") +} + +func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error { + return createSystemError(err, fmt.Sprintf(cause, v...)) +} + +func newSystemErrorWithCause(err error, cause string) Error { + return createSystemError(err, cause) +} + +// createSystemError creates the specified error with the correct number of +// stack frames skipped. This is only to be called by the other functions for +// formatting the error. +func createSystemError(err error, cause string) Error { + if le, ok := err.(Error); ok { + return le + } + gerr := &genericError{ + Timestamp: time.Now(), + Err: err, + ECode: SystemError, + Cause: cause, + Stack: stacktrace.Capture(2), + } + if err != nil { + gerr.Message = err.Error() + } + return gerr +} + +type genericError struct { + Timestamp time.Time + ECode ErrorCode + Err error `json:"-"` + Cause string + Message string + Stack stacktrace.Stacktrace +} + +func (e *genericError) Error() string { + if e.Cause == "" { + return e.Message + } + frame := e.Stack.Frames[0] + return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message) +} + +func (e *genericError) Code() ErrorCode { + return e.ECode +} + +func (e *genericError) Detail(w io.Writer) error { + return errorTemplate.Execute(w, e) +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/init_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/init_linux.go new file mode 100644 index 00000000..d6d9901c --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/init_linux.go @@ -0,0 +1,369 @@ +// +build linux + +package libcontainer + +import ( + "encoding/json" + "fmt" + "io" + "io/ioutil" + "net" + "os" + "strconv" + "strings" + "syscall" + + "github.com/Sirupsen/logrus" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/user" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/vishvananda/netlink" +) + +type initType string + +const ( + initSetns initType = "setns" + initStandard initType = "standard" +) + +type pid struct { + Pid int `json:"pid"` +} + +// network is an internal struct used to setup container networks. +type network struct { + configs.Network + + // TempVethPeerName is a unique temporary veth peer name that was placed into + // the container's namespace. + TempVethPeerName string `json:"temp_veth_peer_name"` +} + +// initConfig is used for transferring parameters from Exec() to Init() +type initConfig struct { + Args []string `json:"args"` + Env []string `json:"env"` + Cwd string `json:"cwd"` + Capabilities []string `json:"capabilities"` + ProcessLabel string `json:"process_label"` + AppArmorProfile string `json:"apparmor_profile"` + NoNewPrivileges bool `json:"no_new_privileges"` + User string `json:"user"` + Config *configs.Config `json:"config"` + Console string `json:"console"` + Networks []*network `json:"network"` + PassedFilesCount int `json:"passed_files_count"` + ContainerId string `json:"containerid"` + Rlimits []configs.Rlimit `json:"rlimits"` + ExecFifoPath string `json:"start_pipe_path"` +} + +type initer interface { + Init() error +} + +func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) { + var config *initConfig + if err := json.NewDecoder(pipe).Decode(&config); err != nil { + return nil, err + } + if err := populateProcessEnvironment(config.Env); err != nil { + return nil, err + } + switch t { + case initSetns: + return &linuxSetnsInit{ + config: config, + }, nil + case initStandard: + return &linuxStandardInit{ + pipe: pipe, + parentPid: syscall.Getppid(), + config: config, + stateDirFD: stateDirFD, + }, nil + } + return nil, fmt.Errorf("unknown init type %q", t) +} + +// populateProcessEnvironment loads the provided environment variables into the +// current processes's environment. +func populateProcessEnvironment(env []string) error { + for _, pair := range env { + p := strings.SplitN(pair, "=", 2) + if len(p) < 2 { + return fmt.Errorf("invalid environment '%v'", pair) + } + if err := os.Setenv(p[0], p[1]); err != nil { + return err + } + } + return nil +} + +// finalizeNamespace drops the caps, sets the correct user +// and working dir, and closes any leaked file descriptors +// before executing the command inside the namespace +func finalizeNamespace(config *initConfig) error { + // Ensure that all unwanted fds we may have accidentally + // inherited are marked close-on-exec so they stay out of the + // container + if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil { + return err + } + + capabilities := config.Config.Capabilities + if config.Capabilities != nil { + capabilities = config.Capabilities + } + w, err := newCapWhitelist(capabilities) + if err != nil { + return err + } + // drop capabilities in bounding set before changing user + if err := w.dropBoundingSet(); err != nil { + return err + } + // preserve existing capabilities while we change users + if err := system.SetKeepCaps(); err != nil { + return err + } + if err := setupUser(config); err != nil { + return err + } + if err := system.ClearKeepCaps(); err != nil { + return err + } + // drop all other capabilities + if err := w.drop(); err != nil { + return err + } + if config.Cwd != "" { + if err := syscall.Chdir(config.Cwd); err != nil { + return err + } + } + return nil +} + +// syncParentReady sends to the given pipe a JSON payload which indicates that +// the init is ready to Exec the child process. It then waits for the parent to +// indicate that it is cleared to Exec. +func syncParentReady(pipe io.ReadWriter) error { + // Tell parent. + if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil { + return err + } + // Wait for parent to give the all-clear. + var procSync syncT + if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { + if err == io.EOF { + return fmt.Errorf("parent closed synchronisation channel") + } + if procSync.Type != procRun { + return fmt.Errorf("invalid synchronisation flag from parent") + } + } + return nil +} + +// syncParentHooks sends to the given pipe a JSON payload which indicates that +// the parent should execute pre-start hooks. It then waits for the parent to +// indicate that it is cleared to resume. +func syncParentHooks(pipe io.ReadWriter) error { + // Tell parent. + if err := utils.WriteJSON(pipe, syncT{procHooks}); err != nil { + return err + } + // Wait for parent to give the all-clear. + var procSync syncT + if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { + if err == io.EOF { + return fmt.Errorf("parent closed synchronisation channel") + } + if procSync.Type != procResume { + return fmt.Errorf("invalid synchronisation flag from parent") + } + } + return nil +} + +// setupUser changes the groups, gid, and uid for the user inside the container +func setupUser(config *initConfig) error { + // Set up defaults. + defaultExecUser := user.ExecUser{ + Uid: syscall.Getuid(), + Gid: syscall.Getgid(), + Home: "/", + } + passwdPath, err := user.GetPasswdPath() + if err != nil { + return err + } + groupPath, err := user.GetGroupPath() + if err != nil { + return err + } + execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath) + if err != nil { + return err + } + + var addGroups []int + if len(config.Config.AdditionalGroups) > 0 { + addGroups, err = user.GetAdditionalGroupsPath(config.Config.AdditionalGroups, groupPath) + if err != nil { + return err + } + } + // before we change to the container's user make sure that the processes STDIO + // is correctly owned by the user that we are switching to. + if err := fixStdioPermissions(execUser); err != nil { + return err + } + suppGroups := append(execUser.Sgids, addGroups...) + if err := syscall.Setgroups(suppGroups); err != nil { + return err + } + + if err := system.Setgid(execUser.Gid); err != nil { + return err + } + if err := system.Setuid(execUser.Uid); err != nil { + return err + } + // if we didn't get HOME already, set it based on the user's HOME + if envHome := os.Getenv("HOME"); envHome == "" { + if err := os.Setenv("HOME", execUser.Home); err != nil { + return err + } + } + return nil +} + +// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user. +// The ownership needs to match because it is created outside of the container and needs to be +// localized. +func fixStdioPermissions(u *user.ExecUser) error { + var null syscall.Stat_t + if err := syscall.Stat("/dev/null", &null); err != nil { + return err + } + for _, fd := range []uintptr{ + os.Stdin.Fd(), + os.Stderr.Fd(), + os.Stdout.Fd(), + } { + var s syscall.Stat_t + if err := syscall.Fstat(int(fd), &s); err != nil { + return err + } + // skip chown of /dev/null if it was used as one of the STDIO fds. + if s.Rdev == null.Rdev { + continue + } + if err := syscall.Fchown(int(fd), u.Uid, u.Gid); err != nil { + return err + } + } + return nil +} + +// setupNetwork sets up and initializes any network interface inside the container. +func setupNetwork(config *initConfig) error { + for _, config := range config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + if err := strategy.initialize(config); err != nil { + return err + } + } + return nil +} + +func setupRoute(config *configs.Config) error { + for _, config := range config.Routes { + _, dst, err := net.ParseCIDR(config.Destination) + if err != nil { + return err + } + src := net.ParseIP(config.Source) + if src == nil { + return fmt.Errorf("Invalid source for route: %s", config.Source) + } + gw := net.ParseIP(config.Gateway) + if gw == nil { + return fmt.Errorf("Invalid gateway for route: %s", config.Gateway) + } + l, err := netlink.LinkByName(config.InterfaceName) + if err != nil { + return err + } + route := &netlink.Route{ + Scope: netlink.SCOPE_UNIVERSE, + Dst: dst, + Src: src, + Gw: gw, + LinkIndex: l.Attrs().Index, + } + if err := netlink.RouteAdd(route); err != nil { + return err + } + } + return nil +} + +func setupRlimits(limits []configs.Rlimit, pid int) error { + for _, rlimit := range limits { + if err := system.Prlimit(pid, rlimit.Type, syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil { + return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err) + } + } + return nil +} + +func setOomScoreAdj(oomScoreAdj int, pid int) error { + path := fmt.Sprintf("/proc/%d/oom_score_adj", pid) + + return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600) +} + +// killCgroupProcesses freezes then iterates over all the processes inside the +// manager's cgroups sending a SIGKILL to each process then waiting for them to +// exit. +func killCgroupProcesses(m cgroups.Manager) error { + var procs []*os.Process + if err := m.Freeze(configs.Frozen); err != nil { + logrus.Warn(err) + } + pids, err := m.GetAllPids() + if err != nil { + m.Freeze(configs.Thawed) + return err + } + for _, pid := range pids { + p, err := os.FindProcess(pid) + if err != nil { + logrus.Warn(err) + continue + } + procs = append(procs, p) + if err := p.Kill(); err != nil { + logrus.Warn(err) + } + } + if err := m.Freeze(configs.Thawed); err != nil { + logrus.Warn(err) + } + for _, p := range procs { + if _, err := p.Wait(); err != nil { + logrus.Warn(err) + } + } + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/keys/keyctl.go b/vendor/src/github.com/opencontainers/runc/libcontainer/keys/keyctl.go new file mode 100644 index 00000000..c67fd15b --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/keys/keyctl.go @@ -0,0 +1,66 @@ +// +build linux + +package keyctl + +import ( + "fmt" + "strconv" + "strings" + "syscall" + "unsafe" +) + +const KEYCTL_JOIN_SESSION_KEYRING = 1 +const KEYCTL_SETPERM = 5 +const KEYCTL_DESCRIBE = 6 + +type KeySerial uint32 + +func JoinSessionKeyring(name string) (KeySerial, error) { + var _name *byte + var err error + + if len(name) > 0 { + _name, err = syscall.BytePtrFromString(name) + if err != nil { + return KeySerial(0), err + } + } + + sessKeyId, _, errn := syscall.Syscall(syscall.SYS_KEYCTL, KEYCTL_JOIN_SESSION_KEYRING, uintptr(unsafe.Pointer(_name)), 0) + if errn != 0 { + return 0, fmt.Errorf("could not create session key: %v", errn) + } + return KeySerial(sessKeyId), nil +} + +// ModKeyringPerm modifies permissions on a keyring by reading the current permissions, +// anding the bits with the given mask (clearing permissions) and setting +// additional permission bits +func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error { + dest := make([]byte, 1024) + destBytes := unsafe.Pointer(&dest[0]) + + if _, _, err := syscall.Syscall6(syscall.SYS_KEYCTL, uintptr(KEYCTL_DESCRIBE), uintptr(ringId), uintptr(destBytes), uintptr(len(dest)), 0, 0); err != 0 { + return err + } + + res := strings.Split(string(dest), ";") + if len(res) < 5 { + return fmt.Errorf("Destination buffer for key description is too small") + } + + // parse permissions + perm64, err := strconv.ParseUint(res[3], 16, 32) + if err != nil { + return err + } + + perm := (uint32(perm64) & mask) | setbits + + if _, _, err := syscall.Syscall(syscall.SYS_KEYCTL, uintptr(KEYCTL_SETPERM), uintptr(ringId), uintptr(perm)); err != 0 { + return err + } + + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/label/label.go b/vendor/src/github.com/opencontainers/runc/libcontainer/label/label.go new file mode 100644 index 00000000..684bb41b --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/label/label.go @@ -0,0 +1,80 @@ +// +build !selinux !linux + +package label + +// InitLabels returns the process label and file labels to be used within +// the container. A list of options can be passed into this function to alter +// the labels. +func InitLabels(options []string) (string, string, error) { + return "", "", nil +} + +func GenLabels(options string) (string, string, error) { + return "", "", nil +} + +func FormatMountLabel(src string, mountLabel string) string { + return src +} + +func SetProcessLabel(processLabel string) error { + return nil +} + +func GetFileLabel(path string) (string, error) { + return "", nil +} + +func SetFileLabel(path string, fileLabel string) error { + return nil +} + +func SetFileCreateLabel(fileLabel string) error { + return nil +} + +func Relabel(path string, fileLabel string, shared bool) error { + return nil +} + +func GetPidLabel(pid int) (string, error) { + return "", nil +} + +func Init() { +} + +func ReserveLabel(label string) error { + return nil +} + +func UnreserveLabel(label string) error { + return nil +} + +// DupSecOpt takes a process label and returns security options that +// can be used to set duplicate labels on future container processes +func DupSecOpt(src string) []string { + return nil +} + +// DisableSecOpt returns a security opt that can disable labeling +// support for future container processes +func DisableSecOpt() []string { + return nil +} + +// Validate checks that the label does not include unexpected options +func Validate(label string) error { + return nil +} + +// RelabelNeeded checks whether the user requested a relabel +func RelabelNeeded(label string) bool { + return false +} + +// IsShared checks that the label includes a "shared" mark +func IsShared(label string) bool { + return false +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/label/label_selinux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/label/label_selinux.go new file mode 100644 index 00000000..4493bda7 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/label/label_selinux.go @@ -0,0 +1,197 @@ +// +build selinux,linux + +package label + +import ( + "fmt" + "strings" + + "github.com/opencontainers/runc/libcontainer/selinux" +) + +// Valid Label Options +var validOptions = map[string]bool{ + "disable": true, + "type": true, + "user": true, + "role": true, + "level": true, +} + +var ErrIncompatibleLabel = fmt.Errorf("Bad SELinux option z and Z can not be used together") + +// InitLabels returns the process label and file labels to be used within +// the container. A list of options can be passed into this function to alter +// the labels. The labels returned will include a random MCS String, that is +// guaranteed to be unique. +func InitLabels(options []string) (string, string, error) { + if !selinux.SelinuxEnabled() { + return "", "", nil + } + processLabel, mountLabel := selinux.GetLxcContexts() + if processLabel != "" { + pcon := selinux.NewContext(processLabel) + mcon := selinux.NewContext(mountLabel) + for _, opt := range options { + if opt == "disable" { + return "", "", nil + } + if i := strings.Index(opt, ":"); i == -1 { + return "", "", fmt.Errorf("Bad label option %q, valid options 'disable' or \n'user, role, level, type' followed by ':' and a value", opt) + } + con := strings.SplitN(opt, ":", 2) + if !validOptions[con[0]] { + return "", "", fmt.Errorf("Bad label option %q, valid options 'disable, user, role, level, type'", con[0]) + + } + pcon[con[0]] = con[1] + if con[0] == "level" || con[0] == "user" { + mcon[con[0]] = con[1] + } + } + processLabel = pcon.Get() + mountLabel = mcon.Get() + } + return processLabel, mountLabel, nil +} + +// DEPRECATED: The GenLabels function is only to be used during the transition to the official API. +func GenLabels(options string) (string, string, error) { + return InitLabels(strings.Fields(options)) +} + +// FormatMountLabel returns a string to be used by the mount command. +// The format of this string will be used to alter the labeling of the mountpoint. +// The string returned is suitable to be used as the options field of the mount command. +// If you need to have additional mount point options, you can pass them in as +// the first parameter. Second parameter is the label that you wish to apply +// to all content in the mount point. +func FormatMountLabel(src, mountLabel string) string { + if mountLabel != "" { + switch src { + case "": + src = fmt.Sprintf("context=%q", mountLabel) + default: + src = fmt.Sprintf("%s,context=%q", src, mountLabel) + } + } + return src +} + +// SetProcessLabel takes a process label and tells the kernel to assign the +// label to the next program executed by the current process. +func SetProcessLabel(processLabel string) error { + if processLabel == "" { + return nil + } + return selinux.Setexeccon(processLabel) +} + +// GetProcessLabel returns the process label that the kernel will assign +// to the next program executed by the current process. If "" is returned +// this indicates that the default labeling will happen for the process. +func GetProcessLabel() (string, error) { + return selinux.Getexeccon() +} + +// GetFileLabel returns the label for specified path +func GetFileLabel(path string) (string, error) { + return selinux.Getfilecon(path) +} + +// SetFileLabel modifies the "path" label to the specified file label +func SetFileLabel(path string, fileLabel string) error { + if selinux.SelinuxEnabled() && fileLabel != "" { + return selinux.Setfilecon(path, fileLabel) + } + return nil +} + +// SetFileCreateLabel tells the kernel the label for all files to be created +func SetFileCreateLabel(fileLabel string) error { + if selinux.SelinuxEnabled() { + return selinux.Setfscreatecon(fileLabel) + } + return nil +} + +// Relabel changes the label of path to the filelabel string. +// It changes the MCS label to s0 if shared is true. +// This will allow all containers to share the content. +func Relabel(path string, fileLabel string, shared bool) error { + if !selinux.SelinuxEnabled() { + return nil + } + + if fileLabel == "" { + return nil + } + + exclude_paths := map[string]bool{"/": true, "/usr": true, "/etc": true} + if exclude_paths[path] { + return fmt.Errorf("Relabeling of %s is not allowed", path) + } + + if shared { + c := selinux.NewContext(fileLabel) + c["level"] = "s0" + fileLabel = c.Get() + } + return selinux.Chcon(path, fileLabel, true) +} + +// GetPidLabel will return the label of the process running with the specified pid +func GetPidLabel(pid int) (string, error) { + return selinux.Getpidcon(pid) +} + +// Init initialises the labeling system +func Init() { + selinux.SelinuxEnabled() +} + +// ReserveLabel will record the fact that the MCS label has already been used. +// This will prevent InitLabels from using the MCS label in a newly created +// container +func ReserveLabel(label string) error { + selinux.ReserveLabel(label) + return nil +} + +// UnreserveLabel will remove the reservation of the MCS label. +// This will allow InitLabels to use the MCS label in a newly created +// containers +func UnreserveLabel(label string) error { + selinux.FreeLxcContexts(label) + return nil +} + +// DupSecOpt takes an process label and returns security options that +// can be used to set duplicate labels on future container processes +func DupSecOpt(src string) []string { + return selinux.DupSecOpt(src) +} + +// DisableSecOpt returns a security opt that can disable labeling +// support for future container processes +func DisableSecOpt() []string { + return selinux.DisableSecOpt() +} + +// Validate checks that the label does not include unexpected options +func Validate(label string) error { + if strings.Contains(label, "z") && strings.Contains(label, "Z") { + return ErrIncompatibleLabel + } + return nil +} + +// RelabelNeeded checks whether the user requested a relabel +func RelabelNeeded(label string) bool { + return strings.Contains(label, "z") || strings.Contains(label, "Z") +} + +// IsShared checks that the label includes a "shared" mark +func IsShared(label string) bool { + return strings.Contains(label, "z") +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/message_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/message_linux.go new file mode 100644 index 00000000..400bd362 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/message_linux.go @@ -0,0 +1,89 @@ +// +build linux + +package libcontainer + +import ( + "syscall" + + "github.com/vishvananda/netlink/nl" +) + +// list of known message types we want to send to bootstrap program +// The number is randomly chosen to not conflict with known netlink types +const ( + InitMsg uint16 = 62000 + CloneFlagsAttr uint16 = 27281 + ConsolePathAttr uint16 = 27282 + NsPathsAttr uint16 = 27283 + UidmapAttr uint16 = 27284 + GidmapAttr uint16 = 27285 + SetgroupAttr uint16 = 27286 + // When syscall.NLA_HDRLEN is in gccgo, take this out. + syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1) +) + +type Int32msg struct { + Type uint16 + Value uint32 +} + +// Serialize serializes the message. +// Int32msg has the following representation +// | nlattr len | nlattr type | +// | uint32 value | +func (msg *Int32msg) Serialize() []byte { + buf := make([]byte, msg.Len()) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(msg.Len())) + native.PutUint16(buf[2:4], msg.Type) + native.PutUint32(buf[4:8], msg.Value) + return buf +} + +func (msg *Int32msg) Len() int { + return syscall_NLA_HDRLEN + 4 +} + +// Bytemsg has the following representation +// | nlattr len | nlattr type | +// | value | pad | +type Bytemsg struct { + Type uint16 + Value []byte +} + +func (msg *Bytemsg) Serialize() []byte { + l := msg.Len() + buf := make([]byte, (l+syscall.NLA_ALIGNTO-1) & ^(syscall.NLA_ALIGNTO-1)) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(l)) + native.PutUint16(buf[2:4], msg.Type) + copy(buf[4:], msg.Value) + return buf +} + +func (msg *Bytemsg) Len() int { + return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated +} + +type Boolmsg struct { + Type uint16 + Value bool +} + +func (msg *Boolmsg) Serialize() []byte { + buf := make([]byte, msg.Len()) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(msg.Len())) + native.PutUint16(buf[2:4], msg.Type) + if msg.Value { + buf[4] = 1 + } else { + buf[4] = 0 + } + return buf +} + +func (msg *Boolmsg) Len() int { + return syscall_NLA_HDRLEN + 1 +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/network_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/network_linux.go new file mode 100644 index 00000000..5075bee4 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/network_linux.go @@ -0,0 +1,259 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "io/ioutil" + "net" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/vishvananda/netlink" +) + +var strategies = map[string]networkStrategy{ + "veth": &veth{}, + "loopback": &loopback{}, +} + +// networkStrategy represents a specific network configuration for +// a container's networking stack +type networkStrategy interface { + create(*network, int) error + initialize(*network) error + detach(*configs.Network) error + attach(*configs.Network) error +} + +// getStrategy returns the specific network strategy for the +// provided type. +func getStrategy(tpe string) (networkStrategy, error) { + s, exists := strategies[tpe] + if !exists { + return nil, fmt.Errorf("unknown strategy type %q", tpe) + } + return s, nil +} + +// Returns the network statistics for the network interfaces represented by the NetworkRuntimeInfo. +func getNetworkInterfaceStats(interfaceName string) (*NetworkInterface, error) { + out := &NetworkInterface{Name: interfaceName} + // This can happen if the network runtime information is missing - possible if the + // container was created by an old version of libcontainer. + if interfaceName == "" { + return out, nil + } + type netStatsPair struct { + // Where to write the output. + Out *uint64 + // The network stats file to read. + File string + } + // Ingress for host veth is from the container. Hence tx_bytes stat on the host veth is actually number of bytes received by the container. + netStats := []netStatsPair{ + {Out: &out.RxBytes, File: "tx_bytes"}, + {Out: &out.RxPackets, File: "tx_packets"}, + {Out: &out.RxErrors, File: "tx_errors"}, + {Out: &out.RxDropped, File: "tx_dropped"}, + + {Out: &out.TxBytes, File: "rx_bytes"}, + {Out: &out.TxPackets, File: "rx_packets"}, + {Out: &out.TxErrors, File: "rx_errors"}, + {Out: &out.TxDropped, File: "rx_dropped"}, + } + for _, netStat := range netStats { + data, err := readSysfsNetworkStats(interfaceName, netStat.File) + if err != nil { + return nil, err + } + *(netStat.Out) = data + } + return out, nil +} + +// Reads the specified statistics available under /sys/class/net//statistics +func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) { + data, err := ioutil.ReadFile(filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile)) + if err != nil { + return 0, err + } + return strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64) +} + +// loopback is a network strategy that provides a basic loopback device +type loopback struct { +} + +func (l *loopback) create(n *network, nspid int) error { + return nil +} + +func (l *loopback) initialize(config *network) error { + return netlink.LinkSetUp(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "lo"}}) +} + +func (l *loopback) attach(n *configs.Network) (err error) { + return nil +} + +func (l *loopback) detach(n *configs.Network) (err error) { + return nil +} + +// veth is a network strategy that uses a bridge and creates +// a veth pair, one that is attached to the bridge on the host and the other +// is placed inside the container's namespace +type veth struct { +} + +func (v *veth) detach(n *configs.Network) (err error) { + return netlink.LinkSetMaster(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil) +} + +// attach a container network interface to an external network +func (v *veth) attach(n *configs.Network) (err error) { + brl, err := netlink.LinkByName(n.Bridge) + if err != nil { + return err + } + br, ok := brl.(*netlink.Bridge) + if !ok { + return fmt.Errorf("Wrong device type %T", brl) + } + host, err := netlink.LinkByName(n.HostInterfaceName) + if err != nil { + return err + } + + if err := netlink.LinkSetMaster(host, br); err != nil { + return err + } + if err := netlink.LinkSetMTU(host, n.Mtu); err != nil { + return err + } + if n.HairpinMode { + if err := netlink.LinkSetHairpin(host, true); err != nil { + return err + } + } + if err := netlink.LinkSetUp(host); err != nil { + return err + } + + return nil +} + +func (v *veth) create(n *network, nspid int) (err error) { + tmpName, err := v.generateTempPeerName() + if err != nil { + return err + } + n.TempVethPeerName = tmpName + if n.Bridge == "" { + return fmt.Errorf("bridge is not specified") + } + veth := &netlink.Veth{ + LinkAttrs: netlink.LinkAttrs{ + Name: n.HostInterfaceName, + TxQLen: n.TxQueueLen, + }, + PeerName: n.TempVethPeerName, + } + if err := netlink.LinkAdd(veth); err != nil { + return err + } + defer func() { + if err != nil { + netlink.LinkDel(veth) + } + }() + if err := v.attach(&n.Network); err != nil { + return err + } + child, err := netlink.LinkByName(n.TempVethPeerName) + if err != nil { + return err + } + return netlink.LinkSetNsPid(child, nspid) +} + +func (v *veth) generateTempPeerName() (string, error) { + return utils.GenerateRandomName("veth", 7) +} + +func (v *veth) initialize(config *network) error { + peer := config.TempVethPeerName + if peer == "" { + return fmt.Errorf("peer is not specified") + } + child, err := netlink.LinkByName(peer) + if err != nil { + return err + } + if err := netlink.LinkSetDown(child); err != nil { + return err + } + if err := netlink.LinkSetName(child, config.Name); err != nil { + return err + } + // get the interface again after we changed the name as the index also changes. + if child, err = netlink.LinkByName(config.Name); err != nil { + return err + } + if config.MacAddress != "" { + mac, err := net.ParseMAC(config.MacAddress) + if err != nil { + return err + } + if err := netlink.LinkSetHardwareAddr(child, mac); err != nil { + return err + } + } + ip, err := netlink.ParseAddr(config.Address) + if err != nil { + return err + } + if err := netlink.AddrAdd(child, ip); err != nil { + return err + } + if config.IPv6Address != "" { + ip6, err := netlink.ParseAddr(config.IPv6Address) + if err != nil { + return err + } + if err := netlink.AddrAdd(child, ip6); err != nil { + return err + } + } + if err := netlink.LinkSetMTU(child, config.Mtu); err != nil { + return err + } + if err := netlink.LinkSetUp(child); err != nil { + return err + } + if config.Gateway != "" { + gw := net.ParseIP(config.Gateway) + if err := netlink.RouteAdd(&netlink.Route{ + Scope: netlink.SCOPE_UNIVERSE, + LinkIndex: child.Attrs().Index, + Gw: gw, + }); err != nil { + return err + } + } + if config.IPv6Gateway != "" { + gw := net.ParseIP(config.IPv6Gateway) + if err := netlink.RouteAdd(&netlink.Route{ + Scope: netlink.SCOPE_UNIVERSE, + LinkIndex: child.Attrs().Index, + Gw: gw, + }); err != nil { + return err + } + } + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/notify_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/notify_linux.go new file mode 100644 index 00000000..839a50c5 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/notify_linux.go @@ -0,0 +1,89 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "syscall" +) + +const oomCgroupName = "memory" + +type PressureLevel uint + +const ( + LowPressure PressureLevel = iota + MediumPressure + CriticalPressure +) + +func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) { + evFile, err := os.Open(filepath.Join(cgDir, evName)) + if err != nil { + return nil, err + } + fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0) + if syserr != 0 { + evFile.Close() + return nil, syserr + } + + eventfd := os.NewFile(fd, "eventfd") + + eventControlPath := filepath.Join(cgDir, "cgroup.event_control") + data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg) + if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil { + eventfd.Close() + evFile.Close() + return nil, err + } + ch := make(chan struct{}) + go func() { + defer func() { + close(ch) + eventfd.Close() + evFile.Close() + }() + buf := make([]byte, 8) + for { + if _, err := eventfd.Read(buf); err != nil { + return + } + // When a cgroup is destroyed, an event is sent to eventfd. + // So if the control path is gone, return instead of notifying. + if _, err := os.Lstat(eventControlPath); os.IsNotExist(err) { + return + } + ch <- struct{}{} + } + }() + return ch, nil +} + +// notifyOnOOM returns channel on which you can expect event about OOM, +// if process died without OOM this channel will be closed. +func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) { + dir := paths[oomCgroupName] + if dir == "" { + return nil, fmt.Errorf("path %q missing", oomCgroupName) + } + + return registerMemoryEvent(dir, "memory.oom_control", "") +} + +func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) { + dir := paths[oomCgroupName] + if dir == "" { + return nil, fmt.Errorf("path %q missing", oomCgroupName) + } + + if level > CriticalPressure { + return nil, fmt.Errorf("invalid pressure level %d", level) + } + + levelStr := []string{"low", "medium", "critical"}[level] + return registerMemoryEvent(dir, "memory.pressure_level", levelStr) +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/README.md b/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/README.md new file mode 100644 index 00000000..823323e9 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/README.md @@ -0,0 +1,25 @@ +## nsenter + +The `nsenter` package registers a special init constructor that is called before +the Go runtime has a chance to boot. This provides us the ability to `setns` on +existing namespaces and avoid the issues that the Go runtime has with multiple +threads. This constructor will be called if this package is registered, +imported, in your go application. + +The `nsenter` package will `import "C"` and it uses [cgo](https://golang.org/cmd/cgo/) +package. In cgo, if the import of "C" is immediately preceded by a comment, that comment, +called the preamble, is used as a header when compiling the C parts of the package. +So every time we import package `nsenter`, the C code function `nsexec()` would be +called. And package `nsenter` is now only imported in `main_unix.go`, so every time +before we call `cmd.Start` on linux, that C code would run. + +`nsexec()` will first check the environment variable `_LIBCONTAINER_INITPID` +which will give the process of the container that should be joined. Namespaces fd will +be found from `/proc/[pid]/ns` and set by `setns` syscall. + +And then get the pipe number from `_LIBCONTAINER_INITPIPE`, error message could +be transferred through it. If tty is added, `_LIBCONTAINER_CONSOLE_PATH` will +have value and start a console for output. + +Finally, `nsexec()` will clone a child process , exit the parent process and let +the Go runtime take over. diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go b/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go new file mode 100644 index 00000000..07f4d63e --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go @@ -0,0 +1,12 @@ +// +build linux,!gccgo + +package nsenter + +/* +#cgo CFLAGS: -Wall +extern void nsexec(); +void __attribute__((constructor)) init(void) { + nsexec(); +} +*/ +import "C" diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go b/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go new file mode 100644 index 00000000..63c7a3ec --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go @@ -0,0 +1,25 @@ +// +build linux,gccgo + +package nsenter + +/* +#cgo CFLAGS: -Wall +extern void nsexec(); +void __attribute__((constructor)) init(void) { + nsexec(); +} +*/ +import "C" + +// AlwaysFalse is here to stay false +// (and be exported so the compiler doesn't optimize out its reference) +var AlwaysFalse bool + +func init() { + if AlwaysFalse { + // by referencing this C init() in a noop test, it will ensure the compiler + // links in the C function. + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134 + C.init() + } +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go b/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go new file mode 100644 index 00000000..ac701ca3 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go @@ -0,0 +1,5 @@ +// +build !linux !cgo + +package nsenter + +import "C" diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c new file mode 100644 index 00000000..cb7a6629 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c @@ -0,0 +1,471 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// All arguments should be above the stack because it grows down +struct clone_arg { + /* + * Reserve some space for clone() to locate arguments + * and retcode in this place + */ + char stack[4096] __attribute__((aligned(16))); + char stack_ptr[0]; + jmp_buf *env; +}; + +struct nsenter_config { + uint32_t cloneflags; + char *uidmap; + int uidmap_len; + char *gidmap; + int gidmap_len; + uint8_t is_setgroup; + int consolefd; +}; + +// list of known message types we want to send to bootstrap program +// These are defined in libcontainer/message_linux.go +#define INIT_MSG 62000 +#define CLONE_FLAGS_ATTR 27281 +#define CONSOLE_PATH_ATTR 27282 +#define NS_PATHS_ATTR 27283 +#define UIDMAP_ATTR 27284 +#define GIDMAP_ATTR 27285 +#define SETGROUP_ATTR 27286 + +// Use raw setns syscall for versions of glibc that don't include it +// (namely glibc-2.12) +#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 + #define _GNU_SOURCE + #include "syscall.h" + #if defined(__NR_setns) && !defined(SYS_setns) + #define SYS_setns __NR_setns + #endif + + #ifdef SYS_setns + int setns(int fd, int nstype) + { + return syscall(SYS_setns, fd, nstype); + } + #endif +#endif + +#define pr_perror(fmt, ...) \ + fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__) + +static int child_func(void *_arg) +{ + struct clone_arg *arg = (struct clone_arg *)_arg; + longjmp(*arg->env, 1); +} + +static int clone_parent(jmp_buf *env, int flags) __attribute__((noinline)); +static int clone_parent(jmp_buf *env, int flags) +{ + struct clone_arg ca; + int child; + + ca.env = env; + child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, + &ca); + // On old kernels, CLONE_PARENT cannot work with CLONE_NEWPID, + // unshare before clone to workaround this. + if (child == -1 && errno == EINVAL) { + if (unshare(flags)) { + pr_perror("Unable to unshare namespaces"); + return -1; + } + child = clone(child_func, ca.stack_ptr, SIGCHLD | CLONE_PARENT, + &ca); + } + return child; +} + +// get init pipe from the parent. It's used to read bootstrap data, and to +// write pid to after nsexec finishes setting up the environment. +static int get_init_pipe() +{ + char buf[PATH_MAX]; + char *initpipe; + int pipenum = -1; + + initpipe = getenv("_LIBCONTAINER_INITPIPE"); + if (initpipe == NULL) { + return -1; + } + + pipenum = atoi(initpipe); + snprintf(buf, sizeof(buf), "%d", pipenum); + if (strcmp(initpipe, buf)) { + pr_perror("Unable to parse _LIBCONTAINER_INITPIPE"); + exit(1); + } + + return pipenum; +} + +// num_namespaces returns the number of additional namespaces to setns. The +// argument is a comma-separated string of namespace paths. +static int num_namespaces(char *nspaths) +{ + int i; + int size = 0; + + for (i = 0; nspaths[i]; i++) { + if (nspaths[i] == ',') { + size += 1; + } + } + + return size + 1; +} + +static uint32_t readint32(char *buf) +{ + return *(uint32_t *)buf; +} + +static uint8_t readint8(char *buf) +{ + return *(uint8_t *)buf; +} + +static void update_process_idmap(char *pathfmt, int pid, char *map, int map_len) +{ + char buf[PATH_MAX]; + int len; + int fd; + + len = snprintf(buf, sizeof(buf), pathfmt, pid); + if (len < 0) { + pr_perror("failed to construct '%s' for %d", pathfmt, pid); + exit(1); + } + + fd = open(buf, O_RDWR); + if (fd == -1) { + pr_perror("failed to open %s", buf); + exit(1); + } + + len = write(fd, map, map_len); + if (len == -1) { + pr_perror("failed to write to %s", buf); + close(fd); + exit(1); + } else if (len != map_len) { + pr_perror("Failed to write data to %s (%d/%d)", + buf, len, map_len); + close(fd); + exit(1); + } + + close(fd); +} + +static void update_process_uidmap(int pid, char *map, int map_len) +{ + if ((map == NULL) || (map_len <= 0)) { + return; + } + + update_process_idmap("/proc/%d/uid_map", pid, map, map_len); +} + +static void update_process_gidmap(int pid, uint8_t is_setgroup, char *map, int map_len) +{ + if ((map == NULL) || (map_len <= 0)) { + return; + } + + if (is_setgroup == 1) { + int fd; + int len; + char buf[PATH_MAX]; + + len = snprintf(buf, sizeof(buf), "/proc/%d/setgroups", pid); + if (len < 0) { + pr_perror("failed to get setgroups path for %d", pid); + exit(1); + } + + fd = open(buf, O_RDWR); + if (fd == -1) { + pr_perror("failed to open %s", buf); + exit(1); + } + if (write(fd, "allow", 5) != 5) { + // If the kernel is too old to support + // /proc/PID/setgroups, write will return + // ENOENT; this is OK. + if (errno != ENOENT) { + pr_perror("failed to write allow to %s", buf); + close(fd); + exit(1); + } + } + close(fd); + } + + update_process_idmap("/proc/%d/gid_map", pid, map, map_len); +} + + +static void start_child(int pipenum, jmp_buf *env, int syncpipe[2], + struct nsenter_config *config) +{ + int len; + int childpid; + char buf[PATH_MAX]; + uint8_t syncbyte = 1; + + // We must fork to actually enter the PID namespace, use CLONE_PARENT + // so the child can have the right parent, and we don't need to forward + // the child's exit code or resend its death signal. + childpid = clone_parent(env, config->cloneflags); + if (childpid < 0) { + pr_perror("Unable to fork"); + exit(1); + } + + // update uid_map and gid_map for the child process if they + // were provided + update_process_uidmap(childpid, config->uidmap, config->uidmap_len); + update_process_gidmap(childpid, config->is_setgroup, config->gidmap, config->gidmap_len); + + // Send the sync signal to the child + close(syncpipe[0]); + syncbyte = 1; + if (write(syncpipe[1], &syncbyte, 1) != 1) { + pr_perror("failed to write sync byte to child"); + exit(1); + } + + // Send the child pid back to our parent + len = snprintf(buf, sizeof(buf), "{ \"pid\" : %d }\n", childpid); + if ((len < 0) || (write(pipenum, buf, len) != len)) { + pr_perror("Unable to send a child pid"); + kill(childpid, SIGKILL); + exit(1); + } + + exit(0); +} + +static struct nsenter_config process_nl_attributes(int pipenum, char *data, int data_size) +{ + struct nsenter_config config = {0}; + struct nlattr *nlattr; + int payload_len; + int start = 0; + + config.consolefd = -1; + while (start < data_size) { + nlattr = (struct nlattr *)(data + start); + start += NLA_HDRLEN; + payload_len = nlattr->nla_len - NLA_HDRLEN; + + if (nlattr->nla_type == CLONE_FLAGS_ATTR) { + config.cloneflags = readint32(data + start); + } else if (nlattr->nla_type == CONSOLE_PATH_ATTR) { + // get the console path before setns because it may + // change mnt namespace + config.consolefd = open(data + start, O_RDWR); + if (config.consolefd < 0) { + pr_perror("Failed to open console %s", + data + start); + exit(1); + } + } else if (nlattr->nla_type == NS_PATHS_ATTR) { + // if custom namespaces are required, open all + // descriptors and perform setns on them + int i, j; + int nslen = num_namespaces(data + start); + int fds[nslen]; + char *nslist[nslen]; + char *ns; + char *saveptr = NULL; + + for (i = 0; i < nslen; i++) { + char *str = NULL; + + if (i == 0) { + str = data + start; + } + ns = strtok_r(str, ",", &saveptr); + if (ns == NULL) { + break; + } + fds[i] = open(ns, O_RDONLY); + if (fds[i] == -1) { + for (j = 0; j < i; j++) { + close(fds[j]); + } + pr_perror("Failed to open %s", ns); + exit(1); + } + nslist[i] = ns; + } + + for (i = 0; i < nslen; i++) { + if (setns(fds[i], 0) != 0) { + pr_perror("Failed to setns to %s", nslist[i]); + exit(1); + } + close(fds[i]); + } + } else if (nlattr->nla_type == UIDMAP_ATTR) { + config.uidmap = data + start; + config.uidmap_len = payload_len; + } else if (nlattr->nla_type == GIDMAP_ATTR) { + config.gidmap = data + start; + config.gidmap_len = payload_len; + } else if (nlattr->nla_type == SETGROUP_ATTR) { + config.is_setgroup = readint8(data + start); + } else { + pr_perror("Unknown netlink message type %d", + nlattr->nla_type); + exit(1); + } + + start += NLA_ALIGN(payload_len); + } + + return config; +} + +void nsexec(void) +{ + int pipenum; + + // If we don't have init pipe, then just return to the go routine, + // we'll only have init pipe for start or exec + pipenum = get_init_pipe(); + if (pipenum == -1) { + return; + } + + // Retrieve the netlink header + struct nlmsghdr nl_msg_hdr; + int len; + + if ((len = read(pipenum, &nl_msg_hdr, NLMSG_HDRLEN)) != NLMSG_HDRLEN) { + pr_perror("Invalid netlink header length %d", len); + exit(1); + } + + if (nl_msg_hdr.nlmsg_type == NLMSG_ERROR) { + pr_perror("Failed to read netlink message"); + exit(1); + } + + if (nl_msg_hdr.nlmsg_type != INIT_MSG) { + pr_perror("Unexpected msg type %d", nl_msg_hdr.nlmsg_type); + exit(1); + } + + // Retrieve data + int nl_total_size = NLMSG_PAYLOAD(&nl_msg_hdr, 0); + char data[nl_total_size]; + + if ((len = read(pipenum, data, nl_total_size)) != nl_total_size) { + pr_perror("Failed to read netlink payload, %d != %d", len, + nl_total_size); + exit(1); + } + + jmp_buf env; + int syncpipe[2] = {-1, -1}; + struct nsenter_config config = process_nl_attributes(pipenum, + data, nl_total_size); + + // required clone_flags to be passed + if (config.cloneflags == -1) { + pr_perror("Missing clone_flags"); + exit(1); + } + // prepare sync pipe between parent and child. We need this to let the + // child know that the parent has finished setting up + if (pipe(syncpipe) != 0) { + pr_perror("Failed to setup sync pipe between parent and child"); + exit(1); + } + + if (setjmp(env) == 1) { + // Child + uint8_t s = 0; + int consolefd = config.consolefd; + + // close the writing side of pipe + close(syncpipe[1]); + + // sync with parent + if ((read(syncpipe[0], &s, 1) != 1) || (s != 1)) { + pr_perror("Failed to read sync byte from parent"); + exit(1); + } + + if (setsid() == -1) { + pr_perror("setsid failed"); + exit(1); + } + + if (setuid(0) == -1) { + pr_perror("setuid failed"); + exit(1); + } + + if (setgid(0) == -1) { + pr_perror("setgid failed"); + exit(1); + } + + if (setgroups(0, NULL) == -1) { + pr_perror("setgroups failed"); + exit(1); + } + + if (consolefd != -1) { + if (ioctl(consolefd, TIOCSCTTY, 0) == -1) { + pr_perror("ioctl TIOCSCTTY failed"); + exit(1); + } + if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO) { + pr_perror("Failed to dup stdin"); + exit(1); + } + if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO) { + pr_perror("Failed to dup stdout"); + exit(1); + } + if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO) { + pr_perror("Failed to dup stderr"); + exit(1); + } + } + + // Finish executing, let the Go runtime take over. + return; + } + + // Parent + start_child(pipenum, &env, syncpipe, &config); +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/process.go b/vendor/src/github.com/opencontainers/runc/libcontainer/process.go new file mode 100644 index 00000000..46fb2749 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/process.go @@ -0,0 +1,121 @@ +package libcontainer + +import ( + "fmt" + "io" + "math" + "os" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +type processOperations interface { + wait() (*os.ProcessState, error) + signal(sig os.Signal) error + pid() int +} + +// Process specifies the configuration and IO for a process inside +// a container. +type Process struct { + // The command to be run followed by any arguments. + Args []string + + // Env specifies the environment variables for the process. + Env []string + + // User will set the uid and gid of the executing process running inside the container + // local to the container's user and group configuration. + User string + + // Cwd will change the processes current working directory inside the container's rootfs. + Cwd string + + // Stdin is a pointer to a reader which provides the standard input stream. + Stdin io.Reader + + // Stdout is a pointer to a writer which receives the standard output stream. + Stdout io.Writer + + // Stderr is a pointer to a writer which receives the standard error stream. + Stderr io.Writer + + // ExtraFiles specifies additional open files to be inherited by the container + ExtraFiles []*os.File + + // consolePath is the path to the console allocated to the container. + consolePath string + + // Capabilities specify the capabilities to keep when executing the process inside the container + // All capabilities not specified will be dropped from the processes capability mask + Capabilities []string + + // AppArmorProfile specifies the profile to apply to the process and is + // changed at the time the process is execed + AppArmorProfile string + + // Label specifies the label to apply to the process. It is commonly used by selinux + Label string + + // NoNewPrivileges controls whether processes can gain additional privileges. + NoNewPrivileges *bool + + // Rlimits specifies the resource limits, such as max open files, to set in the container + // If Rlimits are not set, the container will inherit rlimits from the parent process + Rlimits []configs.Rlimit + + ops processOperations +} + +// Wait waits for the process to exit. +// Wait releases any resources associated with the Process +func (p Process) Wait() (*os.ProcessState, error) { + if p.ops == nil { + return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + } + return p.ops.wait() +} + +// Pid returns the process ID +func (p Process) Pid() (int, error) { + // math.MinInt32 is returned here, because it's invalid value + // for the kill() system call. + if p.ops == nil { + return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + } + return p.ops.pid(), nil +} + +// Signal sends a signal to the Process. +func (p Process) Signal(sig os.Signal) error { + if p.ops == nil { + return newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + } + return p.ops.signal(sig) +} + +// IO holds the process's STDIO +type IO struct { + Stdin io.WriteCloser + Stdout io.ReadCloser + Stderr io.ReadCloser +} + +// NewConsole creates new console for process and returns it +func (p *Process) NewConsole(rootuid, rootgid int) (Console, error) { + console, err := NewConsole(rootuid, rootgid) + if err != nil { + return nil, err + } + p.consolePath = console.Path() + return console, nil +} + +// ConsoleFromPath sets the process's console with the path provided +func (p *Process) ConsoleFromPath(path string) error { + if p.consolePath != "" { + return newGenericError(fmt.Errorf("console path already exists for process"), ConsoleExists) + } + p.consolePath = path + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/process_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/process_linux.go new file mode 100644 index 00000000..33db3923 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/process_linux.go @@ -0,0 +1,492 @@ +// +build linux + +package libcontainer + +import ( + "encoding/json" + "errors" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "strconv" + "syscall" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" +) + +type parentProcess interface { + // pid returns the pid for the running process. + pid() int + + // start starts the process execution. + start() error + + // send a SIGKILL to the process and wait for the exit. + terminate() error + + // wait waits on the process returning the process state. + wait() (*os.ProcessState, error) + + // startTime return's the process start time. + startTime() (string, error) + + signal(os.Signal) error + + externalDescriptors() []string + + setExternalDescriptors(fds []string) +} + +type setnsProcess struct { + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + cgroupPaths map[string]string + config *initConfig + fds []string + process *Process + bootstrapData io.Reader + rootDir *os.File +} + +func (p *setnsProcess) startTime() (string, error) { + return system.GetProcessStartTime(p.pid()) +} + +func (p *setnsProcess) signal(sig os.Signal) error { + s, ok := sig.(syscall.Signal) + if !ok { + return errors.New("os: unsupported signal type") + } + return syscall.Kill(p.pid(), s) +} + +func (p *setnsProcess) start() (err error) { + defer p.parentPipe.Close() + err = p.cmd.Start() + p.childPipe.Close() + p.rootDir.Close() + if err != nil { + return newSystemErrorWithCause(err, "starting setns process") + } + if p.bootstrapData != nil { + if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { + return newSystemErrorWithCause(err, "copying bootstrap data to pipe") + } + } + if err = p.execSetns(); err != nil { + return newSystemErrorWithCause(err, "executing setns process") + } + if len(p.cgroupPaths) > 0 { + if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { + return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) + } + } + // set oom_score_adj + if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { + return newSystemErrorWithCause(err, "setting oom score") + } + // set rlimits, this has to be done here because we lose permissions + // to raise the limits once we enter a user-namespace + if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { + return newSystemErrorWithCause(err, "setting rlimits for process") + } + if err := utils.WriteJSON(p.parentPipe, p.config); err != nil { + return newSystemErrorWithCause(err, "writing config to pipe") + } + + if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { + return newSystemErrorWithCause(err, "calling shutdown on init pipe") + } + // wait for the child process to fully complete and receive an error message + // if one was encoutered + var ierr *genericError + if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { + return newSystemErrorWithCause(err, "decoding init error from pipe") + } + // Must be done after Shutdown so the child will exit and we can wait for it. + if ierr != nil { + p.wait() + return ierr + } + return nil +} + +// execSetns runs the process that executes C code to perform the setns calls +// because setns support requires the C process to fork off a child and perform the setns +// before the go runtime boots, we wait on the process to die and receive the child's pid +// over the provided pipe. +func (p *setnsProcess) execSetns() error { + status, err := p.cmd.Process.Wait() + if err != nil { + p.cmd.Wait() + return newSystemErrorWithCause(err, "waiting on setns process to finish") + } + if !status.Success() { + p.cmd.Wait() + return newSystemError(&exec.ExitError{ProcessState: status}) + } + var pid *pid + if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { + p.cmd.Wait() + return newSystemErrorWithCause(err, "reading pid from init pipe") + } + process, err := os.FindProcess(pid.Pid) + if err != nil { + return err + } + p.cmd.Process = process + p.process.ops = p + return nil +} + +// terminate sends a SIGKILL to the forked process for the setns routine then waits to +// avoid the process becomming a zombie. +func (p *setnsProcess) terminate() error { + if p.cmd.Process == nil { + return nil + } + err := p.cmd.Process.Kill() + if _, werr := p.wait(); err == nil { + err = werr + } + return err +} + +func (p *setnsProcess) wait() (*os.ProcessState, error) { + err := p.cmd.Wait() + + // Return actual ProcessState even on Wait error + return p.cmd.ProcessState, err +} + +func (p *setnsProcess) pid() int { + return p.cmd.Process.Pid +} + +func (p *setnsProcess) externalDescriptors() []string { + return p.fds +} + +func (p *setnsProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} + +type initProcess struct { + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + config *initConfig + manager cgroups.Manager + container *linuxContainer + fds []string + process *Process + bootstrapData io.Reader + sharePidns bool + rootDir *os.File +} + +func (p *initProcess) pid() int { + return p.cmd.Process.Pid +} + +func (p *initProcess) externalDescriptors() []string { + return p.fds +} + +// execSetns runs the process that executes C code to perform the setns calls +// because setns support requires the C process to fork off a child and perform the setns +// before the go runtime boots, we wait on the process to die and receive the child's pid +// over the provided pipe. +// This is called by initProcess.start function +func (p *initProcess) execSetns() error { + status, err := p.cmd.Process.Wait() + if err != nil { + p.cmd.Wait() + return err + } + if !status.Success() { + p.cmd.Wait() + return &exec.ExitError{ProcessState: status} + } + var pid *pid + if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { + p.cmd.Wait() + return err + } + process, err := os.FindProcess(pid.Pid) + if err != nil { + return err + } + p.cmd.Process = process + p.process.ops = p + return nil +} + +func (p *initProcess) start() error { + defer p.parentPipe.Close() + err := p.cmd.Start() + p.process.ops = p + p.childPipe.Close() + p.rootDir.Close() + if err != nil { + p.process.ops = nil + return newSystemErrorWithCause(err, "starting init process command") + } + if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { + return err + } + if err := p.execSetns(); err != nil { + return newSystemErrorWithCause(err, "running exec setns process for init") + } + // Save the standard descriptor names before the container process + // can potentially move them (e.g., via dup2()). If we don't do this now, + // we won't know at checkpoint time which file descriptor to look up. + fds, err := getPipeFds(p.pid()) + if err != nil { + return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid()) + } + p.setExternalDescriptors(fds) + // Do this before syncing with child so that no children + // can escape the cgroup + if err := p.manager.Apply(p.pid()); err != nil { + return newSystemErrorWithCause(err, "applying cgroup configuration for process") + } + defer func() { + if err != nil { + // TODO: should not be the responsibility to call here + p.manager.Destroy() + } + }() + if err := p.createNetworkInterfaces(); err != nil { + return newSystemErrorWithCause(err, "creating nework interfaces") + } + if err := p.sendConfig(); err != nil { + return newSystemErrorWithCause(err, "sending config to init process") + } + var ( + procSync syncT + sentRun bool + sentResume bool + ierr *genericError + ) + + dec := json.NewDecoder(p.parentPipe) +loop: + for { + if err := dec.Decode(&procSync); err != nil { + if err == io.EOF { + break loop + } + return newSystemErrorWithCause(err, "decoding sync type from init pipe") + } + switch procSync.Type { + case procReady: + if err := p.manager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting cgroup config for ready process") + } + // set oom_score_adj + if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { + return newSystemErrorWithCause(err, "setting oom score for ready process") + } + // set rlimits, this has to be done here because we lose permissions + // to raise the limits once we enter a user-namespace + if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { + return newSystemErrorWithCause(err, "setting rlimits for ready process") + } + // call prestart hooks + if !p.config.Config.Namespaces.Contains(configs.NEWNS) { + if p.config.Config.Hooks != nil { + s := configs.HookState{ + Version: p.container.config.Version, + ID: p.container.id, + Pid: p.pid(), + Root: p.config.Config.Rootfs, + } + for i, hook := range p.config.Config.Hooks.Prestart { + if err := hook.Run(s); err != nil { + return newSystemErrorWithCausef(err, "running prestart hook %d", i) + } + } + } + } + // Sync with child. + if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil { + return newSystemErrorWithCause(err, "reading syncT run type") + } + sentRun = true + case procHooks: + if p.config.Config.Hooks != nil { + s := configs.HookState{ + Version: p.container.config.Version, + ID: p.container.id, + Pid: p.pid(), + Root: p.config.Config.Rootfs, + BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"), + } + for i, hook := range p.config.Config.Hooks.Prestart { + if err := hook.Run(s); err != nil { + return newSystemErrorWithCausef(err, "running prestart hook %d", i) + } + } + } + // Sync with child. + if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil { + return newSystemErrorWithCause(err, "reading syncT resume type") + } + sentResume = true + case procError: + // wait for the child process to fully complete and receive an error message + // if one was encoutered + if err := dec.Decode(&ierr); err != nil && err != io.EOF { + return newSystemErrorWithCause(err, "decoding proc error from init") + } + if ierr != nil { + break loop + } + // Programmer error. + panic("No error following JSON procError payload.") + default: + return newSystemError(fmt.Errorf("invalid JSON payload from child")) + } + } + if !sentRun { + return newSystemErrorWithCause(ierr, "container init failed") + } + if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume { + return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process")) + } + if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { + return newSystemErrorWithCause(err, "shutting down init pipe") + } + // Must be done after Shutdown so the child will exit and we can wait for it. + if ierr != nil { + p.wait() + return ierr + } + return nil +} + +func (p *initProcess) wait() (*os.ProcessState, error) { + err := p.cmd.Wait() + if err != nil { + return p.cmd.ProcessState, err + } + // we should kill all processes in cgroup when init is died if we use host PID namespace + if p.sharePidns { + killCgroupProcesses(p.manager) + } + return p.cmd.ProcessState, nil +} + +func (p *initProcess) terminate() error { + if p.cmd.Process == nil { + return nil + } + err := p.cmd.Process.Kill() + if _, werr := p.wait(); err == nil { + err = werr + } + return err +} + +func (p *initProcess) startTime() (string, error) { + return system.GetProcessStartTime(p.pid()) +} + +func (p *initProcess) sendConfig() error { + // send the config to the container's init process, we don't use JSON Encode + // here because there might be a problem in JSON decoder in some cases, see: + // https://github.com/docker/docker/issues/14203#issuecomment-174177790 + return utils.WriteJSON(p.parentPipe, p.config) +} + +func (p *initProcess) createNetworkInterfaces() error { + for _, config := range p.config.Config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + n := &network{ + Network: *config, + } + if err := strategy.create(n, p.pid()); err != nil { + return err + } + p.config.Networks = append(p.config.Networks, n) + } + return nil +} + +func (p *initProcess) signal(sig os.Signal) error { + s, ok := sig.(syscall.Signal) + if !ok { + return errors.New("os: unsupported signal type") + } + return syscall.Kill(p.pid(), s) +} + +func (p *initProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} + +func getPipeFds(pid int) ([]string, error) { + fds := make([]string, 3) + + dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd") + for i := 0; i < 3; i++ { + f := filepath.Join(dirPath, strconv.Itoa(i)) + target, err := os.Readlink(f) + if err != nil { + return fds, err + } + fds[i] = target + } + return fds, nil +} + +// InitializeIO creates pipes for use with the process's STDIO +// and returns the opposite side for each +func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { + var fds []uintptr + i = &IO{} + // cleanup in case of an error + defer func() { + if err != nil { + for _, fd := range fds { + syscall.Close(int(fd)) + } + } + }() + // STDIN + r, w, err := os.Pipe() + if err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stdin, i.Stdin = r, w + // STDOUT + if r, w, err = os.Pipe(); err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stdout, i.Stdout = w, r + // STDERR + if r, w, err = os.Pipe(); err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stderr, i.Stderr = w, r + // change ownership of the pipes incase we are in a user namespace + for _, fd := range fds { + if err := syscall.Fchown(int(fd), rootuid, rootgid); err != nil { + return nil, err + } + } + return i, nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/restored_process.go b/vendor/src/github.com/opencontainers/runc/libcontainer/restored_process.go new file mode 100644 index 00000000..a96f4ca5 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/restored_process.go @@ -0,0 +1,122 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + + "github.com/opencontainers/runc/libcontainer/system" +) + +func newRestoredProcess(pid int, fds []string) (*restoredProcess, error) { + var ( + err error + ) + proc, err := os.FindProcess(pid) + if err != nil { + return nil, err + } + started, err := system.GetProcessStartTime(pid) + if err != nil { + return nil, err + } + return &restoredProcess{ + proc: proc, + processStartTime: started, + fds: fds, + }, nil +} + +type restoredProcess struct { + proc *os.Process + processStartTime string + fds []string +} + +func (p *restoredProcess) start() error { + return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError) +} + +func (p *restoredProcess) pid() int { + return p.proc.Pid +} + +func (p *restoredProcess) terminate() error { + err := p.proc.Kill() + if _, werr := p.wait(); err == nil { + err = werr + } + return err +} + +func (p *restoredProcess) wait() (*os.ProcessState, error) { + // TODO: how do we wait on the actual process? + // maybe use --exec-cmd in criu + st, err := p.proc.Wait() + if err != nil { + return nil, err + } + return st, nil +} + +func (p *restoredProcess) startTime() (string, error) { + return p.processStartTime, nil +} + +func (p *restoredProcess) signal(s os.Signal) error { + return p.proc.Signal(s) +} + +func (p *restoredProcess) externalDescriptors() []string { + return p.fds +} + +func (p *restoredProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} + +// nonChildProcess represents a process where the calling process is not +// the parent process. This process is created when a factory loads a container from +// a persisted state. +type nonChildProcess struct { + processPid int + processStartTime string + fds []string +} + +func (p *nonChildProcess) start() error { + return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError) +} + +func (p *nonChildProcess) pid() int { + return p.processPid +} + +func (p *nonChildProcess) terminate() error { + return newGenericError(fmt.Errorf("restored process cannot be terminated"), SystemError) +} + +func (p *nonChildProcess) wait() (*os.ProcessState, error) { + return nil, newGenericError(fmt.Errorf("restored process cannot be waited on"), SystemError) +} + +func (p *nonChildProcess) startTime() (string, error) { + return p.processStartTime, nil +} + +func (p *nonChildProcess) signal(s os.Signal) error { + proc, err := os.FindProcess(p.processPid) + if err != nil { + return err + } + return proc.Signal(s) +} + +func (p *nonChildProcess) externalDescriptors() []string { + return p.fds +} + +func (p *nonChildProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/rootfs_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/rootfs_linux.go new file mode 100644 index 00000000..fe1db75e --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/rootfs_linux.go @@ -0,0 +1,733 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "io" + "io/ioutil" + "os" + "os/exec" + "path" + "path/filepath" + "strings" + "syscall" + "time" + + "github.com/docker/docker/pkg/mount" + "github.com/docker/docker/pkg/symlink" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/label" + "github.com/opencontainers/runc/libcontainer/system" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" +) + +const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV + +// needsSetupDev returns true if /dev needs to be set up. +func needsSetupDev(config *configs.Config) bool { + for _, m := range config.Mounts { + if m.Device == "bind" && (m.Destination == "/dev" || m.Destination == "/dev/") { + return false + } + } + return true +} + +// setupRootfs sets up the devices, mount points, and filesystems for use inside a +// new mount namespace. +func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) { + if err := prepareRoot(config); err != nil { + return newSystemErrorWithCause(err, "preparing rootfs") + } + + setupDev := needsSetupDev(config) + for _, m := range config.Mounts { + for _, precmd := range m.PremountCmds { + if err := mountCmd(precmd); err != nil { + return newSystemErrorWithCause(err, "running premount command") + } + } + if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil { + return newSystemErrorWithCausef(err, "mounting %q to rootfs %q", m.Destination, config.Rootfs) + } + + for _, postcmd := range m.PostmountCmds { + if err := mountCmd(postcmd); err != nil { + return newSystemErrorWithCause(err, "running postmount command") + } + } + } + if setupDev { + if err := createDevices(config); err != nil { + return newSystemErrorWithCause(err, "creating device nodes") + } + if err := setupPtmx(config, console); err != nil { + return newSystemErrorWithCause(err, "setting up ptmx") + } + if err := setupDevSymlinks(config.Rootfs); err != nil { + return newSystemErrorWithCause(err, "setting up /dev symlinks") + } + } + // Signal the parent to run the pre-start hooks. + // The hooks are run after the mounts are setup, but before we switch to the new + // root, so that the old root is still available in the hooks for any mount + // manipulations. + if err := syncParentHooks(pipe); err != nil { + return err + } + if err := syscall.Chdir(config.Rootfs); err != nil { + return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs) + } + if config.NoPivotRoot { + err = msMoveRoot(config.Rootfs) + } else { + err = pivotRoot(config.Rootfs, config.PivotDir) + } + if err != nil { + return newSystemErrorWithCause(err, "jailing process inside rootfs") + } + if setupDev { + if err := reOpenDevNull(); err != nil { + return newSystemErrorWithCause(err, "reopening /dev/null inside container") + } + } + // remount dev as ro if specifed + for _, m := range config.Mounts { + if m.Destination == "/dev" { + if m.Flags&syscall.MS_RDONLY != 0 { + if err := remountReadonly(m.Destination); err != nil { + return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination) + } + } + break + } + } + // set rootfs ( / ) as readonly + if config.Readonlyfs { + if err := setReadonly(); err != nil { + return newSystemErrorWithCause(err, "setting rootfs as readonly") + } + } + syscall.Umask(0022) + return nil +} + +func mountCmd(cmd configs.Command) error { + command := exec.Command(cmd.Path, cmd.Args[:]...) + command.Env = cmd.Env + command.Dir = cmd.Dir + if out, err := command.CombinedOutput(); err != nil { + return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err) + } + return nil +} + +func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { + var ( + dest = m.Destination + ) + if !strings.HasPrefix(dest, rootfs) { + dest = filepath.Join(rootfs, dest) + } + + switch m.Device { + case "proc", "sysfs": + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + // Selinux kernels do not support labeling of /proc or /sys + return mountPropagate(m, rootfs, "") + case "mqueue": + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + if err := mountPropagate(m, rootfs, mountLabel); err != nil { + // older kernels do not support labeling of /dev/mqueue + if err := mountPropagate(m, rootfs, ""); err != nil { + return err + } + return label.SetFileLabel(dest, mountLabel) + } + return nil + case "tmpfs": + stat, err := os.Stat(dest) + if err != nil { + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + } + if err := mountPropagate(m, rootfs, mountLabel); err != nil { + return err + } + if stat != nil { + if err = os.Chmod(dest, stat.Mode()); err != nil { + return err + } + } + return nil + case "bind": + stat, err := os.Stat(m.Source) + if err != nil { + // error out if the source of a bind mount does not exist as we will be + // unable to bind anything to it. + return err + } + // ensure that the destination of the bind mount is resolved of symlinks at mount time because + // any previous mounts can invalidate the next mount's destination. + // this can happen when a user specifies mounts within other mounts to cause breakouts or other + // evil stuff to try to escape the container's rootfs. + if dest, err = symlink.FollowSymlinkInScope(filepath.Join(rootfs, m.Destination), rootfs); err != nil { + return err + } + if err := checkMountDestination(rootfs, dest); err != nil { + return err + } + // update the mount with the correct dest after symlinks are resolved. + m.Destination = dest + if err := createIfNotExists(dest, stat.IsDir()); err != nil { + return err + } + if err := mountPropagate(m, rootfs, mountLabel); err != nil { + return err + } + // bind mount won't change mount options, we need remount to make mount options effective. + // first check that we have non-default options required before attempting a remount + if m.Flags&^(syscall.MS_REC|syscall.MS_REMOUNT|syscall.MS_BIND) != 0 { + // only remount if unique mount options are set + if err := remount(m, rootfs); err != nil { + return err + } + } + + if m.Relabel != "" { + if err := label.Validate(m.Relabel); err != nil { + return err + } + shared := label.IsShared(m.Relabel) + if err := label.Relabel(m.Source, mountLabel, shared); err != nil { + return err + } + } + case "cgroup": + binds, err := getCgroupMounts(m) + if err != nil { + return err + } + var merged []string + for _, b := range binds { + ss := filepath.Base(b.Destination) + if strings.Contains(ss, ",") { + merged = append(merged, ss) + } + } + tmpfs := &configs.Mount{ + Source: "tmpfs", + Device: "tmpfs", + Destination: m.Destination, + Flags: defaultMountFlags, + Data: "mode=755", + PropagationFlags: m.PropagationFlags, + } + if err := mountToRootfs(tmpfs, rootfs, mountLabel); err != nil { + return err + } + for _, b := range binds { + if err := mountToRootfs(b, rootfs, mountLabel); err != nil { + return err + } + } + // create symlinks for merged cgroups + cwd, err := os.Getwd() + if err != nil { + return err + } + if err := os.Chdir(filepath.Join(rootfs, m.Destination)); err != nil { + return err + } + for _, mc := range merged { + for _, ss := range strings.Split(mc, ",") { + if err := os.Symlink(mc, ss); err != nil { + // if cgroup already exists, then okay(it could have been created before) + if os.IsExist(err) { + continue + } + os.Chdir(cwd) + return err + } + } + } + if err := os.Chdir(cwd); err != nil { + return err + } + if m.Flags&syscall.MS_RDONLY != 0 { + // remount cgroup root as readonly + mcgrouproot := &configs.Mount{ + Source: m.Destination, + Device: "bind", + Destination: m.Destination, + Flags: defaultMountFlags | syscall.MS_RDONLY | syscall.MS_BIND, + } + if err := remount(mcgrouproot, rootfs); err != nil { + return err + } + } + default: + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + return mountPropagate(m, rootfs, mountLabel) + } + return nil +} + +func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { + mounts, err := cgroups.GetCgroupMounts() + if err != nil { + return nil, err + } + + cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return nil, err + } + + var binds []*configs.Mount + + for _, mm := range mounts { + dir, err := mm.GetThisCgroupDir(cgroupPaths) + if err != nil { + return nil, err + } + relDir, err := filepath.Rel(mm.Root, dir) + if err != nil { + return nil, err + } + binds = append(binds, &configs.Mount{ + Device: "bind", + Source: filepath.Join(mm.Mountpoint, relDir), + Destination: filepath.Join(m.Destination, strings.Join(mm.Subsystems, ",")), + Flags: syscall.MS_BIND | syscall.MS_REC | m.Flags, + PropagationFlags: m.PropagationFlags, + }) + } + + return binds, nil +} + +// checkMountDestination checks to ensure that the mount destination is not over the top of /proc. +// dest is required to be an abs path and have any symlinks resolved before calling this function. +func checkMountDestination(rootfs, dest string) error { + if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) { + return fmt.Errorf("mounting into / is prohibited") + } + invalidDestinations := []string{ + "/proc", + } + // White list, it should be sub directories of invalid destinations + validDestinations := []string{ + // These entries can be bind mounted by files emulated by fuse, + // so commands like top, free displays stats in container. + "/proc/cpuinfo", + "/proc/diskstats", + "/proc/meminfo", + "/proc/stat", + "/proc/net/dev", + } + for _, valid := range validDestinations { + path, err := filepath.Rel(filepath.Join(rootfs, valid), dest) + if err != nil { + return err + } + if path == "." { + return nil + } + } + for _, invalid := range invalidDestinations { + path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest) + if err != nil { + return err + } + if path == "." || !strings.HasPrefix(path, "..") { + return fmt.Errorf("%q cannot be mounted because it is located inside %q", dest, invalid) + } + } + return nil +} + +func setupDevSymlinks(rootfs string) error { + var links = [][2]string{ + {"/proc/self/fd", "/dev/fd"}, + {"/proc/self/fd/0", "/dev/stdin"}, + {"/proc/self/fd/1", "/dev/stdout"}, + {"/proc/self/fd/2", "/dev/stderr"}, + } + // kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink + // in /dev if it exists in /proc. + if _, err := os.Stat("/proc/kcore"); err == nil { + links = append(links, [2]string{"/proc/kcore", "/dev/core"}) + } + for _, link := range links { + var ( + src = link[0] + dst = filepath.Join(rootfs, link[1]) + ) + if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) { + return fmt.Errorf("symlink %s %s %s", src, dst, err) + } + } + return nil +} + +// If stdin, stdout, and/or stderr are pointing to `/dev/null` in the parent's rootfs +// this method will make them point to `/dev/null` in this container's rootfs. This +// needs to be called after we chroot/pivot into the container's rootfs so that any +// symlinks are resolved locally. +func reOpenDevNull() error { + var stat, devNullStat syscall.Stat_t + file, err := os.OpenFile("/dev/null", os.O_RDWR, 0) + if err != nil { + return fmt.Errorf("Failed to open /dev/null - %s", err) + } + defer file.Close() + if err := syscall.Fstat(int(file.Fd()), &devNullStat); err != nil { + return err + } + for fd := 0; fd < 3; fd++ { + if err := syscall.Fstat(fd, &stat); err != nil { + return err + } + if stat.Rdev == devNullStat.Rdev { + // Close and re-open the fd. + if err := syscall.Dup3(int(file.Fd()), fd, 0); err != nil { + return err + } + } + } + return nil +} + +// Create the device nodes in the container. +func createDevices(config *configs.Config) error { + useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER) + oldMask := syscall.Umask(0000) + for _, node := range config.Devices { + // containers running in a user namespace are not allowed to mknod + // devices so we can just bind mount it from the host. + if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil { + syscall.Umask(oldMask) + return err + } + } + syscall.Umask(oldMask) + return nil +} + +func bindMountDeviceNode(dest string, node *configs.Device) error { + f, err := os.Create(dest) + if err != nil && !os.IsExist(err) { + return err + } + if f != nil { + f.Close() + } + return syscall.Mount(node.Path, dest, "bind", syscall.MS_BIND, "") +} + +// Creates the device node in the rootfs of the container. +func createDeviceNode(rootfs string, node *configs.Device, bind bool) error { + dest := filepath.Join(rootfs, node.Path) + if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil { + return err + } + + if bind { + return bindMountDeviceNode(dest, node) + } + if err := mknodDevice(dest, node); err != nil { + if os.IsExist(err) { + return nil + } else if os.IsPermission(err) { + return bindMountDeviceNode(dest, node) + } + return err + } + return nil +} + +func mknodDevice(dest string, node *configs.Device) error { + fileMode := node.FileMode + switch node.Type { + case 'c': + fileMode |= syscall.S_IFCHR + case 'b': + fileMode |= syscall.S_IFBLK + default: + return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path) + } + if err := syscall.Mknod(dest, uint32(fileMode), node.Mkdev()); err != nil { + return err + } + return syscall.Chown(dest, int(node.Uid), int(node.Gid)) +} + +func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info { + for _, m := range mountinfo { + if m.Mountpoint == dir { + return m + } + } + return nil +} + +// Get the parent mount point of directory passed in as argument. Also return +// optional fields. +func getParentMount(rootfs string) (string, string, error) { + var path string + + mountinfos, err := mount.GetMounts() + if err != nil { + return "", "", err + } + + mountinfo := getMountInfo(mountinfos, rootfs) + if mountinfo != nil { + return rootfs, mountinfo.Optional, nil + } + + path = rootfs + for { + path = filepath.Dir(path) + + mountinfo = getMountInfo(mountinfos, path) + if mountinfo != nil { + return path, mountinfo.Optional, nil + } + + if path == "/" { + break + } + } + + // If we are here, we did not find parent mount. Something is wrong. + return "", "", fmt.Errorf("Could not find parent mount of %s", rootfs) +} + +// Make parent mount private if it was shared +func rootfsParentMountPrivate(rootfs string) error { + sharedMount := false + + parentMount, optionalOpts, err := getParentMount(rootfs) + if err != nil { + return err + } + + optsSplit := strings.Split(optionalOpts, " ") + for _, opt := range optsSplit { + if strings.HasPrefix(opt, "shared:") { + sharedMount = true + break + } + } + + // Make parent mount PRIVATE if it was shared. It is needed for two + // reasons. First of all pivot_root() will fail if parent mount is + // shared. Secondly when we bind mount rootfs it will propagate to + // parent namespace and we don't want that to happen. + if sharedMount { + return syscall.Mount("", parentMount, "", syscall.MS_PRIVATE, "") + } + + return nil +} + +func prepareRoot(config *configs.Config) error { + flag := syscall.MS_SLAVE | syscall.MS_REC + if config.RootPropagation != 0 { + flag = config.RootPropagation + } + if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil { + return err + } + if config.NoPivotRoot { + if err := rootfsParentMountPrivate(config.Rootfs); err != nil { + return err + } + } + + return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "") +} + +func setReadonly() error { + return syscall.Mount("/", "/", "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, "") +} + +func setupPtmx(config *configs.Config, console *linuxConsole) error { + ptmx := filepath.Join(config.Rootfs, "dev/ptmx") + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Symlink("pts/ptmx", ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + if console != nil { + return console.mount(config.Rootfs, config.MountLabel) + } + return nil +} + +func pivotRoot(rootfs, pivotBaseDir string) (err error) { + if pivotBaseDir == "" { + pivotBaseDir = "/" + } + tmpDir := filepath.Join(rootfs, pivotBaseDir) + if err := os.MkdirAll(tmpDir, 0755); err != nil { + return fmt.Errorf("can't create tmp dir %s, error %v", tmpDir, err) + } + pivotDir, err := ioutil.TempDir(tmpDir, ".pivot_root") + if err != nil { + return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err) + } + defer func() { + errVal := os.Remove(pivotDir) + if err == nil { + err = errVal + } + }() + if err := syscall.PivotRoot(rootfs, pivotDir); err != nil { + // Make the parent mount private + if err := rootfsParentMountPrivate(rootfs); err != nil { + return err + } + // Try again + if err := syscall.PivotRoot(rootfs, pivotDir); err != nil { + return fmt.Errorf("pivot_root %s", err) + } + } + if err := syscall.Chdir("/"); err != nil { + return fmt.Errorf("chdir / %s", err) + } + // path to pivot dir now changed, update + pivotDir = filepath.Join(pivotBaseDir, filepath.Base(pivotDir)) + + // Make pivotDir rprivate to make sure any of the unmounts don't + // propagate to parent. + if err := syscall.Mount("", pivotDir, "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil { + return err + } + + if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { + return fmt.Errorf("unmount pivot_root dir %s", err) + } + return nil +} + +func msMoveRoot(rootfs string) error { + if err := syscall.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { + return err + } + if err := syscall.Chroot("."); err != nil { + return err + } + return syscall.Chdir("/") +} + +// createIfNotExists creates a file or a directory only if it does not already exist. +func createIfNotExists(path string, isDir bool) error { + if _, err := os.Stat(path); err != nil { + if os.IsNotExist(err) { + if isDir { + return os.MkdirAll(path, 0755) + } + if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + return err + } + f, err := os.OpenFile(path, os.O_CREATE, 0755) + if err != nil { + return err + } + f.Close() + } + } + return nil +} + +// remountReadonly will bind over the top of an existing path and ensure that it is read-only. +func remountReadonly(path string) error { + for i := 0; i < 5; i++ { + if err := syscall.Mount("", path, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil && !os.IsNotExist(err) { + switch err { + case syscall.EINVAL: + // Probably not a mountpoint, use bind-mount + if err := syscall.Mount(path, path, "", syscall.MS_BIND, ""); err != nil { + return err + } + return syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC|defaultMountFlags, "") + case syscall.EBUSY: + time.Sleep(100 * time.Millisecond) + continue + default: + return err + } + } + return nil + } + return fmt.Errorf("unable to mount %s as readonly max retries reached", path) +} + +// maskFile bind mounts /dev/null over the top of the specified path inside a container +// to avoid security issues from processes reading information from non-namespace aware mounts ( proc/kcore ). +func maskFile(path string) error { + if err := syscall.Mount("/dev/null", path, "", syscall.MS_BIND, ""); err != nil && !os.IsNotExist(err) { + return err + } + return nil +} + +// writeSystemProperty writes the value to a path under /proc/sys as determined from the key. +// For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward. +func writeSystemProperty(key, value string) error { + keyPath := strings.Replace(key, ".", "/", -1) + return ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0644) +} + +func remount(m *configs.Mount, rootfs string) error { + var ( + dest = m.Destination + ) + if !strings.HasPrefix(dest, rootfs) { + dest = filepath.Join(rootfs, dest) + } + if err := syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags|syscall.MS_REMOUNT), ""); err != nil { + return err + } + return nil +} + +// Do the mount operation followed by additional mounts required to take care +// of propagation flags. +func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error { + var ( + dest = m.Destination + data = label.FormatMountLabel(m.Data, mountLabel) + flags = m.Flags + ) + if dest == "/dev" { + flags &= ^syscall.MS_RDONLY + } + if !strings.HasPrefix(dest, rootfs) { + dest = filepath.Join(rootfs, dest) + } + + if err := syscall.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil { + return err + } + + for _, pflag := range m.PropagationFlags { + if err := syscall.Mount("", dest, "", uintptr(pflag), ""); err != nil { + return err + } + } + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/seccomp/config.go b/vendor/src/github.com/opencontainers/runc/libcontainer/seccomp/config.go new file mode 100644 index 00000000..ded5a6bb --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/seccomp/config.go @@ -0,0 +1,76 @@ +package seccomp + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var operators = map[string]configs.Operator{ + "SCMP_CMP_NE": configs.NotEqualTo, + "SCMP_CMP_LT": configs.LessThan, + "SCMP_CMP_LE": configs.LessThanOrEqualTo, + "SCMP_CMP_EQ": configs.EqualTo, + "SCMP_CMP_GE": configs.GreaterThanOrEqualTo, + "SCMP_CMP_GT": configs.GreaterThan, + "SCMP_CMP_MASKED_EQ": configs.MaskEqualTo, +} + +var actions = map[string]configs.Action{ + "SCMP_ACT_KILL": configs.Kill, + "SCMP_ACT_ERRNO": configs.Errno, + "SCMP_ACT_TRAP": configs.Trap, + "SCMP_ACT_ALLOW": configs.Allow, + "SCMP_ACT_TRACE": configs.Trace, +} + +var archs = map[string]string{ + "SCMP_ARCH_X86": "x86", + "SCMP_ARCH_X86_64": "amd64", + "SCMP_ARCH_X32": "x32", + "SCMP_ARCH_ARM": "arm", + "SCMP_ARCH_AARCH64": "arm64", + "SCMP_ARCH_MIPS": "mips", + "SCMP_ARCH_MIPS64": "mips64", + "SCMP_ARCH_MIPS64N32": "mips64n32", + "SCMP_ARCH_MIPSEL": "mipsel", + "SCMP_ARCH_MIPSEL64": "mipsel64", + "SCMP_ARCH_MIPSEL64N32": "mipsel64n32", + "SCMP_ARCH_PPC": "ppc", + "SCMP_ARCH_PPC64": "ppc64", + "SCMP_ARCH_PPC64LE": "ppc64le", + "SCMP_ARCH_S390": "s390", + "SCMP_ARCH_S390X": "s390x", +} + +// ConvertStringToOperator converts a string into a Seccomp comparison operator. +// Comparison operators use the names they are assigned by Libseccomp's header. +// Attempting to convert a string that is not a valid operator results in an +// error. +func ConvertStringToOperator(in string) (configs.Operator, error) { + if op, ok := operators[in]; ok == true { + return op, nil + } + return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in) +} + +// ConvertStringToAction converts a string into a Seccomp rule match action. +// Actions use the names they are assigned in Libseccomp's header, though some +// (notable, SCMP_ACT_TRACE) are not available in this implementation and will +// return errors. +// Attempting to convert a string that is not a valid action results in an +// error. +func ConvertStringToAction(in string) (configs.Action, error) { + if act, ok := actions[in]; ok == true { + return act, nil + } + return 0, fmt.Errorf("string %s is not a valid action for seccomp", in) +} + +// ConvertStringToArch converts a string into a Seccomp comparison arch. +func ConvertStringToArch(in string) (string, error) { + if arch, ok := archs[in]; ok == true { + return arch, nil + } + return "", fmt.Errorf("string %s is not a valid arch for seccomp", in) +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go new file mode 100644 index 00000000..ec0ac1c0 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go @@ -0,0 +1,229 @@ +// +build linux,cgo,seccomp + +package seccomp + +import ( + "bufio" + "fmt" + "os" + "strings" + "syscall" + + "github.com/opencontainers/runc/libcontainer/configs" + libseccomp "github.com/seccomp/libseccomp-golang" +) + +var ( + actAllow = libseccomp.ActAllow + actTrap = libseccomp.ActTrap + actKill = libseccomp.ActKill + actTrace = libseccomp.ActTrace.SetReturnCode(int16(syscall.EPERM)) + actErrno = libseccomp.ActErrno.SetReturnCode(int16(syscall.EPERM)) + + // SeccompModeFilter refers to the syscall argument SECCOMP_MODE_FILTER. + SeccompModeFilter = uintptr(2) +) + +// Filters given syscalls in a container, preventing them from being used +// Started in the container init process, and carried over to all child processes +// Setns calls, however, require a separate invocation, as they are not children +// of the init until they join the namespace +func InitSeccomp(config *configs.Seccomp) error { + if config == nil { + return fmt.Errorf("cannot initialize Seccomp - nil config passed") + } + + defaultAction, err := getAction(config.DefaultAction) + if err != nil { + return fmt.Errorf("error initializing seccomp - invalid default action") + } + + filter, err := libseccomp.NewFilter(defaultAction) + if err != nil { + return fmt.Errorf("error creating filter: %s", err) + } + + // Add extra architectures + for _, arch := range config.Architectures { + scmpArch, err := libseccomp.GetArchFromString(arch) + if err != nil { + return err + } + + if err := filter.AddArch(scmpArch); err != nil { + return err + } + } + + // Unset no new privs bit + if err := filter.SetNoNewPrivsBit(false); err != nil { + return fmt.Errorf("error setting no new privileges: %s", err) + } + + // Add a rule for each syscall + for _, call := range config.Syscalls { + if call == nil { + return fmt.Errorf("encountered nil syscall while initializing Seccomp") + } + + if err = matchCall(filter, call); err != nil { + return err + } + } + + if err = filter.Load(); err != nil { + return fmt.Errorf("error loading seccomp filter into kernel: %s", err) + } + + return nil +} + +// IsEnabled returns if the kernel has been configured to support seccomp. +func IsEnabled() bool { + // Try to read from /proc/self/status for kernels > 3.8 + s, err := parseStatusFile("/proc/self/status") + if err != nil { + // Check if Seccomp is supported, via CONFIG_SECCOMP. + if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_GET_SECCOMP, 0, 0); err != syscall.EINVAL { + // Make sure the kernel has CONFIG_SECCOMP_FILTER. + if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_SECCOMP, SeccompModeFilter, 0); err != syscall.EINVAL { + return true + } + } + return false + } + _, ok := s["Seccomp"] + return ok +} + +// Convert Libcontainer Action to Libseccomp ScmpAction +func getAction(act configs.Action) (libseccomp.ScmpAction, error) { + switch act { + case configs.Kill: + return actKill, nil + case configs.Errno: + return actErrno, nil + case configs.Trap: + return actTrap, nil + case configs.Allow: + return actAllow, nil + case configs.Trace: + return actTrace, nil + default: + return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule") + } +} + +// Convert Libcontainer Operator to Libseccomp ScmpCompareOp +func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) { + switch op { + case configs.EqualTo: + return libseccomp.CompareEqual, nil + case configs.NotEqualTo: + return libseccomp.CompareNotEqual, nil + case configs.GreaterThan: + return libseccomp.CompareGreater, nil + case configs.GreaterThanOrEqualTo: + return libseccomp.CompareGreaterEqual, nil + case configs.LessThan: + return libseccomp.CompareLess, nil + case configs.LessThanOrEqualTo: + return libseccomp.CompareLessOrEqual, nil + case configs.MaskEqualTo: + return libseccomp.CompareMaskedEqual, nil + default: + return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule") + } +} + +// Convert Libcontainer Arg to Libseccomp ScmpCondition +func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) { + cond := libseccomp.ScmpCondition{} + + if arg == nil { + return cond, fmt.Errorf("cannot convert nil to syscall condition") + } + + op, err := getOperator(arg.Op) + if err != nil { + return cond, err + } + + return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo) +} + +// Add a rule to match a single syscall +func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { + if call == nil || filter == nil { + return fmt.Errorf("cannot use nil as syscall to block") + } + + if len(call.Name) == 0 { + return fmt.Errorf("empty string is not a valid syscall") + } + + // If we can't resolve the syscall, assume it's not supported on this kernel + // Ignore it, don't error out + callNum, err := libseccomp.GetSyscallFromName(call.Name) + if err != nil { + return nil + } + + // Convert the call's action to the libseccomp equivalent + callAct, err := getAction(call.Action) + if err != nil { + return err + } + + // Unconditional match - just add the rule + if len(call.Args) == 0 { + if err = filter.AddRule(callNum, callAct); err != nil { + return err + } + } else { + // Conditional match - convert the per-arg rules into library format + conditions := []libseccomp.ScmpCondition{} + + for _, cond := range call.Args { + newCond, err := getCondition(cond) + if err != nil { + return err + } + + conditions = append(conditions, newCond) + } + + if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil { + return err + } + } + + return nil +} + +func parseStatusFile(path string) (map[string]string, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + s := bufio.NewScanner(f) + status := make(map[string]string) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + text := s.Text() + parts := strings.Split(text, ":") + + if len(parts) <= 1 { + continue + } + + status[parts[0]] = parts[1] + } + return status, nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go b/vendor/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go new file mode 100644 index 00000000..44df1ad4 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go @@ -0,0 +1,24 @@ +// +build !linux !cgo !seccomp + +package seccomp + +import ( + "errors" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") + +// InitSeccomp does nothing because seccomp is not supported. +func InitSeccomp(config *configs.Seccomp) error { + if config != nil { + return ErrSeccompNotEnabled + } + return nil +} + +// IsEnabled returns false, because it is not supported. +func IsEnabled() bool { + return false +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/selinux/selinux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/selinux/selinux.go new file mode 100644 index 00000000..2a18e2ad --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/selinux/selinux.go @@ -0,0 +1,499 @@ +// +build linux + +package selinux + +import ( + "bufio" + "crypto/rand" + "encoding/binary" + "fmt" + "io" + "os" + "path/filepath" + "regexp" + "strconv" + "strings" + "sync" + "syscall" + + "github.com/opencontainers/runc/libcontainer/system" +) + +const ( + Enforcing = 1 + Permissive = 0 + Disabled = -1 + selinuxDir = "/etc/selinux/" + selinuxConfig = selinuxDir + "config" + selinuxTypeTag = "SELINUXTYPE" + selinuxTag = "SELINUX" + selinuxPath = "/sys/fs/selinux" + xattrNameSelinux = "security.selinux" + stRdOnly = 0x01 +) + +var ( + assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`) + mcsList = make(map[string]bool) + mcsLock sync.Mutex + selinuxfs = "unknown" + selinuxEnabled = false // Stores whether selinux is currently enabled + selinuxEnabledChecked = false // Stores whether selinux enablement has been checked or established yet +) + +type SELinuxContext map[string]string + +// SetDisabled disables selinux support for the package +func SetDisabled() { + selinuxEnabled, selinuxEnabledChecked = false, true +} + +// getSelinuxMountPoint returns the path to the mountpoint of an selinuxfs +// filesystem or an empty string if no mountpoint is found. Selinuxfs is +// a proc-like pseudo-filesystem that exposes the selinux policy API to +// processes. The existence of an selinuxfs mount is used to determine +// whether selinux is currently enabled or not. +func getSelinuxMountPoint() string { + if selinuxfs != "unknown" { + return selinuxfs + } + selinuxfs = "" + + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return selinuxfs + } + defer f.Close() + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + txt := scanner.Text() + // Safe as mountinfo encodes mountpoints with spaces as \040. + sepIdx := strings.Index(txt, " - ") + if sepIdx == -1 { + continue + } + if !strings.Contains(txt[sepIdx:], "selinuxfs") { + continue + } + fields := strings.Split(txt, " ") + if len(fields) < 5 { + continue + } + selinuxfs = fields[4] + break + } + + if selinuxfs != "" { + var buf syscall.Statfs_t + syscall.Statfs(selinuxfs, &buf) + if (buf.Flags & stRdOnly) == 1 { + selinuxfs = "" + } + } + return selinuxfs +} + +// SelinuxEnabled returns whether selinux is currently enabled. +func SelinuxEnabled() bool { + if selinuxEnabledChecked { + return selinuxEnabled + } + selinuxEnabledChecked = true + if fs := getSelinuxMountPoint(); fs != "" { + if con, _ := Getcon(); con != "kernel" { + selinuxEnabled = true + } + } + return selinuxEnabled +} + +func readConfig(target string) (value string) { + var ( + val, key string + bufin *bufio.Reader + ) + + in, err := os.Open(selinuxConfig) + if err != nil { + return "" + } + defer in.Close() + + bufin = bufio.NewReader(in) + + for done := false; !done; { + var line string + if line, err = bufin.ReadString('\n'); err != nil { + if err != io.EOF { + return "" + } + done = true + } + line = strings.TrimSpace(line) + if len(line) == 0 { + // Skip blank lines + continue + } + if line[0] == ';' || line[0] == '#' { + // Skip comments + continue + } + if groups := assignRegex.FindStringSubmatch(line); groups != nil { + key, val = strings.TrimSpace(groups[1]), strings.TrimSpace(groups[2]) + if key == target { + return strings.Trim(val, "\"") + } + } + } + return "" +} + +func getSELinuxPolicyRoot() string { + return selinuxDir + readConfig(selinuxTypeTag) +} + +func readCon(name string) (string, error) { + var val string + + in, err := os.Open(name) + if err != nil { + return "", err + } + defer in.Close() + + _, err = fmt.Fscanf(in, "%s", &val) + return val, err +} + +// Setfilecon sets the SELinux label for this path or returns an error. +func Setfilecon(path string, scon string) error { + return system.Lsetxattr(path, xattrNameSelinux, []byte(scon), 0) +} + +// Getfilecon returns the SELinux label for this path or returns an error. +func Getfilecon(path string) (string, error) { + con, err := system.Lgetxattr(path, xattrNameSelinux) + if err != nil { + return "", err + } + // Trim the NUL byte at the end of the byte buffer, if present. + if len(con) > 0 && con[len(con)-1] == '\x00' { + con = con[:len(con)-1] + } + return string(con), nil +} + +func Setfscreatecon(scon string) error { + return writeCon(fmt.Sprintf("/proc/self/task/%d/attr/fscreate", syscall.Gettid()), scon) +} + +func Getfscreatecon() (string, error) { + return readCon(fmt.Sprintf("/proc/self/task/%d/attr/fscreate", syscall.Gettid())) +} + +// Getcon returns the SELinux label of the current process thread, or an error. +func Getcon() (string, error) { + return readCon(fmt.Sprintf("/proc/self/task/%d/attr/current", syscall.Gettid())) +} + +// Getpidcon returns the SELinux label of the given pid, or an error. +func Getpidcon(pid int) (string, error) { + return readCon(fmt.Sprintf("/proc/%d/attr/current", pid)) +} + +func Getexeccon() (string, error) { + return readCon(fmt.Sprintf("/proc/self/task/%d/attr/exec", syscall.Gettid())) +} + +func writeCon(name string, val string) error { + out, err := os.OpenFile(name, os.O_WRONLY, 0) + if err != nil { + return err + } + defer out.Close() + + if val != "" { + _, err = out.Write([]byte(val)) + } else { + _, err = out.Write(nil) + } + return err +} + +func Setexeccon(scon string) error { + return writeCon(fmt.Sprintf("/proc/self/task/%d/attr/exec", syscall.Gettid()), scon) +} + +func (c SELinuxContext) Get() string { + return fmt.Sprintf("%s:%s:%s:%s", c["user"], c["role"], c["type"], c["level"]) +} + +func NewContext(scon string) SELinuxContext { + c := make(SELinuxContext) + + if len(scon) != 0 { + con := strings.SplitN(scon, ":", 4) + c["user"] = con[0] + c["role"] = con[1] + c["type"] = con[2] + c["level"] = con[3] + } + return c +} + +func ReserveLabel(scon string) { + if len(scon) != 0 { + con := strings.SplitN(scon, ":", 4) + mcsAdd(con[3]) + } +} + +func selinuxEnforcePath() string { + return fmt.Sprintf("%s/enforce", selinuxPath) +} + +func SelinuxGetEnforce() int { + var enforce int + + enforceS, err := readCon(selinuxEnforcePath()) + if err != nil { + return -1 + } + + enforce, err = strconv.Atoi(string(enforceS)) + if err != nil { + return -1 + } + return enforce +} + +func SelinuxSetEnforce(mode int) error { + return writeCon(selinuxEnforcePath(), fmt.Sprintf("%d", mode)) +} + +func SelinuxGetEnforceMode() int { + switch readConfig(selinuxTag) { + case "enforcing": + return Enforcing + case "permissive": + return Permissive + } + return Disabled +} + +func mcsAdd(mcs string) error { + mcsLock.Lock() + defer mcsLock.Unlock() + if mcsList[mcs] { + return fmt.Errorf("MCS Label already exists") + } + mcsList[mcs] = true + return nil +} + +func mcsDelete(mcs string) { + mcsLock.Lock() + mcsList[mcs] = false + mcsLock.Unlock() +} + +func IntToMcs(id int, catRange uint32) string { + var ( + SETSIZE = int(catRange) + TIER = SETSIZE + ORD = id + ) + + if id < 1 || id > 523776 { + return "" + } + + for ORD > TIER { + ORD = ORD - TIER + TIER-- + } + TIER = SETSIZE - TIER + ORD = ORD + TIER + return fmt.Sprintf("s0:c%d,c%d", TIER, ORD) +} + +func uniqMcs(catRange uint32) string { + var ( + n uint32 + c1, c2 uint32 + mcs string + ) + + for { + binary.Read(rand.Reader, binary.LittleEndian, &n) + c1 = n % catRange + binary.Read(rand.Reader, binary.LittleEndian, &n) + c2 = n % catRange + if c1 == c2 { + continue + } else { + if c1 > c2 { + t := c1 + c1 = c2 + c2 = t + } + } + mcs = fmt.Sprintf("s0:c%d,c%d", c1, c2) + if err := mcsAdd(mcs); err != nil { + continue + } + break + } + return mcs +} + +func FreeLxcContexts(scon string) { + if len(scon) != 0 { + con := strings.SplitN(scon, ":", 4) + mcsDelete(con[3]) + } +} + +func GetLxcContexts() (processLabel string, fileLabel string) { + var ( + val, key string + bufin *bufio.Reader + ) + + if !SelinuxEnabled() { + return "", "" + } + lxcPath := fmt.Sprintf("%s/contexts/lxc_contexts", getSELinuxPolicyRoot()) + in, err := os.Open(lxcPath) + if err != nil { + return "", "" + } + defer in.Close() + + bufin = bufio.NewReader(in) + + for done := false; !done; { + var line string + if line, err = bufin.ReadString('\n'); err != nil { + if err == io.EOF { + done = true + } else { + goto exit + } + } + line = strings.TrimSpace(line) + if len(line) == 0 { + // Skip blank lines + continue + } + if line[0] == ';' || line[0] == '#' { + // Skip comments + continue + } + if groups := assignRegex.FindStringSubmatch(line); groups != nil { + key, val = strings.TrimSpace(groups[1]), strings.TrimSpace(groups[2]) + if key == "process" { + processLabel = strings.Trim(val, "\"") + } + if key == "file" { + fileLabel = strings.Trim(val, "\"") + } + } + } + + if processLabel == "" || fileLabel == "" { + return "", "" + } + +exit: + // mcs := IntToMcs(os.Getpid(), 1024) + mcs := uniqMcs(1024) + scon := NewContext(processLabel) + scon["level"] = mcs + processLabel = scon.Get() + scon = NewContext(fileLabel) + scon["level"] = mcs + fileLabel = scon.Get() + return processLabel, fileLabel +} + +func SecurityCheckContext(val string) error { + return writeCon(fmt.Sprintf("%s.context", selinuxPath), val) +} + +func CopyLevel(src, dest string) (string, error) { + if src == "" { + return "", nil + } + if err := SecurityCheckContext(src); err != nil { + return "", err + } + if err := SecurityCheckContext(dest); err != nil { + return "", err + } + scon := NewContext(src) + tcon := NewContext(dest) + mcsDelete(tcon["level"]) + mcsAdd(scon["level"]) + tcon["level"] = scon["level"] + return tcon.Get(), nil +} + +// Prevent users from relabing system files +func badPrefix(fpath string) error { + var badprefixes = []string{"/usr"} + + for _, prefix := range badprefixes { + if fpath == prefix || strings.HasPrefix(fpath, fmt.Sprintf("%s/", prefix)) { + return fmt.Errorf("Relabeling content in %s is not allowed.", prefix) + } + } + return nil +} + +// Chcon changes the fpath file object to the SELinux label scon. +// If the fpath is a directory and recurse is true Chcon will walk the +// directory tree setting the label +func Chcon(fpath string, scon string, recurse bool) error { + if scon == "" { + return nil + } + if err := badPrefix(fpath); err != nil { + return err + } + callback := func(p string, info os.FileInfo, err error) error { + return Setfilecon(p, scon) + } + + if recurse { + return filepath.Walk(fpath, callback) + } + + return Setfilecon(fpath, scon) +} + +// DupSecOpt takes an SELinux process label and returns security options that +// can will set the SELinux Type and Level for future container processes +func DupSecOpt(src string) []string { + if src == "" { + return nil + } + con := NewContext(src) + if con["user"] == "" || + con["role"] == "" || + con["type"] == "" || + con["level"] == "" { + return nil + } + return []string{"label=user:" + con["user"], + "label=role:" + con["role"], + "label=type:" + con["type"], + "label=level:" + con["level"]} +} + +// DisableSecOpt returns a security opt that can be used to disabling SELinux +// labeling support for future container processes +func DisableSecOpt() []string { + return []string{"label=disable"} +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/setgroups_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/setgroups_linux.go new file mode 100644 index 00000000..c7bdb605 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/setgroups_linux.go @@ -0,0 +1,11 @@ +// +build linux,go1.5 + +package libcontainer + +import "syscall" + +// Set the GidMappingsEnableSetgroups member to true, so the process's +// setgroups proc entry wont be set to 'deny' if GidMappings are set +func enableSetgroups(sys *syscall.SysProcAttr) { + sys.GidMappingsEnableSetgroups = true +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/setns_init_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/setns_init_linux.go new file mode 100644 index 00000000..4b78ae80 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/setns_init_linux.go @@ -0,0 +1,53 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + + "github.com/opencontainers/runc/libcontainer/apparmor" + "github.com/opencontainers/runc/libcontainer/keys" + "github.com/opencontainers/runc/libcontainer/label" + "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/system" +) + +// linuxSetnsInit performs the container's initialization for running a new process +// inside an existing container. +type linuxSetnsInit struct { + config *initConfig +} + +func (l *linuxSetnsInit) getSessionRingName() string { + return fmt.Sprintf("_ses.%s", l.config.ContainerId) +} + +func (l *linuxSetnsInit) Init() error { + if !l.config.Config.NoNewKeyring { + // do not inherit the parent's session keyring + if _, err := keyctl.JoinSessionKeyring(l.getSessionRingName()); err != nil { + return err + } + } + if l.config.NoNewPrivileges { + if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { + return err + } + } + if l.config.Config.Seccomp != nil { + if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return err + } + } + if err := finalizeNamespace(l.config); err != nil { + return err + } + if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { + return err + } + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + return err + } + return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go new file mode 100644 index 00000000..8c0540e8 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go @@ -0,0 +1,750 @@ +// +build linux + +// Package specconv implements conversion of specifications to libcontainer +// configurations +package specconv + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/seccomp" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" +) + +const wildcard = -1 + +var namespaceMapping = map[specs.NamespaceType]configs.NamespaceType{ + specs.PIDNamespace: configs.NEWPID, + specs.NetworkNamespace: configs.NEWNET, + specs.MountNamespace: configs.NEWNS, + specs.UserNamespace: configs.NEWUSER, + specs.IPCNamespace: configs.NEWIPC, + specs.UTSNamespace: configs.NEWUTS, +} + +var mountPropagationMapping = map[string]int{ + "rprivate": syscall.MS_PRIVATE | syscall.MS_REC, + "private": syscall.MS_PRIVATE, + "rslave": syscall.MS_SLAVE | syscall.MS_REC, + "slave": syscall.MS_SLAVE, + "rshared": syscall.MS_SHARED | syscall.MS_REC, + "shared": syscall.MS_SHARED, + "": syscall.MS_PRIVATE | syscall.MS_REC, +} + +var allowedDevices = []*configs.Device{ + // allow mknod for any device + { + Type: 'c', + Major: wildcard, + Minor: wildcard, + Permissions: "m", + Allow: true, + }, + { + Type: 'b', + Major: wildcard, + Minor: wildcard, + Permissions: "m", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/null", + Major: 1, + Minor: 3, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/random", + Major: 1, + Minor: 8, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/full", + Major: 1, + Minor: 7, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/tty", + Major: 5, + Minor: 0, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/zero", + Major: 1, + Minor: 5, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/urandom", + Major: 1, + Minor: 9, + Permissions: "rwm", + Allow: true, + }, + { + Path: "/dev/console", + Type: 'c', + Major: 5, + Minor: 1, + Permissions: "rwm", + Allow: true, + }, + // /dev/pts/ - pts namespaces are "coming soon" + { + Path: "", + Type: 'c', + Major: 136, + Minor: wildcard, + Permissions: "rwm", + Allow: true, + }, + { + Path: "", + Type: 'c', + Major: 5, + Minor: 2, + Permissions: "rwm", + Allow: true, + }, + // tuntap + { + Path: "", + Type: 'c', + Major: 10, + Minor: 200, + Permissions: "rwm", + Allow: true, + }, +} + +type CreateOpts struct { + CgroupName string + UseSystemdCgroup bool + NoPivotRoot bool + NoNewKeyring bool + Spec *specs.Spec +} + +// CreateLibcontainerConfig creates a new libcontainer configuration from a +// given specification and a cgroup name +func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { + // runc's cwd will always be the bundle path + rcwd, err := os.Getwd() + if err != nil { + return nil, err + } + cwd, err := filepath.Abs(rcwd) + if err != nil { + return nil, err + } + spec := opts.Spec + rootfsPath := spec.Root.Path + if !filepath.IsAbs(rootfsPath) { + rootfsPath = filepath.Join(cwd, rootfsPath) + } + labels := []string{} + for k, v := range spec.Annotations { + labels = append(labels, fmt.Sprintf("%s=%s", k, v)) + } + config := &configs.Config{ + Rootfs: rootfsPath, + NoPivotRoot: opts.NoPivotRoot, + Readonlyfs: spec.Root.Readonly, + Hostname: spec.Hostname, + Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)), + NoNewKeyring: opts.NoNewKeyring, + } + + exists := false + if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists { + return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation) + } + + for _, ns := range spec.Linux.Namespaces { + t, exists := namespaceMapping[ns.Type] + if !exists { + return nil, fmt.Errorf("namespace %q does not exist", ns) + } + config.Namespaces.Add(t, ns.Path) + } + if config.Namespaces.Contains(configs.NEWNET) { + config.Networks = []*configs.Network{ + { + Type: "loopback", + }, + } + } + for _, m := range spec.Mounts { + config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m)) + } + if err := createDevices(spec, config); err != nil { + return nil, err + } + if err := setupUserNamespace(spec, config); err != nil { + return nil, err + } + c, err := createCgroupConfig(opts.CgroupName, opts.UseSystemdCgroup, spec) + if err != nil { + return nil, err + } + config.Cgroups = c + // set extra path masking for libcontainer for the various unsafe places in proc + config.MaskPaths = spec.Linux.MaskedPaths + config.ReadonlyPaths = spec.Linux.ReadonlyPaths + if spec.Linux.Seccomp != nil { + seccomp, err := setupSeccomp(spec.Linux.Seccomp) + if err != nil { + return nil, err + } + config.Seccomp = seccomp + } + if spec.Process.SelinuxLabel != "" { + config.ProcessLabel = spec.Process.SelinuxLabel + } + config.Sysctl = spec.Linux.Sysctl + if spec.Linux.Resources != nil && spec.Linux.Resources.OOMScoreAdj != nil { + config.OomScoreAdj = *spec.Linux.Resources.OOMScoreAdj + } + for _, g := range spec.Process.User.AdditionalGids { + config.AdditionalGroups = append(config.AdditionalGroups, strconv.FormatUint(uint64(g), 10)) + } + createHooks(spec, config) + config.MountLabel = spec.Linux.MountLabel + config.Version = specs.Version + return config, nil +} + +func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount { + flags, pgflags, data := parseMountOptions(m.Options) + source := m.Source + if m.Type == "bind" { + if !filepath.IsAbs(source) { + source = filepath.Join(cwd, m.Source) + } + } + return &configs.Mount{ + Device: m.Type, + Source: source, + Destination: m.Destination, + Data: data, + Flags: flags, + PropagationFlags: pgflags, + } +} + +func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (*configs.Cgroup, error) { + var ( + err error + myCgroupPath string + ) + + c := &configs.Cgroup{ + Resources: &configs.Resources{}, + } + + if spec.Linux.CgroupsPath != nil { + myCgroupPath = libcontainerUtils.CleanPath(*spec.Linux.CgroupsPath) + if useSystemdCgroup { + myCgroupPath = *spec.Linux.CgroupsPath + } + } + + if useSystemdCgroup { + if myCgroupPath == "" { + c.Parent = "system.slice" + c.ScopePrefix = "runc" + c.Name = name + } else { + // Parse the path from expected "slice:prefix:name" + // for e.g. "system.slice:docker:1234" + parts := strings.Split(myCgroupPath, ":") + if len(parts) != 3 { + return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups") + } + c.Parent = parts[0] + c.ScopePrefix = parts[1] + c.Name = parts[2] + } + } else { + if myCgroupPath == "" { + myCgroupPath, err = cgroups.GetThisCgroupDir("devices") + if err != nil { + return nil, err + } + myCgroupPath = filepath.Join(myCgroupPath, name) + } + c.Path = myCgroupPath + } + + c.Resources.AllowedDevices = allowedDevices + r := spec.Linux.Resources + if r == nil { + return c, nil + } + for i, d := range spec.Linux.Resources.Devices { + var ( + t = "a" + major = int64(-1) + minor = int64(-1) + ) + if d.Type != nil { + t = *d.Type + } + if d.Major != nil { + major = *d.Major + } + if d.Minor != nil { + minor = *d.Minor + } + if d.Access == nil || *d.Access == "" { + return nil, fmt.Errorf("device access at %d field cannot be empty", i) + } + dt, err := stringToDeviceRune(t) + if err != nil { + return nil, err + } + dd := &configs.Device{ + Type: dt, + Major: major, + Minor: minor, + Permissions: *d.Access, + Allow: d.Allow, + } + c.Resources.Devices = append(c.Resources.Devices, dd) + } + // append the default allowed devices to the end of the list + c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) + if r.Memory != nil { + if r.Memory.Limit != nil { + c.Resources.Memory = int64(*r.Memory.Limit) + } + if r.Memory.Reservation != nil { + c.Resources.MemoryReservation = int64(*r.Memory.Reservation) + } + if r.Memory.Swap != nil { + c.Resources.MemorySwap = int64(*r.Memory.Swap) + } + if r.Memory.Kernel != nil { + c.Resources.KernelMemory = int64(*r.Memory.Kernel) + } + if r.Memory.KernelTCP != nil { + c.Resources.KernelMemoryTCP = int64(*r.Memory.KernelTCP) + } + if r.Memory.Swappiness != nil { + swappiness := int64(*r.Memory.Swappiness) + c.Resources.MemorySwappiness = &swappiness + } + } + if r.CPU != nil { + if r.CPU.Shares != nil { + c.Resources.CpuShares = int64(*r.CPU.Shares) + } + if r.CPU.Quota != nil { + c.Resources.CpuQuota = int64(*r.CPU.Quota) + } + if r.CPU.Period != nil { + c.Resources.CpuPeriod = int64(*r.CPU.Period) + } + if r.CPU.RealtimeRuntime != nil { + c.Resources.CpuRtRuntime = int64(*r.CPU.RealtimeRuntime) + } + if r.CPU.RealtimePeriod != nil { + c.Resources.CpuRtPeriod = int64(*r.CPU.RealtimePeriod) + } + if r.CPU.Cpus != nil { + c.Resources.CpusetCpus = *r.CPU.Cpus + } + if r.CPU.Mems != nil { + c.Resources.CpusetMems = *r.CPU.Mems + } + } + if r.Pids != nil { + c.Resources.PidsLimit = *r.Pids.Limit + } + if r.BlockIO != nil { + if r.BlockIO.Weight != nil { + c.Resources.BlkioWeight = *r.BlockIO.Weight + } + if r.BlockIO.LeafWeight != nil { + c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight + } + if r.BlockIO.WeightDevice != nil { + for _, wd := range r.BlockIO.WeightDevice { + var weight, leafWeight uint16 + if wd.Weight != nil { + weight = *wd.Weight + } + if wd.LeafWeight != nil { + leafWeight = *wd.LeafWeight + } + weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight) + c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice) + } + } + if r.BlockIO.ThrottleReadBpsDevice != nil { + for _, td := range r.BlockIO.ThrottleReadBpsDevice { + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate) + c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleWriteBpsDevice != nil { + for _, td := range r.BlockIO.ThrottleWriteBpsDevice { + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate) + c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleReadIOPSDevice != nil { + for _, td := range r.BlockIO.ThrottleReadIOPSDevice { + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate) + c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleWriteIOPSDevice != nil { + for _, td := range r.BlockIO.ThrottleWriteIOPSDevice { + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate) + c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice) + } + } + } + for _, l := range r.HugepageLimits { + c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{ + Pagesize: *l.Pagesize, + Limit: *l.Limit, + }) + } + if r.DisableOOMKiller != nil { + c.Resources.OomKillDisable = *r.DisableOOMKiller + } + if r.Network != nil { + if r.Network.ClassID != nil { + c.Resources.NetClsClassid = string(*r.Network.ClassID) + } + for _, m := range r.Network.Priorities { + c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{ + Interface: m.Name, + Priority: int64(m.Priority), + }) + } + } + return c, nil +} + +func stringToDeviceRune(s string) (rune, error) { + switch s { + case "a": + return 'a', nil + case "b": + return 'b', nil + case "c": + return 'c', nil + default: + return 0, fmt.Errorf("invalid device type %q", s) + } +} + +func createDevices(spec *specs.Spec, config *configs.Config) error { + // add whitelisted devices + config.Devices = []*configs.Device{ + { + Type: 'c', + Path: "/dev/null", + Major: 1, + Minor: 3, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/random", + Major: 1, + Minor: 8, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/full", + Major: 1, + Minor: 7, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/tty", + Major: 5, + Minor: 0, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/zero", + Major: 1, + Minor: 5, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/urandom", + Major: 1, + Minor: 9, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + } + // merge in additional devices from the spec + for _, d := range spec.Linux.Devices { + var uid, gid uint32 + if d.UID != nil { + uid = *d.UID + } + if d.GID != nil { + gid = *d.GID + } + dt, err := stringToDeviceRune(d.Type) + if err != nil { + return err + } + device := &configs.Device{ + Type: dt, + Path: d.Path, + Major: d.Major, + Minor: d.Minor, + FileMode: *d.FileMode, + Uid: uid, + Gid: gid, + } + config.Devices = append(config.Devices, device) + } + return nil +} + +func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { + if len(spec.Linux.UIDMappings) == 0 { + return nil + } + create := func(m specs.IDMapping) configs.IDMap { + return configs.IDMap{ + HostID: int(m.HostID), + ContainerID: int(m.ContainerID), + Size: int(m.Size), + } + } + for _, m := range spec.Linux.UIDMappings { + config.UidMappings = append(config.UidMappings, create(m)) + } + for _, m := range spec.Linux.GIDMappings { + config.GidMappings = append(config.GidMappings, create(m)) + } + rootUID, err := config.HostUID() + if err != nil { + return err + } + rootGID, err := config.HostGID() + if err != nil { + return err + } + for _, node := range config.Devices { + node.Uid = uint32(rootUID) + node.Gid = uint32(rootGID) + } + return nil +} + +// parseMountOptions parses the string and returns the flags, propagation +// flags and any mount data that it contains. +func parseMountOptions(options []string) (int, []int, string) { + var ( + flag int + pgflag []int + data []string + ) + flags := map[string]struct { + clear bool + flag int + }{ + "async": {true, syscall.MS_SYNCHRONOUS}, + "atime": {true, syscall.MS_NOATIME}, + "bind": {false, syscall.MS_BIND}, + "defaults": {false, 0}, + "dev": {true, syscall.MS_NODEV}, + "diratime": {true, syscall.MS_NODIRATIME}, + "dirsync": {false, syscall.MS_DIRSYNC}, + "exec": {true, syscall.MS_NOEXEC}, + "mand": {false, syscall.MS_MANDLOCK}, + "noatime": {false, syscall.MS_NOATIME}, + "nodev": {false, syscall.MS_NODEV}, + "nodiratime": {false, syscall.MS_NODIRATIME}, + "noexec": {false, syscall.MS_NOEXEC}, + "nomand": {true, syscall.MS_MANDLOCK}, + "norelatime": {true, syscall.MS_RELATIME}, + "nostrictatime": {true, syscall.MS_STRICTATIME}, + "nosuid": {false, syscall.MS_NOSUID}, + "rbind": {false, syscall.MS_BIND | syscall.MS_REC}, + "relatime": {false, syscall.MS_RELATIME}, + "remount": {false, syscall.MS_REMOUNT}, + "ro": {false, syscall.MS_RDONLY}, + "rw": {true, syscall.MS_RDONLY}, + "strictatime": {false, syscall.MS_STRICTATIME}, + "suid": {true, syscall.MS_NOSUID}, + "sync": {false, syscall.MS_SYNCHRONOUS}, + } + propagationFlags := map[string]struct { + clear bool + flag int + }{ + "private": {false, syscall.MS_PRIVATE}, + "shared": {false, syscall.MS_SHARED}, + "slave": {false, syscall.MS_SLAVE}, + "unbindable": {false, syscall.MS_UNBINDABLE}, + "rprivate": {false, syscall.MS_PRIVATE | syscall.MS_REC}, + "rshared": {false, syscall.MS_SHARED | syscall.MS_REC}, + "rslave": {false, syscall.MS_SLAVE | syscall.MS_REC}, + "runbindable": {false, syscall.MS_UNBINDABLE | syscall.MS_REC}, + } + for _, o := range options { + // If the option does not exist in the flags table or the flag + // is not supported on the platform, + // then it is a data value for a specific fs type + if f, exists := flags[o]; exists && f.flag != 0 { + if f.clear { + flag &= ^f.flag + } else { + flag |= f.flag + } + } else if f, exists := propagationFlags[o]; exists && f.flag != 0 { + pgflag = append(pgflag, f.flag) + } else { + data = append(data, o) + } + } + return flag, pgflag, strings.Join(data, ",") +} + +func setupSeccomp(config *specs.Seccomp) (*configs.Seccomp, error) { + if config == nil { + return nil, nil + } + + // No default action specified, no syscalls listed, assume seccomp disabled + if config.DefaultAction == "" && len(config.Syscalls) == 0 { + return nil, nil + } + + newConfig := new(configs.Seccomp) + newConfig.Syscalls = []*configs.Syscall{} + + if len(config.Architectures) > 0 { + newConfig.Architectures = []string{} + for _, arch := range config.Architectures { + newArch, err := seccomp.ConvertStringToArch(string(arch)) + if err != nil { + return nil, err + } + newConfig.Architectures = append(newConfig.Architectures, newArch) + } + } + + // Convert default action from string representation + newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction)) + if err != nil { + return nil, err + } + newConfig.DefaultAction = newDefaultAction + + // Loop through all syscall blocks and convert them to libcontainer format + for _, call := range config.Syscalls { + newAction, err := seccomp.ConvertStringToAction(string(call.Action)) + if err != nil { + return nil, err + } + + newCall := configs.Syscall{ + Name: call.Name, + Action: newAction, + Args: []*configs.Arg{}, + } + + // Loop through all the arguments of the syscall and convert them + for _, arg := range call.Args { + newOp, err := seccomp.ConvertStringToOperator(string(arg.Op)) + if err != nil { + return nil, err + } + + newArg := configs.Arg{ + Index: arg.Index, + Value: arg.Value, + ValueTwo: arg.ValueTwo, + Op: newOp, + } + + newCall.Args = append(newCall.Args, &newArg) + } + + newConfig.Syscalls = append(newConfig.Syscalls, &newCall) + } + + return newConfig, nil +} + +func createHooks(rspec *specs.Spec, config *configs.Config) { + config.Hooks = &configs.Hooks{} + for _, h := range rspec.Hooks.Prestart { + cmd := createCommandHook(h) + config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.Poststart { + cmd := createCommandHook(h) + config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.Poststop { + cmd := createCommandHook(h) + config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd)) + } +} + +func createCommandHook(h specs.Hook) configs.Command { + cmd := configs.Command{ + Path: h.Path, + Args: h.Args, + Env: h.Env, + } + if h.Timeout != nil { + d := time.Duration(*h.Timeout) * time.Second + cmd.Timeout = &d + } + return cmd +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go b/vendor/src/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go new file mode 100644 index 00000000..0bbe1495 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go @@ -0,0 +1,27 @@ +package stacktrace + +import "runtime" + +// Capture captures a stacktrace for the current calling go program +// +// skip is the number of frames to skip +func Capture(userSkip int) Stacktrace { + var ( + skip = userSkip + 1 // add one for our own function + frames []Frame + prevPc uintptr + ) + for i := skip; ; i++ { + pc, file, line, ok := runtime.Caller(i) + //detect if caller is repeated to avoid loop, gccgo + //currently runs into a loop without this check + if !ok || pc == prevPc { + break + } + frames = append(frames, NewFrame(pc, file, line)) + prevPc = pc + } + return Stacktrace{ + Frames: frames, + } +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/stacktrace/frame.go b/vendor/src/github.com/opencontainers/runc/libcontainer/stacktrace/frame.go new file mode 100644 index 00000000..0d590d9a --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/stacktrace/frame.go @@ -0,0 +1,38 @@ +package stacktrace + +import ( + "path/filepath" + "runtime" + "strings" +) + +// NewFrame returns a new stack frame for the provided information +func NewFrame(pc uintptr, file string, line int) Frame { + fn := runtime.FuncForPC(pc) + if fn == nil { + return Frame{} + } + pack, name := parseFunctionName(fn.Name()) + return Frame{ + Line: line, + File: filepath.Base(file), + Package: pack, + Function: name, + } +} + +func parseFunctionName(name string) (string, string) { + i := strings.LastIndex(name, ".") + if i == -1 { + return "", name + } + return name[:i], name[i+1:] +} + +// Frame contains all the information for a stack frame within a go program +type Frame struct { + File string + Function string + Package string + Line int +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/stacktrace/stacktrace.go b/vendor/src/github.com/opencontainers/runc/libcontainer/stacktrace/stacktrace.go new file mode 100644 index 00000000..5e8b58d2 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/stacktrace/stacktrace.go @@ -0,0 +1,5 @@ +package stacktrace + +type Stacktrace struct { + Frames []Frame +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/standard_init_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/standard_init_linux.go new file mode 100644 index 00000000..b3ff6a0e --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/standard_init_linux.go @@ -0,0 +1,178 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "io" + "os" + "os/exec" + "syscall" + + "github.com/opencontainers/runc/libcontainer/apparmor" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/keys" + "github.com/opencontainers/runc/libcontainer/label" + "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/system" +) + +type linuxStandardInit struct { + pipe io.ReadWriteCloser + parentPid int + stateDirFD int + config *initConfig +} + +func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { + var newperms uint32 + + if l.config.Config.Namespaces.Contains(configs.NEWUSER) { + // with user ns we need 'other' search permissions + newperms = 0x8 + } else { + // without user ns we need 'UID' search permissions + newperms = 0x80000 + } + + // create a unique per session container name that we can + // join in setns; however, other containers can also join it + return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms +} + +// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value +// the kernel +const PR_SET_NO_NEW_PRIVS = 0x26 + +func (l *linuxStandardInit) Init() error { + if !l.config.Config.NoNewKeyring { + ringname, keepperms, newperms := l.getSessionRingParams() + + // do not inherit the parent's session keyring + sessKeyId, err := keyctl.JoinSessionKeyring(ringname) + if err != nil { + return err + } + // make session keyring searcheable + if err := keyctl.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { + return err + } + } + + var console *linuxConsole + if l.config.Console != "" { + console = newConsoleFromPath(l.config.Console) + if err := console.dupStdio(); err != nil { + return err + } + } + if console != nil { + if err := system.Setctty(); err != nil { + return err + } + } + if err := setupNetwork(l.config); err != nil { + return err + } + if err := setupRoute(l.config.Config); err != nil { + return err + } + + label.Init() + // InitializeMountNamespace() can be executed only for a new mount namespace + if l.config.Config.Namespaces.Contains(configs.NEWNS) { + if err := setupRootfs(l.config.Config, console, l.pipe); err != nil { + return err + } + } + if hostname := l.config.Config.Hostname; hostname != "" { + if err := syscall.Sethostname([]byte(hostname)); err != nil { + return err + } + } + if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { + return err + } + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + return err + } + + for key, value := range l.config.Config.Sysctl { + if err := writeSystemProperty(key, value); err != nil { + return err + } + } + for _, path := range l.config.Config.ReadonlyPaths { + if err := remountReadonly(path); err != nil { + return err + } + } + for _, path := range l.config.Config.MaskPaths { + if err := maskFile(path); err != nil { + return err + } + } + pdeath, err := system.GetParentDeathSignal() + if err != nil { + return err + } + if l.config.NoNewPrivileges { + if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { + return err + } + } + // Tell our parent that we're ready to Execv. This must be done before the + // Seccomp rules have been applied, because we need to be able to read and + // write to a socket. + if err := syncParentReady(l.pipe); err != nil { + return err + } + // Without NoNewPrivileges seccomp is a privileged operation, so we need to + // do this before dropping capabilities; otherwise do it as late as possible + // just before execve so as few syscalls take place after it as possible. + if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { + if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return err + } + } + if err := finalizeNamespace(l.config); err != nil { + return err + } + // finalizeNamespace can change user/group which clears the parent death + // signal, so we restore it here. + if err := pdeath.Restore(); err != nil { + return err + } + // compare the parent from the inital start of the init process and make sure that it did not change. + // if the parent changes that means it died and we were reparened to something else so we should + // just kill ourself and not cause problems for someone else. + if syscall.Getppid() != l.parentPid { + return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) + } + // check for the arg before waiting to make sure it exists and it is returned + // as a create time error. + name, err := exec.LookPath(l.config.Args[0]) + if err != nil { + return err + } + // close the pipe to signal that we have completed our init. + l.pipe.Close() + // wait for the fifo to be opened on the other side before + // exec'ing the users process. + fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0) + if err != nil { + return newSystemErrorWithCause(err, "openat exec fifo") + } + if _, err := syscall.Write(fd, []byte("0")); err != nil { + return newSystemErrorWithCause(err, "write 0 exec fifo") + } + if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { + if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return newSystemErrorWithCause(err, "init seccomp") + } + } + if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { + return newSystemErrorWithCause(err, "exec user process") + } + return nil +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/state_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/state_linux.go new file mode 100644 index 00000000..26628240 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/state_linux.go @@ -0,0 +1,253 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + + "github.com/Sirupsen/logrus" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" +) + +func newStateTransitionError(from, to containerState) error { + return &stateTransitionError{ + From: from.status().String(), + To: to.status().String(), + } +} + +// stateTransitionError is returned when an invalid state transition happens from one +// state to another. +type stateTransitionError struct { + From string + To string +} + +func (s *stateTransitionError) Error() string { + return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To) +} + +type containerState interface { + transition(containerState) error + destroy() error + status() Status +} + +func destroy(c *linuxContainer) error { + if !c.config.Namespaces.Contains(configs.NEWPID) { + if err := killCgroupProcesses(c.cgroupManager); err != nil { + logrus.Warn(err) + } + } + err := c.cgroupManager.Destroy() + if rerr := os.RemoveAll(c.root); err == nil { + err = rerr + } + c.initProcess = nil + if herr := runPoststopHooks(c); err == nil { + err = herr + } + c.state = &stoppedState{c: c} + return err +} + +func runPoststopHooks(c *linuxContainer) error { + if c.config.Hooks != nil { + s := configs.HookState{ + Version: c.config.Version, + ID: c.id, + Root: c.config.Rootfs, + BundlePath: utils.SearchLabels(c.config.Labels, "bundle"), + } + for _, hook := range c.config.Hooks.Poststop { + if err := hook.Run(s); err != nil { + return err + } + } + } + return nil +} + +// stoppedState represents a container is a stopped/destroyed state. +type stoppedState struct { + c *linuxContainer +} + +func (b *stoppedState) status() Status { + return Stopped +} + +func (b *stoppedState) transition(s containerState) error { + switch s.(type) { + case *runningState: + b.c.state = s + return nil + case *restoredState: + b.c.state = s + return nil + case *stoppedState: + return nil + } + return newStateTransitionError(b, s) +} + +func (b *stoppedState) destroy() error { + return destroy(b.c) +} + +// runningState represents a container that is currently running. +type runningState struct { + c *linuxContainer +} + +func (r *runningState) status() Status { + return Running +} + +func (r *runningState) transition(s containerState) error { + switch s.(type) { + case *stoppedState: + t, err := r.c.runType() + if err != nil { + return err + } + if t == Running { + return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped) + } + r.c.state = s + return nil + case *pausedState: + r.c.state = s + return nil + case *runningState: + return nil + } + return newStateTransitionError(r, s) +} + +func (r *runningState) destroy() error { + t, err := r.c.runType() + if err != nil { + return err + } + if t == Running { + return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped) + } + return destroy(r.c) +} + +type createdState struct { + c *linuxContainer +} + +func (i *createdState) status() Status { + return Created +} + +func (i *createdState) transition(s containerState) error { + switch s.(type) { + case *runningState, *pausedState, *stoppedState: + i.c.state = s + return nil + case *createdState: + return nil + } + return newStateTransitionError(i, s) +} + +func (i *createdState) destroy() error { + i.c.initProcess.signal(syscall.SIGKILL) + return destroy(i.c) +} + +// pausedState represents a container that is currently pause. It cannot be destroyed in a +// paused state and must transition back to running first. +type pausedState struct { + c *linuxContainer +} + +func (p *pausedState) status() Status { + return Paused +} + +func (p *pausedState) transition(s containerState) error { + switch s.(type) { + case *runningState, *stoppedState: + p.c.state = s + return nil + case *pausedState: + return nil + } + return newStateTransitionError(p, s) +} + +func (p *pausedState) destroy() error { + t, err := p.c.runType() + if err != nil { + return err + } + if t != Running && t != Created { + if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil { + return err + } + return destroy(p.c) + } + return newGenericError(fmt.Errorf("container is paused"), ContainerPaused) +} + +// restoredState is the same as the running state but also has accociated checkpoint +// information that maybe need destroyed when the container is stopped and destroy is called. +type restoredState struct { + imageDir string + c *linuxContainer +} + +func (r *restoredState) status() Status { + return Running +} + +func (r *restoredState) transition(s containerState) error { + switch s.(type) { + case *stoppedState: + return nil + case *runningState: + return nil + } + return newStateTransitionError(r, s) +} + +func (r *restoredState) destroy() error { + if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil { + if !os.IsNotExist(err) { + return err + } + } + return destroy(r.c) +} + +// loadedState is used whenever a container is restored, loaded, or setting additional +// processes inside and it should not be destroyed when it is exiting. +type loadedState struct { + c *linuxContainer + s Status +} + +func (n *loadedState) status() Status { + return n.s +} + +func (n *loadedState) transition(s containerState) error { + n.c.state = s + return nil +} + +func (n *loadedState) destroy() error { + if err := n.c.refreshState(); err != nil { + return err + } + return n.c.state.destroy() +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/stats.go b/vendor/src/github.com/opencontainers/runc/libcontainer/stats.go new file mode 100644 index 00000000..303e4b94 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/stats.go @@ -0,0 +1,15 @@ +package libcontainer + +type NetworkInterface struct { + // Name is the name of the network interface. + Name string + + RxBytes uint64 + RxPackets uint64 + RxErrors uint64 + RxDropped uint64 + TxBytes uint64 + TxPackets uint64 + TxErrors uint64 + TxDropped uint64 +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/stats_freebsd.go b/vendor/src/github.com/opencontainers/runc/libcontainer/stats_freebsd.go new file mode 100644 index 00000000..f8d1d689 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/stats_freebsd.go @@ -0,0 +1,5 @@ +package libcontainer + +type Stats struct { + Interfaces []*NetworkInterface +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/stats_linux.go b/vendor/src/github.com/opencontainers/runc/libcontainer/stats_linux.go new file mode 100644 index 00000000..c629dc67 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/stats_linux.go @@ -0,0 +1,8 @@ +package libcontainer + +import "github.com/opencontainers/runc/libcontainer/cgroups" + +type Stats struct { + Interfaces []*NetworkInterface + CgroupStats *cgroups.Stats +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/stats_solaris.go b/vendor/src/github.com/opencontainers/runc/libcontainer/stats_solaris.go new file mode 100644 index 00000000..da78c1c2 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/stats_solaris.go @@ -0,0 +1,7 @@ +package libcontainer + +// Solaris - TODO + +type Stats struct { + Interfaces []*NetworkInterface +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/stats_windows.go b/vendor/src/github.com/opencontainers/runc/libcontainer/stats_windows.go new file mode 100644 index 00000000..f8d1d689 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/stats_windows.go @@ -0,0 +1,5 @@ +package libcontainer + +type Stats struct { + Interfaces []*NetworkInterface +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/src/github.com/opencontainers/runc/libcontainer/utils/utils.go new file mode 100644 index 00000000..3466bfce --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -0,0 +1,121 @@ +package utils + +import ( + "crypto/rand" + "encoding/hex" + "encoding/json" + "io" + "os" + "path/filepath" + "strings" + "syscall" +) + +const ( + exitSignalOffset = 128 +) + +// GenerateRandomName returns a new name joined with a prefix. This size +// specified is used to truncate the randomly generated value +func GenerateRandomName(prefix string, size int) (string, error) { + id := make([]byte, 32) + if _, err := io.ReadFull(rand.Reader, id); err != nil { + return "", err + } + if size > 64 { + size = 64 + } + return prefix + hex.EncodeToString(id)[:size], nil +} + +// ResolveRootfs ensures that the current working directory is +// not a symlink and returns the absolute path to the rootfs +func ResolveRootfs(uncleanRootfs string) (string, error) { + rootfs, err := filepath.Abs(uncleanRootfs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} + +// ExitStatus returns the correct exit status for a process based on if it +// was signaled or exited cleanly +func ExitStatus(status syscall.WaitStatus) int { + if status.Signaled() { + return exitSignalOffset + int(status.Signal()) + } + return status.ExitStatus() +} + +// WriteJSON writes the provided struct v to w using standard json marshaling +func WriteJSON(w io.Writer, v interface{}) error { + data, err := json.Marshal(v) + if err != nil { + return err + } + _, err = w.Write(data) + return err +} + +// CleanPath makes a path safe for use with filepath.Join. This is done by not +// only cleaning the path, but also (if the path is relative) adding a leading +// '/' and cleaning it (then removing the leading '/'). This ensures that a +// path resulting from prepending another path will always resolve to lexically +// be a subdirectory of the prefixed path. This is all done lexically, so paths +// that include symlinks won't be safe as a result of using CleanPath. +func CleanPath(path string) string { + // Deal with empty strings nicely. + if path == "" { + return "" + } + + // Ensure that all paths are cleaned (especially problematic ones like + // "/../../../../../" which can cause lots of issues). + path = filepath.Clean(path) + + // If the path isn't absolute, we need to do more processing to fix paths + // such as "../../../..//some/path". We also shouldn't convert absolute + // paths to relative ones. + if !filepath.IsAbs(path) { + path = filepath.Clean(string(os.PathSeparator) + path) + // This can't fail, as (by definition) all paths are relative to root. + path, _ = filepath.Rel(string(os.PathSeparator), path) + } + + // Clean the path again for good measure. + return filepath.Clean(path) +} + +// SearchLabels searches a list of key-value pairs for the provided key and +// returns the corresponding value. The pairs must be separated with '='. +func SearchLabels(labels []string, query string) string { + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == query { + return parts[1] + } + } + return "" +} + +// Annotations returns the bundle path and user defined annotations from the +// libcontianer state. We need to remove the bundle because that is a label +// added by libcontainer. +func Annotations(labels []string) (bundle string, userAnnotations map[string]string) { + userAnnotations = make(map[string]string) + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == "bundle" { + bundle = parts[1] + } else { + userAnnotations[parts[0]] = parts[1] + } + } + return +} diff --git a/vendor/src/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/src/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go new file mode 100644 index 00000000..408918f2 --- /dev/null +++ b/vendor/src/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go @@ -0,0 +1,33 @@ +// +build !windows + +package utils + +import ( + "io/ioutil" + "strconv" + "syscall" +) + +func CloseExecFrom(minFd int) error { + fdList, err := ioutil.ReadDir("/proc/self/fd") + if err != nil { + return err + } + for _, fi := range fdList { + fd, err := strconv.Atoi(fi.Name()) + if err != nil { + // ignore non-numeric file names + continue + } + + if fd < minFd { + // ignore descriptors lower than our specified minimum + continue + } + + // intentionally ignore errors from syscall.CloseOnExec + syscall.CloseOnExec(fd) + // the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall) + } + return nil +}