From 271cac86348fe376a8bd9d365c0495dc1763edd6 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 24 Jan 2017 14:57:11 -0800 Subject: [PATCH] Update runc to ce450bcc6c135cae93ee2a99d41a308c179ff6dc Signed-off-by: Michael Crosby --- vendor.conf | 4 +- .../runc/libcontainer/nsenter/namespace.h | 32 + .../runc/libcontainer/nsenter/nsenter.go | 12 + .../libcontainer/nsenter/nsenter_gccgo.go | 25 + .../nsenter/nsenter_unsupported.go | 5 + .../runc/libcontainer/nsenter/nsexec.c | 759 ++++++++++++++++++ .../runc/libcontainer/system/proc.go | 20 +- .../runc/libcontainer/user/user.go | 2 +- .../runc/libcontainer/utils/cmsg.c | 148 ++++ .../runc/libcontainer/utils/cmsg.go | 57 ++ .../runc/libcontainer/utils/cmsg.h | 36 + .../runc/libcontainer/utils/utils.go | 126 +++ .../runc/libcontainer/utils/utils_unix.go | 33 + 13 files changed, 1254 insertions(+), 5 deletions(-) create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.c create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.h create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go diff --git a/vendor.conf b/vendor.conf index aadfa63..02bd0a2 100644 --- a/vendor.conf +++ b/vendor.conf @@ -1,5 +1,5 @@ # go-runc client for runc; master as of 01/20/2017 -github.com/crosbymichael/go-runc afca56d262e694d9056e937a0877a39ab879aeb4 +github.com/crosbymichael/go-runc 7b66c5da30493c5eb9c655cab67ba88071891ac5 # go-metrics client to prometheus; master as of 12/16/2016 github.com/docker/go-metrics 0f35294225552d968a13f9c5bc71a3fa44b2eb87 # prometheus client; latest release as of 12/16/2016 @@ -31,7 +31,7 @@ github.com/nats-io/go-nats-streaming v0.3.4 # gnatsd; latest release as of 12/16/2016 github.com/nats-io/gnatsd v0.9.6 # runc, latest release as of 12/16/2016 -github.com/opencontainers/runc v1.0.0-rc2 +github.com/opencontainers/runc ce450bcc6c135cae93ee2a99d41a308c179ff6dc # OCI runtime spec, latest release as of 12/16/2016 github.com/opencontainers/runtime-spec v1.0.0-rc3 # logrus, latest release as of 12/16/2016 diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h new file mode 100644 index 0000000..9e9bdca --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h @@ -0,0 +1,32 @@ +#ifndef NSENTER_NAMESPACE_H +#define NSENTER_NAMESPACE_H + +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif +#include + +/* All of these are taken from include/uapi/linux/sched.h */ +#ifndef CLONE_NEWNS +# define CLONE_NEWNS 0x00020000 /* New mount namespace group */ +#endif +#ifndef CLONE_NEWCGROUP +# define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ +#endif +#ifndef CLONE_NEWUTS +# define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ +#endif +#ifndef CLONE_NEWIPC +# define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ +#endif +#ifndef CLONE_NEWUSER +# define CLONE_NEWUSER 0x10000000 /* New user namespace */ +#endif +#ifndef CLONE_NEWPID +# define CLONE_NEWPID 0x20000000 /* New pid namespace */ +#endif +#ifndef CLONE_NEWNET +# define CLONE_NEWNET 0x40000000 /* New network namespace */ +#endif + +#endif /* NSENTER_NAMESPACE_H */ diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go new file mode 100644 index 0000000..07f4d63 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go @@ -0,0 +1,12 @@ +// +build linux,!gccgo + +package nsenter + +/* +#cgo CFLAGS: -Wall +extern void nsexec(); +void __attribute__((constructor)) init(void) { + nsexec(); +} +*/ +import "C" diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go new file mode 100644 index 0000000..63c7a3e --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go @@ -0,0 +1,25 @@ +// +build linux,gccgo + +package nsenter + +/* +#cgo CFLAGS: -Wall +extern void nsexec(); +void __attribute__((constructor)) init(void) { + nsexec(); +} +*/ +import "C" + +// AlwaysFalse is here to stay false +// (and be exported so the compiler doesn't optimize out its reference) +var AlwaysFalse bool + +func init() { + if AlwaysFalse { + // by referencing this C init() in a noop test, it will ensure the compiler + // links in the C function. + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134 + C.init() + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go new file mode 100644 index 0000000..ac701ca --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go @@ -0,0 +1,5 @@ +// +build !linux !cgo + +package nsenter + +import "C" diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c new file mode 100644 index 0000000..7d15aeb --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c @@ -0,0 +1,759 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +/* Get all of the CLONE_NEW* flags. */ +#include "namespace.h" + +/* Synchronisation values. */ +enum sync_t { + SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ + SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ + SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ + SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ + SYNC_CHILD_READY = 0x44, /* The grandchild is ready to return. */ + + /* XXX: This doesn't help with segfaults and other such issues. */ + SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ +}; + +/* longjmp() arguments. */ +#define JUMP_PARENT 0x00 +#define JUMP_CHILD 0xA0 +#define JUMP_INIT 0xA1 + +/* JSON buffer. */ +#define JSON_MAX 4096 + +/* Assume the stack grows down, so arguments should be above it. */ +struct clone_t { + /* + * Reserve some space for clone() to locate arguments + * and retcode in this place + */ + char stack[4096] __attribute__ ((aligned(16))); + char stack_ptr[0]; + + /* There's two children. This is used to execute the different code. */ + jmp_buf *env; + int jmpval; +}; + +struct nlconfig_t { + char *data; + uint32_t cloneflags; + char *uidmap; + size_t uidmap_len; + char *gidmap; + size_t gidmap_len; + char *namespaces; + size_t namespaces_len; + uint8_t is_setgroup; +}; + +/* + * List of netlink message types sent to us as part of bootstrapping the init. + * These constants are defined in libcontainer/message_linux.go. + */ +#define INIT_MSG 62000 +#define CLONE_FLAGS_ATTR 27281 +#define NS_PATHS_ATTR 27282 +#define UIDMAP_ATTR 27283 +#define GIDMAP_ATTR 27284 +#define SETGROUP_ATTR 27285 + +/* + * Use the raw syscall for versions of glibc which don't include a function for + * it, namely (glibc 2.12). + */ +#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 +# define _GNU_SOURCE +# include "syscall.h" +# if !defined(SYS_setns) && defined(__NR_setns) +# define SYS_setns __NR_setns +# endif + +#ifndef SYS_setns +# error "setns(2) syscall not supported by glibc version" +#endif + +int setns(int fd, int nstype) +{ + return syscall(SYS_setns, fd, nstype); +} +#endif + +/* XXX: This is ugly. */ +static int syncfd = -1; + +/* TODO(cyphar): Fix this so it correctly deals with syncT. */ +#define bail(fmt, ...) \ + do { \ + int ret = __COUNTER__ + 1; \ + fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \ + if (syncfd >= 0) { \ + enum sync_t s = SYNC_ERR; \ + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) \ + fprintf(stderr, "nsenter: failed: write(s)"); \ + if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret)) \ + fprintf(stderr, "nsenter: failed: write(ret)"); \ + } \ + exit(ret); \ + } while(0) + +static int write_file(char *data, size_t data_len, char *pathfmt, ...) +{ + int fd, len, ret = 0; + char path[PATH_MAX]; + + va_list ap; + va_start(ap, pathfmt); + len = vsnprintf(path, PATH_MAX, pathfmt, ap); + va_end(ap); + if (len < 0) + return -1; + + fd = open(path, O_RDWR); + if (fd < 0) { + ret = -1; + goto out; + } + + len = write(fd, data, data_len); + if (len != data_len) { + ret = -1; + goto out; + } + +out: + close(fd); + return ret; +} + +enum policy_t { + SETGROUPS_DEFAULT = 0, + SETGROUPS_ALLOW, + SETGROUPS_DENY, +}; + +/* This *must* be called before we touch gid_map. */ +static void update_setgroups(int pid, enum policy_t setgroup) +{ + char *policy; + + switch (setgroup) { + case SETGROUPS_ALLOW: + policy = "allow"; + break; + case SETGROUPS_DENY: + policy = "deny"; + break; + case SETGROUPS_DEFAULT: + /* Nothing to do. */ + return; + } + + if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { + /* + * If the kernel is too old to support /proc/pid/setgroups, + * open(2) or write(2) will return ENOENT. This is fine. + */ + if (errno != ENOENT) + bail("failed to write '%s' to /proc/%d/setgroups", policy, pid); + } +} + +static void update_uidmap(int pid, char *map, int map_len) +{ + if (map == NULL || map_len <= 0) + return; + + if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) + bail("failed to update /proc/%d/uid_map", pid); +} + +static void update_gidmap(int pid, char *map, int map_len) +{ + if (map == NULL || map_len <= 0) + return; + + if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) + bail("failed to update /proc/%d/gid_map", pid); +} + +/* A dummy function that just jumps to the given jumpval. */ +static int child_func(void *arg) __attribute__ ((noinline)); +static int child_func(void *arg) +{ + struct clone_t *ca = (struct clone_t *)arg; + longjmp(*ca->env, ca->jmpval); +} + +static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); +static int clone_parent(jmp_buf *env, int jmpval) +{ + struct clone_t ca = { + .env = env, + .jmpval = jmpval, + }; + + return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); +} + +/* + * Gets the init pipe fd from the environment, which is used to read the + * bootstrap data and tell the parent what the new pid is after we finish + * setting up the environment. + */ +static int initpipe(void) +{ + int pipenum; + char *initpipe, *endptr; + + initpipe = getenv("_LIBCONTAINER_INITPIPE"); + if (initpipe == NULL || *initpipe == '\0') + return -1; + + pipenum = strtol(initpipe, &endptr, 10); + if (*endptr != '\0') + bail("unable to parse _LIBCONTAINER_INITPIPE"); + + return pipenum; +} + +/* Returns the clone(2) flag for a namespace, given the name of a namespace. */ +static int nsflag(char *name) +{ + if (!strcmp(name, "cgroup")) + return CLONE_NEWCGROUP; + else if (!strcmp(name, "ipc")) + return CLONE_NEWIPC; + else if (!strcmp(name, "mnt")) + return CLONE_NEWNS; + else if (!strcmp(name, "net")) + return CLONE_NEWNET; + else if (!strcmp(name, "pid")) + return CLONE_NEWPID; + else if (!strcmp(name, "user")) + return CLONE_NEWUSER; + else if (!strcmp(name, "uts")) + return CLONE_NEWUTS; + + /* If we don't recognise a name, fallback to 0. */ + return 0; +} + +static uint32_t readint32(char *buf) +{ + return *(uint32_t *) buf; +} + +static uint8_t readint8(char *buf) +{ + return *(uint8_t *) buf; +} + +static void nl_parse(int fd, struct nlconfig_t *config) +{ + size_t len, size; + struct nlmsghdr hdr; + char *data, *current; + + /* Retrieve the netlink header. */ + len = read(fd, &hdr, NLMSG_HDRLEN); + if (len != NLMSG_HDRLEN) + bail("invalid netlink header length %lu", len); + + if (hdr.nlmsg_type == NLMSG_ERROR) + bail("failed to read netlink message"); + + if (hdr.nlmsg_type != INIT_MSG) + bail("unexpected msg type %d", hdr.nlmsg_type); + + /* Retrieve data. */ + size = NLMSG_PAYLOAD(&hdr, 0); + current = data = malloc(size); + if (!data) + bail("failed to allocate %zu bytes of memory for nl_payload", size); + + len = read(fd, data, size); + if (len != size) + bail("failed to read netlink payload, %lu != %lu", len, size); + + /* Parse the netlink payload. */ + config->data = data; + while (current < data + size) { + struct nlattr *nlattr = (struct nlattr *)current; + size_t payload_len = nlattr->nla_len - NLA_HDRLEN; + + /* Advance to payload. */ + current += NLA_HDRLEN; + + /* Handle payload. */ + switch (nlattr->nla_type) { + case CLONE_FLAGS_ATTR: + config->cloneflags = readint32(current); + break; + case NS_PATHS_ATTR: + config->namespaces = current; + config->namespaces_len = payload_len; + break; + case UIDMAP_ATTR: + config->uidmap = current; + config->uidmap_len = payload_len; + break; + case GIDMAP_ATTR: + config->gidmap = current; + config->gidmap_len = payload_len; + break; + case SETGROUP_ATTR: + config->is_setgroup = readint8(current); + break; + default: + bail("unknown netlink message type %d", nlattr->nla_type); + } + + current += NLA_ALIGN(payload_len); + } +} + +void nl_free(struct nlconfig_t *config) +{ + free(config->data); +} + +void join_namespaces(char *nslist) +{ + int num = 0, i; + char *saveptr = NULL; + char *namespace = strtok_r(nslist, ",", &saveptr); + struct namespace_t { + int fd; + int ns; + char type[PATH_MAX]; + char path[PATH_MAX]; + } *namespaces = NULL; + + if (!namespace || !strlen(namespace) || !strlen(nslist)) + bail("ns paths are empty"); + + /* + * We have to open the file descriptors first, since after + * we join the mnt namespace we might no longer be able to + * access the paths. + */ + do { + int fd; + char *path; + struct namespace_t *ns; + + /* Resize the namespace array. */ + namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t)); + if (!namespaces) + bail("failed to reallocate namespace array"); + ns = &namespaces[num - 1]; + + /* Split 'ns:path'. */ + path = strstr(namespace, ":"); + if (!path) + bail("failed to parse %s", namespace); + *path++ = '\0'; + + fd = open(path, O_RDONLY); + if (fd < 0) + bail("failed to open %s", path); + + ns->fd = fd; + ns->ns = nsflag(namespace); + strncpy(ns->path, path, PATH_MAX); + } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); + + /* + * The ordering in which we join namespaces is important. We should + * always join the user namespace *first*. This is all guaranteed + * from the container_linux.go side of this, so we're just going to + * follow the order given to us. + */ + + for (i = 0; i < num; i++) { + struct namespace_t ns = namespaces[i]; + + if (setns(ns.fd, ns.ns) < 0) + bail("failed to setns to %s", ns.path); + + close(ns.fd); + } + + free(namespaces); +} + +void nsexec(void) +{ + int pipenum; + jmp_buf env; + int syncpipe[2]; + struct nlconfig_t config = {0}; + + /* + * If we don't have an init pipe, just return to the go routine. + * We'll only get an init pipe for start or exec. + */ + pipenum = initpipe(); + if (pipenum == -1) + return; + + /* make the process non-dumpable */ + if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) != 0) { + bail("failed to set process as non-dumpable"); + } + + /* Parse all of the netlink configuration. */ + nl_parse(pipenum, &config); + + /* Pipe so we can tell the child when we've finished setting up. */ + if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0) + bail("failed to setup sync pipe between parent and child"); + + /* TODO: Currently we aren't dealing with child deaths properly. */ + + /* + * Okay, so this is quite annoying. + * + * In order for this unsharing code to be more extensible we need to split + * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case + * would be if we did clone(CLONE_NEWUSER) and the other namespaces + * separately, but because of SELinux issues we cannot really do that. But + * we cannot just dump the namespace flags into clone(...) because several + * usecases (such as rootless containers) require more granularity around + * the namespace setup. In addition, some older kernels had issues where + * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot + * handle this while also dealing with SELinux so we choose SELinux support + * over broken kernel support). + * + * However, if we unshare(2) the user namespace *before* we clone(2), then + * all hell breaks loose. + * + * The parent no longer has permissions to do many things (unshare(2) drops + * all capabilities in your old namespace), and the container cannot be set + * up to have more than one {uid,gid} mapping. This is obviously less than + * ideal. In order to fix this, we have to first clone(2) and then unshare. + * + * Unfortunately, it's not as simple as that. We have to fork to enter the + * PID namespace (the PID namespace only applies to children). Since we'll + * have to double-fork, this clone_parent() call won't be able to get the + * PID of the _actual_ init process (without doing more synchronisation than + * I can deal with at the moment). So we'll just get the parent to send it + * for us, the only job of this process is to update + * /proc/pid/{setgroups,uid_map,gid_map}. + * + * And as a result of the above, we also need to setns(2) in the first child + * because if we join a PID namespace in the topmost parent then our child + * will be in that namespace (and it will not be able to give us a PID value + * that makes sense without resorting to sending things with cmsg). + * + * This also deals with an older issue caused by dumping cloneflags into + * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so + * we have to unshare(2) before clone(2) in order to do this. This was fixed + * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was + * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're + * aware, the last mainline kernel which had this bug was Linux 3.12. + * However, we cannot comment on which kernels the broken patch was + * backported to. + * + * -- Aleksa "what has my life come to?" Sarai + */ + + switch (setjmp(env)) { + /* + * Stage 0: We're in the parent. Our job is just to create a new child + * (stage 1: JUMP_CHILD) process and write its uid_map and + * gid_map. That process will go on to create a new process, then + * it will send us its PID which we will send to the bootstrap + * process. + */ + case JUMP_PARENT: { + int len, ready = 0; + pid_t child; + char buf[JSON_MAX]; + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0); + + /* Start the process of getting a container. */ + child = clone_parent(&env, JUMP_CHILD); + if (child < 0) + bail("unable to fork: child_func"); + + /* + * State machine for synchronisation with the children. + * + * Father only return when both child and grandchild are + * ready, so we can receive all possible error codes + * generated by children. + */ + while (ready < 2) { + enum sync_t s; + + /* This doesn't need to be global, we're in the parent. */ + int syncfd = syncpipe[1]; + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with child: next state"); + + switch (s) { + case SYNC_ERR: { + /* We have to mirror the error code of the child. */ + int ret; + + if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret)) + bail("failed to sync with child: read(error code)"); + + exit(ret); + } + break; + case SYNC_USERMAP_PLS: + /* Enable setgroups(2) if we've been asked to. */ + if (config.is_setgroup) + update_setgroups(child, SETGROUPS_ALLOW); + + /* Set up mappings. */ + update_uidmap(child, config.uidmap, config.uidmap_len); + update_gidmap(child, config.gidmap, config.gidmap_len); + + s = SYNC_USERMAP_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); + } + break; + case SYNC_USERMAP_ACK: + /* We should _never_ receive acks. */ + kill(child, SIGKILL); + bail("failed to sync with child: unexpected SYNC_USERMAP_ACK"); + break; + case SYNC_RECVPID_PLS: { + pid_t old = child; + + /* Get the init_func pid. */ + if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { + kill(old, SIGKILL); + bail("failed to sync with child: read(childpid)"); + } + + /* Send ACK. */ + s = SYNC_RECVPID_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(old, SIGKILL); + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); + } + } + + ready++; + break; + case SYNC_RECVPID_ACK: + /* We should _never_ receive acks. */ + kill(child, SIGKILL); + bail("failed to sync with child: unexpected SYNC_RECVPID_ACK"); + break; + case SYNC_CHILD_READY: + ready++; + break; + default: + bail("unexpected sync value"); + break; + } + } + + /* Send the init_func pid back to our parent. */ + len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child); + if (len < 0) { + kill(child, SIGKILL); + bail("unable to generate JSON for child pid"); + } + if (write(pipenum, buf, len) != len) { + kill(child, SIGKILL); + bail("unable to send child pid to bootstrapper"); + } + + exit(0); + } + + /* + * Stage 1: We're in the first child process. Our job is to join any + * provided namespaces in the netlink payload and unshare all + * of the requested namespaces. If we've been asked to + * CLONE_NEWUSER, we will ask our parent (stage 0) to set up + * our user mappings for us. Then, we create a new child + * (stage 2: JUMP_INIT) for PID namespace. We then send the + * child's PID to our parent (stage 0). + */ + case JUMP_CHILD: { + pid_t child; + enum sync_t s; + + /* We're in a child and thus need to tell the parent if we die. */ + syncfd = syncpipe[0]; + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0); + + /* + * We need to setns first. We cannot do this earlier (in stage 0) + * because of the fact that we forked to get here (the PID of + * [stage 2: JUMP_INIT]) would be meaningless). We could send it + * using cmsg(3) but that's just annoying. + */ + if (config.namespaces) + join_namespaces(config.namespaces); + + /* + * Unshare all of the namespaces. Now, it should be noted that this + * ordering might break in the future (especially with rootless + * containers). But for now, it's not possible to split this into + * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. + * + * Note that we don't merge this with clone() because there were + * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) + * was broken, so we'll just do it the long way anyway. + */ + if (unshare(config.cloneflags) < 0) + bail("failed to unshare namespaces"); + + /* + * Deal with user namespaces first. They are quite special, as they + * affect our ability to unshare other namespaces and are used as + * context for privilege checks. + */ + if (config.cloneflags & CLONE_NEWUSER) { + /* + * We don't have the privileges to do any mapping here (see the + * clone_parent rant). So signal our parent to hook us up. + */ + + s = SYNC_USERMAP_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); + + /* ... wait for mapping ... */ + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); + if (s != SYNC_USERMAP_ACK) + bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); + } + + /* + * TODO: What about non-namespace clone flags that we're dropping here? + * + * We fork again because of PID namespace, setns(2) or unshare(2) don't + * change the PID namespace of the calling process, because doing so + * would change the caller's idea of its own PID (as reported by getpid()), + * which would break many applications and libraries, so we must fork + * to actually enter the new PID namespace. + */ + child = clone_parent(&env, JUMP_INIT); + if (child < 0) + bail("unable to fork: init_func"); + + /* Send the child to our parent, which knows what it's doing. */ + s = SYNC_RECVPID_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); + } + if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(childpid)"); + } + + /* ... wait for parent to get the pid ... */ + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); + } + if (s != SYNC_RECVPID_ACK) { + kill(child, SIGKILL); + bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); + } + + /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ + exit(0); + } + + /* + * Stage 2: We're the final child process, and the only process that will + * actually return to the Go runtime. Our job is to just do the + * final cleanup steps and then return to the Go runtime to allow + * init_linux.go to run. + */ + case JUMP_INIT: { + /* + * We're inside the child now, having jumped from the + * start_child() code after forking in the parent. + */ + enum sync_t s; + + /* We're in a child and thus need to tell the parent if we die. */ + syncfd = syncpipe[0]; + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0); + + if (setsid() < 0) + bail("setsid failed"); + + if (setuid(0) < 0) + bail("setuid failed"); + + if (setgid(0) < 0) + bail("setgid failed"); + + if (setgroups(0, NULL) < 0) + bail("setgroups failed"); + + s = SYNC_CHILD_READY; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with patent: write(SYNC_CHILD_READY)"); + + /* Close sync pipes. */ + close(syncpipe[0]); + close(syncpipe[1]); + + /* Free netlink data. */ + nl_free(&config); + + /* Finish executing, let the Go runtime take over. */ + return; + } + default: + bail("unexpected jump value"); + break; + } + + /* Should never be reached. */ + bail("should never be reached"); +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go index 37808a2..a0e9637 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go @@ -14,8 +14,10 @@ func GetProcessStartTime(pid int) (string, error) { if err != nil { return "", err } + return parseStartTime(string(data)) +} - parts := strings.Split(string(data), " ") +func parseStartTime(stat string) (string, error) { // the starttime is located at pos 22 // from the man page // @@ -23,5 +25,19 @@ func GetProcessStartTime(pid int) (string, error) { // (22) The time the process started after system boot. In kernels before Linux 2.6, this // value was expressed in jiffies. Since Linux 2.6, the value is expressed in clock ticks // (divide by sysconf(_SC_CLK_TCK)). - return parts[22-1], nil // starts at 1 + // + // NOTE: + // pos 2 could contain space and is inside `(` and `)`: + // (2) comm %s + // The filename of the executable, in parentheses. + // This is visible whether or not the executable is + // swapped out. + // + // the following is an example: + // 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0 + + // get parts after last `)`: + s := strings.Split(stat, ")") + parts := strings.Split(strings.TrimSpace(s[len(s)-1]), " ") + return parts[22-3], nil // starts at 3 (after the filename pos `2`) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go index 43fd39e..f258d62 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go @@ -343,7 +343,7 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) ( if len(groups) > 0 { // First match wins, even if there's more than one matching entry. user.Gid = groups[0].Gid - } else if groupArg != "" { + } else { // If we can't find a group with the given name, the only other valid // option is if it's a numeric group name with no associated entry in group. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.c b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.c new file mode 100644 index 0000000..e77ca69 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.c @@ -0,0 +1,148 @@ +/* + * Copyright 2016 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cmsg.h" + +#define error(fmt, ...) \ + ({ \ + fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \ + errno = ECOMM; \ + goto err; /* return value */ \ + }) + +/* + * Sends a file descriptor along the sockfd provided. Returns the return + * value of sendmsg(2). Any synchronisation and preparation of state + * should be done external to this (we expect the other side to be in + * recvfd() in the code). + */ +ssize_t sendfd(int sockfd, struct file_t file) +{ + struct msghdr msg = {0}; + struct iovec iov[1] = {0}; + struct cmsghdr *cmsg; + int *fdptr; + int ret; + + union { + char buf[CMSG_SPACE(sizeof(file.fd))]; + struct cmsghdr align; + } u; + + /* + * We need to send some other data along with the ancillary data, + * otherwise the other side won't recieve any data. This is very + * well-hidden in the documentation (and only applies to + * SOCK_STREAM). See the bottom part of unix(7). + */ + iov[0].iov_base = file.name; + iov[0].iov_len = strlen(file.name) + 1; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_control = u.buf; + msg.msg_controllen = sizeof(u.buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + + fdptr = (int *) CMSG_DATA(cmsg); + memcpy(fdptr, &file.fd, sizeof(int)); + + return sendmsg(sockfd, &msg, 0); +} + +/* + * Receives a file descriptor from the sockfd provided. Returns the file + * descriptor as sent from sendfd(). It will return the file descriptor + * or die (literally) trying. Any synchronisation and preparation of + * state should be done external to this (we expect the other side to be + * in sendfd() in the code). + */ +struct file_t recvfd(int sockfd) +{ + struct msghdr msg = {0}; + struct iovec iov[1] = {0}; + struct cmsghdr *cmsg; + struct file_t file = {0}; + int *fdptr; + int olderrno; + + union { + char buf[CMSG_SPACE(sizeof(file.fd))]; + struct cmsghdr align; + } u; + + /* Allocate a buffer. */ + /* TODO: Make this dynamic with MSG_PEEK. */ + file.name = malloc(TAG_BUFFER); + if (!file.name) + error("recvfd: failed to allocate file.tag buffer\n"); + + /* + * We need to "recieve" the non-ancillary data even though we don't + * plan to use it at all. Otherwise, things won't work as expected. + * See unix(7) and other well-hidden documentation. + */ + iov[0].iov_base = file.name; + iov[0].iov_len = TAG_BUFFER; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_control = u.buf; + msg.msg_controllen = sizeof(u.buf); + + ssize_t ret = recvmsg(sockfd, &msg, 0); + if (ret < 0) + goto err; + + cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) + error("recvfd: got NULL from CMSG_FIRSTHDR"); + if (cmsg->cmsg_level != SOL_SOCKET) + error("recvfd: expected SOL_SOCKET in cmsg: %d", cmsg->cmsg_level); + if (cmsg->cmsg_type != SCM_RIGHTS) + error("recvfd: expected SCM_RIGHTS in cmsg: %d", cmsg->cmsg_type); + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + error("recvfd: expected correct CMSG_LEN in cmsg: %lu", cmsg->cmsg_len); + + fdptr = (int *) CMSG_DATA(cmsg); + if (!fdptr || *fdptr < 0) + error("recvfd: recieved invalid pointer"); + + file.fd = *fdptr; + return file; + +err: + olderrno = errno; + free(file.name); + errno = olderrno; + return (struct file_t){0}; +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go new file mode 100644 index 0000000..ee89374 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go @@ -0,0 +1,57 @@ +// +build linux + +package utils + +/* + * Copyright 2016 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +#include +#include +#include "cmsg.h" +*/ +import "C" + +import ( + "os" + "unsafe" +) + +// RecvFd waits for a file descriptor to be sent over the given AF_UNIX +// socket. The file name of the remote file descriptor will be recreated +// locally (it is sent as non-auxiliary data in the same payload). +func RecvFd(socket *os.File) (*os.File, error) { + file, err := C.recvfd(C.int(socket.Fd())) + if err != nil { + return nil, err + } + defer C.free(unsafe.Pointer(file.name)) + return os.NewFile(uintptr(file.fd), C.GoString(file.name)), nil +} + +// SendFd sends a file descriptor over the given AF_UNIX socket. In +// addition, the file.Name() of the given file will also be sent as +// non-auxiliary data in the same payload (allowing to send contextual +// information for a file descriptor). +func SendFd(socket, file *os.File) error { + var cfile C.struct_file_t + cfile.fd = C.int(file.Fd()) + cfile.name = C.CString(file.Name()) + defer C.free(unsafe.Pointer(cfile.name)) + + _, err := C.sendfd(C.int(socket.Fd()), cfile) + return err +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.h b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.h new file mode 100644 index 0000000..3fe7642 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.h @@ -0,0 +1,36 @@ +/* + * Copyright 2016 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#if !defined(CMSG_H) +#define CMSG_H + +#include + +/* TODO: Implement this properly with MSG_PEEK. */ +#define TAG_BUFFER 4096 + +/* This mirrors Go's (*os.File). */ +struct file_t { + char *name; + int fd; +}; + +struct file_t recvfd(int sockfd); +ssize_t sendfd(int sockfd, struct file_t file); + +#endif /* !defined(CMSG_H) */ diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go new file mode 100644 index 0000000..2b35b9a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -0,0 +1,126 @@ +package utils + +import ( + "crypto/rand" + "encoding/hex" + "encoding/json" + "io" + "os" + "path/filepath" + "strings" + "syscall" + "unsafe" +) + +const ( + exitSignalOffset = 128 +) + +// GenerateRandomName returns a new name joined with a prefix. This size +// specified is used to truncate the randomly generated value +func GenerateRandomName(prefix string, size int) (string, error) { + id := make([]byte, 32) + if _, err := io.ReadFull(rand.Reader, id); err != nil { + return "", err + } + if size > 64 { + size = 64 + } + return prefix + hex.EncodeToString(id)[:size], nil +} + +// ResolveRootfs ensures that the current working directory is +// not a symlink and returns the absolute path to the rootfs +func ResolveRootfs(uncleanRootfs string) (string, error) { + rootfs, err := filepath.Abs(uncleanRootfs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} + +// ExitStatus returns the correct exit status for a process based on if it +// was signaled or exited cleanly +func ExitStatus(status syscall.WaitStatus) int { + if status.Signaled() { + return exitSignalOffset + int(status.Signal()) + } + return status.ExitStatus() +} + +// WriteJSON writes the provided struct v to w using standard json marshaling +func WriteJSON(w io.Writer, v interface{}) error { + data, err := json.Marshal(v) + if err != nil { + return err + } + _, err = w.Write(data) + return err +} + +// CleanPath makes a path safe for use with filepath.Join. This is done by not +// only cleaning the path, but also (if the path is relative) adding a leading +// '/' and cleaning it (then removing the leading '/'). This ensures that a +// path resulting from prepending another path will always resolve to lexically +// be a subdirectory of the prefixed path. This is all done lexically, so paths +// that include symlinks won't be safe as a result of using CleanPath. +func CleanPath(path string) string { + // Deal with empty strings nicely. + if path == "" { + return "" + } + + // Ensure that all paths are cleaned (especially problematic ones like + // "/../../../../../" which can cause lots of issues). + path = filepath.Clean(path) + + // If the path isn't absolute, we need to do more processing to fix paths + // such as "../../../..//some/path". We also shouldn't convert absolute + // paths to relative ones. + if !filepath.IsAbs(path) { + path = filepath.Clean(string(os.PathSeparator) + path) + // This can't fail, as (by definition) all paths are relative to root. + path, _ = filepath.Rel(string(os.PathSeparator), path) + } + + // Clean the path again for good measure. + return filepath.Clean(path) +} + +// SearchLabels searches a list of key-value pairs for the provided key and +// returns the corresponding value. The pairs must be separated with '='. +func SearchLabels(labels []string, query string) string { + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == query { + return parts[1] + } + } + return "" +} + +// Annotations returns the bundle path and user defined annotations from the +// libcontainer state. We need to remove the bundle because that is a label +// added by libcontainer. +func Annotations(labels []string) (bundle string, userAnnotations map[string]string) { + userAnnotations = make(map[string]string) + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == "bundle" { + bundle = parts[1] + } else { + userAnnotations[parts[0]] = parts[1] + } + } + return +} + +func GetIntSize() int { + return int(unsafe.Sizeof(1)) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go new file mode 100644 index 0000000..408918f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go @@ -0,0 +1,33 @@ +// +build !windows + +package utils + +import ( + "io/ioutil" + "strconv" + "syscall" +) + +func CloseExecFrom(minFd int) error { + fdList, err := ioutil.ReadDir("/proc/self/fd") + if err != nil { + return err + } + for _, fi := range fdList { + fd, err := strconv.Atoi(fi.Name()) + if err != nil { + // ignore non-numeric file names + continue + } + + if fd < minFd { + // ignore descriptors lower than our specified minimum + continue + } + + // intentionally ignore errors from syscall.CloseOnExec + syscall.CloseOnExec(fd) + // the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall) + } + return nil +}