Update runc to ce450bcc6c135cae93ee2a99d41a308c179ff6dc

Signed-off-by: Michael Crosby <crosbymichael@gmail.com>
2017-01-24 14:57:11 -08:00 · 2017-01-24 14:57:11 -08:00 · 271cac8634
commit 271cac8634
parent e09b0b0c35
13 changed files with 1254 additions and 5 deletions
--- a/vendor.conf
+++ b/vendor.conf
@ -1,5 +1,5 @@
 # go-runc client for runc; master as of 01/20/2017
-github.com/crosbymichael/go-runc afca56d262e694d9056e937a0877a39ab879aeb4
+github.com/crosbymichael/go-runc 7b66c5da30493c5eb9c655cab67ba88071891ac5
 # go-metrics client to prometheus; master as of 12/16/2016
 github.com/docker/go-metrics 0f35294225552d968a13f9c5bc71a3fa44b2eb87
 # prometheus client; latest release as of 12/16/2016
@ -31,7 +31,7 @@ github.com/nats-io/go-nats-streaming v0.3.4
 # gnatsd; latest release as of 12/16/2016
 github.com/nats-io/gnatsd v0.9.6
 # runc, latest release as of 12/16/2016
-github.com/opencontainers/runc v1.0.0-rc2
+github.com/opencontainers/runc ce450bcc6c135cae93ee2a99d41a308c179ff6dc
 # OCI runtime spec, latest release as of 12/16/2016
 github.com/opencontainers/runtime-spec v1.0.0-rc3
 # logrus, latest release as of 12/16/2016
--- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h
+++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h
@ -0,0 +1,32 @@
 #ifndef NSENTER_NAMESPACE_H
 #define NSENTER_NAMESPACE_H
 #ifndef _GNU_SOURCE
 #	define _GNU_SOURCE
 #endif
 #include <sched.h>
 /* All of these are taken from include/uapi/linux/sched.h */
 #ifndef CLONE_NEWNS
 #	define CLONE_NEWNS 0x00020000 /* New mount namespace group */
 #endif
 #ifndef CLONE_NEWCGROUP
 #	define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
 #endif
 #ifndef CLONE_NEWUTS
 #	define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
 #endif
 #ifndef CLONE_NEWIPC
 #	define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
 #endif
 #ifndef CLONE_NEWUSER
 #	define CLONE_NEWUSER 0x10000000 /* New user namespace */
 #endif
 #ifndef CLONE_NEWPID
 #	define CLONE_NEWPID 0x20000000 /* New pid namespace */
 #endif
 #ifndef CLONE_NEWNET
 #	define CLONE_NEWNET 0x40000000 /* New network namespace */
 #endif
 #endif /* NSENTER_NAMESPACE_H */
--- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go
@ -0,0 +1,12 @@
 // +build linux,!gccgo
 package nsenter
 /*
 #cgo CFLAGS: -Wall
 extern void nsexec();
 void __attribute__((constructor)) init(void) {
 	nsexec();
 }
 */
 import "C"
--- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go
@ -0,0 +1,25 @@
 // +build linux,gccgo
 package nsenter
 /*
 #cgo CFLAGS: -Wall
 extern void nsexec();
 void __attribute__((constructor)) init(void) {
 	nsexec();
 }
 */
 import "C"
 // AlwaysFalse is here to stay false
 // (and be exported so the compiler doesn't optimize out its reference)
 var AlwaysFalse bool
 // init never calls into C in practice: AlwaysFalse is declared above without
 // an initializer, so the branch below is not expected to run. Its only job is
 // to keep a Go-visible reference to the C init() constructor.
 func init() {
 	if AlwaysFalse {
 		// by referencing this C init() in a noop test, it will ensure the compiler
 		// links in the C function.
 		// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134
 		C.init()
 	}
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go
@ -0,0 +1,5 @@
 // +build !linux !cgo
 package nsenter
 import "C"
--- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
+++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
@ -0,0 +1,759 @@
 #define _GNU_SOURCE
 #include <endian.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <grp.h>
 #include <sched.h>
 #include <setjmp.h>
 #include <signal.h>
 #include <stdarg.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
 #include <unistd.h>
 #include <sys/ioctl.h>
 #include <sys/prctl.h>
 #include <sys/socket.h>
 #include <sys/types.h>
 #include <linux/limits.h>
 #include <linux/netlink.h>
 #include <linux/types.h>
 /* Get all of the CLONE_NEW* flags. */
 #include "namespace.h"
 /* Synchronisation values. */
 enum sync_t {
 	SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
 	SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
 	SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
 	SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
 	SYNC_CHILD_READY = 0x44, /* The grandchild is ready to return. */
 	/* XXX: This doesn't help with segfaults and other such issues. */
 	SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
 };
 /* longjmp() arguments. */
 #define JUMP_PARENT 0x00
 #define JUMP_CHILD  0xA0
 #define JUMP_INIT   0xA1
 /* JSON buffer. */
 #define JSON_MAX 4096
 /* Assume the stack grows down, so arguments should be above it. */
 struct clone_t {
 	/*
 	 * Reserve some space for clone() to locate arguments
 	 * and retcode in this place
 	 */
 	char stack[4096] __attribute__ ((aligned(16)));
 	char stack_ptr[0];
 	/* There's two children. This is used to execute the different code. */
 	jmp_buf *env;
 	int jmpval;
 };
 /*
  * Bootstrap configuration decoded by nl_parse() from the netlink message read
  * off the init pipe. The uidmap, gidmap and namespaces pointers refer to
  * memory inside the data buffer; they are not separate allocations.
  */
 struct nlconfig_t {
 	/* malloc'ed netlink payload; released by nl_free(). */
 	char *data;
 	/* Value of CLONE_FLAGS_ATTR, later passed to unshare(2). */
 	uint32_t cloneflags;
 	/* Payload of UIDMAP_ATTR and its length in bytes. */
 	char *uidmap;
 	size_t uidmap_len;
 	/* Payload of GIDMAP_ATTR and its length in bytes. */
 	char *gidmap;
 	size_t gidmap_len;
 	/* Payload of NS_PATHS_ATTR (comma-separated "type:path" entries) and its length. */
 	char *namespaces;
 	size_t namespaces_len;
 	/* Value of SETGROUP_ATTR; non-zero makes the parent write "allow" to setgroups. */
 	uint8_t is_setgroup;
 };
 /*
 * List of netlink message types sent to us as part of bootstrapping the init.
 * These constants are defined in libcontainer/message_linux.go.
 */
 #define INIT_MSG		62000
 #define CLONE_FLAGS_ATTR	27281
 #define NS_PATHS_ATTR		27282
 #define UIDMAP_ATTR		27283
 #define GIDMAP_ATTR		27284
 #define SETGROUP_ATTR		27285
 /*
 * Use the raw syscall for versions of glibc which don't include a function for
 * it, namely (glibc 2.12).
 */
 #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
 #	define _GNU_SOURCE
 #	include "syscall.h"
 #	if !defined(SYS_setns) && defined(__NR_setns)
 #		define SYS_setns __NR_setns
 #	endif
 #ifndef SYS_setns
 #	error "setns(2) syscall not supported by glibc version"
 #endif
 /*
  * Fallback setns(2) wrapper, compiled only when the glibc version check above
  * matches. It forwards both arguments to the raw SYS_setns syscall and returns
  * whatever syscall() returns.
  */
 int setns(int fd, int nstype)
 {
 	return syscall(SYS_setns, fd, nstype);
 }
 #endif
 /* XXX: This is ugly. */
 static int syncfd = -1;
 /* TODO(cyphar): Fix this so it correctly deals with syncT. */
 #define bail(fmt, ...)								\
 	do {									\
 		int ret = __COUNTER__ + 1;					\
 		fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__);	\
 		if (syncfd >= 0) {						\
 			enum sync_t s = SYNC_ERR;				\
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s))		\
 				fprintf(stderr, "nsenter: failed: write(s)");	\
 			if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret))	\
 				fprintf(stderr, "nsenter: failed: write(ret)");	\
 		}								\
 		exit(ret);							\
 	} while(0)
 /*
  * Writes data_len bytes from data to the file whose path is produced by the
  * printf-style pathfmt and its arguments.
  *
  * Returns 0 on success and -1 on failure. On failure errno is the one set by
  * the failing call (or ENAMETOOLONG if the formatted path did not fit), so
  * callers may inspect it -- update_setgroups() relies on seeing ENOENT.
  */
 static int write_file(char *data, size_t data_len, char *pathfmt, ...)
 {
 	int fd, len, saved_errno;
 	ssize_t written;
 	char path[PATH_MAX];
 	va_list ap;
 	va_start(ap, pathfmt);
 	len = vsnprintf(path, PATH_MAX, pathfmt, ap);
 	va_end(ap);
 	if (len < 0)
 		return -1;
 	/* A truncated path would name a different file, so refuse it. */
 	if (len >= PATH_MAX) {
 		errno = ENAMETOOLONG;
 		return -1;
 	}
 	fd = open(path, O_RDWR);
 	/* Return directly: close(-1) would replace open's errno with EBADF. */
 	if (fd < 0)
 		return -1;
 	written = write(fd, data, data_len);
 	if (written < 0 || (size_t) written != data_len) {
 		/* Keep write's errno intact across the cleanup close(). */
 		saved_errno = errno;
 		close(fd);
 		errno = saved_errno;
 		return -1;
 	}
 	close(fd);
 	return 0;
 }
 enum policy_t {
 	SETGROUPS_DEFAULT = 0,
 	SETGROUPS_ALLOW,
 	SETGROUPS_DENY,
 };
 /*
  * Writes the requested policy ("allow" or "deny") to /proc/<pid>/setgroups;
  * SETGROUPS_DEFAULT is a no-op. A failed write is fatal unless errno is
  * ENOENT, and an unrecognised policy value is fatal too.
  *
  * This *must* be called before we touch gid_map.
  */
 static void update_setgroups(int pid, enum policy_t setgroup)
 {
 	char *policy;
 	switch (setgroup) {
 		case SETGROUPS_ALLOW:
 			policy = "allow";
 			break;
 		case SETGROUPS_DENY:
 			policy = "deny";
 			break;
 		case SETGROUPS_DEFAULT:
 			/* Nothing to do. */
 			return;
 		default:
 			/* Never fall through with policy left uninitialised. */
 			errno = EINVAL;
 			bail("unknown setgroups policy %d", (int) setgroup);
 	}
 	if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
 		/*
 		 * If the kernel is too old to support /proc/pid/setgroups,
 		 * open(2) or write(2) will return ENOENT. This is fine.
 		 */
 		if (errno != ENOENT)
 			bail("failed to write '%s' to /proc/%d/setgroups", policy, pid);
 	}
 }
 /*
  * Writes map (map_len bytes) to /proc/<pid>/uid_map. A NULL map or a
  * non-positive length means there is nothing to write; a failed write is fatal.
  */
 static void update_uidmap(int pid, char *map, int map_len)
 {
 	int have_map = (map != NULL) && (map_len > 0);
 	if (have_map && write_file(map, map_len, "/proc/%d/uid_map", pid) < 0)
 		bail("failed to update /proc/%d/uid_map", pid);
 }
 /*
  * Writes map (map_len bytes) to /proc/<pid>/gid_map. A NULL map or a
  * non-positive length means there is nothing to write; a failed write is fatal.
  */
 static void update_gidmap(int pid, char *map, int map_len)
 {
 	int have_map = (map != NULL) && (map_len > 0);
 	if (have_map && write_file(map, map_len, "/proc/%d/gid_map", pid) < 0)
 		bail("failed to update /proc/%d/gid_map", pid);
 }
 /*
  * A dummy function that just jumps to the given jumpval.
  *
  * It is the entry point handed to clone() by clone_parent(). arg is the
  * struct clone_t built there; the function longjmp()s to the saved env with
  * jmpval and therefore never returns to its caller.
  */
 static int child_func(void *arg) __attribute__ ((noinline));
 static int child_func(void *arg)
 {
 	struct clone_t *ca = (struct clone_t *)arg;
 	longjmp(*ca->env, ca->jmpval);
 }
 /*
  * Creates a new process with clone(CLONE_PARENT | SIGCHLD) that starts in
  * child_func() and from there longjmp()s to env with jmpval.
  *
  * The child's stack is the stack[] array inside the local struct clone_t;
  * stack_ptr (the end of that array) is passed as the stack top. Returns the
  * value returned by clone(): the new process's PID in the caller, or a
  * negative value on failure.
  */
 static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
 static int clone_parent(jmp_buf *env, int jmpval)
 {
 	struct clone_t ca = {
 		.env    = env,
 		.jmpval = jmpval,
 	};
 	return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
 }
 /*
  * Gets the init pipe fd from the environment, which is used to read the
  * bootstrap data and tell the parent what the new pid is after we finish
  * setting up the environment.
  *
  * Returns -1 when _LIBCONTAINER_INITPIPE is unset or empty. A value that is
  * not a plain decimal number, or that does not fit in an int, is fatal.
  */
 static int initpipe(void)
 {
 	long pipenum;
 	char *value, *endptr;
 	value = getenv("_LIBCONTAINER_INITPIPE");
 	if (value == NULL || *value == '\0')
 		return -1;
 	/* strtol() reports overflow only through errno, so clear it first. */
 	errno = 0;
 	pipenum = strtol(value, &endptr, 10);
 	if (*endptr != '\0')
 		bail("unable to parse _LIBCONTAINER_INITPIPE");
 	/* Refuse values that would be silently truncated into another fd. */
 	if (errno == ERANGE || (long) (int) pipenum != pipenum) {
 		errno = ERANGE;
 		bail("_LIBCONTAINER_INITPIPE is out of range");
 	}
 	return (int) pipenum;
 }
 /*
  * Maps a namespace name ("cgroup", "ipc", "mnt", "net", "pid", "user", "uts")
  * to its CLONE_NEW* flag. Unrecognised names map to 0.
  */
 static int nsflag(char *name)
 {
 	static const struct {
 		const char *name;
 		int flag;
 	} known[] = {
 		{"cgroup", CLONE_NEWCGROUP},
 		{"ipc", CLONE_NEWIPC},
 		{"mnt", CLONE_NEWNS},
 		{"net", CLONE_NEWNET},
 		{"pid", CLONE_NEWPID},
 		{"user", CLONE_NEWUSER},
 		{"uts", CLONE_NEWUTS},
 	};
 	size_t idx;
 	for (idx = 0; idx < sizeof(known) / sizeof(known[0]); idx++) {
 		if (strcmp(name, known[idx].name) == 0)
 			return known[idx].flag;
 	}
 	/* No match: the caller gets 0. */
 	return 0;
 }
 /*
  * Reads a 32-bit unsigned integer in host byte order from the first four
  * bytes of buf. memcpy() is used instead of dereferencing a cast pointer so
  * that buf needs no particular alignment and no aliasing rule is violated.
  */
 static uint32_t readint32(char *buf)
 {
 	uint32_t value;
 	memcpy(&value, buf, sizeof(value));
 	return value;
 }
 /* Returns the first byte of buf as an unsigned 8-bit value. */
 static uint8_t readint8(char *buf)
 {
 	return (uint8_t) buf[0];
 }
 /*
  * Reads one INIT_MSG netlink message from fd and decodes its attributes into
  * config. The payload is read into a malloc'ed buffer stored in config->data;
  * the uidmap, gidmap and namespaces fields point into that buffer, so it must
  * stay allocated until nl_free() is called.
  *
  * Any short read, unexpected message type, inconsistent length or unknown
  * attribute is fatal.
  */
 static void nl_parse(int fd, struct nlconfig_t *config)
 {
 	size_t size;
 	ssize_t len;
 	struct nlmsghdr hdr;
 	char *data, *current, *end;
 	/* Retrieve the netlink header. */
 	len = read(fd, &hdr, NLMSG_HDRLEN);
 	if (len != NLMSG_HDRLEN)
 		bail("invalid netlink header length %zd", len);
 	if (hdr.nlmsg_type == NLMSG_ERROR)
 		bail("failed to read netlink message");
 	if (hdr.nlmsg_type != INIT_MSG)
 		bail("unexpected msg type %d", hdr.nlmsg_type);
 	/* nlmsg_len counts the header too; a smaller value would underflow below. */
 	if (hdr.nlmsg_len < (uint32_t) NLMSG_HDRLEN)
 		bail("invalid netlink message length %u", hdr.nlmsg_len);
 	/* Retrieve data. */
 	size = NLMSG_PAYLOAD(&hdr, 0);
 	current = data = malloc(size);
 	if (!data)
 		bail("failed to allocate %zu bytes of memory for nl_payload", size);
 	len = read(fd, data, size);
 	if (len < 0 || (size_t) len != size)
 		bail("failed to read netlink payload, %zd != %zu", len, size);
 	/* Parse the netlink payload. */
 	config->data = data;
 	end = data + size;
 	while (current < end) {
 		struct nlattr *nlattr = (struct nlattr *)current;
 		size_t remaining = end - current;
 		size_t payload_len;
 		/* Make sure the attribute header and its payload lie inside the buffer. */
 		if (remaining < (size_t) NLA_HDRLEN)
 			bail("truncated netlink attribute header");
 		if (nlattr->nla_len < NLA_HDRLEN || (size_t) nlattr->nla_len > remaining)
 			bail("invalid netlink attribute length %u", nlattr->nla_len);
 		payload_len = nlattr->nla_len - NLA_HDRLEN;
 		/* Advance to payload. */
 		current += NLA_HDRLEN;
 		/* Handle payload. */
 		switch (nlattr->nla_type) {
 		case CLONE_FLAGS_ATTR:
 			if (payload_len < sizeof(uint32_t))
 				bail("netlink attribute %d is too short", nlattr->nla_type);
 			config->cloneflags = readint32(current);
 			break;
 		case NS_PATHS_ATTR:
 			config->namespaces = current;
 			config->namespaces_len = payload_len;
 			break;
 		case UIDMAP_ATTR:
 			config->uidmap = current;
 			config->uidmap_len = payload_len;
 			break;
 		case GIDMAP_ATTR:
 			config->gidmap = current;
 			config->gidmap_len = payload_len;
 			break;
 		case SETGROUP_ATTR:
 			if (payload_len < sizeof(uint8_t))
 				bail("netlink attribute %d is too short", nlattr->nla_type);
 			config->is_setgroup = readint8(current);
 			break;
 		default:
 			bail("unknown netlink message type %d", nlattr->nla_type);
 		}
 		/* Attributes are padded to NLA_ALIGNTO; step over the padding too. */
 		current += NLA_ALIGN(payload_len);
 	}
 }
 /*
  * Releases the payload buffer that nl_parse() stored in config->data. The
  * uidmap, gidmap and namespaces pointers are set from addresses inside that
  * buffer, so they must not be used after this call.
  */
 void nl_free(struct nlconfig_t *config)
 {
 	free(config->data);
 }
 /*
  * Joins every namespace named in nslist, a comma-separated list of
  * "type:path" entries. nslist is modified in place while it is tokenised.
  *
  * All paths are opened before the first setns(2) call, and the namespaces are
  * then joined in the order given. Any failure is fatal.
  */
 void join_namespaces(char *nslist)
 {
 	int num = 0, i;
 	char *saveptr = NULL;
 	char *namespace = strtok_r(nslist, ",", &saveptr);
 	struct namespace_t {
 		int fd;
 		int ns;
 		char path[PATH_MAX];
 	} *namespaces = NULL;
 	if (!namespace || !strlen(namespace) || !strlen(nslist))
 		bail("ns paths are empty");
 	/*
 	 * We have to open the file descriptors first, since after
 	 * we join the mnt namespace we might no longer be able to
 	 * access the paths.
 	 */
 	do {
 		int fd;
 		char *path;
 		struct namespace_t *ns;
 		/* Resize the namespace array. */
 		namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
 		if (!namespaces)
 			bail("failed to reallocate namespace array");
 		ns = &namespaces[num - 1];
 		/* Split 'ns:path'. */
 		path = strstr(namespace, ":");
 		if (!path)
 			bail("failed to parse %s", namespace);
 		*path++ = '\0';
 		fd = open(path, O_RDONLY);
 		if (fd < 0)
 			bail("failed to open %s", path);
 		ns->fd = fd;
 		ns->ns = nsflag(namespace);
 		/* strncpy() does not terminate on truncation, so do it by hand. */
 		strncpy(ns->path, path, PATH_MAX - 1);
 		ns->path[PATH_MAX - 1] = '\0';
 	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
 	/*
 	 * The ordering in which we join namespaces is important. We should
 	 * always join the user namespace *first*. This is all guaranteed
 	 * from the container_linux.go side of this, so we're just going to
 	 * follow the order given to us.
 	 */
 	for (i = 0; i < num; i++) {
 		/* Use a pointer: each entry embeds a PATH_MAX-sized buffer. */
 		struct namespace_t *ns = &namespaces[i];
 		if (setns(ns->fd, ns->ns) < 0)
 			bail("failed to setns to %s", ns->path);
 		close(ns->fd);
 	}
 	free(namespaces);
 }
 /*
  * Bootstraps the container process in three stages selected through
  * setjmp()/longjmp():
  *
  *   stage 0 (JUMP_PARENT): clones stage 1, services its sync requests
  *                          (user mappings, PID hand-off), writes the final
  *                          PID as JSON to the init pipe, then exits.
  *   stage 1 (JUMP_CHILD):  joins and unshares namespaces, clones stage 2,
  *                          reports its PID to stage 0, then exits.
  *   stage 2 (JUMP_INIT):   finishes setup and returns to the caller.
  *
  * The function returns only when no init pipe is configured, or in the
  * stage 2 process; every other path ends in exit() or bail().
  */
 void nsexec(void)
 {
 	int pipenum;
 	jmp_buf env;
 	int syncpipe[2];
 	struct nlconfig_t config = {0};
 	/*
 	 * If we don't have an init pipe, just return to the go routine.
 	 * We'll only get an init pipe for start or exec.
 	 */
 	pipenum = initpipe();
 	if (pipenum == -1)
 		return;
 	/* make the process non-dumpable */
 	if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) != 0) {
 		bail("failed to set process as non-dumpable");
 	}
 	/* Parse all of the netlink configuration. */
 	nl_parse(pipenum, &config);
 	/* Pipe so we can tell the child when we've finished setting up. */
 	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0)
 		bail("failed to setup sync pipe between parent and child");
 	/* TODO: Currently we aren't dealing with child deaths properly. */
 	/*
 	 * Okay, so this is quite annoying.
 	 *
 	 * In order for this unsharing code to be more extensible we need to split
 	 * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
 	 * would be if we did clone(CLONE_NEWUSER) and the other namespaces
 	 * separately, but because of SELinux issues we cannot really do that. But
 	 * we cannot just dump the namespace flags into clone(...) because several
 	 * usecases (such as rootless containers) require more granularity around
 	 * the namespace setup. In addition, some older kernels had issues where
 	 * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
 	 * handle this while also dealing with SELinux so we choose SELinux support
 	 * over broken kernel support).
 	 *
 	 * However, if we unshare(2) the user namespace *before* we clone(2), then
 	 * all hell breaks loose.
 	 *
 	 * The parent no longer has permissions to do many things (unshare(2) drops
 	 * all capabilities in your old namespace), and the container cannot be set
 	 * up to have more than one {uid,gid} mapping. This is obviously less than
 	 * ideal. In order to fix this, we have to first clone(2) and then unshare.
 	 *
 	 * Unfortunately, it's not as simple as that. We have to fork to enter the
 	 * PID namespace (the PID namespace only applies to children). Since we'll
 	 * have to double-fork, this clone_parent() call won't be able to get the
 	 * PID of the _actual_ init process (without doing more synchronisation than
 	 * I can deal with at the moment). So we'll just get the parent to send it
 	 * for us, the only job of this process is to update
 	 * /proc/pid/{setgroups,uid_map,gid_map}.
 	 *
 	 * And as a result of the above, we also need to setns(2) in the first child
 	 * because if we join a PID namespace in the topmost parent then our child
 	 * will be in that namespace (and it will not be able to give us a PID value
 	 * that makes sense without resorting to sending things with cmsg).
 	 *
 	 * This also deals with an older issue caused by dumping cloneflags into
 	 * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
 	 * we have to unshare(2) before clone(2) in order to do this. This was fixed
 	 * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
 	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
 	 * aware, the last mainline kernel which had this bug was Linux 3.12.
 	 * However, we cannot comment on which kernels the broken patch was
 	 * backported to.
 	 *
 	 * -- Aleksa "what has my life come to?" Sarai
 	 */
 	switch (setjmp(env)) {
 	/*
 	 * Stage 0: We're in the parent. Our job is just to create a new child
 	 *          (stage 1: JUMP_CHILD) process and write its uid_map and
 	 *          gid_map. That process will go on to create a new process, then
 	 *          it will send us its PID which we will send to the bootstrap
 	 *          process.
 	 */
 	case JUMP_PARENT: {
 			int len, ready = 0;
 			pid_t child;
 			char buf[JSON_MAX];
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);
 			/* Start the process of getting a container. */
 			child = clone_parent(&env, JUMP_CHILD);
 			if (child < 0)
 				bail("unable to fork: child_func");
 			/*
 			 * State machine for synchronisation with the children.
 			 *
 			 * The parent only leaves this loop once both the child and the
 			 * grandchild have reported in, so that it can receive any error
 			 * code generated by either of them.
 			 */
 			while (ready < 2) {
 				enum sync_t s;
 				/* This doesn't need to be global, we're in the parent. */
 				int syncfd = syncpipe[1];
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with child: next state");
 				switch (s) {
 				case SYNC_ERR: {
 						/* We have to mirror the error code of the child. */
 						int ret;
 						if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
 							bail("failed to sync with child: read(error code)");
 						exit(ret);
 					}
 					break;
 				case SYNC_USERMAP_PLS:
 					/* Enable setgroups(2) if we've been asked to. */
 					if (config.is_setgroup)
 						update_setgroups(child, SETGROUPS_ALLOW);
 					/* Set up mappings. */
 					update_uidmap(child, config.uidmap, config.uidmap_len);
 					update_gidmap(child, config.gidmap, config.gidmap_len);
 					s = SYNC_USERMAP_ACK;
 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 						kill(child, SIGKILL);
 						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
 					}
 					break;
 				case SYNC_USERMAP_ACK:
 					/* We should _never_ receive acks. */
 					kill(child, SIGKILL);
 					bail("failed to sync with child: unexpected SYNC_USERMAP_ACK");
 					break;
 				case SYNC_RECVPID_PLS: {
 						pid_t old = child;
 						/* Get the init_func pid. */
 						if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
 							kill(old, SIGKILL);
 							bail("failed to sync with child: read(childpid)");
 						}
 						/* Send ACK. */
 						s = SYNC_RECVPID_ACK;
 						if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 							kill(old, SIGKILL);
 							kill(child, SIGKILL);
 							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
 						}
 					}
 					ready++;
 					break;
 				case SYNC_RECVPID_ACK:
 					/* We should _never_ receive acks. */
 					kill(child, SIGKILL);
 					bail("failed to sync with child: unexpected SYNC_RECVPID_ACK");
 					break;
 				case SYNC_CHILD_READY:
 					ready++;
 					break;
 				default:
 					bail("unexpected sync value");
 					break;
 				}
 			}
 			/* Send the init_func pid back to our parent. */
 			len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
 			if (len < 0) {
 				kill(child, SIGKILL);
 				bail("unable to generate JSON for child pid");
 			}
 			if (write(pipenum, buf, len) != len) {
 				kill(child, SIGKILL);
 				bail("unable to send child pid to bootstrapper");
 			}
 			exit(0);
 		}
 	/*
 	 * Stage 1: We're in the first child process. Our job is to join any
 	 *          provided namespaces in the netlink payload and unshare all
 	 *          of the requested namespaces. If we've been asked to
 	 *          CLONE_NEWUSER, we will ask our parent (stage 0) to set up
 	 *          our user mappings for us. Then, we create a new child
 	 *          (stage 2: JUMP_INIT) for PID namespace. We then send the
 	 *          child's PID to our parent (stage 0).
 	 */
 	case JUMP_CHILD: {
 			pid_t child;
 			enum sync_t s;
 			/* We're in a child and thus need to tell the parent if we die. */
 			syncfd = syncpipe[0];
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);
 			/*
 			 * We need to setns first. We cannot do this earlier (in stage 0)
 			 * because of the fact that we forked to get here (the PID of
 			 * [stage 2: JUMP_INIT]) would be meaningless). We could send it
 			 * using cmsg(3) but that's just annoying.
 			 */
 			if (config.namespaces)
 				join_namespaces(config.namespaces);
 			/*
 			 * Unshare all of the namespaces. Now, it should be noted that this
 			 * ordering might break in the future (especially with rootless
 			 * containers). But for now, it's not possible to split this into
 			 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
 			 *
 			 * Note that we don't merge this with clone() because there were
 			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
 			 * was broken, so we'll just do it the long way anyway.
 			 */
 			if (unshare(config.cloneflags) < 0)
 				bail("failed to unshare namespaces");
 			/*
 			 * Deal with user namespaces first. They are quite special, as they
 			 * affect our ability to unshare other namespaces and are used as
 			 * context for privilege checks.
 			 */
 			if (config.cloneflags & CLONE_NEWUSER) {
 				/*
 				 * We don't have the privileges to do any mapping here (see the
 				 * clone_parent rant). So signal our parent to hook us up.
 				 */
 				s = SYNC_USERMAP_PLS;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
 				/* ... wait for mapping ... */
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
 				if (s != SYNC_USERMAP_ACK)
 					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
 			}
 			/*
 			 * TODO: What about non-namespace clone flags that we're dropping here?
 			 *
 			 * We fork again because of PID namespace, setns(2) or unshare(2) don't
 			 * change the PID namespace of the calling process, because doing so
 			 * would change the caller's idea of its own PID (as reported by getpid()),
 			 * which would break many applications and libraries, so we must fork
 			 * to actually enter the new PID namespace.
 			 */
 			child = clone_parent(&env, JUMP_INIT);
 			if (child < 0)
 				bail("unable to fork: init_func");
 			/* Send the child to our parent, which knows what it's doing. */
 			s = SYNC_RECVPID_PLS;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 				kill(child, SIGKILL);
 				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
 			}
 			if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
 				kill(child, SIGKILL);
 				bail("failed to sync with parent: write(childpid)");
 			}
 			/* ... wait for parent to get the pid ... */
 			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
 				kill(child, SIGKILL);
 				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
 			}
 			if (s != SYNC_RECVPID_ACK) {
 				kill(child, SIGKILL);
 				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
 			}
 			/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
 			exit(0);
 		}
 	/*
 	 * Stage 2: We're the final child process, and the only process that will
 	 *          actually return to the Go runtime. Our job is to just do the
 	 *          final cleanup steps and then return to the Go runtime to allow
 	 *          init_linux.go to run.
 	 */
 	case JUMP_INIT: {
 			/*
 			 * We're inside the final child now, having jumped here from
 			 * child_func() after stage 1 called clone_parent(JUMP_INIT).
 			 */
 			enum sync_t s;
 			/* We're in a child and thus need to tell the parent if we die. */
 			syncfd = syncpipe[0];
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0);
 			if (setsid() < 0)
 				bail("setsid failed");
 			if (setuid(0) < 0)
 				bail("setuid failed");
 			if (setgid(0) < 0)
 				bail("setgid failed");
 			if (setgroups(0, NULL) < 0)
 				bail("setgroups failed");
 			s = SYNC_CHILD_READY;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
 				bail("failed to sync with parent: write(SYNC_CHILD_READY)");
 			/* Close sync pipes. */
 			close(syncpipe[0]);
 			close(syncpipe[1]);
 			/* Free netlink data. */
 			nl_free(&config);
 			/* Finish executing, let the Go runtime take over. */
 			return;
 		}
 	default:
 		bail("unexpected jump value");
 		break;
 	}
 	/* Should never be reached. */
 	bail("should never be reached");
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go
@ -14,8 +14,10 @@ func GetProcessStartTime(pid int) (string, error) {
 	if err != nil {
 		return "", err
 	}
 	return parseStartTime(string(data))
 }
-	parts := strings.Split(string(data), " ")
+func parseStartTime(stat string) (string, error) {
 	// the starttime is located at pos 22
 	// from the man page
 	//
@ -23,5 +25,19 @@ func GetProcessStartTime(pid int) (string, error) {
 	// (22)  The  time the process started after system boot.  In kernels before Linux 2.6, this
 	// value was expressed in jiffies.  Since Linux 2.6, the value is expressed in  clock  ticks
 	// (divide by sysconf(_SC_CLK_TCK)).
-	return parts[22-1], nil // starts at 1
+	//
 	// NOTE:
 	// pos 2 could contain space and is inside `(` and `)`:
 	// (2) comm  %s
 	// The filename of the executable, in parentheses.
 	// This is visible whether or not the executable is
 	// swapped out.
 	//
 	// the following is an example:
 	// 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 	// get parts after last `)`:
 	s := strings.Split(stat, ")")
 	parts := strings.Split(strings.TrimSpace(s[len(s)-1]), " ")
 	return parts[22-3], nil // starts at 3 (after the filename pos `2`)
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go
@ -343,7 +343,7 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (
 			if len(groups) > 0 {
 				// First match wins, even if there's more than one matching entry.
 				user.Gid = groups[0].Gid
-			} else if groupArg != "" {
+			} else {
 				// If we can't find a group with the given name, the only other valid
 				// option is if it's a numeric group name with no associated entry in group.
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.c
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.c
@ -0,0 +1,148 @@
 /*
 * Copyright 2016 SUSE LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/socket.h>
 #include <sys/types.h>
 #include <unistd.h>
 #include "cmsg.h"
 #define error(fmt, ...)							\
 	({								\
 		fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
 		errno = ECOMM;						\
 		goto err; /* return value */				\
 	})
 /*
  * Sends a file descriptor along the sockfd provided, with file.name
  * (including its terminating NUL) as the regular payload. Returns the return
  * value of sendmsg(2), or -1 with errno set to EINVAL when file.name is NULL.
  * Any synchronisation and preparation of state should be done external to
  * this (we expect the other side to be in recvfd() in the code).
  */
 ssize_t sendfd(int sockfd, struct file_t file)
 {
 	struct msghdr msg = {0};
 	struct iovec iov[1] = {0};
 	struct cmsghdr *cmsg;
 	int *fdptr;
 	union {
 		char buf[CMSG_SPACE(sizeof(file.fd))];
 		struct cmsghdr align;
 	} u;
 	/* strlen() below would dereference a NULL name. */
 	if (!file.name) {
 		errno = EINVAL;
 		return -1;
 	}
 	/*
 	 * We need to send some other data along with the ancillary data,
 	 * otherwise the other side won't receive any data. This is very
 	 * well-hidden in the documentation (and only applies to
 	 * SOCK_STREAM). See the bottom part of unix(7).
 	 */
 	iov[0].iov_base = file.name;
 	iov[0].iov_len = strlen(file.name) + 1;
 	msg.msg_name = NULL;
 	msg.msg_namelen = 0;
 	msg.msg_iov = iov;
 	msg.msg_iovlen = 1;
 	msg.msg_control = u.buf;
 	msg.msg_controllen = sizeof(u.buf);
 	/* A single SCM_RIGHTS control message carrying exactly one descriptor. */
 	cmsg = CMSG_FIRSTHDR(&msg);
 	cmsg->cmsg_level = SOL_SOCKET;
 	cmsg->cmsg_type = SCM_RIGHTS;
 	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 	fdptr = (int *) CMSG_DATA(cmsg);
 	memcpy(fdptr, &file.fd, sizeof(int));
 	return sendmsg(sockfd, &msg, 0);
 }
 /*
 * Receives a file descriptor from the sockfd provided. Returns the file
 * descriptor as sent from sendfd(). It will return the file descriptor
 * or die (literally) trying. Any synchronisation and preparation of
 * state should be done external to this (we expect the other side to be
 * in sendfd() in the code).
 */
 struct file_t recvfd(int sockfd)
 {
 	struct msghdr msg = {0};
 	struct iovec iov[1] = {0};
 	struct cmsghdr *cmsg;
 	struct file_t file = {0};
 	int *fdptr;
 	int olderrno;
 	union {
 		char buf[CMSG_SPACE(sizeof(file.fd))];
 		struct cmsghdr align;
 	} u;
 	/* Allocate a buffer. */
 	/* TODO: Make this dynamic with MSG_PEEK. */
 	file.name = malloc(TAG_BUFFER);
 	if (!file.name)
 		error("recvfd: failed to allocate file.tag buffer\n");
 	/*
 	 * We need to "recieve" the non-ancillary data even though we don't
 	 * plan to use it at all. Otherwise, things won't work as expected.
 	 * See unix(7) and other well-hidden documentation.
 	 */
 	iov[0].iov_base = file.name;
 	iov[0].iov_len = TAG_BUFFER;
 	msg.msg_name = NULL;
 	msg.msg_namelen = 0;
 	msg.msg_iov = iov;
 	msg.msg_iovlen = 1;
 	msg.msg_control = u.buf;
 	msg.msg_controllen = sizeof(u.buf);
 	ssize_t ret = recvmsg(sockfd, &msg, 0);
 	if (ret < 0)
 		goto err;
 	cmsg = CMSG_FIRSTHDR(&msg);
 	if (!cmsg)
 		error("recvfd: got NULL from CMSG_FIRSTHDR");
 	if (cmsg->cmsg_level != SOL_SOCKET)
 		error("recvfd: expected SOL_SOCKET in cmsg: %d", cmsg->cmsg_level);
 	if (cmsg->cmsg_type != SCM_RIGHTS)
 		error("recvfd: expected SCM_RIGHTS in cmsg: %d", cmsg->cmsg_type);
 	if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
 		error("recvfd: expected correct CMSG_LEN in cmsg: %lu", cmsg->cmsg_len);
 	fdptr = (int *) CMSG_DATA(cmsg);
 	if (!fdptr || *fdptr < 0)
 		error("recvfd: recieved invalid pointer");
 	file.fd = *fdptr;
 	return file;
 err:
 	olderrno = errno;
 	free(file.name);
 	errno = olderrno;
 	return (struct file_t){0};
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
@ -0,0 +1,57 @@
 // +build linux
 package utils
 /*
 * Copyright 2016 SUSE LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*
 #include <errno.h>
 #include <stdlib.h>
 #include "cmsg.h"
 */
 import "C"
 import (
 	"os"
 	"unsafe"
 )
 // RecvFd blocks until a file descriptor arrives on the given AF_UNIX socket
 // and wraps it in an *os.File. The name of the returned file is the string
 // that travelled with the descriptor as the non-auxiliary part of the
 // payload. Any error reported by the underlying C recvfd call is returned
 // unchanged.
 func RecvFd(socket *os.File) (*os.File, error) {
 	sockfd := C.int(socket.Fd())
 	received, err := C.recvfd(sockfd)
 	if err != nil {
 		return nil, err
 	}
 	// Copy the name into Go memory first; the C buffer can then be released
 	// before the *os.File is built.
 	name := C.GoString(received.name)
 	C.free(unsafe.Pointer(received.name))
 	return os.NewFile(uintptr(received.fd), name), nil
 }
 // SendFd transmits the descriptor of file over the given AF_UNIX socket.
 // The value of file.Name() is sent along as non-auxiliary data in the same
 // payload, so the receiver gets contextual information about the
 // descriptor.
 func SendFd(socket, file *os.File) error {
 	// The C copy of the name only needs to live for the duration of the call.
 	cname := C.CString(file.Name())
 	defer C.free(unsafe.Pointer(cname))
 	payload := C.struct_file_t{
 		name: cname,
 		fd:   C.int(file.Fd()),
 	}
 	_, err := C.sendfd(C.int(socket.Fd()), payload)
 	return err
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.h
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.h
@ -0,0 +1,36 @@
 /*
 * Copyright 2016 SUSE LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #pragma once
 #if !defined(CMSG_H)
 #define CMSG_H
 #include <sys/types.h>
 /*
  * Size, in bytes, of the buffer that recvfd() allocates to hold the name
  * sent alongside a file descriptor. It is a fixed upper bound on how much
  * name data a single receive can return.
  */
 /* TODO: Implement this properly with MSG_PEEK. */
 #define TAG_BUFFER 4096
 /*
  * This mirrors Go's (*os.File): a file descriptor paired with the name it
  * is passed around with.
  */
 struct file_t {
 	/*
 	 * Name transferred as the non-ancillary payload. When the struct is
 	 * returned by recvfd() this points at a malloc()ed buffer of
 	 * TAG_BUFFER bytes that the receiver must free().
 	 */
 	char *name;
 	/* The file descriptor itself. */
 	int fd;
 };
 /*
  * Receives one file descriptor, together with its name, from sockfd. If the
  * underlying recvmsg() fails, a zeroed struct is returned and errno holds
  * the recvmsg() error; other validation failures are reported through
  * error().
  */
 struct file_t recvfd(int sockfd);
 /*
  * Sends file.fd over sockfd, with file.name as the accompanying
  * non-ancillary data.
  * NOTE(review): the implementation is not visible from this header, so the
  * meaning of the ssize_t result (presumably the sendmsg() return value)
  * should be confirmed against cmsg.c.
  */
 ssize_t sendfd(int sockfd, struct file_t file);
 #endif /* !defined(CMSG_H) */
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
@ -0,0 +1,126 @@
 package utils
 import (
 	"crypto/rand"
 	"encoding/hex"
 	"encoding/json"
 	"io"
 	"os"
 	"path/filepath"
 	"strings"
 	"syscall"
 	"unsafe"
 )
 // exitSignalOffset is added to the signal number to build the exit status
 // of a process that was terminated by a signal (see ExitStatus).
 const exitSignalOffset = 128
 // GenerateRandomName returns prefix followed by size characters of random,
 // lowercase hexadecimal text read from crypto/rand.
 //
 // At most 64 random characters are available, so a larger size is truncated
 // to 64. A negative size is treated as 0 (only the prefix is returned)
 // rather than causing a slice-bounds panic. An error is returned only if the
 // system's random source cannot be read.
 func GenerateRandomName(prefix string, size int) (string, error) {
 	const (
 		randomBytes = 32
 		// Every random byte is rendered as two hex characters.
 		maxSize = 2 * randomBytes
 	)
 	// Clamp the requested length into the range the hex string can serve.
 	switch {
 	case size < 0:
 		size = 0
 	case size > maxSize:
 		size = maxSize
 	}
 	id := make([]byte, randomBytes)
 	if _, err := io.ReadFull(rand.Reader, id); err != nil {
 		return "", err
 	}
 	return prefix + hex.EncodeToString(id)[:size], nil
 }
 // ResolveRootfs turns uncleanRootfs into an absolute path (relative paths
 // are resolved against the current working directory) and then follows any
 // symlinks in it, returning the resulting path.
 func ResolveRootfs(uncleanRootfs string) (string, error) {
 	absolute, err := filepath.Abs(uncleanRootfs)
 	if err == nil {
 		return filepath.EvalSymlinks(absolute)
 	}
 	return "", err
 }
 // ExitStatus converts a wait status into an exit code. A process that
 // exited on its own yields its exit status; a process killed by a signal
 // yields exitSignalOffset plus the signal number.
 func ExitStatus(status syscall.WaitStatus) int {
 	if !status.Signaled() {
 		return status.ExitStatus()
 	}
 	return int(status.Signal()) + exitSignalOffset
 }
 // WriteJSON marshals v with the standard encoding/json rules and writes the
 // resulting bytes to w in a single Write call. Nothing is written if
 // marshaling fails.
 func WriteJSON(w io.Writer, v interface{}) error {
 	encoded, err := json.Marshal(v)
 	if err == nil {
 		_, err = w.Write(encoded)
 	}
 	return err
 }
 // CleanPath normalises a path so that it is safe to hand to filepath.Join.
 // Absolute paths are simply cleaned. Relative paths are additionally rooted
 // at '/', cleaned, and made relative again, which strips any leading ".."
 // components; the result of joining it onto another path therefore always
 // stays lexically below that path. The empty string is returned unchanged.
 // The whole operation is purely lexical, so paths containing symlinks are
 // not made safe by CleanPath.
 func CleanPath(path string) string {
 	// Nothing to do for the empty string.
 	if path == "" {
 		return ""
 	}
 	// A first pass tames inputs such as "/../../../../../".
 	cleaned := filepath.Clean(path)
 	// Absolute paths stay absolute; one more Clean for good measure.
 	if filepath.IsAbs(cleaned) {
 		return filepath.Clean(cleaned)
 	}
 	// Relative paths such as "../../../../<etc>/some/path" are anchored at
 	// the root so that the ".." components have nowhere to escape to.
 	root := string(os.PathSeparator)
 	anchored := filepath.Clean(root + cleaned)
 	// This can't fail, as (by definition) all paths are relative to root.
 	relative, _ := filepath.Rel(root, anchored)
 	return filepath.Clean(relative)
 }
 // SearchLabels looks through a list of "key=value" labels and returns the
 // value of the first label whose key equals query. Labels without an '='
 // are skipped, and only the first '=' separates key from value. The empty
 // string is returned when no label matches.
 func SearchLabels(labels []string, query string) string {
 	for i := range labels {
 		sep := strings.IndexByte(labels[i], '=')
 		if sep < 0 {
 			// Not a key-value pair.
 			continue
 		}
 		if labels[i][:sep] == query {
 			return labels[i][sep+1:]
 		}
 	}
 	return ""
 }
 // Annotations splits a list of "key=value" labels from the libcontainer
 // state into the bundle path and the user defined annotations. The "bundle"
 // label is added by libcontainer itself, so it is returned separately
 // instead of being included in the map. Labels without an '=' are ignored,
 // and a later label overrides an earlier one with the same key.
 func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
 	userAnnotations = map[string]string{}
 	for _, label := range labels {
 		sep := strings.Index(label, "=")
 		if sep == -1 {
 			continue
 		}
 		key, value := label[:sep], label[sep+1:]
 		switch key {
 		case "bundle":
 			bundle = value
 		default:
 			userAnnotations[key] = value
 		}
 	}
 	return bundle, userAnnotations
 }
 // GetIntSize returns the size, in bytes, of the Go int type on the current
 // platform.
 func GetIntSize() int {
 	var sample int
 	return int(unsafe.Sizeof(sample))
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
@ -0,0 +1,33 @@
 // +build !windows
 package utils
 import (
 	"io/ioutil"
 	"strconv"
 	"syscall"
 )
 // CloseExecFrom marks every file descriptor of the current process that is
 // numbered minFd or higher as close-on-exec. The descriptors are discovered
 // by listing /proc/self/fd; an error is returned only if that listing
 // fails.
 func CloseExecFrom(minFd int) error {
 	entries, err := ioutil.ReadDir("/proc/self/fd")
 	if err != nil {
 		return err
 	}
 	for _, entry := range entries {
 		// Names that are not numbers are not descriptors, and descriptors
 		// below the requested minimum are left untouched.
 		fd, convErr := strconv.Atoi(entry.Name())
 		if convErr != nil || fd < minFd {
 			continue
 		}
 		// Errors from syscall.CloseOnExec are intentionally ignored: the
 		// cases where it might fail are basically descriptors that have
 		// already been closed (including, and especially, the one that was
 		// created when ioutil.ReadDir did the "opendir" syscall).
 		syscall.CloseOnExec(fd)
 	}
 	return nil
 }