commit a4395189ad3da9a21b2759da734c5e7b2e2ebf8c
Author: Vincent Batts
Date:   Fri Aug 8 22:36:39 2014 -0400

    initial commit

diff --git a/demo_userns.c b/demo_userns.c
new file mode 100644
index 0000000..f7139f1
--- /dev/null
+++ b/demo_userns.c
@@ -0,0 +1,88 @@
+/* demo_userns.c
+
+   Copyright 2013, Michael Kerrisk
+   Licensed under GNU General Public License v2 or later
+
+   Demonstrate the use of the clone() CLONE_NEWUSER flag.
+
+   Link with "-lcap" and make sure that the "libcap-devel" (or
+   similar) package is installed on the system.
+*/
+#define _GNU_SOURCE
+#include <sys/capability.h>
+#include <sys/wait.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#define errExit(msg)    do { perror(msg); exit(EXIT_FAILURE); \
+                        } while (0)
+
+/* Startup function for cloned child: display the effective user and
+   group IDs and the capabilities of the calling process. If 'arg' is
+   NULL this is done once; otherwise it is repeated every 5 seconds. */
+
+static int
+childFunc(void *arg)
+{
+    for (;;) {
+        cap_t caps;
+        char *caps_text;
+
+        printf("eUID = %ld; eGID = %ld; ",
+                (long) geteuid(), (long) getegid());
+
+        caps = cap_get_proc();
+        if (caps == NULL)
+            errExit("cap_get_proc");
+
+        caps_text = cap_to_text(caps, NULL);
+        if (caps_text == NULL)
+            errExit("cap_to_text");
+
+        printf("capabilities: %s\n", caps_text);
+
+        /* Both objects were allocated by libcap; release them so that
+           the repeating mode does not leak memory on each iteration */
+
+        cap_free(caps_text);
+        cap_free(caps);
+
+        if (arg == NULL)
+            break;
+
+        sleep(5);
+    }
+
+    return 0;
+}
+
+#define STACK_SIZE (1024 * 1024)
+
+static char child_stack[STACK_SIZE];    /* Space for child's stack */
+
+/* Create a child in a new user namespace that runs childFunc(), passing
+   it the first command-line argument (NULL if there is none), and wait
+   for the child to terminate. */
+
+int
+main(int argc, char *argv[])
+{
+    pid_t pid;
+
+    /* Create child; child commences execution in childFunc() */
+
+    pid = clone(childFunc, child_stack + STACK_SIZE,    /* Assume stack
+                                                           grows downward */
+                CLONE_NEWUSER | SIGCHLD, argv[1]);
+    if (pid == -1)
+        errExit("clone");
+
+    /* Parent falls through to here. Wait for child. */
+
+    if (waitpid(pid, NULL, 0) == -1)
+        errExit("waitpid");
+
+    exit(EXIT_SUCCESS);
+}
diff --git a/ns_child_exec.c b/ns_child_exec.c
new file mode 100644
index 0000000..ed61008
--- /dev/null
+++ b/ns_child_exec.c
@@ -0,0 +1,97 @@
+/* ns_child_exec.c
+
+   Copyright 2013, Michael Kerrisk
+   Licensed under GNU General Public License v2 or later
+
+   Create a child process that executes a shell command in new namespace(s).
+*/
+#define _GNU_SOURCE
+#include <sched.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+#include <signal.h>
+#include <stdio.h>
+
+/* A simple error-handling function: print an error message based
+   on the value in 'errno' and terminate the calling process */
+
+#define errExit(msg)    do { perror(msg); exit(EXIT_FAILURE); \
+                        } while (0)
+
+static void             /* Print usage message for 'pname' and terminate */
+usage(char *pname)
+{
+    fprintf(stderr, "Usage: %s [options] cmd [arg...]\n", pname);
+    fprintf(stderr, "Options can be:\n");
+    fprintf(stderr, " -i new IPC namespace\n");
+    fprintf(stderr, " -m new mount namespace\n");
+    fprintf(stderr, " -n new network namespace\n");
+    fprintf(stderr, " -p new PID namespace\n");
+    fprintf(stderr, " -u new UTS namespace\n");
+    fprintf(stderr, " -U new user namespace\n");
+    fprintf(stderr, " -v Display verbose messages\n");
+    exit(EXIT_FAILURE);
+}
+
+/* Start function for cloned child: 'arg' is the NULL-terminated
+   argument vector of the command to be executed */
+
+static int
+childFunc(void *arg)
+{
+    char **argv = arg;
+
+    execvp(argv[0], argv);
+    errExit("execvp");          /* Only reached if execvp() fails */
+}
+
+#define STACK_SIZE (1024 * 1024)
+
+static char child_stack[STACK_SIZE];    /* Space for child's stack */
+
+int
+main(int argc, char *argv[])
+{
+    int flags = 0, verbose = 0, opt;
+    pid_t child_pid;
+
+    /* Parse command-line options. The initial '+' character in the
+       final getopt() argument prevents GNU-style permutation of
+       command-line options, so that options belonging to the command
+       to be executed are not treated as options to this program. */
+
+    while ((opt = getopt(argc, argv, "+imnpuUv")) != -1) {
+        switch (opt) {
+        case 'i': flags |= CLONE_NEWIPC;        break;
+        case 'm': flags |= CLONE_NEWNS;         break;
+        case 'n': flags |= CLONE_NEWNET;        break;
+        case 'p': flags |= CLONE_NEWPID;        break;
+        case 'u': flags |= CLONE_NEWUTS;        break;
+        case 'U': flags |= CLONE_NEWUSER;       break;
+        case 'v': verbose = 1;                  break;
+        default:  usage(argv[0]);
+        }
+    }
+
+    if (optind >= argc)                 /* No command was supplied */
+        usage(argv[0]);
+
+    child_pid = clone(childFunc, child_stack + STACK_SIZE,
+                      flags | SIGCHLD, &argv[optind]);
+    if (child_pid == -1)
+        errExit("clone");
+
+    if (verbose)
+        printf("%s: PID of child created by clone() is %ld\n",
+                argv[0], (long) child_pid);
+
+    /* Parent falls through to here */
+
+    if (waitpid(child_pid, NULL, 0) == -1)      /* Wait for child */
+        errExit("waitpid");
+
+    if (verbose)
+        printf("%s: terminating\n", argv[0]);
+    exit(EXIT_SUCCESS);
+}
diff --git a/setns.c b/setns.c
new file mode 100644
index 0000000..385aa74
--- /dev/null
+++ b/setns.c
@@ -0,0 +1,40 @@
+/* setns.c
+
+   Join the namespace referred to by a /proc/PID/ns/FILE path and then
+   execute a command in that namespace.
+*/
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define errExit(msg)    do { perror(msg); exit(EXIT_FAILURE); \
+                        } while (0)
+
+int
+main(int argc, char *argv[])
+{
+    int fd;
+
+    if (argc < 3) {
+        fprintf(stderr, "%s /proc/PID/ns/FILE cmd args...\n", argv[0]);
+        exit(EXIT_FAILURE);
+    }
+
+    /* Get descriptor for namespace. O_CLOEXEC keeps the descriptor from
+       being inherited by the command executed below. */
+
+    fd = open(argv[1], O_RDONLY | O_CLOEXEC);
+    if (fd == -1)
+        errExit("open");
+
+    /* Join that namespace; an 'nstype' of 0 accepts any namespace type */
+
+    if (setns(fd, 0) == -1)
+        errExit("setns");
+
+    execvp(argv[2], &argv[2]);  /* Execute a command in namespace */
+    errExit("execvp");          /* Only reached if execvp() fails */
+}
diff --git a/simple_init.c b/simple_init.c
new file mode 100644
index 0000000..e2ab30c
--- /dev/null
+++ b/simple_init.c
@@ -0,0 +1,188 @@
+/* simple_init.c
+
+   Copyright 2013, Michael Kerrisk
+   Licensed under GNU General Public License v2 or later
+
+   A simple init(1)-style program to be used as the init program in
+   a PID namespace.
The program reaps the status of its children and
+   provides a simple shell facility for executing commands.
+*/
+#define _GNU_SOURCE
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <wordexp.h>
+#include <errno.h>
+
+#define errExit(msg)    do { perror(msg); exit(EXIT_FAILURE); \
+                        } while (0)
+
+static int verbose = 0;
+
+/* SIGCHLD handler: reap child processes as they change state. 'errno'
+   is saved and restored so that the waitpid() calls made here cannot
+   disturb the value seen by the code that the handler interrupted. */
+
+static void
+child_handler(int sig)
+{
+    int saved_errno = errno;
+    int status;
+    pid_t pid;
+
+    /* WUNTRACED and WCONTINUED allow waitpid() to catch stopped and
+       continued children (in addition to terminated children) */
+
+    while ((pid = waitpid(-1, &status,
+                          WNOHANG | WUNTRACED | WCONTINUED)) > 0) {
+        if (verbose)
+            printf("\tinit: SIGCHLD handler: PID %ld terminated\n",
+                    (long) pid);
+    }
+
+    if (pid == -1 && errno != ECHILD)   /* ECHILD: no more children */
+        perror("waitpid");              /* Unexpected error */
+
+    errno = saved_errno;
+}
+
+/* Perform word expansion on string in 'cmd', allocating and
+   returning a vector of words on success or NULL on failure */
+
+static char **
+expand_words(char *cmd)
+{
+    char **arg_vec;
+    size_t j;
+    wordexp_t pwordexp;
+
+    if (wordexp(cmd, &pwordexp, 0) != 0) {
+        fprintf(stderr, "Word expansion failed\n");
+        return NULL;
+    }
+
+    arg_vec = calloc(pwordexp.we_wordc + 1, sizeof *arg_vec);
+    if (arg_vec == NULL)
+        errExit("calloc");
+
+    /* Only the pointers are copied: the words stay owned by 'pwordexp',
+       which is not passed to wordfree(), so they outlive this call */
+
+    for (j = 0; j < pwordexp.we_wordc; j++)
+        arg_vec[j] = pwordexp.we_wordv[j];
+    arg_vec[pwordexp.we_wordc] = NULL;
+
+    return arg_vec;
+}
+
+static void             /* Print usage message for 'pname' and terminate */
+usage(char *pname)
+{
+    fprintf(stderr, "Usage: %s [-v]\n", pname);
+    fprintf(stderr, "\t-v\tProvide verbose logging\n");
+
+    exit(EXIT_FAILURE);
+}
+
+int
+main(int argc, char *argv[])
+{
+    struct sigaction sa;
+#define CMD_SIZE 10000
+    char cmd[CMD_SIZE];
+    size_t len;
+    pid_t pid;
+    int opt;
+
+    while ((opt = getopt(argc, argv, "v")) != -1) {
+        switch (opt) {
+        case 'v': verbose = 1;          break;
+        default:  usage(argv[0]);
+        }
+    }
+
+    sa.sa_flags = SA_RESTART | SA_NOCLDSTOP;
+    sigemptyset(&sa.sa_mask);
+    sa.sa_handler = child_handler;
+    if (sigaction(SIGCHLD, &sa, NULL) == -1)
+        errExit("sigaction");
+
+    if (verbose)
+        printf("\tinit: my PID is %ld\n", (long) getpid());
+
+    /* Performing terminal operations while not in the foreground process
+       group for the terminal generates a SIGTTOU that stops the process.
+       Our init "shell" must be able to perform such operations (just like
+       a normal shell), so we ignore that signal, letting them succeed. */
+
+    signal(SIGTTOU, SIG_IGN);
+
+    /* Become leader of a new process group and make that process
+       group the foreground process group for the terminal */
+
+    if (setpgid(0, 0) == -1)
+        errExit("setpgid");
+    if (tcsetpgrp(STDIN_FILENO, getpgrp()) == -1)
+        errExit("tcsetpgrp-child");
+
+    while (1) {
+
+        /* Read a shell command; exit on end of file */
+
+        printf("init$ ");
+        if (fgets(cmd, CMD_SIZE, stdin) == NULL) {
+            if (verbose)
+                printf("\tinit: exiting");
+            printf("\n");
+            exit(EXIT_SUCCESS);
+        }
+
+        len = strlen(cmd);
+        if (len > 0 && cmd[len - 1] == '\n')
+            cmd[--len] = '\0';          /* Strip trailing '\n' */
+
+        if (len == 0)
+            continue;                   /* Ignore empty commands */
+
+        pid = fork();                   /* Create child process */
+        if (pid == -1)
+            errExit("fork");
+
+        if (pid == 0) {                 /* Child */
+            char **arg_vec;
+
+            arg_vec = expand_words(cmd);
+            if (arg_vec == NULL || arg_vec[0] == NULL)
+                exit(EXIT_FAILURE);     /* Must not rejoin init's loop */
+
+            /* Make child the leader of a new process group and make that
+               group the foreground process group for the terminal */
+
+            if (setpgid(0, 0) == -1)
+                errExit("setpgid");
+            if (tcsetpgrp(STDIN_FILENO, getpgrp()) == -1)
+                errExit("tcsetpgrp-child");
+
+            /* Child executes shell command and terminates */
+
+            execvp(arg_vec[0], arg_vec);
+            errExit("execvp");          /* Only reached if execvp() fails */
+        }
+
+        /* Parent falls through to here */
+
+        if (verbose)
+            printf("\tinit: created child %ld\n", (long) pid);
+
+        pause();                /* Will be interrupted by signal handler */
+
+        /* After child changes state, ensure that the 'init' program
+           is the foreground process group for the terminal */
+
+        if (tcsetpgrp(STDIN_FILENO, getpgrp()) == -1)
+            errExit("tcsetpgrp-parent");
+    }
+}
diff --git a/userns_child_exec.c b/userns_child_exec.c
new file mode 100644
index 0000000..a3e1451
--- /dev/null
+++ b/userns_child_exec.c
@@ -0,0 +1,250 @@
+/* userns_child_exec.c
+
+   Copyright 2013, Michael Kerrisk
+   Licensed under GNU General Public License v2 or later
+
+   Create a child process that executes a shell command in new
+   namespace(s); allow UID and GID mappings to be specified when
+   creating a user namespace.
+*/
+#define _GNU_SOURCE
+#include <sched.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+
+/* A simple error-handling function: print an error message based
+   on the value in 'errno' and terminate the calling process */
+
+#define errExit(msg)    do { perror(msg); exit(EXIT_FAILURE); \
+                        } while (0)
+
+struct child_args {
+    char **argv;        /* Command to be executed by child, with arguments */
+    int    pipe_fd[2];  /* Pipe used to synchronize parent and child */
+};
+
+static int verbose;
+
+/* Print a usage message for program 'pname' on stderr and terminate */
+
+static void
+usage(char *pname)
+{
+    fprintf(stderr, "Usage: %s [options] cmd [arg...]\n\n", pname);
+    fprintf(stderr, "Create a child process that executes a shell command "
+            "in a new user namespace,\n"
+            "and possibly also other new namespace(s).\n\n");
+    fprintf(stderr, "Options can be:\n\n");
+#define fpe(str) fprintf(stderr, " %s", str)
+    fpe("-i New IPC namespace\n");
+    fpe("-m New mount namespace\n");
+    fpe("-n New network namespace\n");
+    fpe("-p New PID namespace\n");
+    fpe("-u New UTS namespace\n");
+    fpe("-U New user namespace\n");
+    fpe("-M uid_map Specify UID map for user namespace\n");
+    fpe("-G gid_map Specify GID map for user namespace\n");
+    fpe(" If -M or -G is specified, -U is required\n");
+    fpe("-v Display verbose messages\n");
+    fpe("\n");
+    fpe("Map strings for -M and -G consist of records of the form:\n");
+    fpe("\n");
+    fpe(" ID-inside-ns ID-outside-ns len\n");
+    fpe("\n");
+    fpe("A map string can contain multiple records, separated by commas;\n");
+    fpe("the commas are replaced by newlines before writing to map files.\n");
+#undef fpe
+
+    exit(EXIT_FAILURE);
+}
+
+/* Update the mapping file 'map_file', with the value provided in
+   'mapping', a string that defines a UID or GID mapping. A UID or
+   GID mapping consists of one or more newline-delimited records
+   of the form:
+
+       ID_inside-ns    ID-outside-ns   length
+
+   Requiring the user to supply a string that contains newlines is
+   of course inconvenient for command-line use. Thus, we permit the
+   use of commas to delimit records in this string, and replace them
+   with newlines before writing the string to the file. The string in
+   'mapping' is modified in place. If the file cannot be opened or the
+   mapping cannot be written in full, a message is printed on stderr
+   and the process terminates. */
+
+static void
+update_map(char *mapping, char *map_file)
+{
+    int fd;
+    size_t j, map_len;          /* 'map_len' is the length of 'mapping' */
+    ssize_t num_written;
+
+    /* Replace commas in mapping string with newlines */
+
+    map_len = strlen(mapping);
+    for (j = 0; j < map_len; j++)
+        if (mapping[j] == ',')
+            mapping[j] = '\n';
+
+    fd = open(map_file, O_RDWR);
+    if (fd == -1) {
+        fprintf(stderr, "open %s: %s\n", map_file, strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    num_written = write(fd, mapping, map_len);
+    if (num_written == -1) {
+        fprintf(stderr, "write %s: %s\n", map_file, strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+    if ((size_t) num_written != map_len) {      /* 'errno' is not set */
+        fprintf(stderr, "write %s: partial write\n", map_file);
+        exit(EXIT_FAILURE);
+    }
+
+    close(fd);
+}
+
+/* Start function for cloned child: wait until the parent closes the
+   write end of the pipe in 'arg' (a 'struct child_args'), and then
+   execute the command given in that structure */
+
+static int
+childFunc(void *arg)
+{
+    struct child_args *args = arg;
+    char ch;
+
+    /* Wait until the parent has updated the UID and GID mappings. See
+       the comment in main(). We wait for end of file on a pipe that will
+       be closed by the parent process once it has updated the mappings. */
+
+    close(args->pipe_fd[1]);    /* Close our descriptor for the write end
+                                   of the pipe so that we see EOF when
+                                   parent closes its descriptor */
+    if (read(args->pipe_fd[0], &ch, 1) != 0) {
+        fprintf(stderr, "Failure in child: read from pipe returned != 0\n");
+        exit(EXIT_FAILURE);
+    }
+
+    close(args->pipe_fd[0]);    /* No longer needed; keep it from being
+                                   inherited by the executed command */
+
+    /* Execute a shell command */
+
+    execvp(args->argv[0], args->argv);
+    errExit("execvp");
+}
+
+#define STACK_SIZE (1024 * 1024)
+
+static char child_stack[STACK_SIZE];    /* Space for child's stack */
+
+int
+main(int argc, char *argv[])
+{
+    int flags, opt;
+    pid_t child_pid;
+    struct child_args args;
+    char *uid_map, *gid_map;
+    char map_path[PATH_MAX];
+
+    /* Parse command-line options. The initial '+' character in
+       the final getopt() argument prevents GNU-style permutation
+       of command-line options. That's useful, since sometimes
+       the 'command' to be executed by this program itself
+       has command-line options. We don't want getopt() to treat
+       those as options to this program. */
+
+    flags = 0;
+    verbose = 0;
+    gid_map = NULL;
+    uid_map = NULL;
+    while ((opt = getopt(argc, argv, "+imnpuUM:G:v")) != -1) {
+        switch (opt) {
+        case 'i': flags |= CLONE_NEWIPC;        break;
+        case 'm': flags |= CLONE_NEWNS;         break;
+        case 'n': flags |= CLONE_NEWNET;        break;
+        case 'p': flags |= CLONE_NEWPID;        break;
+        case 'u': flags |= CLONE_NEWUTS;        break;
+        case 'v': verbose = 1;                  break;
+        case 'M': uid_map = optarg;             break;
+        case 'G': gid_map = optarg;             break;
+        case 'U': flags |= CLONE_NEWUSER;       break;
+        default:  usage(argv[0]);
+        }
+    }
+
+    /* -M or -G without -U is nonsensical */
+
+    if ((uid_map != NULL || gid_map != NULL) &&
+            !(flags & CLONE_NEWUSER))
+        usage(argv[0]);
+
+    /* Without a command there is nothing for the child to execute */
+
+    if (optind >= argc)
+        usage(argv[0]);
+
+    args.argv = &argv[optind];
+
+    /* We use a pipe to synchronize the parent and child, in order to
+       ensure that the parent sets the UID and GID maps before the child
+       calls execve(). This ensures that the child maintains its
+       capabilities during the execve() in the common case where we
+       want to map the child's effective user ID to 0 in the new user
+       namespace. Without this synchronization, the child would lose
+       its capabilities if it performed an execve() with nonzero
+       user IDs (see the capabilities(7) man page for details of the
+       transformation of a process's capabilities during execve()). */
+
+    if (pipe(args.pipe_fd) == -1)
+        errExit("pipe");
+
+    /* Create the child in new namespace(s) */
+
+    child_pid = clone(childFunc, child_stack + STACK_SIZE,
+                      flags | SIGCHLD, &args);
+    if (child_pid == -1)
+        errExit("clone");
+
+    /* Parent falls through to here */
+
+    if (verbose)
+        printf("%s: PID of child created by clone() is %ld\n",
+                argv[0], (long) child_pid);
+
+    /* Update the UID and GID maps in the child */
+
+    if (uid_map != NULL) {
+        snprintf(map_path, PATH_MAX, "/proc/%ld/uid_map",
+                (long) child_pid);
+        update_map(uid_map, map_path);
+    }
+    if (gid_map != NULL) {
+        snprintf(map_path, PATH_MAX, "/proc/%ld/gid_map",
+                (long) child_pid);
+        update_map(gid_map, map_path);
+    }
+
+    /* Close the write end of the pipe, to signal to the child that we
+       have updated the UID and GID maps */
+
+    close(args.pipe_fd[1]);
+
+    if (waitpid(child_pid, NULL, 0) == -1)      /* Wait for child */
+        errExit("waitpid");
+
+    if (verbose)
+        printf("%s: terminating\n", argv[0]);
+
+    exit(EXIT_SUCCESS);
+}