diff --git a/.gitignore b/.gitignore index be7505ca..981861ff 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ /ocic /ocid /ocid.conf +*.o *.orig /pause/pause /pause/pause.o diff --git a/.tool/lint b/.tool/lint index a2d28fe3..e70b53f8 100755 --- a/.tool/lint +++ b/.tool/lint @@ -6,7 +6,8 @@ set -o pipefail for d in $(find . -type d -not -iwholename '*.git*' -a -not -iname '.tool' -a -not -iwholename '*vendor*'); do ${GOPATH}/bin/gometalinter \ - --exclude='error return value not checked.*(Close|Log|Print).*\(errcheck\)$' \ + --exclude='error return value not checked.*(Close|Log|Print|RemoveAll).*\(errcheck\)$' \ + --exclude='declaration of.*err.*shadows declaration.*\(vetshadow\)$' \ --exclude='.*_test\.go:.*error return value not checked.*\(errcheck\)$' \ --exclude='duplicate of.*_test.go.*\(dupl\)$' \ --exclude='cmd\/client\/.*\.go.*\(dupl\)$' \ @@ -15,7 +16,7 @@ for d in $(find . -type d -not -iwholename '*.git*' -a -not -iname '.tool' -a -n --disable=aligncheck \ --disable=gotype \ --disable=gas \ - --cyclo-over=60 \ + --cyclo-over=80 \ --dupl-threshold=100 \ --tests \ --deadline=60s "${d}" diff --git a/Dockerfile b/Dockerfile index 5fc2b294..a3ceaf4e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,7 +43,9 @@ RUN mkdir -p /usr/src/criu \ && rm -rf /usr/src/criu # Install runc -ENV RUNC_COMMIT cc29e3dded8e27ba8f65738f40d251c885030a28 +# TODO: This should actually be v1.0.0-rc3 but we first need to switch to +# v1.0.0-rc5 runtime config generation. +ENV RUNC_COMMIT 31980a53ae7887b2c8f8715d13c3eb486c27b6cf RUN set -x \ && export GOPATH="$(mktemp -d)" \ && git clone https://github.com/opencontainers/runc.git "$GOPATH/src/github.com/opencontainers/runc" \ diff --git a/conmon/cmsg.c b/conmon/cmsg.c new file mode 100644 index 00000000..c44db2ef --- /dev/null +++ b/conmon/cmsg.c @@ -0,0 +1,149 @@ +/* + * Copyright 2016 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* NOTE: This code comes directly from runc/libcontainer/utils/cmsg.c. */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cmsg.h" + +#define error(fmt, ...) \ + ({ \ + fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \ + errno = ECOMM; \ + goto err; /* return value */ \ + }) + +/* + * Sends a file descriptor along the sockfd provided. Returns the return + * value of sendmsg(2). Any synchronisation and preparation of state + * should be done external to this (we expect the other side to be in + * recvfd() in the code). + */ +ssize_t sendfd(int sockfd, struct file_t file) +{ + struct msghdr msg = {0}; + struct iovec iov[1] = {0}; + struct cmsghdr *cmsg; + int *fdptr; + + union { + char buf[CMSG_SPACE(sizeof(file.fd))]; + struct cmsghdr align; + } u; + + /* + * We need to send some other data along with the ancillary data, + * otherwise the other side won't recieve any data. This is very + * well-hidden in the documentation (and only applies to + * SOCK_STREAM). See the bottom part of unix(7). + */ + iov[0].iov_base = file.name; + iov[0].iov_len = strlen(file.name) + 1; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_control = u.buf; + msg.msg_controllen = sizeof(u.buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + + fdptr = (int *) CMSG_DATA(cmsg); + memcpy(fdptr, &file.fd, sizeof(int)); + + return sendmsg(sockfd, &msg, 0); +} + +/* + * Receives a file descriptor from the sockfd provided. Returns the file + * descriptor as sent from sendfd(). It will return the file descriptor + * or die (literally) trying. Any synchronisation and preparation of + * state should be done external to this (we expect the other side to be + * in sendfd() in the code). + */ +struct file_t recvfd(int sockfd) +{ + struct msghdr msg = {0}; + struct iovec iov[1] = {0}; + struct cmsghdr *cmsg; + struct file_t file = {0}; + int *fdptr; + int olderrno; + + union { + char buf[CMSG_SPACE(sizeof(file.fd))]; + struct cmsghdr align; + } u; + + /* Allocate a buffer. */ + /* TODO: Make this dynamic with MSG_PEEK. */ + file.name = malloc(TAG_BUFFER); + if (!file.name) + error("recvfd: failed to allocate file.tag buffer\n"); + + /* + * We need to "recieve" the non-ancillary data even though we don't + * plan to use it at all. Otherwise, things won't work as expected. + * See unix(7) and other well-hidden documentation. + */ + iov[0].iov_base = file.name; + iov[0].iov_len = TAG_BUFFER; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_control = u.buf; + msg.msg_controllen = sizeof(u.buf); + + ssize_t ret = recvmsg(sockfd, &msg, 0); + if (ret < 0) + goto err; + + cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) + error("recvfd: got NULL from CMSG_FIRSTHDR"); + if (cmsg->cmsg_level != SOL_SOCKET) + error("recvfd: expected SOL_SOCKET in cmsg: %d", cmsg->cmsg_level); + if (cmsg->cmsg_type != SCM_RIGHTS) + error("recvfd: expected SCM_RIGHTS in cmsg: %d", cmsg->cmsg_type); + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + error("recvfd: expected correct CMSG_LEN in cmsg: %lu", cmsg->cmsg_len); + + fdptr = (int *) CMSG_DATA(cmsg); + if (!fdptr || *fdptr < 0) + error("recvfd: recieved invalid pointer"); + + file.fd = *fdptr; + return file; + +err: + olderrno = errno; + free(file.name); + errno = olderrno; + return (struct file_t){0}; +} diff --git a/conmon/cmsg.h b/conmon/cmsg.h new file mode 100644 index 00000000..7c7aefe6 --- /dev/null +++ b/conmon/cmsg.h @@ -0,0 +1,38 @@ +/* + * Copyright 2016 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* NOTE: This code comes directly from runc/libcontainer/utils/cmsg.h. */ + +#pragma once + +#if !defined(CMSG_H) +#define CMSG_H + +#include + +/* TODO: Implement this properly with MSG_PEEK. */ +#define TAG_BUFFER 4096 + +/* This mirrors Go's (*os.File). */ +struct file_t { + char *name; + int fd; +}; + +struct file_t recvfd(int sockfd); +ssize_t sendfd(int sockfd, struct file_t file); + +#endif /* !defined(CMSG_H) */ diff --git a/conmon/conmon.c b/conmon/conmon.c index 279d7a15..6c285b4d 100644 --- a/conmon/conmon.c +++ b/conmon/conmon.c @@ -2,36 +2,46 @@ #include #include #include +#include #include #include #include #include #include #include +#include +#include +#include #include #include -#include #include #include +#include "cmsg.h" + #define pexit(fmt, ...) \ do { \ - fprintf(stderr, "conmon: " fmt " %m\n", ##__VA_ARGS__); \ + fprintf(stderr, "[conmon:e]: " fmt " %m\n", ##__VA_ARGS__); \ syslog(LOG_ERR, "conmon : " fmt ": %m\n", ##__VA_ARGS__); \ exit(EXIT_FAILURE); \ } while (0) #define nexit(fmt, ...) \ do { \ - fprintf(stderr, "conmon: " fmt "\n", ##__VA_ARGS__); \ + fprintf(stderr, "[conmon:e]: " fmt "\n", ##__VA_ARGS__); \ syslog(LOG_ERR, "conmon : " fmt " \n", ##__VA_ARGS__); \ exit(EXIT_FAILURE); \ } while (0) #define nwarn(fmt, ...) \ do { \ - fprintf(stderr, "conmon: " fmt "\n", ##__VA_ARGS__); \ + fprintf(stderr, "[conmon:w]: " fmt "\n", ##__VA_ARGS__); \ + } while (0) + +#define ninfo(fmt, ...) \ + do { \ + fprintf(stderr, "[conmon:i]: " fmt "\n", ##__VA_ARGS__); \ } while (0) #define _cleanup_(x) __attribute__((cleanup(x))) @@ -58,14 +68,6 @@ static inline void gstring_free_cleanup(GString **string) #define _cleanup_close_ _cleanup_(closep) #define _cleanup_gstring_ _cleanup_(gstring_free_cleanup) -struct termios tty_orig; - -static void tty_restore(void) -{ - if (tcsetattr(STDIN_FILENO, TCSANOW, &tty_orig) == -1) - pexit("tcsetattr"); -} - #define BUF_SIZE 256 #define CMD_SIZE 1024 #define MAX_EVENTS 10 @@ -77,6 +79,7 @@ static char *bundle_path = NULL; static char *pid_file = NULL; static bool systemd_cgroup = false; static bool exec = false; +static char *log_path = NULL; static GOptionEntry entries[] = { { "terminal", 't', 0, G_OPTION_ARG_NONE, &terminal, "Terminal", NULL }, @@ -86,9 +89,33 @@ static GOptionEntry entries[] = { "pidfile", 'p', 0, G_OPTION_ARG_STRING, &pid_file, "PID file", NULL }, { "systemd-cgroup", 's', 0, G_OPTION_ARG_NONE, &systemd_cgroup, "Enable systemd cgroup manager", NULL }, { "exec", 'e', 0, G_OPTION_ARG_NONE, &exec, "Exec a command in a running container", NULL }, + { "log-path", 'l', 0, G_OPTION_ARG_STRING, &log_path, "Log file path", NULL }, { NULL } }; +int set_k8s_timestamp(char *buf, ssize_t buflen, const char *stream_type) +{ + time_t now = time(NULL); + struct tm *tm; + char off_sign = '+'; + int off; + + if ((tm = localtime(&now)) == NULL) { + return -1; + } + off = (int) tm->tm_gmtoff; + if (tm->tm_gmtoff < 0) { + off_sign = '-'; + off = -off; + } + snprintf(buf, buflen, "%d-%02d-%02dT%02d:%02d:%02d%c%02d:%02d %s ", + tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, + tm->tm_hour, tm->tm_min, tm->tm_sec, + off_sign, off / 3600, off % 3600, stream_type); + + return 0; +} + int main(int argc, char *argv[]) { int ret, runtime_status; @@ -98,13 +125,15 @@ int main(int argc, char *argv[]) _cleanup_free_ char *contents; int cpid = -1; int status; - pid_t pid; + pid_t pid, create_pid; + _cleanup_close_ int logfd = -1; _cleanup_close_ int mfd = -1; _cleanup_close_ int epfd = -1; - char slname[BUF_SIZE]; + _cleanup_close_ int csfd = -1; + int runtime_mfd = -1; + char csname[PATH_MAX] = "/tmp/conmon-term.XXXXXXXX"; char buf[BUF_SIZE]; int num_read; - struct termios t; struct epoll_event ev; struct epoll_event evlist[MAX_EVENTS]; int sync_pipe_fd = -1; @@ -115,11 +144,11 @@ int main(int argc, char *argv[]) _cleanup_gstring_ GString *cmd = NULL; /* Command line parameters */ - context = g_option_context_new ("- conmon utility"); - g_option_context_add_main_entries (context, entries, "conmon"); - if (!g_option_context_parse (context, &argc, &argv, &error)) { - g_print ("option parsing failed: %s\n", error->message); - exit (1); + context = g_option_context_new("- conmon utility"); + g_option_context_add_main_entries(context, entries, "conmon"); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_print("option parsing failed: %s\n", error->message); + exit(1); } if (cid == NULL) @@ -132,7 +161,6 @@ int main(int argc, char *argv[]) if (getcwd(cwd, sizeof(cwd)) == NULL) { nexit("Failed to get working directory"); } - bundle_path = cwd; } @@ -141,10 +169,12 @@ int main(int argc, char *argv[]) "%s/pidfile-%s", cwd, cid) < 0) { nexit("Failed to generate the pidfile path"); } - pid_file = default_pid_file; } + if (log_path == NULL) + nexit("Log file path not provided. Use --log-path"); + /* Environment variables */ sync_pipe = getenv("_OCI_SYNCPIPE"); if (sync_pipe) { @@ -154,6 +184,11 @@ int main(int argc, char *argv[]) pexit("unable to parse _OCI_SYNCPIPE"); } + /* Open the log path file. */ + logfd = open(log_path, O_WRONLY | O_APPEND | O_CREAT); + if (logfd < 0) + pexit("Failed to open log file"); + /* * Set self as subreaper so we can wait for container process * and return its exit code. @@ -164,163 +199,269 @@ int main(int argc, char *argv[]) } if (terminal) { - /* Open the master pty */ - mfd = open("/dev/ptmx", O_RDWR | O_NOCTTY); - if (mfd < 0) - pexit("Failed to open console master pty"); + struct sockaddr_un addr = {0}; - /* Grant access to the slave pty */ - if (grantpt(mfd) == -1) - pexit("Failed to grant access to slave pty"); + /* + * Generate a temporary name. Is this unsafe? Probably, but we can + * replace it with a rename(2) setup if necessary. + */ - /* Unlock the slave pty */ - if (unlockpt(mfd) == -1) { /* Unlock slave pty */ - pexit("Failed to unlock the slave pty"); - } + int unusedfd = g_mkstemp(csname); + if (unusedfd < 0) + pexit("Failed to generate random path for console-socket"); + close(unusedfd); - /* Get the slave pty name */ - ret = ptsname_r(mfd, slname, BUF_SIZE); - if (ret != 0) { - pexit("Failed to get the slave pty name"); - } + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, csname, sizeof(addr.sun_path)-1); + ninfo("addr{sun_family=AF_UNIX, sun_path=%s}", addr.sun_path); + + /* Bind to the console socket path. */ + csfd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); + if (csfd < 0) + pexit("Failed to create console-socket"); + /* XXX: This should be handled with a rename(2). */ + if (unlink(csname) < 0) + pexit("Failed to unlink temporary ranom path"); + if (bind(csfd, (struct sockaddr *) &addr, sizeof(addr)) < 0) + pexit("Failed to bind to console-socket"); + if (listen(csfd, 128) < 0) + pexit("Failed to listen on console-socket"); + } else { + int fds[2]; + + /* + * Create a "fake" master fd so that we can use the same epoll code in + * both cases. The runtime_mfd will be closed after we dup over + * everything. + * + * We use pipes here because open(/dev/std{out,err}) will fail if we + * used anything else (and it wouldn't be a good idea to create a new + * pty pair in the host). + */ + if (pipe(fds) < 0) + pexit("Failed to create runtime_mfd pipes"); + + mfd = fds[0]; + runtime_mfd = fds[1]; } cmd = g_string_new(runtime_path); - if (!exec) { - /* Create the container */ - if (systemd_cgroup) { - g_string_append_printf(cmd, " --systemd-cgroup"); - } - g_string_append_printf(cmd, " create %s --bundle %s --pid-file %s", - cid, bundle_path, pid_file); - if (terminal) { - g_string_append_printf(cmd, " --console %s", slname); - } - } else { + + /* Generate the cmdline. */ + if (exec && systemd_cgroup) + g_string_append_printf(cmd, " --systemd-cgroup"); + + if (exec) + g_string_append_printf(cmd, " exec -d --pid-file %s", pid_file); + else + g_string_append_printf(cmd, " create --bundle %s --pid-file %s", bundle_path, pid_file); + + if (terminal) + g_string_append_printf(cmd, " --console-socket %s", csname); + + /* Container name comes last. */ + g_string_append_printf(cmd, " %s", cid); + + /* Set the exec arguments. */ + if (exec) { + /* + * FIXME: This code is broken if argv[1] contains spaces or other + * similar characters that shells don't like. It's a bit silly + * that we're doing things inside a shell at all -- this should + * all be done in arrays. + */ + int i; - - /* Exec the command */ - if (terminal) { - g_string_append_printf(cmd, " exec -d --pid-file %s --console %s %s", - pid_file, slname, cid); - } else { - g_string_append_printf(cmd, " exec -d --pid-file %s %s", - pid_file, cid); - } - - for (i = 1; i < argc; i++) { + for (i = 1; i < argc; i++) g_string_append_printf(cmd, " %s", argv[i]); - } } - runtime_status = system(cmd->str); - if (runtime_status != 0) { - nexit("Failed to create container"); + /* + * We have to fork here because the current runC API dups the stdio of the + * calling process over the container's fds. This is actually *very bad* + * but is currently being discussed for change in + * https://github.com/opencontainers/runtime-spec/pull/513. Hopefully this + * won't be the case for very long. + */ + + /* Create our container. */ + create_pid = fork(); + if (create_pid < 0) { + pexit("Failed to fork the create command"); + } else if (!create_pid) { + char *argv[] = {"sh", "-c", cmd->str, NULL}; + + /* We only need to touch the stdio if we have terminal=false. */ + /* FIXME: This results in us not outputting runc error messages to ocid's log. */ + if (runtime_mfd >= 0) { + if (dup2(runtime_mfd, STDIN_FILENO) < 0) + pexit("Failed to dup over stdin"); + if (dup2(runtime_mfd, STDOUT_FILENO) < 0) + pexit("Failed to dup over stdout"); + if (dup2(runtime_mfd, STDERR_FILENO) < 0) + pexit("Failed to dup over stderr"); + } + + /* Exec into the process. TODO: Don't use the shell. */ + execv("/bin/sh", argv); + exit(127); } + /* The runtime has that fd now. We don't need to touch it anymore. */ + close(runtime_mfd); + + /* Get the console fd. */ + /* + * FIXME: If runc fails to start a container, we won't bail because we're + * busy waiting for requests. The solution probably involves + * epoll(2) and a signalfd(2). This causes a lot of issues. + */ + if (terminal) { + struct file_t console; + int connfd = -1; + + ninfo("about to accept from csfd: %d", csfd); + connfd = accept4(csfd, NULL, NULL, SOCK_CLOEXEC); + if (connfd < 0) + pexit("Failed to accept console-socket connection"); + + /* Not accepting anything else. */ + close(csfd); + unlink(csname); + + /* We exit if this fails. */ + ninfo("about to recvfd from connfd: %d", connfd); + console = recvfd(connfd); + + ninfo("console = {.name = '%s'; .fd = %d}", console.name, console.fd); + free(console.name); + + mfd = console.fd; + + /* Clean up everything */ + close(connfd); + } + + ninfo("about to waitpid: %d", create_pid); + + /* Wait for our create child to exit with the return code. */ + if (waitpid(create_pid, &runtime_status, 0) < 0) { + int old_errno = errno; + kill(create_pid, SIGKILL); + errno = old_errno; + pexit("Failed to wait for `runtime %s`", exec ? "exec" : "create"); + } + if (!WIFEXITED(runtime_status) || WEXITSTATUS(runtime_status) != 0) + nexit("Failed to create container: exit status %d", WEXITSTATUS(runtime_status)); + /* Read the pid so we can wait for the process to exit */ g_file_get_contents(pid_file, &contents, NULL, &err); if (err) { - fprintf(stderr, "Failed to read pidfile: %s\n", err->message); + nwarn("Failed to read pidfile: %s", err->message); g_error_free(err); exit(1); } cpid = atoi(contents); - printf("container PID: %d\n", cpid); + ninfo("container PID: %d", cpid); /* Send the container pid back to parent */ - if (sync_pipe_fd > 0 && !exec) { + if (sync_pipe_fd > 0 && !exec) { len = snprintf(buf, BUF_SIZE, "{\"pid\": %d}\n", cpid); if (len < 0 || write(sync_pipe_fd, buf, len) != len) { pexit("unable to send container pid to parent"); } } - if (terminal) { - /* Save exiting termios settings */ - if (tcgetattr(STDIN_FILENO, &tty_orig) == -1) - pexit("tcegetattr"); - - /* Settings for raw mode */ - t.c_lflag &= - ~(ISIG | ICANON | ECHO | ECHOE | ECHOK | ECHONL | IEXTEN); - t.c_iflag &= - ~(BRKINT | ICRNL | IGNBRK | IGNCR | INLCR | INPCK | ISTRIP | - IXON | IXOFF | IGNPAR | PARMRK); - t.c_oflag &= ~OPOST; - t.c_cc[VMIN] = 1; - t.c_cc[VTIME] = 0; - - /* Set terminal to raw mode */ - if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &t) == -1) - pexit("tcsetattr"); - - /* Setup terminal restore on exit */ - if (atexit(tty_restore) != 0) - pexit("atexit"); - - epfd = epoll_create(5); - if (epfd < 0) - pexit("epoll_create"); - ev.events = EPOLLIN; - ev.data.fd = STDIN_FILENO; - if (epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev) < 0) { - pexit("Failed to add stdin to epoll"); - } - ev.data.fd = mfd; - if (epoll_ctl(epfd, EPOLL_CTL_ADD, mfd, &ev) < 0) { - pexit("Failed to add console master fd to epoll"); - } - - /* Copy data back and forth between STDIN and master fd */ - while (true) { - int ready = epoll_wait(epfd, evlist, MAX_EVENTS, -1); - int i = 0; - for (i = 0; i < ready; i++) { - if (evlist[i].events & EPOLLIN) { - if (evlist[i].data.fd == STDIN_FILENO) { - num_read = - read(STDIN_FILENO, buf, - BUF_SIZE); - if (num_read <= 0) - goto out; - - if (write(mfd, buf, num_read) != - num_read) { - nwarn - ("partial/failed write (masterFd)"); - goto out; - } - } else if (evlist[i].data.fd == mfd) { - num_read = - read(mfd, buf, BUF_SIZE); - if (num_read <= 0) - goto out; - - if (write - (STDOUT_FILENO, buf, - num_read) != num_read) { - nwarn - ("partial/failed write (STDOUT_FILENO)"); - goto out; - } - } - } else if (evlist[i].events & - (EPOLLHUP | EPOLLERR)) { - printf("closing fd %d\n", - evlist[i].data.fd); - if (close(evlist[i].data.fd) < 0) - pexit("close"); - goto out; - } - } - } - out: - tty_restore(); + /* Create epoll_ctl so that we can handle read/write events. */ + /* + * TODO: Switch to libuv so that we can also implement exec as well as + * attach and other important things. Using epoll directly is just + * really nasty. + */ + epfd = epoll_create(5); + if (epfd < 0) + pexit("epoll_create"); + ev.events = EPOLLIN; + /* + ev.data.fd = STDIN_FILENO; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev) < 0) { + pexit("Failed to add stdin to epoll"); + } + */ + ev.data.fd = mfd; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, mfd, &ev) < 0) { + pexit("Failed to add console master fd to epoll"); } + #define TSBUFLEN 34 + char tsbuf[TSBUFLEN]; + + /* + * Log all of the container's output and pipe STDIN into it. Currently + * nothing using the STDIN setup (which makes its inclusion here a bit + * questionable but we need to rewrite this code soon anyway TODO). + */ + while (true) { + int ready = epoll_wait(epfd, evlist, MAX_EVENTS, -1); + int i = 0; + for (i = 0; i < ready; i++) { + if (evlist[i].events & EPOLLIN) { + if (evlist[i].data.fd == STDIN_FILENO) { + /* + * TODO: We need to replace STDIN_FILENO with something + * more sophisticated so that attach actually works + * properly. + */ + num_read = read(STDIN_FILENO, buf, BUF_SIZE); + if (num_read <= 0) + goto out; + + if (write(mfd, buf, num_read) != num_read) { + nwarn("partial/failed write (masterFd)"); + goto out; + } + } else if (evlist[i].data.fd == mfd) { + num_read = read(mfd, buf, BUF_SIZE); + if (num_read <= 0) + goto out; + + ninfo("read a chunk: (fd=%d) '%s'", mfd, buf); + + + /* Prepend the CRI-mandated timestamp and other metadata. */ + /* + * FIXME: Currently this code makes the assumption that + * @buf doesn't contain any newlines (since the CRI + * requires each line to contain a timestamp). This + * is an /okay/ assumption in most cases because + * ptys generally have newline-buffered output. + */ + int rc = set_k8s_timestamp(tsbuf, TSBUFLEN, "stdout"); + if (rc < 0) { + nwarn("failed to set timestamp"); + } else { + if (write(logfd, tsbuf, TSBUFLEN) != TSBUFLEN) { + nwarn("partial/failed write ts (logFd)"); + } + } + /* Log all output to logfd. */ + if (write(logfd, buf, num_read) != num_read) { + nwarn("partial/failed write (logFd)"); + goto out; + } + } + } else if (evlist[i].events & (EPOLLHUP | EPOLLERR)) { + printf("closing fd %d\n", evlist[i].data.fd); + if (close(evlist[i].data.fd) < 0) + pexit("close"); + goto out; + } + } + } + +out: /* Wait for the container process and record its exit code */ while ((pid = waitpid(-1, &status, 0)) > 0) { int exit_status = WEXITSTATUS(status); diff --git a/oci/oci.go b/oci/oci.go index ff19d59e..a00a819e 100644 --- a/oci/oci.go +++ b/oci/oci.go @@ -120,9 +120,13 @@ func (r *Runtime) CreateContainer(c *Container, cgroupParent string) error { args = append(args, "-r", r.Path(c)) args = append(args, "-b", c.bundlePath) args = append(args, "-p", filepath.Join(c.bundlePath, "pidfile")) + args = append(args, "-l", c.logPath) if c.terminal { args = append(args, "-t") } + logrus.WithFields(logrus.Fields{ + "args": args, + }).Debugf("running conmon: %s", r.conmonPath) cmd := exec.Command(r.conmonPath, args...) cmd.Dir = c.bundlePath @@ -248,6 +252,19 @@ func (r *Runtime) ExecSync(c *Container, command []string, timeout int64) (resp } }() + logFile, err := ioutil.TempFile("", "ocid-log-"+c.name) + if err != nil { + return nil, ExecSyncError{ + ExitCode: -1, + Err: err, + } + } + logPath := logFile.Name() + defer func() { + logFile.Close() + os.RemoveAll(logPath) + }() + var args []string args = append(args, "-c", c.name) args = append(args, "-r", r.Path(c)) @@ -256,6 +273,7 @@ func (r *Runtime) ExecSync(c *Container, command []string, timeout int64) (resp if c.terminal { args = append(args, "-t") } + args = append(args, "-l", logPath) args = append(args, command...) @@ -371,9 +389,25 @@ func (r *Runtime) ExecSync(c *Container, command []string, timeout int64) (resp } } + // The actual logged output is not the same as stdoutBuf and stderrBuf, + // which are used for getting error information. For the actual + // ExecSyncResponse we have to read the logfile. + // XXX: Currently runC dups the same console over both stdout and stderr, + // so we can't differentiate between the two. + + outputBytes, err := ioutil.ReadFile(logPath) + if err != nil { + return nil, ExecSyncError{ + Stdout: stdoutBuf, + Stderr: stderrBuf, + ExitCode: -1, + Err: err, + } + } + return &ExecSyncResponse{ - Stdout: stdoutBuf.Bytes(), - Stderr: stderrBuf.Bytes(), + Stdout: outputBytes, + Stderr: nil, ExitCode: 0, }, nil } diff --git a/server/config.go b/server/config.go index 014edb06..59233772 100644 --- a/server/config.go +++ b/server/config.go @@ -56,8 +56,6 @@ type RootConfig struct { // LogDir is the default log directory were all logs will go unless kubelet // tells us to put them somewhere else. - // - // TODO: This is currently unused until the conmon logging rewrite is done. LogDir string `toml:"log_dir"` } diff --git a/server/container_create.go b/server/container_create.go index 78d4fc92..94f4b390 100644 --- a/server/container_create.go +++ b/server/container_create.go @@ -203,6 +203,29 @@ func setupContainerUser(specgen *generate.Generator, rootfs string, sc *pb.Linux return nil } +// ensureSaneLogPath is a hack to fix https://issues.k8s.io/44043 which causes +// logPath to be a broken symlink to some magical Docker path. Ideally we +// wouldn't have to deal with this, but until that issue is fixed we have to +// remove the path if it's a broken symlink. +func ensureSaneLogPath(logPath string) error { + // If the path exists but the resolved path does not, then we have a broken + // symlink and we need to remove it. + fi, err := os.Lstat(logPath) + if err != nil || fi.Mode()&os.ModeSymlink == 0 { + // Non-existant files and non-symlinks aren't our problem. + return nil + } + + _, err = os.Stat(logPath) + if os.IsNotExist(err) { + err = os.RemoveAll(logPath) + if err != nil { + return fmt.Errorf("ensureSaneLogPath remove bad logPath: %s", err) + } + } + return nil +} + // CreateContainer creates a new container in specified PodSandbox func (s *Server) CreateContainer(ctx context.Context, req *pb.CreateContainerRequest) (res *pb.CreateContainerResponse, err error) { logrus.Debugf("CreateContainerRequest %+v", req) @@ -332,6 +355,27 @@ func (s *Server) createSandboxContainer(ctx context.Context, containerID string, } logPath := containerConfig.LogPath + if logPath == "" { + // TODO: Should we use sandboxConfig.GetLogDirectory() here? + logPath = filepath.Join(sb.logDir, containerID+".log") + } + if !filepath.IsAbs(logPath) { + // XXX: It's not really clear what this should be versus the sbox logDirectory. + logrus.Warnf("requested logPath for ctr id %s is a relative path: %s", containerID, logPath) + logPath = filepath.Join(sb.logDir, logPath) + } + + // Handle https://issues.k8s.io/44043 + if err := ensureSaneLogPath(logPath); err != nil { + return nil, err + } + + logrus.WithFields(logrus.Fields{ + "sbox.logdir": sb.logDir, + "ctr.logfile": containerConfig.LogPath, + "log_path": logPath, + }).Debugf("setting container's log_path") + specgen.SetProcessTerminal(containerConfig.Tty) linux := containerConfig.GetLinux() diff --git a/server/sandbox.go b/server/sandbox.go index 747c4dad..1832265c 100644 --- a/server/sandbox.go +++ b/server/sandbox.go @@ -121,7 +121,6 @@ func hostNetNsPath() (string, error) { } defer netNS.Close() - return netNS.Path(), nil } diff --git a/server/sandbox_run.go b/server/sandbox_run.go index 410365f5..362419f0 100644 --- a/server/sandbox_run.go +++ b/server/sandbox_run.go @@ -149,12 +149,6 @@ func (s *Server) RunPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest g.SetHostname(hostname) } - // set log directory - logDir := req.GetConfig().LogDirectory - if logDir == "" { - logDir = filepath.Join(s.config.LogDir, id) - } - // set DNS options if req.GetConfig().GetDnsConfig() != nil { dnsServers := req.GetConfig().GetDnsConfig().Servers @@ -194,6 +188,19 @@ func (s *Server) RunPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest return nil, err } + // set log directory + logDir := req.GetConfig().LogDirectory + if logDir == "" { + logDir = filepath.Join(s.config.LogDir, id) + } + if err = os.MkdirAll(logDir, 0700); err != nil { + return nil, err + } + // This should always be absolute from k8s. + if !filepath.IsAbs(logDir) { + return nil, fmt.Errorf("requested logDir for sbox id %s is a relative path: %s", id, logDir) + } + // Don't use SELinux separation with Host Pid or IPC Namespace, if !req.GetConfig().GetLinux().GetSecurityContext().GetNamespaceOptions().HostPid && !req.GetConfig().GetLinux().GetSecurityContext().GetNamespaceOptions().HostIpc { processLabel, mountLabel, err = getSELinuxLabels(nil) @@ -245,12 +252,19 @@ func (s *Server) RunPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest } }() - privileged := s.privilegedSandbox(req) + // set log path inside log directory + logPath := filepath.Join(logDir, id+".log") + // Handle https://issues.k8s.io/44043 + if err := ensureSaneLogPath(logPath); err != nil { + return nil, err + } + + privileged := s.privilegedSandbox(req) g.AddAnnotation("ocid/metadata", string(metadataJSON)) g.AddAnnotation("ocid/labels", string(labelsJSON)) g.AddAnnotation("ocid/annotations", string(annotationsJSON)) - g.AddAnnotation("ocid/log_path", logDir) + g.AddAnnotation("ocid/log_path", logPath) g.AddAnnotation("ocid/name", name) g.AddAnnotation("ocid/container_type", containerTypeSandbox) g.AddAnnotation("ocid/sandbox_id", id) @@ -379,7 +393,7 @@ func (s *Server) RunPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest return nil, fmt.Errorf("failed to write runtime configuration for pod sandbox %s(%s): %v", sb.name, id, err) } - container, err := oci.NewContainer(id, containerName, podContainer.RunDir, logDir, sb.netNs(), labels, annotations, nil, nil, id, false, sb.privileged) + container, err := oci.NewContainer(id, containerName, podContainer.RunDir, logPath, sb.netNs(), labels, annotations, nil, nil, id, false, sb.privileged) if err != nil { return nil, err } diff --git a/server/server.go b/server/server.go index 6e67e476..dbf9a362 100644 --- a/server/server.go +++ b/server/server.go @@ -5,6 +5,7 @@ import ( "fmt" "io/ioutil" "os" + "path/filepath" "sync" "github.com/Sirupsen/logrus" @@ -175,7 +176,7 @@ func (s *Server) loadSandbox(id string) error { sb := &sandbox{ id: id, name: name, - logDir: m.Annotations["ocid/log_path"], + logDir: filepath.Dir(m.Annotations["ocid/log_path"]), labels: labels, containers: oci.NewMemoryStore(), processLabel: processLabel, @@ -225,7 +226,7 @@ func (s *Server) loadSandbox(id string) error { } }() - scontainer, err := oci.NewContainer(m.Annotations["ocid/container_id"], cname, sandboxPath, sandboxPath, sb.netNs(), labels, annotations, nil, nil, id, false, privileged) + scontainer, err := oci.NewContainer(m.Annotations["ocid/container_id"], cname, sandboxPath, m.Annotations["ocid/log_path"], sb.netNs(), labels, annotations, nil, nil, id, false, privileged) if err != nil { return err } diff --git a/test/ctr.bats b/test/ctr.bats index ba92f0f7..0798577c 100644 --- a/test/ctr.bats +++ b/test/ctr.bats @@ -99,6 +99,103 @@ function teardown() { stop_ocid } +@test "ctr logging" { + start_ocid + run ocic pod run --config "$TESTDATA"/sandbox_config.json + echo "$output" + [ "$status" -eq 0 ] + pod_id="$output" + run ocic pod list + echo "$output" + [ "$status" -eq 0 ] + + # Create a new container. + newconfig=$(mktemp --tmpdir ocid-config.XXXXXX.json) + cp "$TESTDATA"/container_config_logging.json "$newconfig" + sed -i 's|"%echooutput%"|"here", "is", "some", "output"|' "$newconfig" + run ocic ctr create --config "$newconfig" --pod "$pod_id" + echo "$output" + [ "$status" -eq 0 ] + ctr_id="$output" + run ocic ctr start --id "$ctr_id" + echo "$output" + [ "$status" -eq 0 ] + run ocic ctr stop --id "$ctr_id" + echo "$output" + # Ignore errors on stop. + run ocic ctr status --id "$ctr_id" + [ "$status" -eq 0 ] + run ocic ctr remove --id "$ctr_id" + echo "$output" + [ "$status" -eq 0 ] + + # Check that the output is what we expect. + run cat "$DEFAULT_LOG_PATH/$pod_id/$ctr_id.log" + echo "$DEFAULT_LOG_PATH/$pod_id/$ctr_id.log ::" "$output" + [ "$status" -eq 0 ] + [[ "$output" == *"here is some output" ]] + + run ocic pod stop --id "$pod_id" + echo "$output" + [ "$status" -eq 0 ] + run ocic pod remove --id "$pod_id" + echo "$output" + [ "$status" -eq 0 ] + + cleanup_ctrs + cleanup_pods + stop_ocid +} + +@test "ctr logging [tty=true]" { + start_ocid + run ocic pod run --config "$TESTDATA"/sandbox_config.json + echo "$output" + [ "$status" -eq 0 ] + pod_id="$output" + run ocic pod list + echo "$output" + [ "$status" -eq 0 ] + + # Create a new container. + newconfig=$(mktemp --tmpdir ocid-config.XXXXXX.json) + cp "$TESTDATA"/container_config_logging.json "$newconfig" + sed -i 's|"%echooutput%"|"here", "is", "some", "output"|' "$newconfig" + sed -i 's|"tty": false,|"tty": true,|' "$newconfig" + run ocic ctr create --config "$newconfig" --pod "$pod_id" + echo "$output" + [ "$status" -eq 0 ] + ctr_id="$output" + run ocic ctr start --id "$ctr_id" + echo "$output" + [ "$status" -eq 0 ] + run ocic ctr stop --id "$ctr_id" + echo "$output" + # Ignore errors on stop. + run ocic ctr status --id "$ctr_id" + [ "$status" -eq 0 ] + run ocic ctr remove --id "$ctr_id" + echo "$output" + [ "$status" -eq 0 ] + + # Check that the output is what we expect. + run cat "$DEFAULT_LOG_PATH/$pod_id/$ctr_id.log" + echo "$DEFAULT_LOG_PATH/$pod_id/$ctr_id.log ::" "$output" + [ "$status" -eq 0 ] + [[ "$output" == *"here is some output" ]] + + run ocic pod stop --id "$pod_id" + echo "$output" + [ "$status" -eq 0 ] + run ocic pod remove --id "$pod_id" + echo "$output" + [ "$status" -eq 0 ] + + cleanup_ctrs + cleanup_pods + stop_ocid +} + # regression test for #127 @test "ctrs status for a pod" { start_ocid diff --git a/test/helpers.bash b/test/helpers.bash index 3bff7e19..be4c56c1 100644 --- a/test/helpers.bash +++ b/test/helpers.bash @@ -45,6 +45,8 @@ COPYIMG_BINARY=${COPYIMG_BINARY:-${OCID_ROOT}/cri-o/test/copyimg/copyimg} ARTIFACTS_PATH=${ARTIFACTS_PATH:-${OCID_ROOT}/cri-o/.artifacts} # Path of the checkseccomp binary. CHECKSECCOMP_BINARY=${CHECKSECCOMP_BINARY:-${OCID_ROOT}/cri-o/test/checkseccomp/checkseccomp} +# XXX: This is hardcoded inside cri-o at the moment. +DEFAULT_LOG_PATH=/var/log/ocid/pods TESTDIR=$(mktemp -d) if [ -e /usr/sbin/selinuxenabled ] && /usr/sbin/selinuxenabled; then @@ -75,6 +77,16 @@ if ! [ -d "$ARTIFACTS_PATH"/redis-image ]; then fi fi +# Make sure we have a copy of the busybox:latest image. +if ! [ -d "$ARTIFACTS_PATH"/busybox-image ]; then + mkdir -p "$ARTIFACTS_PATH"/busybox-image + if ! "$COPYIMG_BINARY" --import-from=docker://busybox --export-to=dir:"$ARTIFACTS_PATH"/busybox-image --signature-policy="$INTEGRATION_ROOT"/policy.json ; then + echo "Error pulling docker://busybox" + rm -fr "$ARTIFACTS_PATH"/busybox-image + exit 1 + fi +fi + # Run ocid using the binary specified by $OCID_BINARY. # This must ONLY be run on engines created with `start_ocid`. function ocid() { @@ -145,6 +157,11 @@ function start_ocid() { ocic image pull redis:latest fi REDIS_IMAGEID=$(ocic image status --id=redis | head -1 | sed -e "s/ID: //g") + run ocic image status --id=busybox + if [ "$status" -ne 0 ] ; then + ocic image pull busybox:latest + fi + BUSYBOX_IMAGEID=$(ocic image status --id=busybox | head -1 | sed -e "s/ID: //g") } function cleanup_ctrs() { diff --git a/test/runtimeversion.bats b/test/runtimeversion.bats index 20848fb4..c0398cc7 100644 --- a/test/runtimeversion.bats +++ b/test/runtimeversion.bats @@ -8,7 +8,7 @@ function teardown() { @test "ocic runtimeversion" { start_ocid - ocic runtimeversion + run ocic runtimeversion echo "$output" [ "$status" -eq 0 ] stop_ocid diff --git a/test/testdata/container_config.json b/test/testdata/container_config.json index 3bc9d8d7..3ab8fb8d 100644 --- a/test/testdata/container_config.json +++ b/test/testdata/container_config.json @@ -7,11 +7,9 @@ "image": "docker://redis:latest" }, "command": [ - "/bin/bash" - ], - "args": [ "/bin/ls" ], + "args": [], "working_dir": "/", "envs": [ { @@ -41,7 +39,7 @@ }, "privileged": true, "readonly_rootfs": true, - "log_path": "container.log", + "log_path": "", "stdin": false, "stdin_once": false, "tty": false, diff --git a/test/testdata/container_config_by_imageid.json b/test/testdata/container_config_by_imageid.json index 6cfb3556..5c87e7a5 100644 --- a/test/testdata/container_config_by_imageid.json +++ b/test/testdata/container_config_by_imageid.json @@ -41,7 +41,7 @@ }, "privileged": true, "readonly_rootfs": true, - "log_path": "container.log", + "log_path": "", "stdin": false, "stdin_once": false, "tty": false, diff --git a/test/testdata/container_config_logging.json b/test/testdata/container_config_logging.json new file mode 100644 index 00000000..05035e82 --- /dev/null +++ b/test/testdata/container_config_logging.json @@ -0,0 +1,82 @@ +{ + "metadata": { + "name": "container1", + "attempt": 1 + }, + "image": { + "image": "docker://busybox:latest" + }, + "command": [ + "/bin/echo" + ], + "args": [ + "%echooutput%" + ], + "working_dir": "/", + "envs": [ + { + "key": "PATH", + "value": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + }, + { + "key": "TERM", + "value": "xterm" + }, + { + "key": "TESTDIR", + "value": "test/dir1" + }, + { + "key": "TESTFILE", + "value": "test/file1" + } + ], + "labels": { + "type": "small", + "batch": "no" + }, + "annotations": { + "owner": "dragon", + "daemon": "ocid" + }, + "privileged": true, + "readonly_rootfs": true, + "log_path": "", + "stdin": false, + "stdin_once": false, + "tty": false, + "linux": { + "resources": { + "cpu_period": 10000, + "cpu_quota": 20000, + "cpu_shares": 512, + "memory_limit_in_bytes": 88000000, + "oom_score_adj": 30 + }, + "capabilities": { + "add_capabilities": [ + "setuid", + "setgid" + ], + "drop_capabilities": [ + "audit_write", + "audit_read" + ] + }, + "selinux_options": { + "user": "system_u", + "role": "system_r", + "type": "container_t", + "level": "s0:c4,c5" + }, + "user": { + "uid": 5, + "gid": 300, + "additional_gids": [ + 400, + 401, + 402 + ] + } + } +} diff --git a/test/testdata/container_config_seccomp.json b/test/testdata/container_config_seccomp.json index 6213c515..027c25e1 100644 --- a/test/testdata/container_config_seccomp.json +++ b/test/testdata/container_config_seccomp.json @@ -41,7 +41,7 @@ }, "privileged": true, "readonly_rootfs": true, - "log_path": "container.log", + "log_path": "", "stdin": false, "stdin_once": false, "tty": false, diff --git a/test/testdata/container_exit_test.json b/test/testdata/container_exit_test.json index 8780faed..bca99fb7 100644 --- a/test/testdata/container_exit_test.json +++ b/test/testdata/container_exit_test.json @@ -15,7 +15,7 @@ } ], "readonly_rootfs": true, - "log_path": "container.log", + "log_path": "", "stdin": false, "stdin_once": false, "tty": false, diff --git a/test/testdata/container_redis.json b/test/testdata/container_redis.json index 17e8de93..839ca746 100644 --- a/test/testdata/container_redis.json +++ b/test/testdata/container_redis.json @@ -39,7 +39,7 @@ "pod": "podsandbox1" }, "readonly_rootfs": false, - "log_path": "container.log", + "log_path": "", "stdin": false, "stdin_once": false, "tty": false, diff --git a/test/testdata/sandbox_config.json b/test/testdata/sandbox_config.json index 8f662649..39241317 100644 --- a/test/testdata/sandbox_config.json +++ b/test/testdata/sandbox_config.json @@ -6,7 +6,7 @@ "attempt": 1 }, "hostname": "ocic_host", - "log_directory": ".", + "log_directory": "", "dns_options": { "servers": [ "server1.redhat.com", diff --git a/test/testdata/sandbox_config_hostnet.json b/test/testdata/sandbox_config_hostnet.json index fad50e42..44a72166 100644 --- a/test/testdata/sandbox_config_hostnet.json +++ b/test/testdata/sandbox_config_hostnet.json @@ -6,7 +6,7 @@ "attempt": 1 }, "hostname": "ocic_host", - "log_directory": ".", + "log_directory": "", "dns_options": { "servers": [ "server1.redhat.com", diff --git a/test/testdata/sandbox_config_seccomp.json b/test/testdata/sandbox_config_seccomp.json index 9a55f51a..de666222 100644 --- a/test/testdata/sandbox_config_seccomp.json +++ b/test/testdata/sandbox_config_seccomp.json @@ -6,7 +6,7 @@ "attempt": 1 }, "hostname": "ocic_host", - "log_directory": ".", + "log_directory": "", "dns_options": { "servers": [ "server1.redhat.com",