Merge pull request #583 from alexlarsson/conmon-reap-zombies

conmon: Don't leave zombies and fix cgroup race
This commit is contained in:
Mrunal Patel 2017-06-20 07:53:52 -07:00 committed by GitHub
commit 88037b143b
3 changed files with 121 additions and 82 deletions

View file

@ -677,8 +677,10 @@ static gboolean terminal_accept_cb(int fd, G_GNUC_UNUSED GIOCondition condition,
ninfo("about to accept from csfd: %d", fd); ninfo("about to accept from csfd: %d", fd);
connfd = accept4(fd, NULL, NULL, SOCK_CLOEXEC); connfd = accept4(fd, NULL, NULL, SOCK_CLOEXEC);
if (connfd < 0) if (connfd < 0) {
pexit("Failed to accept console-socket connection"); nwarn("Failed to accept console-socket connection");
return G_SOURCE_CONTINUE;
}
/* Not accepting anything else. */ /* Not accepting anything else. */
close(fd); close(fd);
@ -704,7 +706,6 @@ static gboolean terminal_accept_cb(int fd, G_GNUC_UNUSED GIOCondition condition,
* stdout. stderr is ignored. */ * stdout. stderr is ignored. */
masterfd_stdin = console.fd; masterfd_stdin = console.fd;
masterfd_stdout = console.fd; masterfd_stdout = console.fd;
masterfd_stderr = -1;
/* Clean up everything */ /* Clean up everything */
close(connfd); close(connfd);
@ -730,7 +731,7 @@ int main(int argc, char *argv[])
_cleanup_free_ char *contents; _cleanup_free_ char *contents;
int cpid = -1; int cpid = -1;
int status; int status;
pid_t pid, create_pid; pid_t pid, main_pid, create_pid;
_cleanup_close_ int epfd = -1; _cleanup_close_ int epfd = -1;
_cleanup_close_ int csfd = -1; _cleanup_close_ int csfd = -1;
/* Used for !terminal cases. */ /* Used for !terminal cases. */
@ -741,11 +742,15 @@ int main(int argc, char *argv[])
char buf[BUF_SIZE]; char buf[BUF_SIZE];
int num_read; int num_read;
int sync_pipe_fd = -1; int sync_pipe_fd = -1;
char *sync_pipe, *endptr; int start_pipe_fd = -1;
char *start_pipe, *sync_pipe, *endptr;
int len; int len;
GError *error = NULL; GError *error = NULL;
GOptionContext *context; GOptionContext *context;
GPtrArray *runtime_argv = NULL; GPtrArray *runtime_argv = NULL;
_cleanup_close_ int dev_null_r = -1;
_cleanup_close_ int dev_null_w = -1;
int fds[2];
_cleanup_free_ char *memory_cgroup_path = NULL; _cleanup_free_ char *memory_cgroup_path = NULL;
int wb; int wb;
@ -776,6 +781,14 @@ int main(int argc, char *argv[])
bundle_path = cwd; bundle_path = cwd;
} }
dev_null_r = open("/dev/null", O_RDONLY | O_CLOEXEC);
if (dev_null_r < 0)
pexit("Failed to open /dev/null");
dev_null_w = open("/dev/null", O_WRONLY | O_CLOEXEC);
if (dev_null_w < 0)
pexit("Failed to open /dev/null");
if (exec && exec_process_spec == NULL) { if (exec && exec_process_spec == NULL) {
nexit("Exec process spec path not provided. Use --exec-process-spec"); nexit("Exec process spec path not provided. Use --exec-process-spec");
} }
@ -791,6 +804,44 @@ int main(int argc, char *argv[])
if (log_path == NULL) if (log_path == NULL)
nexit("Log file path not provided. Use --log-path"); nexit("Log file path not provided. Use --log-path");
start_pipe = getenv("_OCI_STARTPIPE");
if (start_pipe) {
errno = 0;
start_pipe_fd = strtol(start_pipe, &endptr, 10);
if (errno != 0 || *endptr != '\0')
pexit("unable to parse _OCI_STARTPIPE");
/* Block for an initial write to the start pipe before
spawning any childred or exiting, to ensure the
parent can put us in the right cgroup. */
read(start_pipe_fd, buf, BUF_SIZE);
close(start_pipe_fd);
}
if (!exec) {
/* In the create-container case we double-fork in
order to disconnect from the parent, as we want to
continue in a daemon-like way */
main_pid = fork();
if (main_pid < 0) {
pexit("Failed to fork the create command");
} else if (main_pid != 0) {
exit(0);
}
/* Disconnect stdio from parent. We need to do this, because
the parent is waiting for the stdout to end when the intermediate
child dies */
if (dup2(dev_null_r, STDIN_FILENO) < 0)
pexit("Failed to dup over stdout");
if (dup2(dev_null_w, STDOUT_FILENO) < 0)
pexit("Failed to dup over stdout");
if (dup2(dev_null_w, STDERR_FILENO) < 0)
pexit("Failed to dup over stderr");
/* Create a new session group */
setsid();
}
/* Environment variables */ /* Environment variables */
sync_pipe = getenv("_OCI_SYNCPIPE"); sync_pipe = getenv("_OCI_SYNCPIPE");
if (sync_pipe) { if (sync_pipe) {
@ -848,7 +899,6 @@ int main(int argc, char *argv[])
if (listen(csfd, 128) < 0) if (listen(csfd, 128) < 0)
pexit("Failed to listen on console-socket"); pexit("Failed to listen on console-socket");
} else { } else {
int fds[2];
/* /*
* Create a "fake" master fd so that we can use the same epoll code in * Create a "fake" master fd so that we can use the same epoll code in
@ -873,13 +923,15 @@ int main(int argc, char *argv[])
masterfd_stdout = fds[0]; masterfd_stdout = fds[0];
slavefd_stdout = fds[1]; slavefd_stdout = fds[1];
}
/* We always create a stderr pipe, because that way we can capture
runc stderr messages before the tty is created */
if (pipe2(fds, O_CLOEXEC) < 0) if (pipe2(fds, O_CLOEXEC) < 0)
pexit("Failed to create !terminal stderr pipe"); pexit("Failed to create stderr pipe");
masterfd_stderr = fds[0]; masterfd_stderr = fds[0];
slavefd_stderr = fds[1]; slavefd_stderr = fds[1];
}
runtime_argv = g_ptr_array_new(); runtime_argv = g_ptr_array_new();
g_ptr_array_add(runtime_argv, runtime_path); g_ptr_array_add(runtime_argv, runtime_path);
@ -929,25 +981,21 @@ int main(int argc, char *argv[])
if (create_pid < 0) { if (create_pid < 0) {
pexit("Failed to fork the create command"); pexit("Failed to fork the create command");
} else if (!create_pid) { } else if (!create_pid) {
_cleanup_close_ int dev_null = -1;
/* FIXME: This results in us not outputting runc error messages to crio's log. */ /* FIXME: This results in us not outputting runc error messages to crio's log. */
if (slavefd_stdin < 0) { if (slavefd_stdin < 0)
dev_null = open("/dev/null", O_RDONLY); slavefd_stdin = dev_null_r;
if (dev_null < 0)
pexit("Failed to open /dev/null");
slavefd_stdin = dev_null;
}
if (dup2(slavefd_stdin, STDIN_FILENO) < 0) if (dup2(slavefd_stdin, STDIN_FILENO) < 0)
pexit("Failed to dup over stdout"); pexit("Failed to dup over stdout");
if (slavefd_stdout >= 0) { if (slavefd_stdout < 0)
slavefd_stdout = dev_null_w;
if (dup2(slavefd_stdout, STDOUT_FILENO) < 0) if (dup2(slavefd_stdout, STDOUT_FILENO) < 0)
pexit("Failed to dup over stdout"); pexit("Failed to dup over stdout");
}
if (slavefd_stderr >= 0) { if (slavefd_stderr < 0)
slavefd_stderr = slavefd_stdout;
if (dup2(slavefd_stderr, STDERR_FILENO) < 0) if (dup2(slavefd_stderr, STDERR_FILENO) < 0)
pexit("Failed to dup over stderr"); pexit("Failed to dup over stderr");
}
execv(g_ptr_array_index(runtime_argv,0), (char **)runtime_argv->pdata); execv(g_ptr_array_index(runtime_argv,0), (char **)runtime_argv->pdata);
exit(127); exit(127);
@ -978,16 +1026,6 @@ int main(int argc, char *argv[])
if (!WIFEXITED(runtime_status) || WEXITSTATUS(runtime_status) != 0) { if (!WIFEXITED(runtime_status) || WEXITSTATUS(runtime_status) != 0) {
if (sync_pipe_fd > 0 && !exec) { if (sync_pipe_fd > 0 && !exec) {
if (terminal) {
/*
* For this case, the stderr is captured in the parent when terminal is passed down.
* We send -1 as pid to signal to parent that create container has failed.
*/
len = snprintf(buf, BUF_SIZE, "{\"pid\": %d}\n", -1);
if (len < 0 || write_all(sync_pipe_fd, buf, len) != len) {
pexit("unable to send container pid to parent");
}
} else {
/* /*
* Read from container stderr for any error and send it to parent * Read from container stderr for any error and send it to parent
* We send -1 as pid to signal to parent that create container has failed. * We send -1 as pid to signal to parent that create container has failed.
@ -1006,12 +1044,11 @@ int main(int argc, char *argv[])
} }
} }
} }
}
nexit("Failed to create container: exit status %d", WEXITSTATUS(runtime_status)); nexit("Failed to create container: exit status %d", WEXITSTATUS(runtime_status));
} }
if (terminal && masterfd_stdout == -1) if (terminal && masterfd_stdout == -1)
pexit("Runtime did not set up terminal"); nexit("Runtime did not set up terminal");
/* Read the pid so we can wait for the process to exit */ /* Read the pid so we can wait for the process to exit */
g_file_get_contents(pid_file, &contents, NULL, &err); g_file_get_contents(pid_file, &contents, NULL, &err);

View file

@ -128,10 +128,12 @@ func getOCIVersion(name string, args ...string) (string, error) {
func (r *Runtime) CreateContainer(c *Container, cgroupParent string) error { func (r *Runtime) CreateContainer(c *Container, cgroupParent string) error {
var stderrBuf bytes.Buffer var stderrBuf bytes.Buffer
parentPipe, childPipe, err := newPipe() parentPipe, childPipe, err := newPipe()
childStartPipe, parentStartPipe, err := newPipe()
if err != nil { if err != nil {
return fmt.Errorf("error creating socket pair: %v", err) return fmt.Errorf("error creating socket pair: %v", err)
} }
defer parentPipe.Close() defer parentPipe.Close()
defer parentStartPipe.Close()
var args []string var args []string
if r.cgroupManager == "systemd" { if r.cgroupManager == "systemd" {
@ -163,9 +165,10 @@ func (r *Runtime) CreateContainer(c *Container, cgroupParent string) error {
if c.terminal { if c.terminal {
cmd.Stderr = &stderrBuf cmd.Stderr = &stderrBuf
} }
cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe) cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe, childStartPipe)
// 0, 1 and 2 are stdin, stdout and stderr // 0, 1 and 2 are stdin, stdout and stderr
cmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", 3)) cmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", 3))
cmd.Env = append(cmd.Env, fmt.Sprintf("_OCI_STARTPIPE=%d", 4))
err = cmd.Start() err = cmd.Start()
if err != nil { if err != nil {
@ -175,6 +178,7 @@ func (r *Runtime) CreateContainer(c *Container, cgroupParent string) error {
// We don't need childPipe on the parent side // We don't need childPipe on the parent side
childPipe.Close() childPipe.Close()
childStartPipe.Close()
// Move conmon to specified cgroup // Move conmon to specified cgroup
if cgroupParent != "" { if cgroupParent != "" {
@ -186,6 +190,19 @@ func (r *Runtime) CreateContainer(c *Container, cgroupParent string) error {
} }
} }
/* We set the cgroup, now the child can start creating children */
someData := []byte{0}
_, err = parentStartPipe.Write(someData)
if err != nil {
return err
}
/* Wait for initial setup and fork, and reap child */
err = cmd.Wait()
if err != nil {
return err
}
// Wait to get container pid from conmon // Wait to get container pid from conmon
type syncStruct struct { type syncStruct struct {
si *syncInfo si *syncInfo
@ -207,21 +224,10 @@ func (r *Runtime) CreateContainer(c *Container, cgroupParent string) error {
return fmt.Errorf("error reading container (probably exited) json message: %v", ss.err) return fmt.Errorf("error reading container (probably exited) json message: %v", ss.err)
} }
logrus.Debugf("Received container pid: %d", ss.si.Pid) logrus.Debugf("Received container pid: %d", ss.si.Pid)
errorMessage := ""
if c.terminal {
errorMessage = stderrBuf.String()
fmt.Fprintf(os.Stderr, errorMessage)
errorMessage = sanitizeConmonErrorMessage(errorMessage)
} else {
if ss.si.Message != "" {
errorMessage = ss.si.Message
}
}
if ss.si.Pid == -1 { if ss.si.Pid == -1 {
if errorMessage != "" { if ss.si.Message != "" {
logrus.Debugf("Container creation error: %s", errorMessage) logrus.Debugf("Container creation error: %s", ss.si.Message)
return fmt.Errorf("container create failed: %s", errorMessage) return fmt.Errorf("container create failed: %s", ss.si.Message)
} }
logrus.Debugf("Container creation failed") logrus.Debugf("Container creation failed")
return fmt.Errorf("container create failed") return fmt.Errorf("container create failed")
@ -232,18 +238,6 @@ func (r *Runtime) CreateContainer(c *Container, cgroupParent string) error {
return nil return nil
} }
// sanitizeConmonErrorMessage removes conmon debug messages from error string
func sanitizeConmonErrorMessage(errString string) string {
var sanitizedLines []string
lines := strings.Split(errString, "\n")
for _, line := range lines {
if !strings.HasPrefix(line, "[conmon") {
sanitizedLines = append(sanitizedLines, line)
}
}
return strings.Join(sanitizedLines, "\n")
}
func createUnitName(prefix string, name string) string { func createUnitName(prefix string, name string) string {
return fmt.Sprintf("%s-%s.scope", prefix, name) return fmt.Sprintf("%s-%s.scope", prefix, name)
} }

View file

@ -69,8 +69,16 @@ func RunUnderSystemdScope(pid int, slice string, unitName string) error {
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
properties = append(properties, newProp("Delegate", true)) properties = append(properties, newProp("Delegate", true))
properties = append(properties, newProp("DefaultDependencies", false)) properties = append(properties, newProp("DefaultDependencies", false))
_, err = conn.StartTransientUnit(unitName, "replace", properties, nil) ch := make(chan string)
_, err = conn.StartTransientUnit(unitName, "replace", properties, ch)
if err != nil {
return err return err
}
// Block until job is started
<-ch
return nil
} }
func newProp(name string, units interface{}) systemdDbus.Property { func newProp(name string, units interface{}) systemdDbus.Property {