From 55310f9a953ba73daa0b5d5de6cda8ce8851acfa Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Sun, 23 Jul 2017 18:54:03 +0200 Subject: [PATCH 1/2] conmon: do not fail if waitpid is interrupted Signed-off-by: Giuseppe Scrivano --- conmon/conmon.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/conmon/conmon.c b/conmon/conmon.c index 411ed39c..2273d6c2 100644 --- a/conmon/conmon.c +++ b/conmon/conmon.c @@ -1228,8 +1228,12 @@ int main(int argc, char *argv[]) g_main_loop_run (main_loop); g_source_remove (terminal_watch); } else { + int ret; /* Wait for our create child to exit with the return code. */ - if (waitpid(create_pid, &runtime_status, 0) < 0) { + do + ret = waitpid(create_pid, &runtime_status, 0); + while (ret < 0 && errno == EINTR); + if (ret < 0) { int old_errno = errno; kill(create_pid, SIGKILL); errno = old_errno; From 595b0557f39a6ad9778fa30b110c30414402809c Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Sat, 22 Jul 2017 17:17:14 +0200 Subject: [PATCH 2/2] conmon: use waitpid to wait for terminated processes During my testing in OpenShift I've noticed that conmon leaves some zombies processes. The reason is that we are using PR_SET_CHILD_SUBREAPER in conmon and runC forks a new process (runc init) each time we start a container. Using g_child_watch_add only on the main runc process and on the container process is not enough as we do not cleanup any other zombie process. Since glib doesn't allow to catch SIGCHLD and to better integrate in the existing code, catch it with signal(2) then raise a SIGUSR1 that glib handles. Signed-off-by: Giuseppe Scrivano --- conmon/conmon.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/conmon/conmon.c b/conmon/conmon.c index 2273d6c2..c9edbd39 100644 --- a/conmon/conmon.c +++ b/conmon/conmon.c @@ -550,6 +550,45 @@ static bool read_stdio(int fd, stdpipe_t pipe, bool *eof) } } +static void on_sigchld(G_GNUC_UNUSED int signal) +{ + raise (SIGUSR1); +} + +static void check_child_processes(GHashTable *pid_to_handler) +{ + void (*cb) (GPid, int, gpointer); + + for (;;) { + int status; + pid_t pid = waitpid(-1, &status, WNOHANG); + + if (pid < 0 && errno == EINTR) + continue; + if (pid < 0 && errno == ECHILD) { + g_main_loop_quit (main_loop); + return; + } + if (pid < 0) + pexit("Failed to read child process status"); + + if (pid == 0) + return; + + /* If we got here, pid > 0, so we have a valid pid to check. */ + cb = g_hash_table_lookup(pid_to_handler, &pid); + if (cb) + cb(pid, status, 0); + } +} + +static gboolean on_sigusr1_cb(gpointer user_data) +{ + GHashTable *pid_to_handler = (GHashTable *) user_data; + check_child_processes (pid_to_handler); + return G_SOURCE_CONTINUE; +} + static gboolean stdio_cb(int fd, GIOCondition condition, gpointer user_data) { stdpipe_t pipe = GPOINTER_TO_INT(user_data); @@ -1221,10 +1260,24 @@ int main(int argc, char *argv[]) close(slavefd_stdout); close(slavefd_stderr); + /* Map pid to its handler. */ + GHashTable *pid_to_handler = g_hash_table_new (g_int_hash, g_int_equal); + g_hash_table_insert (pid_to_handler, &create_pid, runtime_exit_cb); + + /* + * Glib does not support SIGCHLD so use SIGUSR1 with the same semantic. We will + * catch SIGCHLD and raise(SIGUSR1) in the signal handler. + */ + g_unix_signal_add (SIGUSR1, on_sigusr1_cb, pid_to_handler); + + if (signal(SIGCHLD, on_sigchld) == SIG_ERR) + pexit("Failed to set handler for SIGCHLD"); + ninfo("about to waitpid: %d", create_pid); if (csname != NULL) { guint terminal_watch = g_unix_fd_add (console_socket_fd, G_IO_IN, terminal_accept_cb, csname); - g_child_watch_add (create_pid, runtime_exit_cb, NULL); + /* Process any SIGCHLD we may have missed before the signal handler was in place. */ + check_child_processes (pid_to_handler); g_main_loop_run (main_loop); g_source_remove (terminal_watch); } else { @@ -1239,6 +1292,7 @@ int main(int argc, char *argv[]) errno = old_errno; pexit("Failed to wait for `runtime %s`", opt_exec ? "exec" : "create"); } + } if (!WIFEXITED(runtime_status) || WEXITSTATUS(runtime_status) != 0) { @@ -1270,6 +1324,8 @@ int main(int argc, char *argv[]) container_pid = atoi(contents); ninfo("container PID: %d", container_pid); + g_hash_table_insert (pid_to_handler, &container_pid, container_exit_cb); + /* Setup endpoint for attach */ _cleanup_free_ char *attach_symlink_dir_path = NULL; if (!opt_exec) { @@ -1298,7 +1354,7 @@ int main(int argc, char *argv[]) g_timeout_add_seconds (opt_timeout, timeout_cb, NULL); } - g_child_watch_add (container_pid, container_exit_cb, NULL); + check_child_processes(pid_to_handler); g_main_loop_run (main_loop);