conmon: Don't leave zombies and fix cgroup race
Currently, when creating containers we never call Wait on the conmon exec.Command, which means that the child hangs around forever as a zombie after it dies. However, instead of doing this waitpid() in the parent we instead do a double-fork in conmon, to daemonize it. That makes a lot of sense, as conmon really is not tied to the launcher, but needs to outlive it if e.g. the cri-o daemon restarts. However, this makes even more obvious a race condition which we already have. When crio-d puts the conmon pid in a cgroup there is a race where conmon could already have spawned a child, and it would then not be part of the cgroup. In order to fix this we add another synchronization pipe to conmon, which we block on before we create any children. The parent then makes sure the pid is in the cgroup before letting it continue. Signed-off-by: Alexander Larsson <alexl@redhat.com>
This commit is contained in:
parent
7b9032bac7
commit
af4fbcd942
2 changed files with 80 additions and 18 deletions
|
@ -719,7 +719,7 @@ int main(int argc, char *argv[])
|
||||||
_cleanup_free_ char *contents;
|
_cleanup_free_ char *contents;
|
||||||
int cpid = -1;
|
int cpid = -1;
|
||||||
int status;
|
int status;
|
||||||
pid_t pid, create_pid;
|
pid_t pid, main_pid, create_pid;
|
||||||
_cleanup_close_ int epfd = -1;
|
_cleanup_close_ int epfd = -1;
|
||||||
_cleanup_close_ int csfd = -1;
|
_cleanup_close_ int csfd = -1;
|
||||||
/* Used for !terminal cases. */
|
/* Used for !terminal cases. */
|
||||||
|
@ -730,11 +730,14 @@ int main(int argc, char *argv[])
|
||||||
char buf[BUF_SIZE];
|
char buf[BUF_SIZE];
|
||||||
int num_read;
|
int num_read;
|
||||||
int sync_pipe_fd = -1;
|
int sync_pipe_fd = -1;
|
||||||
char *sync_pipe, *endptr;
|
int start_pipe_fd = -1;
|
||||||
|
char *start_pipe, *sync_pipe, *endptr;
|
||||||
int len;
|
int len;
|
||||||
GError *error = NULL;
|
GError *error = NULL;
|
||||||
GOptionContext *context;
|
GOptionContext *context;
|
||||||
GPtrArray *runtime_argv = NULL;
|
GPtrArray *runtime_argv = NULL;
|
||||||
|
_cleanup_close_ int dev_null_r = -1;
|
||||||
|
_cleanup_close_ int dev_null_w = -1;
|
||||||
|
|
||||||
_cleanup_free_ char *memory_cgroup_path = NULL;
|
_cleanup_free_ char *memory_cgroup_path = NULL;
|
||||||
int wb;
|
int wb;
|
||||||
|
@ -765,6 +768,14 @@ int main(int argc, char *argv[])
|
||||||
bundle_path = cwd;
|
bundle_path = cwd;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
dev_null_r = open("/dev/null", O_RDONLY | O_CLOEXEC);
|
||||||
|
if (dev_null_r < 0)
|
||||||
|
pexit("Failed to open /dev/null");
|
||||||
|
|
||||||
|
dev_null_w = open("/dev/null", O_WRONLY | O_CLOEXEC);
|
||||||
|
if (dev_null_w < 0)
|
||||||
|
pexit("Failed to open /dev/null");
|
||||||
|
|
||||||
if (exec && exec_process_spec == NULL) {
|
if (exec && exec_process_spec == NULL) {
|
||||||
nexit("Exec process spec path not provided. Use --exec-process-spec");
|
nexit("Exec process spec path not provided. Use --exec-process-spec");
|
||||||
}
|
}
|
||||||
|
@ -780,6 +791,44 @@ int main(int argc, char *argv[])
|
||||||
if (log_path == NULL)
|
if (log_path == NULL)
|
||||||
nexit("Log file path not provided. Use --log-path");
|
nexit("Log file path not provided. Use --log-path");
|
||||||
|
|
||||||
|
start_pipe = getenv("_OCI_STARTPIPE");
|
||||||
|
if (start_pipe) {
|
||||||
|
errno = 0;
|
||||||
|
start_pipe_fd = strtol(start_pipe, &endptr, 10);
|
||||||
|
if (errno != 0 || *endptr != '\0')
|
||||||
|
pexit("unable to parse _OCI_STARTPIPE");
|
||||||
|
/* Block for an initial write to the start pipe before
|
||||||
|
spawning any childred or exiting, to ensure the
|
||||||
|
parent can put us in the right cgroup. */
|
||||||
|
read(start_pipe_fd, buf, BUF_SIZE);
|
||||||
|
close(start_pipe_fd);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!exec) {
|
||||||
|
/* In the create-container case we double-fork in
|
||||||
|
order to disconnect from the parent, as we want to
|
||||||
|
continue in a daemon-like way */
|
||||||
|
main_pid = fork();
|
||||||
|
if (main_pid < 0) {
|
||||||
|
pexit("Failed to fork the create command");
|
||||||
|
} else if (main_pid != 0) {
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Disconnect stdio from parent. We need to do this, because
|
||||||
|
the parent is waiting for the stdout to end when the intermediate
|
||||||
|
child dies */
|
||||||
|
if (dup2(dev_null_r, STDIN_FILENO) < 0)
|
||||||
|
pexit("Failed to dup over stdout");
|
||||||
|
if (dup2(dev_null_w, STDOUT_FILENO) < 0)
|
||||||
|
pexit("Failed to dup over stdout");
|
||||||
|
if (dup2(dev_null_w, STDERR_FILENO) < 0)
|
||||||
|
pexit("Failed to dup over stderr");
|
||||||
|
|
||||||
|
/* Create a new session group */
|
||||||
|
setsid();
|
||||||
|
}
|
||||||
|
|
||||||
/* Environment variables */
|
/* Environment variables */
|
||||||
sync_pipe = getenv("_OCI_SYNCPIPE");
|
sync_pipe = getenv("_OCI_SYNCPIPE");
|
||||||
if (sync_pipe) {
|
if (sync_pipe) {
|
||||||
|
@ -918,25 +967,21 @@ int main(int argc, char *argv[])
|
||||||
if (create_pid < 0) {
|
if (create_pid < 0) {
|
||||||
pexit("Failed to fork the create command");
|
pexit("Failed to fork the create command");
|
||||||
} else if (!create_pid) {
|
} else if (!create_pid) {
|
||||||
_cleanup_close_ int dev_null = -1;
|
|
||||||
/* FIXME: This results in us not outputting runc error messages to crio's log. */
|
/* FIXME: This results in us not outputting runc error messages to crio's log. */
|
||||||
if (slavefd_stdin < 0) {
|
if (slavefd_stdin < 0)
|
||||||
dev_null = open("/dev/null", O_RDONLY);
|
slavefd_stdin = dev_null_r;
|
||||||
if (dev_null < 0)
|
|
||||||
pexit("Failed to open /dev/null");
|
|
||||||
slavefd_stdin = dev_null;
|
|
||||||
}
|
|
||||||
if (dup2(slavefd_stdin, STDIN_FILENO) < 0)
|
if (dup2(slavefd_stdin, STDIN_FILENO) < 0)
|
||||||
pexit("Failed to dup over stdout");
|
pexit("Failed to dup over stdout");
|
||||||
|
|
||||||
if (slavefd_stdout >= 0) {
|
if (slavefd_stdout < 0)
|
||||||
|
slavefd_stdout = dev_null_w;
|
||||||
if (dup2(slavefd_stdout, STDOUT_FILENO) < 0)
|
if (dup2(slavefd_stdout, STDOUT_FILENO) < 0)
|
||||||
pexit("Failed to dup over stdout");
|
pexit("Failed to dup over stdout");
|
||||||
}
|
|
||||||
if (slavefd_stderr >= 0) {
|
if (slavefd_stderr < 0)
|
||||||
|
slavefd_stderr = slavefd_stdout;
|
||||||
if (dup2(slavefd_stderr, STDERR_FILENO) < 0)
|
if (dup2(slavefd_stderr, STDERR_FILENO) < 0)
|
||||||
pexit("Failed to dup over stderr");
|
pexit("Failed to dup over stderr");
|
||||||
}
|
|
||||||
|
|
||||||
execv(g_ptr_array_index(runtime_argv,0), (char **)runtime_argv->pdata);
|
execv(g_ptr_array_index(runtime_argv,0), (char **)runtime_argv->pdata);
|
||||||
exit(127);
|
exit(127);
|
||||||
|
|
19
oci/oci.go
19
oci/oci.go
|
@ -104,10 +104,12 @@ func getOCIVersion(name string, args ...string) (string, error) {
|
||||||
func (r *Runtime) CreateContainer(c *Container, cgroupParent string) error {
|
func (r *Runtime) CreateContainer(c *Container, cgroupParent string) error {
|
||||||
var stderrBuf bytes.Buffer
|
var stderrBuf bytes.Buffer
|
||||||
parentPipe, childPipe, err := newPipe()
|
parentPipe, childPipe, err := newPipe()
|
||||||
|
childStartPipe, parentStartPipe, err := newPipe()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("error creating socket pair: %v", err)
|
return fmt.Errorf("error creating socket pair: %v", err)
|
||||||
}
|
}
|
||||||
defer parentPipe.Close()
|
defer parentPipe.Close()
|
||||||
|
defer parentStartPipe.Close()
|
||||||
|
|
||||||
var args []string
|
var args []string
|
||||||
if r.cgroupManager == "systemd" {
|
if r.cgroupManager == "systemd" {
|
||||||
|
@ -139,9 +141,10 @@ func (r *Runtime) CreateContainer(c *Container, cgroupParent string) error {
|
||||||
if c.terminal {
|
if c.terminal {
|
||||||
cmd.Stderr = &stderrBuf
|
cmd.Stderr = &stderrBuf
|
||||||
}
|
}
|
||||||
cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
|
cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe, childStartPipe)
|
||||||
// 0, 1 and 2 are stdin, stdout and stderr
|
// 0, 1 and 2 are stdin, stdout and stderr
|
||||||
cmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", 3))
|
cmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", 3))
|
||||||
|
cmd.Env = append(cmd.Env, fmt.Sprintf("_OCI_STARTPIPE=%d", 4))
|
||||||
|
|
||||||
err = cmd.Start()
|
err = cmd.Start()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -151,6 +154,7 @@ func (r *Runtime) CreateContainer(c *Container, cgroupParent string) error {
|
||||||
|
|
||||||
// We don't need childPipe on the parent side
|
// We don't need childPipe on the parent side
|
||||||
childPipe.Close()
|
childPipe.Close()
|
||||||
|
childStartPipe.Close()
|
||||||
|
|
||||||
// Move conmon to specified cgroup
|
// Move conmon to specified cgroup
|
||||||
if cgroupParent != "" {
|
if cgroupParent != "" {
|
||||||
|
@ -162,6 +166,19 @@ func (r *Runtime) CreateContainer(c *Container, cgroupParent string) error {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* We set the cgroup, now the child can start creating children */
|
||||||
|
someData := []byte{0}
|
||||||
|
_, err = parentStartPipe.Write(someData)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Wait for initial setup and fork, and reap child */
|
||||||
|
err = cmd.Wait()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
// Wait to get container pid from conmon
|
// Wait to get container pid from conmon
|
||||||
type syncStruct struct {
|
type syncStruct struct {
|
||||||
si *syncInfo
|
si *syncInfo
|
||||||
|
|
Loading…
Reference in a new issue