Merge pull request #317 from mlaventure/handle-exec-clingy-children

Handle exec clingy children
This commit is contained in:
Michael Crosby 2016-09-19 11:53:06 -07:00 committed by GitHub
commit 4eb3147efc
6 changed files with 75 additions and 54 deletions

View file

@ -106,7 +106,7 @@ func start(log *os.File) error {
case s := <-signals: case s := <-signals:
switch s { switch s {
case syscall.SIGCHLD: case syscall.SIGCHLD:
exits, _ := osutils.Reap() exits, _ := osutils.Reap(false)
for _, e := range exits { for _, e := range exits {
// check to see if runtime is one of the processes that has exited // check to see if runtime is one of the processes that has exited
if e.Pid == p.pid() { if e.Pid == p.pid() {
@ -117,6 +117,9 @@ func start(log *os.File) error {
} }
// runtime has exited so the shim can also exit // runtime has exited so the shim can also exit
if exitShim { if exitShim {
// Wait for all the children this process may have created
// (only needed for exec, but it won't hurt when done on init)
osutils.Reap(true)
// Let containerd take care of calling the runtime delete // Let containerd take care of calling the runtime delete
f.Close() f.Close()
p.Wait() p.Wait()

View file

@ -23,7 +23,6 @@ import (
"github.com/docker/containerd/api/grpc/server" "github.com/docker/containerd/api/grpc/server"
"github.com/docker/containerd/api/grpc/types" "github.com/docker/containerd/api/grpc/types"
"github.com/docker/containerd/api/http/pprof" "github.com/docker/containerd/api/http/pprof"
"github.com/docker/containerd/osutils"
"github.com/docker/containerd/supervisor" "github.com/docker/containerd/supervisor"
"github.com/docker/docker/pkg/listeners" "github.com/docker/docker/pkg/listeners"
"github.com/rcrowley/go-metrics" "github.com/rcrowley/go-metrics"
@ -160,7 +159,6 @@ func main() {
func daemon(context *cli.Context) error { func daemon(context *cli.Context) error {
s := make(chan os.Signal, 2048) s := make(chan os.Signal, 2048)
signal.Notify(s, syscall.SIGTERM, syscall.SIGINT) signal.Notify(s, syscall.SIGTERM, syscall.SIGINT)
osutils.SetSubreaper(1)
sv, err := supervisor.New( sv, err := supervisor.New(
context.String("state-dir"), context.String("state-dir"),
context.String("runtime"), context.String("runtime"),

View file

@ -12,13 +12,17 @@ type Exit struct {
// Reap reaps all child processes for the calling process and returns their // Reap reaps all child processes for the calling process and returns their
// exit information // exit information
func Reap() (exits []Exit, err error) { func Reap(wait bool) (exits []Exit, err error) {
var ( var (
ws syscall.WaitStatus ws syscall.WaitStatus
rus syscall.Rusage rus syscall.Rusage
) )
flag := syscall.WNOHANG
if wait {
flag = 0
}
for { for {
pid, err := syscall.Wait4(-1, &ws, syscall.WNOHANG, &rus) pid, err := syscall.Wait4(-1, &ws, flag, &rus)
if err != nil { if err != nil {
if err == syscall.ECHILD { if err == syscall.ECHILD {
return exits, nil return exits, nil

View file

@ -14,6 +14,7 @@ import (
"github.com/Sirupsen/logrus" "github.com/Sirupsen/logrus"
"github.com/docker/containerd/specs" "github.com/docker/containerd/specs"
ocs "github.com/opencontainers/runtime-spec/specs-go" ocs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
) )
// Container defines the operations allowed on a container // Container defines the operations allowed on a container
@ -480,13 +481,34 @@ func (c *container) createCmd(pid string, cmd *exec.Cmd, p *process) error {
} }
return err return err
} }
// We need the pid file to have been written to run
defer func() {
go func() { go func() {
err := p.cmd.Wait() err := p.cmd.Wait()
if err == nil { if err == nil {
p.cmdSuccess = true p.cmdSuccess = true
} }
if same, err := p.isSameProcess(); same && p.pid > 0 {
// The process changed its PR_SET_PDEATHSIG, so force
// kill it
logrus.Infof("containerd: %s:%s (pid %v) has become an orphan, killing it", p.container.id, p.id, p.pid)
err = unix.Kill(p.pid, syscall.SIGKILL)
if err != nil && err != syscall.ESRCH {
logrus.Errorf("containerd: unable to SIGKILL %s:%s (pid %v): %v", p.container.id, p.id, p.pid, err)
} else {
for {
err = unix.Kill(p.pid, 0)
if err != nil {
break
}
time.Sleep(5 * time.Millisecond)
}
}
}
close(p.cmdDoneCh) close(p.cmdDoneCh)
}() }()
}()
if err := c.waitForCreate(p, cmd); err != nil { if err := c.waitForCreate(p, cmd); err != nil {
return err return err
} }

View file

@ -228,24 +228,31 @@ func (p *process) Resize(w, h int) error {
return err return err
} }
// updateExitStatusFile marks the process as Stopped and records status in the
// ExitStatusFile located under p.root.
//
// The state change is made while holding p.stateLock and happens before the
// file is written, so the process is flagged Stopped even when the write
// fails. The status is stored as its decimal text representation with file
// mode 0644.
//
// It returns the status it was given together with any error reported by the
// file write.
func (p *process) updateExitStatusFile(status int) (int, error) {
	p.stateLock.Lock()
	p.state = Stopped
	p.stateLock.Unlock()

	exitFile := filepath.Join(p.root, ExitStatusFile)
	payload := []byte(fmt.Sprintf("%d", status))
	if writeErr := ioutil.WriteFile(exitFile, payload, 0644); writeErr != nil {
		return status, writeErr
	}
	return status, nil
}
func (p *process) handleSigkilledShim(rst int, rerr error) (int, error) { func (p *process) handleSigkilledShim(rst int, rerr error) (int, error) {
if p.cmd == nil || p.cmd.Process == nil { if p.cmd == nil || p.cmd.Process == nil {
e := unix.Kill(p.pid, 0) e := unix.Kill(p.pid, 0)
if e == syscall.ESRCH { if e == syscall.ESRCH {
return rst, rerr logrus.Warnf("containerd: %s:%s (pid %d) does not exist", p.container.id, p.id, p.pid)
// The process died while containerd was down (probably of
// SIGKILL, but no way to be sure)
return p.updateExitStatusFile(255)
} }
// If it's not the same process, just mark it stopped and set // If it's not the same process, just mark it stopped and set
// the status to 255 // the status to 255
if same, err := p.isSameProcess(); !same { if same, err := p.isSameProcess(); !same {
logrus.Warnf("containerd: %s:%s (pid %d) is not the same process anymore (%v)", p.container.id, p.id, p.pid, err) logrus.Warnf("containerd: %s:%s (pid %d) is not the same process anymore (%v)", p.container.id, p.id, p.pid, err)
p.stateLock.Lock()
p.state = Stopped
p.stateLock.Unlock()
// Create the file so we get the exit event generated once monitor kicks in // Create the file so we get the exit event generated once monitor kicks in
// without going to this all process again // without having to go through all this process again
rerr = ioutil.WriteFile(filepath.Join(p.root, ExitStatusFile), []byte("255"), 0644) return p.updateExitStatusFile(255)
return 255, nil
} }
ppid, err := readProcStatField(p.pid, 4) ppid, err := readProcStatField(p.pid, 4)
@ -255,19 +262,21 @@ func (p *process) handleSigkilledShim(rst int, rerr error) (int, error) {
if ppid == "1" { if ppid == "1" {
logrus.Warnf("containerd: %s:%s shim died, killing associated process", p.container.id, p.id) logrus.Warnf("containerd: %s:%s shim died, killing associated process", p.container.id, p.id)
unix.Kill(p.pid, syscall.SIGKILL) unix.Kill(p.pid, syscall.SIGKILL)
if err != nil && err != syscall.ESRCH {
return 255, fmt.Errorf("containerd: unable to SIGKILL %s:%s (pid %v): %v", p.container.id, p.id, p.pid, err)
}
// wait for the process to die // wait for the process to die
for { for {
e := unix.Kill(p.pid, 0) e := unix.Kill(p.pid, 0)
if e == syscall.ESRCH { if e == syscall.ESRCH {
break break
} }
time.Sleep(10 * time.Millisecond) time.Sleep(5 * time.Millisecond)
} }
rst = 128 + int(syscall.SIGKILL)
// Create the file so we get the exit event generated once monitor kicks in // Create the file so we get the exit event generated once monitor kicks in
// without going to this all process again // without having to go through all this process again
rerr = ioutil.WriteFile(filepath.Join(p.root, ExitStatusFile), []byte(fmt.Sprintf("%d", rst)), 0644) return p.updateExitStatusFile(128 + int(syscall.SIGKILL))
} }
return rst, rerr return rst, rerr
@ -286,29 +295,8 @@ func (p *process) handleSigkilledShim(rst int, rerr error) (int, error) {
if shimStatus.Signaled() && shimStatus.Signal() == syscall.SIGKILL { if shimStatus.Signaled() && shimStatus.Signal() == syscall.SIGKILL {
logrus.Debugf("containerd: ExitStatus(container: %s, process: %s): shim was SIGKILL'ed reaping its child with pid %d", p.container.id, p.id, p.pid) logrus.Debugf("containerd: ExitStatus(container: %s, process: %s): shim was SIGKILL'ed reaping its child with pid %d", p.container.id, p.id, p.pid)
var (
status unix.WaitStatus
rusage unix.Rusage
wpid int
)
// Some processes change their PR_SET_PDEATHSIG, so force kill them
unix.Kill(p.pid, syscall.SIGKILL)
for wpid == 0 {
wpid, e = unix.Wait4(p.pid, &status, unix.WNOHANG, &rusage)
if e != nil {
logrus.Debugf("containerd: ExitStatus(container: %s, process: %s): Wait4(%d): %v", p.container.id, p.id, p.pid, rerr)
return rst, rerr
}
}
if wpid == p.pid {
rerr = nil rerr = nil
rst = 128 + int(shimStatus.Signal()) rst = 128 + int(shimStatus.Signal())
} else {
logrus.Errorf("containerd: ExitStatus(container: %s, process: %s): unexpected returned pid from wait4 %v (expected %v)", p.container.id, p.id, wpid, p.pid)
}
p.stateLock.Lock() p.stateLock.Lock()
p.state = Stopped p.state = Stopped

View file

@ -73,6 +73,11 @@ func (s *Supervisor) execExit(t *ExecExitTask) error {
if err := container.RemoveProcess(t.PID); err != nil { if err := container.RemoveProcess(t.PID); err != nil {
logrus.WithField("error", err).Error("containerd: find container for pid") logrus.WithField("error", err).Error("containerd: find container for pid")
} }
// If the exec spawned children which are still using its IO
// waiting here will block until they die or close their IO
// descriptors.
// Hence, we use a goroutine to avoid blocking all other operations
go func() {
t.Process.Wait() t.Process.Wait()
s.notifySubscribers(Event{ s.notifySubscribers(Event{
Timestamp: time.Now(), Timestamp: time.Now(),
@ -81,5 +86,6 @@ func (s *Supervisor) execExit(t *ExecExitTask) error {
PID: t.PID, PID: t.PID,
Status: t.Status, Status: t.Status,
}) })
}()
return nil return nil
} }