Handle shim being sigkilled while containerd is down
Signed-off-by: Kenfe-Mickael Laventure <mickael.laventure@gmail.com>
This commit is contained in:
parent
b6b2fd623e
commit
3281909583
3 changed files with 122 additions and 2 deletions
|
@ -629,6 +629,10 @@ func (c *container) waitForCreate(p *process, cmd *exec.Cmd) error {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
err = p.saveStartTime()
|
||||||
|
if err != nil {
|
||||||
|
logrus.Warnf("containerd: unable to save %s:%s starttime: %v", p.container.id, p.id)
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
case <-time.After(c.timeout):
|
case <-time.After(c.timeout):
|
||||||
cmd.Process.Kill()
|
cmd.Process.Kill()
|
||||||
|
|
|
@ -9,8 +9,10 @@ import (
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/Sirupsen/logrus"
|
"github.com/Sirupsen/logrus"
|
||||||
"github.com/docker/containerd/specs"
|
"github.com/docker/containerd/specs"
|
||||||
|
@ -126,6 +128,13 @@ func loadProcess(root, id string, c *container, s *ProcessState) (*process, erro
|
||||||
},
|
},
|
||||||
state: Stopped,
|
state: Stopped,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
startTime, err := ioutil.ReadFile(filepath.Join(p.root, StartTimeFile))
|
||||||
|
if err != nil && !os.IsNotExist(err) {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
p.startTime = string(startTime)
|
||||||
|
|
||||||
if _, err := p.getPidFromFile(); err != nil {
|
if _, err := p.getPidFromFile(); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
@ -151,6 +160,30 @@ func loadProcess(root, id string, c *container, s *ProcessState) (*process, erro
|
||||||
return p, nil
|
return p, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// readProcStatField returns a single field of /proc/<pid>/stat.
//
// Fields are numbered starting at 1, as in proc(5): field 1 is the pid,
// field 2 is the command name (returned without its surrounding
// parentheses), field 4 is the parent pid and field 22 is the start time.
//
// The command name may itself contain spaces and parentheses, so it is
// delimited by the first "(" and the last ")" of the file rather than by
// splitting on whitespace.
//
// An error is returned when the stat file cannot be read, when it is
// malformed, or when the requested field does not exist.
func readProcStatField(pid int, field int) (string, error) {
	data, err := ioutil.ReadFile(filepath.Join(string(filepath.Separator), "proc", strconv.Itoa(pid), "stat"))
	if err != nil {
		return "", err
	}
	if field < 1 {
		return "", fmt.Errorf("invalid stat field %d for pid %d", field, pid)
	}

	stat := string(data)
	nameStart := strings.Index(stat, "(")
	nameEnd := strings.LastIndex(stat, ")")
	if nameStart < 0 || nameEnd < nameStart {
		return "", fmt.Errorf("malformed stat file for pid %d", pid)
	}

	switch field {
	case 1:
		return strings.TrimSpace(stat[:nameStart]), nil
	case 2:
		return stat[nameStart+1 : nameEnd], nil
	}

	// Everything after the closing parenthesis is plain space-separated
	// data; the first entry there is field 3. Fields also strips the
	// trailing newline from the last entry.
	rest := strings.Fields(stat[nameEnd+1:])
	idx := field - 3
	if idx >= len(rest) {
		return "", fmt.Errorf("stat file for pid %d has no field %d", pid, field)
	}
	return rest[idx], nil
}
|
||||||
|
|
||||||
type process struct {
|
type process struct {
|
||||||
root string
|
root string
|
||||||
id string
|
id string
|
||||||
|
@ -165,6 +198,7 @@ type process struct {
|
||||||
cmdDoneCh chan struct{}
|
cmdDoneCh chan struct{}
|
||||||
state State
|
state State
|
||||||
stateLock sync.Mutex
|
stateLock sync.Mutex
|
||||||
|
startTime string
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *process) ID() string {
|
func (p *process) ID() string {
|
||||||
|
@ -195,7 +229,47 @@ func (p *process) Resize(w, h int) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *process) handleSigkilledShim(rst int, rerr error) (int, error) {
|
func (p *process) handleSigkilledShim(rst int, rerr error) (int, error) {
|
||||||
if rerr == nil || p.cmd == nil || p.cmd.Process == nil {
|
if p.cmd == nil || p.cmd.Process == nil {
|
||||||
|
e := unix.Kill(p.pid, 0)
|
||||||
|
if e == syscall.ESRCH {
|
||||||
|
return rst, rerr
|
||||||
|
}
|
||||||
|
|
||||||
|
// If it's not the same process, just mark it stopped and set
|
||||||
|
// the status to 255
|
||||||
|
if same, err := p.isSameProcess(); !same {
|
||||||
|
logrus.Warnf("containerd: %s:%s (pid %d) is not the same process anymore (%v)", p.container.id, p.id, p.pid, err)
|
||||||
|
p.stateLock.Lock()
|
||||||
|
p.state = Stopped
|
||||||
|
p.stateLock.Unlock()
|
||||||
|
// Create the file so we get the exit event generated once monitor kicks in
|
||||||
|
// without going to this all process again
|
||||||
|
rerr = ioutil.WriteFile(filepath.Join(p.root, ExitStatusFile), []byte("255"), 0644)
|
||||||
|
return 255, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
ppid, err := readProcStatField(p.pid, 4)
|
||||||
|
if err != nil {
|
||||||
|
return rst, fmt.Errorf("could not check process ppid: %v (%v)", err, rerr)
|
||||||
|
}
|
||||||
|
if ppid == "1" {
|
||||||
|
logrus.Warnf("containerd: %s:%s shim died, killing associated process", p.container.id, p.id)
|
||||||
|
unix.Kill(p.pid, syscall.SIGKILL)
|
||||||
|
// wait for the process to die
|
||||||
|
for {
|
||||||
|
e := unix.Kill(p.pid, 0)
|
||||||
|
if e == syscall.ESRCH {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
time.Sleep(10 * time.Millisecond)
|
||||||
|
}
|
||||||
|
|
||||||
|
rst = 128 + int(syscall.SIGKILL)
|
||||||
|
// Create the file so we get the exit event generated once monitor kicks in
|
||||||
|
// without going to this all process again
|
||||||
|
rerr = ioutil.WriteFile(filepath.Join(p.root, ExitStatusFile), []byte(fmt.Sprintf("%d", rst)), 0644)
|
||||||
|
}
|
||||||
|
|
||||||
return rst, rerr
|
return rst, rerr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -218,6 +292,9 @@ func (p *process) handleSigkilledShim(rst int, rerr error) (int, error) {
|
||||||
wpid int
|
wpid int
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Some processes change their PR_SET_PDEATHSIG, so force kill them
|
||||||
|
unix.Kill(p.pid, syscall.SIGKILL)
|
||||||
|
|
||||||
for wpid == 0 {
|
for wpid == 0 {
|
||||||
wpid, e = unix.Wait4(p.pid, &status, unix.WNOHANG, &rusage)
|
wpid, e = unix.Wait4(p.pid, &status, unix.WNOHANG, &rusage)
|
||||||
if e != nil {
|
if e != nil {
|
||||||
|
@ -244,7 +321,9 @@ func (p *process) handleSigkilledShim(rst int, rerr error) (int, error) {
|
||||||
func (p *process) ExitStatus() (rst int, rerr error) {
|
func (p *process) ExitStatus() (rst int, rerr error) {
|
||||||
data, err := ioutil.ReadFile(filepath.Join(p.root, ExitStatusFile))
|
data, err := ioutil.ReadFile(filepath.Join(p.root, ExitStatusFile))
|
||||||
defer func() {
|
defer func() {
|
||||||
|
if rerr != nil {
|
||||||
rst, rerr = p.handleSigkilledShim(rst, rerr)
|
rst, rerr = p.handleSigkilledShim(rst, rerr)
|
||||||
|
}
|
||||||
}()
|
}()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if os.IsNotExist(err) {
|
if os.IsNotExist(err) {
|
||||||
|
@ -297,6 +376,40 @@ func (p *process) getPidFromFile() (int, error) {
|
||||||
return i, nil
|
return i, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (p *process) readStartTime() (string, error) {
|
||||||
|
return readProcStatField(p.pid, 22)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *process) saveStartTime() error {
|
||||||
|
startTime, err := p.readStartTime()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
p.startTime = startTime
|
||||||
|
return ioutil.WriteFile(filepath.Join(p.root, StartTimeFile), []byte(startTime), 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *process) isSameProcess() (bool, error) {
|
||||||
|
// for backward compat assume it's the same if startTime wasn't set
|
||||||
|
if p.startTime == "" {
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
if p.pid == 0 {
|
||||||
|
_, err := p.getPidFromFile()
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, err := p.readStartTime()
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return startTime == p.startTime, nil
|
||||||
|
}
|
||||||
|
|
||||||
// Wait will reap the shim process
|
// Wait will reap the shim process
|
||||||
func (p *process) Wait() {
|
func (p *process) Wait() {
|
||||||
if p.cmdDoneCh != nil {
|
if p.cmdDoneCh != nil {
|
||||||
|
|
|
@ -44,6 +44,9 @@ const (
|
||||||
// InitProcessID holds the special ID used for the very first
|
// InitProcessID holds the special ID used for the very first
|
||||||
// container's process
|
// container's process
|
||||||
InitProcessID = "init"
|
InitProcessID = "init"
|
||||||
|
// StartTimeFile holds the name of the file in which the process
|
||||||
|
// start time is saved
|
||||||
|
StartTimeFile = "starttime"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Checkpoint holds information regarding a container checkpoint
|
// Checkpoint holds information regarding a container checkpoint
|
||||||
|
|
Loading…
Reference in a new issue