diff --git a/Makefile b/Makefile index 20da70e..fc450f7 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ PACKAGES=$(shell go list ./... | grep -v /vendor/) INTEGRATION_PACKAGE=${PROJECT_ROOT}/integration # Project binaries. -COMMANDS=ctr containerd protoc-gen-gogoctrd +COMMANDS=ctr containerd containerd-shim protoc-gen-gogoctrd BINARIES=$(addprefix bin/,$(COMMANDS)) # TODO(stevvooe): This will set version from git tag, but overrides major, @@ -130,4 +130,3 @@ coverage-integration: ## generate coverprofiles from the integration tests help: ## this help @awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST) | sort - diff --git a/cmd/containerd-shim/console.go b/cmd/containerd-shim/console.go new file mode 100644 index 0000000..1d3262d --- /dev/null +++ b/cmd/containerd-shim/console.go @@ -0,0 +1,56 @@ +// +build !solaris + +package main + +import ( + "fmt" + "os" + "syscall" + "unsafe" +) + +// NewConsole returns an initialized console that can be used within a container by copying bytes +// from the master side to the slave that is attached as the tty for the container's init process. +func newConsole(uid, gid int) (*os.File, string, error) { + master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) + if err != nil { + return nil, "", err + } + console, err := ptsname(master) + if err != nil { + return nil, "", err + } + if err := unlockpt(master); err != nil { + return nil, "", err + } + if err := os.Chmod(console, 0600); err != nil { + return nil, "", err + } + if err := os.Chown(console, uid, gid); err != nil { + return nil, "", err + } + return master, console, nil +} + +func ioctl(fd uintptr, flag, data uintptr) error { + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 { + return err + } + return nil +} + +// unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f. +// unlockpt should be called before opening the slave side of a pty. +func unlockpt(f *os.File) error { + var u int32 + return ioctl(f.Fd(), syscall.TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) +} + +// ptsname retrieves the name of the first available pts for the given master. +func ptsname(f *os.File) (string, error) { + var n int32 + if err := ioctl(f.Fd(), syscall.TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { + return "", err + } + return fmt.Sprintf("/dev/pts/%d", n), nil +} diff --git a/cmd/containerd-shim/console_solaris.go b/cmd/containerd-shim/console_solaris.go new file mode 100644 index 0000000..37b3368 --- /dev/null +++ b/cmd/containerd-shim/console_solaris.go @@ -0,0 +1,14 @@ +// +build solaris + +package main + +import ( + "errors" + "os" +) + +// NewConsole returns an initalized console that can be used within a container by copying bytes +// from the master side to the slave that is attached as the tty for the container's init process. +func newConsole(uid, gid int) (*os.File, string, error) { + return nil, "", errors.New("newConsole not implemented on Solaris") +} diff --git a/cmd/containerd-shim/main.go b/cmd/containerd-shim/main.go new file mode 100644 index 0000000..34245ad --- /dev/null +++ b/cmd/containerd-shim/main.go @@ -0,0 +1,213 @@ +package main + +import ( + "flag" + "fmt" + "os" + "os/signal" + "path/filepath" + "runtime" + "syscall" + + "github.com/docker/containerd/sys" + "github.com/docker/docker/pkg/term" +) + +var logFile *os.File + +func writeMessage(f *os.File, level string, err error) { + fmt.Fprintf(f, `{"level": "%s","msg": "%s"}`, level, err) + f.Sync() +} + +type controlMessage struct { + Type int + Width int + Height int +} + +// containerd-shim is a small shim that sits in front of a runtime implementation +// that allows it to be repartented to init and handle reattach from the caller. +// +// the cwd of the shim should be the path to the state directory where the shim +// can locate fifos and other information. +// Arg0: id of the container +// Arg1: bundle path +// Arg2: runtime binary +func main() { + flag.Parse() + cwd, err := os.Getwd() + if err != nil { + panic(err) + } + f, err := os.OpenFile(filepath.Join(cwd, "shim-log.json"), os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0666) + if err != nil { + panic(err) + } + if err := start(f); err != nil { + // this means that the runtime failed starting the container and will have the + // proper error messages in the runtime log so we should to treat this as a + // shim failure because the sim executed properly + if err == errRuntime { + f.Close() + return + } + // log the error instead of writing to stderr because the shim will have + // /dev/null as it's stdio because it is supposed to be reparented to system + // init and will not have anyone to read from it + writeMessage(f, "error", err) + f.Close() + os.Exit(1) + } +} + +func start(log *os.File) error { + // start handling signals as soon as possible so that things are properly reaped + // or if runtime exits before we hit the handler + signals := make(chan os.Signal, 2048) + signal.Notify(signals) + // set the shim as the subreaper for all orphaned processes created by the container + if err := sys.SetSubreaper(1); err != nil { + return err + } + // open the exit pipe + f, err := os.OpenFile("exit", syscall.O_WRONLY, 0) + if err != nil { + return err + } + defer f.Close() + control, err := os.OpenFile("control", syscall.O_RDWR, 0) + if err != nil { + return err + } + defer control.Close() + p, err := newProcess(flag.Arg(0), flag.Arg(1), flag.Arg(2)) + if err != nil { + return err + } + defer func() { + if err := p.Close(); err != nil { + writeMessage(log, "warn", err) + } + }() + if err := p.create(); err != nil { + p.delete() + return err + } + msgC := make(chan controlMessage, 32) + go func() { + for { + var m controlMessage + if _, err := fmt.Fscanf(control, "%d %d %d\n", &m.Type, &m.Width, &m.Height); err != nil { + continue + } + msgC <- m + } + }() + if runtime.GOOS == "solaris" { + return nil + } + var exitShim bool + for { + select { + case s := <-signals: + switch s { + case syscall.SIGCHLD: + exits, _ := Reap(false) + for _, e := range exits { + // check to see if runtime is one of the processes that has exited + if e.Pid == p.pid() { + exitShim = true + writeInt("exitStatus", e.Status) + } + } + } + // runtime has exited so the shim can also exit + if exitShim { + // kill all processes in the container incase it was not running in + // its own PID namespace + p.killAll() + // wait for all the processes and IO to finish + p.Wait() + // delete the container from the runtime + p.delete() + // the close of the exit fifo will happen when the shim exits + return nil + } + case msg := <-msgC: + switch msg.Type { + case 0: + // close stdin + if p.stdinCloser != nil { + p.stdinCloser.Close() + } + case 1: + if p.console == nil { + continue + } + ws := term.Winsize{ + Width: uint16(msg.Width), + Height: uint16(msg.Height), + } + term.SetWinsize(p.console.Fd(), &ws) + } + } + } + return nil +} + +func writeInt(path string, i int) error { + f, err := os.Create(path) + if err != nil { + return err + } + defer f.Close() + _, err = fmt.Fprintf(f, "%d", i) + return err +} + +// Exit is the wait4 information from an exited process +type Exit struct { + Pid int + Status int +} + +// Reap reaps all child processes for the calling process and returns their +// exit information +func Reap(wait bool) (exits []Exit, err error) { + var ( + ws syscall.WaitStatus + rus syscall.Rusage + ) + flag := syscall.WNOHANG + if wait { + flag = 0 + } + for { + pid, err := syscall.Wait4(-1, &ws, flag, &rus) + if err != nil { + if err == syscall.ECHILD { + return exits, nil + } + return exits, err + } + if pid <= 0 { + return exits, nil + } + exits = append(exits, Exit{ + Pid: pid, + Status: exitStatus(ws), + }) + } +} + +const exitSignalOffset = 128 + +// exitStatus returns the correct exit status for a process based on if it +// was signaled or exited cleanly +func exitStatus(status syscall.WaitStatus) int { + if status.Signaled() { + return exitSignalOffset + int(status.Signal()) + } + return status.ExitStatus() +} diff --git a/cmd/containerd-shim/process.go b/cmd/containerd-shim/process.go new file mode 100644 index 0000000..a334aa1 --- /dev/null +++ b/cmd/containerd-shim/process.go @@ -0,0 +1,295 @@ +package main + +import ( + "encoding/json" + "errors" + "fmt" + "io" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "sync" + "syscall" + "time" +) + +var errRuntime = errors.New("shim: runtime execution error") + +type checkpoint struct { + // Timestamp is the time that checkpoint happened + Created time.Time `json:"created"` + // Name is the name of the checkpoint + Name string `json:"name"` + // TCP checkpoints open tcp connections + TCP bool `json:"tcp"` + // UnixSockets persists unix sockets in the checkpoint + UnixSockets bool `json:"unixSockets"` + // Shell persists tty sessions in the checkpoint + Shell bool `json:"shell"` + // Exit exits the container after the checkpoint is finished + Exit bool `json:"exit"` + // EmptyNS tells CRIU not to restore a particular namespace + EmptyNS []string `json:"emptyNS,omitempty"` +} + +type processState struct { + Terminal bool `json:terminal` + Exec bool `json:"exec"` + Stdin string `json:"containerdStdin"` + Stdout string `json:"containerdStdout"` + Stderr string `json:"containerdStderr"` + RuntimeArgs []string `json:"runtimeArgs"` + + NoPivotRoot bool `json:"noPivotRoot"` + CheckpointPath string `json:"checkpoint"` + RootUID int `json:"rootUID"` + RootGID int `json:"rootGID"` +} + +type process struct { + sync.WaitGroup + id string + bundle string + stdio *stdio + exec bool + containerPid int + checkpoint *checkpoint + checkpointPath string + shimIO *IO + stdinCloser io.Closer + console *os.File + consolePath string + state *processState + runtime string +} + +func newProcess(id, bundle, runtimeName string) (*process, error) { + p := &process{ + id: id, + bundle: bundle, + runtime: runtimeName, + } + s, err := loadProcess() + if err != nil { + return nil, err + } + p.state = s + if s.CheckpointPath != "" { + cpt, err := loadCheckpoint(s.CheckpointPath) + if err != nil { + return nil, err + } + p.checkpoint = cpt + p.checkpointPath = s.CheckpointPath + } + if err := p.openIO(); err != nil { + return nil, err + } + return p, nil +} + +func loadProcess() (*processState, error) { + f, err := os.Open("process.json") + if err != nil { + return nil, err + } + defer f.Close() + var s processState + if err := json.NewDecoder(f).Decode(&s); err != nil { + return nil, err + } + return &s, nil +} + +func loadCheckpoint(checkpointPath string) (*checkpoint, error) { + f, err := os.Open(filepath.Join(checkpointPath, "config.json")) + if err != nil { + return nil, err + } + defer f.Close() + var cpt checkpoint + if err := json.NewDecoder(f).Decode(&cpt); err != nil { + return nil, err + } + return &cpt, nil +} + +func (p *process) create() error { + cwd, err := os.Getwd() + if err != nil { + return err + } + logPath := filepath.Join(cwd, "log.json") + args := append([]string{ + "--log", logPath, + "--log-format", "json", + }, p.state.RuntimeArgs...) + if p.state.Exec { + args = append(args, "exec", + "-d", + "--process", filepath.Join(cwd, "process.json"), + "--console", p.consolePath, + ) + } else if p.checkpoint != nil { + args = append(args, "restore", + "-d", + "--image-path", p.checkpointPath, + "--work-path", filepath.Join(p.checkpointPath, "criu.work", "restore-"+time.Now().Format(time.RFC3339)), + ) + add := func(flags ...string) { + args = append(args, flags...) + } + if p.checkpoint.Shell { + add("--shell-job") + } + if p.checkpoint.TCP { + add("--tcp-established") + } + if p.checkpoint.UnixSockets { + add("--ext-unix-sk") + } + if p.state.NoPivotRoot { + add("--no-pivot") + } + for _, ns := range p.checkpoint.EmptyNS { + add("--empty-ns", ns) + } + + } else { + args = append(args, "create", + "--bundle", p.bundle, + "--console", p.consolePath, + ) + if p.state.NoPivotRoot { + args = append(args, "--no-pivot") + } + } + args = append(args, + "--pid-file", filepath.Join(cwd, "pid"), + p.id, + ) + cmd := exec.Command(p.runtime, args...) + cmd.Dir = p.bundle + cmd.Stdin = p.stdio.stdin + cmd.Stdout = p.stdio.stdout + cmd.Stderr = p.stdio.stderr + // Call out to setPDeathSig to set SysProcAttr as elements are platform specific + cmd.SysProcAttr = setPDeathSig() + + if err := cmd.Start(); err != nil { + if exErr, ok := err.(*exec.Error); ok { + if exErr.Err == exec.ErrNotFound || exErr.Err == os.ErrNotExist { + return fmt.Errorf("%s not installed on system", p.runtime) + } + } + return err + } + if runtime.GOOS != "solaris" { + // Since current logic dictates that we need a pid at the end of p.create + // we need to call runtime start as well on Solaris hence we need the + // pipes to stay open. + p.stdio.stdout.Close() + p.stdio.stderr.Close() + } + if err := cmd.Wait(); err != nil { + if _, ok := err.(*exec.ExitError); ok { + return errRuntime + } + return err + } + data, err := ioutil.ReadFile("pid") + if err != nil { + return err + } + pid, err := strconv.Atoi(string(data)) + if err != nil { + return err + } + p.containerPid = pid + return nil +} + +func (p *process) pid() int { + return p.containerPid +} + +func (p *process) delete() error { + if !p.state.Exec { + cmd := exec.Command(p.runtime, append(p.state.RuntimeArgs, "delete", p.id)...) + cmd.SysProcAttr = setPDeathSig() + out, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("%s: %v", out, err) + } + } + return nil +} + +// IO holds all 3 standard io Reader/Writer (stdin,stdout,stderr) +type IO struct { + Stdin io.WriteCloser + Stdout io.ReadCloser + Stderr io.ReadCloser +} + +func (p *process) initializeIO(rootuid int) (i *IO, err error) { + var fds []uintptr + i = &IO{} + // cleanup in case of an error + defer func() { + if err != nil { + for _, fd := range fds { + syscall.Close(int(fd)) + } + } + }() + // STDIN + r, w, err := os.Pipe() + if err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.stdio.stdin, i.Stdin = r, w + // STDOUT + if r, w, err = os.Pipe(); err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.stdio.stdout, i.Stdout = w, r + // STDERR + if r, w, err = os.Pipe(); err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.stdio.stderr, i.Stderr = w, r + // change ownership of the pipes in case we are in a user namespace + for _, fd := range fds { + if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil { + return nil, err + } + } + return i, nil +} +func (p *process) Close() error { + return p.stdio.Close() +} + +type stdio struct { + stdin *os.File + stdout *os.File + stderr *os.File +} + +func (s *stdio) Close() error { + err := s.stdin.Close() + if oerr := s.stdout.Close(); err == nil { + err = oerr + } + if oerr := s.stderr.Close(); err == nil { + err = oerr + } + return err +} diff --git a/cmd/containerd-shim/process_linux.go b/cmd/containerd-shim/process_linux.go new file mode 100644 index 0000000..f42af57 --- /dev/null +++ b/cmd/containerd-shim/process_linux.go @@ -0,0 +1,131 @@ +// +build !solaris + +package main + +import ( + "fmt" + "io" + "os/exec" + "syscall" + "time" + + "github.com/tonistiigi/fifo" + "golang.org/x/net/context" +) + +// setPDeathSig sets the parent death signal to SIGKILL so that if the +// shim dies the container process also dies. +func setPDeathSig() *syscall.SysProcAttr { + return &syscall.SysProcAttr{ + Pdeathsig: syscall.SIGKILL, + } +} + +// openIO opens the pre-created fifo's for use with the container +// in RDWR so that they remain open if the other side stops listening +func (p *process) openIO() error { + p.stdio = &stdio{} + var ( + uid = p.state.RootUID + gid = p.state.RootGID + ) + + ctx, _ := context.WithTimeout(context.Background(), 15*time.Second) + + stdinCloser, err := fifo.OpenFifo(ctx, p.state.Stdin, syscall.O_WRONLY|syscall.O_NONBLOCK, 0) + if err != nil { + return err + } + p.stdinCloser = stdinCloser + + if p.state.Terminal { + master, console, err := newConsole(uid, gid) + if err != nil { + return err + } + p.console = master + p.consolePath = console + stdin, err := fifo.OpenFifo(ctx, p.state.Stdin, syscall.O_RDONLY, 0) + if err != nil { + return err + } + go io.Copy(master, stdin) + stdoutw, err := fifo.OpenFifo(ctx, p.state.Stdout, syscall.O_WRONLY, 0) + if err != nil { + return err + } + stdoutr, err := fifo.OpenFifo(ctx, p.state.Stdout, syscall.O_RDONLY, 0) + if err != nil { + return err + } + p.Add(1) + go func() { + io.Copy(stdoutw, master) + master.Close() + stdoutr.Close() + stdoutw.Close() + p.Done() + }() + return nil + } + i, err := p.initializeIO(uid) + if err != nil { + return err + } + p.shimIO = i + // non-tty + for name, dest := range map[string]func(wc io.WriteCloser, rc io.Closer){ + p.state.Stdout: func(wc io.WriteCloser, rc io.Closer) { + p.Add(1) + go func() { + io.Copy(wc, i.Stdout) + p.Done() + wc.Close() + rc.Close() + }() + }, + p.state.Stderr: func(wc io.WriteCloser, rc io.Closer) { + p.Add(1) + go func() { + io.Copy(wc, i.Stderr) + p.Done() + wc.Close() + rc.Close() + }() + }, + } { + fw, err := fifo.OpenFifo(ctx, name, syscall.O_WRONLY, 0) + if err != nil { + return fmt.Errorf("containerd-shim: opening %s failed: %s", name, err) + } + fr, err := fifo.OpenFifo(ctx, name, syscall.O_RDONLY, 0) + if err != nil { + return fmt.Errorf("containerd-shim: opening %s failed: %s", name, err) + } + dest(fw, fr) + } + + f, err := fifo.OpenFifo(ctx, p.state.Stdin, syscall.O_RDONLY, 0) + if err != nil { + return fmt.Errorf("containerd-shim: opening %s failed: %s", p.state.Stdin, err) + } + go func() { + io.Copy(i.Stdin, f) + i.Stdin.Close() + f.Close() + }() + + return nil +} + +func (p *process) killAll() error { + if !p.state.Exec { + cmd := exec.Command(p.runtime, append(p.state.RuntimeArgs, "kill", "--all", p.id, "SIGKILL")...) + cmd.SysProcAttr = setPDeathSig() + out, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("%s: %v", out, err) + } + } + return nil +} diff --git a/cmd/containerd-shim/process_solaris.go b/cmd/containerd-shim/process_solaris.go new file mode 100644 index 0000000..8a0ceb6 --- /dev/null +++ b/cmd/containerd-shim/process_solaris.go @@ -0,0 +1,70 @@ +// +build solaris + +package main + +import ( + "io" + "os" + "syscall" +) + +// setPDeathSig is a no-op on Solaris as Pdeathsig is not defined. +func setPDeathSig() *syscall.SysProcAttr { + return nil +} + +// TODO: Update to using fifo's package in openIO. Need to +// 1. Merge and vendor changes in the package to use sys/unix. +// 2. Figure out why context.Background is timing out. +// openIO opens the pre-created fifo's for use with the container +// in RDWR so that they remain open if the other side stops listening +func (p *process) openIO() error { + p.stdio = &stdio{} + var ( + uid = p.state.RootUID + ) + i, err := p.initializeIO(uid) + if err != nil { + return err + } + p.shimIO = i + // Both tty and non-tty mode are handled by the runtime using + // the following pipes + for name, dest := range map[string]func(f *os.File){ + p.state.Stdout: func(f *os.File) { + p.Add(1) + go func() { + io.Copy(f, i.Stdout) + p.Done() + }() + }, + p.state.Stderr: func(f *os.File) { + p.Add(1) + go func() { + io.Copy(f, i.Stderr) + p.Done() + }() + }, + } { + f, err := os.OpenFile(name, syscall.O_RDWR, 0) + if err != nil { + return err + } + dest(f) + } + + f, err := os.OpenFile(p.state.Stdin, syscall.O_RDONLY, 0) + if err != nil { + return err + } + go func() { + io.Copy(i.Stdin, f) + i.Stdin.Close() + }() + + return nil +} + +func (p *process) killAll() error { + return nil +}