Merge pull request #462 from crosbymichael/shim

shim: GRPC service
This commit is contained in:
Kenfe-Mickaël Laventure 2017-01-26 16:11:45 -08:00 committed by GitHub
commit 24c2810899
40 changed files with 6741 additions and 846 deletions

3
api/shim/gen.go Normal file
View file

@ -0,0 +1,3 @@
package shim
//go:generate protoc -I.:../../vendor:../../vendor/github.com/gogo/protobuf:../../../../../..:/usr/local/include --gogoctrd_out=plugins=grpc,import_path=github.com/docker/containerd/api/shim,Mgogoproto/gogo.proto=github.com/gogo/protobuf/gogoproto,Mgoogle/protobuf/descriptor.proto=github.com/gogo/protobuf/protoc-gen-gogo/descriptor:. shim.proto

3820
api/shim/shim.pb.go Normal file

File diff suppressed because it is too large Load diff

View file

@ -1,19 +1,116 @@
syntax = "proto3";
package containerd.v1;
package containerd.shim.v1;
import "google/protobuf/empty.proto";
import "gogoproto/gogo.proto";
service ShimService {
rpc Create(CreateRequest) returns (google.protobuf.Empty);
rpc Exec(ExecRequest) returns (google.protobuf.Empty);
rpc State(StateRequest) returns (StateResponse);
service Shim {
rpc Create(CreateRequest) returns (CreateResponse);
rpc Start(StartRequest) returns (google.protobuf.Empty);
rpc Delete(DeleteRequest) returns (DeleteResponse);
rpc Exec(ExecRequest) returns (ExecResponse);
rpc Pty(PtyRequest) returns (google.protobuf.Empty);
rpc Events(EventsRequest) returns (stream Event);
rpc State(StateRequest) returns (StateResponse);
}
message CreateRequest {
string id = 1 [(gogoproto.customname) = "ID"];
string bundle = 2;
string runtime = 3;
bool no_pivot = 4;
bool terminal = 5;
string stdin = 6;
string stdout = 7;
string stderr = 8;
}
message CreateResponse {
uint32 pid = 1;
}
message StartRequest {
}
message DeleteRequest {
uint32 pid = 1;
}
message DeleteResponse {
uint32 exit_status = 1;
}
message ExecRequest {
bool terminal = 1;
string stdin = 2;
string stdout = 3;
string stderr = 4;
string selinux_label = 5;
User user = 6;
repeated string args = 7;
repeated string env = 8;
string cwd = 9;
repeated string capabilities = 10;
repeated Rlimit rlimits = 11;
bool no_new_privileges = 12;
string apparmor_profile = 13;
}
message User {
uint32 uid = 1;
uint32 gid = 2;
repeated uint32 additional_gids = 3;
}
message Rlimit {
string type = 1;
uint64 hard = 2;
uint64 soft = 3;
}
message ExecResponse {
uint32 pid = 1;
}
message PtyRequest {
string id = 1 [(gogoproto.customname) = "ID"];
uint32 pid = 1;
uint32 width = 2;
uint32 height = 3;
}
message EventsRequest {
}
enum EventType {
EXIT = 0;
OOM = 1;
CREATED = 2;
STARTED = 3;
EXEC_ADDED = 4;
}
message Event {
string id = 1 [(gogoproto.customname) = "ID"];
EventType type = 2;
uint32 pid = 3;
uint32 exit_status = 4;
}
message StateRequest {
}
message StateResponse {
string id = 1 [(gogoproto.customname) = "ID"];
repeated Process processes = 2;
}
enum State {
STOPPED = 0;
RUNNING = 1;
}
message Process {
uint32 pid = 1;
State state = 2;
}

View file

@ -1,82 +0,0 @@
// +build !solaris
package main
import (
"fmt"
"os"
"syscall"
"unsafe"
)
// NewConsole returns an initialized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process.
func newConsole(uid, gid int) (*os.File, string, error) {
master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
if err != nil {
return nil, "", err
}
if err = saneTerminal(master); err != nil {
return nil, "", err
}
console, err := ptsname(master)
if err != nil {
return nil, "", err
}
if err := unlockpt(master); err != nil {
return nil, "", err
}
if err := os.Chmod(console, 0600); err != nil {
return nil, "", err
}
if err := os.Chown(console, uid, gid); err != nil {
return nil, "", err
}
return master, console, nil
}
// saneTerminal sets the necessary tty_ioctl(4)s to ensure that a pty pair
// created by us acts normally. In particular, a not-very-well-known default of
// Linux unix98 ptys is that they have +onlcr by default. While this isn't a
// problem for terminal emulators, because we relay data from the terminal we
// also relay that funky line discipline.
func saneTerminal(terminal *os.File) error {
// Go doesn't have a wrapper for any of the termios ioctls.
var termios syscall.Termios
if err := ioctl(terminal.Fd(), syscall.TCGETS, uintptr(unsafe.Pointer(&termios))); err != nil {
return fmt.Errorf("ioctl(tty, tcgets): %s", err.Error())
}
// Set -onlcr so we don't have to deal with \r.
termios.Oflag &^= syscall.ONLCR
if err := ioctl(terminal.Fd(), syscall.TCSETS, uintptr(unsafe.Pointer(&termios))); err != nil {
return fmt.Errorf("ioctl(tty, tcsets): %s", err.Error())
}
return nil
}
func ioctl(fd uintptr, flag, data uintptr) error {
if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 {
return err
}
return nil
}
// unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f.
// unlockpt should be called before opening the slave side of a pty.
func unlockpt(f *os.File) error {
var u int32
return ioctl(f.Fd(), syscall.TIOCSPTLCK, uintptr(unsafe.Pointer(&u)))
}
// ptsname retrieves the name of the first available pts for the given master.
func ptsname(f *os.File) (string, error) {
var n int32
if err := ioctl(f.Fd(), syscall.TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil {
return "", err
}
return fmt.Sprintf("/dev/pts/%d", n), nil
}

View file

@ -1,14 +0,0 @@
// +build solaris
package main
import (
"errors"
"os"
)
// NewConsole returns an initialized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process.
func newConsole(uid, gid int) (*os.File, string, error) {
return nil, "", errors.New("newConsole not implemented on Solaris")
}

View file

@ -1,210 +1,129 @@
package main
import (
"flag"
"fmt"
"os"
"os/signal"
"path/filepath"
"runtime"
"strings"
"syscall"
"google.golang.org/grpc"
"github.com/Sirupsen/logrus"
"github.com/docker/containerd"
apishim "github.com/docker/containerd/api/shim"
"github.com/docker/containerd/shim"
"github.com/docker/containerd/sys"
"github.com/docker/docker/pkg/term"
"github.com/docker/containerd/utils"
"github.com/urfave/cli"
)
func writeMessage(f *os.File, level string, err error) {
fmt.Fprintf(f, `{"level": "%s","msg": "%s"}`, level, err)
f.Sync()
}
const usage = `
__ _ __ __ _
_________ ____ / /_____ _(_)___ ___ _________/ / _____/ /_ (_)___ ___
/ ___/ __ \/ __ \/ __/ __ ` + "`" + `/ / __ \/ _ \/ ___/ __ /_____/ ___/ __ \/ / __ ` + "`" + `__ \
/ /__/ /_/ / / / / /_/ /_/ / / / / / __/ / / /_/ /_____(__ ) / / / / / / / / /
\___/\____/_/ /_/\__/\__,_/_/_/ /_/\___/_/ \__,_/ /____/_/ /_/_/_/ /_/ /_/
type controlMessage struct {
Type int
Width int
Height int
}
shim for container lifecycle and reconnection
`
// containerd-shim is a small shim that sits in front of a runtime implementation
// that allows it to be reparented to init and handle reattach from the caller.
//
// the cwd of the shim should be the path to the state directory where the shim
// can locate fifos and other information.
// Arg0: id of the container
// Arg1: bundle path
// Arg2: runtime binary
func main() {
flag.Parse()
cwd, err := os.Getwd()
app := cli.NewApp()
app.Name = "containerd-shim"
app.Version = containerd.Version
app.Usage = usage
app.Flags = []cli.Flag{
cli.BoolFlag{
Name: "debug",
Usage: "enable debug output in logs",
},
}
app.Before = func(context *cli.Context) error {
if context.GlobalBool("debug") {
logrus.SetLevel(logrus.DebugLevel)
}
return nil
}
app.Action = func(context *cli.Context) error {
// start handling signals as soon as possible so that things are properly reaped
// or if runtime exits before we hit the handler
signals, err := setupSignals()
if err != nil {
panic(err)
return err
}
f, err := os.OpenFile(filepath.Join(cwd, "shim-log.json"), os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0666)
if err != nil {
panic(err)
var (
server = grpc.NewServer()
sv = shim.NewService()
)
logrus.Debug("registering grpc server")
apishim.RegisterShimServer(server, sv)
if err := serve(server, "shim.sock"); err != nil {
return err
}
if err := start(f); err != nil {
// this means that the runtime failed starting the container and will have the
// proper error messages in the runtime log so we should to treat this as a
// shim failure because the sim executed properly
if err == errRuntime {
f.Close()
return
return handleSignals(signals, server, sv)
}
// log the error instead of writing to stderr because the shim will have
// /dev/null as it's stdio because it is supposed to be reparented to system
// init and will not have anyone to read from it
writeMessage(f, "error", err)
f.Close()
if err := app.Run(os.Args); err != nil {
fmt.Fprintf(os.Stderr, "containerd-shim: %s\n", err)
os.Exit(1)
}
}
func start(log *os.File) error {
// start handling signals as soon as possible so that things are properly reaped
// or if runtime exits before we hit the handler
// setupSignals creates a new signal handler for all signals and sets the shim as a
// sub-reaper so that the container processes are reparented
func setupSignals() (chan os.Signal, error) {
signals := make(chan os.Signal, 2048)
signal.Notify(signals)
// set the shim as the subreaper for all orphaned processes created by the container
if err := sys.SetSubreaper(1); err != nil {
return err
return nil, err
}
// open the exit pipe
f, err := os.OpenFile("exit", syscall.O_WRONLY, 0)
return signals, nil
}
// serve serves the grpc API over a unix socket at the provided path
// this function does not block
func serve(server *grpc.Server, path string) error {
l, err := utils.CreateUnixSocket(path)
if err != nil {
return err
}
defer f.Close()
control, err := os.OpenFile("control", syscall.O_RDWR, 0)
if err != nil {
return err
}
defer control.Close()
p, err := newProcess(flag.Arg(0), flag.Arg(1), flag.Arg(2))
if err != nil {
return err
}
defer func() {
if err := p.Close(); err != nil {
writeMessage(log, "warn", err)
}
}()
if err := p.create(); err != nil {
p.delete()
return err
}
msgC := make(chan controlMessage, 32)
logrus.WithField("socket", path).Debug("serving api on unix socket")
go func() {
for {
var m controlMessage
if _, err := fmt.Fscanf(control, "%d %d %d\n", &m.Type, &m.Width, &m.Height); err != nil {
continue
}
msgC <- m
defer l.Close()
if err := server.Serve(l); err != nil &&
!strings.Contains(err.Error(), "use of closed network connection") {
l.Close()
logrus.WithError(err).Fatal("containerd-shim: GRPC server failure")
}
}()
if runtime.GOOS == "solaris" {
return nil
}
var exitShim bool
for !exitShim {
select {
case s := <-signals:
func handleSignals(signals chan os.Signal, server *grpc.Server, service *shim.Service) error {
for s := range signals {
logrus.WithField("signal", s).Debug("received signal")
switch s {
case syscall.SIGCHLD:
exits, _ := Reap(false)
exits, err := utils.Reap(false)
if err != nil {
logrus.WithError(err).Error("reap exit status")
}
for _, e := range exits {
// check to see if runtime is one of the processes that has exited
if e.Pid == p.pid() {
exitShim = true
writeInt("exitStatus", e.Status)
logrus.WithFields(logrus.Fields{
"status": e.Status,
"pid": e.Pid,
}).Debug("process exited")
if err := service.ProcessExit(e); err != nil {
return err
}
}
}
case msg := <-msgC:
switch msg.Type {
case 0:
// close stdin
if p.stdinCloser != nil {
p.stdinCloser.Close()
}
case 1:
if p.console == nil {
continue
}
ws := term.Winsize{
Width: uint16(msg.Width),
Height: uint16(msg.Height),
}
term.SetWinsize(p.console.Fd(), &ws)
}
}
}
// runtime has exited so the shim can also exit
// kill all processes in the container incase it was not running in
// its own PID namespace
p.killAll()
// wait for all the processes and IO to finish
p.Wait()
// delete the container from the runtime
p.delete()
// the close of the exit fifo will happen when the shim exits
case syscall.SIGTERM, syscall.SIGINT:
// TODO: should we forward signals to the processes if they are still running?
// i.e. machine reboot
server.Stop()
return nil
}
func writeInt(path string, i int) error {
f, err := os.Create(path)
if err != nil {
return err
}
defer f.Close()
_, err = fmt.Fprintf(f, "%d", i)
return err
}
// Exit is the wait4 information from an exited process
type Exit struct {
Pid int
Status int
}
// Reap reaps all child processes for the calling process and returns their
// exit information
func Reap(wait bool) (exits []Exit, err error) {
var (
ws syscall.WaitStatus
rus syscall.Rusage
)
flag := syscall.WNOHANG
if wait {
flag = 0
}
for {
pid, err := syscall.Wait4(-1, &ws, flag, &rus)
if err != nil {
if err == syscall.ECHILD {
return exits, nil
}
return exits, err
}
if pid <= 0 {
return exits, nil
}
exits = append(exits, Exit{
Pid: pid,
Status: exitStatus(ws),
})
}
}
const exitSignalOffset = 128
// exitStatus returns the correct exit status for a process based on if it
// was signaled or exited cleanly
func exitStatus(status syscall.WaitStatus) int {
if status.Signaled() {
return exitSignalOffset + int(status.Signal())
}
return status.ExitStatus()
return nil
}

View file

@ -1,295 +0,0 @@
package main
import (
"encoding/json"
"errors"
"fmt"
"io"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"sync"
"syscall"
"time"
)
var errRuntime = errors.New("shim: runtime execution error")
type checkpoint struct {
// Timestamp is the time that checkpoint happened
Created time.Time `json:"created"`
// Name is the name of the checkpoint
Name string `json:"name"`
// TCP checkpoints open tcp connections
TCP bool `json:"tcp"`
// UnixSockets persists unix sockets in the checkpoint
UnixSockets bool `json:"unixSockets"`
// Shell persists tty sessions in the checkpoint
Shell bool `json:"shell"`
// Exit exits the container after the checkpoint is finished
Exit bool `json:"exit"`
// EmptyNS tells CRIU not to restore a particular namespace
EmptyNS []string `json:"emptyNS,omitempty"`
}
type processState struct {
Terminal bool `json:"terminal"`
Exec bool `json:"exec"`
Stdin string `json:"containerdStdin"`
Stdout string `json:"containerdStdout"`
Stderr string `json:"containerdStderr"`
RuntimeArgs []string `json:"runtimeArgs"`
NoPivotRoot bool `json:"noPivotRoot"`
CheckpointPath string `json:"checkpoint"`
RootUID int `json:"rootUID"`
RootGID int `json:"rootGID"`
}
type process struct {
sync.WaitGroup
id string
bundle string
stdio *stdio
exec bool
containerPid int
checkpoint *checkpoint
checkpointPath string
shimIO *IO
stdinCloser io.Closer
console *os.File
consolePath string
state *processState
runtime string
}
func newProcess(id, bundle, runtimeName string) (*process, error) {
p := &process{
id: id,
bundle: bundle,
runtime: runtimeName,
}
s, err := loadProcess()
if err != nil {
return nil, err
}
p.state = s
if s.CheckpointPath != "" {
cpt, err := loadCheckpoint(s.CheckpointPath)
if err != nil {
return nil, err
}
p.checkpoint = cpt
p.checkpointPath = s.CheckpointPath
}
if err := p.openIO(); err != nil {
return nil, err
}
return p, nil
}
func loadProcess() (*processState, error) {
f, err := os.Open("process.json")
if err != nil {
return nil, err
}
defer f.Close()
var s processState
if err := json.NewDecoder(f).Decode(&s); err != nil {
return nil, err
}
return &s, nil
}
func loadCheckpoint(checkpointPath string) (*checkpoint, error) {
f, err := os.Open(filepath.Join(checkpointPath, "config.json"))
if err != nil {
return nil, err
}
defer f.Close()
var cpt checkpoint
if err := json.NewDecoder(f).Decode(&cpt); err != nil {
return nil, err
}
return &cpt, nil
}
func (p *process) create() error {
cwd, err := os.Getwd()
if err != nil {
return err
}
logPath := filepath.Join(cwd, "log.json")
args := append([]string{
"--log", logPath,
"--log-format", "json",
}, p.state.RuntimeArgs...)
if p.state.Exec {
args = append(args, "exec",
"-d",
"--process", filepath.Join(cwd, "process.json"),
"--console", p.consolePath,
)
} else if p.checkpoint != nil {
args = append(args, "restore",
"-d",
"--image-path", p.checkpointPath,
"--work-path", filepath.Join(p.checkpointPath, "criu.work", "restore-"+time.Now().Format(time.RFC3339)),
)
add := func(flags ...string) {
args = append(args, flags...)
}
if p.checkpoint.Shell {
add("--shell-job")
}
if p.checkpoint.TCP {
add("--tcp-established")
}
if p.checkpoint.UnixSockets {
add("--ext-unix-sk")
}
if p.state.NoPivotRoot {
add("--no-pivot")
}
for _, ns := range p.checkpoint.EmptyNS {
add("--empty-ns", ns)
}
} else {
args = append(args, "create",
"--bundle", p.bundle,
"--console", p.consolePath,
)
if p.state.NoPivotRoot {
args = append(args, "--no-pivot")
}
}
args = append(args,
"--pid-file", filepath.Join(cwd, "pid"),
p.id,
)
cmd := exec.Command(p.runtime, args...)
cmd.Dir = p.bundle
cmd.Stdin = p.stdio.stdin
cmd.Stdout = p.stdio.stdout
cmd.Stderr = p.stdio.stderr
// Call out to setPDeathSig to set SysProcAttr as elements are platform specific
cmd.SysProcAttr = setPDeathSig()
if err := cmd.Start(); err != nil {
if exErr, ok := err.(*exec.Error); ok {
if exErr.Err == exec.ErrNotFound || exErr.Err == os.ErrNotExist {
return fmt.Errorf("%s not installed on system", p.runtime)
}
}
return err
}
if runtime.GOOS != "solaris" {
// Since current logic dictates that we need a pid at the end of p.create
// we need to call runtime start as well on Solaris hence we need the
// pipes to stay open.
p.stdio.stdout.Close()
p.stdio.stderr.Close()
}
if err := cmd.Wait(); err != nil {
if _, ok := err.(*exec.ExitError); ok {
return errRuntime
}
return err
}
data, err := ioutil.ReadFile("pid")
if err != nil {
return err
}
pid, err := strconv.Atoi(string(data))
if err != nil {
return err
}
p.containerPid = pid
return nil
}
func (p *process) pid() int {
return p.containerPid
}
func (p *process) delete() error {
if !p.state.Exec {
cmd := exec.Command(p.runtime, append(p.state.RuntimeArgs, "delete", p.id)...)
cmd.SysProcAttr = setPDeathSig()
out, err := cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("%s: %v", out, err)
}
}
return nil
}
// IO holds all 3 standard io Reader/Writer (stdin,stdout,stderr)
type IO struct {
Stdin io.WriteCloser
Stdout io.ReadCloser
Stderr io.ReadCloser
}
func (p *process) initializeIO(rootuid int) (i *IO, err error) {
var fds []uintptr
i = &IO{}
// cleanup in case of an error
defer func() {
if err != nil {
for _, fd := range fds {
syscall.Close(int(fd))
}
}
}()
// STDIN
r, w, err := os.Pipe()
if err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.stdio.stdin, i.Stdin = r, w
// STDOUT
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.stdio.stdout, i.Stdout = w, r
// STDERR
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.stdio.stderr, i.Stderr = w, r
// change ownership of the pipes in case we are in a user namespace
for _, fd := range fds {
if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil {
return nil, err
}
}
return i, nil
}
func (p *process) Close() error {
return p.stdio.Close()
}
type stdio struct {
stdin *os.File
stdout *os.File
stderr *os.File
}
func (s *stdio) Close() error {
err := s.stdin.Close()
if oerr := s.stdout.Close(); err == nil {
err = oerr
}
if oerr := s.stderr.Close(); err == nil {
err = oerr
}
return err
}

View file

@ -1,131 +0,0 @@
// +build !solaris
package main
import (
"fmt"
"io"
"os/exec"
"syscall"
"time"
"github.com/tonistiigi/fifo"
"golang.org/x/net/context"
)
// setPDeathSig sets the parent death signal to SIGKILL so that if the
// shim dies the container process also dies.
func setPDeathSig() *syscall.SysProcAttr {
return &syscall.SysProcAttr{
Pdeathsig: syscall.SIGKILL,
}
}
// openIO opens the pre-created fifo's for use with the container
// in RDWR so that they remain open if the other side stops listening
func (p *process) openIO() error {
p.stdio = &stdio{}
var (
uid = p.state.RootUID
gid = p.state.RootGID
)
ctx, _ := context.WithTimeout(context.Background(), 15*time.Second)
stdinCloser, err := fifo.OpenFifo(ctx, p.state.Stdin, syscall.O_WRONLY|syscall.O_NONBLOCK, 0)
if err != nil {
return err
}
p.stdinCloser = stdinCloser
if p.state.Terminal {
master, console, err := newConsole(uid, gid)
if err != nil {
return err
}
p.console = master
p.consolePath = console
stdin, err := fifo.OpenFifo(ctx, p.state.Stdin, syscall.O_RDONLY, 0)
if err != nil {
return err
}
go io.Copy(master, stdin)
stdoutw, err := fifo.OpenFifo(ctx, p.state.Stdout, syscall.O_WRONLY, 0)
if err != nil {
return err
}
stdoutr, err := fifo.OpenFifo(ctx, p.state.Stdout, syscall.O_RDONLY, 0)
if err != nil {
return err
}
p.Add(1)
go func() {
io.Copy(stdoutw, master)
master.Close()
stdoutr.Close()
stdoutw.Close()
p.Done()
}()
return nil
}
i, err := p.initializeIO(uid)
if err != nil {
return err
}
p.shimIO = i
// non-tty
for name, dest := range map[string]func(wc io.WriteCloser, rc io.Closer){
p.state.Stdout: func(wc io.WriteCloser, rc io.Closer) {
p.Add(1)
go func() {
io.Copy(wc, i.Stdout)
p.Done()
wc.Close()
rc.Close()
}()
},
p.state.Stderr: func(wc io.WriteCloser, rc io.Closer) {
p.Add(1)
go func() {
io.Copy(wc, i.Stderr)
p.Done()
wc.Close()
rc.Close()
}()
},
} {
fw, err := fifo.OpenFifo(ctx, name, syscall.O_WRONLY, 0)
if err != nil {
return fmt.Errorf("containerd-shim: opening %s failed: %s", name, err)
}
fr, err := fifo.OpenFifo(ctx, name, syscall.O_RDONLY, 0)
if err != nil {
return fmt.Errorf("containerd-shim: opening %s failed: %s", name, err)
}
dest(fw, fr)
}
f, err := fifo.OpenFifo(ctx, p.state.Stdin, syscall.O_RDONLY, 0)
if err != nil {
return fmt.Errorf("containerd-shim: opening %s failed: %s", p.state.Stdin, err)
}
go func() {
io.Copy(i.Stdin, f)
i.Stdin.Close()
f.Close()
}()
return nil
}
func (p *process) killAll() error {
if !p.state.Exec {
cmd := exec.Command(p.runtime, append(p.state.RuntimeArgs, "kill", "--all", p.id, "SIGKILL")...)
cmd.SysProcAttr = setPDeathSig()
out, err := cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("%s: %v", out, err)
}
}
return nil
}

View file

@ -1,70 +0,0 @@
// +build solaris
package main
import (
"io"
"os"
"syscall"
)
// setPDeathSig is a no-op on Solaris as Pdeathsig is not defined.
func setPDeathSig() *syscall.SysProcAttr {
return nil
}
// TODO: Update to using fifo's package in openIO. Need to
// 1. Merge and vendor changes in the package to use sys/unix.
// 2. Figure out why context.Background is timing out.
// openIO opens the pre-created fifo's for use with the container
// in RDWR so that they remain open if the other side stops listening
func (p *process) openIO() error {
p.stdio = &stdio{}
var (
uid = p.state.RootUID
)
i, err := p.initializeIO(uid)
if err != nil {
return err
}
p.shimIO = i
// Both tty and non-tty mode are handled by the runtime using
// the following pipes
for name, dest := range map[string]func(f *os.File){
p.state.Stdout: func(f *os.File) {
p.Add(1)
go func() {
io.Copy(f, i.Stdout)
p.Done()
}()
},
p.state.Stderr: func(f *os.File) {
p.Add(1)
go func() {
io.Copy(f, i.Stderr)
p.Done()
}()
},
} {
f, err := os.OpenFile(name, syscall.O_RDWR, 0)
if err != nil {
return err
}
dest(f)
}
f, err := os.OpenFile(p.state.Stdin, syscall.O_RDONLY, 0)
if err != nil {
return err
}
go func() {
io.Copy(i.Stdin, f)
i.Stdin.Close()
}()
return nil
}
func (p *process) killAll() error {
return nil
}

View file

@ -23,6 +23,7 @@ import (
"github.com/docker/containerd/execution"
"github.com/docker/containerd/execution/executors/shim"
"github.com/docker/containerd/log"
"github.com/docker/containerd/utils"
metrics "github.com/docker/go-metrics"
"github.com/urfave/cli"
@ -30,11 +31,7 @@ import (
stand "github.com/nats-io/nats-streaming-server/server"
)
func main() {
app := cli.NewApp()
app.Name = "containerd"
app.Version = containerd.Version
app.Usage = `
const usage = `
__ _ __
_________ ____ / /_____ _(_)___ ___ _________/ /
/ ___/ __ \/ __ \/ __/ __ ` + "`" + `/ / __ \/ _ \/ ___/ __ /
@ -43,6 +40,12 @@ func main() {
high performance container runtime
`
func main() {
app := cli.NewApp()
app.Name = "containerd"
app.Version = containerd.Version
app.Usage = usage
app.Flags = []cli.Flag{
cli.BoolFlag{
Name: "debug",
@ -98,7 +101,7 @@ high performance container runtime
if path == "" {
return fmt.Errorf("--socket path cannot be empty")
}
l, err := createUnixSocket(path)
l, err := utils.CreateUnixSocket(path)
if err != nil {
return err
}
@ -171,16 +174,6 @@ high performance container runtime
}
}
func createUnixSocket(path string) (net.Listener, error) {
if err := os.MkdirAll(filepath.Dir(path), 0660); err != nil {
return nil, err
}
if err := syscall.Unlink(path); err != nil && !os.IsNotExist(err) {
return nil, err
}
return net.Listen("unix", path)
}
func serveMetrics(address string) {
m := http.NewServeMux()
m.Handle("/metrics", metrics.Handler())

View file

@ -40,6 +40,7 @@ containerd client
deleteCommand,
listCommand,
inspectCommand,
shimCommand,
}
app.Before = func(context *cli.Context) error {
if context.GlobalBool("debug") {
@ -48,7 +49,7 @@ containerd client
return nil
}
if err := app.Run(os.Args); err != nil {
fmt.Fprintf(os.Stderr, "containerd: %s\n", err)
fmt.Fprintf(os.Stderr, "ctr: %s\n", err)
os.Exit(1)
}
}

296
cmd/ctr/shim.go Normal file
View file

@ -0,0 +1,296 @@
package main
import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"log"
"net"
"os"
"strconv"
"time"
gocontext "context"
"google.golang.org/grpc"
"google.golang.org/grpc/grpclog"
"github.com/Sirupsen/logrus"
"github.com/crosbymichael/console"
"github.com/docker/containerd/api/shim"
"github.com/urfave/cli"
)
var fifoFlags = []cli.Flag{
cli.StringFlag{
Name: "stdin",
Usage: "specify the path to the stdin fifo",
},
cli.StringFlag{
Name: "stdout",
Usage: "specify the path to the stdout fifo",
},
cli.StringFlag{
Name: "stderr",
Usage: "specify the path to the stderr fifo",
},
cli.BoolFlag{
Name: "tty,t",
Usage: "enable tty support",
},
}
var shimCommand = cli.Command{
Name: "shim",
Usage: "interact with a shim directly",
Subcommands: []cli.Command{
shimCreateCommand,
shimStartCommand,
shimDeleteCommand,
shimEventsCommand,
shimStateCommand,
shimExecCommand,
},
}
var shimCreateCommand = cli.Command{
Name: "create",
Usage: "create a container with a shim",
Flags: append(fifoFlags,
cli.StringFlag{
Name: "bundle",
Usage: "bundle path for the container",
},
cli.StringFlag{
Name: "runtime",
Value: "runc",
Usage: "runtime to use for the container",
},
cli.BoolFlag{
Name: "attach,a",
Usage: "stay attached to the container and open the fifos",
},
),
Action: func(context *cli.Context) error {
id := context.Args().First()
if id == "" {
return fmt.Errorf("container id must be provided")
}
service, err := getShimService()
if err != nil {
return err
}
tty := context.Bool("tty")
wg, err := prepareStdio(context.String("stdin"), context.String("stdout"), context.String("stderr"), tty)
if err != nil {
return err
}
r, err := service.Create(gocontext.Background(), &shim.CreateRequest{
ID: id,
Bundle: context.String("bundle"),
Runtime: context.String("runtime"),
Stdin: context.String("stdin"),
Stdout: context.String("stdout"),
Stderr: context.String("stderr"),
Terminal: tty,
})
if err != nil {
return err
}
fmt.Printf("container created with id %s and pid %d\n", id, r.Pid)
if context.Bool("attach") {
if tty {
current := console.Current()
defer current.Reset()
if err := current.SetRaw(); err != nil {
return err
}
size, err := current.Size()
if err != nil {
return err
}
if _, err := service.Pty(gocontext.Background(), &shim.PtyRequest{
Pid: r.Pid,
Width: uint32(size.Width),
Height: uint32(size.Height),
}); err != nil {
return err
}
}
wg.Wait()
}
return nil
},
}
var shimStartCommand = cli.Command{
Name: "start",
Usage: "start a container with a shim",
Action: func(context *cli.Context) error {
service, err := getShimService()
if err != nil {
return err
}
_, err = service.Start(gocontext.Background(), &shim.StartRequest{})
return err
},
}
var shimDeleteCommand = cli.Command{
Name: "delete",
Usage: "delete a container with a shim",
Action: func(context *cli.Context) error {
service, err := getShimService()
if err != nil {
return err
}
pid, err := strconv.Atoi(context.Args().First())
if err != nil {
return err
}
r, err := service.Delete(gocontext.Background(), &shim.DeleteRequest{
Pid: uint32(pid),
})
if err != nil {
return err
}
fmt.Printf("container deleted and returned exit status %d\n", r.ExitStatus)
return nil
},
}
var shimStateCommand = cli.Command{
Name: "state",
Usage: "get the state of all the processes of the shim",
Action: func(context *cli.Context) error {
service, err := getShimService()
if err != nil {
return err
}
r, err := service.State(gocontext.Background(), &shim.StateRequest{})
if err != nil {
return err
}
data, err := json.Marshal(r)
if err != nil {
return err
}
buf := bytes.NewBuffer(nil)
if err := json.Indent(buf, data, " ", " "); err != nil {
return err
}
buf.WriteTo(os.Stdout)
return nil
},
}
var shimExecCommand = cli.Command{
Name: "exec",
Usage: "exec a new process in the shim's container",
Flags: append(fifoFlags,
cli.BoolFlag{
Name: "attach,a",
Usage: "stay attached to the container and open the fifos",
},
cli.StringSliceFlag{
Name: "env,e",
Usage: "add environment vars",
Value: &cli.StringSlice{},
},
cli.StringFlag{
Name: "cwd",
Usage: "current working directory",
},
),
Action: func(context *cli.Context) error {
service, err := getShimService()
if err != nil {
return err
}
tty := context.Bool("tty")
wg, err := prepareStdio(context.String("stdin"), context.String("stdout"), context.String("stderr"), tty)
if err != nil {
return err
}
rq := &shim.ExecRequest{
Args: []string(context.Args()),
Env: context.StringSlice("env"),
Cwd: context.String("cwd"),
Stdin: context.String("stdin"),
Stdout: context.String("stdout"),
Stderr: context.String("stderr"),
Terminal: tty,
}
r, err := service.Exec(gocontext.Background(), rq)
if err != nil {
return err
}
fmt.Printf("exec running with pid %d\n", r.Pid)
if context.Bool("attach") {
logrus.Info("attaching")
if tty {
current := console.Current()
defer current.Reset()
if err := current.SetRaw(); err != nil {
return err
}
size, err := current.Size()
if err != nil {
return err
}
if _, err := service.Pty(gocontext.Background(), &shim.PtyRequest{
Pid: r.Pid,
Width: uint32(size.Width),
Height: uint32(size.Height),
}); err != nil {
return err
}
}
wg.Wait()
}
return nil
},
}
var shimEventsCommand = cli.Command{
Name: "events",
Usage: "get events for a shim",
Action: func(context *cli.Context) error {
service, err := getShimService()
if err != nil {
return err
}
events, err := service.Events(gocontext.Background(), &shim.EventsRequest{})
if err != nil {
return err
}
for {
e, err := events.Recv()
if err != nil {
return err
}
fmt.Printf("type=%s id=%s pid=%d status=%d\n", e.Type, e.ID, e.Pid, e.ExitStatus)
}
return nil
},
}
func getShimService() (shim.ShimClient, error) {
bindSocket := "shim.sock"
// reset the logger for grpc to log to dev/null so that it does not mess with our stdio
grpclog.SetLogger(log.New(ioutil.Discard, "", log.LstdFlags))
dialOpts := []grpc.DialOption{grpc.WithInsecure(), grpc.WithTimeout(100 * time.Second)}
dialOpts = append(dialOpts,
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
return net.DialTimeout("unix", bindSocket, timeout)
},
))
conn, err := grpc.Dial(fmt.Sprintf("unix://%s", bindSocket), dialOpts...)
if err != nil {
return nil, err
}
return shim.NewShimClient(conn), nil
}

View file

@ -14,6 +14,7 @@ import (
gocontext "context"
"github.com/Sirupsen/logrus"
"github.com/docker/containerd/api/execution"
"github.com/tonistiigi/fifo"
"github.com/urfave/cli"
@ -38,6 +39,7 @@ func prepareStdio(stdin, stdout, stderr string, console bool) (*sync.WaitGroup,
}(f)
go func(w io.WriteCloser) {
io.Copy(w, os.Stdin)
logrus.Info("stdin copy finished")
w.Close()
}(f)
@ -54,6 +56,7 @@ func prepareStdio(stdin, stdout, stderr string, console bool) (*sync.WaitGroup,
go func(r io.ReadCloser) {
io.Copy(os.Stdout, r)
r.Close()
logrus.Info("stdout copy finished")
wg.Done()
}(f)
@ -71,6 +74,7 @@ func prepareStdio(stdin, stdout, stderr string, console bool) (*sync.WaitGroup,
go func(r io.ReadCloser) {
io.Copy(os.Stderr, r)
r.Close()
logrus.Info("stderr copy finished")
wg.Done()
}(f)
}
@ -115,7 +119,6 @@ func getTempDir(id string) (string, error) {
if err != nil {
return "", err
}
tmpDir, err := ioutil.TempDir(filepath.Join(os.TempDir(), "ctr"), fmt.Sprintf("%s-", id))
if err != nil {
return "", err

142
shim/exec.go Normal file
View file

@ -0,0 +1,142 @@
package shim
import (
"context"
"fmt"
"os"
"path/filepath"
"sync"
"github.com/crosbymichael/console"
runc "github.com/crosbymichael/go-runc"
apishim "github.com/docker/containerd/api/shim"
specs "github.com/opencontainers/runtime-spec/specs-go"
)
type execProcess struct {
sync.WaitGroup
id int
console console.Console
io runc.IO
status int
pid int
parent *initProcess
}
func newExecProcess(context context.Context, r *apishim.ExecRequest, parent *initProcess, id int) (process, error) {
cwd, err := os.Getwd()
if err != nil {
return nil, err
}
e := &execProcess{
id: id,
parent: parent,
}
var (
socket *runc.ConsoleSocket
io runc.IO
pidfile = filepath.Join(cwd, fmt.Sprintf("%d.pid", id))
)
if r.Terminal {
if socket, err = runc.NewConsoleSocket(filepath.Join(cwd, "pty.sock")); err != nil {
return nil, err
}
defer os.Remove(socket.Path())
} else {
// TODO: get uid/gid
if io, err = runc.NewPipeIO(0, 0); err != nil {
return nil, err
}
e.io = io
}
opts := &runc.ExecOpts{
PidFile: pidfile,
ConsoleSocket: socket,
IO: io,
Detach: true,
}
if err := parent.runc.Exec(context, parent.id, processFromRequest(r), opts); err != nil {
return nil, err
}
if socket != nil {
console, err := socket.ReceiveMaster()
if err != nil {
return nil, err
}
e.console = console
if err := copyConsole(context, console, r.Stdin, r.Stdout, r.Stderr, &e.WaitGroup); err != nil {
return nil, err
}
} else {
if err := copyPipes(context, io, r.Stdin, r.Stdout, r.Stderr, &e.WaitGroup); err != nil {
return nil, err
}
}
pid, err := runc.ReadPidFile(opts.PidFile)
if err != nil {
return nil, err
}
e.pid = pid
return e, nil
}
func processFromRequest(r *apishim.ExecRequest) specs.Process {
var user specs.User
if r.User != nil {
user.UID = r.User.Uid
user.GID = r.User.Gid
user.AdditionalGids = r.User.AdditionalGids
}
return specs.Process{
Terminal: r.Terminal,
User: user,
Rlimits: rlimits(r.Rlimits),
Args: r.Args,
Env: r.Env,
Cwd: r.Cwd,
Capabilities: r.Capabilities,
NoNewPrivileges: r.NoNewPrivileges,
ApparmorProfile: r.ApparmorProfile,
SelinuxLabel: r.SelinuxLabel,
}
}
func rlimits(rr []*apishim.Rlimit) (o []specs.LinuxRlimit) {
for _, r := range rr {
o = append(o, specs.LinuxRlimit{
Type: r.Type,
Hard: r.Hard,
Soft: r.Soft,
})
}
return o
}
func (e *execProcess) Pid() int {
return e.pid
}
func (e *execProcess) Status() int {
return e.status
}
func (e *execProcess) Exited(status int) {
e.status = status
e.Wait()
if e.io != nil {
e.io.Close()
}
}
func (e *execProcess) Delete(ctx context.Context) error {
return nil
}
func (e *execProcess) Resize(ws console.WinSize) error {
if e.console == nil {
return nil
}
return e.console.Resize(ws)
}

127
shim/init.go Normal file
View file

@ -0,0 +1,127 @@
package shim
import (
"context"
"os"
"path/filepath"
"sync"
"syscall"
"github.com/crosbymichael/console"
runc "github.com/crosbymichael/go-runc"
apishim "github.com/docker/containerd/api/shim"
)
type initProcess struct {
sync.WaitGroup
id string
bundle string
console console.Console
io runc.IO
runc *runc.Runc
status int
pid int
}
func newInitProcess(context context.Context, r *apishim.CreateRequest) (*initProcess, error) {
cwd, err := os.Getwd()
if err != nil {
return nil, err
}
runtime := &runc.Runc{
Command: r.Runtime,
Log: filepath.Join(cwd, "log.json"),
LogFormat: runc.JSON,
PdeathSignal: syscall.SIGKILL,
}
p := &initProcess{
id: r.ID,
bundle: r.Bundle,
runc: runtime,
}
var (
socket *runc.ConsoleSocket
io runc.IO
)
if r.Terminal {
if socket, err = runc.NewConsoleSocket(filepath.Join(cwd, "pty.sock")); err != nil {
return nil, err
}
defer os.Remove(socket.Path())
} else {
// TODO: get uid/gid
if io, err = runc.NewPipeIO(0, 0); err != nil {
return nil, err
}
p.io = io
}
opts := &runc.CreateOpts{
PidFile: filepath.Join(cwd, "init.pid"),
ConsoleSocket: socket,
IO: io,
NoPivot: r.NoPivot,
}
if err := p.runc.Create(context, r.ID, r.Bundle, opts); err != nil {
return nil, err
}
if socket != nil {
console, err := socket.ReceiveMaster()
if err != nil {
return nil, err
}
p.console = console
if err := copyConsole(context, console, r.Stdin, r.Stdout, r.Stderr, &p.WaitGroup); err != nil {
return nil, err
}
} else {
if err := copyPipes(context, io, r.Stdin, r.Stdout, r.Stderr, &p.WaitGroup); err != nil {
return nil, err
}
}
pid, err := runc.ReadPidFile(opts.PidFile)
if err != nil {
return nil, err
}
p.pid = pid
return p, nil
}
func (p *initProcess) Pid() int {
return p.pid
}
func (p *initProcess) Status() int {
return p.status
}
func (p *initProcess) Start(context context.Context) error {
return p.runc.Start(context, p.id)
}
func (p *initProcess) Exited(status int) {
p.status = status
}
func (p *initProcess) Delete(context context.Context) error {
p.killAll(context)
p.Wait()
err := p.runc.Delete(context, p.id)
if p.io != nil {
p.io.Close()
}
return err
}
func (p *initProcess) Resize(ws console.WinSize) error {
if p.console == nil {
return nil
}
return p.console.Resize(ws)
}
func (p *initProcess) killAll(context context.Context) error {
return p.runc.Kill(context, p.id, int(syscall.SIGKILL), &runc.KillOpts{
All: true,
})
}

81
shim/io.go Normal file
View file

@ -0,0 +1,81 @@
package shim
import (
"context"
"fmt"
"io"
"sync"
"syscall"
"github.com/crosbymichael/console"
runc "github.com/crosbymichael/go-runc"
"github.com/tonistiigi/fifo"
)
func copyConsole(ctx context.Context, console console.Console, stdin, stdout, stderr string, wg *sync.WaitGroup) error {
in, err := fifo.OpenFifo(ctx, stdin, syscall.O_RDONLY, 0)
if err != nil {
return err
}
go io.Copy(console, in)
outw, err := fifo.OpenFifo(ctx, stdout, syscall.O_WRONLY, 0)
if err != nil {
return err
}
outr, err := fifo.OpenFifo(ctx, stdout, syscall.O_RDONLY, 0)
if err != nil {
return err
}
wg.Add(1)
go func() {
io.Copy(outw, console)
console.Close()
outr.Close()
outw.Close()
wg.Done()
}()
return nil
}
func copyPipes(ctx context.Context, rio runc.IO, stdin, stdout, stderr string, wg *sync.WaitGroup) error {
for name, dest := range map[string]func(wc io.WriteCloser, rc io.Closer){
stdout: func(wc io.WriteCloser, rc io.Closer) {
wg.Add(1)
go func() {
io.Copy(wc, rio.Stdout())
wg.Done()
wc.Close()
rc.Close()
}()
},
stderr: func(wc io.WriteCloser, rc io.Closer) {
wg.Add(1)
go func() {
io.Copy(wc, rio.Stderr())
wg.Done()
wc.Close()
rc.Close()
}()
},
} {
fw, err := fifo.OpenFifo(ctx, name, syscall.O_WRONLY, 0)
if err != nil {
return fmt.Errorf("containerd-shim: opening %s failed: %s", name, err)
}
fr, err := fifo.OpenFifo(ctx, name, syscall.O_RDONLY, 0)
if err != nil {
return fmt.Errorf("containerd-shim: opening %s failed: %s", name, err)
}
dest(fw, fr)
}
f, err := fifo.OpenFifo(ctx, stdin, syscall.O_RDONLY, 0)
if err != nil {
return fmt.Errorf("containerd-shim: opening %s failed: %s", stdin, err)
}
go func() {
io.Copy(rio.Stdin(), f)
rio.Stdin().Close()
f.Close()
}()
return nil
}

19
shim/process.go Normal file
View file

@ -0,0 +1,19 @@
package shim
import (
"context"
"github.com/crosbymichael/console"
)
type process interface {
// Pid returns the pid for the process
Pid() int
// Resize resizes the process console
Resize(ws console.WinSize) error
// Exited sets the exit status for the process
Exited(status int)
// Status returns the exit status
Status() int
Delete(context.Context) error
}

170
shim/service.go Normal file
View file

@ -0,0 +1,170 @@
package shim
import (
"fmt"
"sync"
"syscall"
"github.com/crosbymichael/console"
apishim "github.com/docker/containerd/api/shim"
"github.com/docker/containerd/utils"
google_protobuf "github.com/golang/protobuf/ptypes/empty"
"golang.org/x/net/context"
)
var emptyResponse = &google_protobuf.Empty{}
// NewService returns a new shim service that can be used via GRPC
func NewService() *Service {
return &Service{
processes: make(map[int]process),
events: make(chan *apishim.Event, 4096),
}
}
type Service struct {
initProcess *initProcess
id string
mu sync.Mutex
processes map[int]process
events chan *apishim.Event
execID int
}
func (s *Service) Create(ctx context.Context, r *apishim.CreateRequest) (*apishim.CreateResponse, error) {
process, err := newInitProcess(ctx, r)
if err != nil {
return nil, err
}
s.mu.Lock()
s.initProcess = process
pid := process.Pid()
s.processes[pid] = process
s.id = r.ID
s.mu.Unlock()
s.events <- &apishim.Event{
Type: apishim.EventType_CREATED,
ID: r.ID,
Pid: uint32(pid),
}
return &apishim.CreateResponse{
Pid: uint32(pid),
}, nil
}
func (s *Service) Start(ctx context.Context, r *apishim.StartRequest) (*google_protobuf.Empty, error) {
if err := s.initProcess.Start(ctx); err != nil {
return nil, err
}
s.events <- &apishim.Event{
Type: apishim.EventType_STARTED,
ID: s.id,
Pid: uint32(s.initProcess.Pid()),
}
return emptyResponse, nil
}
func (s *Service) Delete(ctx context.Context, r *apishim.DeleteRequest) (*apishim.DeleteResponse, error) {
s.mu.Lock()
p, ok := s.processes[int(r.Pid)]
s.mu.Unlock()
if !ok {
return nil, fmt.Errorf("process does not exist %d", r.Pid)
}
if err := p.Delete(ctx); err != nil {
return nil, err
}
s.mu.Lock()
delete(s.processes, p.Pid())
s.mu.Unlock()
return &apishim.DeleteResponse{
ExitStatus: uint32(p.Status()),
}, nil
}
func (s *Service) Exec(ctx context.Context, r *apishim.ExecRequest) (*apishim.ExecResponse, error) {
s.mu.Lock()
defer s.mu.Unlock()
s.execID++
process, err := newExecProcess(ctx, r, s.initProcess, s.execID)
if err != nil {
return nil, err
}
pid := process.Pid()
s.processes[pid] = process
s.events <- &apishim.Event{
Type: apishim.EventType_EXEC_ADDED,
ID: s.id,
Pid: uint32(pid),
}
return &apishim.ExecResponse{
Pid: uint32(pid),
}, nil
}
func (s *Service) Pty(ctx context.Context, r *apishim.PtyRequest) (*google_protobuf.Empty, error) {
if r.Pid == 0 {
return nil, fmt.Errorf("pid not provided in request")
}
ws := console.WinSize{
Width: uint16(r.Width),
Height: uint16(r.Height),
}
s.mu.Lock()
p, ok := s.processes[int(r.Pid)]
s.mu.Unlock()
if !ok {
return nil, fmt.Errorf("process does not exist %d", r.Pid)
}
if err := p.Resize(ws); err != nil {
return nil, err
}
return emptyResponse, nil
}
func (s *Service) Events(r *apishim.EventsRequest, stream apishim.Shim_EventsServer) error {
for e := range s.events {
if err := stream.Send(e); err != nil {
return err
}
}
return nil
}
func (s *Service) State(ctx context.Context, r *apishim.StateRequest) (*apishim.StateResponse, error) {
o := &apishim.StateResponse{
ID: s.id,
Processes: []*apishim.Process{},
}
s.mu.Lock()
defer s.mu.Unlock()
for _, p := range s.processes {
state := apishim.State_RUNNING
if err := syscall.Kill(p.Pid(), 0); err != nil {
if err != syscall.ESRCH {
return nil, err
}
state = apishim.State_STOPPED
}
o.Processes = append(o.Processes, &apishim.Process{
Pid: uint32(p.Pid()),
State: state,
})
}
return o, nil
}
func (s *Service) ProcessExit(e utils.Exit) error {
s.mu.Lock()
if p, ok := s.processes[e.Pid]; ok {
p.Exited(e.Status)
s.events <- &apishim.Event{
Type: apishim.EventType_EXIT,
ID: s.id,
Pid: uint32(p.Pid()),
ExitStatus: uint32(e.Status),
}
}
s.mu.Unlock()
return nil
}

49
utils/reaper.go Normal file
View file

@ -0,0 +1,49 @@
package utils
import "syscall"
// Exit is the wait4 information from an exited process
type Exit struct {
Pid int
Status int
}
// Reap reaps all child processes for the calling process and returns their
// exit information
func Reap(wait bool) (exits []Exit, err error) {
var (
ws syscall.WaitStatus
rus syscall.Rusage
)
flag := syscall.WNOHANG
if wait {
flag = 0
}
for {
pid, err := syscall.Wait4(-1, &ws, flag, &rus)
if err != nil {
if err == syscall.ECHILD {
return exits, nil
}
return exits, err
}
if pid <= 0 {
return exits, nil
}
exits = append(exits, Exit{
Pid: pid,
Status: ExitStatus(ws),
})
}
}
const exitSignalOffset = 128
// ExitStatus returns the correct exit status for a process based on if it
// was signaled or exited cleanly
func ExitStatus(status syscall.WaitStatus) int {
if status.Signaled() {
return exitSignalOffset + int(status.Signal())
}
return status.ExitStatus()
}

19
utils/socket.go Normal file
View file

@ -0,0 +1,19 @@
package utils
import (
"net"
"os"
"path/filepath"
"syscall"
)
// CreateUnixSocket creates a unix socket and returns the listener
func CreateUnixSocket(path string) (net.Listener, error) {
if err := os.MkdirAll(filepath.Dir(path), 0660); err != nil {
return nil, err
}
if err := syscall.Unlink(path); err != nil && !os.IsNotExist(err) {
return nil, err
}
return net.Listen("unix", path)
}

View file

@ -1,5 +1,7 @@
# go-runc client for runc; master as of 01/20/2017
github.com/crosbymichael/go-runc afca56d262e694d9056e937a0877a39ab879aeb4
github.com/crosbymichael/go-runc 706de6f422f397fb70b8c98f9b8c8eab2de32ae2
# console pkg;
github.com/crosbymichael/console 4bf9d88357031b516b3794a2594b6d060a29c59c
# go-metrics client to prometheus; master as of 12/16/2016
github.com/docker/go-metrics 0f35294225552d968a13f9c5bc71a3fa44b2eb87
# prometheus client; latest release as of 12/16/2016
@ -31,7 +33,7 @@ github.com/nats-io/go-nats-streaming v0.3.4
# gnatsd; latest release as of 12/16/2016
github.com/nats-io/gnatsd v0.9.6
# runc, latest release as of 12/16/2016
github.com/opencontainers/runc v1.0.0-rc2
github.com/opencontainers/runc ce450bcc6c135cae93ee2a99d41a308c179ff6dc
# OCI runtime spec, latest release as of 12/16/2016
github.com/opencontainers/runtime-spec v1.0.0-rc3
# logrus, latest release as of 12/16/2016

24
vendor/github.com/crosbymichael/console/LICENSE generated vendored Normal file
View file

@ -0,0 +1,24 @@
Copyright (c) 2017-infinity Michael Crosby. crosbymichael@gmail.com
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use, copy,
modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH
THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

54
vendor/github.com/crosbymichael/console/console.go generated vendored Normal file
View file

@ -0,0 +1,54 @@
package console
import (
"errors"
"io"
"os"
)
var ErrNotAConsole = errors.New("provided file is not a console")
type Console interface {
io.Reader
io.Writer
io.Closer
// Resize resizes the console to the provided window size
Resize(WinSize) error
// ResizeFrom resizes the calling console to the size of the
// provided console
ResizeFrom(Console) error
// SetRaw sets the console in raw mode
SetRaw() error
// Reset restores the console to its orignal state
Reset() error
// Size returns the window size of the console
Size() (WinSize, error)
}
// WinSize specifies the window size of the console
type WinSize struct {
// Width of the console
Width uint16
// Height of the console
Height uint16
x uint16
y uint16
}
// Current returns the current processes console
func Current() Console {
return &master{
f: os.Stdin,
}
}
// ConsoleFromFile returns a console using the provided file
func ConsoleFromFile(f *os.File) (Console, error) {
if err := checkConsole(f); err != nil {
return nil, err
}
return &master{
f: f,
}, nil
}

View file

@ -0,0 +1,105 @@
package console
// #include <termios.h>
import "C"
import (
"os"
"syscall"
"unsafe"
)
// NewPty creates a new pty pair
// The master is returned as the first console and a string
// with the path to the pty slave is returned as the second
func NewPty() (Console, string, error) {
f, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
if err != nil {
return nil, "", err
}
if err := saneTerminal(f); err != nil {
return nil, "", err
}
slave, err := ptsname(f)
if err != nil {
return nil, "", err
}
if err := unlockpt(f); err != nil {
return nil, "", err
}
return &master{
f: f,
}, slave, nil
}
type master struct {
f *os.File
termios *syscall.Termios
}
func (m *master) Read(b []byte) (int, error) {
return m.f.Read(b)
}
func (m *master) Write(b []byte) (int, error) {
return m.f.Write(b)
}
func (m *master) Close() error {
return m.f.Close()
}
func (m *master) Resize(ws WinSize) error {
return ioctl(
m.f.Fd(),
uintptr(syscall.TIOCSWINSZ),
uintptr(unsafe.Pointer(&ws)),
)
}
func (m *master) ResizeFrom(c Console) error {
ws, err := c.Size()
if err != nil {
return err
}
return m.Resize(ws)
}
func (m *master) Reset() error {
if m.termios == nil {
return nil
}
return tcset(m.f.Fd(), m.termios)
}
func (m *master) SetRaw() error {
m.termios = &syscall.Termios{}
if err := tcget(m.f.Fd(), m.termios); err != nil {
return err
}
rawState := *m.termios
C.cfmakeraw((*C.struct_termios)(unsafe.Pointer(&rawState)))
rawState.Oflag = rawState.Oflag | C.OPOST
return tcset(m.f.Fd(), &rawState)
}
func (m *master) Size() (WinSize, error) {
var ws WinSize
if err := ioctl(
m.f.Fd(),
uintptr(syscall.TIOCGWINSZ),
uintptr(unsafe.Pointer(&ws)),
); err != nil {
return ws, err
}
return ws, nil
}
// checkConsole checks if the provided file is a console
func checkConsole(f *os.File) error {
var termios syscall.Termios
if tcget(f.Fd(), &termios) != nil {
return ErrNotAConsole
}
return nil
}

52
vendor/github.com/crosbymichael/console/tc.go generated vendored Normal file
View file

@ -0,0 +1,52 @@
// +build linux
package console
import (
"fmt"
"os"
"syscall"
"unsafe"
)
func tcget(fd uintptr, p *syscall.Termios) error {
return ioctl(fd, syscall.TCGETS, uintptr(unsafe.Pointer(p)))
}
func tcset(fd uintptr, p *syscall.Termios) error {
return ioctl(fd, syscall.TCSETS, uintptr(unsafe.Pointer(p)))
}
func ioctl(fd, flag, data uintptr) error {
if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 {
return err
}
return nil
}
// unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f.
// unlockpt should be called before opening the slave side of a pty.
func unlockpt(f *os.File) error {
var u int32
return ioctl(f.Fd(), syscall.TIOCSPTLCK, uintptr(unsafe.Pointer(&u)))
}
// ptsname retrieves the name of the first available pts for the given master.
func ptsname(f *os.File) (string, error) {
var n int32
if err := ioctl(f.Fd(), syscall.TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil {
return "", err
}
return fmt.Sprintf("/dev/pts/%d", n), nil
}
func saneTerminal(f *os.File) error {
// Go doesn't have a wrapper for any of the termios ioctls.
var termios syscall.Termios
if err := tcget(f.Fd(), &termios); err != nil {
return err
}
// Set -onlcr so we don't have to deal with \r.
termios.Oflag &^= syscall.ONLCR
return tcset(f.Fd(), &termios)
}

73
vendor/github.com/crosbymichael/go-runc/console.go generated vendored Normal file
View file

@ -0,0 +1,73 @@
package runc
import (
"fmt"
"net"
"path/filepath"
"github.com/crosbymichael/console"
"github.com/opencontainers/runc/libcontainer/utils"
)
// NewConsoleSocket creates a new unix socket at the provided path to accept a
// pty master created by runc for use by the container
func NewConsoleSocket(path string) (*ConsoleSocket, error) {
abs, err := filepath.Abs(path)
if err != nil {
return nil, err
}
l, err := net.Listen("unix", abs)
if err != nil {
return nil, err
}
return &ConsoleSocket{
l: l,
path: abs,
}, nil
}
// ConsoleSocket is a unix socket that accepts the pty master created by runc
type ConsoleSocket struct {
path string
l net.Listener
}
// Path returns the path to the unix socket on disk
func (c *ConsoleSocket) Path() string {
return c.path
}
// ReceiveMaster blocks until the socket receives the pty master
func (c *ConsoleSocket) ReceiveMaster() (console.Console, error) {
conn, err := c.l.Accept()
if err != nil {
return nil, err
}
defer conn.Close()
unix, ok := conn.(*net.UnixConn)
if !ok {
return nil, fmt.Errorf("received connection which was not a unix socket")
}
sock, err := unix.File()
if err != nil {
return nil, err
}
f, err := utils.RecvFd(sock)
if err != nil {
return nil, err
}
return console.ConsoleFromFile(f)
}
// Close closes the unix socket
func (c *ConsoleSocket) Close() error {
return c.l.Close()
}
// WinSize specifies the console size
type WinSize struct {
// Width of the console
Width uint16
// Height of the console
Height uint16
}

164
vendor/github.com/crosbymichael/go-runc/io.go generated vendored Normal file
View file

@ -0,0 +1,164 @@
package runc
import (
"io"
"os"
"os/exec"
"syscall"
)
type IO interface {
io.Closer
Stdin() io.WriteCloser
Stdout() io.ReadCloser
Stderr() io.ReadCloser
Set(*exec.Cmd)
}
type StartCloser interface {
CloseAfterStart() error
}
// NewPipeIO creates pipe pairs to be used with runc
func NewPipeIO(uid, gid int) (i IO, err error) {
var pipes []*pipe
// cleanup in case of an error
defer func() {
if err != nil {
for _, p := range pipes {
p.Close()
}
}
}()
stdin, err := newPipe(uid, gid)
if err != nil {
return nil, err
}
pipes = append(pipes, stdin)
stdout, err := newPipe(uid, gid)
if err != nil {
return nil, err
}
pipes = append(pipes, stdout)
stderr, err := newPipe(uid, gid)
if err != nil {
return nil, err
}
pipes = append(pipes, stderr)
return &pipeIO{
in: stdin,
out: stdout,
err: stderr,
}, nil
}
func newPipe(uid, gid int) (*pipe, error) {
r, w, err := os.Pipe()
if err != nil {
return nil, err
}
if err := syscall.Fchown(int(r.Fd()), uid, gid); err != nil {
return nil, err
}
if err := syscall.Fchown(int(w.Fd()), uid, gid); err != nil {
return nil, err
}
return &pipe{
r: r,
w: w,
}, nil
}
type pipe struct {
r *os.File
w *os.File
}
func (p *pipe) Close() error {
err := p.r.Close()
if werr := p.w.Close(); err == nil {
err = werr
}
return err
}
type pipeIO struct {
in *pipe
out *pipe
err *pipe
}
func (i *pipeIO) Stdin() io.WriteCloser {
return i.in.w
}
func (i *pipeIO) Stdout() io.ReadCloser {
return i.out.r
}
func (i *pipeIO) Stderr() io.ReadCloser {
return i.err.r
}
func (i *pipeIO) Close() error {
var err error
for _, v := range []*pipe{
i.in,
i.out,
i.err,
} {
if cerr := v.Close(); err == nil {
err = cerr
}
}
return err
}
func (i *pipeIO) CloseAfterStart() error {
for _, f := range []*os.File{
i.out.w,
i.err.w,
} {
f.Close()
}
return nil
}
// Set sets the io to the exec.Cmd
func (i *pipeIO) Set(cmd *exec.Cmd) {
cmd.Stdin = i.in.r
cmd.Stdout = i.out.w
cmd.Stderr = i.err.w
}
func NewSTDIO() (IO, error) {
return &stdio{}, nil
}
type stdio struct {
}
func (s *stdio) Close() error {
return nil
}
func (s *stdio) Set(cmd *exec.Cmd) {
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
}
func (s *stdio) Stdin() io.WriteCloser {
return os.Stdin
}
func (s *stdio) Stdout() io.ReadCloser {
return os.Stdout
}
func (s *stdio) Stderr() io.ReadCloser {
return os.Stderr
}

View file

@ -8,6 +8,7 @@ import (
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"strconv"
"syscall"
"time"
@ -67,48 +68,22 @@ type CreateOpts struct {
IO
// PidFile is a path to where a pid file should be created
PidFile string
ConsoleSocket string
ConsoleSocket *ConsoleSocket
Detach bool
NoPivot bool
NoNewKeyring bool
}
type IO struct {
Stdin io.Reader
Stdout io.Writer
Stderr io.Writer
}
func (i *IO) Close() error {
var err error
for _, v := range []interface{}{
i.Stdin,
i.Stderr,
i.Stdout,
} {
if v != nil {
if c, ok := v.(io.Closer); ok {
if cerr := c.Close(); err == nil {
err = cerr
}
}
}
}
return err
}
func (o IO) setSTDIO(cmd *exec.Cmd) {
cmd.Stdin = o.Stdin
cmd.Stdout = o.Stdout
cmd.Stderr = o.Stderr
}
func (o *CreateOpts) args() (out []string) {
func (o *CreateOpts) args() (out []string, err error) {
if o.PidFile != "" {
out = append(out, "--pid-file", o.PidFile)
abs, err := filepath.Abs(o.PidFile)
if err != nil {
return nil, err
}
if o.ConsoleSocket != "" {
out = append(out, "--console-socket", o.ConsoleSocket)
out = append(out, "--pid-file", abs)
}
if o.ConsoleSocket != nil {
out = append(out, "--console-socket", o.ConsoleSocket.Path())
}
if o.NoPivot {
out = append(out, "--no-pivot")
@ -119,20 +94,41 @@ func (o *CreateOpts) args() (out []string) {
if o.Detach {
out = append(out, "--detach")
}
return out
return out, nil
}
// Create creates a new container and returns its pid if it was created successfully
func (r *Runc) Create(context context.Context, id, bundle string, opts *CreateOpts) error {
args := []string{"create", "--bundle", bundle}
if opts != nil {
args = append(args, opts.args()...)
oargs, err := opts.args()
if err != nil {
return err
}
args = append(args, oargs...)
}
cmd := r.command(context, append(args, id)...)
if opts != nil {
opts.setSTDIO(cmd)
if opts != nil && opts.IO != nil {
opts.Set(cmd)
}
return runOrError(cmd)
if cmd.Stdout == nil && cmd.Stderr == nil {
data, err := cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("%s: %s", err, data)
}
return nil
}
if err := cmd.Start(); err != nil {
return err
}
if opts != nil && opts.IO != nil {
if c, ok := opts.IO.(StartCloser); ok {
if err := c.CloseAfterStart(); err != nil {
return err
}
}
}
return cmd.Wait()
}
// Start will start an already created container
@ -147,17 +143,17 @@ type ExecOpts struct {
Gid int
Cwd string
Tty bool
ConsoleSocket string
ConsoleSocket *ConsoleSocket
Detach bool
}
func (o *ExecOpts) args() (out []string) {
func (o *ExecOpts) args() (out []string, err error) {
out = append(out, "--user", fmt.Sprintf("%d:%d", o.Uid, o.Gid))
if o.Tty {
out = append(out, "--tty")
}
if o.ConsoleSocket != "" {
out = append(out, "--console-socket", o.ConsoleSocket)
if o.ConsoleSocket != nil {
out = append(out, "--console-socket", o.ConsoleSocket.Path())
}
if o.Cwd != "" {
out = append(out, "--cwd", o.Cwd)
@ -166,9 +162,13 @@ func (o *ExecOpts) args() (out []string) {
out = append(out, "--detach")
}
if o.PidFile != "" {
out = append(out, "--pid-file", o.PidFile)
abs, err := filepath.Abs(o.PidFile)
if err != nil {
return nil, err
}
return out
out = append(out, "--pid-file", abs)
}
return out, nil
}
// Exec executres and additional process inside the container based on a full
@ -186,13 +186,34 @@ func (r *Runc) Exec(context context.Context, id string, spec specs.Process, opts
}
args := []string{"exec", "--process", f.Name()}
if opts != nil {
args = append(args, opts.args()...)
oargs, err := opts.args()
if err != nil {
return err
}
args = append(args, oargs...)
}
cmd := r.command(context, append(args, id)...)
if opts != nil {
opts.setSTDIO(cmd)
if opts != nil && opts.IO != nil {
opts.Set(cmd)
}
return runOrError(cmd)
if cmd.Stdout == nil && cmd.Stderr == nil {
data, err := cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("%s: %s", err, data)
}
return nil
}
if err := cmd.Start(); err != nil {
return err
}
if opts != nil && opts.IO != nil {
if c, ok := opts.IO.(StartCloser); ok {
if err := c.CloseAfterStart(); err != nil {
return err
}
}
}
return cmd.Wait()
}
// Run runs the create, start, delete lifecycle of the container
@ -200,11 +221,15 @@ func (r *Runc) Exec(context context.Context, id string, spec specs.Process, opts
func (r *Runc) Run(context context.Context, id, bundle string, opts *CreateOpts) (int, error) {
args := []string{"run", "--bundle", bundle}
if opts != nil {
args = append(args, opts.args()...)
oargs, err := opts.args()
if err != nil {
return -1, err
}
args = append(args, oargs...)
}
cmd := r.command(context, append(args, id)...)
if opts != nil {
opts.setSTDIO(cmd)
opts.Set(cmd)
}
if err := cmd.Start(); err != nil {
return -1, err

View file

@ -0,0 +1,32 @@
#ifndef NSENTER_NAMESPACE_H
#define NSENTER_NAMESPACE_H
#ifndef _GNU_SOURCE
# define _GNU_SOURCE
#endif
#include <sched.h>
/* All of these are taken from include/uapi/linux/sched.h */
#ifndef CLONE_NEWNS
# define CLONE_NEWNS 0x00020000 /* New mount namespace group */
#endif
#ifndef CLONE_NEWCGROUP
# define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#endif
#ifndef CLONE_NEWUTS
# define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
#endif
#ifndef CLONE_NEWIPC
# define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
#endif
#ifndef CLONE_NEWUSER
# define CLONE_NEWUSER 0x10000000 /* New user namespace */
#endif
#ifndef CLONE_NEWPID
# define CLONE_NEWPID 0x20000000 /* New pid namespace */
#endif
#ifndef CLONE_NEWNET
# define CLONE_NEWNET 0x40000000 /* New network namespace */
#endif
#endif /* NSENTER_NAMESPACE_H */

View file

@ -0,0 +1,12 @@
// +build linux,!gccgo
package nsenter
/*
#cgo CFLAGS: -Wall
extern void nsexec();
void __attribute__((constructor)) init(void) {
nsexec();
}
*/
import "C"

View file

@ -0,0 +1,25 @@
// +build linux,gccgo
package nsenter
/*
#cgo CFLAGS: -Wall
extern void nsexec();
void __attribute__((constructor)) init(void) {
nsexec();
}
*/
import "C"
// AlwaysFalse is here to stay false
// (and be exported so the compiler doesn't optimize out its reference)
var AlwaysFalse bool
func init() {
if AlwaysFalse {
// by referencing this C init() in a noop test, it will ensure the compiler
// links in the C function.
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134
C.init()
}
}

View file

@ -0,0 +1,5 @@
// +build !linux !cgo
package nsenter
import "C"

View file

@ -0,0 +1,759 @@
#define _GNU_SOURCE
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <sched.h>
#include <setjmp.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <linux/limits.h>
#include <linux/netlink.h>
#include <linux/types.h>
/* Get all of the CLONE_NEW* flags. */
#include "namespace.h"
/* Synchronisation values. */
enum sync_t {
SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
SYNC_CHILD_READY = 0x44, /* The grandchild is ready to return. */
/* XXX: This doesn't help with segfaults and other such issues. */
SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
};
/* longjmp() arguments. */
#define JUMP_PARENT 0x00
#define JUMP_CHILD 0xA0
#define JUMP_INIT 0xA1
/* JSON buffer. */
#define JSON_MAX 4096
/* Assume the stack grows down, so arguments should be above it. */
struct clone_t {
/*
* Reserve some space for clone() to locate arguments
* and retcode in this place
*/
char stack[4096] __attribute__ ((aligned(16)));
char stack_ptr[0];
/* There's two children. This is used to execute the different code. */
jmp_buf *env;
int jmpval;
};
struct nlconfig_t {
char *data;
uint32_t cloneflags;
char *uidmap;
size_t uidmap_len;
char *gidmap;
size_t gidmap_len;
char *namespaces;
size_t namespaces_len;
uint8_t is_setgroup;
};
/*
* List of netlink message types sent to us as part of bootstrapping the init.
* These constants are defined in libcontainer/message_linux.go.
*/
#define INIT_MSG 62000
#define CLONE_FLAGS_ATTR 27281
#define NS_PATHS_ATTR 27282
#define UIDMAP_ATTR 27283
#define GIDMAP_ATTR 27284
#define SETGROUP_ATTR 27285
/*
* Use the raw syscall for versions of glibc which don't include a function for
* it, namely (glibc 2.12).
*/
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
# define _GNU_SOURCE
# include "syscall.h"
# if !defined(SYS_setns) && defined(__NR_setns)
# define SYS_setns __NR_setns
# endif
#ifndef SYS_setns
# error "setns(2) syscall not supported by glibc version"
#endif
int setns(int fd, int nstype)
{
return syscall(SYS_setns, fd, nstype);
}
#endif
/* XXX: This is ugly. */
static int syncfd = -1;
/* TODO(cyphar): Fix this so it correctly deals with syncT. */
#define bail(fmt, ...) \
do { \
int ret = __COUNTER__ + 1; \
fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
if (syncfd >= 0) { \
enum sync_t s = SYNC_ERR; \
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) \
fprintf(stderr, "nsenter: failed: write(s)"); \
if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret)) \
fprintf(stderr, "nsenter: failed: write(ret)"); \
} \
exit(ret); \
} while(0)
static int write_file(char *data, size_t data_len, char *pathfmt, ...)
{
int fd, len, ret = 0;
char path[PATH_MAX];
va_list ap;
va_start(ap, pathfmt);
len = vsnprintf(path, PATH_MAX, pathfmt, ap);
va_end(ap);
if (len < 0)
return -1;
fd = open(path, O_RDWR);
if (fd < 0) {
ret = -1;
goto out;
}
len = write(fd, data, data_len);
if (len != data_len) {
ret = -1;
goto out;
}
out:
close(fd);
return ret;
}
enum policy_t {
SETGROUPS_DEFAULT = 0,
SETGROUPS_ALLOW,
SETGROUPS_DENY,
};
/* This *must* be called before we touch gid_map. */
static void update_setgroups(int pid, enum policy_t setgroup)
{
char *policy;
switch (setgroup) {
case SETGROUPS_ALLOW:
policy = "allow";
break;
case SETGROUPS_DENY:
policy = "deny";
break;
case SETGROUPS_DEFAULT:
/* Nothing to do. */
return;
}
if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
/*
* If the kernel is too old to support /proc/pid/setgroups,
* open(2) or write(2) will return ENOENT. This is fine.
*/
if (errno != ENOENT)
bail("failed to write '%s' to /proc/%d/setgroups", policy, pid);
}
}
static void update_uidmap(int pid, char *map, int map_len)
{
if (map == NULL || map_len <= 0)
return;
if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0)
bail("failed to update /proc/%d/uid_map", pid);
}
static void update_gidmap(int pid, char *map, int map_len)
{
if (map == NULL || map_len <= 0)
return;
if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0)
bail("failed to update /proc/%d/gid_map", pid);
}
/* A dummy function that just jumps to the given jumpval. */
static int child_func(void *arg) __attribute__ ((noinline));
static int child_func(void *arg)
{
struct clone_t *ca = (struct clone_t *)arg;
longjmp(*ca->env, ca->jmpval);
}
static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
static int clone_parent(jmp_buf *env, int jmpval)
{
struct clone_t ca = {
.env = env,
.jmpval = jmpval,
};
return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
}
/*
* Gets the init pipe fd from the environment, which is used to read the
* bootstrap data and tell the parent what the new pid is after we finish
* setting up the environment.
*/
static int initpipe(void)
{
int pipenum;
char *initpipe, *endptr;
initpipe = getenv("_LIBCONTAINER_INITPIPE");
if (initpipe == NULL || *initpipe == '\0')
return -1;
pipenum = strtol(initpipe, &endptr, 10);
if (*endptr != '\0')
bail("unable to parse _LIBCONTAINER_INITPIPE");
return pipenum;
}
/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
static int nsflag(char *name)
{
if (!strcmp(name, "cgroup"))
return CLONE_NEWCGROUP;
else if (!strcmp(name, "ipc"))
return CLONE_NEWIPC;
else if (!strcmp(name, "mnt"))
return CLONE_NEWNS;
else if (!strcmp(name, "net"))
return CLONE_NEWNET;
else if (!strcmp(name, "pid"))
return CLONE_NEWPID;
else if (!strcmp(name, "user"))
return CLONE_NEWUSER;
else if (!strcmp(name, "uts"))
return CLONE_NEWUTS;
/* If we don't recognise a name, fallback to 0. */
return 0;
}
static uint32_t readint32(char *buf)
{
return *(uint32_t *) buf;
}
static uint8_t readint8(char *buf)
{
return *(uint8_t *) buf;
}
static void nl_parse(int fd, struct nlconfig_t *config)
{
size_t len, size;
struct nlmsghdr hdr;
char *data, *current;
/* Retrieve the netlink header. */
len = read(fd, &hdr, NLMSG_HDRLEN);
if (len != NLMSG_HDRLEN)
bail("invalid netlink header length %lu", len);
if (hdr.nlmsg_type == NLMSG_ERROR)
bail("failed to read netlink message");
if (hdr.nlmsg_type != INIT_MSG)
bail("unexpected msg type %d", hdr.nlmsg_type);
/* Retrieve data. */
size = NLMSG_PAYLOAD(&hdr, 0);
current = data = malloc(size);
if (!data)
bail("failed to allocate %zu bytes of memory for nl_payload", size);
len = read(fd, data, size);
if (len != size)
bail("failed to read netlink payload, %lu != %lu", len, size);
/* Parse the netlink payload. */
config->data = data;
while (current < data + size) {
struct nlattr *nlattr = (struct nlattr *)current;
size_t payload_len = nlattr->nla_len - NLA_HDRLEN;
/* Advance to payload. */
current += NLA_HDRLEN;
/* Handle payload. */
switch (nlattr->nla_type) {
case CLONE_FLAGS_ATTR:
config->cloneflags = readint32(current);
break;
case NS_PATHS_ATTR:
config->namespaces = current;
config->namespaces_len = payload_len;
break;
case UIDMAP_ATTR:
config->uidmap = current;
config->uidmap_len = payload_len;
break;
case GIDMAP_ATTR:
config->gidmap = current;
config->gidmap_len = payload_len;
break;
case SETGROUP_ATTR:
config->is_setgroup = readint8(current);
break;
default:
bail("unknown netlink message type %d", nlattr->nla_type);
}
current += NLA_ALIGN(payload_len);
}
}
void nl_free(struct nlconfig_t *config)
{
free(config->data);
}
void join_namespaces(char *nslist)
{
int num = 0, i;
char *saveptr = NULL;
char *namespace = strtok_r(nslist, ",", &saveptr);
struct namespace_t {
int fd;
int ns;
char type[PATH_MAX];
char path[PATH_MAX];
} *namespaces = NULL;
if (!namespace || !strlen(namespace) || !strlen(nslist))
bail("ns paths are empty");
/*
* We have to open the file descriptors first, since after
* we join the mnt namespace we might no longer be able to
* access the paths.
*/
do {
int fd;
char *path;
struct namespace_t *ns;
/* Resize the namespace array. */
namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
if (!namespaces)
bail("failed to reallocate namespace array");
ns = &namespaces[num - 1];
/* Split 'ns:path'. */
path = strstr(namespace, ":");
if (!path)
bail("failed to parse %s", namespace);
*path++ = '\0';
fd = open(path, O_RDONLY);
if (fd < 0)
bail("failed to open %s", path);
ns->fd = fd;
ns->ns = nsflag(namespace);
strncpy(ns->path, path, PATH_MAX);
} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
/*
* The ordering in which we join namespaces is important. We should
* always join the user namespace *first*. This is all guaranteed
* from the container_linux.go side of this, so we're just going to
* follow the order given to us.
*/
for (i = 0; i < num; i++) {
struct namespace_t ns = namespaces[i];
if (setns(ns.fd, ns.ns) < 0)
bail("failed to setns to %s", ns.path);
close(ns.fd);
}
free(namespaces);
}
void nsexec(void)
{
int pipenum;
jmp_buf env;
int syncpipe[2];
struct nlconfig_t config = {0};
/*
* If we don't have an init pipe, just return to the go routine.
* We'll only get an init pipe for start or exec.
*/
pipenum = initpipe();
if (pipenum == -1)
return;
/* make the process non-dumpable */
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) != 0) {
bail("failed to set process as non-dumpable");
}
/* Parse all of the netlink configuration. */
nl_parse(pipenum, &config);
/* Pipe so we can tell the child when we've finished setting up. */
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0)
bail("failed to setup sync pipe between parent and child");
/* TODO: Currently we aren't dealing with child deaths properly. */
/*
* Okay, so this is quite annoying.
*
* In order for this unsharing code to be more extensible we need to split
* up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
* would be if we did clone(CLONE_NEWUSER) and the other namespaces
* separately, but because of SELinux issues we cannot really do that. But
* we cannot just dump the namespace flags into clone(...) because several
* usecases (such as rootless containers) require more granularity around
* the namespace setup. In addition, some older kernels had issues where
* CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
* handle this while also dealing with SELinux so we choose SELinux support
* over broken kernel support).
*
* However, if we unshare(2) the user namespace *before* we clone(2), then
* all hell breaks loose.
*
* The parent no longer has permissions to do many things (unshare(2) drops
* all capabilities in your old namespace), and the container cannot be set
* up to have more than one {uid,gid} mapping. This is obviously less than
* ideal. In order to fix this, we have to first clone(2) and then unshare.
*
* Unfortunately, it's not as simple as that. We have to fork to enter the
* PID namespace (the PID namespace only applies to children). Since we'll
* have to double-fork, this clone_parent() call won't be able to get the
* PID of the _actual_ init process (without doing more synchronisation than
* I can deal with at the moment). So we'll just get the parent to send it
* for us, the only job of this process is to update
* /proc/pid/{setgroups,uid_map,gid_map}.
*
* And as a result of the above, we also need to setns(2) in the first child
* because if we join a PID namespace in the topmost parent then our child
* will be in that namespace (and it will not be able to give us a PID value
* that makes sense without resorting to sending things with cmsg).
*
* This also deals with an older issue caused by dumping cloneflags into
* clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
* we have to unshare(2) before clone(2) in order to do this. This was fixed
* in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
* introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
* aware, the last mainline kernel which had this bug was Linux 3.12.
* However, we cannot comment on which kernels the broken patch was
* backported to.
*
* -- Aleksa "what has my life come to?" Sarai
*/
switch (setjmp(env)) {
/*
* Stage 0: We're in the parent. Our job is just to create a new child
* (stage 1: JUMP_CHILD) process and write its uid_map and
* gid_map. That process will go on to create a new process, then
* it will send us its PID which we will send to the bootstrap
* process.
*/
case JUMP_PARENT: {
int len, ready = 0;
pid_t child;
char buf[JSON_MAX];
/* For debugging. */
prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);
/* Start the process of getting a container. */
child = clone_parent(&env, JUMP_CHILD);
if (child < 0)
bail("unable to fork: child_func");
/*
* State machine for synchronisation with the children.
*
* Father only return when both child and grandchild are
* ready, so we can receive all possible error codes
* generated by children.
*/
while (ready < 2) {
enum sync_t s;
/* This doesn't need to be global, we're in the parent. */
int syncfd = syncpipe[1];
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with child: next state");
switch (s) {
case SYNC_ERR: {
/* We have to mirror the error code of the child. */
int ret;
if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
bail("failed to sync with child: read(error code)");
exit(ret);
}
break;
case SYNC_USERMAP_PLS:
/* Enable setgroups(2) if we've been asked to. */
if (config.is_setgroup)
update_setgroups(child, SETGROUPS_ALLOW);
/* Set up mappings. */
update_uidmap(child, config.uidmap, config.uidmap_len);
update_gidmap(child, config.gidmap, config.gidmap_len);
s = SYNC_USERMAP_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
}
break;
case SYNC_USERMAP_ACK:
/* We should _never_ receive acks. */
kill(child, SIGKILL);
bail("failed to sync with child: unexpected SYNC_USERMAP_ACK");
break;
case SYNC_RECVPID_PLS: {
pid_t old = child;
/* Get the init_func pid. */
if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
kill(old, SIGKILL);
bail("failed to sync with child: read(childpid)");
}
/* Send ACK. */
s = SYNC_RECVPID_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(old, SIGKILL);
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
}
}
ready++;
break;
case SYNC_RECVPID_ACK:
/* We should _never_ receive acks. */
kill(child, SIGKILL);
bail("failed to sync with child: unexpected SYNC_RECVPID_ACK");
break;
case SYNC_CHILD_READY:
ready++;
break;
default:
bail("unexpected sync value");
break;
}
}
/* Send the init_func pid back to our parent. */
len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
if (len < 0) {
kill(child, SIGKILL);
bail("unable to generate JSON for child pid");
}
if (write(pipenum, buf, len) != len) {
kill(child, SIGKILL);
bail("unable to send child pid to bootstrapper");
}
exit(0);
}
/*
* Stage 1: We're in the first child process. Our job is to join any
* provided namespaces in the netlink payload and unshare all
* of the requested namespaces. If we've been asked to
* CLONE_NEWUSER, we will ask our parent (stage 0) to set up
* our user mappings for us. Then, we create a new child
* (stage 2: JUMP_INIT) for PID namespace. We then send the
* child's PID to our parent (stage 0).
*/
case JUMP_CHILD: {
pid_t child;
enum sync_t s;
/* We're in a child and thus need to tell the parent if we die. */
syncfd = syncpipe[0];
/* For debugging. */
prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);
/*
* We need to setns first. We cannot do this earlier (in stage 0)
* because of the fact that we forked to get here (the PID of
* [stage 2: JUMP_INIT]) would be meaningless). We could send it
* using cmsg(3) but that's just annoying.
*/
if (config.namespaces)
join_namespaces(config.namespaces);
/*
* Unshare all of the namespaces. Now, it should be noted that this
* ordering might break in the future (especially with rootless
* containers). But for now, it's not possible to split this into
* CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
*
* Note that we don't merge this with clone() because there were
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway.
*/
if (unshare(config.cloneflags) < 0)
bail("failed to unshare namespaces");
/*
* Deal with user namespaces first. They are quite special, as they
* affect our ability to unshare other namespaces and are used as
* context for privilege checks.
*/
if (config.cloneflags & CLONE_NEWUSER) {
/*
* We don't have the privileges to do any mapping here (see the
* clone_parent rant). So signal our parent to hook us up.
*/
s = SYNC_USERMAP_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
/* ... wait for mapping ... */
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
if (s != SYNC_USERMAP_ACK)
bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
}
/*
* TODO: What about non-namespace clone flags that we're dropping here?
*
* We fork again because of PID namespace, setns(2) or unshare(2) don't
* change the PID namespace of the calling process, because doing so
* would change the caller's idea of its own PID (as reported by getpid()),
* which would break many applications and libraries, so we must fork
* to actually enter the new PID namespace.
*/
child = clone_parent(&env, JUMP_INIT);
if (child < 0)
bail("unable to fork: init_func");
/* Send the child to our parent, which knows what it's doing. */
s = SYNC_RECVPID_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
}
if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
kill(child, SIGKILL);
bail("failed to sync with parent: write(childpid)");
}
/* ... wait for parent to get the pid ... */
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
}
if (s != SYNC_RECVPID_ACK) {
kill(child, SIGKILL);
bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
}
/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
exit(0);
}
/*
* Stage 2: We're the final child process, and the only process that will
* actually return to the Go runtime. Our job is to just do the
* final cleanup steps and then return to the Go runtime to allow
* init_linux.go to run.
*/
case JUMP_INIT: {
/*
* We're inside the child now, having jumped from the
* start_child() code after forking in the parent.
*/
enum sync_t s;
/* We're in a child and thus need to tell the parent if we die. */
syncfd = syncpipe[0];
/* For debugging. */
prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0);
if (setsid() < 0)
bail("setsid failed");
if (setuid(0) < 0)
bail("setuid failed");
if (setgid(0) < 0)
bail("setgid failed");
if (setgroups(0, NULL) < 0)
bail("setgroups failed");
s = SYNC_CHILD_READY;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with patent: write(SYNC_CHILD_READY)");
/* Close sync pipes. */
close(syncpipe[0]);
close(syncpipe[1]);
/* Free netlink data. */
nl_free(&config);
/* Finish executing, let the Go runtime take over. */
return;
}
default:
bail("unexpected jump value");
break;
}
/* Should never be reached. */
bail("should never be reached");
}

View file

@ -14,8 +14,10 @@ func GetProcessStartTime(pid int) (string, error) {
if err != nil {
return "", err
}
return parseStartTime(string(data))
}
parts := strings.Split(string(data), " ")
func parseStartTime(stat string) (string, error) {
// the starttime is located at pos 22
// from the man page
//
@ -23,5 +25,19 @@ func GetProcessStartTime(pid int) (string, error) {
// (22) The time the process started after system boot. In kernels before Linux 2.6, this
// value was expressed in jiffies. Since Linux 2.6, the value is expressed in clock ticks
// (divide by sysconf(_SC_CLK_TCK)).
return parts[22-1], nil // starts at 1
//
// NOTE:
// pos 2 could contain space and is inside `(` and `)`:
// (2) comm %s
// The filename of the executable, in parentheses.
// This is visible whether or not the executable is
// swapped out.
//
// the following is an example:
// 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
// get parts after last `)`:
s := strings.Split(stat, ")")
parts := strings.Split(strings.TrimSpace(s[len(s)-1]), " ")
return parts[22-3], nil // starts at 3 (after the filename pos `2`)
}

View file

@ -343,7 +343,7 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (
if len(groups) > 0 {
// First match wins, even if there's more than one matching entry.
user.Gid = groups[0].Gid
} else if groupArg != "" {
} else {
// If we can't find a group with the given name, the only other valid
// option is if it's a numeric group name with no associated entry in group.

View file

@ -0,0 +1,148 @@
/*
* Copyright 2016 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include "cmsg.h"
#define error(fmt, ...) \
({ \
fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
errno = ECOMM; \
goto err; /* return value */ \
})
/*
* Sends a file descriptor along the sockfd provided. Returns the return
* value of sendmsg(2). Any synchronisation and preparation of state
* should be done external to this (we expect the other side to be in
* recvfd() in the code).
*/
ssize_t sendfd(int sockfd, struct file_t file)
{
struct msghdr msg = {0};
struct iovec iov[1] = {0};
struct cmsghdr *cmsg;
int *fdptr;
int ret;
union {
char buf[CMSG_SPACE(sizeof(file.fd))];
struct cmsghdr align;
} u;
/*
* We need to send some other data along with the ancillary data,
* otherwise the other side won't recieve any data. This is very
* well-hidden in the documentation (and only applies to
* SOCK_STREAM). See the bottom part of unix(7).
*/
iov[0].iov_base = file.name;
iov[0].iov_len = strlen(file.name) + 1;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_control = u.buf;
msg.msg_controllen = sizeof(u.buf);
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
fdptr = (int *) CMSG_DATA(cmsg);
memcpy(fdptr, &file.fd, sizeof(int));
return sendmsg(sockfd, &msg, 0);
}
/*
* Receives a file descriptor from the sockfd provided. Returns the file
* descriptor as sent from sendfd(). It will return the file descriptor
* or die (literally) trying. Any synchronisation and preparation of
* state should be done external to this (we expect the other side to be
* in sendfd() in the code).
*/
struct file_t recvfd(int sockfd)
{
struct msghdr msg = {0};
struct iovec iov[1] = {0};
struct cmsghdr *cmsg;
struct file_t file = {0};
int *fdptr;
int olderrno;
union {
char buf[CMSG_SPACE(sizeof(file.fd))];
struct cmsghdr align;
} u;
/* Allocate a buffer. */
/* TODO: Make this dynamic with MSG_PEEK. */
file.name = malloc(TAG_BUFFER);
if (!file.name)
error("recvfd: failed to allocate file.tag buffer\n");
/*
* We need to "recieve" the non-ancillary data even though we don't
* plan to use it at all. Otherwise, things won't work as expected.
* See unix(7) and other well-hidden documentation.
*/
iov[0].iov_base = file.name;
iov[0].iov_len = TAG_BUFFER;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_control = u.buf;
msg.msg_controllen = sizeof(u.buf);
ssize_t ret = recvmsg(sockfd, &msg, 0);
if (ret < 0)
goto err;
cmsg = CMSG_FIRSTHDR(&msg);
if (!cmsg)
error("recvfd: got NULL from CMSG_FIRSTHDR");
if (cmsg->cmsg_level != SOL_SOCKET)
error("recvfd: expected SOL_SOCKET in cmsg: %d", cmsg->cmsg_level);
if (cmsg->cmsg_type != SCM_RIGHTS)
error("recvfd: expected SCM_RIGHTS in cmsg: %d", cmsg->cmsg_type);
if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
error("recvfd: expected correct CMSG_LEN in cmsg: %lu", cmsg->cmsg_len);
fdptr = (int *) CMSG_DATA(cmsg);
if (!fdptr || *fdptr < 0)
error("recvfd: recieved invalid pointer");
file.fd = *fdptr;
return file;
err:
olderrno = errno;
free(file.name);
errno = olderrno;
return (struct file_t){0};
}

View file

@ -0,0 +1,57 @@
// +build linux
package utils
/*
* Copyright 2016 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
#include <errno.h>
#include <stdlib.h>
#include "cmsg.h"
*/
import "C"
import (
"os"
"unsafe"
)
// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
// socket. The file name of the remote file descriptor will be recreated
// locally (it is sent as non-auxiliary data in the same payload).
func RecvFd(socket *os.File) (*os.File, error) {
file, err := C.recvfd(C.int(socket.Fd()))
if err != nil {
return nil, err
}
defer C.free(unsafe.Pointer(file.name))
return os.NewFile(uintptr(file.fd), C.GoString(file.name)), nil
}
// SendFd sends a file descriptor over the given AF_UNIX socket. In
// addition, the file.Name() of the given file will also be sent as
// non-auxiliary data in the same payload (allowing to send contextual
// information for a file descriptor).
func SendFd(socket, file *os.File) error {
var cfile C.struct_file_t
cfile.fd = C.int(file.Fd())
cfile.name = C.CString(file.Name())
defer C.free(unsafe.Pointer(cfile.name))
_, err := C.sendfd(C.int(socket.Fd()), cfile)
return err
}

View file

@ -0,0 +1,36 @@
/*
* Copyright 2016 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#if !defined(CMSG_H)
#define CMSG_H
#include <sys/types.h>
/* TODO: Implement this properly with MSG_PEEK. */
#define TAG_BUFFER 4096
/* This mirrors Go's (*os.File). */
struct file_t {
char *name;
int fd;
};
struct file_t recvfd(int sockfd);
ssize_t sendfd(int sockfd, struct file_t file);
#endif /* !defined(CMSG_H) */

View file

@ -0,0 +1,126 @@
package utils
import (
"crypto/rand"
"encoding/hex"
"encoding/json"
"io"
"os"
"path/filepath"
"strings"
"syscall"
"unsafe"
)
const (
exitSignalOffset = 128
)
// GenerateRandomName returns a new name joined with a prefix. This size
// specified is used to truncate the randomly generated value
func GenerateRandomName(prefix string, size int) (string, error) {
id := make([]byte, 32)
if _, err := io.ReadFull(rand.Reader, id); err != nil {
return "", err
}
if size > 64 {
size = 64
}
return prefix + hex.EncodeToString(id)[:size], nil
}
// ResolveRootfs ensures that the current working directory is
// not a symlink and returns the absolute path to the rootfs
func ResolveRootfs(uncleanRootfs string) (string, error) {
rootfs, err := filepath.Abs(uncleanRootfs)
if err != nil {
return "", err
}
return filepath.EvalSymlinks(rootfs)
}
// ExitStatus returns the correct exit status for a process based on if it
// was signaled or exited cleanly
func ExitStatus(status syscall.WaitStatus) int {
if status.Signaled() {
return exitSignalOffset + int(status.Signal())
}
return status.ExitStatus()
}
// WriteJSON writes the provided struct v to w using standard json marshaling
func WriteJSON(w io.Writer, v interface{}) error {
data, err := json.Marshal(v)
if err != nil {
return err
}
_, err = w.Write(data)
return err
}
// CleanPath makes a path safe for use with filepath.Join. This is done by not
// only cleaning the path, but also (if the path is relative) adding a leading
// '/' and cleaning it (then removing the leading '/'). This ensures that a
// path resulting from prepending another path will always resolve to lexically
// be a subdirectory of the prefixed path. This is all done lexically, so paths
// that include symlinks won't be safe as a result of using CleanPath.
func CleanPath(path string) string {
// Deal with empty strings nicely.
if path == "" {
return ""
}
// Ensure that all paths are cleaned (especially problematic ones like
// "/../../../../../" which can cause lots of issues).
path = filepath.Clean(path)
// If the path isn't absolute, we need to do more processing to fix paths
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
// paths to relative ones.
if !filepath.IsAbs(path) {
path = filepath.Clean(string(os.PathSeparator) + path)
// This can't fail, as (by definition) all paths are relative to root.
path, _ = filepath.Rel(string(os.PathSeparator), path)
}
// Clean the path again for good measure.
return filepath.Clean(path)
}
// SearchLabels searches a list of key-value pairs for the provided key and
// returns the corresponding value. The pairs must be separated with '='.
func SearchLabels(labels []string, query string) string {
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == query {
return parts[1]
}
}
return ""
}
// Annotations returns the bundle path and user defined annotations from the
// libcontainer state. We need to remove the bundle because that is a label
// added by libcontainer.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
userAnnotations = make(map[string]string)
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == "bundle" {
bundle = parts[1]
} else {
userAnnotations[parts[0]] = parts[1]
}
}
return
}
func GetIntSize() int {
return int(unsafe.Sizeof(1))
}

View file

@ -0,0 +1,33 @@
// +build !windows
package utils
import (
"io/ioutil"
"strconv"
"syscall"
)
func CloseExecFrom(minFd int) error {
fdList, err := ioutil.ReadDir("/proc/self/fd")
if err != nil {
return err
}
for _, fi := range fdList {
fd, err := strconv.Atoi(fi.Name())
if err != nil {
// ignore non-numeric file names
continue
}
if fd < minFd {
// ignore descriptors lower than our specified minimum
continue
}
// intentionally ignore errors from syscall.CloseOnExec
syscall.CloseOnExec(fd)
// the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall)
}
return nil
}