665 lines
15 KiB
Go
665 lines
15 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
"time"
|
|
|
|
winio "github.com/Microsoft/go-winio"
|
|
"github.com/Microsoft/hcsshim/internal/guid"
|
|
"github.com/Microsoft/hcsshim/internal/hcs"
|
|
"github.com/Microsoft/hcsshim/internal/hcsoci"
|
|
"github.com/Microsoft/hcsshim/internal/regstate"
|
|
"github.com/Microsoft/hcsshim/internal/uvm"
|
|
specs "github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/sys/windows"
|
|
)
|
|
|
|
var errContainerStopped = errors.New("container is stopped")
|
|
|
|
type persistedState struct {
|
|
ID string
|
|
SandboxID string
|
|
HostID string
|
|
Bundle string
|
|
Created time.Time
|
|
Rootfs string
|
|
Spec *specs.Spec
|
|
RequestedNetNS string
|
|
IsHost bool
|
|
UniqueID guid.GUID
|
|
HostUniqueID guid.GUID
|
|
}
|
|
|
|
type containerStatus string
|
|
|
|
const (
|
|
containerRunning containerStatus = "running"
|
|
containerStopped containerStatus = "stopped"
|
|
containerCreated containerStatus = "created"
|
|
containerPaused containerStatus = "paused"
|
|
containerUnknown containerStatus = "unknown"
|
|
|
|
keyState = "state"
|
|
keyResources = "resources"
|
|
keyShimPid = "shim"
|
|
keyInitPid = "pid"
|
|
keyNetNS = "netns"
|
|
)
|
|
|
|
type container struct {
|
|
persistedState
|
|
ShimPid int
|
|
hc *hcs.System
|
|
resources *hcsoci.Resources
|
|
}
|
|
|
|
func getErrorFromPipe(pipe io.Reader, p *os.Process) error {
|
|
serr, err := ioutil.ReadAll(pipe)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if bytes.Equal(serr, shimSuccess) {
|
|
return nil
|
|
}
|
|
|
|
extra := ""
|
|
if p != nil {
|
|
p.Kill()
|
|
state, err := p.Wait()
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
extra = fmt.Sprintf(", exit code %d", state.Sys().(syscall.WaitStatus).ExitCode)
|
|
}
|
|
if len(serr) == 0 {
|
|
return fmt.Errorf("unknown shim failure%s", extra)
|
|
}
|
|
|
|
return errors.New(string(serr))
|
|
}
|
|
|
|
func startProcessShim(id, pidFile, logFile string, spec *specs.Process) (_ *os.Process, err error) {
|
|
// Ensure the stdio handles inherit to the child process. This isn't undone
|
|
// after the StartProcess call because the caller never launches another
|
|
// process before exiting.
|
|
for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
|
|
err = windows.SetHandleInformation(windows.Handle(f.Fd()), windows.HANDLE_FLAG_INHERIT, windows.HANDLE_FLAG_INHERIT)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
args := []string{
|
|
"--stdin", strconv.Itoa(int(os.Stdin.Fd())),
|
|
"--stdout", strconv.Itoa(int(os.Stdout.Fd())),
|
|
"--stderr", strconv.Itoa(int(os.Stderr.Fd())),
|
|
}
|
|
if spec != nil {
|
|
args = append(args, "--exec")
|
|
}
|
|
args = append(args, id)
|
|
return launchShim("shim", pidFile, logFile, args, spec)
|
|
}
|
|
|
|
func launchShim(cmd, pidFile, logFile string, args []string, data interface{}) (_ *os.Process, err error) {
|
|
executable, err := os.Executable()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Create a pipe to use as stderr for the shim process. This is used to
|
|
// retrieve early error information, up to the point that the shim is ready
|
|
// to launch a process in the container.
|
|
rp, wp, err := os.Pipe()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rp.Close()
|
|
defer wp.Close()
|
|
|
|
// Create a pipe to send the data, if one is provided.
|
|
var rdatap, wdatap *os.File
|
|
if data != nil {
|
|
rdatap, wdatap, err = os.Pipe()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rdatap.Close()
|
|
defer wdatap.Close()
|
|
}
|
|
|
|
var log *os.File
|
|
fullargs := []string{os.Args[0]}
|
|
if logFile != "" {
|
|
log, err = os.OpenFile(logFile, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0666)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer log.Close()
|
|
|
|
fullargs = append(fullargs, "--log-format", logFormat)
|
|
if logrus.GetLevel() == logrus.DebugLevel {
|
|
fullargs = append(fullargs, "--debug")
|
|
}
|
|
}
|
|
fullargs = append(fullargs, cmd)
|
|
fullargs = append(fullargs, args...)
|
|
attr := &os.ProcAttr{
|
|
Files: []*os.File{rdatap, wp, log},
|
|
}
|
|
p, err := os.StartProcess(executable, fullargs, attr)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer func() {
|
|
if err != nil {
|
|
p.Kill()
|
|
}
|
|
}()
|
|
|
|
wp.Close()
|
|
|
|
// Write the data if provided.
|
|
if data != nil {
|
|
rdatap.Close()
|
|
dataj, err := json.Marshal(data)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
_, err = wdatap.Write(dataj)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
wdatap.Close()
|
|
}
|
|
|
|
err = getErrorFromPipe(rp, p)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if pidFile != "" {
|
|
if err = createPidFile(pidFile, p.Pid); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
return p, nil
|
|
}
|
|
|
|
func parseSandboxAnnotations(spec *specs.Spec) (string, bool) {
|
|
a := spec.Annotations
|
|
var t, id string
|
|
if t = a["io.kubernetes.cri.container-type"]; t != "" {
|
|
id = a["io.kubernetes.cri.sandbox-id"]
|
|
} else if t = a["io.kubernetes.cri-o.ContainerType"]; t != "" {
|
|
id = a["io.kubernetes.cri-o.SandboxID"]
|
|
} else if t = a["io.kubernetes.docker.type"]; t != "" {
|
|
id = a["io.kubernetes.sandbox.id"]
|
|
if t == "podsandbox" {
|
|
t = "sandbox"
|
|
}
|
|
}
|
|
if t == "container" {
|
|
return id, false
|
|
}
|
|
if t == "sandbox" {
|
|
return id, true
|
|
}
|
|
return "", false
|
|
}
|
|
|
|
func (c *container) startVMShim(logFile string, consolePipe string) (*os.Process, error) {
|
|
opts := &uvm.UVMOptions{
|
|
ID: vmID(c.ID),
|
|
ConsolePipe: consolePipe,
|
|
}
|
|
if c.Spec.Windows != nil {
|
|
opts.Resources = c.Spec.Windows.Resources
|
|
}
|
|
|
|
if c.Spec.Linux != nil {
|
|
opts.OperatingSystem = "linux"
|
|
} else {
|
|
opts.OperatingSystem = "windows"
|
|
layers := make([]string, len(c.Spec.Windows.LayerFolders))
|
|
for i, f := range c.Spec.Windows.LayerFolders {
|
|
if i == len(c.Spec.Windows.LayerFolders)-1 {
|
|
f = filepath.Join(f, "vm")
|
|
err := os.MkdirAll(f, 0)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
layers[i] = f
|
|
}
|
|
opts.LayerFolders = layers
|
|
}
|
|
return launchShim("vmshim", "", logFile, []string{c.VMPipePath()}, opts)
|
|
}
|
|
|
|
type containerConfig struct {
|
|
ID string
|
|
HostID string
|
|
PidFile string
|
|
ShimLogFile, VMLogFile string
|
|
Spec *specs.Spec
|
|
VMConsolePipe string
|
|
}
|
|
|
|
func createContainer(cfg *containerConfig) (_ *container, err error) {
|
|
// Store the container information in a volatile registry key.
|
|
cwd, err := os.Getwd()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
vmisolated := cfg.Spec.Linux != nil || (cfg.Spec.Windows != nil && cfg.Spec.Windows.HyperV != nil)
|
|
|
|
sandboxID, isSandbox := parseSandboxAnnotations(cfg.Spec)
|
|
hostID := cfg.HostID
|
|
if isSandbox {
|
|
if sandboxID != cfg.ID {
|
|
return nil, errors.New("sandbox ID must match ID")
|
|
}
|
|
} else if sandboxID != "" {
|
|
// Validate that the sandbox container exists.
|
|
sandbox, err := getContainer(sandboxID, false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer sandbox.Close()
|
|
if sandbox.SandboxID != sandboxID {
|
|
return nil, fmt.Errorf("container %s is not a sandbox", sandboxID)
|
|
}
|
|
if hostID == "" {
|
|
// Use the sandbox's host.
|
|
hostID = sandbox.HostID
|
|
} else if sandbox.HostID == "" {
|
|
return nil, fmt.Errorf("sandbox container %s is not running in a VM host, but host %s was specified", sandboxID, hostID)
|
|
} else if hostID != sandbox.HostID {
|
|
return nil, fmt.Errorf("sandbox container %s has a different host %s from the requested host %s", sandboxID, sandbox.HostID, hostID)
|
|
}
|
|
if vmisolated && hostID == "" {
|
|
return nil, fmt.Errorf("container %s is not a VM isolated sandbox", sandboxID)
|
|
}
|
|
}
|
|
|
|
uniqueID := guid.New()
|
|
|
|
newvm := false
|
|
var hostUniqueID guid.GUID
|
|
if hostID != "" {
|
|
host, err := getContainer(hostID, false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer host.Close()
|
|
if !host.IsHost {
|
|
return nil, fmt.Errorf("host container %s is not a VM host", hostID)
|
|
}
|
|
hostUniqueID = host.UniqueID
|
|
} else if vmisolated && (isSandbox || cfg.Spec.Linux != nil) {
|
|
hostID = cfg.ID
|
|
newvm = true
|
|
hostUniqueID = uniqueID
|
|
}
|
|
|
|
// Make absolute the paths in Root.Path and Windows.LayerFolders.
|
|
rootfs := ""
|
|
if cfg.Spec.Root != nil {
|
|
rootfs = cfg.Spec.Root.Path
|
|
if rootfs != "" && !filepath.IsAbs(rootfs) && !strings.HasPrefix(rootfs, `\\?\`) {
|
|
rootfs = filepath.Join(cwd, rootfs)
|
|
cfg.Spec.Root.Path = rootfs
|
|
}
|
|
}
|
|
|
|
netNS := ""
|
|
if cfg.Spec.Windows != nil {
|
|
for i, f := range cfg.Spec.Windows.LayerFolders {
|
|
if !filepath.IsAbs(f) && !strings.HasPrefix(rootfs, `\\?\`) {
|
|
cfg.Spec.Windows.LayerFolders[i] = filepath.Join(cwd, f)
|
|
}
|
|
}
|
|
|
|
// Determine the network namespace to use.
|
|
if cfg.Spec.Windows.Network != nil && cfg.Spec.Windows.Network.NetworkSharedContainerName != "" {
|
|
err = stateKey.Get(cfg.Spec.Windows.Network.NetworkSharedContainerName, keyNetNS, &netNS)
|
|
if err != nil {
|
|
if _, ok := err.(*regstate.NoStateError); !ok {
|
|
return nil, err
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Store the initial container state in the registry so that the delete
|
|
// command can clean everything up if something goes wrong.
|
|
c := &container{
|
|
persistedState: persistedState{
|
|
ID: cfg.ID,
|
|
Bundle: cwd,
|
|
Rootfs: rootfs,
|
|
Created: time.Now(),
|
|
Spec: cfg.Spec,
|
|
SandboxID: sandboxID,
|
|
HostID: hostID,
|
|
IsHost: newvm,
|
|
RequestedNetNS: netNS,
|
|
UniqueID: uniqueID,
|
|
HostUniqueID: hostUniqueID,
|
|
},
|
|
}
|
|
err = stateKey.Create(cfg.ID, keyState, &c.persistedState)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer func() {
|
|
if err != nil {
|
|
c.Remove()
|
|
}
|
|
}()
|
|
|
|
// Start a VM if necessary.
|
|
if newvm {
|
|
shim, err := c.startVMShim(cfg.VMLogFile, cfg.VMConsolePipe)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
shim.Release()
|
|
}
|
|
|
|
if c.HostID != "" {
|
|
// Call to the VM shim process to create the container. This is done so
|
|
// that the VM process can keep track of the VM's virtual hardware
|
|
// resource use.
|
|
err = c.issueVMRequest(opCreateContainer)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
c.hc, err = hcs.OpenComputeSystem(cfg.ID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
} else {
|
|
// Create the container directly from this process.
|
|
err = createContainerInHost(c, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
// Create the shim process for the container.
|
|
err = startContainerShim(c, cfg.PidFile, cfg.ShimLogFile)
|
|
if err != nil {
|
|
if e := c.Kill(); e == nil {
|
|
c.Remove()
|
|
}
|
|
return nil, err
|
|
}
|
|
|
|
return c, nil
|
|
}
|
|
|
|
func (c *container) ShimPipePath() string {
|
|
return safePipePath("runhcs-shim-" + c.UniqueID.String())
|
|
}
|
|
|
|
func (c *container) VMPipePath() string {
|
|
return safePipePath("runhcs-vm-" + c.HostUniqueID.String())
|
|
}
|
|
|
|
func (c *container) VMIsolated() bool {
|
|
return c.HostID != ""
|
|
}
|
|
|
|
func (c *container) unmountInHost(vm *uvm.UtilityVM, all bool) error {
|
|
resources := &hcsoci.Resources{}
|
|
err := stateKey.Get(c.ID, keyResources, resources)
|
|
if _, ok := err.(*regstate.NoStateError); ok {
|
|
return nil
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = hcsoci.ReleaseResources(resources, vm, all)
|
|
if err != nil {
|
|
stateKey.Set(c.ID, keyResources, resources)
|
|
return err
|
|
}
|
|
|
|
err = stateKey.Clear(c.ID, keyResources)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *container) Unmount(all bool) error {
|
|
if c.VMIsolated() {
|
|
op := opUnmountContainerDiskOnly
|
|
if all {
|
|
op = opUnmountContainer
|
|
}
|
|
err := c.issueVMRequest(op)
|
|
if err != nil {
|
|
if _, ok := err.(*noVMError); ok {
|
|
logrus.Warnf("did not unmount resources for container %s because VM shim for %s could not be contacted", c.ID, c.HostID)
|
|
} else {
|
|
return err
|
|
}
|
|
}
|
|
} else {
|
|
c.unmountInHost(nil, false)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func createContainerInHost(c *container, vm *uvm.UtilityVM) (err error) {
|
|
if c.hc != nil {
|
|
return errors.New("container already created")
|
|
}
|
|
|
|
// Create the container without starting it.
|
|
opts := &hcsoci.CreateOptions{
|
|
ID: c.ID,
|
|
Spec: c.Spec,
|
|
HostingSystem: vm,
|
|
NetworkNamespace: c.RequestedNetNS,
|
|
}
|
|
vmid := ""
|
|
if vm != nil {
|
|
vmid = vm.ID()
|
|
}
|
|
logrus.Infof("creating container %s (VM: '%s')", c.ID, vmid)
|
|
hc, resources, err := hcsoci.CreateContainer(opts)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer func() {
|
|
if err != nil {
|
|
hc.Terminate()
|
|
hc.Wait()
|
|
hcsoci.ReleaseResources(resources, vm, true)
|
|
}
|
|
}()
|
|
|
|
// Record the network namespace to support namespace sharing by container ID.
|
|
if resources.NetNS() != "" {
|
|
err = stateKey.Set(c.ID, keyNetNS, resources.NetNS)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
err = stateKey.Set(c.ID, keyResources, resources)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
c.hc = hc
|
|
return nil
|
|
}
|
|
|
|
func startContainerShim(c *container, pidFile, logFile string) error {
|
|
// Launch a shim process to later execute a process in the container.
|
|
shim, err := startProcessShim(c.ID, pidFile, logFile, nil)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer shim.Release()
|
|
defer func() {
|
|
if err != nil {
|
|
shim.Kill()
|
|
}
|
|
}()
|
|
|
|
c.ShimPid = shim.Pid
|
|
err = stateKey.Set(c.ID, keyShimPid, shim.Pid)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if pidFile != "" {
|
|
if err = createPidFile(pidFile, shim.Pid); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *container) Close() error {
|
|
if c.hc == nil {
|
|
return nil
|
|
}
|
|
return c.hc.Close()
|
|
}
|
|
|
|
func (c *container) Exec() error {
|
|
err := c.hc.Start()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if c.Spec.Process == nil {
|
|
return nil
|
|
}
|
|
|
|
// Alert the shim that the container is ready.
|
|
pipe, err := winio.DialPipe(c.ShimPipePath(), nil)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer pipe.Close()
|
|
|
|
shim, err := os.FindProcess(c.ShimPid)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer shim.Release()
|
|
|
|
err = getErrorFromPipe(pipe, shim)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func getContainer(id string, notStopped bool) (*container, error) {
|
|
var c container
|
|
err := stateKey.Get(id, keyState, &c.persistedState)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
err = stateKey.Get(id, keyShimPid, &c.ShimPid)
|
|
if err != nil {
|
|
if _, ok := err.(*regstate.NoStateError); !ok {
|
|
return nil, err
|
|
}
|
|
c.ShimPid = -1
|
|
}
|
|
if notStopped && c.ShimPid == 0 {
|
|
return nil, errContainerStopped
|
|
}
|
|
|
|
hc, err := hcs.OpenComputeSystem(c.ID)
|
|
if err == nil {
|
|
c.hc = hc
|
|
} else if !hcs.IsNotExist(err) {
|
|
return nil, err
|
|
} else if notStopped {
|
|
return nil, errContainerStopped
|
|
}
|
|
|
|
return &c, nil
|
|
}
|
|
|
|
func (c *container) Remove() error {
|
|
// Unmount any layers or mapped volumes.
|
|
err := c.Unmount(!c.IsHost)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Follow kata's example and delay tearing down the VM until the owning
|
|
// container is removed.
|
|
if c.IsHost {
|
|
vm, err := hcs.OpenComputeSystem(vmID(c.ID))
|
|
if err == nil {
|
|
if err := vm.Terminate(); hcs.IsPending(err) {
|
|
vm.Wait()
|
|
}
|
|
}
|
|
}
|
|
return stateKey.Remove(c.ID)
|
|
}
|
|
|
|
func (c *container) Kill() error {
|
|
if c.hc == nil {
|
|
return nil
|
|
}
|
|
err := c.hc.Terminate()
|
|
if hcs.IsPending(err) {
|
|
err = c.hc.Wait()
|
|
}
|
|
if hcs.IsAlreadyStopped(err) {
|
|
err = nil
|
|
}
|
|
return err
|
|
}
|
|
|
|
func (c *container) Status() (containerStatus, error) {
|
|
if c.hc == nil || c.ShimPid == 0 {
|
|
return containerStopped, nil
|
|
}
|
|
props, err := c.hc.Properties()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
state := containerUnknown
|
|
switch props.State {
|
|
case "", "Created":
|
|
state = containerCreated
|
|
case "Running":
|
|
state = containerRunning
|
|
case "Paused":
|
|
state = containerPaused
|
|
case "Stopped":
|
|
state = containerStopped
|
|
}
|
|
return state, nil
|
|
}
|