cri-o/server/sandbox_run_linux.go

520 lines
15 KiB
Go

// +build linux
package server
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"path"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/containers/storage"
"github.com/kubernetes-incubator/cri-o/lib/sandbox"
"github.com/kubernetes-incubator/cri-o/oci"
"github.com/kubernetes-incubator/cri-o/pkg/annotations"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/runtime-tools/generate"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/net/context"
"golang.org/x/sys/unix"
pb "k8s.io/kubernetes/pkg/kubelet/apis/cri/runtime/v1alpha2"
"k8s.io/kubernetes/pkg/kubelet/leaky"
"k8s.io/kubernetes/pkg/kubelet/types"
)
func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest) (resp *pb.RunPodSandboxResponse, err error) {
const operation = "run_pod_sandbox"
defer func() {
recordOperation(operation, time.Now())
recordError(operation, err)
}()
s.updateLock.RLock()
defer s.updateLock.RUnlock()
if req.GetConfig().GetMetadata() == nil {
return nil, fmt.Errorf("CreateContainerRequest.ContainerConfig.Metadata is nil")
}
logrus.Debugf("RunPodSandboxRequest %+v", req)
var processLabel, mountLabel, resolvPath string
// process req.Name
kubeName := req.GetConfig().GetMetadata().GetName()
if kubeName == "" {
return nil, fmt.Errorf("PodSandboxConfig.Name should not be empty")
}
namespace := req.GetConfig().GetMetadata().GetNamespace()
attempt := req.GetConfig().GetMetadata().GetAttempt()
id, name, err := s.generatePodIDandName(req.GetConfig())
if err != nil {
if strings.Contains(err.Error(), "already reserved for pod") {
matches := conflictRE.FindStringSubmatch(err.Error())
if len(matches) != 2 {
return nil, err
}
dupID := matches[1]
if _, err := s.StopPodSandbox(ctx, &pb.StopPodSandboxRequest{PodSandboxId: dupID}); err != nil {
return nil, err
}
if _, err := s.RemovePodSandbox(ctx, &pb.RemovePodSandboxRequest{PodSandboxId: dupID}); err != nil {
return nil, err
}
id, name, err = s.generatePodIDandName(req.GetConfig())
if err != nil {
return nil, err
}
} else {
return nil, err
}
}
defer func() {
if err != nil {
s.ReleasePodName(name)
}
}()
_, containerName, err := s.generateContainerIDandNameForSandbox(req.GetConfig())
if err != nil {
return nil, err
}
defer func() {
if err != nil {
s.ReleaseContainerName(containerName)
}
}()
podContainer, err := s.StorageRuntimeServer().CreatePodSandbox(s.ImageContext(),
name, id,
s.config.PauseImage, "",
containerName,
req.GetConfig().GetMetadata().GetName(),
req.GetConfig().GetMetadata().GetUid(),
namespace,
attempt,
nil)
if errors.Cause(err) == storage.ErrDuplicateName {
return nil, fmt.Errorf("pod sandbox with name %q already exists", name)
}
if err != nil {
return nil, fmt.Errorf("error creating pod sandbox with name %q: %v", name, err)
}
defer func() {
if err != nil {
if err2 := s.StorageRuntimeServer().RemovePodSandbox(id); err2 != nil {
logrus.Warnf("couldn't cleanup pod sandbox %q: %v", id, err2)
}
}
}()
// TODO: factor generating/updating the spec into something other projects can vendor
// creates a spec Generator with the default spec.
g := generate.New()
// setup defaults for the pod sandbox
g.SetRootReadonly(true)
if s.config.PauseCommand == "" {
if podContainer.Config != nil {
g.SetProcessArgs(podContainer.Config.Config.Cmd)
} else {
g.SetProcessArgs([]string{sandbox.PodInfraCommand})
}
} else {
g.SetProcessArgs([]string{s.config.PauseCommand})
}
// set DNS options
if req.GetConfig().GetDnsConfig() != nil {
dnsServers := req.GetConfig().GetDnsConfig().Servers
dnsSearches := req.GetConfig().GetDnsConfig().Searches
dnsOptions := req.GetConfig().GetDnsConfig().Options
resolvPath = fmt.Sprintf("%s/resolv.conf", podContainer.RunDir)
err = parseDNSOptions(dnsServers, dnsSearches, dnsOptions, resolvPath)
if err != nil {
err1 := removeFile(resolvPath)
if err1 != nil {
err = err1
return nil, fmt.Errorf("%v; failed to remove %s: %v", err, resolvPath, err1)
}
return nil, err
}
if err := label.Relabel(resolvPath, mountLabel, true); err != nil && err != unix.ENOTSUP {
return nil, err
}
mnt := runtimespec.Mount{
Type: "bind",
Source: resolvPath,
Destination: "/etc/resolv.conf",
Options: []string{"ro", "bind"},
}
g.AddMount(mnt)
}
// add metadata
metadata := req.GetConfig().GetMetadata()
metadataJSON, err := json.Marshal(metadata)
if err != nil {
return nil, err
}
// add labels
labels := req.GetConfig().GetLabels()
if err := validateLabels(labels); err != nil {
return nil, err
}
// Add special container name label for the infra container
labelsJSON := []byte{}
if labels != nil {
labels[types.KubernetesContainerNameLabel] = leaky.PodInfraContainerName
labelsJSON, err = json.Marshal(labels)
if err != nil {
return nil, err
}
}
// add annotations
kubeAnnotations := req.GetConfig().GetAnnotations()
kubeAnnotationsJSON, err := json.Marshal(kubeAnnotations)
if err != nil {
return nil, err
}
// set log directory
logDir := req.GetConfig().GetLogDirectory()
if logDir == "" {
logDir = filepath.Join(s.config.LogDir, id)
}
if err = os.MkdirAll(logDir, 0700); err != nil {
return nil, err
}
// This should always be absolute from k8s.
if !filepath.IsAbs(logDir) {
return nil, fmt.Errorf("requested logDir for sbox id %s is a relative path: %s", id, logDir)
}
privileged := s.privilegedSandbox(req)
securityContext := req.GetConfig().GetLinux().GetSecurityContext()
if securityContext == nil {
logrus.Warn("no security context found in config.")
}
nsOptsJSON, err := json.Marshal(securityContext.GetNamespaceOptions())
if err != nil {
return nil, err
}
processLabel, mountLabel, err = getSELinuxLabels(securityContext.GetSelinuxOptions(), privileged)
if err != nil {
return nil, err
}
// Don't use SELinux separation with Host Pid or IPC Namespace or privileged.
if securityContext.GetNamespaceOptions().GetPid() == pb.NamespaceMode_NODE ||
securityContext.GetNamespaceOptions().GetIpc() == pb.NamespaceMode_NODE {
processLabel, mountLabel = "", ""
}
g.SetProcessSelinuxLabel(processLabel)
g.SetLinuxMountLabel(mountLabel)
// create shm mount for the pod containers.
var shmPath string
if securityContext.GetNamespaceOptions().GetIpc() == pb.NamespaceMode_NODE {
shmPath = "/dev/shm"
} else {
shmPath, err = setupShm(podContainer.RunDir, mountLabel)
if err != nil {
return nil, err
}
defer func() {
if err != nil {
if err2 := unix.Unmount(shmPath, unix.MNT_DETACH); err2 != nil {
logrus.Warnf("failed to unmount shm for pod: %v", err2)
}
}
}()
}
err = s.setPodSandboxMountLabel(id, mountLabel)
if err != nil {
return nil, err
}
if err = s.CtrIDIndex().Add(id); err != nil {
return nil, err
}
defer func() {
if err != nil {
if err2 := s.CtrIDIndex().Delete(id); err2 != nil {
logrus.Warnf("couldn't delete ctr id %s from idIndex", id)
}
}
}()
// set log path inside log directory
logPath := filepath.Join(logDir, id+".log")
// Handle https://issues.k8s.io/44043
if err := ensureSaneLogPath(logPath); err != nil {
return nil, err
}
hostNetwork := securityContext.GetNamespaceOptions().GetNetwork() == pb.NamespaceMode_NODE
hostname, err := getHostname(id, req.GetConfig().Hostname, hostNetwork)
if err != nil {
return nil, err
}
g.SetHostname(hostname)
trusted := s.trustedSandbox(req)
g.AddAnnotation(annotations.Metadata, string(metadataJSON))
g.AddAnnotation(annotations.Labels, string(labelsJSON))
g.AddAnnotation(annotations.Annotations, string(kubeAnnotationsJSON))
g.AddAnnotation(annotations.LogPath, logPath)
g.AddAnnotation(annotations.Name, name)
g.AddAnnotation(annotations.Namespace, namespace)
g.AddAnnotation(annotations.ContainerType, annotations.ContainerTypeSandbox)
g.AddAnnotation(annotations.SandboxID, id)
g.AddAnnotation(annotations.ContainerName, containerName)
g.AddAnnotation(annotations.ContainerID, id)
g.AddAnnotation(annotations.ShmPath, shmPath)
g.AddAnnotation(annotations.PrivilegedRuntime, fmt.Sprintf("%v", privileged))
g.AddAnnotation(annotations.TrustedSandbox, fmt.Sprintf("%v", trusted))
g.AddAnnotation(annotations.ResolvPath, resolvPath)
g.AddAnnotation(annotations.HostName, hostname)
g.AddAnnotation(annotations.NamespaceOptions, string(nsOptsJSON))
g.AddAnnotation(annotations.KubeName, kubeName)
if podContainer.Config.Config.StopSignal != "" {
// this key is defined in image-spec conversion document at https://github.com/opencontainers/image-spec/pull/492/files#diff-8aafbe2c3690162540381b8cdb157112R57
g.AddAnnotation("org.opencontainers.image.stopSignal", podContainer.Config.Config.StopSignal)
}
created := time.Now()
g.AddAnnotation(annotations.Created, created.Format(time.RFC3339Nano))
portMappings := convertPortMappings(req.GetConfig().GetPortMappings())
portMappingsJSON, err := json.Marshal(portMappings)
if err != nil {
return nil, err
}
g.AddAnnotation(annotations.PortMappings, string(portMappingsJSON))
// setup cgroup settings
cgroupParent := req.GetConfig().GetLinux().GetCgroupParent()
if cgroupParent != "" {
if s.config.CgroupManager == oci.SystemdCgroupsManager {
if len(cgroupParent) <= 6 || !strings.HasSuffix(path.Base(cgroupParent), ".slice") {
return nil, fmt.Errorf("cri-o configured with systemd cgroup manager, but did not receive slice as parent: %s", cgroupParent)
}
cgPath, err := convertCgroupFsNameToSystemd(cgroupParent)
if err != nil {
return nil, err
}
g.SetLinuxCgroupsPath(cgPath + ":" + "crio" + ":" + id)
cgroupParent = cgPath
} else {
if strings.HasSuffix(path.Base(cgroupParent), ".slice") {
return nil, fmt.Errorf("cri-o configured with cgroupfs cgroup manager, but received systemd slice as parent: %s", cgroupParent)
}
cgPath := filepath.Join(cgroupParent, scopePrefix+"-"+id)
g.SetLinuxCgroupsPath(cgPath)
}
}
g.AddAnnotation(annotations.CgroupParent, cgroupParent)
sb, err := sandbox.New(id, namespace, name, kubeName, logDir, labels, kubeAnnotations, processLabel, mountLabel, metadata, shmPath, cgroupParent, privileged, trusted, resolvPath, hostname, portMappings)
if err != nil {
return nil, err
}
s.addSandbox(sb)
defer func() {
if err != nil {
s.removeSandbox(id)
}
}()
if err = s.PodIDIndex().Add(id); err != nil {
return nil, err
}
defer func() {
if err != nil {
if err := s.PodIDIndex().Delete(id); err != nil {
logrus.Warnf("couldn't delete pod id %s from idIndex", id)
}
}
}()
for k, v := range kubeAnnotations {
g.AddAnnotation(k, v)
}
for k, v := range labels {
g.AddAnnotation(k, v)
}
// extract linux sysctls from annotations and pass down to oci runtime
for key, value := range req.GetConfig().GetLinux().GetSysctls() {
g.AddLinuxSysctl(key, value)
}
// Set OOM score adjust of the infra container to be very low
// so it doesn't get killed.
g.SetProcessOOMScoreAdj(PodInfraOOMAdj)
g.SetLinuxResourcesCPUShares(PodInfraCPUshares)
// set up namespaces
if hostNetwork {
err = g.RemoveLinuxNamespace(string(runtimespec.NetworkNamespace))
if err != nil {
return nil, err
}
} else {
// Create the sandbox network namespace
if err = sb.NetNsCreate(); err != nil {
return nil, err
}
defer func() {
if err == nil {
return
}
if netnsErr := sb.NetNsRemove(); netnsErr != nil {
logrus.Warnf("Failed to remove networking namespace: %v", netnsErr)
}
}()
// Pass the created namespace path to the runtime
err = g.AddOrReplaceLinuxNamespace(string(runtimespec.NetworkNamespace), sb.NetNsPath())
if err != nil {
return nil, err
}
}
if securityContext.GetNamespaceOptions().GetPid() == pb.NamespaceMode_NODE {
err = g.RemoveLinuxNamespace(string(runtimespec.PIDNamespace))
if err != nil {
return nil, err
}
}
if securityContext.GetNamespaceOptions().GetIpc() == pb.NamespaceMode_NODE {
err = g.RemoveLinuxNamespace(string(runtimespec.IPCNamespace))
if err != nil {
return nil, err
}
}
if !s.seccompEnabled {
g.Spec().Linux.Seccomp = nil
}
saveOptions := generate.ExportOptions{}
mountPoint, err := s.StorageRuntimeServer().StartContainer(id)
if err != nil {
return nil, fmt.Errorf("failed to mount container %s in pod sandbox %s(%s): %v", containerName, sb.Name(), id, err)
}
g.AddAnnotation(annotations.MountPoint, mountPoint)
g.SetRootPath(mountPoint)
hostnamePath := fmt.Sprintf("%s/hostname", podContainer.RunDir)
if err := ioutil.WriteFile(hostnamePath, []byte(hostname+"\n"), 0644); err != nil {
return nil, err
}
if err := label.Relabel(hostnamePath, mountLabel, true); err != nil && err != unix.ENOTSUP {
return nil, err
}
mnt := runtimespec.Mount{
Type: "bind",
Source: hostnamePath,
Destination: "/etc/hostname",
Options: []string{"ro", "bind"},
}
g.AddMount(mnt)
g.AddAnnotation(annotations.HostnamePath, hostnamePath)
sb.AddHostnamePath(hostnamePath)
container, err := oci.NewContainer(id, containerName, podContainer.RunDir, logPath, sb.NetNs().Path(), labels, g.Spec().Annotations, kubeAnnotations, "", "", "", nil, id, false, false, false, sb.Privileged(), sb.Trusted(), podContainer.Dir, created, podContainer.Config.Config.StopSignal)
if err != nil {
return nil, err
}
container.SetSpec(g.Spec())
container.SetMountPoint(mountPoint)
sb.SetInfraContainer(container)
var ip string
ip, err = s.networkStart(hostNetwork, sb)
if err != nil {
return nil, err
}
defer func() {
if err != nil {
s.networkStop(hostNetwork, sb)
}
}()
g.AddAnnotation(annotations.IP, ip)
sb.AddIP(ip)
sb.SetNamespaceOptions(securityContext.GetNamespaceOptions())
spp := req.GetConfig().GetLinux().GetSecurityContext().GetSeccompProfilePath()
g.AddAnnotation(annotations.SeccompProfilePath, spp)
sb.SetSeccompProfilePath(spp)
if !privileged {
if err = s.setupSeccomp(&g, spp); err != nil {
return nil, err
}
}
err = g.SaveToFile(filepath.Join(podContainer.Dir, "config.json"), saveOptions)
if err != nil {
return nil, fmt.Errorf("failed to save template configuration for pod sandbox %s(%s): %v", sb.Name(), id, err)
}
if err = g.SaveToFile(filepath.Join(podContainer.RunDir, "config.json"), saveOptions); err != nil {
return nil, fmt.Errorf("failed to write runtime configuration for pod sandbox %s(%s): %v", sb.Name(), id, err)
}
if err = s.runContainer(container, sb.CgroupParent()); err != nil {
return nil, err
}
s.addInfraContainer(container)
s.ContainerStateToDisk(container)
resp = &pb.RunPodSandboxResponse{PodSandboxId: id}
logrus.Debugf("RunPodSandboxResponse: %+v", resp)
return resp, nil
}
func setupShm(podSandboxRunDir, mountLabel string) (shmPath string, err error) {
shmPath = filepath.Join(podSandboxRunDir, "shm")
if err = os.Mkdir(shmPath, 0700); err != nil {
return "", err
}
shmOptions := "mode=1777,size=" + strconv.Itoa(sandbox.DefaultShmSize)
if err = unix.Mount("shm", shmPath, "tmpfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV,
label.FormatMountLabel(shmOptions, mountLabel)); err != nil {
return "", fmt.Errorf("failed to mount shm tmpfs for pod: %v", err)
}
return shmPath, nil
}