4ac92d73e4
The way we build the OCI Process Args slice is incorrect. With the current implementation we may for example end up building this slice with only the entry point arguments, if the kubelet passed information is missing the Command slice. We also will end up building the Args slice with the Image config process arguments, without the defined entry point, if kubelet does not tell us anything about the container process command to be run. This patch fixes that by favoring the kubelet ContainerConfig information. If that is missing, we try to complete it with the container image information. We always use ContainerConfig.Command[] or ImageConfig.EntryPoint[] as the first OCI Process Args slice entries. Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
515 lines
15 KiB
Go
515 lines
15 KiB
Go
package server
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"path/filepath"
|
|
"strings"
|
|
"syscall"
|
|
|
|
"github.com/Sirupsen/logrus"
|
|
"github.com/docker/docker/pkg/stringid"
|
|
"github.com/kubernetes-incubator/cri-o/oci"
|
|
"github.com/kubernetes-incubator/cri-o/server/apparmor"
|
|
"github.com/kubernetes-incubator/cri-o/server/seccomp"
|
|
"github.com/opencontainers/image-spec/specs-go/v1"
|
|
"github.com/opencontainers/runc/libcontainer/label"
|
|
"github.com/opencontainers/runtime-tools/generate"
|
|
"golang.org/x/net/context"
|
|
pb "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/runtime"
|
|
)
|
|
|
|
// Seccomp profile selector values accepted in the kubelet seccomp
// annotations (consumed by setupSeccomp below).
const (
	seccompUnconfined = "unconfined"
	seccompRuntimeDefault = "runtime/default"
	seccompLocalhostPrefix = "localhost/"
)
|
|
|
|
func addOciBindMounts(sb *sandbox, containerConfig *pb.ContainerConfig, specgen *generate.Generator) error {
|
|
mounts := containerConfig.GetMounts()
|
|
for _, mount := range mounts {
|
|
dest := mount.ContainerPath
|
|
if dest == "" {
|
|
return fmt.Errorf("Mount.ContainerPath is empty")
|
|
}
|
|
|
|
src := mount.HostPath
|
|
if src == "" {
|
|
return fmt.Errorf("Mount.HostPath is empty")
|
|
}
|
|
|
|
options := []string{"rw"}
|
|
if mount.Readonly {
|
|
options = []string{"ro"}
|
|
}
|
|
|
|
if mount.SelinuxRelabel {
|
|
// Need a way in kubernetes to determine if the volume is shared or private
|
|
if err := label.Relabel(src, sb.mountLabel, true); err != nil && err != syscall.ENOTSUP {
|
|
return fmt.Errorf("relabel failed %s: %v", src, err)
|
|
}
|
|
}
|
|
|
|
specgen.AddBindMount(src, dest, options)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// buildOCIProcessArgs build an OCI compatible process arguments slice.
|
|
func buildOCIProcessArgs(containerKubeConfig *pb.ContainerConfig, imageOCIConfig *v1.Image) ([]string, error) {
|
|
processArgs := []string{}
|
|
|
|
kubeCommands := containerKubeConfig.Command
|
|
kubeArgs := containerKubeConfig.Args
|
|
|
|
if imageOCIConfig == nil {
|
|
// HACK We should error out here, not being able to get an Image config is fatal.
|
|
// When https://github.com/kubernetes-incubator/cri-o/issues/395 is fixed
|
|
// we'll remove that one and return an error here.
|
|
if containerKubeConfig.Metadata != nil {
|
|
logrus.Errorf("empty image config for %s", containerKubeConfig.Metadata.Name)
|
|
|
|
// HACK until https://github.com/kubernetes-incubator/cri-o/issues/395 is fixed.
|
|
// If the container is kubeadm's dummy, imageOCIConfig is nil, and both
|
|
// kubeCommands and kubeArgs are empty. So we set processArgs to /pause as the
|
|
// dummy container is just a pause one.
|
|
// (See https://github.com/kubernetes/kubernetes/blob/master/cmd/kubeadm/app/master/templates.go)
|
|
if containerKubeConfig.Metadata.Name == "dummy" {
|
|
return []string{podInfraCommand}, nil
|
|
}
|
|
} else {
|
|
logrus.Errorf("empty image config for %s", containerKubeConfig.Image.Image)
|
|
}
|
|
}
|
|
|
|
// We got an OCI Image configuration.
|
|
// We will only use it if the kubelet information is incomplete.
|
|
if kubeCommands == nil {
|
|
if imageOCIConfig != nil && imageOCIConfig.Config.Entrypoint != nil {
|
|
processArgs = append(processArgs, imageOCIConfig.Config.Entrypoint...)
|
|
}
|
|
} else {
|
|
processArgs = append(processArgs, kubeCommands...)
|
|
}
|
|
|
|
if kubeArgs == nil {
|
|
// Our kubelet command arguments slice is empty.
|
|
// We will use the OCI Image configuration only if we already
|
|
// have found a process command, i.e. if processArgs is no
|
|
// longer empty. No doing so will have us create a process
|
|
// arguments slice starting with the OCI Image command arguments
|
|
// as the entry point.
|
|
if processArgs != nil &&
|
|
imageOCIConfig != nil &&
|
|
imageOCIConfig.Config.Entrypoint != nil && imageOCIConfig.Config.Cmd != nil {
|
|
processArgs = append(processArgs, imageOCIConfig.Config.Cmd...)
|
|
}
|
|
} else {
|
|
processArgs = append(processArgs, kubeArgs...)
|
|
}
|
|
|
|
if processArgs == nil {
|
|
// Use a reasonable default if everything failed.
|
|
processArgs = []string{"/bin/sh"}
|
|
}
|
|
|
|
logrus.Debugf("OCI process args %v", processArgs)
|
|
|
|
return processArgs, nil
|
|
}
|
|
|
|
// CreateContainer creates a new container in specified PodSandbox
func (s *Server) CreateContainer(ctx context.Context, req *pb.CreateContainerRequest) (res *pb.CreateContainerResponse, err error) {
	logrus.Debugf("CreateContainerRequest %+v", req)
	// Refresh the server's view of state before acting on the request.
	s.Update()
	sbID := req.PodSandboxId
	if sbID == "" {
		return nil, fmt.Errorf("PodSandboxId should not be empty")
	}

	// The kubelet may pass a truncated ID; resolve the prefix to a full pod ID.
	sandboxID, err := s.podIDIndex.Get(sbID)
	if err != nil {
		return nil, fmt.Errorf("PodSandbox with ID starting with %s not found: %v", sbID, err)
	}

	sb := s.getSandbox(sandboxID)
	if sb == nil {
		return nil, fmt.Errorf("specified sandbox not found: %s", sandboxID)
	}

	// The config of the container
	containerConfig := req.GetConfig()
	if containerConfig == nil {
		return nil, fmt.Errorf("CreateContainerRequest.ContainerConfig is nil")
	}

	name := containerConfig.GetMetadata().Name
	if name == "" {
		return nil, fmt.Errorf("CreateContainerRequest.ContainerConfig.Name is empty")
	}

	attempt := containerConfig.GetMetadata().Attempt
	containerID, containerName, err := s.generateContainerIDandName(sb.name, name, attempt)
	if err != nil {
		return nil, err
	}

	// Release the reserved container name on any failure below. This (and
	// the storage-cleanup defer) inspects the named return value err, so
	// later error paths must assign to err, not shadow it.
	defer func() {
		if err != nil {
			s.releaseContainerName(containerName)
		}
	}()

	container, err := s.createSandboxContainer(ctx, containerID, containerName, sb, req.GetSandboxConfig(), containerConfig)
	if err != nil {
		return nil, err
	}
	// Undo the storage-layer container creation if anything after it fails.
	defer func() {
		if err != nil {
			err2 := s.storage.DeleteContainer(containerID)
			if err2 != nil {
				logrus.Warnf("Failed to cleanup container directory: %v", err2)
			}
		}
	}()

	if err = s.runtime.CreateContainer(container, sb.cgroupParent); err != nil {
		return nil, err
	}

	// Populate the container's runtime state (pid, status) right away.
	if err = s.runtime.UpdateStatus(container); err != nil {
		return nil, err
	}

	s.addContainer(container)

	if err = s.ctrIDIndex.Add(containerID); err != nil {
		// Keep the in-memory map consistent with the ID index on failure.
		s.removeContainer(container)
		return nil, err
	}

	resp := &pb.CreateContainerResponse{
		ContainerId: containerID,
	}

	logrus.Debugf("CreateContainerResponse: %+v", resp)
	return resp, nil
}
|
|
|
|
// createSandboxContainer builds the OCI runtime spec for a container
// belonging to sandbox sb from the kubelet ContainerConfig, creates the
// container's storage layer, writes the spec to disk, and returns the
// resulting oci.Container (created but not yet started by the runtime).
func (s *Server) createSandboxContainer(ctx context.Context, containerID string, containerName string, sb *sandbox, SandboxConfig *pb.PodSandboxConfig, containerConfig *pb.ContainerConfig) (*oci.Container, error) {
	if sb == nil {
		return nil, errors.New("createSandboxContainer needs a sandbox")
	}

	// TODO: simplify this function (cyclomatic complexity here is high)
	// TODO: factor generating/updating the spec into something other projects can vendor

	// creates a spec Generator with the default spec.
	specgen := generate.New()

	// Working directory defaults to the filesystem root when unset.
	cwd := containerConfig.WorkingDir
	if cwd == "" {
		cwd = "/"
	}
	specgen.SetProcessCwd(cwd)

	// Propagate kubelet-provided environment variables; entries with an
	// empty key are silently skipped.
	envs := containerConfig.GetEnvs()
	if envs != nil {
		for _, item := range envs {
			key := item.Key
			value := item.Value
			if key == "" {
				continue
			}
			specgen.AddProcessEnv(key, value)
		}
	}

	if err := addOciBindMounts(sb, containerConfig, &specgen); err != nil {
		return nil, err
	}

	labels := containerConfig.GetLabels()

	metadata := containerConfig.GetMetadata()

	// Kubelet annotations are copied verbatim onto the OCI spec.
	annotations := containerConfig.GetAnnotations()
	if annotations != nil {
		for k, v := range annotations {
			specgen.AddAnnotation(k, v)
		}
	}

	// set this container's apparmor profile if it is set by sandbox
	if s.appArmorEnabled {
		appArmorProfileName := s.getAppArmorProfileName(sb.annotations, metadata.Name)
		if appArmorProfileName != "" {
			// reload default apparmor profile if it is unloaded.
			if s.appArmorProfile == apparmor.DefaultApparmorProfile {
				if err := apparmor.EnsureDefaultApparmorProfile(); err != nil {
					return nil, err
				}
			}

			specgen.SetProcessApparmorProfile(appArmorProfileName)
		}
	}
	if containerConfig.GetLinux().GetSecurityContext() != nil {
		if containerConfig.GetLinux().GetSecurityContext().Privileged {
			specgen.SetupPrivileged(true)
		}

		if containerConfig.GetLinux().GetSecurityContext().ReadonlyRootfs {
			specgen.SetRootReadonly(true)
		}
	}

	logPath := containerConfig.LogPath
	specgen.SetProcessTerminal(containerConfig.Tty)

	linux := containerConfig.GetLinux()
	if linux != nil {
		// Apply kubelet resource limits; a zero value means "unset" and
		// leaves the runtime default in place (OOM score adj excepted).
		resources := linux.GetResources()
		if resources != nil {
			cpuPeriod := resources.CpuPeriod
			if cpuPeriod != 0 {
				specgen.SetLinuxResourcesCPUPeriod(uint64(cpuPeriod))
			}

			cpuQuota := resources.CpuQuota
			if cpuQuota != 0 {
				specgen.SetLinuxResourcesCPUQuota(uint64(cpuQuota))
			}

			cpuShares := resources.CpuShares
			if cpuShares != 0 {
				specgen.SetLinuxResourcesCPUShares(uint64(cpuShares))
			}

			memoryLimit := resources.MemoryLimitInBytes
			if memoryLimit != 0 {
				specgen.SetLinuxResourcesMemoryLimit(uint64(memoryLimit))
			}

			// NOTE(review): OomScoreAdj is applied unconditionally, even
			// when zero — presumably intentional since 0 is a valid value.
			oomScoreAdj := resources.OomScoreAdj
			specgen.SetLinuxResourcesOOMScoreAdj(int(oomScoreAdj))
		}

		// Place the container under the sandbox's cgroup parent, using
		// the systemd slice syntax when that cgroup manager is configured.
		if sb.cgroupParent != "" {
			if s.config.CgroupManager == "systemd" {
				cgPath := sb.cgroupParent + ":" + "ocid" + ":" + containerID
				specgen.SetLinuxCgroupsPath(cgPath)
			} else {
				specgen.SetLinuxCgroupsPath(sb.cgroupParent + "/" + containerID)
			}
		}

		capabilities := linux.GetSecurityContext().GetCapabilities()
		if capabilities != nil {
			addCaps := capabilities.AddCapabilities
			if addCaps != nil {
				for _, cap := range addCaps {
					if err := specgen.AddProcessCapability(cap); err != nil {
						return nil, err
					}
				}
			}

			dropCaps := capabilities.DropCapabilities
			if dropCaps != nil {
				for _, cap := range dropCaps {
					if err := specgen.DropProcessCapability(cap); err != nil {
						return nil, err
					}
				}
			}
		}

		specgen.SetProcessSelinuxLabel(sb.processLabel)
		specgen.SetLinuxMountLabel(sb.mountLabel)

		if linux.GetSecurityContext() != nil {
			user := linux.GetSecurityContext().GetRunAsUser()
			specgen.SetProcessUID(uint32(user.Value))
			// NOTE(review): the GID is also set from the RunAsUser value;
			// there is no RunAsGroup handling here — confirm this is intended.
			specgen.SetProcessGID(uint32(user.Value))
			groups := linux.GetSecurityContext().SupplementalGroups
			for _, group := range groups {
				specgen.AddProcessAdditionalGid(uint32(group))
			}
		}
	}
	// Join the namespace paths for the pod sandbox container.
	podInfraState := s.runtime.ContainerStatus(sb.infraContainer)

	logrus.Debugf("pod container state %+v", podInfraState)

	// Share the infra container's IPC namespace via its /proc ns path.
	ipcNsPath := fmt.Sprintf("/proc/%d/ns/ipc", podInfraState.Pid)
	if err := specgen.AddOrReplaceLinuxNamespace("ipc", ipcNsPath); err != nil {
		return nil, err
	}

	netNsPath := sb.netNsPath()
	if netNsPath == "" {
		// The sandbox does not have a permanent namespace,
		// it's on the host one.
		netNsPath = fmt.Sprintf("/proc/%d/ns/net", podInfraState.Pid)
	}

	if err := specgen.AddOrReplaceLinuxNamespace("network", netNsPath); err != nil {
		return nil, err
	}

	imageSpec := containerConfig.GetImage()
	if imageSpec == nil {
		return nil, fmt.Errorf("CreateContainerRequest.ContainerConfig.Image is nil")
	}

	image := imageSpec.Image
	if image == "" {
		return nil, fmt.Errorf("CreateContainerRequest.ContainerConfig.Image.Image is empty")
	}

	// bind mount the pod shm
	specgen.AddBindMount(sb.shmPath, "/dev/shm", []string{"rw"})

	// Record bookkeeping data as ocid/* annotations so the container can
	// be reconstructed from its spec on restart.
	specgen.AddAnnotation("ocid/name", containerName)
	specgen.AddAnnotation("ocid/sandbox_id", sb.id)
	specgen.AddAnnotation("ocid/sandbox_name", sb.infraContainer.Name())
	specgen.AddAnnotation("ocid/container_type", containerTypeContainer)
	specgen.AddAnnotation("ocid/log_path", logPath)
	specgen.AddAnnotation("ocid/tty", fmt.Sprintf("%v", containerConfig.Tty))
	specgen.AddAnnotation("ocid/image", image)

	metadataJSON, err := json.Marshal(metadata)
	if err != nil {
		return nil, err
	}
	specgen.AddAnnotation("ocid/metadata", string(metadataJSON))

	labelsJSON, err := json.Marshal(labels)
	if err != nil {
		return nil, err
	}
	specgen.AddAnnotation("ocid/labels", string(labelsJSON))

	annotationsJSON, err := json.Marshal(annotations)
	if err != nil {
		return nil, err
	}
	specgen.AddAnnotation("ocid/annotations", string(annotationsJSON))

	if err = s.setupSeccomp(&specgen, containerName, sb.annotations); err != nil {
		return nil, err
	}

	// Create the container's storage layer. NOTE(review): image is passed
	// twice — presumably once as image name and once as image ID; confirm
	// against the storage API.
	metaname := metadata.Name
	attempt := metadata.Attempt
	containerInfo, err := s.storage.CreateContainer(s.imageContext,
		sb.name, sb.id,
		image, image,
		containerName, containerID,
		metaname,
		attempt,
		sb.mountLabel,
		nil)
	if err != nil {
		return nil, err
	}

	// Mount the container's root filesystem so it can be set as the spec root.
	mountPoint, err := s.storage.StartContainer(containerID)
	if err != nil {
		return nil, fmt.Errorf("failed to mount container %s(%s): %v", containerName, containerID, err)
	}

	processArgs, err := buildOCIProcessArgs(containerConfig, containerInfo.Config)
	if err != nil {
		return nil, err
	}
	specgen.SetProcessArgs(processArgs)

	// by default, the root path is an empty string. set it now.
	specgen.SetRootPath(mountPoint)

	// Persist the generated spec in both the permanent and runtime dirs.
	saveOptions := generate.ExportOptions{}
	if err = specgen.SaveToFile(filepath.Join(containerInfo.Dir, "config.json"), saveOptions); err != nil {
		return nil, err
	}
	if err = specgen.SaveToFile(filepath.Join(containerInfo.RunDir, "config.json"), saveOptions); err != nil {
		return nil, err
	}

	container, err := oci.NewContainer(containerID, containerName, containerInfo.RunDir, logPath, sb.netNs(), labels, annotations, imageSpec, metadata, sb.id, containerConfig.Tty, sb.privileged)
	if err != nil {
		return nil, err
	}

	return container, nil
}
|
|
|
|
func (s *Server) setupSeccomp(specgen *generate.Generator, cname string, sbAnnotations map[string]string) error {
|
|
profile, ok := sbAnnotations["security.alpha.kubernetes.io/seccomp/container/"+cname]
|
|
if !ok {
|
|
profile, ok = sbAnnotations["security.alpha.kubernetes.io/seccomp/pod"]
|
|
if !ok {
|
|
// running w/o seccomp, aka unconfined
|
|
profile = seccompUnconfined
|
|
}
|
|
}
|
|
if !s.seccompEnabled {
|
|
if profile != seccompUnconfined {
|
|
return fmt.Errorf("seccomp is not enabled in your kernel, cannot run with a profile")
|
|
}
|
|
logrus.Warn("seccomp is not enabled in your kernel, running container without profile")
|
|
}
|
|
if profile == seccompUnconfined {
|
|
// running w/o seccomp, aka unconfined
|
|
specgen.Spec().Linux.Seccomp = nil
|
|
return nil
|
|
}
|
|
if profile == seccompRuntimeDefault {
|
|
return seccomp.LoadProfileFromStruct(s.seccompProfile, specgen)
|
|
}
|
|
if !strings.HasPrefix(profile, seccompLocalhostPrefix) {
|
|
return fmt.Errorf("unknown seccomp profile option: %q", profile)
|
|
}
|
|
//file, err := ioutil.ReadFile(filepath.Join(s.seccompProfileRoot, strings.TrimPrefix(profile, seccompLocalhostPrefix)))
|
|
//if err != nil {
|
|
//return err
|
|
//}
|
|
// TODO(runcom): setup from provided node's seccomp profile
|
|
// can't do this yet, see https://issues.k8s.io/36997
|
|
return nil
|
|
}
|
|
|
|
func (s *Server) generateContainerIDandName(podName string, name string, attempt uint32) (string, string, error) {
|
|
var (
|
|
err error
|
|
id = stringid.GenerateNonCryptoID()
|
|
)
|
|
nameStr := fmt.Sprintf("%s-%s-%v", podName, name, attempt)
|
|
if name == "infra" {
|
|
nameStr = fmt.Sprintf("%s-%s", podName, name)
|
|
}
|
|
if name, err = s.reserveContainerName(id, nameStr); err != nil {
|
|
return "", "", err
|
|
}
|
|
return id, name, err
|
|
}
|
|
|
|
// getAppArmorProfileName gets the profile name for the given container.
|
|
func (s *Server) getAppArmorProfileName(annotations map[string]string, ctrName string) string {
|
|
profile := apparmor.GetProfileNameFromPodAnnotations(annotations, ctrName)
|
|
|
|
if profile == "" {
|
|
return ""
|
|
}
|
|
|
|
if profile == apparmor.ProfileRuntimeDefault {
|
|
// If the value is runtime/default, then return default profile.
|
|
return s.appArmorProfile
|
|
}
|
|
|
|
return strings.TrimPrefix(profile, apparmor.ProfileNamePrefix)
|
|
}
|