60123a77ce
VM base container runtimes (e.g. Clear Containers) will run each pod in a VM and will create containers within that pod VM. Unfortunately those runtimes will get called by ocid with the same commands (create and start) for both the pause containers and subsequent containers to be added to the pod namespace. Unless they work around that by e.g. infering that a container which rootfs is under "/pause" would represent a pod, they have no way to decide if they need to create/start a VM or if they need to add a container to an already running VM pod. This patch tries to formalize this difference through pod annotations. When starting a container or a sandbox, we now add 2 annotations for the container type (Infrastructure or not) and the sandbox name. This will allow VM based container runtimes to handle 2 things: - Decide if they need to create a pod VM or not. - Keep track of which pod ID runs in a given VM, so that they know to which sandbox they have to add containers. Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
375 lines
10 KiB
Go
375 lines
10 KiB
Go
package server
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"syscall"
|
|
|
|
"github.com/Sirupsen/logrus"
|
|
"github.com/docker/docker/pkg/stringid"
|
|
"github.com/kubernetes-incubator/cri-o/oci"
|
|
"github.com/kubernetes-incubator/cri-o/server/seccomp"
|
|
"github.com/kubernetes-incubator/cri-o/utils"
|
|
"github.com/opencontainers/runc/libcontainer/label"
|
|
"github.com/opencontainers/runtime-tools/generate"
|
|
"golang.org/x/net/context"
|
|
pb "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/runtime"
|
|
)
|
|
|
|
const (
|
|
seccompUnconfined = "unconfined"
|
|
seccompRuntimeDefault = "runtime/default"
|
|
seccompLocalhostPrefix = "localhost/"
|
|
)
|
|
|
|
// CreateContainer creates a new container in specified PodSandbox
|
|
func (s *Server) CreateContainer(ctx context.Context, req *pb.CreateContainerRequest) (res *pb.CreateContainerResponse, err error) {
|
|
logrus.Debugf("CreateContainerRequest %+v", req)
|
|
sbID := req.GetPodSandboxId()
|
|
if sbID == "" {
|
|
return nil, fmt.Errorf("PodSandboxId should not be empty")
|
|
}
|
|
|
|
sandboxID, err := s.podIDIndex.Get(sbID)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("PodSandbox with ID starting with %s not found: %v", sbID, err)
|
|
}
|
|
|
|
sb := s.getSandbox(sandboxID)
|
|
if sb == nil {
|
|
return nil, fmt.Errorf("specified sandbox not found: %s", sandboxID)
|
|
}
|
|
|
|
// The config of the container
|
|
containerConfig := req.GetConfig()
|
|
if containerConfig == nil {
|
|
return nil, fmt.Errorf("CreateContainerRequest.ContainerConfig is nil")
|
|
}
|
|
|
|
name := containerConfig.GetMetadata().GetName()
|
|
if name == "" {
|
|
return nil, fmt.Errorf("CreateContainerRequest.ContainerConfig.Name is empty")
|
|
}
|
|
|
|
attempt := containerConfig.GetMetadata().GetAttempt()
|
|
containerID, containerName, err := s.generateContainerIDandName(sb.name, name, attempt)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// containerDir is the dir for the container bundle.
|
|
containerDir := filepath.Join(s.runtime.ContainerDir(), containerID)
|
|
defer func() {
|
|
if err != nil {
|
|
s.releaseContainerName(containerName)
|
|
err1 := os.RemoveAll(containerDir)
|
|
if err1 != nil {
|
|
logrus.Warnf("Failed to cleanup container directory: %v")
|
|
}
|
|
}
|
|
}()
|
|
|
|
if _, err = os.Stat(containerDir); err == nil {
|
|
return nil, fmt.Errorf("container (%s) already exists", containerDir)
|
|
}
|
|
|
|
if err = os.MkdirAll(containerDir, 0755); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
container, err := s.createSandboxContainer(containerID, containerName, sb, containerDir, containerConfig)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err = s.runtime.CreateContainer(container); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err = s.runtime.UpdateStatus(container); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
s.addContainer(container)
|
|
|
|
if err = s.ctrIDIndex.Add(containerID); err != nil {
|
|
s.removeContainer(container)
|
|
return nil, err
|
|
}
|
|
|
|
resp := &pb.CreateContainerResponse{
|
|
ContainerId: &containerID,
|
|
}
|
|
|
|
logrus.Debugf("CreateContainerResponse: %+v", resp)
|
|
return resp, nil
|
|
}
|
|
|
|
func (s *Server) createSandboxContainer(containerID string, containerName string, sb *sandbox, containerDir string, containerConfig *pb.ContainerConfig) (*oci.Container, error) {
|
|
if sb == nil {
|
|
return nil, errors.New("createSandboxContainer needs a sandbox")
|
|
}
|
|
// creates a spec Generator with the default spec.
|
|
specgen := generate.New()
|
|
|
|
// by default, the root path is an empty string.
|
|
// here set it to be "rootfs".
|
|
specgen.SetRootPath("rootfs")
|
|
|
|
args := containerConfig.GetArgs()
|
|
if args == nil {
|
|
args = []string{"/bin/sh"}
|
|
}
|
|
specgen.SetProcessArgs(args)
|
|
|
|
cwd := containerConfig.GetWorkingDir()
|
|
if cwd == "" {
|
|
cwd = "/"
|
|
}
|
|
specgen.SetProcessCwd(cwd)
|
|
|
|
envs := containerConfig.GetEnvs()
|
|
if envs != nil {
|
|
for _, item := range envs {
|
|
key := item.GetKey()
|
|
value := item.GetValue()
|
|
if key == "" {
|
|
continue
|
|
}
|
|
env := fmt.Sprintf("%s=%s", key, value)
|
|
specgen.AddProcessEnv(env)
|
|
}
|
|
}
|
|
|
|
mounts := containerConfig.GetMounts()
|
|
for _, mount := range mounts {
|
|
dest := mount.GetContainerPath()
|
|
if dest == "" {
|
|
return nil, fmt.Errorf("Mount.ContainerPath is empty")
|
|
}
|
|
|
|
src := mount.GetHostPath()
|
|
if src == "" {
|
|
return nil, fmt.Errorf("Mount.HostPath is empty")
|
|
}
|
|
|
|
options := []string{"rw"}
|
|
if mount.GetReadonly() {
|
|
options = []string{"ro"}
|
|
}
|
|
|
|
if mount.GetSelinuxRelabel() {
|
|
// Need a way in kubernetes to determine if the volume is shared or private
|
|
if err := label.Relabel(src, sb.mountLabel, true); err != nil && err != syscall.ENOTSUP {
|
|
return nil, fmt.Errorf("relabel failed %s: %v", src, err)
|
|
}
|
|
}
|
|
|
|
specgen.AddBindMount(src, dest, options)
|
|
}
|
|
|
|
labels := containerConfig.GetLabels()
|
|
|
|
metadata := containerConfig.GetMetadata()
|
|
|
|
annotations := containerConfig.GetAnnotations()
|
|
if annotations != nil {
|
|
for k, v := range annotations {
|
|
specgen.AddAnnotation(k, v)
|
|
}
|
|
}
|
|
if containerConfig.GetLinux().GetSecurityContext().GetPrivileged() {
|
|
specgen.SetupPrivileged(true)
|
|
}
|
|
|
|
if containerConfig.GetLinux().GetSecurityContext().GetReadonlyRootfs() {
|
|
specgen.SetRootReadonly(true)
|
|
}
|
|
|
|
logPath := containerConfig.GetLogPath()
|
|
|
|
if containerConfig.GetTty() {
|
|
specgen.SetProcessTerminal(true)
|
|
}
|
|
|
|
linux := containerConfig.GetLinux()
|
|
if linux != nil {
|
|
resources := linux.GetResources()
|
|
if resources != nil {
|
|
cpuPeriod := resources.GetCpuPeriod()
|
|
if cpuPeriod != 0 {
|
|
specgen.SetLinuxResourcesCPUPeriod(uint64(cpuPeriod))
|
|
}
|
|
|
|
cpuQuota := resources.GetCpuQuota()
|
|
if cpuQuota != 0 {
|
|
specgen.SetLinuxResourcesCPUQuota(uint64(cpuQuota))
|
|
}
|
|
|
|
cpuShares := resources.GetCpuShares()
|
|
if cpuShares != 0 {
|
|
specgen.SetLinuxResourcesCPUShares(uint64(cpuShares))
|
|
}
|
|
|
|
memoryLimit := resources.GetMemoryLimitInBytes()
|
|
if memoryLimit != 0 {
|
|
specgen.SetLinuxResourcesMemoryLimit(uint64(memoryLimit))
|
|
}
|
|
|
|
oomScoreAdj := resources.GetOomScoreAdj()
|
|
specgen.SetLinuxResourcesOOMScoreAdj(int(oomScoreAdj))
|
|
}
|
|
|
|
capabilities := linux.GetSecurityContext().GetCapabilities()
|
|
if capabilities != nil {
|
|
addCaps := capabilities.GetAddCapabilities()
|
|
if addCaps != nil {
|
|
for _, cap := range addCaps {
|
|
if err := specgen.AddProcessCapability(cap); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
}
|
|
|
|
dropCaps := capabilities.GetDropCapabilities()
|
|
if dropCaps != nil {
|
|
for _, cap := range dropCaps {
|
|
if err := specgen.DropProcessCapability(cap); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
specgen.SetProcessSelinuxLabel(sb.processLabel)
|
|
specgen.SetLinuxMountLabel(sb.mountLabel)
|
|
|
|
user := linux.GetSecurityContext().GetRunAsUser()
|
|
specgen.SetProcessUID(uint32(user))
|
|
|
|
specgen.SetProcessGID(uint32(user))
|
|
|
|
groups := linux.GetSecurityContext().GetSupplementalGroups()
|
|
for _, group := range groups {
|
|
specgen.AddProcessAdditionalGid(uint32(group))
|
|
}
|
|
}
|
|
// Join the namespace paths for the pod sandbox container.
|
|
podInfraState := s.runtime.ContainerStatus(sb.infraContainer)
|
|
|
|
logrus.Debugf("pod container state %+v", podInfraState)
|
|
|
|
for nsType, nsFile := range map[string]string{
|
|
"ipc": "ipc",
|
|
"network": "net",
|
|
} {
|
|
nsPath := fmt.Sprintf("/proc/%d/ns/%s", podInfraState.Pid, nsFile)
|
|
if err := specgen.AddOrReplaceLinuxNamespace(nsType, nsPath); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
specgen.AddAnnotation("ocid/name", containerName)
|
|
specgen.AddAnnotation("ocid/sandbox_id", sb.id)
|
|
specgen.AddAnnotation("ocid/sandbox_name", sb.infraContainer.Name())
|
|
specgen.AddAnnotation("ocid/container_type", containerTypeContainer)
|
|
specgen.AddAnnotation("ocid/log_path", logPath)
|
|
specgen.AddAnnotation("ocid/tty", fmt.Sprintf("%v", containerConfig.GetTty()))
|
|
|
|
metadataJSON, err := json.Marshal(metadata)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
specgen.AddAnnotation("ocid/metadata", string(metadataJSON))
|
|
|
|
labelsJSON, err := json.Marshal(labels)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
specgen.AddAnnotation("ocid/labels", string(labelsJSON))
|
|
|
|
if err = s.setupSeccomp(&specgen, containerName, sb.annotations); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err = specgen.SaveToFile(filepath.Join(containerDir, "config.json"), generate.ExportOptions{}); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
imageSpec := containerConfig.GetImage()
|
|
if imageSpec == nil {
|
|
return nil, fmt.Errorf("CreateContainerRequest.ContainerConfig.Image is nil")
|
|
}
|
|
|
|
image := imageSpec.GetImage()
|
|
if image == "" {
|
|
return nil, fmt.Errorf("CreateContainerRequest.ContainerConfig.Image.Image is empty")
|
|
}
|
|
|
|
// TODO: copy the rootfs into the bundle.
|
|
// Currently, utils.CreateFakeRootfs is used to populate the rootfs.
|
|
if err = utils.CreateFakeRootfs(containerDir, image); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
container, err := oci.NewContainer(containerID, containerName, containerDir, logPath, labels, metadata, sb.id, containerConfig.GetTty())
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return container, nil
|
|
}
|
|
|
|
func (s *Server) setupSeccomp(specgen *generate.Generator, cname string, sbAnnotations map[string]string) error {
|
|
profile, ok := sbAnnotations["security.alpha.kubernetes.io/seccomp/container/"+cname]
|
|
if !ok {
|
|
profile, ok = sbAnnotations["security.alpha.kubernetes.io/seccomp/pod"]
|
|
if !ok {
|
|
// running w/o seccomp, aka unconfined
|
|
profile = seccompUnconfined
|
|
}
|
|
}
|
|
if !s.seccompEnabled {
|
|
if profile != seccompUnconfined {
|
|
return fmt.Errorf("seccomp is not enabled in your kernel, cannot run with a profile")
|
|
}
|
|
logrus.Warn("seccomp is not enabled in your kernel, running container without profile")
|
|
}
|
|
if profile == seccompUnconfined {
|
|
// running w/o seccomp, aka unconfined
|
|
specgen.Spec().Linux.Seccomp = nil
|
|
return nil
|
|
}
|
|
if profile == seccompRuntimeDefault {
|
|
return seccomp.LoadProfileFromStruct(s.seccompProfile, specgen)
|
|
}
|
|
if !strings.HasPrefix(profile, seccompLocalhostPrefix) {
|
|
return fmt.Errorf("unknown seccomp profile option: %q", profile)
|
|
}
|
|
//file, err := ioutil.ReadFile(filepath.Join(s.seccompProfileRoot, strings.TrimPrefix(profile, seccompLocalhostPrefix)))
|
|
//if err != nil {
|
|
//return err
|
|
//}
|
|
// TODO(runcom): setup from provided node's seccomp profile
|
|
// can't do this yet, see https://issues.k8s.io/36997
|
|
return nil
|
|
}
|
|
|
|
func (s *Server) generateContainerIDandName(podName string, name string, attempt uint32) (string, string, error) {
|
|
var (
|
|
err error
|
|
id = stringid.GenerateNonCryptoID()
|
|
)
|
|
nameStr := fmt.Sprintf("%s-%s-%v", podName, name, attempt)
|
|
if name == "infra" {
|
|
nameStr = fmt.Sprintf("%s-%s", podName, name)
|
|
}
|
|
if name, err = s.reserveContainerName(id, nameStr); err != nil {
|
|
return "", "", err
|
|
}
|
|
return id, name, err
|
|
}
|