cri-o/server/container_create.go
Antonio Murdaca a11b8cd8ec
sandbox, ctr: SELinux fixes
server: fix selinux labels for pod and containers

Signed-off-by: Antonio Murdaca <runcom@redhat.com>

sandbox: set selinux labels from request, not defaults

Signed-off-by: Antonio Murdaca <runcom@redhat.com>

container_create: use sandbox's selinux if container's nil

Signed-off-by: Antonio Murdaca <runcom@redhat.com>

sandbox: correctly init selinux labels

First, we weren't correctly initializing selinux labels. If any of
(level, user, role, type) was missing from kube selinux options, we
were erroring out. This is wrong as kube sends just `level=s0`
sometimes and docker itself allows `--security-opt label=level:s0`.
This patch directly initializes selinux labels, correctly, and adds a
test to verify it.

Signed-off-by: Antonio Murdaca <runcom@redhat.com>

test: testdata: use container_runtime_t selinux type

RHEL SELinux policy doesn't have `container_t` type but we're using it
in our fixtures. That means Fedora integration tests pass because
`container_t` is in Fedora's container policy but RHEL is broken.
Fix it by using `container_runtime_t` which is aliased in Fedora policy
to `container_t`.

Signed-off-by: Antonio Murdaca <runcom@redhat.com>
2017-09-17 22:40:17 +02:00

1108 lines
32 KiB
Go

package server
import (
"encoding/json"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
"time"
"github.com/docker/distribution/reference"
"github.com/docker/docker/pkg/stringid"
"github.com/docker/docker/pkg/symlink"
"github.com/kubernetes-incubator/cri-o/libkpod"
"github.com/kubernetes-incubator/cri-o/libkpod/sandbox"
"github.com/kubernetes-incubator/cri-o/oci"
"github.com/kubernetes-incubator/cri-o/pkg/annotations"
"github.com/kubernetes-incubator/cri-o/pkg/storage"
"github.com/kubernetes-incubator/cri-o/server/apparmor"
"github.com/kubernetes-incubator/cri-o/server/seccomp"
"github.com/opencontainers/image-spec/specs-go/v1"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/user"
rspec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/runtime-tools/generate"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/sirupsen/logrus"
"golang.org/x/net/context"
"golang.org/x/sys/unix"
pb "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime"
)
// Seccomp profile selectors and cgroup naming defaults used while building
// a container's OCI spec in this file.
const (
// seccompUnconfined is the profile value that makes setupSeccomp clear the
// spec's seccomp section; it is also the fallback when no annotation is set.
seccompUnconfined = "unconfined"
// seccompRuntimeDefault is the profile value that makes setupSeccomp load
// the server's own seccomp profile.
seccompRuntimeDefault = "runtime/default"
// seccompLocalhostPrefix is the prefix any other profile value must carry
// to be accepted by setupSeccomp.
seccompLocalhostPrefix = "localhost/"
// scopePrefix is combined with the container ID to name the container's
// cgroup (see createSandboxContainer).
scopePrefix = "crio"
// defaultCgroupfsParent is the cgroup parent used when the cgroup manager
// is not systemd and the sandbox specifies none.
defaultCgroupfsParent = "/crio"
// defaultSystemdParent is the cgroup parent used when the cgroup manager
// is systemd and the sandbox specifies none.
defaultSystemdParent = "system.slice"
)
// addOCIBindMounts converts the CRI mounts of containerConfig into OCI bind
// mounts on specgen and returns the matching list of container volumes.
//
// Every mount must have a non-empty ContainerPath and HostPath. A host path
// that does not exist is created as a directory. When a mount requests an
// SELinux relabel, the host path is relabeled with mountLabel; an ENOTSUP
// result from the relabel is tolerated.
//
// The returned slice is never nil on success, so it serializes as a JSON array
// even when there are no mounts.
func addOCIBindMounts(mountLabel string, containerConfig *pb.ContainerConfig, specgen *generate.Generator) ([]oci.ContainerVolume, error) {
	volumes := []oci.ContainerVolume{}
	for _, mount := range containerConfig.GetMounts() {
		dest := mount.ContainerPath
		if dest == "" {
			return nil, fmt.Errorf("Mount.ContainerPath is empty")
		}

		src := mount.HostPath
		if src == "" {
			return nil, fmt.Errorf("Mount.HostPath is empty")
		}

		if _, err := os.Stat(src); err != nil && os.IsNotExist(err) {
			// Directories need the search (execute) bit to be traversable,
			// so create them 0755 rather than 0644.
			if err1 := os.MkdirAll(src, 0755); err1 != nil {
				// Report the mkdir failure, not the earlier stat error.
				return nil, fmt.Errorf("Failed to mkdir %s: %s", src, err1)
			}
		}

		options := []string{"rw"}
		if mount.Readonly {
			options = []string{"ro"}
		}
		options = append(options, "rbind", "rprivate")

		if mount.SelinuxRelabel {
			// Need a way in kubernetes to determine if the volume is shared or private
			if err := label.Relabel(src, mountLabel, true); err != nil && err != unix.ENOTSUP {
				return nil, fmt.Errorf("relabel failed %s: %v", src, err)
			}
		}

		volumes = append(volumes, oci.ContainerVolume{
			ContainerPath: dest,
			HostPath:      src,
			Readonly:      mount.Readonly,
		})
		specgen.AddBindMount(src, dest, options)
	}
	return volumes, nil
}
// addImageVolumes handles the volumes declared in the image config according
// to the server's ImageVolumes setting:
//
//   - ImageVolumesMkdir: create the volume directory inside the rootfs.
//   - ImageVolumesBind: create a fresh directory under the container's RunDir,
//     relabel it with mountLabel when that is non-empty, and bind mount it
//     read-write at the volume path.
//   - ImageVolumesIgnore: do nothing besides logging.
//
// The in-rootfs path of each volume is resolved with symlinks confined to
// rootfs. Any other setting yields an error.
func addImageVolumes(rootfs string, s *Server, containerInfo *storage.ContainerInfo, specgen *generate.Generator, mountLabel string) error {
	for dest := range containerInfo.Config.Config.Volumes {
		fp, err := symlink.FollowSymlinkInScope(filepath.Join(rootfs, dest), rootfs)
		if err != nil {
			return err
		}
		switch s.config.ImageVolumes {
		case libkpod.ImageVolumesMkdir:
			// 0755: a directory without the search bit cannot be entered.
			if err1 := os.MkdirAll(fp, 0755); err1 != nil {
				return err1
			}
		case libkpod.ImageVolumesBind:
			volumeDirName := stringid.GenerateNonCryptoID()
			src := filepath.Join(containerInfo.RunDir, "mounts", volumeDirName)
			if err1 := os.MkdirAll(src, 0755); err1 != nil {
				return err1
			}
			// Label the source with the sandbox selinux mount label
			if mountLabel != "" {
				if err1 := label.Relabel(src, mountLabel, true); err1 != nil && err1 != unix.ENOTSUP {
					return fmt.Errorf("relabel failed %s: %v", src, err1)
				}
			}
			logrus.Debugf("Adding bind mounted volume: %s to %s", src, dest)
			specgen.AddBindMount(src, dest, []string{"rw"})
		case libkpod.ImageVolumesIgnore:
			logrus.Debugf("Ignoring volume %v", dest)
		default:
			// Fail this request only; a bad setting must not take the whole
			// daemon down from inside a container-create call.
			return fmt.Errorf("Unrecognized image volumes setting: %v", s.config.ImageVolumes)
		}
	}
	return nil
}
// resolveSymbolicLink returns the target of path when path is a symbolic
// link, and path itself otherwise. An error from inspecting path is returned
// together with an empty string.
func resolveSymbolicLink(path string) (string, error) {
	fi, lerr := os.Lstat(path)
	switch {
	case lerr != nil:
		return "", lerr
	case fi.Mode()&os.ModeSymlink == 0:
		// Not a symlink: hand the path back untouched.
		return path, nil
	default:
		return filepath.EvalSymlinks(path)
	}
}
// addDevices adds device nodes and device cgroup rules to specgen.
//
// For a privileged container every host device is added (entries with major
// and minor both zero are skipped) and the device cgroup list is replaced by a
// single allow-all "rwm" rule.
//
// Otherwise each device requested in containerConfig is handled as follows:
// the host path is resolved if it is a symlink; when it is a device node it is
// added together with a matching allow rule; when it is not a device but is a
// directory, the directory is walked and every device found below it is added
// under the corresponding path beneath the requested container path. Any other
// failure to inspect the host path is ignored.
//
// sb is not used by this function.
func addDevices(sb *sandbox.Sandbox, containerConfig *pb.ContainerConfig, specgen *generate.Generator) error {
	sp := specgen.Spec()

	// The security context is optional in the request; reading Privileged
	// through a nil pointer would panic.
	securityContext := containerConfig.GetLinux().GetSecurityContext()
	if securityContext != nil && securityContext.Privileged {
		hostDevices, err := devices.HostDevices()
		if err != nil {
			return err
		}
		for _, hostDevice := range hostDevices {
			if hostDevice.Major == 0 && hostDevice.Minor == 0 {
				// Invalid device, most likely a symbolic link, skip it.
				continue
			}
			specgen.AddDevice(rspec.LinuxDevice{
				Path:  hostDevice.Path,
				Type:  string(hostDevice.Type),
				Major: hostDevice.Major,
				Minor: hostDevice.Minor,
				UID:   &hostDevice.Uid,
				GID:   &hostDevice.Gid,
			})
		}
		sp.Linux.Resources.Devices = []rspec.LinuxDeviceCgroup{
			{
				Allow:  true,
				Access: "rwm",
			},
		}
		return nil
	}

	for _, device := range containerConfig.GetDevices() {
		path, err := resolveSymbolicLink(device.HostPath)
		if err != nil {
			return err
		}

		dev, err := devices.DeviceFromPath(path, device.Permissions)
		// if there was no error, return the device
		if err == nil {
			specgen.AddDevice(rspec.LinuxDevice{
				Path:  device.ContainerPath,
				Type:  string(dev.Type),
				Major: dev.Major,
				Minor: dev.Minor,
				UID:   &dev.Uid,
				GID:   &dev.Gid,
			})
			sp.Linux.Resources.Devices = append(sp.Linux.Resources.Devices, rspec.LinuxDeviceCgroup{
				Allow:  true,
				Type:   string(dev.Type),
				Major:  &dev.Major,
				Minor:  &dev.Minor,
				Access: dev.Permissions,
			})
			continue
		}

		// if the device is not a device node
		// try to see if it's a directory holding many devices
		if err != devices.ErrNotADevice {
			continue
		}
		src, statErr := os.Stat(path)
		if statErr != nil || !src.IsDir() {
			continue
		}

		// mount the internal devices recursively; entries that are not
		// devices are skipped, and the walk's own result is not checked.
		filepath.Walk(path, func(dpath string, _ os.FileInfo, _ error) error {
			childDevice, devErr := devices.DeviceFromPath(dpath, device.Permissions)
			if devErr != nil {
				// ignore the device
				return nil
			}
			// Re-root the host path under the requested container path.
			cPath := strings.Replace(dpath, path, device.ContainerPath, 1)
			specgen.AddDevice(rspec.LinuxDevice{
				Path:  cPath,
				Type:  string(childDevice.Type),
				Major: childDevice.Major,
				Minor: childDevice.Minor,
				UID:   &childDevice.Uid,
				GID:   &childDevice.Gid,
			})
			sp.Linux.Resources.Devices = append(sp.Linux.Resources.Devices, rspec.LinuxDeviceCgroup{
				Allow:  true,
				Type:   string(childDevice.Type),
				Major:  &childDevice.Major,
				Minor:  &childDevice.Minor,
				Access: childDevice.Permissions,
			})
			return nil
		})
	}
	return nil
}
// buildOCIProcessArgs builds an OCI compatible process arguments slice from
// the CRI container config, falling back to the image config.
//
// The merge follows what docker does:
//
//	# Start the nginx container using the default command, but use custom
//	# arguments (arg1 .. argN) for that command.
//	kubectl run nginx --image=nginx -- arg1 arg2 ... argN
//	# Start the nginx container using a different command and custom arguments.
//	kubectl run nginx --image=nginx --command -- cmd arg1 ... argN
//
// When the CRI command is empty and an image config is given, the image Cmd is
// used if no CRI args were supplied, and the image Entrypoint is used if the
// CRI command is nil. The result is the command followed by the args; an error
// is returned when both end up empty.
func buildOCIProcessArgs(containerKubeConfig *pb.ContainerConfig, imageOCIConfig *v1.Image) ([]string, error) {
	kubeCommands := containerKubeConfig.Command
	kubeArgs := containerKubeConfig.Args

	// merge image config and kube config
	// same as docker does today...
	if imageOCIConfig != nil && len(kubeCommands) == 0 {
		if len(kubeArgs) == 0 {
			kubeArgs = imageOCIConfig.Config.Cmd
		}
		if kubeCommands == nil {
			kubeCommands = imageOCIConfig.Config.Entrypoint
		}
	}

	if len(kubeCommands) == 0 && len(kubeArgs) == 0 {
		return nil, fmt.Errorf("no command specified")
	}

	// Build the result in a fresh slice. Appending to kubeCommands[1:] could
	// write into spare capacity of a backing array that is shared with the
	// request or the image config.
	processArgs := make([]string, 0, len(kubeCommands)+len(kubeArgs))
	processArgs = append(processArgs, kubeCommands...)
	processArgs = append(processArgs, kubeArgs...)

	logrus.Debugf("OCI process args %v", processArgs)
	return processArgs, nil
}
// addOCIHook registers hook.Hook on specgen once for every stage listed in
// hook.Stage. The hook is invoked with its own path followed by the stage name
// as arguments. Stages other than "prestart", "poststart" and "poststop" are
// ignored. The returned error is always nil.
func addOCIHook(specgen *generate.Generator, hook libkpod.HookParams) error {
	// A format verb is required; without one the hook was printed as an
	// "%!(EXTRA ...)" suffix.
	logrus.Debugf("AddOCIHook %+v", hook)
	for _, stage := range hook.Stage {
		switch stage {
		case "prestart":
			specgen.AddPreStartHook(hook.Hook, []string{hook.Hook, "prestart"})
		case "poststart":
			specgen.AddPostStartHook(hook.Hook, []string{hook.Hook, "poststart"})
		case "poststop":
			specgen.AddPostStopHook(hook.Hook, []string{hook.Hook, "poststop"})
		}
	}
	return nil
}
// setupContainerUser writes the process UID, GID and additional groups into
// the OCI runtime config. Nothing is done when sc is nil.
//
// The user is chosen in this order: the numeric run-as user from the security
// context, then the run-as username, then the user recorded in the image
// config (if an image config was supplied). The chosen value is resolved
// against the container rootfs via getUserInfo. Supplemental groups from the
// security context are appended after the groups derived from the user.
func setupContainerUser(specgen *generate.Generator, rootfs string, sc *pb.LinuxContainerSecurityContext, imageConfig *v1.Image) error {
	if sc == nil {
		return nil
	}

	var containerUser string
	switch runAsUser := sc.GetRunAsUser(); {
	case runAsUser != nil:
		// Case 1: run as user is set by kubelet
		containerUser = strconv.FormatInt(runAsUser.Value, 10)
	case sc.GetRunAsUsername() != "":
		// Case 2: run as username is set by kubelet
		containerUser = sc.GetRunAsUsername()
	case imageConfig != nil:
		// Case 3: get user from image config (may be empty)
		containerUser = imageConfig.Config.User
	}
	logrus.Debugf("CONTAINER USER: %+v", containerUser)

	// Add uid, gid and groups from user
	uid, gid, userGroups, lookupErr := getUserInfo(rootfs, containerUser)
	if lookupErr != nil {
		return lookupErr
	}
	logrus.Debugf("UID: %v, GID: %v, Groups: %+v", uid, gid, userGroups)

	specgen.SetProcessUID(uid)
	specgen.SetProcessGID(gid)
	for i := range userGroups {
		specgen.AddProcessAdditionalGid(userGroups[i])
	}

	// Add groups from CRI
	for _, supplemental := range sc.GetSupplementalGroups() {
		specgen.AddProcessAdditionalGid(uint32(supplemental))
	}
	return nil
}
// hostNetwork reports the HostNetwork flag from the namespace options of the
// container's security context. It returns false when the security context or
// its namespace options are absent.
func hostNetwork(containerConfig *pb.ContainerConfig) bool {
	if sc := containerConfig.GetLinux().GetSecurityContext(); sc != nil {
		if nsOptions := sc.GetNamespaceOptions(); nsOptions != nil {
			return nsOptions.HostNetwork
		}
	}
	return false
}
// ensureSaneLogPath works around https://issues.k8s.io/44043, where logPath
// can be a dangling symlink pointing at some Docker-specific location. Until
// that issue is resolved, a logPath that is a symlink whose target does not
// exist gets removed. Paths that are missing, are not symlinks, or are
// symlinks to something that exists are left alone.
func ensureSaneLogPath(logPath string) error {
	linkInfo, lstatErr := os.Lstat(logPath)
	if lstatErr != nil {
		// Cannot inspect the path (for example it does not exist): not our problem.
		return nil
	}
	if linkInfo.Mode()&os.ModeSymlink != os.ModeSymlink {
		// Not a symlink: not our problem either.
		return nil
	}

	// The link exists; it is only broken if following it reports "not exist".
	if _, statErr := os.Stat(logPath); !os.IsNotExist(statErr) {
		return nil
	}

	if rmErr := os.RemoveAll(logPath); rmErr != nil {
		return fmt.Errorf("ensureSaneLogPath remove bad logPath: %s", rmErr)
	}
	return nil
}
// CreateContainer creates a new container in specified PodSandbox.
//
// It validates the request, resolves the sandbox, reserves a container ID and
// name, builds the container through createSandboxContainer, asks the runtime
// to create it, registers it with the server and the container ID index, and
// writes its state to disk. On success the response carries the new container
// ID.
//
// The named result err drives the deferred cleanups: when the function returns
// an error, the reserved name is released and, once the container has been
// created in storage, it is deleted from storage again.
func (s *Server) CreateContainer(ctx context.Context, req *pb.CreateContainerRequest) (res *pb.CreateContainerResponse, err error) {
	logrus.Debugf("CreateContainerRequest %+v", req)

	s.updateLock.RLock()
	defer s.updateLock.RUnlock()

	sbID := req.PodSandboxId
	if sbID == "" {
		return nil, fmt.Errorf("PodSandboxId should not be empty")
	}

	sandboxID, err := s.PodIDIndex().Get(sbID)
	if err != nil {
		return nil, fmt.Errorf("PodSandbox with ID starting with %s not found: %v", sbID, err)
	}

	sb := s.getSandbox(sandboxID)
	if sb == nil {
		return nil, fmt.Errorf("specified sandbox not found: %s", sandboxID)
	}

	// The config of the container
	containerConfig := req.GetConfig()
	if containerConfig == nil {
		return nil, fmt.Errorf("CreateContainerRequest.ContainerConfig is nil")
	}

	// Metadata is optional on the wire; reading Name through a nil pointer
	// would panic, so treat missing metadata like an empty name.
	metadata := containerConfig.GetMetadata()
	if metadata == nil || metadata.Name == "" {
		return nil, fmt.Errorf("CreateContainerRequest.ContainerConfig.Name is empty")
	}

	containerID, containerName, err := s.generateContainerIDandName(sb.Metadata(), containerConfig)
	if err != nil {
		return nil, err
	}

	defer func() {
		if err != nil {
			s.ReleaseContainerName(containerName)
		}
	}()

	container, err := s.createSandboxContainer(ctx, containerID, containerName, sb, req.GetSandboxConfig(), containerConfig)
	if err != nil {
		return nil, err
	}

	defer func() {
		if err != nil {
			err2 := s.StorageRuntimeServer().DeleteContainer(containerID)
			if err2 != nil {
				logrus.Warnf("Failed to cleanup container directory: %v", err2)
			}
		}
	}()

	if err = s.Runtime().CreateContainer(container, sb.CgroupParent()); err != nil {
		return nil, err
	}

	s.addContainer(container)

	if err = s.CtrIDIndex().Add(containerID); err != nil {
		// Undo the registration done just above.
		s.removeContainer(container)
		return nil, err
	}

	s.ContainerStateToDisk(container)

	resp := &pb.CreateContainerResponse{
		ContainerId: containerID,
	}

	logrus.Debugf("CreateContainerResponse: %+v", resp)
	return resp, nil
}
// setupOCIHooks adds the server's configured hooks that apply to this
// container to specgen. Each hook is added at most once.
//
// A hook applies when any of the following holds:
//   - it is flagged HasBindMounts and the container has at least one mount;
//   - one of its Cmds regular expressions matches command;
//   - one of its Annotations regular expressions matches a value returned by
//     sb.Annotations().
//
// A regular expression that fails to compile is logged and skipped.
func (s *Server) setupOCIHooks(specgen *generate.Generator, sb *sandbox.Sandbox, containerConfig *pb.ContainerConfig, command string) error {
	mounts := containerConfig.GetMounts()
	addedHooks := map[string]struct{}{}
	addHook := func(hook libkpod.HookParams) error {
		// Only add a hook once
		if _, ok := addedHooks[hook.Hook]; !ok {
			if err := addOCIHook(specgen, hook); err != nil {
				return err
			}
			addedHooks[hook.Hook] = struct{}{}
		}
		return nil
	}
	for _, hook := range s.Hooks() {
		// A format verb is required; without one the hook was printed as an
		// "%!(EXTRA ...)" suffix.
		logrus.Debugf("SetupOCIHooks %+v", hook)
		if hook.HasBindMounts && len(mounts) > 0 {
			if err := addHook(hook); err != nil {
				return err
			}
			// The hook is already in; no need to evaluate its patterns.
			continue
		}
		for _, cmd := range hook.Cmds {
			match, err := regexp.MatchString(cmd, command)
			if err != nil {
				logrus.Errorf("Invalid regex %q:%q", cmd, err)
				continue
			}
			if match {
				if err := addHook(hook); err != nil {
					return err
				}
			}
		}
		for _, annotationRegex := range hook.Annotations {
			for _, annotation := range sb.Annotations() {
				match, err := regexp.MatchString(annotationRegex, annotation)
				if err != nil {
					logrus.Errorf("Invalid regex %q:%q", annotationRegex, err)
					continue
				}
				if match {
					if err := addHook(hook); err != nil {
						return err
					}
				}
			}
		}
	}
	return nil
}
// createSandboxContainer builds everything needed to run a new container in
// sandbox sb and returns the resulting oci.Container.
//
// In order, it:
//   - generates a default OCI spec and fills in SELinux labels, bind mounts,
//     devices, annotations, AppArmor, log path, resources, cgroups path,
//     capabilities and masked/read-only paths from the CRI config;
//   - joins the IPC and network namespaces of the sandbox's infra container;
//   - resolves the requested image and records its name and ID;
//   - creates and mounts the container in storage;
//   - applies image volumes, process args, environment, working directory,
//     OCI hooks, user/groups and the pids limit;
//   - writes config.json to the container's Dir and RunDir.
//
// SELinux labels default to the sandbox's labels and are replaced by labels
// derived from the container's own SELinux options when those are set.
//
// If any step after the container has been created in storage fails, the
// container is deleted from storage again before the error is returned.
// ctx and SandboxConfig are not used by this function.
func (s *Server) createSandboxContainer(ctx context.Context, containerID string, containerName string, sb *sandbox.Sandbox, SandboxConfig *pb.PodSandboxConfig, containerConfig *pb.ContainerConfig) (*oci.Container, error) {
	if sb == nil {
		return nil, errors.New("createSandboxContainer needs a sandbox")
	}

	// TODO: simplify this function (cyclomatic complexity here is high)
	// TODO: factor generating/updating the spec into something other projects can vendor

	// creates a spec Generator with the default spec.
	specgen := generate.New()
	specgen.HostSpecific = true
	specgen.ClearProcessRlimits()

	// Start from the sandbox labels; the container's own options win.
	mountLabel := sb.MountLabel()
	processLabel := sb.ProcessLabel()
	selinuxConfig := containerConfig.GetLinux().GetSecurityContext().GetSelinuxOptions()
	if selinuxConfig != nil {
		var err error
		processLabel, mountLabel, err = getSELinuxLabels(selinuxConfig)
		if err != nil {
			return nil, err
		}
	}

	containerVolumes, err := addOCIBindMounts(mountLabel, containerConfig, &specgen)
	if err != nil {
		return nil, err
	}

	volumesJSON, err := json.Marshal(containerVolumes)
	if err != nil {
		return nil, err
	}
	specgen.AddAnnotation(annotations.Volumes, string(volumesJSON))

	// Add cgroup mount so container process can introspect its own limits
	specgen.AddCgroupsMount("ro")

	if err := addDevices(sb, containerConfig, &specgen); err != nil {
		return nil, err
	}

	labels := containerConfig.GetLabels()

	metadata := containerConfig.GetMetadata()

	// Both the kube annotations and the labels are copied into the spec's
	// annotations; labels are applied second.
	kubeAnnotations := containerConfig.GetAnnotations()
	if kubeAnnotations != nil {
		for k, v := range kubeAnnotations {
			specgen.AddAnnotation(k, v)
		}
	}
	if labels != nil {
		for k, v := range labels {
			specgen.AddAnnotation(k, v)
		}
	}

	var readOnlyRootfs bool
	var privileged bool
	if containerConfig.GetLinux().GetSecurityContext() != nil {
		if containerConfig.GetLinux().GetSecurityContext().Privileged {
			privileged = true
		}

		if containerConfig.GetLinux().GetSecurityContext().ReadonlyRootfs {
			readOnlyRootfs = true
			specgen.SetRootReadonly(true)
		}
	}

	// set this container's apparmor profile if it is set by sandbox
	if s.appArmorEnabled && !privileged {
		appArmorProfileName := s.getAppArmorProfileName(sb.Annotations(), metadata.Name)
		if appArmorProfileName != "" {
			// reload default apparmor profile if it is unloaded.
			if s.appArmorProfile == apparmor.DefaultApparmorProfile {
				if err := apparmor.EnsureDefaultApparmorProfile(); err != nil {
					return nil, err
				}
			}

			specgen.SetProcessApparmorProfile(appArmorProfileName)
		}
	}

	logPath := containerConfig.LogPath
	if logPath == "" {
		// TODO: Should we use sandboxConfig.GetLogDirectory() here?
		logPath = filepath.Join(sb.LogDir(), containerID+".log")
	}
	if !filepath.IsAbs(logPath) {
		// XXX: It's not really clear what this should be versus the sbox logDirectory.
		logrus.Warnf("requested logPath for ctr id %s is a relative path: %s", containerID, logPath)
		logPath = filepath.Join(sb.LogDir(), logPath)
	}

	// Handle https://issues.k8s.io/44043
	if err := ensureSaneLogPath(logPath); err != nil {
		return nil, err
	}

	logrus.WithFields(logrus.Fields{
		"sbox.logdir": sb.LogDir(),
		"ctr.logfile": containerConfig.LogPath,
		"log_path":    logPath,
	}).Debugf("setting container's log_path")

	specgen.SetProcessTerminal(containerConfig.Tty)
	if containerConfig.Tty {
		specgen.AddProcessEnv("TERM", "xterm")
	}

	linux := containerConfig.GetLinux()
	if linux != nil {
		resources := linux.GetResources()
		if resources != nil {
			// Zero-valued limits are treated as "not set" and left at the
			// spec defaults; the OOM score adjustment is always applied.
			cpuPeriod := resources.CpuPeriod
			if cpuPeriod != 0 {
				specgen.SetLinuxResourcesCPUPeriod(uint64(cpuPeriod))
			}

			cpuQuota := resources.CpuQuota
			if cpuQuota != 0 {
				specgen.SetLinuxResourcesCPUQuota(cpuQuota)
			}

			cpuShares := resources.CpuShares
			if cpuShares != 0 {
				specgen.SetLinuxResourcesCPUShares(uint64(cpuShares))
			}

			memoryLimit := resources.MemoryLimitInBytes
			if memoryLimit != 0 {
				specgen.SetLinuxResourcesMemoryLimit(memoryLimit)
			}

			oomScoreAdj := resources.OomScoreAdj
			specgen.SetProcessOOMScoreAdj(int(oomScoreAdj))
		}

		// The cgroup path is "<parent>:<prefix>:<id>" for the systemd manager
		// and "<parent>/<prefix>-<id>" otherwise; the sandbox's cgroup parent
		// overrides the default parent.
		var cgPath string
		parent := defaultCgroupfsParent
		useSystemd := s.config.CgroupManager == oci.SystemdCgroupsManager
		if useSystemd {
			parent = defaultSystemdParent
		}
		if sb.CgroupParent() != "" {
			parent = sb.CgroupParent()
		}
		if useSystemd {
			cgPath = parent + ":" + scopePrefix + ":" + containerID
		} else {
			cgPath = filepath.Join(parent, scopePrefix+"-"+containerID)
		}
		specgen.SetLinuxCgroupsPath(cgPath)

		capabilities := linux.GetSecurityContext().GetCapabilities()
		if privileged {
			// this is setting correct capabilities as well for privileged mode
			specgen.SetupPrivileged(true)
		} else {
			toCAPPrefixed := func(cap string) string {
				if !strings.HasPrefix(strings.ToLower(cap), "cap_") {
					return "CAP_" + strings.ToUpper(cap)
				}
				return cap
			}

			// Add/drop all capabilities if "all" is specified, so that
			// following individual add/drop could still work. E.g.
			// AddCapabilities: []string{"ALL"}, DropCapabilities: []string{"CHOWN"}
			// will be all capabilities without `CAP_CHOWN`.
			// see https://github.com/kubernetes/kubernetes/issues/51980
			if inStringSlice(capabilities.GetAddCapabilities(), "ALL") {
				for _, c := range getOCICapabilitiesList() {
					if err := specgen.AddProcessCapability(c); err != nil {
						return nil, err
					}
				}
			}
			if inStringSlice(capabilities.GetDropCapabilities(), "ALL") {
				for _, c := range getOCICapabilitiesList() {
					if err := specgen.DropProcessCapability(c); err != nil {
						return nil, err
					}
				}
			}

			if capabilities != nil {
				for _, cap := range capabilities.GetAddCapabilities() {
					if strings.ToUpper(cap) == "ALL" {
						continue
					}
					if err := specgen.AddProcessCapability(toCAPPrefixed(cap)); err != nil {
						return nil, err
					}
				}

				for _, cap := range capabilities.GetDropCapabilities() {
					if strings.ToUpper(cap) == "ALL" {
						continue
					}
					if err := specgen.DropProcessCapability(toCAPPrefixed(cap)); err != nil {
						return nil, fmt.Errorf("failed to drop cap %s %v", toCAPPrefixed(cap), err)
					}
				}
			}
			specgen.SetProcessSelinuxLabel(processLabel)
		}

		// Use the label computed above, which already falls back to the
		// sandbox's mount label. The same value is used for relabeling and
		// for creating the container in storage, so the spec must agree.
		specgen.SetLinuxMountLabel(mountLabel)

		if containerConfig.GetLinux().GetSecurityContext() != nil &&
			!containerConfig.GetLinux().GetSecurityContext().Privileged {
			for _, mp := range []string{
				"/proc/kcore",
				"/proc/latency_stats",
				"/proc/timer_list",
				"/proc/timer_stats",
				"/proc/sched_debug",
				"/sys/firmware",
			} {
				specgen.AddLinuxMaskedPaths(mp)
			}

			for _, rp := range []string{
				"/proc/asound",
				"/proc/bus",
				"/proc/fs",
				"/proc/irq",
				"/proc/sys",
				"/proc/sysrq-trigger",
			} {
				specgen.AddLinuxReadonlyPaths(rp)
			}
		}
	}

	// Join the namespace paths for the pod sandbox container.
	podInfraState := s.Runtime().ContainerStatus(sb.InfraContainer())

	logrus.Debugf("pod container state %+v", podInfraState)

	ipcNsPath := fmt.Sprintf("/proc/%d/ns/ipc", podInfraState.Pid)
	if err := specgen.AddOrReplaceLinuxNamespace("ipc", ipcNsPath); err != nil {
		return nil, err
	}

	netNsPath := sb.NetNsPath()
	if netNsPath == "" {
		// The sandbox does not have a permanent namespace,
		// it's on the host one.
		netNsPath = fmt.Sprintf("/proc/%d/ns/net", podInfraState.Pid)
	}

	if err := specgen.AddOrReplaceLinuxNamespace("network", netNsPath); err != nil {
		return nil, err
	}

	imageSpec := containerConfig.GetImage()
	if imageSpec == nil {
		return nil, fmt.Errorf("CreateContainerRequest.ContainerConfig.Image is nil")
	}

	image := imageSpec.Image
	if image == "" {
		return nil, fmt.Errorf("CreateContainerRequest.ContainerConfig.Image.Image is empty")
	}
	images, err := s.StorageImageServer().ResolveNames(image)
	if err != nil {
		// This means we got an image ID
		if strings.Contains(err.Error(), "cannot specify 64-byte hexadecimal strings") {
			images = append(images, image)
		} else {
			return nil, err
		}
	}
	// Guard the index below against a resolver that returns no candidates.
	if len(images) == 0 {
		return nil, fmt.Errorf("no image names resolved for %q", image)
	}
	image = images[0]

	// Get imageName and imageRef that are requested in container status
	imageName := image
	status, err := s.StorageImageServer().ImageStatus(s.ImageContext(), image)
	if err != nil {
		return nil, err
	}

	imageRef := status.ID
	//
	// TODO: https://github.com/kubernetes-incubator/cri-o/issues/531
	//
	//for _, n := range status.Names {
	//r, err := reference.ParseNormalizedNamed(n)
	//if err != nil {
	//return nil, fmt.Errorf("failed to normalize image name for ImageRef: %v", err)
	//}
	//if digested, isDigested := r.(reference.Canonical); isDigested {
	//imageRef = reference.FamiliarString(digested)
	//break
	//}
	//}

	// Prefer the first tagged name of the image as the reported image name.
	for _, n := range status.Names {
		r, err := reference.ParseNormalizedNamed(n)
		if err != nil {
			return nil, fmt.Errorf("failed to normalize image name for Image: %v", err)
		}
		if tagged, isTagged := r.(reference.Tagged); isTagged {
			imageName = reference.FamiliarString(tagged)
			break
		}
	}

	specgen.AddAnnotation(annotations.ImageName, imageName)
	specgen.AddAnnotation(annotations.ImageRef, imageRef)
	specgen.AddAnnotation(annotations.IP, sb.IP())

	// bind mount the pod shm
	specgen.AddBindMount(sb.ShmPath(), "/dev/shm", []string{"rw"})

	// The sandbox-provided /etc files follow the rootfs read-only setting.
	options := []string{"rw"}
	if readOnlyRootfs {
		options = []string{"ro"}
	}
	if sb.ResolvPath() != "" {
		if err := label.Relabel(sb.ResolvPath(), mountLabel, true); err != nil && err != unix.ENOTSUP {
			return nil, err
		}

		// bind mount the pod resolver file
		specgen.AddBindMount(sb.ResolvPath(), "/etc/resolv.conf", options)
	}

	if sb.HostnamePath() != "" {
		if err := label.Relabel(sb.HostnamePath(), mountLabel, true); err != nil && err != unix.ENOTSUP {
			return nil, err
		}

		specgen.AddBindMount(sb.HostnamePath(), "/etc/hostname", options)
	}

	// Bind mount /etc/hosts for host networking containers
	if hostNetwork(containerConfig) {
		specgen.AddBindMount("/etc/hosts", "/etc/hosts", options)
	}

	specgen.SetHostname(sb.Hostname())

	specgen.AddAnnotation(annotations.Name, containerName)
	specgen.AddAnnotation(annotations.ContainerID, containerID)
	specgen.AddAnnotation(annotations.SandboxID, sb.ID())
	specgen.AddAnnotation(annotations.SandboxName, sb.InfraContainer().Name())
	specgen.AddAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer)
	specgen.AddAnnotation(annotations.LogPath, logPath)
	specgen.AddAnnotation(annotations.TTY, fmt.Sprintf("%v", containerConfig.Tty))
	specgen.AddAnnotation(annotations.Stdin, fmt.Sprintf("%v", containerConfig.Stdin))
	specgen.AddAnnotation(annotations.StdinOnce, fmt.Sprintf("%v", containerConfig.StdinOnce))
	specgen.AddAnnotation(annotations.Image, image)

	created := time.Now()
	specgen.AddAnnotation(annotations.Created, created.Format(time.RFC3339Nano))

	metadataJSON, err := json.Marshal(metadata)
	if err != nil {
		return nil, err
	}
	specgen.AddAnnotation(annotations.Metadata, string(metadataJSON))

	labelsJSON, err := json.Marshal(labels)
	if err != nil {
		return nil, err
	}
	specgen.AddAnnotation(annotations.Labels, string(labelsJSON))

	kubeAnnotationsJSON, err := json.Marshal(kubeAnnotations)
	if err != nil {
		return nil, err
	}
	specgen.AddAnnotation(annotations.Annotations, string(kubeAnnotationsJSON))

	if !privileged {
		if err = s.setupSeccomp(&specgen, containerName, sb.Annotations()); err != nil {
			return nil, err
		}
	}

	metaname := metadata.Name
	attempt := metadata.Attempt
	containerInfo, err := s.StorageRuntimeServer().CreateContainer(s.ImageContext(),
		sb.Name(), sb.ID(),
		image, image,
		containerName, containerID,
		metaname,
		attempt,
		mountLabel,
		nil)
	if err != nil {
		return nil, err
	}

	// From here on the container exists in storage. The caller only installs
	// its own storage cleanup after this function has succeeded, so a failure
	// in any later step must remove the container here or it is leaked.
	succeeded := false
	defer func() {
		if succeeded {
			return
		}
		if err2 := s.StorageRuntimeServer().DeleteContainer(containerID); err2 != nil {
			logrus.Warnf("Failed to cleanup container directory: %v", err2)
		}
	}()

	mountPoint, err := s.StorageRuntimeServer().StartContainer(containerID)
	if err != nil {
		return nil, fmt.Errorf("failed to mount container %s(%s): %v", containerName, containerID, err)
	}
	specgen.AddAnnotation(annotations.MountPoint, mountPoint)

	containerImageConfig := containerInfo.Config
	if containerImageConfig == nil {
		return nil, fmt.Errorf("empty image config for %s", image)
	}

	if containerImageConfig.Config.StopSignal != "" {
		// this key is defined in image-spec conversion document at https://github.com/opencontainers/image-spec/pull/492/files#diff-8aafbe2c3690162540381b8cdb157112R57
		specgen.AddAnnotation("org.opencontainers.image.stopSignal", containerImageConfig.Config.StopSignal)
	}

	// Add image volumes
	if err := addImageVolumes(mountPoint, s, &containerInfo, &specgen, mountLabel); err != nil {
		return nil, err
	}

	processArgs, err := buildOCIProcessArgs(containerConfig, containerImageConfig)
	if err != nil {
		return nil, err
	}
	specgen.SetProcessArgs(processArgs)

	// Add environment variables from CRI and image config
	envs := containerConfig.GetEnvs()
	if envs != nil {
		for _, item := range envs {
			key := item.Key
			value := item.Value
			if key == "" {
				continue
			}
			specgen.AddProcessEnv(key, value)
		}
	}
	if containerImageConfig != nil {
		for _, item := range containerImageConfig.Config.Env {
			parts := strings.SplitN(item, "=", 2)
			if len(parts) != 2 {
				return nil, fmt.Errorf("invalid env from image: %s", item)
			}

			if parts[0] == "" {
				continue
			}
			specgen.AddProcessEnv(parts[0], parts[1])
		}
	}

	// Set working directory
	// Pick it up from image config first and override if specified in CRI
	containerCwd := "/"
	if containerImageConfig != nil {
		imageCwd := containerImageConfig.Config.WorkingDir
		if imageCwd != "" {
			containerCwd = imageCwd
		}
	}
	runtimeCwd := containerConfig.WorkingDir
	if runtimeCwd != "" {
		containerCwd = runtimeCwd
	}
	specgen.SetProcessCwd(containerCwd)

	if err := s.setupOCIHooks(&specgen, sb, containerConfig, processArgs[0]); err != nil {
		return nil, err
	}

	// Setup user and groups
	if linux != nil {
		if err = setupContainerUser(&specgen, mountPoint, linux.GetSecurityContext(), containerImageConfig); err != nil {
			return nil, err
		}
	}

	// Set up pids limit if pids cgroup is mounted
	_, err = cgroups.FindCgroupMountpoint("pids")
	if err == nil {
		specgen.SetLinuxResourcesPidsLimit(s.config.PidsLimit)
	}

	// by default, the root path is an empty string. set it now.
	specgen.SetRootPath(mountPoint)

	// The spec is written to both the container's Dir and its RunDir.
	saveOptions := generate.ExportOptions{}
	if err = specgen.SaveToFile(filepath.Join(containerInfo.Dir, "config.json"), saveOptions); err != nil {
		return nil, err
	}
	if err = specgen.SaveToFile(filepath.Join(containerInfo.RunDir, "config.json"), saveOptions); err != nil {
		return nil, err
	}

	container, err := oci.NewContainer(containerID, containerName, containerInfo.RunDir, logPath, sb.NetNs(), labels, kubeAnnotations, image, imageName, imageRef, metadata, sb.ID(), containerConfig.Tty, containerConfig.Stdin, containerConfig.StdinOnce, sb.Privileged(), sb.Trusted(), containerInfo.Dir, created, containerImageConfig.Config.StopSignal)
	if err != nil {
		return nil, err
	}
	container.SetMountPoint(mountPoint)

	for _, cv := range containerVolumes {
		container.AddVolume(cv)
	}

	succeeded = true
	return container, nil
}
// setupSeccomp applies the seccomp profile selected through the sandbox
// annotations to specgen.
//
// The container-scoped annotation for cname takes precedence over the
// pod-scoped one; with neither present the profile is "unconfined". When
// seccomp is not enabled on the server, any profile other than "unconfined" is
// rejected. "unconfined" clears the spec's seccomp section, "runtime/default"
// loads the server's profile, and any other value must start with
// "localhost/" — such profiles are accepted but not applied yet.
func (s *Server) setupSeccomp(specgen *generate.Generator, cname string, sbAnnotations map[string]string) error {
	// running w/o seccomp, aka unconfined, unless an annotation says otherwise
	profile := seccompUnconfined
	if ctrProfile, found := sbAnnotations["security.alpha.kubernetes.io/seccomp/container/"+cname]; found {
		profile = ctrProfile
	} else if podProfile, found := sbAnnotations["security.alpha.kubernetes.io/seccomp/pod"]; found {
		profile = podProfile
	}
	unconfined := profile == seccompUnconfined

	if !s.seccompEnabled {
		if !unconfined {
			return fmt.Errorf("seccomp is not enabled in your kernel, cannot run with a profile")
		}
		logrus.Warn("seccomp is not enabled in your kernel, running container without profile")
	}

	switch {
	case unconfined:
		// running w/o seccomp, aka unconfined
		specgen.Spec().Linux.Seccomp = nil
		return nil
	case profile == seccompRuntimeDefault:
		return seccomp.LoadProfileFromStruct(s.seccompProfile, specgen)
	case !strings.HasPrefix(profile, seccompLocalhostPrefix):
		return fmt.Errorf("unknown seccomp profile option: %q", profile)
	}

	//file, err := ioutil.ReadFile(filepath.Join(s.seccompProfileRoot, strings.TrimPrefix(profile, seccompLocalhostPrefix)))
	//if err != nil {
	//return err
	//}
	// TODO(runcom): setup from provided node's seccomp profile
	// can't do this yet, see https://issues.k8s.io/36997
	return nil
}
// getAppArmorProfileName returns the AppArmor profile to use for container
// ctrName, based on the pod annotations. It returns an empty string when no
// profile is requested, the server's configured profile when the runtime
// default is requested, and otherwise the requested name with the profile
// name prefix stripped.
func (s *Server) getAppArmorProfileName(annotations map[string]string, ctrName string) string {
	requested := apparmor.GetProfileNameFromPodAnnotations(annotations, ctrName)
	switch {
	case requested == "":
		return ""
	case requested == apparmor.ProfileRuntimeDefault:
		// If the value is runtime/default, then return default profile.
		return s.appArmorProfile
	default:
		return strings.TrimPrefix(requested, apparmor.ProfileNamePrefix)
	}
}
// openContainerFile opens a file inside a container rootfs safely: path is
// joined onto rootfs and symlinks are resolved within the scope of rootfs
// before the file is opened.
//
// On any failure the returned io.ReadCloser is a true nil interface, so
// callers and downstream consumers can test it against nil.
func openContainerFile(rootfs string, path string) (io.ReadCloser, error) {
	fp, err := symlink.FollowSymlinkInScope(filepath.Join(rootfs, path), rootfs)
	if err != nil {
		return nil, err
	}
	fh, err := os.Open(fp)
	if err != nil {
		// Returning os.Open's results directly would wrap a nil *os.File in
		// a non-nil io.ReadCloser, which defeats the nil checks consumers use
		// to detect that the container file is missing.
		return nil, err
	}
	return fh, nil
}
// getUserInfo returns the UID, GID and additional group IDs for userName,
// resolved against /etc/passwd and /etc/group inside the container rootfs.
//
// Failing to open either file is only logged, because not every image ships
// them; the lookup then proceeds with whatever could be opened.
func getUserInfo(rootfs string, userName string) (uint32, uint32, []uint32, error) {
	passwdFile, passwdErr := openContainerFile(rootfs, "/etc/passwd")
	if passwdErr == nil {
		defer passwdFile.Close()
	} else {
		logrus.Warnf("Failed to open /etc/passwd: %v", passwdErr)
	}

	groupFile, groupErr := openContainerFile(rootfs, "/etc/group")
	if groupErr == nil {
		defer groupFile.Close()
	} else {
		logrus.Warnf("Failed to open /etc/group: %v", groupErr)
	}

	execUser, lookupErr := user.GetExecUser(userName, nil, passwdFile, groupFile)
	if lookupErr != nil {
		return 0, 0, nil, lookupErr
	}

	// Stays nil when the user has no supplementary groups.
	var additionalGids []uint32
	for i := range execUser.Sgids {
		additionalGids = append(additionalGids, uint32(execUser.Sgids[i]))
	}

	return uint32(execUser.Uid), uint32(execUser.Gid), additionalGids, nil
}