4cab8ed06a
Because they need to prepare the hypervisor networking interfaces and have them match the ones created in the pod networking namespace (typically to bridge TAP and veth interfaces), hypervisor based container runtimes need the sandbox pod networking namespace to be set up before it's created. They can then prepare and start the hypervisor interfaces when creating the pod virtual machine. In order to do so, we need to create per pod persitent networking namespaces that we pass to the CNI plugin. This patch leverages the CNI ns package to create such namespaces under /var/run/netns, and assign them to all pod containers. The persitent namespace is removed when either the pod is stopped or removed. Since the StopPodSandbox() API can be called multiple times from kubelet, we track the pod networking namespace state (closed or not) so that we don't get a containernetworking/ns package error when calling its Close() routine multiple times as well. Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
417 lines
11 KiB
Go
417 lines
11 KiB
Go
package server
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"os"
|
|
"path/filepath"
|
|
"sync"
|
|
"syscall"
|
|
|
|
"github.com/Sirupsen/logrus"
|
|
"github.com/docker/docker/pkg/registrar"
|
|
"github.com/docker/docker/pkg/truncindex"
|
|
"github.com/kubernetes-incubator/cri-o/oci"
|
|
"github.com/kubernetes-incubator/cri-o/server/apparmor"
|
|
"github.com/kubernetes-incubator/cri-o/server/seccomp"
|
|
"github.com/kubernetes-incubator/cri-o/utils"
|
|
"github.com/opencontainers/runc/libcontainer/label"
|
|
rspec "github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/rajatchopra/ocicni"
|
|
pb "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/runtime"
|
|
)
|
|
|
|
const (
|
|
runtimeAPIVersion = "v1alpha1"
|
|
)
|
|
|
|
// Server implements the RuntimeService and ImageService
|
|
type Server struct {
|
|
config Config
|
|
runtime *oci.Runtime
|
|
stateLock sync.Mutex
|
|
state *serverState
|
|
netPlugin ocicni.CNIPlugin
|
|
podNameIndex *registrar.Registrar
|
|
podIDIndex *truncindex.TruncIndex
|
|
ctrNameIndex *registrar.Registrar
|
|
ctrIDIndex *truncindex.TruncIndex
|
|
|
|
seccompEnabled bool
|
|
seccompProfile seccomp.Seccomp
|
|
|
|
appArmorEnabled bool
|
|
appArmorProfile string
|
|
}
|
|
|
|
func (s *Server) loadContainer(id string) error {
|
|
config, err := ioutil.ReadFile(filepath.Join(s.runtime.ContainerDir(), id, "config.json"))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
var m rspec.Spec
|
|
if err = json.Unmarshal(config, &m); err != nil {
|
|
return err
|
|
}
|
|
labels := make(map[string]string)
|
|
if err = json.Unmarshal([]byte(m.Annotations["ocid/labels"]), &labels); err != nil {
|
|
return err
|
|
}
|
|
name := m.Annotations["ocid/name"]
|
|
name, err = s.reserveContainerName(id, name)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
var metadata pb.ContainerMetadata
|
|
if err = json.Unmarshal([]byte(m.Annotations["ocid/metadata"]), &metadata); err != nil {
|
|
return err
|
|
}
|
|
sb := s.getSandbox(m.Annotations["ocid/sandbox_id"])
|
|
if sb == nil {
|
|
logrus.Warnf("could not get sandbox with id %s, skipping", m.Annotations["ocid/sandbox_id"])
|
|
return nil
|
|
}
|
|
|
|
var tty bool
|
|
if v := m.Annotations["ocid/tty"]; v == "true" {
|
|
tty = true
|
|
}
|
|
containerPath := filepath.Join(s.runtime.ContainerDir(), id)
|
|
|
|
var img *pb.ImageSpec
|
|
image, ok := m.Annotations["ocid/image"]
|
|
if ok {
|
|
img = &pb.ImageSpec{
|
|
Image: &image,
|
|
}
|
|
}
|
|
|
|
annotations := make(map[string]string)
|
|
if err = json.Unmarshal([]byte(m.Annotations["ocid/annotations"]), &annotations); err != nil {
|
|
return err
|
|
}
|
|
|
|
ctr, err := oci.NewContainer(id, name, containerPath, m.Annotations["ocid/log_path"], sb.netNs(), labels, annotations, img, &metadata, sb.id, tty)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
s.addContainer(ctr)
|
|
if err = s.runtime.UpdateStatus(ctr); err != nil {
|
|
logrus.Warnf("error updating status for container %s: %v", ctr.ID(), err)
|
|
}
|
|
if err = s.ctrIDIndex.Add(id); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func configNetNsPath(spec rspec.Spec) (string, error) {
|
|
for _, ns := range spec.Linux.Namespaces {
|
|
if ns.Type != rspec.NetworkNamespace {
|
|
continue
|
|
}
|
|
|
|
if ns.Path == "" {
|
|
return "", fmt.Errorf("empty networking namespace")
|
|
}
|
|
|
|
return ns.Path, nil
|
|
}
|
|
|
|
return "", fmt.Errorf("missing networking namespace")
|
|
}
|
|
|
|
func (s *Server) loadSandbox(id string) error {
|
|
config, err := ioutil.ReadFile(filepath.Join(s.config.SandboxDir, id, "config.json"))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
var m rspec.Spec
|
|
if err = json.Unmarshal(config, &m); err != nil {
|
|
return err
|
|
}
|
|
labels := make(map[string]string)
|
|
if err = json.Unmarshal([]byte(m.Annotations["ocid/labels"]), &labels); err != nil {
|
|
return err
|
|
}
|
|
name := m.Annotations["ocid/name"]
|
|
name, err = s.reservePodName(id, name)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
var metadata pb.PodSandboxMetadata
|
|
if err = json.Unmarshal([]byte(m.Annotations["ocid/metadata"]), &metadata); err != nil {
|
|
return err
|
|
}
|
|
|
|
processLabel, mountLabel, err := label.InitLabels(label.DupSecOpt(m.Process.SelinuxLabel))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
annotations := make(map[string]string)
|
|
if err = json.Unmarshal([]byte(m.Annotations["ocid/annotations"]), &annotations); err != nil {
|
|
return err
|
|
}
|
|
|
|
sb := &sandbox{
|
|
id: id,
|
|
name: name,
|
|
logDir: m.Annotations["ocid/log_path"],
|
|
labels: labels,
|
|
containers: oci.NewMemoryStore(),
|
|
processLabel: processLabel,
|
|
mountLabel: mountLabel,
|
|
annotations: annotations,
|
|
metadata: &metadata,
|
|
shmPath: m.Annotations["ocid/shm_path"],
|
|
}
|
|
|
|
// We add a netNS only if we can load a permanent one.
|
|
// Otherwise, the sandbox will live in the host namespace.
|
|
netNsPath, err := configNetNsPath(m)
|
|
if err == nil {
|
|
netNS, nsErr := netNsGet(netNsPath)
|
|
// If we can't load the networking namespace
|
|
// because it's closed, we just set the sb netns
|
|
// pointer to nil. Otherwise we return an error.
|
|
if nsErr != nil && nsErr != errSandboxClosedNetNS {
|
|
return nsErr
|
|
}
|
|
|
|
sb.netns = netNS
|
|
}
|
|
|
|
s.addSandbox(sb)
|
|
|
|
sandboxPath := filepath.Join(s.config.SandboxDir, id)
|
|
|
|
if err = label.ReserveLabel(processLabel); err != nil {
|
|
return err
|
|
}
|
|
|
|
cname, err := s.reserveContainerName(m.Annotations["ocid/container_id"], m.Annotations["ocid/container_name"])
|
|
if err != nil {
|
|
return err
|
|
}
|
|
scontainer, err := oci.NewContainer(m.Annotations["ocid/container_id"], cname, sandboxPath, sandboxPath, sb.netNs(), labels, annotations, nil, nil, id, false)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
sb.infraContainer = scontainer
|
|
if err = s.runtime.UpdateStatus(scontainer); err != nil {
|
|
logrus.Warnf("error updating status for container %s: %v", scontainer.ID(), err)
|
|
}
|
|
if err = s.ctrIDIndex.Add(scontainer.ID()); err != nil {
|
|
return err
|
|
}
|
|
if err = s.podIDIndex.Add(id); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Server) restore() {
|
|
sandboxDir, err := ioutil.ReadDir(s.config.SandboxDir)
|
|
if err != nil && !os.IsNotExist(err) {
|
|
logrus.Warnf("could not read sandbox directory %s: %v", sandboxDir, err)
|
|
}
|
|
for _, v := range sandboxDir {
|
|
if !v.IsDir() {
|
|
continue
|
|
}
|
|
if err = s.loadSandbox(v.Name()); err != nil {
|
|
logrus.Warnf("could not restore sandbox %s: %v", v.Name(), err)
|
|
}
|
|
}
|
|
containerDir, err := ioutil.ReadDir(s.runtime.ContainerDir())
|
|
if err != nil && !os.IsNotExist(err) {
|
|
logrus.Warnf("could not read container directory %s: %v", s.runtime.ContainerDir(), err)
|
|
}
|
|
for _, v := range containerDir {
|
|
if !v.IsDir() {
|
|
continue
|
|
}
|
|
if err := s.loadContainer(v.Name()); err != nil {
|
|
logrus.Warnf("could not restore container %s: %v", v.Name(), err)
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
func (s *Server) reservePodName(id, name string) (string, error) {
|
|
if err := s.podNameIndex.Reserve(name, id); err != nil {
|
|
if err == registrar.ErrNameReserved {
|
|
id, err := s.podNameIndex.Get(name)
|
|
if err != nil {
|
|
logrus.Warnf("conflict, pod name %q already reserved", name)
|
|
return "", err
|
|
}
|
|
return "", fmt.Errorf("conflict, name %q already reserved for pod %q", name, id)
|
|
}
|
|
return "", fmt.Errorf("error reserving pod name %q", name)
|
|
}
|
|
return name, nil
|
|
}
|
|
|
|
func (s *Server) releasePodName(name string) {
|
|
s.podNameIndex.Release(name)
|
|
}
|
|
|
|
func (s *Server) reserveContainerName(id, name string) (string, error) {
|
|
if err := s.ctrNameIndex.Reserve(name, id); err != nil {
|
|
if err == registrar.ErrNameReserved {
|
|
id, err := s.ctrNameIndex.Get(name)
|
|
if err != nil {
|
|
logrus.Warnf("conflict, ctr name %q already reserved", name)
|
|
return "", err
|
|
}
|
|
return "", fmt.Errorf("conflict, name %q already reserved for ctr %q", name, id)
|
|
}
|
|
return "", fmt.Errorf("error reserving ctr name %s", name)
|
|
}
|
|
return name, nil
|
|
}
|
|
|
|
func (s *Server) releaseContainerName(name string) {
|
|
s.ctrNameIndex.Release(name)
|
|
}
|
|
|
|
const (
|
|
// SeccompModeFilter refers to the syscall argument SECCOMP_MODE_FILTER.
|
|
SeccompModeFilter = uintptr(2)
|
|
)
|
|
|
|
func seccompEnabled() bool {
|
|
var enabled bool
|
|
// Check if Seccomp is supported, via CONFIG_SECCOMP.
|
|
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_GET_SECCOMP, 0, 0); err != syscall.EINVAL {
|
|
// Make sure the kernel has CONFIG_SECCOMP_FILTER.
|
|
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_SECCOMP, SeccompModeFilter, 0); err != syscall.EINVAL {
|
|
enabled = true
|
|
}
|
|
}
|
|
return enabled
|
|
}
|
|
|
|
// New creates a new Server with options provided
|
|
func New(config *Config) (*Server, error) {
|
|
// TODO: This will go away later when we have wrapper process or systemd acting as
|
|
// subreaper.
|
|
if err := utils.SetSubreaper(1); err != nil {
|
|
return nil, fmt.Errorf("failed to set server as subreaper: %v", err)
|
|
}
|
|
|
|
utils.StartReaper()
|
|
|
|
if err := os.MkdirAll(config.ImageDir, 0755); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err := os.MkdirAll(config.SandboxDir, 0755); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
r, err := oci.New(config.Runtime, config.ContainerDir, config.Conmon, config.ConmonEnv)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
sandboxes := make(map[string]*sandbox)
|
|
containers := oci.NewMemoryStore()
|
|
netPlugin, err := ocicni.InitCNI("")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
s := &Server{
|
|
runtime: r,
|
|
netPlugin: netPlugin,
|
|
config: *config,
|
|
state: &serverState{
|
|
sandboxes: sandboxes,
|
|
containers: containers,
|
|
},
|
|
seccompEnabled: seccompEnabled(),
|
|
appArmorEnabled: apparmor.IsEnabled(),
|
|
}
|
|
seccompProfile, err := ioutil.ReadFile(config.SeccompProfile)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("opening seccomp profile (%s) failed: %v", config.SeccompProfile, err)
|
|
}
|
|
var seccompConfig seccomp.Seccomp
|
|
if err := json.Unmarshal(seccompProfile, &seccompConfig); err != nil {
|
|
return nil, fmt.Errorf("decoding seccomp profile failed: %v", err)
|
|
}
|
|
s.seccompProfile = seccompConfig
|
|
|
|
if s.appArmorEnabled {
|
|
apparmor.InstallDefaultAppArmorProfile()
|
|
}
|
|
s.appArmorProfile = config.ApparmorProfile
|
|
|
|
s.podIDIndex = truncindex.NewTruncIndex([]string{})
|
|
s.podNameIndex = registrar.NewRegistrar()
|
|
s.ctrIDIndex = truncindex.NewTruncIndex([]string{})
|
|
s.ctrNameIndex = registrar.NewRegistrar()
|
|
|
|
s.restore()
|
|
|
|
logrus.Debugf("sandboxes: %v", s.state.sandboxes)
|
|
logrus.Debugf("containers: %v", s.state.containers)
|
|
return s, nil
|
|
}
|
|
|
|
type serverState struct {
|
|
sandboxes map[string]*sandbox
|
|
containers oci.Store
|
|
}
|
|
|
|
func (s *Server) addSandbox(sb *sandbox) {
|
|
s.stateLock.Lock()
|
|
s.state.sandboxes[sb.id] = sb
|
|
s.stateLock.Unlock()
|
|
}
|
|
|
|
func (s *Server) getSandbox(id string) *sandbox {
|
|
s.stateLock.Lock()
|
|
sb := s.state.sandboxes[id]
|
|
s.stateLock.Unlock()
|
|
return sb
|
|
}
|
|
|
|
func (s *Server) hasSandbox(id string) bool {
|
|
s.stateLock.Lock()
|
|
_, ok := s.state.sandboxes[id]
|
|
s.stateLock.Unlock()
|
|
return ok
|
|
}
|
|
|
|
func (s *Server) removeSandbox(id string) {
|
|
s.stateLock.Lock()
|
|
delete(s.state.sandboxes, id)
|
|
s.stateLock.Unlock()
|
|
}
|
|
|
|
func (s *Server) addContainer(c *oci.Container) {
|
|
s.stateLock.Lock()
|
|
sandbox := s.state.sandboxes[c.Sandbox()]
|
|
// TODO(runcom): handle !ok above!!! otherwise it panics!
|
|
sandbox.addContainer(c)
|
|
s.state.containers.Add(c.ID(), c)
|
|
s.stateLock.Unlock()
|
|
}
|
|
|
|
func (s *Server) getContainer(id string) *oci.Container {
|
|
s.stateLock.Lock()
|
|
c := s.state.containers.Get(id)
|
|
s.stateLock.Unlock()
|
|
return c
|
|
}
|
|
|
|
func (s *Server) removeContainer(c *oci.Container) {
|
|
s.stateLock.Lock()
|
|
sandbox := s.state.sandboxes[c.Sandbox()]
|
|
sandbox.removeContainer(c)
|
|
s.state.containers.Delete(c.ID())
|
|
s.stateLock.Unlock()
|
|
}
|