cri-o/server/server.go
Samuel Ortiz 4cab8ed06a
sandbox: Use persistent networking namespace
Because they need to prepare the hypervisor networking interfaces
and have them match the ones created in the pod networking
namespace (typically to bridge TAP and veth interfaces), hypervisor
based container runtimes need the sandbox pod networking namespace
to be set up before it's created. They can then prepare and start
the hypervisor interfaces when creating the pod virtual machine.

In order to do so, we need to create per pod persitent networking
namespaces that we pass to the CNI plugin. This patch leverages
the CNI ns package to create such namespaces under /var/run/netns,
and assign them to all pod containers.
The persitent namespace is removed when either the pod is stopped
or removed.

Since the StopPodSandbox() API can be called multiple times from
kubelet, we track the pod networking namespace state (closed or
not) so that we don't get a containernetworking/ns package error
when calling its Close() routine multiple times as well.

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
2016-12-12 19:48:23 +01:00

417 lines
11 KiB
Go

package server
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"sync"
"syscall"
"github.com/Sirupsen/logrus"
"github.com/docker/docker/pkg/registrar"
"github.com/docker/docker/pkg/truncindex"
"github.com/kubernetes-incubator/cri-o/oci"
"github.com/kubernetes-incubator/cri-o/server/apparmor"
"github.com/kubernetes-incubator/cri-o/server/seccomp"
"github.com/kubernetes-incubator/cri-o/utils"
"github.com/opencontainers/runc/libcontainer/label"
rspec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/rajatchopra/ocicni"
pb "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/runtime"
)
const (
runtimeAPIVersion = "v1alpha1"
)
// Server implements the RuntimeService and ImageService
type Server struct {
config Config
runtime *oci.Runtime
stateLock sync.Mutex
state *serverState
netPlugin ocicni.CNIPlugin
podNameIndex *registrar.Registrar
podIDIndex *truncindex.TruncIndex
ctrNameIndex *registrar.Registrar
ctrIDIndex *truncindex.TruncIndex
seccompEnabled bool
seccompProfile seccomp.Seccomp
appArmorEnabled bool
appArmorProfile string
}
func (s *Server) loadContainer(id string) error {
config, err := ioutil.ReadFile(filepath.Join(s.runtime.ContainerDir(), id, "config.json"))
if err != nil {
return err
}
var m rspec.Spec
if err = json.Unmarshal(config, &m); err != nil {
return err
}
labels := make(map[string]string)
if err = json.Unmarshal([]byte(m.Annotations["ocid/labels"]), &labels); err != nil {
return err
}
name := m.Annotations["ocid/name"]
name, err = s.reserveContainerName(id, name)
if err != nil {
return err
}
var metadata pb.ContainerMetadata
if err = json.Unmarshal([]byte(m.Annotations["ocid/metadata"]), &metadata); err != nil {
return err
}
sb := s.getSandbox(m.Annotations["ocid/sandbox_id"])
if sb == nil {
logrus.Warnf("could not get sandbox with id %s, skipping", m.Annotations["ocid/sandbox_id"])
return nil
}
var tty bool
if v := m.Annotations["ocid/tty"]; v == "true" {
tty = true
}
containerPath := filepath.Join(s.runtime.ContainerDir(), id)
var img *pb.ImageSpec
image, ok := m.Annotations["ocid/image"]
if ok {
img = &pb.ImageSpec{
Image: &image,
}
}
annotations := make(map[string]string)
if err = json.Unmarshal([]byte(m.Annotations["ocid/annotations"]), &annotations); err != nil {
return err
}
ctr, err := oci.NewContainer(id, name, containerPath, m.Annotations["ocid/log_path"], sb.netNs(), labels, annotations, img, &metadata, sb.id, tty)
if err != nil {
return err
}
s.addContainer(ctr)
if err = s.runtime.UpdateStatus(ctr); err != nil {
logrus.Warnf("error updating status for container %s: %v", ctr.ID(), err)
}
if err = s.ctrIDIndex.Add(id); err != nil {
return err
}
return nil
}
func configNetNsPath(spec rspec.Spec) (string, error) {
for _, ns := range spec.Linux.Namespaces {
if ns.Type != rspec.NetworkNamespace {
continue
}
if ns.Path == "" {
return "", fmt.Errorf("empty networking namespace")
}
return ns.Path, nil
}
return "", fmt.Errorf("missing networking namespace")
}
func (s *Server) loadSandbox(id string) error {
config, err := ioutil.ReadFile(filepath.Join(s.config.SandboxDir, id, "config.json"))
if err != nil {
return err
}
var m rspec.Spec
if err = json.Unmarshal(config, &m); err != nil {
return err
}
labels := make(map[string]string)
if err = json.Unmarshal([]byte(m.Annotations["ocid/labels"]), &labels); err != nil {
return err
}
name := m.Annotations["ocid/name"]
name, err = s.reservePodName(id, name)
if err != nil {
return err
}
var metadata pb.PodSandboxMetadata
if err = json.Unmarshal([]byte(m.Annotations["ocid/metadata"]), &metadata); err != nil {
return err
}
processLabel, mountLabel, err := label.InitLabels(label.DupSecOpt(m.Process.SelinuxLabel))
if err != nil {
return err
}
annotations := make(map[string]string)
if err = json.Unmarshal([]byte(m.Annotations["ocid/annotations"]), &annotations); err != nil {
return err
}
sb := &sandbox{
id: id,
name: name,
logDir: m.Annotations["ocid/log_path"],
labels: labels,
containers: oci.NewMemoryStore(),
processLabel: processLabel,
mountLabel: mountLabel,
annotations: annotations,
metadata: &metadata,
shmPath: m.Annotations["ocid/shm_path"],
}
// We add a netNS only if we can load a permanent one.
// Otherwise, the sandbox will live in the host namespace.
netNsPath, err := configNetNsPath(m)
if err == nil {
netNS, nsErr := netNsGet(netNsPath)
// If we can't load the networking namespace
// because it's closed, we just set the sb netns
// pointer to nil. Otherwise we return an error.
if nsErr != nil && nsErr != errSandboxClosedNetNS {
return nsErr
}
sb.netns = netNS
}
s.addSandbox(sb)
sandboxPath := filepath.Join(s.config.SandboxDir, id)
if err = label.ReserveLabel(processLabel); err != nil {
return err
}
cname, err := s.reserveContainerName(m.Annotations["ocid/container_id"], m.Annotations["ocid/container_name"])
if err != nil {
return err
}
scontainer, err := oci.NewContainer(m.Annotations["ocid/container_id"], cname, sandboxPath, sandboxPath, sb.netNs(), labels, annotations, nil, nil, id, false)
if err != nil {
return err
}
sb.infraContainer = scontainer
if err = s.runtime.UpdateStatus(scontainer); err != nil {
logrus.Warnf("error updating status for container %s: %v", scontainer.ID(), err)
}
if err = s.ctrIDIndex.Add(scontainer.ID()); err != nil {
return err
}
if err = s.podIDIndex.Add(id); err != nil {
return err
}
return nil
}
func (s *Server) restore() {
sandboxDir, err := ioutil.ReadDir(s.config.SandboxDir)
if err != nil && !os.IsNotExist(err) {
logrus.Warnf("could not read sandbox directory %s: %v", sandboxDir, err)
}
for _, v := range sandboxDir {
if !v.IsDir() {
continue
}
if err = s.loadSandbox(v.Name()); err != nil {
logrus.Warnf("could not restore sandbox %s: %v", v.Name(), err)
}
}
containerDir, err := ioutil.ReadDir(s.runtime.ContainerDir())
if err != nil && !os.IsNotExist(err) {
logrus.Warnf("could not read container directory %s: %v", s.runtime.ContainerDir(), err)
}
for _, v := range containerDir {
if !v.IsDir() {
continue
}
if err := s.loadContainer(v.Name()); err != nil {
logrus.Warnf("could not restore container %s: %v", v.Name(), err)
}
}
}
func (s *Server) reservePodName(id, name string) (string, error) {
if err := s.podNameIndex.Reserve(name, id); err != nil {
if err == registrar.ErrNameReserved {
id, err := s.podNameIndex.Get(name)
if err != nil {
logrus.Warnf("conflict, pod name %q already reserved", name)
return "", err
}
return "", fmt.Errorf("conflict, name %q already reserved for pod %q", name, id)
}
return "", fmt.Errorf("error reserving pod name %q", name)
}
return name, nil
}
func (s *Server) releasePodName(name string) {
s.podNameIndex.Release(name)
}
func (s *Server) reserveContainerName(id, name string) (string, error) {
if err := s.ctrNameIndex.Reserve(name, id); err != nil {
if err == registrar.ErrNameReserved {
id, err := s.ctrNameIndex.Get(name)
if err != nil {
logrus.Warnf("conflict, ctr name %q already reserved", name)
return "", err
}
return "", fmt.Errorf("conflict, name %q already reserved for ctr %q", name, id)
}
return "", fmt.Errorf("error reserving ctr name %s", name)
}
return name, nil
}
func (s *Server) releaseContainerName(name string) {
s.ctrNameIndex.Release(name)
}
const (
// SeccompModeFilter refers to the syscall argument SECCOMP_MODE_FILTER.
SeccompModeFilter = uintptr(2)
)
func seccompEnabled() bool {
var enabled bool
// Check if Seccomp is supported, via CONFIG_SECCOMP.
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_GET_SECCOMP, 0, 0); err != syscall.EINVAL {
// Make sure the kernel has CONFIG_SECCOMP_FILTER.
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_SECCOMP, SeccompModeFilter, 0); err != syscall.EINVAL {
enabled = true
}
}
return enabled
}
// New creates a new Server with options provided
func New(config *Config) (*Server, error) {
// TODO: This will go away later when we have wrapper process or systemd acting as
// subreaper.
if err := utils.SetSubreaper(1); err != nil {
return nil, fmt.Errorf("failed to set server as subreaper: %v", err)
}
utils.StartReaper()
if err := os.MkdirAll(config.ImageDir, 0755); err != nil {
return nil, err
}
if err := os.MkdirAll(config.SandboxDir, 0755); err != nil {
return nil, err
}
r, err := oci.New(config.Runtime, config.ContainerDir, config.Conmon, config.ConmonEnv)
if err != nil {
return nil, err
}
sandboxes := make(map[string]*sandbox)
containers := oci.NewMemoryStore()
netPlugin, err := ocicni.InitCNI("")
if err != nil {
return nil, err
}
s := &Server{
runtime: r,
netPlugin: netPlugin,
config: *config,
state: &serverState{
sandboxes: sandboxes,
containers: containers,
},
seccompEnabled: seccompEnabled(),
appArmorEnabled: apparmor.IsEnabled(),
}
seccompProfile, err := ioutil.ReadFile(config.SeccompProfile)
if err != nil {
return nil, fmt.Errorf("opening seccomp profile (%s) failed: %v", config.SeccompProfile, err)
}
var seccompConfig seccomp.Seccomp
if err := json.Unmarshal(seccompProfile, &seccompConfig); err != nil {
return nil, fmt.Errorf("decoding seccomp profile failed: %v", err)
}
s.seccompProfile = seccompConfig
if s.appArmorEnabled {
apparmor.InstallDefaultAppArmorProfile()
}
s.appArmorProfile = config.ApparmorProfile
s.podIDIndex = truncindex.NewTruncIndex([]string{})
s.podNameIndex = registrar.NewRegistrar()
s.ctrIDIndex = truncindex.NewTruncIndex([]string{})
s.ctrNameIndex = registrar.NewRegistrar()
s.restore()
logrus.Debugf("sandboxes: %v", s.state.sandboxes)
logrus.Debugf("containers: %v", s.state.containers)
return s, nil
}
type serverState struct {
sandboxes map[string]*sandbox
containers oci.Store
}
func (s *Server) addSandbox(sb *sandbox) {
s.stateLock.Lock()
s.state.sandboxes[sb.id] = sb
s.stateLock.Unlock()
}
func (s *Server) getSandbox(id string) *sandbox {
s.stateLock.Lock()
sb := s.state.sandboxes[id]
s.stateLock.Unlock()
return sb
}
func (s *Server) hasSandbox(id string) bool {
s.stateLock.Lock()
_, ok := s.state.sandboxes[id]
s.stateLock.Unlock()
return ok
}
func (s *Server) removeSandbox(id string) {
s.stateLock.Lock()
delete(s.state.sandboxes, id)
s.stateLock.Unlock()
}
func (s *Server) addContainer(c *oci.Container) {
s.stateLock.Lock()
sandbox := s.state.sandboxes[c.Sandbox()]
// TODO(runcom): handle !ok above!!! otherwise it panics!
sandbox.addContainer(c)
s.state.containers.Add(c.ID(), c)
s.stateLock.Unlock()
}
func (s *Server) getContainer(id string) *oci.Container {
s.stateLock.Lock()
c := s.state.containers.Get(id)
s.stateLock.Unlock()
return c
}
func (s *Server) removeContainer(c *oci.Container) {
s.stateLock.Lock()
sandbox := s.state.sandboxes[c.Sandbox()]
sandbox.removeContainer(c)
s.state.containers.Delete(c.ID())
s.stateLock.Unlock()
}