cri-o/server/server.go
Andrew Pilloud 17359e34a6 server: init iptables without kube
Signed-off-by: Andrew Pilloud <andrewpilloud@igneoussystems.com>
2017-06-22 08:51:50 -07:00

728 lines
21 KiB
Go

package server
import (
"encoding/json"
"fmt"
"io/ioutil"
"net"
"os"
"path/filepath"
"sync"
"time"
"github.com/Sirupsen/logrus"
"github.com/containers/image/types"
sstorage "github.com/containers/storage"
"github.com/docker/docker/pkg/ioutils"
"github.com/docker/docker/pkg/registrar"
"github.com/docker/docker/pkg/truncindex"
"github.com/kubernetes-incubator/cri-o/oci"
"github.com/kubernetes-incubator/cri-o/pkg/annotations"
"github.com/kubernetes-incubator/cri-o/pkg/ocicni"
"github.com/kubernetes-incubator/cri-o/pkg/storage"
"github.com/kubernetes-incubator/cri-o/server/apparmor"
"github.com/kubernetes-incubator/cri-o/server/seccomp"
rspec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux/label"
knet "k8s.io/apimachinery/pkg/util/net"
pb "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/runtime"
"k8s.io/kubernetes/pkg/kubelet/network/hostport"
"k8s.io/kubernetes/pkg/kubelet/server/streaming"
iptablesproxy "k8s.io/kubernetes/pkg/proxy/iptables"
utildbus "k8s.io/kubernetes/pkg/util/dbus"
utilexec "k8s.io/kubernetes/pkg/util/exec"
utiliptables "k8s.io/kubernetes/pkg/util/iptables"
)
const (
runtimeAPIVersion = "v1alpha1"
shutdownFile = "/var/lib/crio/crio.shutdown"
)
func isTrue(annotaton string) bool {
return annotaton == "true"
}
// streamService implements streaming.Runtime.
type streamService struct {
runtimeServer *Server // needed by Exec() endpoint
streamServer streaming.Server
streaming.Runtime
}
// Server implements the RuntimeService and ImageService
type Server struct {
config Config
runtime *oci.Runtime
store sstorage.Store
storageImageServer storage.ImageServer
storageRuntimeServer storage.RuntimeServer
stateLock sync.Mutex
updateLock sync.RWMutex
state *serverState
netPlugin ocicni.CNIPlugin
hostportManager hostport.HostPortManager
podNameIndex *registrar.Registrar
podIDIndex *truncindex.TruncIndex
ctrNameIndex *registrar.Registrar
ctrIDIndex *truncindex.TruncIndex
imageContext *types.SystemContext
seccompEnabled bool
seccompProfile seccomp.Seccomp
appArmorEnabled bool
appArmorProfile string
stream streamService
}
// GetExec returns exec stream request
func (s *Server) GetExec(req *pb.ExecRequest) (*pb.ExecResponse, error) {
return s.stream.streamServer.GetExec(req)
}
// GetAttach returns attach stream request
func (s *Server) GetAttach(req *pb.AttachRequest) (*pb.AttachResponse, error) {
return s.stream.streamServer.GetAttach(req)
}
// GetPortForward returns port forward stream request
func (s *Server) GetPortForward(req *pb.PortForwardRequest) (*pb.PortForwardResponse, error) {
return s.stream.streamServer.GetPortForward(req)
}
func (s *Server) loadContainer(id string) error {
config, err := s.store.FromContainerDirectory(id, "config.json")
if err != nil {
return err
}
var m rspec.Spec
if err = json.Unmarshal(config, &m); err != nil {
return err
}
labels := make(map[string]string)
if err = json.Unmarshal([]byte(m.Annotations[annotations.Labels]), &labels); err != nil {
return err
}
name := m.Annotations[annotations.Name]
name, err = s.reserveContainerName(id, name)
if err != nil {
return err
}
defer func() {
if err != nil {
s.releaseContainerName(name)
}
}()
var metadata pb.ContainerMetadata
if err = json.Unmarshal([]byte(m.Annotations[annotations.Metadata]), &metadata); err != nil {
return err
}
sb := s.getSandbox(m.Annotations[annotations.SandboxID])
if sb == nil {
return fmt.Errorf("could not get sandbox with id %s, skipping", m.Annotations[annotations.SandboxID])
}
tty := isTrue(m.Annotations[annotations.TTY])
stdin := isTrue(m.Annotations[annotations.Stdin])
stdinOnce := isTrue(m.Annotations[annotations.StdinOnce])
containerPath, err := s.store.ContainerRunDirectory(id)
if err != nil {
return err
}
containerDir, err := s.store.ContainerDirectory(id)
if err != nil {
return err
}
var img *pb.ImageSpec
image, ok := m.Annotations[annotations.Image]
if ok {
img = &pb.ImageSpec{
Image: image,
}
}
kubeAnnotations := make(map[string]string)
if err = json.Unmarshal([]byte(m.Annotations[annotations.Annotations]), &kubeAnnotations); err != nil {
return err
}
created, err := time.Parse(time.RFC3339Nano, m.Annotations[annotations.Created])
if err != nil {
return err
}
ctr, err := oci.NewContainer(id, name, containerPath, m.Annotations[annotations.LogPath], sb.netNs(), labels, kubeAnnotations, img, &metadata, sb.id, tty, stdin, stdinOnce, sb.privileged, sb.trusted, containerDir, created, m.Annotations["org.opencontainers.image.stopSignal"])
if err != nil {
return err
}
s.containerStateFromDisk(ctr)
s.addContainer(ctr)
return s.ctrIDIndex.Add(id)
}
func (s *Server) containerStateFromDisk(c *oci.Container) error {
if err := c.FromDisk(); err != nil {
return err
}
// ignore errors, this is a best effort to have up-to-date info about
// a given container before its state gets stored
s.runtime.UpdateStatus(c)
return nil
}
func (s *Server) containerStateToDisk(c *oci.Container) error {
// ignore errors, this is a best effort to have up-to-date info about
// a given container before its state gets stored
s.runtime.UpdateStatus(c)
jsonSource, err := ioutils.NewAtomicFileWriter(c.StatePath(), 0644)
if err != nil {
return err
}
defer jsonSource.Close()
enc := json.NewEncoder(jsonSource)
return enc.Encode(s.runtime.ContainerStatus(c))
}
func configNetNsPath(spec rspec.Spec) (string, error) {
for _, ns := range spec.Linux.Namespaces {
if ns.Type != rspec.NetworkNamespace {
continue
}
if ns.Path == "" {
return "", fmt.Errorf("empty networking namespace")
}
return ns.Path, nil
}
return "", fmt.Errorf("missing networking namespace")
}
func (s *Server) loadSandbox(id string) error {
config, err := s.store.FromContainerDirectory(id, "config.json")
if err != nil {
return err
}
var m rspec.Spec
if err = json.Unmarshal(config, &m); err != nil {
return err
}
labels := make(map[string]string)
if err = json.Unmarshal([]byte(m.Annotations[annotations.Labels]), &labels); err != nil {
return err
}
name := m.Annotations[annotations.Name]
name, err = s.reservePodName(id, name)
if err != nil {
return err
}
defer func() {
if err != nil {
s.releasePodName(name)
}
}()
var metadata pb.PodSandboxMetadata
if err = json.Unmarshal([]byte(m.Annotations[annotations.Metadata]), &metadata); err != nil {
return err
}
processLabel, mountLabel, err := label.InitLabels(label.DupSecOpt(m.Process.SelinuxLabel))
if err != nil {
return err
}
kubeAnnotations := make(map[string]string)
if err = json.Unmarshal([]byte(m.Annotations[annotations.Annotations]), &kubeAnnotations); err != nil {
return err
}
privileged := isTrue(m.Annotations[annotations.PrivilegedRuntime])
trusted := isTrue(m.Annotations[annotations.TrustedSandbox])
sb := &sandbox{
id: id,
name: name,
kubeName: m.Annotations[annotations.KubeName],
logDir: filepath.Dir(m.Annotations[annotations.LogPath]),
labels: labels,
containers: oci.NewMemoryStore(),
processLabel: processLabel,
mountLabel: mountLabel,
annotations: kubeAnnotations,
metadata: &metadata,
shmPath: m.Annotations[annotations.ShmPath],
privileged: privileged,
trusted: trusted,
resolvPath: m.Annotations[annotations.ResolvPath],
}
// We add a netNS only if we can load a permanent one.
// Otherwise, the sandbox will live in the host namespace.
netNsPath, err := configNetNsPath(m)
if err == nil {
netNS, nsErr := netNsGet(netNsPath, sb.name)
// If we can't load the networking namespace
// because it's closed, we just set the sb netns
// pointer to nil. Otherwise we return an error.
if nsErr != nil && nsErr != errSandboxClosedNetNS {
return nsErr
}
sb.netns = netNS
}
s.addSandbox(sb)
defer func() {
if err != nil {
s.removeSandbox(sb.id)
}
}()
sandboxPath, err := s.store.ContainerRunDirectory(id)
if err != nil {
return err
}
sandboxDir, err := s.store.ContainerDirectory(id)
if err != nil {
return err
}
cname, err := s.reserveContainerName(m.Annotations[annotations.ContainerID], m.Annotations[annotations.ContainerName])
if err != nil {
return err
}
defer func() {
if err != nil {
s.releaseContainerName(cname)
}
}()
created, err := time.Parse(time.RFC3339Nano, m.Annotations[annotations.Created])
if err != nil {
return err
}
scontainer, err := oci.NewContainer(m.Annotations[annotations.ContainerID], cname, sandboxPath, m.Annotations[annotations.LogPath], sb.netNs(), labels, kubeAnnotations, nil, nil, id, false, false, false, privileged, trusted, sandboxDir, created, m.Annotations["org.opencontainers.image.stopSignal"])
if err != nil {
return err
}
s.containerStateFromDisk(scontainer)
if err = label.ReserveLabel(processLabel); err != nil {
return err
}
sb.infraContainer = scontainer
if err = s.ctrIDIndex.Add(scontainer.ID()); err != nil {
return err
}
if err = s.podIDIndex.Add(id); err != nil {
return err
}
return nil
}
func (s *Server) restore() {
containers, err := s.store.Containers()
if err != nil && !os.IsNotExist(err) {
logrus.Warnf("could not read containers and sandboxes: %v", err)
}
pods := map[string]*storage.RuntimeContainerMetadata{}
podContainers := map[string]*storage.RuntimeContainerMetadata{}
for _, container := range containers {
metadata, err2 := s.storageRuntimeServer.GetContainerMetadata(container.ID)
if err2 != nil {
logrus.Warnf("error parsing metadata for %s: %v, ignoring", container.ID, err2)
continue
}
if metadata.Pod {
pods[container.ID] = &metadata
} else {
podContainers[container.ID] = &metadata
}
}
for containerID, metadata := range pods {
if err = s.loadSandbox(containerID); err != nil {
logrus.Warnf("could not restore sandbox %s container %s: %v", metadata.PodID, containerID, err)
}
}
for containerID := range podContainers {
if err := s.loadContainer(containerID); err != nil {
logrus.Warnf("could not restore container %s: %v", containerID, err)
}
}
}
// Update makes changes to the server's state (lists of pods and containers) to
// reflect the list of pods and containers that are stored on disk, possibly
// having been modified by other parties
func (s *Server) Update() {
logrus.Debugf("updating sandbox and container information")
if err := s.update(); err != nil {
logrus.Errorf("error updating sandbox and container information: %v", err)
}
}
func (s *Server) update() error {
s.updateLock.Lock()
defer s.updateLock.Unlock()
containers, err := s.store.Containers()
if err != nil && !os.IsNotExist(err) {
logrus.Warnf("could not read containers and sandboxes: %v", err)
return err
}
newPods := map[string]*storage.RuntimeContainerMetadata{}
oldPods := map[string]string{}
removedPods := map[string]string{}
newPodContainers := map[string]*storage.RuntimeContainerMetadata{}
oldPodContainers := map[string]string{}
removedPodContainers := map[string]string{}
for _, container := range containers {
if s.hasSandbox(container.ID) {
// FIXME: do we need to reload/update any info about the sandbox?
oldPods[container.ID] = container.ID
oldPodContainers[container.ID] = container.ID
continue
}
if s.getContainer(container.ID) != nil {
// FIXME: do we need to reload/update any info about the container?
oldPodContainers[container.ID] = container.ID
continue
}
// not previously known, so figure out what it is
metadata, err2 := s.storageRuntimeServer.GetContainerMetadata(container.ID)
if err2 != nil {
logrus.Errorf("error parsing metadata for %s: %v, ignoring", container.ID, err2)
continue
}
if metadata.Pod {
newPods[container.ID] = &metadata
} else {
newPodContainers[container.ID] = &metadata
}
}
s.ctrIDIndex.Iterate(func(id string) {
if _, ok := oldPodContainers[id]; !ok {
// this container's ID wasn't in the updated list -> removed
removedPodContainers[id] = id
}
})
for removedPodContainer := range removedPodContainers {
// forget this container
c := s.getContainer(removedPodContainer)
if c == nil {
logrus.Warnf("bad state when getting container removed %+v", removedPodContainer)
continue
}
s.releaseContainerName(c.Name())
s.removeContainer(c)
if err = s.ctrIDIndex.Delete(c.ID()); err != nil {
return err
}
logrus.Debugf("forgetting removed pod container %s", c.ID())
}
s.podIDIndex.Iterate(func(id string) {
if _, ok := oldPods[id]; !ok {
// this pod's ID wasn't in the updated list -> removed
removedPods[id] = id
}
})
for removedPod := range removedPods {
// forget this pod
sb := s.getSandbox(removedPod)
if sb == nil {
logrus.Warnf("bad state when getting pod to remove %+v", removedPod)
continue
}
podInfraContainer := sb.infraContainer
s.releaseContainerName(podInfraContainer.Name())
s.removeContainer(podInfraContainer)
if err = s.ctrIDIndex.Delete(podInfraContainer.ID()); err != nil {
return err
}
sb.infraContainer = nil
s.releasePodName(sb.name)
s.removeSandbox(sb.id)
if err = s.podIDIndex.Delete(sb.id); err != nil {
return err
}
logrus.Debugf("forgetting removed pod %s", sb.id)
}
for sandboxID := range newPods {
// load this pod
if err = s.loadSandbox(sandboxID); err != nil {
logrus.Warnf("could not load new pod sandbox %s: %v, ignoring", sandboxID, err)
} else {
logrus.Debugf("loaded new pod sandbox %s", sandboxID, err)
}
}
for containerID := range newPodContainers {
// load this container
if err = s.loadContainer(containerID); err != nil {
logrus.Warnf("could not load new sandbox container %s: %v, ignoring", containerID, err)
} else {
logrus.Debugf("loaded new pod container %s", containerID, err)
}
}
return nil
}
func (s *Server) reservePodName(id, name string) (string, error) {
if err := s.podNameIndex.Reserve(name, id); err != nil {
if err == registrar.ErrNameReserved {
id, err := s.podNameIndex.Get(name)
if err != nil {
logrus.Warnf("conflict, pod name %q already reserved", name)
return "", err
}
return "", fmt.Errorf("conflict, name %q already reserved for pod %q", name, id)
}
return "", fmt.Errorf("error reserving pod name %q", name)
}
return name, nil
}
func (s *Server) releasePodName(name string) {
s.podNameIndex.Release(name)
}
func (s *Server) reserveContainerName(id, name string) (string, error) {
if err := s.ctrNameIndex.Reserve(name, id); err != nil {
if err == registrar.ErrNameReserved {
id, err := s.ctrNameIndex.Get(name)
if err != nil {
logrus.Warnf("conflict, ctr name %q already reserved", name)
return "", err
}
return "", fmt.Errorf("conflict, name %q already reserved for ctr %q", name, id)
}
return "", fmt.Errorf("error reserving ctr name %s", name)
}
return name, nil
}
func (s *Server) releaseContainerName(name string) {
s.ctrNameIndex.Release(name)
}
// cleanupSandboxesOnShutdown Remove all running Sandboxes on system shutdown
func (s *Server) cleanupSandboxesOnShutdown() {
_, err := os.Stat(shutdownFile)
if err == nil || !os.IsNotExist(err) {
logrus.Debugf("shutting down all sandboxes, on shutdown")
s.StopAllPodSandboxes()
err = os.Remove(shutdownFile)
if err != nil {
logrus.Warnf("Failed to remove %q", shutdownFile)
}
}
}
// Shutdown attempts to shut down the server's storage cleanly
func (s *Server) Shutdown() error {
// why do this on clean shutdown! we want containers left running when ocid
// is down for whatever reason no?!
// notice this won't trigger just on system halt but also on normal
// ocid.service restart!!!
s.cleanupSandboxesOnShutdown()
_, err := s.store.Shutdown(false)
return err
}
// New creates a new Server with options provided
func New(config *Config) (*Server, error) {
store, err := sstorage.GetStore(sstorage.StoreOptions{
RunRoot: config.RunRoot,
GraphRoot: config.Root,
GraphDriverName: config.Storage,
GraphDriverOptions: config.StorageOptions,
})
if err != nil {
return nil, err
}
imageService, err := storage.GetImageService(store, config.DefaultTransport, config.InsecureRegistries)
if err != nil {
return nil, err
}
storageRuntimeService := storage.GetRuntimeService(imageService, config.PauseImage)
if err != nil {
return nil, err
}
if err := os.MkdirAll("/var/run/crio", 0755); err != nil {
return nil, err
}
r, err := oci.New(config.Runtime, config.RuntimeUntrustedWorkload, config.DefaultWorkloadTrust, config.Conmon, config.ConmonEnv, config.CgroupManager)
if err != nil {
return nil, err
}
sandboxes := make(map[string]*sandbox)
containers := oci.NewMemoryStore()
netPlugin, err := ocicni.InitCNI(config.NetworkDir, config.PluginDir)
if err != nil {
return nil, err
}
iptInterface := utiliptables.New(utilexec.New(), utildbus.New(), utiliptables.ProtocolIpv4)
iptInterface.EnsureChain(utiliptables.TableNAT, iptablesproxy.KubeMarkMasqChain)
hostportManager := hostport.NewHostportManager()
s := &Server{
runtime: r,
store: store,
storageImageServer: imageService,
storageRuntimeServer: storageRuntimeService,
netPlugin: netPlugin,
hostportManager: hostportManager,
config: *config,
state: &serverState{
sandboxes: sandboxes,
containers: containers,
},
seccompEnabled: seccomp.IsEnabled(),
appArmorEnabled: apparmor.IsEnabled(),
appArmorProfile: config.ApparmorProfile,
}
if s.seccompEnabled {
seccompProfile, fileErr := ioutil.ReadFile(config.SeccompProfile)
if fileErr != nil {
return nil, fmt.Errorf("opening seccomp profile (%s) failed: %v", config.SeccompProfile, fileErr)
}
var seccompConfig seccomp.Seccomp
if jsonErr := json.Unmarshal(seccompProfile, &seccompConfig); jsonErr != nil {
return nil, fmt.Errorf("decoding seccomp profile failed: %v", jsonErr)
}
s.seccompProfile = seccompConfig
}
if s.appArmorEnabled && s.appArmorProfile == apparmor.DefaultApparmorProfile {
if apparmorErr := apparmor.EnsureDefaultApparmorProfile(); apparmorErr != nil {
return nil, fmt.Errorf("ensuring the default apparmor profile is installed failed: %v", apparmorErr)
}
}
s.podIDIndex = truncindex.NewTruncIndex([]string{})
s.podNameIndex = registrar.NewRegistrar()
s.ctrIDIndex = truncindex.NewTruncIndex([]string{})
s.ctrNameIndex = registrar.NewRegistrar()
s.imageContext = &types.SystemContext{
SignaturePolicyPath: config.ImageConfig.SignaturePolicyPath,
}
s.restore()
s.cleanupSandboxesOnShutdown()
bindAddress := net.ParseIP(config.StreamAddress)
if bindAddress == nil {
bindAddress, err = knet.ChooseBindAddress(net.IP{0, 0, 0, 0})
if err != nil {
return nil, err
}
}
_, err = net.LookupPort("tcp", config.StreamPort)
if err != nil {
return nil, err
}
// Prepare streaming server
streamServerConfig := streaming.DefaultConfig
streamServerConfig.Addr = net.JoinHostPort(bindAddress.String(), config.StreamPort)
s.stream.runtimeServer = s
s.stream.streamServer, err = streaming.NewServer(streamServerConfig, s.stream)
if err != nil {
return nil, fmt.Errorf("unable to create streaming server")
}
// TODO: Is it should be started somewhere else?
go func() {
s.stream.streamServer.Start(true)
}()
logrus.Debugf("sandboxes: %v", s.state.sandboxes)
logrus.Debugf("containers: %v", s.state.containers)
return s, nil
}
type serverState struct {
sandboxes map[string]*sandbox
containers oci.ContainerStorer
}
func (s *Server) addSandbox(sb *sandbox) {
s.stateLock.Lock()
s.state.sandboxes[sb.id] = sb
s.stateLock.Unlock()
}
func (s *Server) getSandbox(id string) *sandbox {
s.stateLock.Lock()
sb := s.state.sandboxes[id]
s.stateLock.Unlock()
return sb
}
func (s *Server) hasSandbox(id string) bool {
s.stateLock.Lock()
_, ok := s.state.sandboxes[id]
s.stateLock.Unlock()
return ok
}
func (s *Server) removeSandbox(id string) {
s.stateLock.Lock()
delete(s.state.sandboxes, id)
s.stateLock.Unlock()
}
func (s *Server) addContainer(c *oci.Container) {
s.stateLock.Lock()
sandbox := s.state.sandboxes[c.Sandbox()]
// TODO(runcom): handle !ok above!!! otherwise it panics!
sandbox.addContainer(c)
s.state.containers.Add(c.ID(), c)
s.stateLock.Unlock()
}
func (s *Server) getContainer(id string) *oci.Container {
s.stateLock.Lock()
c := s.state.containers.Get(id)
s.stateLock.Unlock()
return c
}
// GetSandboxContainer returns the infra container for a given sandbox
func (s *Server) GetSandboxContainer(id string) *oci.Container {
sb := s.getSandbox(id)
return sb.infraContainer
}
// GetContainer returns a container by its ID
func (s *Server) GetContainer(id string) *oci.Container {
return s.getContainer(id)
}
func (s *Server) removeContainer(c *oci.Container) {
s.stateLock.Lock()
sandbox := s.state.sandboxes[c.Sandbox()]
sandbox.removeContainer(c)
s.state.containers.Delete(c.ID())
s.stateLock.Unlock()
}