bump runc@b263a43430ac6996a4302b891688544225197294

Signed-off-by: Antonio Murdaca <runcom@redhat.com>
This commit is contained in:
Antonio Murdaca 2017-02-06 21:16:36 +01:00
parent 73a0881dbb
commit c258a2d8f0
No known key found for this signature in database
GPG key ID: B2BEAD150DE936B9
386 changed files with 9394 additions and 39467 deletions

View file

@ -1,3 +1,7 @@
# libcontainer
[![GoDoc](https://godoc.org/github.com/opencontainers/runc/libcontainer?status.svg)](https://godoc.org/github.com/opencontainers/runc/libcontainer)
Libcontainer provides a native Go implementation for creating containers
with namespaces, cgroups, capabilities, and filesystem access controls.
It allows you to manage the lifecycle of the container performing additional operations
@ -16,7 +20,14 @@ the current binary (/proc/self/exe) to be executed as the init process, and use
arg "init", we call the first step process "bootstrap", so you always need a "init"
function as the entry of "bootstrap".
In addition to the go init function the early stage bootstrap is handled by importing
[nsenter](https://github.com/opencontainers/runc/blob/master/libcontainer/nsenter/README.md).
```go
import (
_ "github.com/opencontainers/runc/libcontainer/nsenter"
)
func init() {
if len(os.Args) > 1 && os.Args[1] == "init" {
runtime.GOMAXPROCS(1)
@ -77,12 +88,13 @@ config := &configs.Config{
Parent: "system",
Resources: &configs.Resources{
MemorySwappiness: nil,
AllowAllDevices: false,
AllowAllDevices: nil,
AllowedDevices: configs.DefaultAllowedDevices,
},
},
MaskPaths: []string{
"/proc/kcore",
"/sys/firmware",
},
ReadonlyPaths: []string{
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
@ -184,10 +196,10 @@ process := &libcontainer.Process{
Stderr: os.Stderr,
}
err := container.Start(process)
err := container.Run(process)
if err != nil {
logrus.Fatal(err)
container.Destroy()
logrus.Fatal(err)
return
}
@ -219,6 +231,15 @@ container.Resume()
// send signal to container's init process.
container.Signal(signal)
// update container resource constraints.
container.Set(config)
// get current status of the container.
status, err := container.Status()
// get current container's state information.
state, err := container.State()
```

View file

@ -71,7 +71,6 @@ that are required for executing a container's process.
| /dev/tty | 0666 | rwm |
| /dev/random | 0666 | rwm |
| /dev/urandom | 0666 | rwm |
| /dev/fuse | 0666 | rwm |
**ptmx**
@ -90,7 +89,7 @@ in tmpfs.
After `/dev/null` has been setup we check for any external links between
the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing
to `/dev/null` outside the container we close and `dup2` the the `/dev/null`
to `/dev/null` outside the container we close and `dup2` the `/dev/null`
that is local to the container's rootfs.
@ -297,7 +296,7 @@ a container.
| -------------- | ------------------------------------------------------------------ |
| Get processes | Return all the pids for processes running inside a container |
| Get Stats | Return resource statistics for the container as a whole |
| Wait | Wait waits on the container's init process ( pid 1 ) |
| Wait | Waits on the container's init process ( pid 1 ) |
| Wait Process | Wait on any of the container's processes returning the exit status |
| Destroy | Kill the container's init process and remove any filesystem state |
| Signal | Send a signal to the container's init process |

View file

@ -7,6 +7,7 @@ package apparmor
// #include <stdlib.h>
import "C"
import (
"fmt"
"io/ioutil"
"os"
"unsafe"
@ -32,7 +33,7 @@ func ApplyProfile(name string) error {
cName := C.CString(name)
defer C.free(unsafe.Pointer(cName))
if _, err := C.aa_change_onexec(cName); err != nil {
return err
return fmt.Errorf("apparmor failed to apply profile: %s", err)
}
return nil
}

View file

@ -0,0 +1,7 @@
// +build linux,ambient
package libcontainer
import "github.com/syndtr/gocapability/capability"
const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS

View file

@ -10,8 +10,6 @@ import (
"github.com/syndtr/gocapability/capability"
)
const allCapabilityTypes = capability.CAPS | capability.BOUNDS
var capabilityMap map[string]capability.Cap
func init() {

View file

@ -0,0 +1,7 @@
// +build !ambient,linux
package libcontainer
import "github.com/syndtr/gocapability/capability"
const allCapabilityTypes = capability.CAPS | capability.BOUNDS

View file

@ -27,9 +27,9 @@ type Manager interface {
// Destroys the cgroup set
Destroy() error
// NewCgroupManager() and LoadCgroupManager() require following attributes:
// The option func SystemdCgroups() and Cgroupfs() require following attributes:
// Paths map[string]string
// Cgroups *cgroups.Cgroup
// Cgroups *configs.Cgroup
// Paths maps cgroup subsystem to path at which it is mounted.
// Cgroups specifies specific cgroup settings for the various subsystems
@ -37,7 +37,7 @@ type Manager interface {
// restore the object later.
GetPaths() map[string]string
// Set the cgroup as configured.
// Sets the cgroup as configured.
Set(container *configs.Config) error
}

View file

@ -9,7 +9,6 @@ import (
"io/ioutil"
"os"
"path/filepath"
"strconv"
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
@ -33,7 +32,6 @@ var (
&FreezerGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
}
CgroupProcesses = "cgroup.procs"
HugePageSizes, _ = cgroups.GetHugePageSize()
)
@ -106,6 +104,8 @@ func (m *Manager) Apply(pid int) (err error) {
if m.Cgroups == nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
var c = m.Cgroups
@ -114,8 +114,8 @@ func (m *Manager) Apply(pid int) (err error) {
return err
}
m.Paths = make(map[string]string)
if c.Paths != nil {
paths := make(map[string]string)
for name, path := range c.Paths {
_, err := d.path(name)
if err != nil {
@ -124,32 +124,30 @@ func (m *Manager) Apply(pid int) (err error) {
}
return err
}
paths[name] = path
m.Paths[name] = path
}
m.Paths = paths
return cgroups.EnterPid(m.Paths, pid)
}
m.mu.Lock()
defer m.mu.Unlock()
paths := make(map[string]string)
for _, sys := range subsystems {
if err := sys.Apply(d); err != nil {
return err
}
// TODO: Apply should, ideally, be reentrant or be broken up into a separate
// create and join phase so that the cgroup hierarchy for a container can be
// created then join consists of writing the process pids to cgroup.procs
p, err := d.path(sys.Name())
if err != nil {
if cgroups.IsNotFound(err) {
// The non-presence of the devices subsystem is
// considered fatal for security reasons.
if cgroups.IsNotFound(err) && sys.Name() != "devices" {
continue
}
return err
}
paths[sys.Name()] = p
m.Paths[sys.Name()] = p
if err := sys.Apply(d); err != nil {
return err
}
}
m.Paths = paths
return nil
}
@ -190,18 +188,15 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
}
func (m *Manager) Set(container *configs.Config) error {
for _, sys := range subsystems {
// Generate fake cgroup data.
d, err := getCgroupData(container.Cgroups, -1)
if err != nil {
return err
}
// Get the path, but don't error out if the cgroup wasn't found.
path, err := d.path(sys.Name())
if err != nil && !cgroups.IsNotFound(err) {
return err
}
// If Paths are set, then we are just joining cgroups paths
// and there is no need to set any values.
if m.Cgroups.Paths != nil {
return nil
}
paths := m.GetPaths()
for _, sys := range subsystems {
path := paths[sys.Name()]
if err := sys.Set(path, container.Cgroups); err != nil {
return err
}
@ -218,14 +213,8 @@ func (m *Manager) Set(container *configs.Config) error {
// Freeze toggles the container's freezer cgroup depending on the state
// provided
func (m *Manager) Freeze(state configs.FreezerState) error {
d, err := getCgroupData(m.Cgroups, 0)
if err != nil {
return err
}
dir, err := d.path("freezer")
if err != nil {
return err
}
paths := m.GetPaths()
dir := paths["freezer"]
prevState := m.Cgroups.Resources.Freezer
m.Cgroups.Resources.Freezer = state
freezer, err := subsystems.Get("freezer")
@ -241,28 +230,13 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
}
func (m *Manager) GetPids() ([]int, error) {
dir, err := getCgroupPath(m.Cgroups)
if err != nil {
return nil, err
}
return cgroups.GetPids(dir)
paths := m.GetPaths()
return cgroups.GetPids(paths["devices"])
}
func (m *Manager) GetAllPids() ([]int, error) {
dir, err := getCgroupPath(m.Cgroups)
if err != nil {
return nil, err
}
return cgroups.GetAllPids(dir)
}
func getCgroupPath(c *configs.Cgroup) (string, error) {
d, err := getCgroupData(c, 0)
if err != nil {
return "", err
}
return d.path("devices")
paths := m.GetPaths()
return cgroups.GetAllPids(paths["devices"])
}
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
@ -319,7 +293,7 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(raw.innerPath) {
// Sometimes subsystems can be mounted togethger as 'cpu,cpuacct'.
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
}
@ -339,7 +313,7 @@ func (raw *cgroupData) join(subsystem string) (string, error) {
if err := os.MkdirAll(path, 0755); err != nil {
return "", err
}
if err := writeFile(path, CgroupProcesses, strconv.Itoa(raw.pid)); err != nil {
if err := cgroups.WriteCgroupProc(path, raw.pid); err != nil {
return "", err
}
return path, nil
@ -349,7 +323,7 @@ func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s.", file)
return fmt.Errorf("no such directory for %s", file)
}
if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", data, file, err)

View file

@ -22,10 +22,48 @@ func (s *CpuGroup) Name() string {
func (s *CpuGroup) Apply(d *cgroupData) error {
// We always want to join the cpu group, to allow fair cpu scheduling
// on a container basis
_, err := d.join("cpu")
path, err := d.path("cpu")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return s.ApplyDir(path, d.config, d.pid)
}
func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error {
// This might happen if we have no cpu cgroup mounted.
// Just do nothing and don't fail.
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
// We should set the real-Time group scheduling settings before moving
// in the process because if the process is already in SCHED_RR mode
// and no RT bandwidth is set, adding it will fail.
if err := s.SetRtSched(path, cgroup); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems
if err := cgroups.WriteCgroupProc(path, pid); err != nil {
return err
}
return nil
}
func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpuRtPeriod != 0 {
if err := writeFile(path, "cpu.rt_period_us", strconv.FormatInt(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
return err
}
}
if cgroup.Resources.CpuRtRuntime != 0 {
if err := writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil {
return err
}
}
return nil
}
@ -45,15 +83,8 @@ func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
return err
}
}
if cgroup.Resources.CpuRtPeriod != 0 {
if err := writeFile(path, "cpu.rt_period_us", strconv.FormatInt(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
return err
}
}
if cgroup.Resources.CpuRtRuntime != 0 {
if err := writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil {
return err
}
if err := s.SetRtSched(path, cgroup); err != nil {
return err
}
return nil

View file

@ -106,13 +106,13 @@ func TestCpuStats(t *testing.T) {
defer helper.cleanup()
const (
kNrPeriods = 2000
kNrThrottled = 200
kThrottledTime = uint64(18446744073709551615)
nrPeriods = 2000
nrThrottled = 200
throttledTime = uint64(18446744073709551615)
)
cpuStatContent := fmt.Sprintf("nr_periods %d\n nr_throttled %d\n throttled_time %d\n",
kNrPeriods, kNrThrottled, kThrottledTime)
nrPeriods, nrThrottled, throttledTime)
helper.writeFileContents(map[string]string{
"cpu.stat": cpuStatContent,
})
@ -125,9 +125,9 @@ func TestCpuStats(t *testing.T) {
}
expectedStats := cgroups.ThrottlingData{
Periods: kNrPeriods,
ThrottledPeriods: kNrThrottled,
ThrottledTime: kThrottledTime}
Periods: nrPeriods,
ThrottledPeriods: nrThrottled,
ThrottledTime: throttledTime}
expectThrottlingDataEquals(t, expectedStats, actualStats.CpuStats.ThrottlingData)
}
@ -161,3 +161,49 @@ func TestInvalidCpuStat(t *testing.T) {
t.Fatal("Expected failed stat parsing.")
}
}
func TestCpuSetRtSchedAtApply(t *testing.T) {
helper := NewCgroupTestUtil("cpu", t)
defer helper.cleanup()
const (
rtRuntimeBefore = 0
rtRuntimeAfter = 5000
rtPeriodBefore = 0
rtPeriodAfter = 7000
)
helper.writeFileContents(map[string]string{
"cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore),
"cpu.rt_period_us": strconv.Itoa(rtPeriodBefore),
})
helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter
helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter
cpu := &CpuGroup{}
if err := cpu.ApplyDir(helper.CgroupPath, helper.CgroupData.config, 1234); err != nil {
t.Fatal(err)
}
rtRuntime, err := getCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us")
if err != nil {
t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err)
}
if rtRuntime != rtRuntimeAfter {
t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.")
}
rtPeriod, err := getCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us")
if err != nil {
t.Fatalf("Failed to parse cpu.rt_period_us - %s", err)
}
if rtPeriod != rtPeriodAfter {
t.Fatal("Got the wrong value, set cpu.rt_period_us failed.")
}
pid, err := getCgroupParamUint(helper.CgroupPath, "cgroup.procs")
if err != nil {
t.Fatalf("Failed to parse cgroup.procs - %s", err)
}
if pid != 1234 {
t.Fatal("Got the wrong value, set cgroup.procs failed.")
}
}

View file

@ -8,7 +8,6 @@ import (
"io/ioutil"
"os"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
@ -62,12 +61,29 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
if err != nil {
return err
}
if err := s.ensureParent(dir, root); err != nil {
// 'ensureParent' start with parent because we don't want to
// explicitly inherit from parent, it could conflict with
// 'cpuset.cpu_exclusive'.
if err := s.ensureParent(filepath.Dir(dir), root); err != nil {
return err
}
if err := os.MkdirAll(dir, 0755); err != nil {
return err
}
// We didn't inherit cpuset configs from parent, but we have
// to ensure cpuset configs are set before moving task into the
// cgroup.
// The logic is, if user specified cpuset configs, use these
// specified configs, otherwise, inherit from parent. This makes
// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
// keep backward compatbility.
if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems
if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil {
if err := cgroups.WriteCgroupProc(dir, pid); err != nil {
return err
}
@ -137,3 +153,10 @@ func (s *CpusetGroup) copyIfNeeded(current, parent string) error {
func (s *CpusetGroup) isEmpty(b []byte) bool {
return len(bytes.Trim(b, "\n")) == 0
}
func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error {
if err := s.Set(path, cgroup); err != nil {
return err
}
return s.copyIfNeeded(path, filepath.Dir(path))
}

View file

@ -43,21 +43,23 @@ func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
}
return nil
}
if !cgroup.Resources.AllowAllDevices {
if err := writeFile(path, "devices.deny", "a"); err != nil {
return err
}
for _, dev := range cgroup.Resources.AllowedDevices {
if err := writeFile(path, "devices.allow", dev.CgroupString()); err != nil {
if cgroup.Resources.AllowAllDevices != nil {
if *cgroup.Resources.AllowAllDevices == false {
if err := writeFile(path, "devices.deny", "a"); err != nil {
return err
}
}
return nil
}
if err := writeFile(path, "devices.allow", "a"); err != nil {
return err
for _, dev := range cgroup.Resources.AllowedDevices {
if err := writeFile(path, "devices.allow", dev.CgroupString()); err != nil {
return err
}
}
return nil
}
if err := writeFile(path, "devices.allow", "a"); err != nil {
return err
}
}
for _, dev := range cgroup.Resources.DeniedDevices {

View file

@ -40,8 +40,8 @@ func TestDevicesSetAllow(t *testing.T) {
helper.writeFileContents(map[string]string{
"devices.deny": "a",
})
helper.CgroupData.config.Resources.AllowAllDevices = false
allowAllDevices := false
helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices
helper.CgroupData.config.Resources.AllowedDevices = allowedDevices
devices := &DevicesGroup{}
if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
@ -56,6 +56,19 @@ func TestDevicesSetAllow(t *testing.T) {
if value != allowedList {
t.Fatal("Got the wrong value, set devices.allow failed.")
}
// When AllowAllDevices is nil, devices.allow file should not be modified.
helper.CgroupData.config.Resources.AllowAllDevices = nil
if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err = getCgroupParamString(helper.CgroupPath, "devices.allow")
if err != nil {
t.Fatalf("Failed to parse devices.allow - %s", err)
}
if value != allowedList {
t.Fatal("devices policy shouldn't have changed on AllowedAllDevices=nil.")
}
}
func TestDevicesSetDeny(t *testing.T) {
@ -66,7 +79,8 @@ func TestDevicesSetDeny(t *testing.T) {
"devices.allow": "a",
})
helper.CgroupData.config.Resources.AllowAllDevices = true
allowAllDevices := true
helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices
helper.CgroupData.config.Resources.DeniedDevices = deniedDevices
devices := &DevicesGroup{}
if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {

View file

@ -5,15 +5,19 @@ package fs
import (
"bufio"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"syscall"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
type MemoryGroup struct {
}
@ -32,13 +36,12 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
return err
}
}
// We have to set kernel memory here, as we can't change it once
// processes have been attached.
if err := s.SetKernelMemory(path, d.config); err != nil {
return err
if d.config.KernelMemory != 0 {
if err := EnableKernelMemoryAccounting(path); err != nil {
return err
}
}
}
defer func() {
if err != nil {
os.RemoveAll(path)
@ -54,17 +57,44 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
return nil
}
func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error {
// This has to be done separately because it has special constraints (it
// can't be done after there are processes attached to the cgroup).
if cgroup.Resources.KernelMemory > 0 {
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
func EnableKernelMemoryAccounting(path string) error {
// Check if kernel memory is enabled
// We have to limit the kernel memory here as it won't be accounted at all
// until a limit is set on the cgroup and limit cannot be set once the
// cgroup has children, or if there are already tasks in the cgroup.
for _, i := range []int64{1, -1} {
if err := setKernelMemory(path, i); err != nil {
return err
}
}
return nil
}
func setKernelMemory(path string, kernelMemoryLimit int64) error {
if path == "" {
return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
}
if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
// kernel memory is not enabled on the system so we should do nothing
return nil
}
if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil {
// Check if the error number returned by the syscall is "EBUSY"
// The EBUSY signal is returned on attempts to write to the
// memory.kmem.limit_in_bytes file if the cgroup has children or
// once tasks have been attached to the cgroup
if pathErr, ok := err.(*os.PathError); ok {
if errNo, ok := pathErr.Err.(syscall.Errno); ok {
if errNo == syscall.EBUSY {
return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
}
}
}
return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err)
}
return nil
}
func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
// When memory and swap memory are both set, we need to handle the cases
// for updating container.
@ -113,11 +143,18 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
return err
}
if cgroup.Resources.KernelMemory != 0 {
if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil {
return err
}
}
if cgroup.Resources.MemoryReservation != 0 {
if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
return err
}
}
if cgroup.Resources.KernelMemoryTCP != 0 {
if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
return err

View file

@ -230,7 +230,7 @@ func TestMemorySetKernelMemory(t *testing.T) {
helper.CgroupData.config.Resources.KernelMemory = kernelMemoryAfter
memory := &MemoryGroup{}
if err := memory.SetKernelMemory(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
@ -471,11 +471,11 @@ func TestMemorySetOomControl(t *testing.T) {
defer helper.cleanup()
const (
oom_kill_disable = 1 // disable oom killer, default is 0
oomKillDisable = 1 // disable oom killer, default is 0
)
helper.writeFileContents(map[string]string{
"memory.oom_control": strconv.Itoa(oom_kill_disable),
"memory.oom_control": strconv.Itoa(oomKillDisable),
})
memory := &MemoryGroup{}
@ -488,7 +488,7 @@ func TestMemorySetOomControl(t *testing.T) {
t.Fatalf("Failed to parse memory.oom_control - %s", err)
}
if value != oom_kill_disable {
if value != oomKillDisable {
t.Fatalf("Got the wrong value, set memory.oom_control failed.")
}
}

View file

@ -3,6 +3,8 @@
package fs
import (
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
@ -23,8 +25,8 @@ func (s *NetClsGroup) Apply(d *cgroupData) error {
}
func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.NetClsClassid != "" {
if err := writeFile(path, "net_cls.classid", cgroup.Resources.NetClsClassid); err != nil {
if cgroup.Resources.NetClsClassid != 0 {
if err := writeFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil {
return err
}
}

View file

@ -3,12 +3,13 @@
package fs
import (
"strconv"
"testing"
)
const (
classidBefore = "0x100002"
classidAfter = "0x100001"
classidBefore = 0x100002
classidAfter = 0x100001
)
func TestNetClsSetClassid(t *testing.T) {
@ -16,7 +17,7 @@ func TestNetClsSetClassid(t *testing.T) {
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"net_cls.classid": classidBefore,
"net_cls.classid": strconv.FormatUint(classidBefore, 10),
})
helper.CgroupData.config.Resources.NetClsClassid = classidAfter
@ -28,7 +29,7 @@ func TestNetClsSetClassid(t *testing.T) {
// As we are in mock environment, we can't get correct value of classid from
// net_cls.classid.
// So. we just judge if we successfully write classid into file
value, err := getCgroupParamString(helper.CgroupPath, "net_cls.classid")
value, err := getCgroupParamUint(helper.CgroupPath, "net_cls.classid")
if err != nil {
t.Fatalf("Failed to parse net_cls.classid - %s", err)
}

View file

@ -12,7 +12,6 @@ import (
)
var (
ErrNotSupportStat = errors.New("stats are not supported for subsystem")
ErrNotValidFormat = errors.New("line is not a valid key value format")
)

View file

@ -11,6 +11,7 @@ type ThrottlingData struct {
ThrottledTime uint64 `json:"throttled_time,omitempty"`
}
// CpuUsage denotes the usage of a CPU.
// All CPU stats are aggregate since container inception.
type CpuUsage struct {
// Total CPU time consumed.

View file

@ -8,7 +8,6 @@ import (
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
@ -67,13 +66,16 @@ var subsystems = subsystemSet{
const (
testScopeWait = 4
testSliceWait = 4
)
var (
connLock sync.Mutex
theConn *systemdDbus.Conn
hasStartTransientUnit bool
hasStartTransientSliceUnit bool
hasTransientDefaultDependencies bool
hasDelegate bool
)
func newProp(name string, units interface{}) systemdDbus.Property {
@ -146,6 +148,48 @@ func UseSystemd() bool {
// Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil)
// Assume StartTransientUnit on a scope allows Delegate
hasDelegate = true
dl := newProp("Delegate", true)
if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
hasDelegate = false
}
}
}
// Assume we have the ability to start a transient unit as a slice
// This was broken until systemd v229, but has been back-ported on RHEL environments >= 219
// For details, see: https://bugzilla.redhat.com/show_bug.cgi?id=1370299
hasStartTransientSliceUnit = true
// To ensure simple clean-up, we create a slice off the root with no hierarchy
slice := fmt.Sprintf("libcontainer_%d_systemd_test_default.slice", os.Getpid())
if _, err := theConn.StartTransientUnit(slice, "replace", nil, nil); err != nil {
if _, ok := err.(dbus.Error); ok {
hasStartTransientSliceUnit = false
}
}
for i := 0; i <= testSliceWait; i++ {
if _, err := theConn.StopUnit(slice, "replace", nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") {
hasStartTransientSliceUnit = false
break
}
}
} else {
break
}
time.Sleep(time.Millisecond)
}
// Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil)
theConn.StopUnit(slice, "replace", nil)
}
return hasStartTransientUnit
}
@ -179,13 +223,29 @@ func (m *Manager) Apply(pid int) error {
slice = c.Parent
}
properties = append(properties,
systemdDbus.PropSlice(slice),
systemdDbus.PropDescription("docker container "+c.Name),
newProp("PIDs", []uint32{uint32(pid)}),
properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
// if we create a slice, the parent is defined via a Wants=
if strings.HasSuffix(unitName, ".slice") {
// This was broken until systemd v229, but has been back-ported on RHEL environments >= 219
if !hasStartTransientSliceUnit {
return fmt.Errorf("systemd version does not support ability to start a slice as transient unit")
}
properties = append(properties, systemdDbus.PropWants(slice))
} else {
// otherwise, we use Slice=
properties = append(properties, systemdDbus.PropSlice(slice))
}
// only add pid if its valid, -1 is used w/ general slice creation.
if pid != -1 {
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
}
if hasDelegate {
// This is only supported on systemd versions 218 and above.
newProp("Delegate", true),
)
properties = append(properties, newProp("Delegate", true))
}
// Always enable accounting, this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time.
@ -214,17 +274,15 @@ func (m *Manager) Apply(pid int) error {
newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
}
// We need to set kernel memory before processes join cgroup because
// kmem.limit_in_bytes can only be set when the cgroup is empty.
// And swap memory limit needs to be set after memory limit, only
// memory limit is handled by systemd, so it's kind of ugly here.
if c.Resources.KernelMemory > 0 {
// We have to set kernel memory here, as we can't change it once
// processes have been attached to the cgroup.
if c.Resources.KernelMemory != 0 {
if err := setKernelMemory(c); err != nil {
return err
}
}
if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil {
if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil && !isUnitExists(err) {
return err
}
@ -273,7 +331,7 @@ func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s.", file)
return fmt.Errorf("no such directory for %s", file)
}
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
}
@ -286,10 +344,9 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
if err := os.MkdirAll(path, 0755); err != nil {
return "", err
}
if err := writeFile(path, "cgroup.procs", strconv.Itoa(pid)); err != nil {
if err := cgroups.WriteCgroupProc(path, pid); err != nil {
return "", err
}
return path, nil
}
@ -331,10 +388,10 @@ func joinCgroups(c *configs.Cgroup, pid int) error {
return nil
}
// systemd represents slice heirarchy using `-`, so we need to follow suit when
// systemd represents slice hierarchy using `-`, so we need to follow suit when
// generating the path of slice. Essentially, test-a-b.slice becomes
// test.slice/test-a.slice/test-a-b.slice.
func expandSlice(slice string) (string, error) {
func ExpandSlice(slice string) (string, error) {
suffix := ".slice"
// Name has to end with ".slice", but can't be just ".slice".
if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
@ -348,6 +405,10 @@ func expandSlice(slice string) (string, error) {
var path, prefix string
sliceName := strings.TrimSuffix(slice, suffix)
// if input was -.slice, we should just return root now
if sliceName == "-" {
return "/", nil
}
for _, component := range strings.Split(sliceName, "-") {
// test--a.slice isn't permitted, nor is -test.slice.
if component == "" {
@ -372,13 +433,15 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
if err != nil {
return "", err
}
// if pid 1 is systemd 226 or later, it will be in init.scope, not the root
initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope")
slice := "system.slice"
if c.Parent != "" {
slice = c.Parent
}
slice, err = expandSlice(slice)
slice, err = ExpandSlice(slice)
if err != nil {
return "", err
}
@ -439,6 +502,11 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
}
func (m *Manager) Set(container *configs.Config) error {
// If Paths are set, then we are just joining cgroups paths
// and there is no need to set any values.
if m.Cgroups.Paths != nil {
return nil
}
for _, sys := range subsystems {
// Get the subsystem path, but don't error out for not found cgroups.
path, err := getSubsystemPath(container.Cgroups, sys.Name())
@ -460,7 +528,11 @@ func (m *Manager) Set(container *configs.Config) error {
}
func getUnitName(c *configs.Cgroup) string {
return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
// by default, we create a scope unless the user explicitly asks for a slice.
if !strings.HasSuffix(c.Name, ".slice") {
return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
}
return c.Name
}
func setKernelMemory(c *configs.Cgroup) error {
@ -472,8 +544,15 @@ func setKernelMemory(c *configs.Cgroup) error {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
// This doesn't get called by manager.Set, so we need to do it here.
s := &fs.MemoryGroup{}
return s.SetKernelMemory(path, c)
return fs.EnableKernelMemoryAccounting(path)
}
// isUnitExists returns true if the error is that a systemd unit already exists.
func isUnitExists(err error) bool {
if err != nil {
if dbusError, ok := err.(dbus.Error); ok {
return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
}
}
return false
}

View file

@ -16,37 +16,24 @@ import (
"github.com/docker/go-units"
)
const cgroupNamePrefix = "name="
const (
cgroupNamePrefix = "name="
CgroupProcesses = "cgroup.procs"
)
// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(subsystem string) (string, error) {
// We are not using mount.GetMounts() because it's super-inefficient,
// parsing it directly sped up x10 times because of not using Sscanf.
// It was one of two major performance drawbacks in container start.
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", err
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
txt := scanner.Text()
fields := strings.Split(txt, " ")
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if opt == subsystem {
return fields[4], nil
}
}
}
if err := scanner.Err(); err != nil {
return "", err
}
return "", NewNotFoundError(subsystem)
mnt, _, err := FindCgroupMountpointAndRoot(subsystem)
return mnt, err
}
func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
// We are not using mount.GetMounts() because it's super-inefficient,
// parsing it directly sped up x10 times because of not using Sscanf.
// It was one of two major performance drawbacks in container start.
if !isSubsystemAvailable(subsystem) {
return "", "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", "", err
@ -70,6 +57,15 @@ func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
return "", "", NewNotFoundError(subsystem)
}
func isSubsystemAvailable(subsystem string) bool {
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return false
}
_, avail := cgroups[subsystem]
return avail
}
func FindCgroupMountpointDir() (string, error) {
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
@ -121,16 +117,17 @@ func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) {
return getControllerPath(m.Subsystems[0], cgroups)
}
func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
res := make([]Mount, 0, len(ss))
scanner := bufio.NewScanner(mi)
for scanner.Scan() {
numFound := 0
for scanner.Scan() && numFound < len(ss) {
txt := scanner.Text()
sepIdx := strings.Index(txt, " - ")
if sepIdx == -1 {
return nil, fmt.Errorf("invalid mountinfo format")
}
if txt[sepIdx+3:sepIdx+9] != "cgroup" {
if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
continue
}
fields := strings.Split(txt, " ")
@ -139,12 +136,17 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
Root: fields[3],
}
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if !ss[opt] {
continue
}
if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
}
if ss[opt] {
} else {
m.Subsystems = append(m.Subsystems, opt)
}
if !all {
numFound++
}
}
res = append(res, m)
}
@ -154,26 +156,28 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
return res, nil
}
func GetCgroupMounts() ([]Mount, error) {
// GetCgroupMounts returns the mounts for the cgroup subsystems.
// all indicates whether to return just the first instance or all the mounts.
func GetCgroupMounts(all bool) ([]Mount, error) {
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return nil, err
}
defer f.Close()
all, err := GetAllSubsystems()
allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return nil, err
}
allMap := make(map[string]bool)
for _, s := range all {
for s := range allSubsystems {
allMap[s] = true
}
return getCgroupMountsHelper(allMap, f)
return getCgroupMountsHelper(allMap, f, all)
}
// Returns all the cgroup subsystems supported by the kernel
// GetAllSubsystems returns all the cgroup subsystems supported by the kernel
func GetAllSubsystems() ([]string, error) {
f, err := os.Open("/proc/cgroups")
if err != nil {
@ -185,9 +189,6 @@ func GetAllSubsystems() ([]string, error) {
s := bufio.NewScanner(f)
for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
text := s.Text()
if text[0] != '#' {
parts := strings.Fields(text)
@ -196,10 +197,13 @@ func GetAllSubsystems() ([]string, error) {
}
}
}
if err := s.Err(); err != nil {
return nil, err
}
return subsystems, nil
}
// Returns the relative path to the cgroup docker is running in.
// GetThisCgroupDir returns the relative path to the cgroup docker is running in.
func GetThisCgroupDir(subsystem string) (string, error) {
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
@ -220,7 +224,7 @@ func GetInitCgroupDir(subsystem string) (string, error) {
}
func readProcsFile(dir string) ([]int, error) {
f, err := os.Open(filepath.Join(dir, "cgroup.procs"))
f, err := os.Open(filepath.Join(dir, CgroupProcesses))
if err != nil {
return nil, err
}
@ -243,6 +247,8 @@ func readProcsFile(dir string) ([]int, error) {
return out, nil
}
// ParseCgroupFile parses the given cgroup file, typically from
// /proc/<pid>/cgroup, into a map of subgroups to cgroup names.
func ParseCgroupFile(path string) (map[string]string, error) {
f, err := os.Open(path)
if err != nil {
@ -250,21 +256,35 @@ func ParseCgroupFile(path string) (map[string]string, error) {
}
defer f.Close()
s := bufio.NewScanner(f)
return parseCgroupFromReader(f)
}
// helper function for ParseCgroupFile to make testing easier
func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
s := bufio.NewScanner(r)
cgroups := make(map[string]string)
for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
text := s.Text()
parts := strings.Split(text, ":")
// from cgroups(7):
// /proc/[pid]/cgroup
// ...
// For each cgroup hierarchy ... there is one entry
// containing three colon-separated fields of the form:
// hierarchy-ID:subsystem-list:cgroup-path
parts := strings.SplitN(text, ":", 3)
if len(parts) < 3 {
return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text)
}
for _, subs := range strings.Split(parts[1], ",") {
cgroups[subs] = parts[2]
}
}
if err := s.Err(); err != nil {
return nil, err
}
return cgroups, nil
}
@ -291,8 +311,7 @@ func PathExists(path string) bool {
func EnterPid(cgroupPaths map[string]string, pid int) error {
for _, path := range cgroupPaths {
if PathExists(path) {
if err := ioutil.WriteFile(filepath.Join(path, "cgroup.procs"),
[]byte(strconv.Itoa(pid)), 0700); err != nil {
if err := WriteCgroupProc(path, pid); err != nil {
return err
}
}
@ -361,7 +380,7 @@ func GetAllPids(path string) ([]int, error) {
// collect pids from all sub-cgroups
err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {
dir, file := filepath.Split(p)
if file != "cgroup.procs" {
if file != CgroupProcesses {
return nil
}
if iErr != nil {
@ -376,3 +395,20 @@ func GetAllPids(path string) ([]int, error) {
})
return pids, err
}
// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
func WriteCgroupProc(dir string, pid int) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s", CgroupProcesses)
}
// Dont attach any pid to the cgroup if -1 is specified as a pid
if pid != -1 {
if err := ioutil.WriteFile(filepath.Join(dir, CgroupProcesses), []byte(strconv.Itoa(pid)), 0700); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
}
}
return nil
}

View file

@ -4,6 +4,8 @@ package cgroups
import (
"bytes"
"fmt"
"reflect"
"strings"
"testing"
)
@ -91,6 +93,34 @@ const systemdMountinfo = `115 83 0:32 / / rw,relatime - aufs none rw,si=c0bd3d3,
136 117 0:12 /1 /dev/console rw,nosuid,noexec,relatime - devpts none rw,gid=5,mode=620,ptmxmode=000
84 115 0:40 / /tmp rw,relatime - tmpfs none rw`
const cgroup2Mountinfo = `18 64 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel
19 64 0:4 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
20 64 0:6 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8171204k,nr_inodes=2042801,mode=755
21 18 0:19 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
22 20 0:20 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel
23 20 0:21 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000
24 64 0:22 / /run rw,nosuid,nodev shared:24 - tmpfs tmpfs rw,seclabel,mode=755
25 18 0:23 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,seclabel,mode=755
26 25 0:24 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup2 cgroup rw
27 18 0:25 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw,seclabel
28 18 0:26 / /sys/firmware/efi/efivars rw,nosuid,nodev,noexec,relatime shared:21 - efivarfs efivarfs rw
29 25 0:27 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,cpu,cpuacct
30 25 0:28 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,memory
31 25 0:29 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,net_cls,net_prio
32 25 0:30 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,blkio
33 25 0:31 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,perf_event
34 25 0:32 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,hugetlb
35 25 0:33 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,freezer
36 25 0:34 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset
37 25 0:35 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices
38 25 0:36 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,pids
61 18 0:37 / /sys/kernel/config rw,relatime shared:22 - configfs configfs rw
64 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/fedora_dhcp--16--129-root rw,seclabel,data=ordered
39 18 0:17 / /sys/fs/selinux rw,relatime shared:23 - selinuxfs selinuxfs rw
40 20 0:16 / /dev/mqueue rw,relatime shared:25 - mqueue mqueue rw,seclabel
41 20 0:39 / /dev/hugepages rw,relatime shared:26 - hugetlbfs hugetlbfs rw,seclabel
`
func TestGetCgroupMounts(t *testing.T) {
type testData struct {
mountInfo string
@ -132,7 +162,7 @@ func TestGetCgroupMounts(t *testing.T) {
}
for _, td := range testTable {
mi := bytes.NewBufferString(td.mountInfo)
cgMounts, err := getCgroupMountsHelper(td.subsystems, mi)
cgMounts, err := getCgroupMountsHelper(td.subsystems, mi, false)
if err != nil {
t.Fatal(err)
}
@ -185,8 +215,88 @@ func BenchmarkGetCgroupMounts(b *testing.B) {
b.StopTimer()
mi := bytes.NewBufferString(fedoraMountinfo)
b.StartTimer()
if _, err := getCgroupMountsHelper(subsystems, mi); err != nil {
if _, err := getCgroupMountsHelper(subsystems, mi, false); err != nil {
b.Fatal(err)
}
}
}
func TestParseCgroupString(t *testing.T) {
testCases := []struct {
input string
expectedError error
expectedOutput map[string]string
}{
{
// Taken from a CoreOS instance running systemd 225 with CPU/Mem
// accounting enabled in systemd
input: `9:blkio:/
8:freezer:/
7:perf_event:/
6:devices:/system.slice/system-sshd.slice
5:cpuset:/
4:cpu,cpuacct:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service
3:net_cls,net_prio:/
2:memory:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service
1:name=systemd:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service`,
expectedOutput: map[string]string{
"name=systemd": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
"blkio": "/",
"freezer": "/",
"perf_event": "/",
"devices": "/system.slice/system-sshd.slice",
"cpuset": "/",
"cpu": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
"cpuacct": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
"net_cls": "/",
"net_prio": "/",
"memory": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
},
},
{
input: `malformed input`,
expectedError: fmt.Errorf(`invalid cgroup entry: must contain at least two colons: malformed input`),
},
}
for ndx, testCase := range testCases {
out, err := parseCgroupFromReader(strings.NewReader(testCase.input))
if err != nil {
if testCase.expectedError == nil || testCase.expectedError.Error() != err.Error() {
t.Errorf("%v: expected error %v, got error %v", ndx, testCase.expectedError, err)
}
} else {
if !reflect.DeepEqual(testCase.expectedOutput, out) {
t.Errorf("%v: expected output %v, got error %v", ndx, testCase.expectedOutput, out)
}
}
}
}
func TestIgnoreCgroup2Mount(t *testing.T) {
subsystems := map[string]bool{
"cpuset": true,
"cpu": true,
"cpuacct": true,
"memory": true,
"devices": true,
"freezer": true,
"net_cls": true,
"blkio": true,
"perf_event": true,
"pids": true,
"name=systemd": true,
}
mi := bytes.NewBufferString(cgroup2Mountinfo)
cgMounts, err := getCgroupMountsHelper(subsystems, mi, false)
if err != nil {
t.Fatal(err)
}
for _, m := range cgMounts {
if m.Mountpoint == "/sys/fs/cgroup/systemd" {
t.Errorf("parsed a cgroup2 mount at /sys/fs/cgroup/systemd instead of ignoring it")
}
}
}

View file

@ -22,7 +22,7 @@ type Cgroup struct {
// The path is assumed to be relative to the host system cgroup mountpoint.
Path string `json:"path"`
// ScopePrefix decribes prefix for the scope name
// ScopePrefix describes prefix for the scope name
ScopePrefix string `json:"scope_prefix"`
// Paths represent the absolute cgroups paths to join.
@ -36,7 +36,7 @@ type Cgroup struct {
type Resources struct {
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
// Deprecated
AllowAllDevices bool `json:"allow_all_devices,omitempty"`
AllowAllDevices *bool `json:"allow_all_devices,omitempty"`
// Deprecated
AllowedDevices []*Device `json:"allowed_devices,omitempty"`
// Deprecated
@ -69,10 +69,10 @@ type Resources struct {
CpuPeriod int64 `json:"cpu_period"`
// How many time CPU will use in realtime scheduling (in usecs).
CpuRtRuntime int64 `json:"cpu_quota"`
CpuRtRuntime int64 `json:"cpu_rt_quota"`
// CPU period to be used for realtime scheduling (in usecs).
CpuRtPeriod int64 `json:"cpu_period"`
CpuRtPeriod int64 `json:"cpu_rt_period"`
// CPU to use
CpusetCpus string `json:"cpuset_cpus"`
@ -95,7 +95,7 @@ type Resources struct {
// IO read rate limit per cgroup per device, bytes per second.
BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"`
// IO write rate limit per cgroup per divice, bytes per second.
// IO write rate limit per cgroup per device, bytes per second.
BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"`
// IO read rate limit per cgroup per device, IO per second.
@ -120,5 +120,5 @@ type Resources struct {
NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`
// Set class identifier for container's network packets
NetClsClassid string `json:"net_cls_classid"`
NetClsClassid uint32 `json:"net_cls_classid_u"`
}

View file

@ -8,6 +8,7 @@ import (
"time"
"github.com/Sirupsen/logrus"
"github.com/opencontainers/runtime-spec/specs-go"
)
type Rlimit struct {
@ -33,7 +34,7 @@ type Seccomp struct {
Syscalls []*Syscall `json:"syscalls"`
}
// An action to be taken upon rule match in Seccomp
// Action is taken upon rule match in Seccomp
type Action int
const (
@ -44,7 +45,7 @@ const (
Trace
)
// A comparison operator to be used when matching syscall arguments in Seccomp
// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
type Operator int
const (
@ -57,7 +58,7 @@ const (
MaskEqualTo
)
// A rule to match a specific syscall argument in Seccomp
// Arg is a rule to match a specific syscall argument in Seccomp
type Arg struct {
Index uint `json:"index"`
Value uint64 `json:"value"`
@ -65,7 +66,7 @@ type Arg struct {
Op Operator `json:"op"`
}
// An rule to match a syscall in Seccomp
// Syscall is a rule to match a syscall in Seccomp
type Syscall struct {
Name string `json:"name"`
Action Action `json:"action"`
@ -85,11 +86,6 @@ type Config struct {
// that the parent process dies.
ParentDeathSignal int `json:"parent_death_signal"`
// PivotDir allows a custom directory inside the container's root filesystem to be used as pivot, when NoPivotRoot is not set.
// When a custom PivotDir not set, a temporary dir inside the root filesystem will be used. The pivot dir needs to be writeable.
// This is required when using read only root filesystems. In these cases, a read/writeable path can be (bind) mounted somewhere inside the root filesystem to act as pivot.
PivotDir string `json:"pivot_dir"`
// Path to a directory containing the container's root filesystem.
Rootfs string `json:"rootfs"`
@ -148,10 +144,6 @@ type Config struct {
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj int `json:"oom_score_adj"`
// AdditionalGroups specifies the gids that should be added to supplementary groups
// in addition to those that the user belongs to.
AdditionalGroups []string `json:"additional_groups"`
// UidMappings is an array of User ID mappings for User Namespaces
UidMappings []IDMap `json:"uid_mappings"`
@ -187,6 +179,10 @@ type Config struct {
// Labels are user defined metadata that is stored in the config and populated on the state
Labels []string `json:"labels"`
// NoNewKeyring will not allocated a new session keyring for the container. It will use the
// callers keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"`
}
type Hooks struct {
@ -248,20 +244,14 @@ func (hooks Hooks) MarshalJSON() ([]byte, error) {
}
// HookState is the payload provided to a hook on execution.
type HookState struct {
Version string `json:"ociVersion"`
ID string `json:"id"`
Pid int `json:"pid"`
Root string `json:"root"`
BundlePath string `json:"bundlePath"`
}
type HookState specs.State
type Hook interface {
// Run executes the hook with the provided state.
Run(HookState) error
}
// NewFunctionHooks will call the provided function when the hook is run.
// NewFunctionHook will call the provided function when the hook is run.
func NewFunctionHook(f func(HookState) error) FuncHook {
return FuncHook{
run: f,
@ -284,7 +274,7 @@ type Command struct {
Timeout *time.Duration `json:"timeout"`
}
// NewCommandHooks will execute the provided command when the hook is run.
// NewCommandHook will execute the provided command when the hook is run.
func NewCommandHook(cmd Command) CommandHook {
return CommandHook{
Command: cmd,
@ -300,29 +290,38 @@ func (c Command) Run(s HookState) error {
if err != nil {
return err
}
var stdout, stderr bytes.Buffer
cmd := exec.Cmd{
Path: c.Path,
Args: c.Args,
Env: c.Env,
Stdin: bytes.NewReader(b),
Path: c.Path,
Args: c.Args,
Env: c.Env,
Stdin: bytes.NewReader(b),
Stdout: &stdout,
Stderr: &stderr,
}
if err := cmd.Start(); err != nil {
return err
}
errC := make(chan error, 1)
go func() {
out, err := cmd.CombinedOutput()
err := cmd.Wait()
if err != nil {
err = fmt.Errorf("%s: %s", err, out)
err = fmt.Errorf("error running hook: %v, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
}
errC <- err
}()
var timerCh <-chan time.Time
if c.Timeout != nil {
select {
case err := <-errC:
return err
case <-time.After(*c.Timeout):
cmd.Process.Kill()
cmd.Wait()
return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
}
timer := time.NewTimer(*c.Timeout)
defer timer.Stop()
timerCh = timer.C
}
select {
case err := <-errC:
return err
case <-timerCh:
cmd.Process.Kill()
cmd.Wait()
return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
}
return <-errC
}

View file

@ -120,10 +120,10 @@ func TestMarshalHooksWithUnexpectedType(t *testing.T) {
func TestFuncHookRun(t *testing.T) {
state := configs.HookState{
Version: "1",
ID: "1",
Pid: 1,
Root: "root",
Version: "1",
ID: "1",
Pid: 1,
BundlePath: "/bundle",
}
fHook := configs.NewFunctionHook(func(s configs.HookState) error {
@ -138,10 +138,10 @@ func TestFuncHookRun(t *testing.T) {
func TestCommandHookRun(t *testing.T) {
state := configs.HookState{
Version: "1",
ID: "1",
Pid: 1,
Root: "root",
Version: "1",
ID: "1",
Pid: 1,
BundlePath: "/bundle",
}
timeout := time.Second
@ -161,10 +161,10 @@ func TestCommandHookRun(t *testing.T) {
func TestCommandHookRunTimeout(t *testing.T) {
state := configs.HookState{
Version: "1",
ID: "1",
Pid: 1,
Root: "root",
Version: "1",
ID: "1",
Pid: 1,
BundlePath: "/bundle",
}
timeout := (10 * time.Millisecond)

View file

@ -4,7 +4,7 @@ package configs
import "fmt"
// Gets the root uid for the process on host which could be non-zero
// HostUID gets the root uid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostUID() (int, error) {
if c.Namespaces.Contains(NEWUSER) {
@ -21,7 +21,7 @@ func (c Config) HostUID() (int, error) {
return 0, nil
}
// Gets the root gid for the process on host which could be non-zero
// HostGID gets the root gid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostGID() (int, error) {
if c.Namespaces.Contains(NEWUSER) {

View file

@ -10,30 +10,6 @@ import (
"testing"
)
// Checks whether the expected capability is specified in the capabilities.
func contains(expected string, values []string) bool {
for _, v := range values {
if v == expected {
return true
}
}
return false
}
func containsDevice(expected *Device, values []*Device) bool {
for _, d := range values {
if d.Path == expected.Path &&
d.Permissions == expected.Permissions &&
d.FileMode == expected.FileMode &&
d.Major == expected.Major &&
d.Minor == expected.Minor &&
d.Type == expected.Type {
return true
}
}
return false
}
func loadConfig(name string) (*Config, error) {
f, err := os.Open(filepath.Join("../sample_configs", name))
if err != nil {

View file

@ -3,7 +3,7 @@
package configs
var (
// These are devices that are to be both allowed and created.
// DefaultSimpleDevices are devices that are to be both allowed and created.
DefaultSimpleDevices = []*Device{
// /dev/null and zero
{
@ -107,19 +107,5 @@ var (
Permissions: "rwm",
},
}, DefaultSimpleDevices...)
DefaultAutoCreatedDevices = append([]*Device{
{
// /dev/fuse is created but not allowed.
// This is to allow java to work. Because java
// Insists on there being a /dev/fuse
// https://github.com/docker/docker/issues/514
// https://github.com/docker/docker/issues/2393
//
Path: "/dev/fuse",
Type: 'c',
Major: 10,
Minor: 229,
Permissions: "rwm",
},
}, DefaultSimpleDevices...)
DefaultAutoCreatedDevices = append([]*Device{}, DefaultSimpleDevices...)
)

View file

@ -1,5 +1,11 @@
package configs
const (
// EXT_COPYUP is a directive to copy up the contents of a directory when
// a tmpfs is mounted over it.
EXT_COPYUP = 1 << iota
)
type Mount struct {
// Source path for the mount.
Source string `json:"source"`
@ -22,6 +28,9 @@ type Mount struct {
// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
Relabel string `json:"relabel"`
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
// Optional Command to be run before Source is mounted.
PremountCmds []Command `json:"premount_cmds"`

View file

@ -4,12 +4,10 @@ package configs
func (n *Namespace) Syscall() int {
panic("No namespace syscall support")
return 0
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
panic("No namespace syscall support")
return uintptr(0)
}

View file

@ -22,8 +22,8 @@ var (
supportedNamespaces = make(map[NamespaceType]bool)
)
// nsToFile converts the namespace type to its filename
func nsToFile(ns NamespaceType) string {
// NsName converts the namespace type to its filename
func NsName(ns NamespaceType) string {
switch ns {
case NEWNET:
return "net"
@ -50,7 +50,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
if ok {
return supported
}
nsFile := nsToFile(ns)
nsFile := NsName(ns)
// if the namespace type is unknown, just return false
if nsFile == "" {
return false
@ -84,7 +84,7 @@ func (n *Namespace) GetPath(pid int) string {
if n.Path != "" {
return n.Path
}
return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type))
return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
}
func (n *Namespaces) Remove(t NamespaceType) bool {

View file

@ -7,6 +7,7 @@ import (
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/selinux"
)
type Validator interface {
@ -45,6 +46,12 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
// rootfs validates if the rootfs is an absolute path and is not a symlink
// to the container's root filesystem.
func (v *ConfigValidator) rootfs(config *configs.Config) error {
if _, err := os.Stat(config.Rootfs); err != nil {
if os.IsNotExist(err) {
return fmt.Errorf("rootfs (%s) does not exist", config.Rootfs)
}
return err
}
cleaned, err := filepath.Abs(config.Rootfs)
if err != nil {
return err
@ -80,6 +87,10 @@ func (v *ConfigValidator) security(config *configs.Config) error {
!config.Namespaces.Contains(configs.NEWNS) {
return fmt.Errorf("unable to restrict sys entries without a private MNT namespace")
}
if config.ProcessLabel != "" && !selinux.SelinuxEnabled() {
return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported")
}
return nil
}
@ -121,6 +132,11 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
}
if strings.HasPrefix(s, "net.") {
if config.Namespaces.Contains(configs.NEWNET) {
if path := config.Namespaces.PathOf(configs.NEWNET); path != "" {
if err := checkHostNs(s, path); err != nil {
return err
}
}
continue
} else {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s)
@ -131,3 +147,44 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
return nil
}
func isSymbolicLink(path string) (bool, error) {
fi, err := os.Lstat(path)
if err != nil {
return false, err
}
return fi.Mode()&os.ModeSymlink == os.ModeSymlink, nil
}
// checkHostNs checks whether network sysctl is used in host namespace.
func checkHostNs(sysctlConfig string, path string) error {
var currentProcessNetns = "/proc/self/ns/net"
// readlink on the current processes network namespace
destOfCurrentProcess, err := os.Readlink(currentProcessNetns)
if err != nil {
return fmt.Errorf("read soft link %q error", currentProcessNetns)
}
// First check if the provided path is a symbolic link
symLink, err := isSymbolicLink(path)
if err != nil {
return fmt.Errorf("could not check that %q is a symlink: %v", path, err)
}
if symLink == false {
// The provided namespace is not a symbolic link,
// it is not the host namespace.
return nil
}
// readlink on the path provided in the struct
destOfContainer, err := os.Readlink(path)
if err != nil {
return fmt.Errorf("read soft link %q error", path)
}
if destOfContainer == destOfCurrentProcess {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", sysctlConfig)
}
return nil
}

View file

@ -100,7 +100,7 @@ func TestValidateHostnameWithoutUTSNamespace(t *testing.T) {
func TestValidateSecurityWithMaskPaths(t *testing.T) {
config := &configs.Config{
Rootfs: "/var",
MaskPaths: []string{"/proc/kcores"},
MaskPaths: []string{"/proc/kcore"},
Namespaces: configs.Namespaces(
[]configs.Namespace{
{Type: configs.NEWNS},
@ -136,7 +136,7 @@ func TestValidateSecurityWithROPaths(t *testing.T) {
func TestValidateSecurityWithoutNEWNS(t *testing.T) {
config := &configs.Config{
Rootfs: "/var",
MaskPaths: []string{"/proc/kcores"},
MaskPaths: []string{"/proc/kcore"},
ReadonlyPaths: []string{"/proc/sys"},
}
@ -148,6 +148,9 @@ func TestValidateSecurityWithoutNEWNS(t *testing.T) {
}
func TestValidateUsernamespace(t *testing.T) {
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
t.Skip("userns is unsupported")
}
config := &configs.Config{
Rootfs: "/var",
Namespaces: configs.Namespaces(
@ -198,3 +201,67 @@ func TestValidateSysctl(t *testing.T) {
}
}
}
func TestValidateValidSysctl(t *testing.T) {
sysctl := map[string]string{
"fs.mqueue.ctl": "ctl",
"net.ctl": "ctl",
"kernel.msgmax": "ctl",
}
for k, v := range sysctl {
config := &configs.Config{
Rootfs: "/var",
Sysctl: map[string]string{k: v},
Namespaces: []configs.Namespace{
{
Type: configs.NEWNET,
},
{
Type: configs.NEWIPC,
},
},
}
validator := validate.New()
err := validator.Validate(config)
if err != nil {
t.Errorf("Expected error to not occur with {%s=%s} but got: %q", k, v, err)
}
}
}
func TestValidateSysctlWithSameNs(t *testing.T) {
config := &configs.Config{
Rootfs: "/var",
Sysctl: map[string]string{"net.ctl": "ctl"},
Namespaces: configs.Namespaces(
[]configs.Namespace{
{
Type: configs.NEWNET,
Path: "/proc/self/ns/net",
},
},
),
}
validator := validate.New()
err := validator.Validate(config)
if err == nil {
t.Error("Expected error to occur but it was nil")
}
}
func TestValidateSysctlWithoutNETNamespace(t *testing.T) {
config := &configs.Config{
Rootfs: "/var",
Sysctl: map[string]string{"net.ctl": "ctl"},
Namespaces: []configs.Namespace{},
}
validator := validate.New()
err := validator.Validate(config)
if err == nil {
t.Error("Expected error to occur but it was nil")
}
}

View file

@ -1,15 +1,73 @@
package libcontainer
import "io"
import (
"encoding/json"
"fmt"
"io"
"os"
)
// Console represents a pseudo TTY.
type Console interface {
io.ReadWriter
io.Closer
io.ReadWriteCloser
// Path returns the filesystem path to the slave side of the pty.
Path() string
// Fd returns the fd for the master of the pty.
Fd() uintptr
File() *os.File
}
const (
TerminalInfoVersion uint32 = 201610041
TerminalInfoType uint8 = 'T'
)
// TerminalInfo is the structure which is passed as the non-ancillary data
// in the sendmsg(2) call when runc is run with --console-socket. It
// contains some information about the container which the console master fd
// relates to (to allow for consumers to use a single unix socket to handle
// multiple containers). This structure will probably move to runtime-spec
// at some point. But for now it lies in libcontainer.
type TerminalInfo struct {
// Version of the API.
Version uint32 `json:"version"`
// Type of message (future proofing).
Type uint8 `json:"type"`
// Container contains the ID of the container.
ContainerID string `json:"container_id"`
}
func (ti *TerminalInfo) String() string {
encoded, err := json.Marshal(*ti)
if err != nil {
panic(err)
}
return string(encoded)
}
func NewTerminalInfo(containerId string) *TerminalInfo {
return &TerminalInfo{
Version: TerminalInfoVersion,
Type: TerminalInfoType,
ContainerID: containerId,
}
}
func GetTerminalInfo(encoded string) (*TerminalInfo, error) {
ti := new(TerminalInfo)
if err := json.Unmarshal([]byte(encoded), ti); err != nil {
return nil, err
}
if ti.Type != TerminalInfoType {
return nil, fmt.Errorf("terminal info: incorrect type in payload (%q): %q", TerminalInfoType, ti.Type)
}
if ti.Version != TerminalInfoVersion {
return nil, fmt.Errorf("terminal info: incorrect version in payload (%q): %q", TerminalInfoVersion, ti.Version)
}
return ti, nil
}

View file

@ -6,8 +6,8 @@ import (
"errors"
)
// NewConsole returns an initalized console that can be used within a container by copying bytes
// newConsole returns an initialized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process.
func NewConsole(uid, gid int) (Console, error) {
func newConsole() (Console, error) {
return nil, errors.New("libcontainer console is not supported on FreeBSD")
}

View file

@ -3,20 +3,20 @@ package libcontainer
import (
"fmt"
"os"
"path/filepath"
"syscall"
"unsafe"
"github.com/opencontainers/runc/libcontainer/label"
)
// NewConsole returns an initalized console that can be used within a container by copying bytes
// newConsole returns an initialized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process.
func NewConsole(uid, gid int) (Console, error) {
func newConsole() (Console, error) {
master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
if err != nil {
return nil, err
}
if err := saneTerminal(master); err != nil {
return nil, err
}
console, err := ptsname(master)
if err != nil {
return nil, err
@ -24,34 +24,20 @@ func NewConsole(uid, gid int) (Console, error) {
if err := unlockpt(master); err != nil {
return nil, err
}
if err := os.Chmod(console, 0600); err != nil {
return nil, err
}
if err := os.Chown(console, uid, gid); err != nil {
return nil, err
}
return &linuxConsole{
slavePath: console,
master: master,
}, nil
}
// newConsoleFromPath is an internal function returning an initialized console for use inside
// a container's MNT namespace.
func newConsoleFromPath(slavePath string) *linuxConsole {
return &linuxConsole{
slavePath: slavePath,
}
}
// linuxConsole is a linux psuedo TTY for use within a container.
// linuxConsole is a linux pseudo TTY for use within a container.
type linuxConsole struct {
master *os.File
slavePath string
}
func (c *linuxConsole) Fd() uintptr {
return c.master.Fd()
func (c *linuxConsole) File() *os.File {
return c.master
}
func (c *linuxConsole) Path() string {
@ -75,21 +61,17 @@ func (c *linuxConsole) Close() error {
// mount initializes the console inside the rootfs mounting with the specified mount label
// and applying the correct ownership of the console.
func (c *linuxConsole) mount(rootfs, mountLabel string) error {
func (c *linuxConsole) mount() error {
oldMask := syscall.Umask(0000)
defer syscall.Umask(oldMask)
if err := label.SetFileLabel(c.slavePath, mountLabel); err != nil {
return err
}
dest := filepath.Join(rootfs, "/dev/console")
f, err := os.Create(dest)
f, err := os.Create("/dev/console")
if err != nil && !os.IsExist(err) {
return err
}
if f != nil {
f.Close()
}
return syscall.Mount(c.slavePath, dest, "bind", syscall.MS_BIND, "")
return syscall.Mount(c.slavePath, "/dev/console", "bind", syscall.MS_BIND, "")
}
// dupStdio opens the slavePath for the console and dups the fds to the current
@ -143,3 +125,26 @@ func ptsname(f *os.File) (string, error) {
}
return fmt.Sprintf("/dev/pts/%d", n), nil
}
// saneTerminal sets the necessary tty_ioctl(4)s to ensure that a pty pair
// created by us acts normally. In particular, a not-very-well-known default of
// Linux unix98 ptys is that they have +onlcr by default. While this isn't a
// problem for terminal emulators, because we relay data from the terminal we
// also relay that funky line discipline.
func saneTerminal(terminal *os.File) error {
// Go doesn't have a wrapper for any of the termios ioctls.
var termios syscall.Termios
if err := ioctl(terminal.Fd(), syscall.TCGETS, uintptr(unsafe.Pointer(&termios))); err != nil {
return fmt.Errorf("ioctl(tty, tcgets): %s", err.Error())
}
// Set -onlcr so we don't have to deal with \r.
termios.Oflag &^= syscall.ONLCR
if err := ioctl(terminal.Fd(), syscall.TCSETS, uintptr(unsafe.Pointer(&termios))); err != nil {
return fmt.Errorf("ioctl(tty, tcsets): %s", err.Error())
}
return nil
}

View file

@ -0,0 +1,11 @@
package libcontainer
import (
"errors"
)
// newConsole returns an initialized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process.
func newConsole() (Console, error) {
return nil, errors.New("libcontainer console is not supported on Solaris")
}

View file

@ -1,11 +1,11 @@
package libcontainer
// NewConsole returns an initalized console that can be used within a container
func NewConsole(uid, gid int) (Console, error) {
// newConsole returns an initialized console that can be used within a container
func newConsole() (Console, error) {
return &windowsConsole{}, nil
}
// windowsConsole is a Windows psuedo TTY for use within a container.
// windowsConsole is a Windows pseudo TTY for use within a container.
type windowsConsole struct {
}

View file

@ -1,4 +1,4 @@
// Libcontainer provides a native Go implementation for creating containers
// Package libcontainer provides a native Go implementation for creating containers
// with namespaces, cgroups, capabilities, and filesystem access controls.
// It allows you to manage the lifecycle of the container performing additional operations
// after the container is created.
@ -11,24 +11,20 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
// The status of a container.
// Status is the status of a container.
type Status int
const (
// The container exists but has not been run yet
// Created is the status that denotes the container exists but has not been run yet.
Created Status = iota
// The container exists and is running.
// Running is the status that denotes the container exists and is running.
Running
// The container exists, it is in the process of being paused.
// Pausing is the status that denotes the container exists, it is in the process of being paused.
Pausing
// The container exists, but all its processes are paused.
// Paused is the status that denotes the container exists, but all its processes are paused.
Paused
// The container does not exist.
Destroyed
// Stopped is the status that denotes the container does not have a created or running process.
Stopped
)
func (s Status) String() string {
@ -41,8 +37,8 @@ func (s Status) String() string {
return "pausing"
case Paused:
return "paused"
case Destroyed:
return "destroyed"
case Stopped:
return "stopped"
default:
return "unknown"
}
@ -67,7 +63,7 @@ type BaseState struct {
Config configs.Config `json:"config"`
}
// A libcontainer container object.
// BaseContainer is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
@ -79,14 +75,14 @@ type BaseContainer interface {
// Returns the current status of the container.
//
// errors:
// ContainerDestroyed - Container no longer exists,
// ContainerNotExists - Container no longer exists,
// Systemerror - System error.
Status() (Status, error)
// State returns the current container's state information.
//
// errors:
// Systemerror - System error.
// SystemError - System error.
State() (*State, error)
// Returns the current config of the container.
@ -95,7 +91,7 @@ type BaseContainer interface {
// Returns the PIDs inside this container. The PIDs are in the namespace of the calling process.
//
// errors:
// ContainerDestroyed - Container no longer exists,
// ContainerNotExists - Container no longer exists,
// Systemerror - System error.
//
// Some of the returned PIDs may no longer refer to processes in the Container, unless
@ -105,7 +101,7 @@ type BaseContainer interface {
// Returns statistics for the container.
//
// errors:
// ContainerDestroyed - Container no longer exists,
// ContainerNotExists - Container no longer exists,
// Systemerror - System error.
Stats() (*Stats, error)
@ -114,31 +110,57 @@ type BaseContainer interface {
// We can use this to change resources when containers are running.
//
// errors:
// Systemerror - System error.
// SystemError - System error.
Set(config configs.Config) error
// Start a process inside the container. Returns error if process fails to
// start. You can track process lifecycle with passed Process structure.
//
// errors:
// ContainerDestroyed - Container no longer exists,
// ContainerNotExists - Container no longer exists,
// ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused,
// Systemerror - System error.
// SystemError - System error.
Start(process *Process) (err error)
// Destroys the container after killing all running processes.
// Run immediately starts the process inside the container. Returns error if process
// fails to start. It does not block waiting for the exec fifo after start returns but
// opens the fifo after start returns.
//
// errors:
// ContainerNotExists - Container no longer exists,
// ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused,
// SystemError - System error.
Run(process *Process) (err error)
// Destroys the container, if its in a valid state, after killing any
// remaining running processes.
//
// Any event registrations are removed before the container is destroyed.
// No error is returned if the container is already destroyed.
//
// Running containers must first be stopped using Signal(..).
// Paused containers must first be resumed using Resume(..).
//
// errors:
// Systemerror - System error.
// ContainerNotStopped - Container is still running,
// ContainerPaused - Container is paused,
// SystemError - System error.
Destroy() error
// Signal sends the provided signal code to the container's initial process.
//
// If all is specified the signal is sent to all processes in the container
// including the initial process.
//
// errors:
// Systemerror - System error.
Signal(s os.Signal) error
// SystemError - System error.
Signal(s os.Signal, all bool) error
// Exec signals the container to exec the users process at the end of the init.
//
// errors:
// SystemError - System error.
Exec() error
}

View file

@ -22,6 +22,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/criurpc"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/syndtr/gocapability/capability"
"github.com/vishvananda/netlink/nl"
@ -30,18 +31,18 @@ import (
const stdioFdCount = 3
type linuxContainer struct {
id string
root string
config *configs.Config
cgroupManager cgroups.Manager
initPath string
initArgs []string
initProcess parentProcess
criuPath string
m sync.Mutex
criuVersion int
state containerState
created time.Time
id string
root string
config *configs.Config
cgroupManager cgroups.Manager
initArgs []string
initProcess parentProcess
initProcessStartTime string
criuPath string
m sync.Mutex
criuVersion int
state containerState
created time.Time
}
// State represents a running container's state
@ -62,7 +63,7 @@ type State struct {
ExternalDescriptors []string `json:"external_descriptors,omitempty"`
}
// A libcontainer container object.
// Container is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
@ -84,13 +85,14 @@ type Container interface {
// Systemerror - System error.
Restore(process *Process, criuOpts *CriuOpts) error
// If the Container state is RUNNING or PAUSING, sets the Container state to PAUSING and pauses
// If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
// the execution of any user processes. Asynchronously, when the container finished being paused the
// state is changed to PAUSED.
// If the Container state is PAUSED, do nothing.
//
// errors:
// ContainerDestroyed - Container no longer exists,
// ContainerNotExists - Container no longer exists,
// ContainerNotRunning - Container not running or created,
// Systemerror - System error.
Pause() error
@ -99,7 +101,8 @@ type Container interface {
// If the Container state is RUNNING, do nothing.
//
// errors:
// ContainerDestroyed - Container no longer exists,
// ContainerNotExists - Container no longer exists,
// ContainerNotPaused - Container is not paused,
// Systemerror - System error.
Resume() error
@ -141,7 +144,7 @@ func (c *linuxContainer) State() (*State, error) {
func (c *linuxContainer) Processes() ([]int, error) {
pids, err := c.cgroupManager.GetAllPids()
if err != nil {
return nil, newSystemError(err)
return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
}
return pids, nil
}
@ -152,14 +155,14 @@ func (c *linuxContainer) Stats() (*Stats, error) {
stats = &Stats{}
)
if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
return stats, newSystemError(err)
return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
}
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
if err != nil {
return stats, newSystemError(err)
return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
}
stats.Interfaces = append(stats.Interfaces, istats)
}
@ -170,6 +173,13 @@ func (c *linuxContainer) Stats() (*Stats, error) {
func (c *linuxContainer) Set(config configs.Config) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status == Stopped {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
}
c.config = &config
return c.cgroupManager.Set(c.config)
}
@ -181,42 +191,89 @@ func (c *linuxContainer) Start(process *Process) error {
if err != nil {
return err
}
doInit := status == Destroyed
parent, err := c.newParentProcess(process, doInit)
return c.start(process, status == Stopped)
}
func (c *linuxContainer) Run(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return newSystemError(err)
return err
}
if err := c.start(process, status == Stopped); err != nil {
return err
}
if status == Stopped {
return c.exec()
}
return nil
}
func (c *linuxContainer) Exec() error {
c.m.Lock()
defer c.m.Unlock()
return c.exec()
}
func (c *linuxContainer) exec() error {
path := filepath.Join(c.root, execFifoFilename)
f, err := os.OpenFile(path, os.O_RDONLY, 0)
if err != nil {
return newSystemErrorWithCause(err, "open exec fifo for reading")
}
defer f.Close()
data, err := ioutil.ReadAll(f)
if err != nil {
return err
}
if len(data) > 0 {
os.Remove(path)
return nil
}
return fmt.Errorf("cannot start an already running container")
}
func (c *linuxContainer) start(process *Process, isInit bool) error {
parent, err := c.newParentProcess(process, isInit)
if err != nil {
return newSystemErrorWithCause(err, "creating new parent process")
}
if err := parent.start(); err != nil {
// terminate the process to ensure that it properly is reaped.
if err := parent.terminate(); err != nil {
logrus.Warn(err)
}
return newSystemError(err)
return newSystemErrorWithCause(err, "starting container process")
}
// generate a timestamp indicating when the container was started
c.created = time.Now().UTC()
c.state = &runningState{
c: c,
}
if doInit {
if err := c.updateState(parent); err != nil {
if isInit {
c.state = &createdState{
c: c,
}
state, err := c.updateState(parent)
if err != nil {
return err
}
c.initProcessStartTime = state.InitProcessStartTime
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Pid: parent.pid(),
Root: c.config.Rootfs,
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
}
for _, hook := range c.config.Hooks.Poststart {
for i, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil {
if err := parent.terminate(); err != nil {
logrus.Warn(err)
}
return newSystemError(err)
return newSystemErrorWithCausef(err, "running poststart hook %d", i)
}
}
}
@ -224,9 +281,12 @@ func (c *linuxContainer) Start(process *Process) error {
return nil
}
func (c *linuxContainer) Signal(s os.Signal) error {
func (c *linuxContainer) Signal(s os.Signal, all bool) error {
if all {
return signalAllProcesses(c.cgroupManager, s)
}
if err := c.initProcess.signal(s); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "signaling init process")
}
return nil
}
@ -234,23 +294,32 @@ func (c *linuxContainer) Signal(s os.Signal) error {
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
parentPipe, childPipe, err := newPipe()
if err != nil {
return nil, newSystemError(err)
return nil, newSystemErrorWithCause(err, "creating new init pipe")
}
cmd, err := c.commandTemplate(p, childPipe)
if err != nil {
return nil, newSystemError(err)
return nil, newSystemErrorWithCause(err, "creating new command template")
}
if !doInit {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
}
return c.newInitProcess(p, cmd, parentPipe, childPipe)
// We only set up rootDir if we're not doing a `runc exec`. The reason for
// this is to avoid cases where a racing, unprivileged process inside the
// container can get access to the statedir file descriptor (which would
// allow for container rootfs escape).
rootDir, err := os.Open(c.root)
if err != nil {
return nil, err
}
cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
}
func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
cmd := &exec.Cmd{
Path: c.initPath,
Args: c.initArgs,
}
cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...)
cmd.Stdin = p.Stdin
cmd.Stdout = p.Stdout
cmd.Stderr = p.Stderr
@ -259,7 +328,8 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
cmd.SysProcAttr = &syscall.SysProcAttr{}
}
cmd.ExtraFiles = append(p.ExtraFiles, childPipe)
cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
// NOTE: when running a container with no PID namespace and the parent process spawning the container is
// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
// even with the parent still running.
@ -269,7 +339,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
return cmd, nil
}
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
@ -278,10 +348,11 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
}
}
_, sharePidns := nsMaps[configs.NEWPID]
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "")
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
if err != nil {
return nil, err
}
p.consoleChan = make(chan *os.File, 1)
return &initProcess{
cmd: cmd,
childPipe: childPipe,
@ -292,6 +363,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
process: p,
bootstrapData: data,
sharePidns: sharePidns,
rootDir: rootDir,
}, nil
}
@ -299,15 +371,16 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe,
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
state, err := c.currentState()
if err != nil {
return nil, newSystemError(err)
return nil, newSystemErrorWithCause(err, "getting container's current state")
}
// for setns process, we dont have to set cloneflags as the process namespaces
// for setns process, we don't have to set cloneflags as the process namespaces
// will only be set via setns syscall
data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath)
data, err := c.bootstrapData(0, state.NamespacePaths)
if err != nil {
return nil, err
}
// TODO: set on container for process management
p.consoleChan = make(chan *os.File, 1)
return &setnsProcess{
cmd: cmd,
cgroupPaths: c.cgroupManager.GetPaths(),
@ -325,8 +398,8 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
Args: process.Args,
Env: process.Env,
User: process.User,
AdditionalGroups: process.AdditionalGroups,
Cwd: process.Cwd,
Console: process.consolePath,
Capabilities: process.Capabilities,
PassedFilesCount: len(process.ExtraFiles),
ContainerId: c.ID(),
@ -334,6 +407,7 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits,
ExecFifoPath: filepath.Join(c.root, execFifoFilename),
}
if process.NoNewPrivileges != nil {
cfg.NoNewPrivileges = *process.NoNewPrivileges
@ -347,6 +421,17 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
if len(process.Rlimits) > 0 {
cfg.Rlimits = process.Rlimits
}
/*
* TODO: This should not be automatically computed. We should implement
* this as a field in libcontainer.Process, and then we only dup the
* new console over the file descriptors which were not explicitly
* set with process.Std{in,out,err}. The reason I've left this as-is
* is because the GetConsole() interface is new, there's no need to
* polish this interface right now.
*/
if process.Stdin == nil && process.Stdout == nil && process.Stderr == nil {
cfg.CreateConsole = true
}
return cfg
}
@ -371,15 +456,16 @@ func (c *linuxContainer) Pause() error {
if err != nil {
return err
}
if status != Running {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
switch status {
case Running, Created:
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err
}
return c.state.transition(&pausedState{
c: c,
})
}
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err
}
return c.state.transition(&pausedState{
c: c,
})
return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning)
}
func (c *linuxContainer) Resume() error {
@ -408,13 +494,13 @@ func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struc
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
}
// check Criu version greater than or equal to min_version
func (c *linuxContainer) checkCriuVersion(min_version string) error {
// checkCriuVersion checks Criu version greater than or equal to minVersion
func (c *linuxContainer) checkCriuVersion(minVersion string) error {
var x, y, z, versionReq int
_, err := fmt.Sscanf(min_version, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
_, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
if err != nil {
_, err = fmt.Sscanf(min_version, "Version: %d.%d\n", &x, &y) // 1.6
_, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6
}
versionReq = x*10000 + y*100 + z
@ -459,7 +545,7 @@ func (c *linuxContainer) checkCriuVersion(min_version string) error {
c.criuVersion = x*10000 + y*100 + z
if c.criuVersion < versionReq {
return fmt.Errorf("CRIU version must be %s or higher", min_version)
return fmt.Errorf("CRIU version must be %s or higher", minVersion)
}
return nil
@ -480,6 +566,29 @@ func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
for _, path := range c.config.MaskPaths {
fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
if err != nil {
if os.IsNotExist(err) {
continue
}
return err
}
if fi.IsDir() {
continue
}
extMnt := &criurpc.ExtMountMap{
Key: proto.String(path),
Val: proto.String("/dev/null"),
}
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
return nil
}
func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
@ -575,6 +684,15 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
}
}
if err := c.addMaskPaths(req); err != nil {
return err
}
for _, node := range c.config.Devices {
m := &configs.Mount{Destination: node.Path, Source: node.Path}
c.addCriuDumpMount(req, m)
}
// Write the FD info to a file in the image directory
fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
@ -607,6 +725,27 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(iface.HostInterfaceName)
veth.IfIn = proto.String(iface.Name)
req.Opts.Veths = append(req.Opts.Veths, veth)
break
case "loopback":
break
}
}
for _, i := range criuOpts.VethPairs {
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(i.HostInterfaceName)
veth.IfIn = proto.String(i.ContainerInterfaceName)
req.Opts.Veths = append(req.Opts.Veths, veth)
}
}
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
@ -690,23 +829,19 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
break
}
}
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(iface.HostInterfaceName)
veth.IfIn = proto.String(iface.Name)
req.Opts.Veths = append(req.Opts.Veths, veth)
break
case "loopback":
break
}
if len(c.config.MaskPaths) > 0 {
m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
c.addCriuRestoreMount(req, m)
}
for _, i := range criuOpts.VethPairs {
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(i.HostInterfaceName)
veth.IfIn = proto.String(i.ContainerInterfaceName)
req.Opts.Veths = append(req.Opts.Veths, veth)
for _, node := range c.config.Devices {
m := &configs.Mount{Destination: node.Path, Source: node.Path}
c.addCriuRestoreMount(req, m)
}
if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
c.restoreNetwork(req, criuOpts)
}
// append optional manage cgroups mode
@ -950,14 +1085,14 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
case notify.GetScript() == "setup-namespaces":
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Pid: int(notify.GetPid()),
Root: c.config.Rootfs,
Version: c.config.Version,
ID: c.id,
Pid: int(notify.GetPid()),
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
}
for _, hook := range c.config.Hooks.Prestart {
for i, hook := range c.config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
@ -974,7 +1109,9 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
}); err != nil {
return err
}
if err := c.updateState(r); err != nil {
// create a timestamp indicating when the restored checkpoint was started
c.created = time.Now().UTC()
if _, err := c.updateState(r); err != nil {
return err
}
if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
@ -986,13 +1123,17 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
return nil
}
func (c *linuxContainer) updateState(process parentProcess) error {
func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
c.initProcess = process
state, err := c.currentState()
if err != nil {
return err
return nil, err
}
return c.saveState(state)
err = c.saveState(state)
if err != nil {
return nil, err
}
return state, nil
}
func (c *linuxContainer) saveState(s *State) error {
@ -1027,37 +1168,75 @@ func (c *linuxContainer) refreshState() error {
if paused {
return c.state.transition(&pausedState{c: c})
}
running, err := c.isRunning()
t, err := c.runType()
if err != nil {
return err
}
if running {
switch t {
case Created:
return c.state.transition(&createdState{c: c})
case Running:
return c.state.transition(&runningState{c: c})
}
return c.state.transition(&stoppedState{c: c})
}
func (c *linuxContainer) isRunning() (bool, error) {
if c.initProcess == nil {
// doesInitProcessExist checks if the init process is still the same process
// as the initial one, it could happen that the original process has exited
// and a new process has been created with the same pid, in this case, the
// container would already be stopped.
func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) {
startTime, err := system.GetProcessStartTime(initPid)
if err != nil {
return false, newSystemErrorWithCausef(err, "getting init process %d start time", initPid)
}
if c.initProcessStartTime != startTime {
return false, nil
}
// return Running if the init process is alive
if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
if err == syscall.ESRCH {
return false, nil
}
return false, newSystemError(err)
}
return true, nil
}
func (c *linuxContainer) runType() (Status, error) {
if c.initProcess == nil {
return Stopped, nil
}
pid := c.initProcess.pid()
// return Running if the init process is alive
if err := syscall.Kill(pid, 0); err != nil {
if err == syscall.ESRCH {
// It means the process does not exist anymore, could happen when the
// process exited just when we call the function, we should not return
// error in this case.
return Stopped, nil
}
return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid)
}
// check if the process is still the original init process.
exist, err := c.doesInitProcessExist(pid)
if !exist || err != nil {
return Stopped, err
}
// check if the process that is running is the init process or the user's process.
// this is the difference between the container Running and Created.
environ, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/environ", pid))
if err != nil {
return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", pid)
}
check := []byte("_LIBCONTAINER")
if bytes.Contains(environ, check) {
return Created, nil
}
return Running, nil
}
func (c *linuxContainer) isPaused() (bool, error) {
data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
if err != nil {
// If freezer cgroup is not mounted, the container would just be not paused.
if os.IsNotExist(err) {
return false, nil
}
return false, newSystemError(err)
return false, newSystemErrorWithCause(err, "checking if container is paused")
}
return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
}
@ -1106,16 +1285,22 @@ func (c *linuxContainer) currentState() (*State, error) {
// can setns in order.
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
paths := []string{}
nsTypes := []configs.NamespaceType{
order := []configs.NamespaceType{
// The user namespace *must* be done first.
configs.NEWUSER,
configs.NEWIPC,
configs.NEWUTS,
configs.NEWNET,
configs.NEWPID,
configs.NEWNS,
}
// join userns if the init process explicitly requires NEWUSER
if c.config.Namespaces.Contains(configs.NEWUSER) {
nsTypes = append(nsTypes, configs.NEWUSER)
// Remove namespaces that we don't need to join.
var nsTypes []configs.NamespaceType
for _, ns := range order {
if c.config.Namespaces.Contains(ns) {
nsTypes = append(nsTypes, ns)
}
}
for _, nsType := range nsTypes {
if p, ok := namespaces[nsType]; ok && p != "" {
@ -1125,14 +1310,14 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp
}
// only set to join this namespace if it exists
if _, err := os.Lstat(p); err != nil {
return nil, newSystemError(err)
return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
}
// do not allow namespace path with comma as we use it to separate
// the namespace paths
if strings.ContainsRune(p, ',') {
return nil, newSystemError(fmt.Errorf("invalid path %s", p))
}
paths = append(paths, p)
paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(nsType), p))
}
}
return paths, nil
@ -1155,7 +1340,7 @@ func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) {
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
// create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0)
@ -1165,14 +1350,6 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
Value: uint32(cloneFlags),
})
// write console path
if consolePath != "" {
r.AddData(&Bytemsg{
Type: ConsolePathAttr,
Value: []byte(consolePath),
})
}
// write custom namespace paths
if len(nsMaps) > 0 {
nsPaths, err := c.orderNamespacePaths(nsMaps)

View file

@ -79,11 +79,11 @@ func (m *mockProcess) signal(_ os.Signal) error {
return nil
}
func (p *mockProcess) externalDescriptors() []string {
func (m *mockProcess) externalDescriptors() []string {
return []string{}
}
func (p *mockProcess) setExternalDescriptors(newFds []string) {
func (m *mockProcess) setExternalDescriptors(newFds []string) {
}
func TestGetContainerPids(t *testing.T) {

View file

@ -0,0 +1,20 @@
package libcontainer
// State represents a running container's state
type State struct {
BaseState
// Platform specific fields below here
}
// A libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found.
type Container interface {
BaseContainer
// Methods below here are platform specific
}

View file

@ -3,13 +3,13 @@
package libcontainer
// cgroup restoring strategy provided by criu
type cg_mode uint32
type cgMode uint32
const (
CRIU_CG_MODE_SOFT cg_mode = 3 + iota // restore cgroup properties if only dir created by criu
CRIU_CG_MODE_FULL // always restore all cgroups and their properties
CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system
CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT
CRIU_CG_MODE_SOFT cgMode = 3 + iota // restore cgroup properties if only dir created by criu
CRIU_CG_MODE_FULL // always restore all cgroups and their properties
CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system
CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT
)
type CriuPageServerInfo struct {
@ -32,6 +32,6 @@ type CriuOpts struct {
FileLocks bool // handle file locks, for safety
PageServer CriuPageServerInfo // allow to dump to criu page server
VethPairs []VethPairName // pass the veth to criu when restore
ManageCgroupsMode cg_mode // dump or restore cgroup mode
ManageCgroupsMode cgMode // dump or restore cgroup mode
EmptyNs uint32 // don't c/r properties for namespace from this mask
}

View file

@ -23,7 +23,7 @@ var (
ioutilReadDir = ioutil.ReadDir
)
// Given the path to a device and it's cgroup_permissions(which cannot be easily queried) look up the information about a linux device and return that information as a Device struct.
// Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the information about a linux device and return that information as a Device struct.
func DeviceFromPath(path, permissions string) (*configs.Device, error) {
fileInfo, err := osLstat(path)
if err != nil {
@ -94,6 +94,9 @@ func getDevices(path string) ([]*configs.Device, error) {
if err == ErrNotADevice {
continue
}
if os.IsNotExist(err) {
continue
}
return nil, err
}
out = append(out, device)

View file

@ -2,7 +2,7 @@ package libcontainer
import "io"
// API error code type.
// ErrorCode is the API error code type.
type ErrorCode int
// API error codes.
@ -56,13 +56,13 @@ func (c ErrorCode) String() string {
}
}
// API Error type.
// Error is the API error type.
type Error interface {
error
// Returns a verbose string including the error message
// and a representation of the stack trace suitable for
// printing.
// Returns an error if it failed to write the detail of the Error to w.
// The detail of the Error may include the error message and a
// representation of the stack trace.
Detail(w io.Writer) error
// Returns the error code for this error.

View file

@ -4,12 +4,17 @@ import "testing"
func TestErrorCode(t *testing.T) {
codes := map[ErrorCode]string{
IdInUse: "Id already in use",
InvalidIdFormat: "Invalid format",
ContainerPaused: "Container paused",
ConfigInvalid: "Invalid configuration",
SystemError: "System error",
ContainerNotExists: "Container does not exist",
IdInUse: "Id already in use",
InvalidIdFormat: "Invalid format",
ContainerPaused: "Container paused",
ConfigInvalid: "Invalid configuration",
SystemError: "System error",
ContainerNotExists: "Container does not exist",
ContainerNotStopped: "Container is not stopped",
ContainerNotRunning: "Container is not running",
ConsoleExists: "Console exists for process",
ContainerNotPaused: "Container is not paused",
NoProcessOps: "No process operations",
}
for code, expected := range codes {

View file

@ -6,7 +6,6 @@ import (
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime/debug"
@ -23,41 +22,27 @@ import (
)
const (
stateFilename = "state.json"
stateFilename = "state.json"
execFifoFilename = "exec.fifo"
)
var (
idRegex = regexp.MustCompile(`^[\w-\.]+$`)
idRegex = regexp.MustCompile(`^[\w+-\.]+$`)
maxIdLen = 1024
)
// InitArgs returns an options func to configure a LinuxFactory with the
// provided init arguments.
// provided init binary path and arguments.
func InitArgs(args ...string) func(*LinuxFactory) error {
return func(l *LinuxFactory) error {
name := args[0]
if filepath.Base(name) == name {
if lp, err := exec.LookPath(name); err == nil {
name = lp
return func(l *LinuxFactory) (err error) {
if len(args) > 0 {
// Resolve relative paths to ensure that its available
// after directory changes.
if args[0], err = filepath.Abs(args[0]); err != nil {
return newGenericError(err, ConfigInvalid)
}
} else {
abs, err := filepath.Abs(name)
if err != nil {
return err
}
name = abs
}
l.InitPath = "/proc/self/exe"
l.InitArgs = append([]string{name}, args[1:]...)
return nil
}
}
// InitPath returns an options func to configure a LinuxFactory with the
// provided absolute path to the init binary and arguements.
func InitPath(path string, args ...string) func(*LinuxFactory) error {
return func(l *LinuxFactory) error {
l.InitPath = path
l.InitArgs = args
return nil
}
@ -102,6 +87,15 @@ func TmpfsRoot(l *LinuxFactory) error {
return nil
}
// CriuPath returns an option func to configure a LinuxFactory with the
// provided criupath
func CriuPath(criupath string) func(*LinuxFactory) error {
return func(l *LinuxFactory) error {
l.CriuPath = criupath
return nil
}
}
// New returns a linux based container factory based in the root directory and
// configures the factory with the provided option funcs.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
@ -112,10 +106,10 @@ func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
}
l := &LinuxFactory{
Root: root,
InitArgs: []string{"/proc/self/exe", "init"},
Validator: validate.New(),
CriuPath: "criu",
}
InitArgs(os.Args[0], "init")(l)
Cgroupfs(l)
for _, opt := range options {
if err := opt(l); err != nil {
@ -130,9 +124,6 @@ type LinuxFactory struct {
// Root directory for the factory to store state.
Root string
// InitPath is the absolute path to the init binary.
InitPath string
// InitArgs are arguments for calling the init responsibilities for spawning
// a container.
InitArgs []string
@ -158,20 +149,40 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid)
}
uid, err := config.HostUID()
if err != nil {
return nil, newGenericError(err, SystemError)
}
gid, err := config.HostGID()
if err != nil {
return nil, newGenericError(err, SystemError)
}
containerRoot := filepath.Join(l.Root, id)
if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
} else if !os.IsNotExist(err) {
return nil, newGenericError(err, SystemError)
}
if err := os.MkdirAll(containerRoot, 0700); err != nil {
if err := os.MkdirAll(containerRoot, 0711); err != nil {
return nil, newGenericError(err, SystemError)
}
if err := os.Chown(containerRoot, uid, gid); err != nil {
return nil, newGenericError(err, SystemError)
}
fifoName := filepath.Join(containerRoot, execFifoFilename)
oldMask := syscall.Umask(0000)
if err := syscall.Mkfifo(fifoName, 0622); err != nil {
syscall.Umask(oldMask)
return nil, newGenericError(err, SystemError)
}
syscall.Umask(oldMask)
if err := os.Chown(fifoName, uid, gid); err != nil {
return nil, newGenericError(err, SystemError)
}
c := &linuxContainer{
id: id,
root: containerRoot,
config: config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
@ -185,7 +196,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
}
containerRoot := filepath.Join(l.Root, id)
state, err := l.loadState(containerRoot)
state, err := l.loadState(containerRoot, id)
if err != nil {
return nil, err
}
@ -195,17 +206,17 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
fds: state.ExternalDescriptors,
}
c := &linuxContainer{
initProcess: r,
id: id,
config: &state.Config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
root: containerRoot,
created: state.Created,
initProcess: r,
initProcessStartTime: state.InitProcessStartTime,
id: id,
config: &state.Config,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
root: containerRoot,
created: state.Created,
}
c.state = &createdState{c: c, s: Created}
c.state = &loadedState{c: c}
if err := c.refreshState(); err != nil {
return nil, err
}
@ -219,55 +230,69 @@ func (l *LinuxFactory) Type() string {
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) {
fdStr := os.Getenv("_LIBCONTAINER_INITPIPE")
pipefd, err := strconv.Atoi(fdStr)
var (
pipefd, rootfd int
envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE")
envStateDir = os.Getenv("_LIBCONTAINER_STATEDIR")
)
// Get the INITPIPE.
pipefd, err = strconv.Atoi(envInitPipe)
if err != nil {
return fmt.Errorf("error converting env var _LIBCONTAINER_INITPIPE(%q) to an int: %s", fdStr, err)
return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
}
var (
pipe = os.NewFile(uintptr(pipefd), "pipe")
it = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
)
defer pipe.Close()
// Only init processes have STATEDIR.
rootfd = -1
if it == initStandard {
rootfd, err = strconv.Atoi(envStateDir)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err)
}
}
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
var i initer
defer func() {
// We have an error during the initialization of the container's init,
// send it back to the parent process in the form of an initError.
// If container's init successed, syscall.Exec will not return, hence
// this defer function will never be called.
if _, ok := i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init.
if err := utils.WriteJSON(pipe, syncT{procError}); err != nil {
panic(err)
}
if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
fmt.Fprintln(os.Stderr, err)
return
}
if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
panic(err)
if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
fmt.Fprintln(os.Stderr, err)
return
}
// ensure that this pipe is always closed
pipe.Close()
}()
defer func() {
if e := recover(); e != nil {
err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
}
}()
i, err = newContainerInit(it, pipe)
i, err := newContainerInit(it, pipe, rootfd)
if err != nil {
return err
}
// If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
return i.Init()
}
func (l *LinuxFactory) loadState(root string) (*State, error) {
func (l *LinuxFactory) loadState(root, id string) (*State, error) {
f, err := os.Open(filepath.Join(root, stateFilename))
if err != nil {
if os.IsNotExist(err) {
return nil, newGenericError(err, ContainerNotExists)
return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists)
}
return nil, newGenericError(err, SystemError)
}

View file

@ -1,6 +1,7 @@
package libcontainer
import (
"fmt"
"io"
"text/template"
"time"
@ -8,20 +9,6 @@ import (
"github.com/opencontainers/runc/libcontainer/stacktrace"
)
type syncType uint8
const (
procReady syncType = iota
procError
procRun
procHooks
procResume
)
type syncT struct {
Type syncType `json:"type"`
}
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
Code: {{.ECode}}
{{if .Message }}
@ -51,14 +38,27 @@ func newGenericError(err error, c ErrorCode) Error {
}
func newSystemError(err error) Error {
if le, ok := err.(Error); ok {
return le
}
return createSystemError(err, "")
}
func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error {
return createSystemError(err, fmt.Sprintf(cause, v...))
}
func newSystemErrorWithCause(err error, cause string) Error {
return createSystemError(err, cause)
}
// createSystemError creates the specified error with the correct number of
// stack frames skipped. This is only to be called by the other functions for
// formatting the error.
func createSystemError(err error, cause string) Error {
gerr := &genericError{
Timestamp: time.Now(),
Err: err,
ECode: SystemError,
Stack: stacktrace.Capture(1),
Cause: cause,
Stack: stacktrace.Capture(2),
}
if err != nil {
gerr.Message = err.Error()
@ -70,12 +70,17 @@ type genericError struct {
Timestamp time.Time
ECode ErrorCode
Err error `json:"-"`
Cause string
Message string
Stack stacktrace.Stacktrace
}
func (e *genericError) Error() string {
return e.Message
if e.Cause == "" {
return e.Message
}
frame := e.Stack.Frames[0]
return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message)
}
func (e *genericError) Code() ErrorCode {

View file

@ -12,6 +12,7 @@ import (
"strconv"
"strings"
"syscall"
"unsafe"
"github.com/Sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
@ -52,19 +53,21 @@ type initConfig struct {
AppArmorProfile string `json:"apparmor_profile"`
NoNewPrivileges bool `json:"no_new_privileges"`
User string `json:"user"`
AdditionalGroups []string `json:"additional_groups"`
Config *configs.Config `json:"config"`
Console string `json:"console"`
Networks []*network `json:"network"`
PassedFilesCount int `json:"passed_files_count"`
ContainerId string `json:"containerid"`
Rlimits []configs.Rlimit `json:"rlimits"`
ExecFifoPath string `json:"start_pipe_path"`
CreateConsole bool `json:"create_console"`
}
type initer interface {
Init() error
}
func newContainerInit(t initType, pipe *os.File) (initer, error) {
func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err
@ -75,13 +78,15 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
switch t {
case initSetns:
return &linuxSetnsInit{
pipe: pipe,
config: config,
}, nil
case initStandard:
return &linuxStandardInit{
pipe: pipe,
parentPid: syscall.Getppid(),
config: config,
pipe: pipe,
parentPid: syscall.Getppid(),
config: config,
stateDirFD: stateDirFD,
}, nil
}
return nil, fmt.Errorf("unknown init type %q", t)
@ -141,30 +146,85 @@ func finalizeNamespace(config *initConfig) error {
}
if config.Cwd != "" {
if err := syscall.Chdir(config.Cwd); err != nil {
return err
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err)
}
}
return nil
}
// setupConsole sets up the console from inside the container, and sends the
// master pty fd to the config.Pipe (using cmsg). This is done to ensure that
// consoles are scoped to a container properly (see runc#814 and the many
// issues related to that). This has to be run *after* we've pivoted to the new
// rootfs (and the users' configuration is entirely set up).
func setupConsole(pipe *os.File, config *initConfig, mount bool) error {
// At this point, /dev/ptmx points to something that we would expect. We
// used to change the owner of the slave path, but since the /dev/pts mount
// can have gid=X set (at the users' option). So touching the owner of the
// slave PTY is not necessary, as the kernel will handle that for us. Note
// however, that setupUser (specifically fixStdioPermissions) *will* change
// the UID owner of the console to be the user the process will run as (so
// they can actually control their console).
console, err := newConsole()
if err != nil {
return err
}
// After we return from here, we don't need the console anymore.
defer console.Close()
linuxConsole, ok := console.(*linuxConsole)
if !ok {
return fmt.Errorf("failed to cast console to *linuxConsole")
}
// Mount the console inside our rootfs.
if mount {
if err := linuxConsole.mount(); err != nil {
return err
}
}
if err := writeSync(pipe, procConsole); err != nil {
return err
}
// We need to have a two-way synchronisation here. Though it might seem
// pointless, it's important to make sure that the sendmsg(2) payload
// doesn't get swallowed by an out-of-place read(2) [which happens if the
// syscalls get reordered so that sendmsg(2) is before the other side's
// read(2) of procConsole].
if err := readSync(pipe, procConsoleReq); err != nil {
return err
}
// While we can access console.master, using the API is a good idea.
if err := utils.SendFd(pipe, linuxConsole.File()); err != nil {
return err
}
// Make sure the other side received the fd.
if err := readSync(pipe, procConsoleAck); err != nil {
return err
}
// Now, dup over all the things.
return linuxConsole.dupStdio()
}
// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe io.ReadWriter) error {
// Tell parent.
if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil {
if err := writeSync(pipe, procReady); err != nil {
return err
}
// Wait for parent to give the all-clear.
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type != procRun {
return fmt.Errorf("invalid synchronisation flag from parent")
}
if err := readSync(pipe, procRun); err != nil {
return err
}
return nil
}
@ -173,19 +233,15 @@ func syncParentReady(pipe io.ReadWriter) error {
// indicate that it is cleared to resume.
func syncParentHooks(pipe io.ReadWriter) error {
// Tell parent.
if err := utils.WriteJSON(pipe, syncT{procHooks}); err != nil {
if err := writeSync(pipe, procHooks); err != nil {
return err
}
// Wait for parent to give the all-clear.
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type != procResume {
return fmt.Errorf("invalid synchronisation flag from parent")
}
if err := readSync(pipe, procResume); err != nil {
return err
}
return nil
}
@ -211,8 +267,8 @@ func setupUser(config *initConfig) error {
}
var addGroups []int
if len(config.Config.AdditionalGroups) > 0 {
addGroups, err = user.GetAdditionalGroupsPath(config.Config.AdditionalGroups, groupPath)
if len(config.AdditionalGroups) > 0 {
addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
if err != nil {
return err
}
@ -259,11 +315,17 @@ func fixStdioPermissions(u *user.ExecUser) error {
if err := syscall.Fstat(int(fd), &s); err != nil {
return err
}
// skip chown of /dev/null if it was used as one of the STDIO fds.
// Skip chown of /dev/null if it was used as one of the STDIO fds.
if s.Rdev == null.Rdev {
continue
}
if err := syscall.Fchown(int(fd), u.Uid, u.Gid); err != nil {
// We only change the uid owner (as it is possible for the mount to
// prefer a different gid, and there's no reason for us to change it).
// The reason why we don't just leave the default uid=X mount setup is
// that users expect to be able to actually use their console. Without
// this code, you couldn't effectively run as a non-root user inside a
// container and also have a console set up.
if err := syscall.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil {
return err
}
}
@ -331,10 +393,51 @@ func setOomScoreAdj(oomScoreAdj int, pid int) error {
return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600)
}
// killCgroupProcesses freezes then iterates over all the processes inside the
// manager's cgroups sending a SIGKILL to each process then waiting for them to
// exit.
func killCgroupProcesses(m cgroups.Manager) error {
const _P_PID = 1
type siginfo struct {
si_signo int32
si_errno int32
si_code int32
// below here is a union; si_pid is the only field we use
si_pid int32
// Pad to 128 bytes as detailed in blockUntilWaitable
pad [96]byte
}
// isWaitable returns true if the process has exited false otherwise.
// Its based off blockUntilWaitable in src/os/wait_waitid.go
func isWaitable(pid int) (bool, error) {
si := &siginfo{}
_, _, e := syscall.Syscall6(syscall.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), syscall.WEXITED|syscall.WNOWAIT|syscall.WNOHANG, 0, 0)
if e != 0 {
return false, os.NewSyscallError("waitid", e)
}
return si.si_pid != 0, nil
}
// isNoChildren returns true if err represents a syscall.ECHILD false otherwise
func isNoChildren(err error) bool {
switch err := err.(type) {
case syscall.Errno:
if err == syscall.ECHILD {
return true
}
case *os.SyscallError:
if err.Err == syscall.ECHILD {
return true
}
}
return false
}
// signalAllProcesses freezes then iterates over all the processes inside the
// manager's cgroups sending the signal s to them.
// If s is SIGKILL then it will wait for each process to exit.
// For all other signals it will check if the process is ready to report its
// exit status and only if it is will a wait be performed.
func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
var procs []*os.Process
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Warn(err)
@ -351,16 +454,31 @@ func killCgroupProcesses(m cgroups.Manager) error {
continue
}
procs = append(procs, p)
if err := p.Kill(); err != nil {
if err := p.Signal(s); err != nil {
logrus.Warn(err)
}
}
if err := m.Freeze(configs.Thawed); err != nil {
logrus.Warn(err)
}
for _, p := range procs {
if s != syscall.SIGKILL {
if ok, err := isWaitable(p.Pid); err != nil {
if !isNoChildren(err) {
logrus.Warn("signalAllProcesses: ", p.Pid, err)
}
continue
} else if !ok {
// Not ready to report so don't wait
continue
}
}
if _, err := p.Wait(); err != nil {
logrus.Warn(err)
if !isNoChildren(err) {
logrus.Warn("wait: ", err)
}
}
}
return nil

View file

@ -89,7 +89,7 @@ func TestCheckpoint(t *testing.T) {
Stdout: &stdout,
}
err = container.Start(&pconfig)
err = container.Run(&pconfig)
stdinR.Close()
defer stdinW.Close()
if err != nil {

View file

@ -2,6 +2,7 @@ package integration
import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"os"
@ -38,12 +39,12 @@ func testExecPS(t *testing.T, userns bool) {
defer remove(rootfs)
config := newTemplateConfig(rootfs)
if userns {
config.UidMappings = []configs.IDMap{{0, 0, 1000}}
config.GidMappings = []configs.IDMap{{0, 0, 1000}}
config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
}
buffers, exitCode, err := runContainer(config, "", "ps")
buffers, exitCode, err := runContainer(config, "", "ps", "-o", "pid,user,comm")
if err != nil {
t.Fatalf("%s: %s", buffers, err)
}
@ -180,8 +181,8 @@ func testRlimit(t *testing.T, userns bool) {
config := newTemplateConfig(rootfs)
if userns {
config.UidMappings = []configs.IDMap{{0, 0, 1000}}
config.GidMappings = []configs.IDMap{{0, 0, 1000}}
config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
}
@ -199,17 +200,6 @@ func testRlimit(t *testing.T, userns bool) {
}
}
func newTestRoot() (string, error) {
dir, err := ioutil.TempDir("", "libcontainer")
if err != nil {
return "", err
}
if err := os.MkdirAll(dir, 0700); err != nil {
return "", err
}
return dir, nil
}
func TestEnter(t *testing.T) {
if testing.Short() {
return
@ -241,7 +231,7 @@ func TestEnter(t *testing.T) {
Stdin: stdinR,
Stdout: &stdout,
}
err = container.Start(&pconfig)
err = container.Run(&pconfig)
stdinR.Close()
defer stdinW.Close()
ok(t, err)
@ -259,7 +249,7 @@ func TestEnter(t *testing.T) {
pconfig2.Stdin = stdinR2
pconfig2.Stdout = &stdout2
err = container.Start(&pconfig2)
err = container.Run(&pconfig2)
stdinR2.Close()
defer stdinW2.Close()
ok(t, err)
@ -330,7 +320,7 @@ func TestProcessEnv(t *testing.T) {
Stdin: nil,
Stdout: &stdout,
}
err = container.Start(&pconfig)
err = container.Run(&pconfig)
ok(t, err)
// Wait for process
@ -378,7 +368,7 @@ func TestProcessCaps(t *testing.T) {
Stdin: nil,
Stdout: &stdout,
}
err = container.Start(&pconfig)
err = container.Run(&pconfig)
ok(t, err)
// Wait for process
@ -431,7 +421,6 @@ func TestAdditionalGroups(t *testing.T) {
defer remove(rootfs)
config := newTemplateConfig(rootfs)
config.AdditionalGroups = []string{"plugdev", "audio"}
factory, err := libcontainer.New(root, libcontainer.Cgroupfs)
ok(t, err)
@ -442,13 +431,14 @@ func TestAdditionalGroups(t *testing.T) {
var stdout bytes.Buffer
pconfig := libcontainer.Process{
Cwd: "/",
Args: []string{"sh", "-c", "id", "-Gn"},
Env: standardEnvironment,
Stdin: nil,
Stdout: &stdout,
Cwd: "/",
Args: []string{"sh", "-c", "id", "-Gn"},
Env: standardEnvironment,
Stdin: nil,
Stdout: &stdout,
AdditionalGroups: []string{"plugdev", "audio"},
}
err = container.Start(&pconfig)
err = container.Run(&pconfig)
ok(t, err)
// Wait for process
@ -508,7 +498,7 @@ func testFreeze(t *testing.T, systemd bool) {
Env: standardEnvironment,
Stdin: stdinR,
}
err = container.Start(pconfig)
err = container.Run(pconfig)
stdinR.Close()
defer stdinW.Close()
ok(t, err)
@ -613,7 +603,7 @@ func testPids(t *testing.T, systemd bool) {
}
// Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause this
// to fail reliabily.
// to fail reliability.
config.Cgroups.Resources.PidsLimit = 64
out, _, err := runContainer(config, "", "/bin/sh", "-c", `
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
@ -719,7 +709,7 @@ func TestContainerState(t *testing.T) {
Env: standardEnvironment,
Stdin: stdinR,
}
err = container.Start(p)
err = container.Run(p)
if err != nil {
t.Fatal(err)
}
@ -772,7 +762,7 @@ func TestPassExtraFiles(t *testing.T) {
Stdin: nil,
Stdout: &stdout,
}
err = container.Start(&process)
err = container.Run(&process)
if err != nil {
t.Fatal(err)
}
@ -853,7 +843,7 @@ func TestMountCmds(t *testing.T) {
Args: []string{"sh", "-c", "env"},
Env: standardEnvironment,
}
err = container.Start(&pconfig)
err = container.Run(&pconfig)
if err != nil {
t.Fatal(err)
}
@ -902,7 +892,7 @@ func TestSysctl(t *testing.T) {
Stdin: nil,
Stdout: &stdout,
}
err = container.Start(&pconfig)
err = container.Run(&pconfig)
ok(t, err)
// Wait for process
@ -1042,7 +1032,7 @@ func TestOomScoreAdj(t *testing.T) {
Stdin: nil,
Stdout: &stdout,
}
err = container.Start(&pconfig)
err = container.Run(&pconfig)
ok(t, err)
// Wait for process
@ -1059,17 +1049,32 @@ func TestHook(t *testing.T) {
if testing.Short() {
return
}
root, err := newTestRoot()
bundle, err := newTestBundle()
ok(t, err)
defer os.RemoveAll(root)
defer remove(bundle)
rootfs, err := newRootfs()
ok(t, err)
defer remove(rootfs)
config := newTemplateConfig(rootfs)
expectedBundlePath := "/path/to/bundle/path"
expectedBundlePath := bundle
config.Labels = append(config.Labels, fmt.Sprintf("bundle=%s", expectedBundlePath))
getRootfsFromBundle := func(bundle string) (string, error) {
f, err := os.Open(filepath.Join(bundle, "config.json"))
if err != nil {
return "", err
}
var config configs.Config
if err = json.NewDecoder(f).Decode(&config); err != nil {
return "", err
}
return config.Rootfs, nil
}
config.Hooks = &configs.Hooks{
Prestart: []configs.Hook{
configs.NewFunctionHook(func(s configs.HookState) error {
@ -1077,7 +1082,11 @@ func TestHook(t *testing.T) {
t.Fatalf("Expected prestart hook bundlePath '%s'; got '%s'", expectedBundlePath, s.BundlePath)
}
f, err := os.Create(filepath.Join(s.Root, "test"))
root, err := getRootfsFromBundle(s.BundlePath)
if err != nil {
return err
}
f, err := os.Create(filepath.Join(root, "test"))
if err != nil {
return err
}
@ -1090,7 +1099,11 @@ func TestHook(t *testing.T) {
t.Fatalf("Expected poststart hook bundlePath '%s'; got '%s'", expectedBundlePath, s.BundlePath)
}
return ioutil.WriteFile(filepath.Join(s.Root, "test"), []byte("hello world"), 0755)
root, err := getRootfsFromBundle(s.BundlePath)
if err != nil {
return err
}
return ioutil.WriteFile(filepath.Join(root, "test"), []byte("hello world"), 0755)
}),
},
Poststop: []configs.Hook{
@ -1099,10 +1112,20 @@ func TestHook(t *testing.T) {
t.Fatalf("Expected poststop hook bundlePath '%s'; got '%s'", expectedBundlePath, s.BundlePath)
}
return os.RemoveAll(filepath.Join(s.Root, "test"))
root, err := getRootfsFromBundle(s.BundlePath)
if err != nil {
return err
}
return os.RemoveAll(filepath.Join(root, "test"))
}),
},
}
// write config of json format into config.json under bundle
f, err := os.OpenFile(filepath.Join(bundle, "config.json"), os.O_CREATE|os.O_RDWR, 0644)
ok(t, err)
ok(t, json.NewEncoder(f).Encode(config))
container, err := factory.Create("test", config)
ok(t, err)
@ -1114,7 +1137,7 @@ func TestHook(t *testing.T) {
Stdin: nil,
Stdout: &stdout,
}
err = container.Start(&pconfig)
err = container.Run(&pconfig)
ok(t, err)
// Wait for process
@ -1139,7 +1162,7 @@ func TestHook(t *testing.T) {
}
if err := container.Destroy(); err != nil {
t.Fatalf("container destory %s", err)
t.Fatalf("container destroy %s", err)
}
fi, err := os.Stat(filepath.Join(rootfs, "test"))
if err == nil || !os.IsNotExist(err) {
@ -1231,7 +1254,7 @@ func TestRootfsPropagationSlaveMount(t *testing.T) {
Stdin: stdinR,
}
err = container.Start(pconfig)
err = container.Run(pconfig)
stdinR.Close()
defer stdinW.Close()
ok(t, err)
@ -1260,7 +1283,7 @@ func TestRootfsPropagationSlaveMount(t *testing.T) {
Stdout: &stdout2,
}
err = container.Start(pconfig2)
err = container.Run(pconfig2)
stdinR2.Close()
defer stdinW2.Close()
ok(t, err)
@ -1348,7 +1371,7 @@ func TestRootfsPropagationSharedMount(t *testing.T) {
Stdin: stdinR,
}
err = container.Start(pconfig)
err = container.Run(pconfig)
stdinR.Close()
defer stdinW.Close()
ok(t, err)
@ -1380,7 +1403,7 @@ func TestRootfsPropagationSharedMount(t *testing.T) {
Capabilities: processCaps,
}
err = container.Start(pconfig2)
err = container.Run(pconfig2)
stdinR2.Close()
defer stdinW2.Close()
ok(t, err)
@ -1452,7 +1475,7 @@ func TestInitJoinPID(t *testing.T) {
Env: standardEnvironment,
Stdin: stdinR1,
}
err = container1.Start(init1)
err = container1.Run(init1)
stdinR1.Close()
defer stdinW1.Close()
ok(t, err)
@ -1462,7 +1485,7 @@ func TestInitJoinPID(t *testing.T) {
ok(t, err)
pidns1 := state1.NamespacePaths[configs.NEWPID]
// Start a container inside the existing pidns but with different cgroups
// Run a container inside the existing pidns but with different cgroups
config2 := newTemplateConfig(rootfs)
config2.Namespaces.Add(configs.NEWPID, pidns1)
config2.Cgroups.Path = "integration/test2"
@ -1478,7 +1501,7 @@ func TestInitJoinPID(t *testing.T) {
Env: standardEnvironment,
Stdin: stdinR2,
}
err = container2.Start(init2)
err = container2.Run(init2)
stdinR2.Close()
defer stdinW2.Close()
ok(t, err)
@ -1508,7 +1531,7 @@ func TestInitJoinPID(t *testing.T) {
Env: standardEnvironment,
Stdout: buffers.Stdout,
}
err = container1.Start(ps)
err = container1.Run(ps)
ok(t, err)
waitProcess(ps, t)
@ -1542,8 +1565,8 @@ func TestInitJoinNetworkAndUser(t *testing.T) {
// Execute a long-running container
config1 := newTemplateConfig(rootfs)
config1.UidMappings = []configs.IDMap{{0, 0, 1000}}
config1.GidMappings = []configs.IDMap{{0, 0, 1000}}
config1.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
config1.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
config1.Namespaces = append(config1.Namespaces, configs.Namespace{Type: configs.NEWUSER})
container1, err := newContainer(config1)
ok(t, err)
@ -1557,7 +1580,7 @@ func TestInitJoinNetworkAndUser(t *testing.T) {
Env: standardEnvironment,
Stdin: stdinR1,
}
err = container1.Start(init1)
err = container1.Run(init1)
stdinR1.Close()
defer stdinW1.Close()
ok(t, err)
@ -1568,14 +1591,14 @@ func TestInitJoinNetworkAndUser(t *testing.T) {
netns1 := state1.NamespacePaths[configs.NEWNET]
userns1 := state1.NamespacePaths[configs.NEWUSER]
// Start a container inside the existing pidns but with different cgroups
// Run a container inside the existing pidns but with different cgroups
rootfs2, err := newRootfs()
ok(t, err)
defer remove(rootfs2)
config2 := newTemplateConfig(rootfs2)
config2.UidMappings = []configs.IDMap{{0, 0, 1000}}
config2.GidMappings = []configs.IDMap{{0, 0, 1000}}
config2.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
config2.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
config2.Namespaces.Add(configs.NEWNET, netns1)
config2.Namespaces.Add(configs.NEWUSER, userns1)
config2.Cgroups.Path = "integration/test2"
@ -1591,7 +1614,7 @@ func TestInitJoinNetworkAndUser(t *testing.T) {
Env: standardEnvironment,
Stdin: stdinR2,
}
err = container2.Start(init2)
err = container2.Run(init2)
stdinR2.Close()
defer stdinW2.Close()
ok(t, err)
@ -1622,3 +1645,52 @@ func TestInitJoinNetworkAndUser(t *testing.T) {
stdinW1.Close()
waitProcess(init1, t)
}
func TestTmpfsCopyUp(t *testing.T) {
if testing.Short() {
return
}
root, err := newTestRoot()
ok(t, err)
defer os.RemoveAll(root)
rootfs, err := newRootfs()
ok(t, err)
defer remove(rootfs)
config := newTemplateConfig(rootfs)
config.Mounts = append(config.Mounts, &configs.Mount{
Source: "tmpfs",
Destination: "/etc",
Device: "tmpfs",
Extensions: configs.EXT_COPYUP,
})
factory, err := libcontainer.New(root, libcontainer.Cgroupfs)
ok(t, err)
container, err := factory.Create("test", config)
ok(t, err)
defer container.Destroy()
var stdout bytes.Buffer
pconfig := libcontainer.Process{
Args: []string{"ls", "/etc/passwd"},
Env: standardEnvironment,
Stdin: nil,
Stdout: &stdout,
}
err = container.Run(&pconfig)
ok(t, err)
// Wait for process
waitProcess(&pconfig, t)
outputLs := string(stdout.Bytes())
// Check that the ls output has /etc/passwd
if !strings.Contains(outputLs, "/etc/passwd") {
t.Fatalf("/etc/passwd not copied up as expected: %v", outputLs)
}
}

View file

@ -36,7 +36,7 @@ func TestExecIn(t *testing.T) {
Env: standardEnvironment,
Stdin: stdinR,
}
err = container.Start(process)
err = container.Run(process)
stdinR.Close()
defer stdinW.Close()
ok(t, err)
@ -51,7 +51,7 @@ func TestExecIn(t *testing.T) {
Stderr: buffers.Stderr,
}
err = container.Start(ps)
err = container.Run(ps)
ok(t, err)
waitProcess(ps, t)
stdinW.Close()
@ -61,6 +61,9 @@ func TestExecIn(t *testing.T) {
if !strings.Contains(out, "cat") || !strings.Contains(out, "ps") {
t.Fatalf("unexpected running process, output %q", out)
}
if strings.Contains(out, "\r") {
t.Fatalf("unexpected carriage-return in output")
}
}
func TestExecInUsernsRlimit(t *testing.T) {
@ -86,8 +89,8 @@ func testExecInRlimit(t *testing.T, userns bool) {
config := newTemplateConfig(rootfs)
if userns {
config.UidMappings = []configs.IDMap{{0, 0, 1000}}
config.GidMappings = []configs.IDMap{{0, 0, 1000}}
config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
}
@ -103,7 +106,7 @@ func testExecInRlimit(t *testing.T, userns bool) {
Env: standardEnvironment,
Stdin: stdinR,
}
err = container.Start(process)
err = container.Run(process)
stdinR.Close()
defer stdinW.Close()
ok(t, err)
@ -121,7 +124,7 @@ func testExecInRlimit(t *testing.T, userns bool) {
{Type: syscall.RLIMIT_NOFILE, Hard: 1026, Soft: 1026},
},
}
err = container.Start(ps)
err = container.Run(ps)
ok(t, err)
waitProcess(ps, t)
@ -134,6 +137,64 @@ func testExecInRlimit(t *testing.T, userns bool) {
}
}
func TestExecInAdditionalGroups(t *testing.T) {
if testing.Short() {
return
}
rootfs, err := newRootfs()
ok(t, err)
defer remove(rootfs)
config := newTemplateConfig(rootfs)
container, err := newContainer(config)
ok(t, err)
defer container.Destroy()
// Execute a first process in the container
stdinR, stdinW, err := os.Pipe()
ok(t, err)
process := &libcontainer.Process{
Cwd: "/",
Args: []string{"cat"},
Env: standardEnvironment,
Stdin: stdinR,
}
err = container.Run(process)
stdinR.Close()
defer stdinW.Close()
ok(t, err)
var stdout bytes.Buffer
pconfig := libcontainer.Process{
Cwd: "/",
Args: []string{"sh", "-c", "id", "-Gn"},
Env: standardEnvironment,
Stdin: nil,
Stdout: &stdout,
AdditionalGroups: []string{"plugdev", "audio"},
}
err = container.Run(&pconfig)
ok(t, err)
// Wait for process
waitProcess(&pconfig, t)
stdinW.Close()
waitProcess(process, t)
outputGroups := string(stdout.Bytes())
// Check that the groups output has the groups that we specified
if !strings.Contains(outputGroups, "audio") {
t.Fatalf("Listed groups do not contain the audio group as expected: %v", outputGroups)
}
if !strings.Contains(outputGroups, "plugdev") {
t.Fatalf("Listed groups do not contain the plugdev group as expected: %v", outputGroups)
}
}
func TestExecInError(t *testing.T) {
if testing.Short() {
return
@ -155,7 +216,7 @@ func TestExecInError(t *testing.T) {
Env: standardEnvironment,
Stdin: stdinR,
}
err = container.Start(process)
err = container.Run(process)
stdinR.Close()
defer func() {
stdinW.Close()
@ -173,7 +234,7 @@ func TestExecInError(t *testing.T) {
Env: standardEnvironment,
Stdout: &out,
}
err = container.Start(unexistent)
err = container.Run(unexistent)
if err == nil {
t.Fatal("Should be an error")
}
@ -207,7 +268,7 @@ func TestExecInTTY(t *testing.T) {
Env: standardEnvironment,
Stdin: stdinR,
}
err = container.Start(process)
err = container.Run(process)
stdinR.Close()
defer stdinW.Close()
ok(t, err)
@ -218,15 +279,15 @@ func TestExecInTTY(t *testing.T) {
Args: []string{"ps"},
Env: standardEnvironment,
}
console, err := ps.NewConsole(0)
err = container.Run(ps)
ok(t, err)
console, err := ps.GetConsole()
copy := make(chan struct{})
go func() {
io.Copy(&stdout, console)
close(copy)
}()
ok(t, err)
err = container.Start(ps)
ok(t, err)
select {
case <-time.After(5 * time.Second):
t.Fatal("Waiting for copy timed out")
@ -238,9 +299,12 @@ func TestExecInTTY(t *testing.T) {
waitProcess(process, t)
out := stdout.String()
if !strings.Contains(out, "cat") || !strings.Contains(string(out), "ps") {
if !strings.Contains(out, "cat") || !strings.Contains(out, "ps") {
t.Fatalf("unexpected running process, output %q", out)
}
if strings.Contains(out, "\r") {
t.Fatalf("unexpected carriage-return in output")
}
}
func TestExecInEnvironment(t *testing.T) {
@ -264,7 +328,7 @@ func TestExecInEnvironment(t *testing.T) {
Env: standardEnvironment,
Stdin: stdinR,
}
err = container.Start(process)
err = container.Run(process)
stdinR.Close()
defer stdinW.Close()
ok(t, err)
@ -283,7 +347,7 @@ func TestExecInEnvironment(t *testing.T) {
Stdout: buffers.Stdout,
Stderr: buffers.Stderr,
}
err = container.Start(process2)
err = container.Run(process2)
ok(t, err)
waitProcess(process2, t)
@ -328,7 +392,7 @@ func TestExecinPassExtraFiles(t *testing.T) {
Env: standardEnvironment,
Stdin: stdinR,
}
err = container.Start(process)
err = container.Run(process)
stdinR.Close()
defer stdinW.Close()
if err != nil {
@ -346,7 +410,7 @@ func TestExecinPassExtraFiles(t *testing.T) {
Stdin: nil,
Stdout: &stdout,
}
err = container.Start(inprocess)
err = container.Run(inprocess)
if err != nil {
t.Fatal(err)
}
@ -401,7 +465,7 @@ func TestExecInOomScoreAdj(t *testing.T) {
Env: standardEnvironment,
Stdin: stdinR,
}
err = container.Start(process)
err = container.Run(process)
stdinR.Close()
defer stdinW.Close()
ok(t, err)
@ -415,7 +479,7 @@ func TestExecInOomScoreAdj(t *testing.T) {
Stdout: buffers.Stdout,
Stderr: buffers.Stderr,
}
err = container.Start(ps)
err = container.Run(ps)
ok(t, err)
waitProcess(ps, t)
@ -439,8 +503,8 @@ func TestExecInUserns(t *testing.T) {
ok(t, err)
defer remove(rootfs)
config := newTemplateConfig(rootfs)
config.UidMappings = []configs.IDMap{{0, 0, 1000}}
config.GidMappings = []configs.IDMap{{0, 0, 1000}}
config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
container, err := newContainer(config)
ok(t, err)
@ -456,7 +520,7 @@ func TestExecInUserns(t *testing.T) {
Env: standardEnvironment,
Stdin: stdinR,
}
err = container.Start(process)
err = container.Run(process)
stdinR.Close()
defer stdinW.Close()
ok(t, err)
@ -476,7 +540,7 @@ func TestExecInUserns(t *testing.T) {
Stdout: buffers.Stdout,
Stderr: os.Stderr,
}
err = container.Start(process2)
err = container.Run(process2)
ok(t, err)
waitProcess(process2, t)
stdinW.Close()

View file

@ -36,19 +36,19 @@ var (
func TestMain(m *testing.M) {
var (
err error
ret int = 0
ret int
)
logrus.SetOutput(os.Stderr)
logrus.SetLevel(logrus.InfoLevel)
factory, err = libcontainer.New(".", libcontainer.Cgroupfs)
factory, err = libcontainer.New("/run/libctTests", libcontainer.Cgroupfs)
if err != nil {
logrus.Error(err)
os.Exit(1)
}
if systemd.UseSystemd() {
systemdFactory, err = libcontainer.New(".", libcontainer.SystemdCgroups)
systemdFactory, err = libcontainer.New("/run/libctTests", libcontainer.SystemdCgroups)
if err != nil {
logrus.Error(err)
os.Exit(1)

View file

@ -50,7 +50,7 @@ func TestSeccompDenyGetcwd(t *testing.T) {
Stderr: buffers.Stderr,
}
err = container.Start(pwd)
err = container.Run(pwd)
if err != nil {
t.Fatal(err)
}
@ -101,8 +101,8 @@ func TestSeccompPermitWriteConditional(t *testing.T) {
Args: []*configs.Arg{
{
Index: 0,
Value: 1,
Op: configs.GreaterThan,
Value: 2,
Op: configs.EqualTo,
},
},
},
@ -125,7 +125,7 @@ func TestSeccompPermitWriteConditional(t *testing.T) {
Stderr: buffers.Stderr,
}
err = container.Start(dmesg)
err = container.Run(dmesg)
if err != nil {
t.Fatal(err)
}
@ -162,8 +162,8 @@ func TestSeccompDenyWriteConditional(t *testing.T) {
Args: []*configs.Arg{
{
Index: 0,
Value: 1,
Op: configs.GreaterThan,
Value: 2,
Op: configs.EqualTo,
},
},
},
@ -186,7 +186,7 @@ func TestSeccompDenyWriteConditional(t *testing.T) {
Stderr: buffers.Stderr,
}
err = container.Start(dmesg)
err = container.Run(dmesg)
if err != nil {
t.Fatal(err)
}

View file

@ -20,6 +20,7 @@ const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NOD
// it uses a network strategy of just setting a loopback interface
// and the default setup for devices
func newTemplateConfig(rootfs string) *configs.Config {
allowAllDevices := false
return &configs.Config{
Rootfs: rootfs,
Capabilities: []string{
@ -49,12 +50,13 @@ func newTemplateConfig(rootfs string) *configs.Config {
Path: "integration/test",
Resources: &configs.Resources{
MemorySwappiness: nil,
AllowAllDevices: false,
AllowAllDevices: &allowAllDevices,
AllowedDevices: configs.DefaultAllowedDevices,
},
},
MaskPaths: []string{
"/proc/kcore",
"/sys/firmware",
},
ReadonlyPaths: []string{
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
@ -89,12 +91,15 @@ func newTemplateConfig(rootfs string) *configs.Config {
Data: "mode=1777,size=65536k",
Flags: defaultMountFlags,
},
{
Source: "mqueue",
Destination: "/dev/mqueue",
Device: "mqueue",
Flags: defaultMountFlags,
},
/*
CI is broken on the debian based kernels with this
{
Source: "mqueue",
Destination: "/dev/mqueue",
Device: "mqueue",
Flags: defaultMountFlags,
},
*/
{
Source: "sysfs",
Destination: "/sys",

View file

@ -2,6 +2,8 @@ package integration
import (
"bytes"
"crypto/md5"
"encoding/hex"
"fmt"
"io/ioutil"
"os"
@ -11,6 +13,7 @@ import (
"strings"
"syscall"
"testing"
"time"
"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/configs"
@ -62,6 +65,28 @@ func waitProcess(p *libcontainer.Process, t *testing.T) {
}
}
func newTestRoot() (string, error) {
dir, err := ioutil.TempDir("", "libcontainer")
if err != nil {
return "", err
}
if err := os.MkdirAll(dir, 0700); err != nil {
return "", err
}
return dir, nil
}
func newTestBundle() (string, error) {
dir, err := ioutil.TempDir("", "bundle")
if err != nil {
return "", err
}
if err := os.MkdirAll(dir, 0700); err != nil {
return "", err
}
return dir, nil
}
// newRootfs creates a new tmp directory and copies the busybox root filesystem
func newRootfs() (string, error) {
dir, err := ioutil.TempDir("", "")
@ -92,7 +117,9 @@ func copyBusybox(dest string) error {
}
func newContainer(config *configs.Config) (libcontainer.Container, error) {
return newContainerWithName("testCT", config)
h := md5.New()
h.Write([]byte(time.Now().String()))
return newContainerWithName(hex.EncodeToString(h.Sum(nil)), config)
}
func newContainerWithName(name string, config *configs.Config) (libcontainer.Container, error) {
@ -123,7 +150,7 @@ func runContainer(config *configs.Config, console string, args ...string) (buffe
Stderr: buffers.Stderr,
}
err = container.Start(process)
err = container.Run(process)
if err != nil {
return buffers, -1, err
}

View file

@ -1,12 +1,12 @@
// +build linux
package keyctl
package keys
import (
"fmt"
"syscall"
"strings"
"strconv"
"strings"
"syscall"
"unsafe"
)
@ -17,7 +17,7 @@ const KEYCTL_DESCRIBE = 6
type KeySerial uint32
func JoinSessionKeyring(name string) (KeySerial, error) {
var _name *byte = nil
var _name *byte
var err error
if len(name) > 0 {
@ -34,7 +34,7 @@ func JoinSessionKeyring(name string) (KeySerial, error) {
return KeySerial(sessKeyId), nil
}
// modify permissions on a keyring by reading the current permissions,
// ModKeyringPerm modifies permissions on a keyring by reading the current permissions,
// anding the bits with the given mask (clearing permissions) and setting
// additional permission bits
func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
@ -64,4 +64,3 @@ func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
return nil
}

View file

@ -9,6 +9,10 @@ func InitLabels(options []string) (string, string, error) {
return "", "", nil
}
func GetROMountLabel() string {
return ""
}
func GenLabels(options string) (string, string, error) {
return "", "", nil
}

View file

@ -55,6 +55,10 @@ func InitLabels(options []string) (string, string, error) {
return processLabel, mountLabel, nil
}
func GetROMountLabel() string {
return selinux.GetROFileLabel()
}
// DEPRECATED: The GenLabels function is only to be used during the transition to the official API.
func GenLabels(options string) (string, string, error) {
return InitLabels(strings.Fields(options))
@ -107,7 +111,7 @@ func SetFileLabel(path string, fileLabel string) error {
return nil
}
// Tell the kernel the label for all files to be created
// SetFileCreateLabel tells the kernel the label for all files to be created
func SetFileCreateLabel(fileLabel string) error {
if selinux.SelinuxEnabled() {
return selinux.Setfscreatecon(fileLabel)
@ -115,7 +119,7 @@ func SetFileCreateLabel(fileLabel string) error {
return nil
}
// Change the label of path to the filelabel string.
// Relabel changes the label of path to the filelabel string.
// It changes the MCS label to s0 if shared is true.
// This will allow all containers to share the content.
func Relabel(path string, fileLabel string, shared bool) error {
@ -129,7 +133,7 @@ func Relabel(path string, fileLabel string, shared bool) error {
exclude_paths := map[string]bool{"/": true, "/usr": true, "/etc": true}
if exclude_paths[path] {
return fmt.Errorf("Relabeling of %s is not allowed", path)
return fmt.Errorf("SELinux relabeling of %s is not allowed", path)
}
if shared {
@ -137,7 +141,10 @@ func Relabel(path string, fileLabel string, shared bool) error {
c["level"] = "s0"
fileLabel = c.Get()
}
return selinux.Chcon(path, fileLabel, true)
if err := selinux.Chcon(path, fileLabel, true); err != nil {
return fmt.Errorf("SELinux relabeling of %s is not allowed: %q", path, err)
}
return nil
}
// GetPidLabel will return the label of the process running with the specified pid
@ -166,7 +173,7 @@ func UnreserveLabel(label string) error {
return nil
}
// DupSecOpt takes an process label and returns security options that
// DupSecOpt takes a process label and returns security options that
// can be used to set duplicate labels on future container processes
func DupSecOpt(src string) []string {
return selinux.DupSecOpt(src)

View file

@ -19,6 +19,10 @@ func TestInit(t *testing.T) {
t.Fatal(err)
}
testDisabled := []string{"disable"}
roMountLabel := GetROMountLabel()
if roMountLabel == "" {
t.Errorf("GetROMountLabel Failed")
}
plabel, mlabel, err = InitLabels(testDisabled)
if err != nil {
t.Log("InitLabels Disabled Failed")
@ -26,7 +30,7 @@ func TestInit(t *testing.T) {
}
if plabel != "" {
t.Log("InitLabels Disabled Failed")
t.Fatal()
t.FailNow()
}
testUser := []string{"user:user_u", "role:user_r", "type:user_t", "level:s0:c1,c15"}
plabel, mlabel, err = InitLabels(testUser)
@ -51,39 +55,40 @@ func TestDuplicateLabel(t *testing.T) {
secopt := DupSecOpt("system_u:system_r:svirt_lxc_net_t:s0:c1,c2")
t.Log(secopt)
for _, opt := range secopt {
con := strings.SplitN(opt, ":", 3)
if len(con) != 3 || con[0] != "label" {
parts := strings.SplitN(opt, "=", 2)
if len(parts) != 2 || parts[0] != "label" {
t.Errorf("Invalid DupSecOpt return value")
continue
}
if con[1] == "user" {
if con[2] != "system_u" {
con := strings.SplitN(parts[1], ":", 2)
if con[0] == "user" {
if con[1] != "system_u" {
t.Errorf("DupSecOpt Failed user incorrect")
}
continue
}
if con[1] == "role" {
if con[2] != "system_r" {
if con[0] == "role" {
if con[1] != "system_r" {
t.Errorf("DupSecOpt Failed role incorrect")
}
continue
}
if con[1] == "type" {
if con[2] != "svirt_lxc_net_t" {
if con[0] == "type" {
if con[1] != "svirt_lxc_net_t" {
t.Errorf("DupSecOpt Failed type incorrect")
}
continue
}
if con[1] == "level" {
if con[2] != "s0:c1,c2" {
if con[0] == "level" {
if con[1] != "s0:c1,c2" {
t.Errorf("DupSecOpt Failed level incorrect")
}
continue
}
t.Errorf("DupSecOpt Failed invalid field %q", con[1])
t.Errorf("DupSecOpt Failed invalid field %q", con[0])
}
secopt = DisableSecOpt()
if secopt[0] != "label:disable" {
if secopt[0] != "label=disable" {
t.Errorf("DisableSecOpt Failed level incorrect")
}
}
@ -93,24 +98,24 @@ func TestRelabel(t *testing.T) {
t.Fatal(err)
}
defer os.RemoveAll(testdir)
label := "system_u:system_r:svirt_sandbox_file_t:s0:c1,c2"
label := "system_u:object_r:svirt_sandbox_file_t:s0:c1,c2"
if err := Relabel(testdir, "", true); err != nil {
t.Fatal("Relabel with no label failed: %v", err)
t.Fatalf("Relabel with no label failed: %v", err)
}
if err := Relabel(testdir, label, true); err != nil {
t.Fatal("Relabel shared failed: %v", err)
t.Fatalf("Relabel shared failed: %v", err)
}
if err := Relabel(testdir, label, false); err != nil {
t.Fatal("Relabel unshared failed: %v", err)
t.Fatalf("Relabel unshared failed: %v", err)
}
if err := Relabel("/etc", label, false); err == nil {
t.Fatal("Relabel /etc succeeded")
t.Fatalf("Relabel /etc succeeded")
}
if err := Relabel("/", label, false); err == nil {
t.Fatal("Relabel / succeeded")
t.Fatalf("Relabel / succeeded")
}
if err := Relabel("/usr", label, false); err == nil {
t.Fatal("Relabel /usr succeeded")
t.Fatalf("Relabel /usr succeeded")
}
}
@ -131,13 +136,13 @@ func TestValidate(t *testing.T) {
func TestIsShared(t *testing.T) {
if shared := IsShared("Z"); shared {
t.Fatal("Expected label `Z` to not be shared, got %v", shared)
t.Fatalf("Expected label `Z` to not be shared, got %v", shared)
}
if shared := IsShared("z"); !shared {
t.Fatal("Expected label `z` to be shared, got %v", shared)
t.Fatalf("Expected label `z` to be shared, got %v", shared)
}
if shared := IsShared("Zz"); !shared {
t.Fatal("Expected label `Zz` to be shared, got %v", shared)
t.Fatalf("Expected label `Zz` to be shared, got %v", shared)
}
}

View file

@ -11,13 +11,12 @@ import (
// list of known message types we want to send to bootstrap program
// The number is randomly chosen to not conflict with known netlink types
const (
InitMsg uint16 = 62000
CloneFlagsAttr uint16 = 27281
ConsolePathAttr uint16 = 27282
NsPathsAttr uint16 = 27283
UidmapAttr uint16 = 27284
GidmapAttr uint16 = 27285
SetgroupAttr uint16 = 27286
InitMsg uint16 = 62000
CloneFlagsAttr uint16 = 27281
NsPathsAttr uint16 = 27282
UidmapAttr uint16 = 27283
GidmapAttr uint16 = 27284
SetgroupAttr uint16 = 27285
// When syscall.NLA_HDRLEN is in gccgo, take this out.
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
)
@ -27,7 +26,8 @@ type Int32msg struct {
Value uint32
}
// int32msg has the following representation
// Serialize serializes the message.
// Int32msg has the following representation
// | nlattr len | nlattr type |
// | uint32 value |
func (msg *Int32msg) Serialize() []byte {
@ -43,7 +43,7 @@ func (msg *Int32msg) Len() int {
return syscall_NLA_HDRLEN + 4
}
// bytemsg has the following representation
// Bytemsg has the following representation
// | nlattr len | nlattr type |
// | value | pad |
type Bytemsg struct {

View file

@ -10,16 +10,35 @@ The `nsenter` package will `import "C"` and it uses [cgo](https://golang.org/cmd
package. In cgo, if the import of "C" is immediately preceded by a comment, that comment,
called the preamble, is used as a header when compiling the C parts of the package.
So every time we import package `nsenter`, the C code function `nsexec()` would be
called. And package `nsenter` is now only imported in Docker execdriver, so every time
before we call `execdriver.Exec()`, that C code would run.
called. And package `nsenter` is now only imported in `main_unix.go`, so every time
before we call `cmd.Start` on linux, that C code would run.
Because `nsexec()` must be run before the Go runtime in order to use the
Linux kernel namespace, you must `import` this library into a package if
you plan to use `libcontainer` directly. Otherwise Go will not execute
the `nsexec()` constructor, which means that the re-exec will not cause
the namespaces to be joined. You can import it like this:
```go
import _ "github.com/opencontainers/runc/libcontainer/nsenter"
```
`nsexec()` will first get the file descriptor number for the init pipe
from the environment variable `_LIBCONTAINER_INITPIPE` (which was opened
by the parent and kept open across the fork-exec of the `nsexec()` init
process). The init pipe is used to read bootstrap data (namespace paths,
clone flags, uid and gid mappings, and the console path) from the parent
process. `nsexec()` will then call `setns(2)` to join the namespaces
provided in the bootstrap data (if available), `clone(2)` a child process
with the provided clone flags, update the user and group ID mappings, do
some further miscellaneous setup steps, and then send the PID of the
child process to the parent of the `nsexec()` "caller". Finally,
the parent `nsexec()` will exit and the child `nsexec()` process will
return to allow the Go runtime take over.
NOTE: We do both `setns(2)` and `clone(2)` even if we don't have any
CLONE_NEW* clone flags because we must fork a new process in order to
enter the PID namespace.
`nsexec()` will first check the environment variable `_LIBCONTAINER_INITPID`
which will give the process of the container that should be joined. Namespaces fd will
be found from `/proc/[pid]/ns` and set by `setns` syscall.
And then get the pipe number from `_LIBCONTAINER_INITPIPE`, error message could
be transfered through it. If tty is added, `_LIBCONTAINER_CONSOLE_PATH` will
have value and start a console for output.
Finally, `nsexec()` will clone a child process , exit the parent process and let
the Go runtime take over.

View file

@ -0,0 +1,32 @@
#ifndef NSENTER_NAMESPACE_H
#define NSENTER_NAMESPACE_H
#ifndef _GNU_SOURCE
# define _GNU_SOURCE
#endif
#include <sched.h>
/* All of these are taken from include/uapi/linux/sched.h */
#ifndef CLONE_NEWNS
# define CLONE_NEWNS 0x00020000 /* New mount namespace group */
#endif
#ifndef CLONE_NEWCGROUP
# define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#endif
#ifndef CLONE_NEWUTS
# define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
#endif
#ifndef CLONE_NEWIPC
# define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
#endif
#ifndef CLONE_NEWUSER
# define CLONE_NEWUSER 0x10000000 /* New user namespace */
#endif
#ifndef CLONE_NEWPID
# define CLONE_NEWPID 0x20000000 /* New pid namespace */
#endif
#ifndef CLONE_NEWNET
# define CLONE_NEWNET 0x40000000 /* New network namespace */
#endif
#endif /* NSENTER_NAMESPACE_H */

View file

@ -29,7 +29,7 @@ func TestNsenterValidPaths(t *testing.T) {
namespaces := []string{
// join pid ns of the current process
fmt.Sprintf("/proc/%d/ns/pid", os.Getpid()),
fmt.Sprintf("pid:/proc/%d/ns/pid", os.Getpid()),
}
cmd := &exec.Cmd{
Path: os.Args[0],
@ -87,7 +87,47 @@ func TestNsenterInvalidPaths(t *testing.T) {
namespaces := []string{
// join pid ns of the current process
fmt.Sprintf("/proc/%d/ns/pid", -1),
fmt.Sprintf("pid:/proc/%d/ns/pid", -1),
}
cmd := &exec.Cmd{
Path: os.Args[0],
Args: args,
ExtraFiles: []*os.File{child},
Env: []string{"_LIBCONTAINER_INITPIPE=3"},
}
if err := cmd.Start(); err != nil {
t.Fatal(err)
}
// write cloneFlags
r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
r.AddData(&libcontainer.Int32msg{
Type: libcontainer.CloneFlagsAttr,
Value: uint32(syscall.CLONE_NEWNET),
})
r.AddData(&libcontainer.Bytemsg{
Type: libcontainer.NsPathsAttr,
Value: []byte(strings.Join(namespaces, ",")),
})
if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
t.Fatal(err)
}
if err := cmd.Wait(); err == nil {
t.Fatalf("nsenter exits with a zero exit status")
}
}
func TestNsenterIncorrectPathType(t *testing.T) {
args := []string{"nsenter-exec"}
parent, child, err := newPipe()
if err != nil {
t.Fatalf("failed to create pipe %v", err)
}
namespaces := []string{
// join pid ns of the current process
fmt.Sprintf("net:/proc/%d/ns/pid", os.Getpid()),
}
cmd := &exec.Cmd{
Path: os.Args[0],

File diff suppressed because it is too large Load diff

View file

@ -28,6 +28,10 @@ type Process struct {
// local to the container's user and group configuration.
User string
// AdditionalGroups specifies the gids that should be added to supplementary groups
// in addition to those that the user belongs to.
AdditionalGroups []string
// Cwd will change the processes current working directory inside the container's rootfs.
Cwd string
@ -43,8 +47,9 @@ type Process struct {
// ExtraFiles specifies additional open files to be inherited by the container
ExtraFiles []*os.File
// consolePath is the path to the console allocated to the container.
consolePath string
// consoleChan provides the masterfd console.
// TODO: Make this persistent in Process.
consoleChan chan *os.File
// Capabilities specify the capabilities to keep when executing the process inside the container
// All capabilities not specified will be dropped from the processes capability mask
@ -101,21 +106,14 @@ type IO struct {
Stderr io.ReadCloser
}
// NewConsole creates new console for process and returns it
func (p *Process) NewConsole(rootuid int) (Console, error) {
console, err := NewConsole(rootuid, rootuid)
if err != nil {
return nil, err
func (p *Process) GetConsole() (Console, error) {
consoleFd, ok := <-p.consoleChan
if !ok {
return nil, fmt.Errorf("failed to get console from process")
}
p.consolePath = console.Path()
return console, nil
}
// ConsoleFromPath sets the process's console with the path provided
func (p *Process) ConsoleFromPath(path string) error {
if p.consolePath != "" {
return newGenericError(fmt.Errorf("console path already exists for process"), ConsoleExists)
}
p.consolePath = path
return nil
// TODO: Fix this so that it used the console API.
return &linuxConsole{
master: consoleFd,
}, nil
}

View file

@ -32,7 +32,7 @@ type parentProcess interface {
// wait waits on the process returning the process state.
wait() (*os.ProcessState, error)
// startTime return's the process start time.
// startTime returns the process start time.
startTime() (string, error)
signal(os.Signal) error
@ -70,47 +70,74 @@ func (p *setnsProcess) start() (err error) {
err = p.cmd.Start()
p.childPipe.Close()
if err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "starting setns process")
}
if p.bootstrapData != nil {
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
}
if err = p.execSetns(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "executing setns process")
}
if len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
}
}
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting oom score")
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting rlimits for process")
}
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "writing config to pipe")
}
ierr := parseSync(p.parentPipe, func(sync *syncT) error {
switch sync.Type {
case procConsole:
if err := writeSync(p.parentPipe, procConsoleReq); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'request fd'")
}
masterFile, err := utils.RecvFd(p.parentPipe)
if err != nil {
return newSystemErrorWithCause(err, "getting master pty from child pipe")
}
if p.process.consoleChan == nil {
// TODO: Don't panic here, do something more sane.
panic("consoleChan is nil")
}
p.process.consoleChan <- masterFile
if err := writeSync(p.parentPipe, procConsoleAck); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'ack fd'")
}
case procReady:
// This shouldn't happen.
panic("unexpected procReady in setns")
case procHooks:
// This shouldn't happen.
panic("unexpected procHooks in setns")
default:
return newSystemError(fmt.Errorf("invalid JSON payload from child"))
}
return nil
})
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err)
}
// wait for the child process to fully complete and receive an error message
// if one was encoutered
var ierr *genericError
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
return newSystemErrorWithCause(err, "calling shutdown on init pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return newSystemError(ierr)
return ierr
}
return nil
}
@ -123,7 +150,7 @@ func (p *setnsProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
return newSystemError(err)
return newSystemErrorWithCause(err, "waiting on setns process to finish")
}
if !status.Success() {
p.cmd.Wait()
@ -132,7 +159,7 @@ func (p *setnsProcess) execSetns() error {
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return newSystemError(err)
return newSystemErrorWithCause(err, "reading pid from init pipe")
}
process, err := os.FindProcess(pid.Pid)
if err != nil {
@ -144,7 +171,7 @@ func (p *setnsProcess) execSetns() error {
}
// terminate sends a SIGKILL to the forked process for the setns routine then waits to
// avoid the process becomming a zombie.
// avoid the process becoming a zombie.
func (p *setnsProcess) terminate() error {
if p.cmd.Process == nil {
return nil
@ -186,6 +213,7 @@ type initProcess struct {
process *Process
bootstrapData io.Reader
sharePidns bool
rootDir *os.File
}
func (p *initProcess) pid() int {
@ -221,6 +249,7 @@ func (p *initProcess) execSetns() error {
return err
}
p.cmd.Process = process
p.process.ops = p
return nil
}
@ -229,28 +258,29 @@ func (p *initProcess) start() error {
err := p.cmd.Start()
p.process.ops = p
p.childPipe.Close()
p.rootDir.Close()
if err != nil {
p.process.ops = nil
return newSystemError(err)
return newSystemErrorWithCause(err, "starting init process command")
}
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return err
}
if err := p.execSetns(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "running exec setns process for init")
}
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(p.pid())
if err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
}
p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children
// can escape the cgroup
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
}
defer func() {
if err != nil {
@ -259,60 +289,69 @@ func (p *initProcess) start() error {
}
}()
if err := p.createNetworkInterfaces(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "creating network interfaces")
}
if err := p.sendConfig(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "sending config to init process")
}
var (
procSync syncT
sentRun bool
sentResume bool
ierr *genericError
)
dec := json.NewDecoder(p.parentPipe)
loop:
for {
if err := dec.Decode(&procSync); err != nil {
if err == io.EOF {
break loop
ierr := parseSync(p.parentPipe, func(sync *syncT) error {
switch sync.Type {
case procConsole:
if err := writeSync(p.parentPipe, procConsoleReq); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'request fd'")
}
masterFile, err := utils.RecvFd(p.parentPipe)
if err != nil {
return newSystemErrorWithCause(err, "getting master pty from child pipe")
}
if p.process.consoleChan == nil {
// TODO: Don't panic here, do something more sane.
panic("consoleChan is nil")
}
p.process.consoleChan <- masterFile
if err := writeSync(p.parentPipe, procConsoleAck); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'ack fd'")
}
return newSystemError(err)
}
switch procSync.Type {
case procReady:
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
}
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting oom score for ready process")
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting rlimits for ready process")
}
// call prestart hooks
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
if p.config.Config.Hooks != nil {
s := configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Root: p.config.Config.Rootfs,
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
}
for _, hook := range p.config.Config.Hooks.Prestart {
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
}
// Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
return newSystemError(err)
if err := writeSync(p.parentPipe, procRun); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'run'")
}
sentRun = true
case procHooks:
@ -321,48 +360,40 @@ loop:
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Root: p.config.Config.Rootfs,
BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
}
for _, hook := range p.config.Config.Hooks.Prestart {
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
// Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil {
return newSystemError(err)
if err := writeSync(p.parentPipe, procResume); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'resume'")
}
sentResume = true
case procError:
// wait for the child process to fully complete and receive an error message
// if one was encoutered
if err := dec.Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
}
if ierr != nil {
break loop
}
// Programmer error.
panic("No error following JSON procError payload.")
default:
return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
return newSystemError(fmt.Errorf("invalid JSON payload from child"))
}
}
return nil
})
if !sentRun {
return newSystemError(fmt.Errorf("could not synchronise with container process: %v", ierr))
return newSystemErrorWithCause(ierr, "container init")
}
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "shutting down init pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return newSystemError(ierr)
return ierr
}
return nil
}
@ -374,7 +405,7 @@ func (p *initProcess) wait() (*os.ProcessState, error) {
}
// we should kill all processes in cgroup when init is died if we use host PID namespace
if p.sharePidns {
killCgroupProcesses(p.manager)
signalAllProcesses(p.manager, syscall.SIGKILL)
}
return p.cmd.ProcessState, nil
}
@ -435,6 +466,8 @@ func getPipeFds(pid int) ([]string, error) {
dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
for i := 0; i < 3; i++ {
// XXX: This breaks if the path is not a valid symlink (which can
// happen in certain particularly unlucky mount namespace setups).
f := filepath.Join(dirPath, strconv.Itoa(i))
target, err := os.Readlink(f)
if err != nil {
@ -445,9 +478,11 @@ func getPipeFds(pid int) ([]string, error) {
return fds, nil
}
// InitializeIO creates pipes for use with the process's STDIO
// and returns the opposite side for each
func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
// InitializeIO creates pipes for use with the process's stdio and returns the
// opposite side for each. Do not use this if you want to have a pseudoterminal
// set up for you by libcontainer (TODO: fix that too).
// TODO: This is mostly unnecessary, and should be handled by clients.
func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
var fds []uintptr
i = &IO{}
// cleanup in case of an error
@ -479,7 +514,7 @@ func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
p.Stderr, i.Stderr = w, r
// change ownership of the pipes incase we are in a user namespace
for _, fd := range fds {
if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil {
if err := syscall.Fchown(int(fd), rootuid, rootgid); err != nil {
return nil, err
}
}

View file

@ -16,6 +16,7 @@ import (
"github.com/docker/docker/pkg/mount"
"github.com/docker/docker/pkg/symlink"
"github.com/mrunalp/fileutils"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/label"
@ -25,51 +26,56 @@ import (
const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
// setupDev returns true if /dev needs to be set up.
// needsSetupDev returns true if /dev needs to be set up.
func needsSetupDev(config *configs.Config) bool {
for _, m := range config.Mounts {
if m.Device == "bind" && (m.Destination == "/dev" || m.Destination == "/dev/") {
if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" {
return false
}
}
return true
}
// setupRootfs sets up the devices, mount points, and filesystems for use inside a
// new mount namespace.
func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) {
// prepareRootfs sets up the devices, mount points, and filesystems for use
// inside a new mount namespace. It doesn't set anything as ro or pivot_root,
// because console setup happens inside the caller. You must call
// finalizeRootfs in order to finish the rootfs setup.
func prepareRootfs(pipe io.ReadWriter, config *configs.Config) (err error) {
if err := prepareRoot(config); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "preparing rootfs")
}
setupDev := needsSetupDev(config)
for _, m := range config.Mounts {
for _, precmd := range m.PremountCmds {
if err := mountCmd(precmd); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "running premount command")
}
}
if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination)
}
for _, postcmd := range m.PostmountCmds {
if err := mountCmd(postcmd); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "running postmount command")
}
}
}
if setupDev {
if err := createDevices(config); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "creating device nodes")
}
if err := setupPtmx(config, console); err != nil {
return newSystemError(err)
if err := setupPtmx(config); err != nil {
return newSystemErrorWithCause(err, "setting up ptmx")
}
if err := setupDevSymlinks(config.Rootfs); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting up /dev symlinks")
}
}
// Signal the parent to run the pre-start hooks.
// The hooks are run after the mounts are setup, but before we switch to the new
// root, so that the old root is still available in the hooks for any mount
@ -77,52 +83,70 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit
if err := syncParentHooks(pipe); err != nil {
return err
}
// The reason these operations are done here rather than in finalizeRootfs
// is because the console-handling code gets quite sticky if we have to set
// up the console before doing the pivot_root(2). This is because the
// Console API has to also work with the ExecIn case, which means that the
// API must be able to deal with being inside as well as outside the
// container. It's just cleaner to do this here (at the expense of the
// operation not being perfectly split).
if err := syscall.Chdir(config.Rootfs); err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs)
}
if config.NoPivotRoot {
err = msMoveRoot(config.Rootfs)
} else {
err = pivotRoot(config.Rootfs, config.PivotDir)
err = pivotRoot(config.Rootfs)
}
if err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "jailing process inside rootfs")
}
if setupDev {
if err := reOpenDevNull(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "reopening /dev/null inside container")
}
}
// remount dev as ro if specifed
return nil
}
// finalizeRootfs actually switches the root of the process and sets anything
// to ro if necessary. You must call prepareRootfs first.
func finalizeRootfs(config *configs.Config) (err error) {
// remount dev as ro if specified
for _, m := range config.Mounts {
if m.Destination == "/dev" {
if m.Flags&syscall.MS_RDONLY != 0 {
if err := remountReadonly(m.Destination); err != nil {
return newSystemError(err)
if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
if m.Flags&syscall.MS_RDONLY == syscall.MS_RDONLY {
if err := remountReadonly(m); err != nil {
return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
}
}
break
}
}
// set rootfs ( / ) as readonly
if config.Readonlyfs {
if err := setReadonly(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting rootfs as readonly")
}
}
syscall.Umask(0022)
return nil
}
func mountCmd(cmd configs.Command) error {
command := exec.Command(cmd.Path, cmd.Args[:]...)
command.Env = cmd.Env
command.Dir = cmd.Dir
if out, err := command.CombinedOutput(); err != nil {
return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err)
}
return nil
}
@ -154,15 +178,41 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
}
return nil
case "tmpfs":
copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
tmpDir := ""
stat, err := os.Stat(dest)
if err != nil {
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
}
if copyUp {
tmpDir, err = ioutil.TempDir("/tmp", "runctmpdir")
if err != nil {
return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
}
defer os.RemoveAll(tmpDir)
m.Destination = tmpDir
}
if err := mountPropagate(m, rootfs, mountLabel); err != nil {
return err
}
if copyUp {
if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err)
if err1 := syscall.Unmount(tmpDir, syscall.MNT_DETACH); err1 != nil {
return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
}
return errMsg
}
if err := syscall.Mount(tmpDir, dest, "", syscall.MS_MOVE, ""); err != nil {
errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err)
if err1 := syscall.Unmount(tmpDir, syscall.MNT_DETACH); err1 != nil {
return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
}
return errMsg
}
}
if stat != nil {
if err = os.Chmod(dest, stat.Mode()); err != nil {
return err
@ -240,34 +290,23 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
return err
}
}
// create symlinks for merged cgroups
cwd, err := os.Getwd()
if err != nil {
return err
}
if err := os.Chdir(filepath.Join(rootfs, m.Destination)); err != nil {
return err
}
for _, mc := range merged {
for _, ss := range strings.Split(mc, ",") {
if err := os.Symlink(mc, ss); err != nil {
// if cgroup already exists, then okay(it could have been created before)
if os.IsExist(err) {
continue
}
os.Chdir(cwd)
// symlink(2) is very dumb, it will just shove the path into
// the link and doesn't do any checks or relative path
// conversion. Also, don't error out if the cgroup already exists.
if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) {
return err
}
}
}
if err := os.Chdir(cwd); err != nil {
return err
}
if m.Flags&syscall.MS_RDONLY != 0 {
// remount cgroup root as readonly
mcgrouproot := &configs.Mount{
Source: m.Destination,
Device: "bind",
Destination: m.Destination,
Flags: defaultMountFlags | syscall.MS_RDONLY,
Flags: defaultMountFlags | syscall.MS_RDONLY | syscall.MS_BIND,
}
if err := remount(mcgrouproot, rootfs); err != nil {
return err
@ -283,7 +322,7 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
}
func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
mounts, err := cgroups.GetCgroupMounts()
mounts, err := cgroups.GetCgroupMounts(false)
if err != nil {
return nil, err
}
@ -307,7 +346,7 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
binds = append(binds, &configs.Mount{
Device: "bind",
Source: filepath.Join(mm.Mountpoint, relDir),
Destination: filepath.Join(m.Destination, strings.Join(mm.Subsystems, ",")),
Destination: filepath.Join(m.Destination, filepath.Base(mm.Mountpoint)),
Flags: syscall.MS_BIND | syscall.MS_REC | m.Flags,
PropagationFlags: m.PropagationFlags,
})
@ -319,9 +358,6 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
// checkMountDestination checks to ensure that the mount destination is not over the top of /proc.
// dest is required to be an abs path and have any symlinks resolved before calling this function.
func checkMountDestination(rootfs, dest string) error {
if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) {
return fmt.Errorf("mounting into / is prohibited")
}
invalidDestinations := []string{
"/proc",
}
@ -333,6 +369,8 @@ func checkMountDestination(rootfs, dest string) error {
"/proc/diskstats",
"/proc/meminfo",
"/proc/stat",
"/proc/swaps",
"/proc/uptime",
"/proc/net/dev",
}
for _, valid := range validDestinations {
@ -515,10 +553,10 @@ func getParentMount(rootfs string) (string, string, error) {
}
// Make parent mount private if it was shared
func rootfsParentMountPrivate(config *configs.Config) error {
func rootfsParentMountPrivate(rootfs string) error {
sharedMount := false
parentMount, optionalOpts, err := getParentMount(config.Rootfs)
parentMount, optionalOpts, err := getParentMount(rootfs)
if err != nil {
return err
}
@ -551,7 +589,10 @@ func prepareRoot(config *configs.Config) error {
return err
}
if err := rootfsParentMountPrivate(config); err != nil {
// Make parent mount private to make sure following bind mount does
// not propagate in other namespaces. Also it will help with kernel
// check pass in pivot_root. (IS_SHARED(new_mnt->mnt_parent))
if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
return err
}
@ -562,7 +603,7 @@ func setReadonly() error {
return syscall.Mount("/", "/", "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, "")
}
func setupPtmx(config *configs.Config, console *linuxConsole) error {
func setupPtmx(config *configs.Config) error {
ptmx := filepath.Join(config.Rootfs, "dev/ptmx")
if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
return err
@ -570,47 +611,61 @@ func setupPtmx(config *configs.Config, console *linuxConsole) error {
if err := os.Symlink("pts/ptmx", ptmx); err != nil {
return fmt.Errorf("symlink dev ptmx %s", err)
}
if console != nil {
return console.mount(config.Rootfs, config.MountLabel)
}
return nil
}
func pivotRoot(rootfs, pivotBaseDir string) (err error) {
if pivotBaseDir == "" {
pivotBaseDir = "/"
}
tmpDir := filepath.Join(rootfs, pivotBaseDir)
if err := os.MkdirAll(tmpDir, 0755); err != nil {
return fmt.Errorf("can't create tmp dir %s, error %v", tmpDir, err)
}
pivotDir, err := ioutil.TempDir(tmpDir, ".pivot_root")
if err != nil {
return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err)
}
defer func() {
errVal := os.Remove(pivotDir)
if err == nil {
err = errVal
}
}()
if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
return fmt.Errorf("pivot_root %s", err)
}
if err := syscall.Chdir("/"); err != nil {
return fmt.Errorf("chdir / %s", err)
}
// path to pivot dir now changed, update
pivotDir = filepath.Join(pivotBaseDir, filepath.Base(pivotDir))
// pivotRoot will call pivot_root such that rootfs becomes the new root
// filesystem, and everything else is cleaned up.
func pivotRoot(rootfs string) error {
// While the documentation may claim otherwise, pivot_root(".", ".") is
// actually valid. What this results in is / being the new root but
// /proc/self/cwd being the old root. Since we can play around with the cwd
// with pivot_root this allows us to pivot without creating directories in
// the rootfs. Shout-outs to the LXC developers for giving us this idea.
// Make pivotDir rprivate to make sure any of the unmounts don't
// propagate to parent.
if err := syscall.Mount("", pivotDir, "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil {
oldroot, err := syscall.Open("/", syscall.O_DIRECTORY|syscall.O_RDONLY, 0)
if err != nil {
return err
}
defer syscall.Close(oldroot)
newroot, err := syscall.Open(rootfs, syscall.O_DIRECTORY|syscall.O_RDONLY, 0)
if err != nil {
return err
}
defer syscall.Close(newroot)
// Change to the new root so that the pivot_root actually acts on it.
if err := syscall.Fchdir(newroot); err != nil {
return err
}
if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
return fmt.Errorf("unmount pivot_root dir %s", err)
if err := syscall.PivotRoot(".", "."); err != nil {
return fmt.Errorf("pivot_root %s", err)
}
// Currently our "." is oldroot (according to the current kernel code).
// However, purely for safety, we will fchdir(oldroot) since there isn't
// really any guarantee from the kernel what /proc/self/cwd will be after a
// pivot_root(2).
if err := syscall.Fchdir(oldroot); err != nil {
return err
}
// Make oldroot rprivate to make sure our unmounts don't propagate to the
// host (and thus bork the machine).
if err := syscall.Mount("", ".", "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil {
return err
}
// Preform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
if err := syscall.Unmount(".", syscall.MNT_DETACH); err != nil {
return err
}
// Switch back to our shiny new root.
if err := syscall.Chdir("/"); err != nil {
return fmt.Errorf("chdir / %s", err)
}
return nil
}
@ -645,17 +700,26 @@ func createIfNotExists(path string, isDir bool) error {
return nil
}
// remountReadonly will bind over the top of an existing path and ensure that it is read-only.
func remountReadonly(path string) error {
// readonlyPath will make a path read only.
func readonlyPath(path string) error {
if err := syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
return syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, "")
}
// remountReadonly will remount an existing mount point and ensure that it is read-only.
func remountReadonly(m *configs.Mount) error {
var (
dest = m.Destination
flags = m.Flags
)
for i := 0; i < 5; i++ {
if err := syscall.Mount("", path, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil && !os.IsNotExist(err) {
if err := syscall.Mount("", dest, "", uintptr(flags|syscall.MS_REMOUNT|syscall.MS_RDONLY), ""); err != nil {
switch err {
case syscall.EINVAL:
// Probably not a mountpoint, use bind-mount
if err := syscall.Mount(path, path, "", syscall.MS_BIND, ""); err != nil {
return err
}
return syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC|defaultMountFlags, "")
case syscall.EBUSY:
time.Sleep(100 * time.Millisecond)
continue
@ -665,13 +729,19 @@ func remountReadonly(path string) error {
}
return nil
}
return fmt.Errorf("unable to mount %s as readonly max retries reached", path)
return fmt.Errorf("unable to mount %s as readonly max retries reached", dest)
}
// maskFile bind mounts /dev/null over the top of the specified path inside a container
// to avoid security issues from processes reading information from non-namespace aware mounts ( proc/kcore ).
func maskFile(path string) error {
// maskPath masks the top of the specified path inside a container to avoid
// security issues from processes reading information from non-namespace aware
// mounts ( proc/kcore ).
// For files, maskPath bind mounts /dev/null over the top of the specified path.
// For directories, maskPath mounts read-only tmpfs over the top of the specified path.
func maskPath(path string) error {
if err := syscall.Mount("/dev/null", path, "", syscall.MS_BIND, ""); err != nil && !os.IsNotExist(err) {
if err == syscall.ENOTDIR {
return syscall.Mount("tmpfs", path, "tmpfs", syscall.MS_RDONLY, "")
}
return err
}
return nil
@ -705,10 +775,12 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
data = label.FormatMountLabel(m.Data, mountLabel)
flags = m.Flags
)
if dest == "/dev" {
if libcontainerUtils.CleanPath(dest) == "/dev" {
flags &= ^syscall.MS_RDONLY
}
if !strings.HasPrefix(dest, rootfs) {
copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
if !(copyUp || strings.HasPrefix(dest, rootfs)) {
dest = filepath.Join(rootfs, dest)
}

View file

@ -32,14 +32,6 @@ func TestCheckMountDestFalsePositive(t *testing.T) {
}
}
func TestCheckMountRoot(t *testing.T) {
dest := "/rootfs"
err := checkMountDestination("/rootfs", dest)
if err == nil {
t.Fatal(err)
}
}
func TestNeedsSetupDev(t *testing.T) {
config := &configs.Config{
Mounts: []*configs.Mount{

View file

@ -36,6 +36,11 @@ var archs = map[string]string{
"SCMP_ARCH_MIPSEL": "mipsel",
"SCMP_ARCH_MIPSEL64": "mipsel64",
"SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
"SCMP_ARCH_PPC": "ppc",
"SCMP_ARCH_PPC64": "ppc64",
"SCMP_ARCH_PPC64LE": "ppc64le",
"SCMP_ARCH_S390": "s390",
"SCMP_ARCH_S390X": "s390x",
}
// ConvertStringToOperator converts a string into a Seccomp comparison operator.

View file

@ -212,10 +212,6 @@ func parseStatusFile(path string) (map[string]string, error) {
status := make(map[string]string)
for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
text := s.Text()
parts := strings.Split(text, ":")
@ -225,5 +221,9 @@ func parseStatusFile(path string) (map[string]string, error) {
status[parts[0]] = parts[1]
}
if err := s.Err(); err != nil {
return nil, err
}
return status, nil
}

View file

@ -10,7 +10,7 @@ import (
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
// Seccomp not supported, do nothing
// InitSeccomp does nothing because seccomp is not supported.
func InitSeccomp(config *configs.Seccomp) error {
if config != nil {
return ErrSeccompNotEnabled

View file

@ -16,7 +16,6 @@ import (
"sync"
"syscall"
"github.com/docker/docker/pkg/mount"
"github.com/opencontainers/runc/libcontainer/system"
)
@ -33,20 +32,106 @@ const (
stRdOnly = 0x01
)
type selinuxState struct {
enabledSet bool
enabled bool
selinuxfsSet bool
selinuxfs string
mcsList map[string]bool
sync.Mutex
}
var (
assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`)
mcsList = make(map[string]bool)
mcsLock sync.Mutex
selinuxfs = "unknown"
selinuxEnabled = false // Stores whether selinux is currently enabled
selinuxEnabledChecked = false // Stores whether selinux enablement has been checked or established yet
assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`)
state = selinuxState{
mcsList: make(map[string]bool),
}
)
type SELinuxContext map[string]string
func (s *selinuxState) setEnable(enabled bool) bool {
s.Lock()
defer s.Unlock()
s.enabledSet = true
s.enabled = enabled
return s.enabled
}
func (s *selinuxState) getEnabled() bool {
s.Lock()
enabled := s.enabled
enabledSet := s.enabledSet
s.Unlock()
if enabledSet {
return enabled
}
enabled = false
if fs := getSelinuxMountPoint(); fs != "" {
if con, _ := Getcon(); con != "kernel" {
enabled = true
}
}
return s.setEnable(enabled)
}
// SetDisabled disables selinux support for the package
func SetDisabled() {
selinuxEnabled, selinuxEnabledChecked = false, true
state.setEnable(false)
}
func (s *selinuxState) setSELinuxfs(selinuxfs string) string {
s.Lock()
defer s.Unlock()
s.selinuxfsSet = true
s.selinuxfs = selinuxfs
return s.selinuxfs
}
func (s *selinuxState) getSELinuxfs() string {
s.Lock()
selinuxfs := s.selinuxfs
selinuxfsSet := s.selinuxfsSet
s.Unlock()
if selinuxfsSet {
return selinuxfs
}
selinuxfs = ""
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return selinuxfs
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
txt := scanner.Text()
// Safe as mountinfo encodes mountpoints with spaces as \040.
sepIdx := strings.Index(txt, " - ")
if sepIdx == -1 {
continue
}
if !strings.Contains(txt[sepIdx:], "selinuxfs") {
continue
}
fields := strings.Split(txt, " ")
if len(fields) < 5 {
continue
}
selinuxfs = fields[4]
break
}
if selinuxfs != "" {
var buf syscall.Statfs_t
syscall.Statfs(selinuxfs, &buf)
if (buf.Flags & stRdOnly) == 1 {
selinuxfs = ""
}
}
return s.setSELinuxfs(selinuxfs)
}
// getSelinuxMountPoint returns the path to the mountpoint of an selinuxfs
@ -55,43 +140,12 @@ func SetDisabled() {
// processes. The existence of an selinuxfs mount is used to determine
// whether selinux is currently enabled or not.
func getSelinuxMountPoint() string {
if selinuxfs != "unknown" {
return selinuxfs
}
selinuxfs = ""
mounts, err := mount.GetMounts()
if err != nil {
return selinuxfs
}
for _, mount := range mounts {
if mount.Fstype == "selinuxfs" {
selinuxfs = mount.Mountpoint
break
}
}
if selinuxfs != "" {
var buf syscall.Statfs_t
syscall.Statfs(selinuxfs, &buf)
if (buf.Flags & stRdOnly) == 1 {
selinuxfs = ""
}
}
return selinuxfs
return state.getSELinuxfs()
}
// SelinuxEnabled returns whether selinux is currently enabled.
func SelinuxEnabled() bool {
if selinuxEnabledChecked {
return selinuxEnabled
}
selinuxEnabledChecked = true
if fs := getSelinuxMountPoint(); fs != "" {
if con, _ := Getcon(); con != "kernel" {
selinuxEnabled = true
}
}
return selinuxEnabled
return state.getEnabled()
}
func readConfig(target string) (value string) {
@ -269,19 +323,19 @@ func SelinuxGetEnforceMode() int {
}
func mcsAdd(mcs string) error {
mcsLock.Lock()
defer mcsLock.Unlock()
if mcsList[mcs] {
state.Lock()
defer state.Unlock()
if state.mcsList[mcs] {
return fmt.Errorf("MCS Label already exists")
}
mcsList[mcs] = true
state.mcsList[mcs] = true
return nil
}
func mcsDelete(mcs string) {
mcsLock.Lock()
mcsList[mcs] = false
mcsLock.Unlock()
state.Lock()
defer state.Unlock()
state.mcsList[mcs] = false
}
func IntToMcs(id int, catRange uint32) string {
@ -297,7 +351,7 @@ func IntToMcs(id int, catRange uint32) string {
for ORD > TIER {
ORD = ORD - TIER
TIER -= 1
TIER--
}
TIER = SETSIZE - TIER
ORD = ORD + TIER
@ -320,9 +374,7 @@ func uniqMcs(catRange uint32) string {
continue
} else {
if c1 > c2 {
t := c1
c1 = c2
c2 = t
c1, c2 = c2, c1
}
}
mcs = fmt.Sprintf("s0:c%d,c%d", c1, c2)
@ -341,6 +393,12 @@ func FreeLxcContexts(scon string) {
}
}
var roFileLabel string
func GetROFileLabel() (fileLabel string) {
return roFileLabel
}
func GetLxcContexts() (processLabel string, fileLabel string) {
var (
val, key string
@ -385,6 +443,9 @@ func GetLxcContexts() (processLabel string, fileLabel string) {
if key == "file" {
fileLabel = strings.Trim(val, "\"")
}
if key == "ro_file" {
roFileLabel = strings.Trim(val, "\"")
}
}
}
@ -392,6 +453,9 @@ func GetLxcContexts() (processLabel string, fileLabel string) {
return "", ""
}
if roFileLabel == "" {
roFileLabel = fileLabel
}
exit:
// mcs := IntToMcs(os.Getpid(), 1024)
mcs := uniqMcs(1024)
@ -432,13 +496,13 @@ func badPrefix(fpath string) error {
for _, prefix := range badprefixes {
if fpath == prefix || strings.HasPrefix(fpath, fmt.Sprintf("%s/", prefix)) {
return fmt.Errorf("Relabeling content in %s is not allowed.", prefix)
return fmt.Errorf("relabeling content in %s is not allowed", prefix)
}
}
return nil
}
// Change the fpath file object to the SELinux label scon.
// Chcon changes the fpath file object to the SELinux label scon.
// If the fpath is a directory and recurse is true Chcon will walk the
// directory tree setting the label
func Chcon(fpath string, scon string, recurse bool) error {
@ -472,14 +536,14 @@ func DupSecOpt(src string) []string {
con["level"] == "" {
return nil
}
return []string{"label:user:" + con["user"],
"label:role:" + con["role"],
"label:type:" + con["type"],
"label:level:" + con["level"]}
return []string{"user:" + con["user"],
"role:" + con["role"],
"type:" + con["type"],
"level:" + con["level"]}
}
// DisableSecOpt returns a security opt that can be used to disabling SELinux
// labeling support for future container processes
func DisableSecOpt() []string {
return []string{"label:disable"}
return []string{"disable"}
}

View file

@ -16,6 +16,7 @@ import (
// linuxSetnsInit performs the container's initialization for running a new process
// inside an existing container.
type linuxSetnsInit struct {
pipe *os.File
config *initConfig
}
@ -24,9 +25,19 @@ func (l *linuxSetnsInit) getSessionRingName() string {
}
func (l *linuxSetnsInit) Init() error {
// do not inherit the parent's session keyring
if _, err := keyctl.JoinSessionKeyring(l.getSessionRingName()); err != nil {
return err
if !l.config.Config.NoNewKeyring {
// do not inherit the parent's session keyring
if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
return err
}
}
if l.config.CreateConsole {
if err := setupConsole(l.pipe, l.config, false); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return err
}
}
if l.config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
@ -44,10 +55,8 @@ func (l *linuxSetnsInit) Init() error {
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
if l.config.ProcessLabel != "" {
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}

View file

@ -8,12 +8,10 @@ import (
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"syscall"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/seccomp"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
@ -22,7 +20,7 @@ import (
const wildcard = -1
var namespaceMapping = map[specs.NamespaceType]configs.NamespaceType{
var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{
specs.PIDNamespace: configs.NEWPID,
specs.NetworkNamespace: configs.NEWNET,
specs.MountNamespace: configs.NEWNS,
@ -145,6 +143,7 @@ type CreateOpts struct {
CgroupName string
UseSystemdCgroup bool
NoPivotRoot bool
NoNewKeyring bool
Spec *specs.Spec
}
@ -165,14 +164,17 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
if !filepath.IsAbs(rootfsPath) {
rootfsPath = filepath.Join(cwd, rootfsPath)
}
labels := []string{}
for k, v := range spec.Annotations {
labels = append(labels, fmt.Sprintf("%s=%s", k, v))
}
config := &configs.Config{
Rootfs: rootfsPath,
NoPivotRoot: opts.NoPivotRoot,
Readonlyfs: spec.Root.Readonly,
Hostname: spec.Hostname,
Labels: []string{
"bundle=" + cwd,
},
Rootfs: rootfsPath,
NoPivotRoot: opts.NoPivotRoot,
Readonlyfs: spec.Root.Readonly,
Hostname: spec.Hostname,
Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)),
NoNewKeyring: opts.NoNewKeyring,
}
exists := false
@ -185,6 +187,9 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
if !exists {
return nil, fmt.Errorf("namespace %q does not exist", ns)
}
if config.Namespaces.Contains(t) {
return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns)
}
config.Namespaces.Add(t, ns.Path)
}
if config.Namespaces.Contains(configs.NEWNET) {
@ -218,12 +223,12 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
}
config.Seccomp = seccomp
}
config.Sysctl = spec.Linux.Sysctl
if oomScoreAdj := spec.Linux.Resources.OOMScoreAdj; oomScoreAdj != nil {
config.OomScoreAdj = *oomScoreAdj
if spec.Process.SelinuxLabel != "" {
config.ProcessLabel = spec.Process.SelinuxLabel
}
for _, g := range spec.Process.User.AdditionalGids {
config.AdditionalGroups = append(config.AdditionalGroups, strconv.FormatUint(uint64(g), 10))
config.Sysctl = spec.Linux.Sysctl
if spec.Linux.Resources != nil && spec.Linux.Resources.OOMScoreAdj != nil {
config.OomScoreAdj = *spec.Linux.Resources.OOMScoreAdj
}
createHooks(spec, config)
config.MountLabel = spec.Linux.MountLabel
@ -232,7 +237,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
}
func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
flags, pgflags, data := parseMountOptions(m.Options)
flags, pgflags, data, ext := parseMountOptions(m.Options)
source := m.Source
if m.Type == "bind" {
if !filepath.IsAbs(source) {
@ -246,20 +251,18 @@ func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
Data: data,
Flags: flags,
PropagationFlags: pgflags,
Extensions: ext,
}
}
func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (*configs.Cgroup, error) {
var (
err error
myCgroupPath string
)
var myCgroupPath string
c := &configs.Cgroup{
Resources: &configs.Resources{},
}
if spec.Linux.CgroupsPath != nil {
if spec.Linux != nil && spec.Linux.CgroupsPath != nil {
myCgroupPath = libcontainerUtils.CleanPath(*spec.Linux.CgroupsPath)
if useSystemdCgroup {
myCgroupPath = *spec.Linux.CgroupsPath
@ -284,16 +287,15 @@ func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (*
}
} else {
if myCgroupPath == "" {
myCgroupPath, err = cgroups.GetThisCgroupDir("devices")
if err != nil {
return nil, err
}
myCgroupPath = filepath.Join(myCgroupPath, name)
c.Name = name
}
c.Path = myCgroupPath
}
c.Resources.AllowedDevices = allowedDevices
if spec.Linux == nil {
return c, nil
}
r := spec.Linux.Resources
if r == nil {
return c, nil
@ -376,7 +378,7 @@ func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (*
}
}
if r.Pids != nil {
c.Resources.PidsLimit = *r.Pids.Limit
c.Resources.PidsLimit = r.Pids.Limit
}
if r.BlockIO != nil {
if r.BlockIO.Weight != nil {
@ -387,39 +389,50 @@ func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (*
}
if r.BlockIO.WeightDevice != nil {
for _, wd := range r.BlockIO.WeightDevice {
weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, *wd.Weight, *wd.LeafWeight)
var weight, leafWeight uint16
if wd.Weight != nil {
weight = *wd.Weight
}
if wd.LeafWeight != nil {
leafWeight = *wd.LeafWeight
}
weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight)
c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice)
}
}
if r.BlockIO.ThrottleReadBpsDevice != nil {
for _, td := range r.BlockIO.ThrottleReadBpsDevice {
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate)
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleWriteBpsDevice != nil {
for _, td := range r.BlockIO.ThrottleWriteBpsDevice {
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate)
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleReadIOPSDevice != nil {
for _, td := range r.BlockIO.ThrottleReadIOPSDevice {
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate)
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleWriteIOPSDevice != nil {
for _, td := range r.BlockIO.ThrottleWriteIOPSDevice {
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate)
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice)
}
}
}
for _, l := range r.HugepageLimits {
c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{
Pagesize: *l.Pagesize,
Limit: *l.Limit,
Pagesize: l.Pagesize,
Limit: l.Limit,
})
}
if r.DisableOOMKiller != nil {
@ -427,7 +440,7 @@ func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (*
}
if r.Network != nil {
if r.Network.ClassID != nil {
c.Resources.NetClsClassid = string(*r.Network.ClassID)
c.Resources.NetClsClassid = *r.Network.ClassID
}
for _, m := range r.Network.Priorities {
c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{
@ -513,6 +526,8 @@ func createDevices(spec *specs.Spec, config *configs.Config) error {
// merge in additional devices from the spec
for _, d := range spec.Linux.Devices {
var uid, gid uint32
var filemode os.FileMode = 0666
if d.UID != nil {
uid = *d.UID
}
@ -523,12 +538,15 @@ func createDevices(spec *specs.Spec, config *configs.Config) error {
if err != nil {
return err
}
if d.FileMode != nil {
filemode = *d.FileMode
}
device := &configs.Device{
Type: dt,
Path: d.Path,
Major: d.Major,
Minor: d.Minor,
FileMode: *d.FileMode,
FileMode: filemode,
Uid: uid,
Gid: gid,
}
@ -541,11 +559,7 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
if len(spec.Linux.UIDMappings) == 0 {
return nil
}
// do not override the specified user namespace path
if config.Namespaces.PathOf(configs.NEWUSER) == "" {
config.Namespaces.Add(configs.NEWUSER, "")
}
create := func(m specs.IDMapping) configs.IDMap {
create := func(m specs.LinuxIDMapping) configs.IDMap {
return configs.IDMap{
HostID: int(m.HostID),
ContainerID: int(m.ContainerID),
@ -575,11 +589,12 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
// parseMountOptions parses the string and returns the flags, propagation
// flags and any mount data that it contains.
func parseMountOptions(options []string) (int, []int, string) {
func parseMountOptions(options []string) (int, []int, string, int) {
var (
flag int
pgflag []int
data []string
flag int
pgflag []int
data []string
extFlags int
)
flags := map[string]struct {
clear bool
@ -611,18 +626,21 @@ func parseMountOptions(options []string) (int, []int, string) {
"suid": {true, syscall.MS_NOSUID},
"sync": {false, syscall.MS_SYNCHRONOUS},
}
propagationFlags := map[string]struct {
propagationFlags := map[string]int{
"private": syscall.MS_PRIVATE,
"shared": syscall.MS_SHARED,
"slave": syscall.MS_SLAVE,
"unbindable": syscall.MS_UNBINDABLE,
"rprivate": syscall.MS_PRIVATE | syscall.MS_REC,
"rshared": syscall.MS_SHARED | syscall.MS_REC,
"rslave": syscall.MS_SLAVE | syscall.MS_REC,
"runbindable": syscall.MS_UNBINDABLE | syscall.MS_REC,
}
extensionFlags := map[string]struct {
clear bool
flag int
}{
"private": {false, syscall.MS_PRIVATE},
"shared": {false, syscall.MS_SHARED},
"slave": {false, syscall.MS_SLAVE},
"unbindable": {false, syscall.MS_UNBINDABLE},
"rprivate": {false, syscall.MS_PRIVATE | syscall.MS_REC},
"rshared": {false, syscall.MS_SHARED | syscall.MS_REC},
"rslave": {false, syscall.MS_SLAVE | syscall.MS_REC},
"runbindable": {false, syscall.MS_UNBINDABLE | syscall.MS_REC},
"tmpcopyup": {false, configs.EXT_COPYUP},
}
for _, o := range options {
// If the option does not exist in the flags table or the flag
@ -634,16 +652,22 @@ func parseMountOptions(options []string) (int, []int, string) {
} else {
flag |= f.flag
}
} else if f, exists := propagationFlags[o]; exists && f.flag != 0 {
pgflag = append(pgflag, f.flag)
} else if f, exists := propagationFlags[o]; exists && f != 0 {
pgflag = append(pgflag, f)
} else if f, exists := extensionFlags[o]; exists && f.flag != 0 {
if f.clear {
extFlags &= ^f.flag
} else {
extFlags |= f.flag
}
} else {
data = append(data, o)
}
}
return flag, pgflag, strings.Join(data, ",")
return flag, pgflag, strings.Join(data, ","), extFlags
}
func setupSeccomp(config *specs.Seccomp) (*configs.Seccomp, error) {
func setupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) {
if config == nil {
return nil, nil
}

View file

@ -1,9 +1,8 @@
// build +linux
// +build linux
package specconv
import (
"strings"
"testing"
"github.com/opencontainers/runtime-spec/specs-go"
@ -13,7 +12,9 @@ func TestLinuxCgroupsPathSpecified(t *testing.T) {
cgroupsPath := "/user/cgroups/path/id"
spec := &specs.Spec{}
spec.Linux.CgroupsPath = &cgroupsPath
spec.Linux = &specs.Linux{
CgroupsPath: &cgroupsPath,
}
cgroup, err := createCgroupConfig("ContainerID", false, spec)
if err != nil {
@ -33,7 +34,31 @@ func TestLinuxCgroupsPathNotSpecified(t *testing.T) {
t.Errorf("Couldn't create Cgroup config: %v", err)
}
if !strings.HasSuffix(cgroup.Path, "/ContainerID") {
t.Errorf("Wrong cgroupsPath, expected it to have suffix '%s' got '%s'", "/ContainerID", cgroup.Path)
if cgroup.Path != "" {
t.Errorf("Wrong cgroupsPath, expected it to be empty string, got '%s'", cgroup.Path)
}
}
func TestDupNamespaces(t *testing.T) {
spec := &specs.Spec{
Linux: &specs.Linux{
Namespaces: []specs.LinuxNamespace{
{
Type: "pid",
},
{
Type: "pid",
Path: "/proc/1/ns/pid",
},
},
},
}
_, err := CreateLibcontainerConfig(&CreateOpts{
Spec: spec,
})
if err == nil {
t.Errorf("Duplicated namespaces should be forbidden")
}
}

View file

@ -2,14 +2,14 @@ package stacktrace
import "runtime"
// Caputure captures a stacktrace for the current calling go program
// Capture captures a stacktrace for the current calling go program
//
// skip is the number of frames to skip
func Capture(userSkip int) Stacktrace {
var (
skip = userSkip + 1 // add one for our own function
frames []Frame
prevPc uintptr = 0
prevPc uintptr
)
for i := skip; ; i++ {
pc, file, line, ok := runtime.Caller(i)

View file

@ -19,9 +19,9 @@ func TestCaptureTestFunc(t *testing.T) {
// the first frame is the caller
frame := stack.Frames[0]
if expected := "captureFunc"; frame.Function != expected {
t.Fatalf("expteced function %q but recevied %q", expected, frame.Function)
t.Fatalf("expected function %q but recevied %q", expected, frame.Function)
}
expected := "github.com/opencontainers/runc/libcontainer/stacktrace"
expected := "/runc/libcontainer/stacktrace"
if !strings.HasSuffix(frame.Package, expected) {
t.Fatalf("expected package %q but received %q", expected, frame.Package)
}

View file

@ -4,8 +4,8 @@ package libcontainer
import (
"fmt"
"io"
"os"
"os/exec"
"syscall"
"github.com/opencontainers/runc/libcontainer/apparmor"
@ -17,9 +17,10 @@ import (
)
type linuxStandardInit struct {
pipe io.ReadWriter
parentPid int
config *initConfig
pipe *os.File
parentPid int
stateDirFD int
config *initConfig
}
func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
@ -43,30 +44,20 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
const PR_SET_NO_NEW_PRIVS = 0x26
func (l *linuxStandardInit) Init() error {
ringname, keepperms, newperms := l.getSessionRingParams()
if !l.config.Config.NoNewKeyring {
ringname, keepperms, newperms := l.getSessionRingParams()
// do not inherit the parent's session keyring
sessKeyId, err := keyctl.JoinSessionKeyring(ringname)
if err != nil {
return err
}
// make session keyring searcheable
if err := keyctl.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return err
}
var console *linuxConsole
if l.config.Console != "" {
console = newConsoleFromPath(l.config.Console)
if err := console.dupStdio(); err != nil {
return err
}
}
if console != nil {
if err := system.Setctty(); err != nil {
// do not inherit the parent's session keyring
sessKeyId, err := keys.JoinSessionKeyring(ringname)
if err != nil {
return err
}
// make session keyring searcheable
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return err
}
}
if err := setupNetwork(l.config); err != nil {
return err
}
@ -75,12 +66,33 @@ func (l *linuxStandardInit) Init() error {
}
label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace
// prepareRootfs() can be executed only for a new mount namespace.
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := setupRootfs(l.config.Config, console, l.pipe); err != nil {
if err := prepareRootfs(l.pipe, l.config.Config); err != nil {
return err
}
}
// Set up the console. This has to be done *before* we finalize the rootfs,
// but *after* we've given the user the chance to set up all of the mounts
// they wanted.
if l.config.CreateConsole {
if err := setupConsole(l.pipe, l.config, true); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return err
}
}
// Finish the rootfs setup.
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := finalizeRootfs(l.config.Config); err != nil {
return err
}
}
if hostname := l.config.Config.Hostname; hostname != "" {
if err := syscall.Sethostname([]byte(hostname)); err != nil {
return err
@ -99,12 +111,12 @@ func (l *linuxStandardInit) Init() error {
}
}
for _, path := range l.config.Config.ReadonlyPaths {
if err := remountReadonly(path); err != nil {
if err := readonlyPath(path); err != nil {
return err
}
}
for _, path := range l.config.Config.MaskPaths {
if err := maskFile(path); err != nil {
if err := maskPath(path); err != nil {
return err
}
}
@ -123,7 +135,10 @@ func (l *linuxStandardInit) Init() error {
if err := syncParentReady(l.pipe); err != nil {
return err
}
if l.config.Config.Seccomp != nil {
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err
}
@ -136,12 +151,39 @@ func (l *linuxStandardInit) Init() error {
if err := pdeath.Restore(); err != nil {
return err
}
// compare the parent from the inital start of the init process and make sure that it did not change.
// if the parent changes that means it died and we were reparened to something else so we should
// compare the parent from the initial start of the init process and make sure that it did not change.
// if the parent changes that means it died and we were reparented to something else so we should
// just kill ourself and not cause problems for someone else.
if syscall.Getppid() != l.parentPid {
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
// check for the arg before waiting to make sure it exists and it is returned
// as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// close the pipe to signal that we have completed our init.
l.pipe.Close()
// wait for the fifo to be opened on the other side before
// exec'ing the users process.
fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
if err != nil {
return newSystemErrorWithCause(err, "openat exec fifo")
}
if _, err := syscall.Write(fd, []byte("0")); err != nil {
return newSystemErrorWithCause(err, "write 0 exec fifo")
}
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
}
}
// close the statedir fd before exec because the kernel resets dumpable in the wrong order
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
syscall.Close(l.stateDirFD)
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
}
return nil
}

View file

@ -6,6 +6,7 @@ import (
"fmt"
"os"
"path/filepath"
"syscall"
"github.com/Sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/configs"
@ -38,7 +39,7 @@ type containerState interface {
func destroy(c *linuxContainer) error {
if !c.config.Namespaces.Contains(configs.NEWPID) {
if err := killCgroupProcesses(c.cgroupManager); err != nil {
if err := signalAllProcesses(c.cgroupManager, syscall.SIGKILL); err != nil {
logrus.Warn(err)
}
}
@ -59,7 +60,6 @@ func runPoststopHooks(c *linuxContainer) error {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Root: c.config.Rootfs,
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
}
for _, hook := range c.config.Hooks.Poststop {
@ -77,15 +77,12 @@ type stoppedState struct {
}
func (b *stoppedState) status() Status {
return Destroyed
return Stopped
}
func (b *stoppedState) transition(s containerState) error {
switch s.(type) {
case *runningState:
b.c.state = s
return nil
case *restoredState:
case *runningState, *restoredState:
b.c.state = s
return nil
case *stoppedState:
@ -110,11 +107,11 @@ func (r *runningState) status() Status {
func (r *runningState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
running, err := r.c.isRunning()
t, err := r.c.runType()
if err != nil {
return err
}
if running {
if t == Running {
return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
}
r.c.state = s
@ -129,16 +126,40 @@ func (r *runningState) transition(s containerState) error {
}
func (r *runningState) destroy() error {
running, err := r.c.isRunning()
t, err := r.c.runType()
if err != nil {
return err
}
if running {
if t == Running {
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
}
return destroy(r.c)
}
type createdState struct {
c *linuxContainer
}
func (i *createdState) status() Status {
return Created
}
func (i *createdState) transition(s containerState) error {
switch s.(type) {
case *runningState, *pausedState, *stoppedState:
i.c.state = s
return nil
case *createdState:
return nil
}
return newStateTransitionError(i, s)
}
func (i *createdState) destroy() error {
i.c.initProcess.signal(syscall.SIGKILL)
return destroy(i.c)
}
// pausedState represents a container that is currently pause. It cannot be destroyed in a
// paused state and must transition back to running first.
type pausedState struct {
@ -161,11 +182,11 @@ func (p *pausedState) transition(s containerState) error {
}
func (p *pausedState) destroy() error {
isRunning, err := p.c.isRunning()
t, err := p.c.runType()
if err != nil {
return err
}
if !isRunning {
if t != Running && t != Created {
if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
@ -174,8 +195,8 @@ func (p *pausedState) destroy() error {
return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
}
// restoredState is the same as the running state but also has accociated checkpoint
// information that maybe need destroyed when the container is stopped and destory is called.
// restoredState is the same as the running state but also has associated checkpoint
// information that maybe need destroyed when the container is stopped and destroy is called.
type restoredState struct {
imageDir string
c *linuxContainer
@ -187,9 +208,7 @@ func (r *restoredState) status() Status {
func (r *restoredState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
return nil
case *runningState:
case *stoppedState, *runningState:
return nil
}
return newStateTransitionError(r, s)
@ -204,23 +223,23 @@ func (r *restoredState) destroy() error {
return destroy(r.c)
}
// createdState is used whenever a container is restored, loaded, or setting additional
// loadedState is used whenever a container is restored, loaded, or setting additional
// processes inside and it should not be destroyed when it is exiting.
type createdState struct {
type loadedState struct {
c *linuxContainer
s Status
}
func (n *createdState) status() Status {
func (n *loadedState) status() Status {
return n.s
}
func (n *createdState) transition(s containerState) error {
func (n *loadedState) transition(s containerState) error {
n.c.state = s
return nil
}
func (n *createdState) destroy() error {
func (n *loadedState) destroy() error {
if err := n.c.refreshState(); err != nil {
return err
}

View file

@ -6,10 +6,11 @@ import "testing"
func TestStateStatus(t *testing.T) {
states := map[containerState]Status{
&stoppedState{}: Destroyed,
&stoppedState{}: Stopped,
&runningState{}: Running,
&restoredState{}: Running,
&pausedState{}: Paused,
&createdState{}: Created,
}
for s, status := range states {
if s.status() != status {

View file

@ -0,0 +1,7 @@
package libcontainer
// Solaris - TODO
type Stats struct {
Interfaces []*NetworkInterface
}

View file

@ -0,0 +1,110 @@
package libcontainer
import (
"encoding/json"
"fmt"
"io"
"github.com/opencontainers/runc/libcontainer/utils"
)
type syncType string
// Constants that are used for synchronisation between the parent and child
// during container setup. They come in pairs (with procError being a generic
// response which is followed by a &genericError).
//
// [ child ] <-> [ parent ]
//
// procHooks --> [run hooks]
// <-- procResume
//
// procConsole -->
// <-- procConsoleReq
// [send(fd)] --> [recv(fd)]
// <-- procConsoleAck
//
// procReady --> [final setup]
// <-- procRun
const (
procError syncType = "procError"
procReady syncType = "procReady"
procRun syncType = "procRun"
procHooks syncType = "procHooks"
procResume syncType = "procResume"
procConsole syncType = "procConsole"
procConsoleReq syncType = "procConsoleReq"
procConsoleAck syncType = "procConsoleAck"
)
type syncT struct {
Type syncType `json:"type"`
}
// writeSync is used to write to a synchronisation pipe. An error is returned
// if there was a problem writing the payload.
func writeSync(pipe io.Writer, sync syncType) error {
if err := utils.WriteJSON(pipe, syncT{sync}); err != nil {
return err
}
return nil
}
// readSync is used to read from a synchronisation pipe. An error is returned
// if we got a genericError, the pipe was closed, or we got an unexpected flag.
func readSync(pipe io.Reader, expected syncType) error {
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type == procError {
var ierr genericError
if err := json.NewDecoder(pipe).Decode(&ierr); err != nil {
return fmt.Errorf("failed reading error from parent: %v", err)
}
return &ierr
}
if procSync.Type != expected {
return fmt.Errorf("invalid synchronisation flag from parent")
}
}
return nil
}
// parseSync runs the given callback function on each syncT received from the
// child. It will return once io.EOF is returned from the given pipe.
func parseSync(pipe io.Reader, fn func(*syncT) error) error {
dec := json.NewDecoder(pipe)
for {
var sync syncT
if err := dec.Decode(&sync); err != nil {
if err == io.EOF {
break
}
return err
}
// We handle this case outside fn for cleanliness reasons.
var ierr *genericError
if sync.Type == procError {
if err := dec.Decode(&ierr); err != nil && err != io.EOF {
return newSystemErrorWithCause(err, "decoding proc error from init")
}
if ierr != nil {
return ierr
}
// Programmer error.
panic("No error following JSON procError payload.")
}
if err := fn(&sync); err != nil {
return err
}
}
return nil
}

View file

@ -100,17 +100,12 @@ func Setctty() error {
return nil
}
/*
* Detect whether we are currently running in a user namespace.
* Copied from github.com/lxc/lxd/shared/util.go
*/
// RunningInUserNS detects whether we are currently running in a user namespace.
// Copied from github.com/lxc/lxd/shared/util.go
func RunningInUserNS() bool {
file, err := os.Open("/proc/self/uid_map")
if err != nil {
/*
* This kernel-provided file only exists if user namespaces are
* supported
*/
// This kernel-provided file only exists if user namespaces are supported
return false
}
defer file.Close()

View file

@ -14,8 +14,10 @@ func GetProcessStartTime(pid int) (string, error) {
if err != nil {
return "", err
}
return parseStartTime(string(data))
}
parts := strings.Split(string(data), " ")
func parseStartTime(stat string) (string, error) {
// the starttime is located at pos 22
// from the man page
//
@ -23,5 +25,19 @@ func GetProcessStartTime(pid int) (string, error) {
// (22) The time the process started after system boot. In kernels before Linux 2.6, this
// value was expressed in jiffies. Since Linux 2.6, the value is expressed in clock ticks
// (divide by sysconf(_SC_CLK_TCK)).
return parts[22-1], nil // starts at 1
//
// NOTE:
// pos 2 could contain space and is inside `(` and `)`:
// (2) comm %s
// The filename of the executable, in parentheses.
// This is visible whether or not the executable is
// swapped out.
//
// the following is an example:
// 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
// get parts after last `)`:
s := strings.Split(stat, ")")
parts := strings.Split(strings.TrimSpace(s[len(s)-1]), " ")
return parts[22-3], nil // starts at 3 (after the filename pos `2`)
}

View file

@ -0,0 +1,20 @@
package system
import "testing"
func TestParseStartTime(t *testing.T) {
data := map[string]string{
"4902 (gunicorn: maste) S 4885 4902 4902 0 -1 4194560 29683 29929 61 83 78 16 96 17 20 0 1 0 9126532 52965376 1903 18446744073709551615 4194304 7461796 140733928751520 140733928698072 139816984959091 0 0 16781312 137447943 1 0 0 17 3 0 0 9 0 0 9559488 10071156 33050624 140733928758775 140733928758945 140733928758945 140733928759264 0": "9126532",
"9534 (cat) R 9323 9534 9323 34828 9534 4194304 95 0 0 0 0 0 0 0 20 0 1 0 9214966 7626752 168 18446744073709551615 4194304 4240332 140732237651568 140732237650920 140570710391216 0 0 0 0 0 0 0 17 1 0 0 0 0 0 6340112 6341364 21553152 140732237653865 140732237653885 140732237653885 140732237656047 0": "9214966",
"24767 (irq/44-mei_me) S 2 0 0 0 -1 2129984 0 0 0 0 0 0 0 0 -51 0 1 0 8722075 0 0 18446744073709551615 0 0 0 0 0 0 0 2147483647 0 0 0 0 17 1 50 1 0 0 0 0 0 0 0 0 0 0 0": "8722075",
}
for line, startTime := range data {
st, err := parseStartTime(line)
if err != nil {
t.Fatal(err)
}
if startTime != st {
t.Fatalf("expected start time %q but received %q", startTime, st)
}
}
}

View file

@ -8,7 +8,7 @@ import (
// Setuid sets the uid of the calling thread to the specified uid.
func Setuid(uid int) (err error) {
_, _, e1 := syscall.RawSyscall(syscall.SYS_SETUID, uintptr(uid), 0, 0)
_, _, e1 := syscall.RawSyscall(syscall.SYS_SETUID32, uintptr(uid), 0, 0)
if e1 != 0 {
err = e1
}

View file

@ -343,7 +343,7 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (
if len(groups) > 0 {
// First match wins, even if there's more than one matching entry.
user.Gid = groups[0].Gid
} else if groupArg != "" {
} else {
// If we can't find a group with the given name, the only other valid
// option is if it's a numeric group name with no associated entry in group.

View file

@ -7,6 +7,8 @@ import (
"strconv"
"strings"
"testing"
"github.com/opencontainers/runc/libcontainer/utils"
)
func TestUserParseLine(t *testing.T) {
@ -382,6 +384,12 @@ this is just some garbage data
}
func TestGetAdditionalGroups(t *testing.T) {
type foo struct {
groups []string
expected []int
hasError bool
}
const groupContent = `
root:x:0:root
adm:x:43:
@ -389,11 +397,7 @@ grp:x:1234:root,adm
adm:x:4343:root,adm-duplicate
this is just some garbage data
`
tests := []struct {
groups []string
expected []int
hasError bool
}{
tests := []foo{
{
// empty group
groups: []string{},
@ -436,12 +440,15 @@ this is just some garbage data
expected: nil,
hasError: true,
},
{
}
if utils.GetIntSize() > 4 {
tests = append(tests, foo{
// groups with too large id
groups: []string{strconv.Itoa(1 << 31)},
expected: nil,
hasError: true,
},
})
}
for _, test := range tests {

View file

@ -0,0 +1,148 @@
/*
* Copyright 2016 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include "cmsg.h"
#define error(fmt, ...) \
({ \
fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
errno = ECOMM; \
goto err; /* return value */ \
})
/*
* Sends a file descriptor along the sockfd provided. Returns the return
* value of sendmsg(2). Any synchronisation and preparation of state
* should be done external to this (we expect the other side to be in
* recvfd() in the code).
*/
ssize_t sendfd(int sockfd, struct file_t file)
{
struct msghdr msg = {0};
struct iovec iov[1] = {0};
struct cmsghdr *cmsg;
int *fdptr;
int ret;
union {
char buf[CMSG_SPACE(sizeof(file.fd))];
struct cmsghdr align;
} u;
/*
* We need to send some other data along with the ancillary data,
* otherwise the other side won't recieve any data. This is very
* well-hidden in the documentation (and only applies to
* SOCK_STREAM). See the bottom part of unix(7).
*/
iov[0].iov_base = file.name;
iov[0].iov_len = strlen(file.name) + 1;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_control = u.buf;
msg.msg_controllen = sizeof(u.buf);
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
fdptr = (int *) CMSG_DATA(cmsg);
memcpy(fdptr, &file.fd, sizeof(int));
return sendmsg(sockfd, &msg, 0);
}
/*
* Receives a file descriptor from the sockfd provided. Returns the file
* descriptor as sent from sendfd(). It will return the file descriptor
* or die (literally) trying. Any synchronisation and preparation of
* state should be done external to this (we expect the other side to be
* in sendfd() in the code).
*/
struct file_t recvfd(int sockfd)
{
struct msghdr msg = {0};
struct iovec iov[1] = {0};
struct cmsghdr *cmsg;
struct file_t file = {0};
int *fdptr;
int olderrno;
union {
char buf[CMSG_SPACE(sizeof(file.fd))];
struct cmsghdr align;
} u;
/* Allocate a buffer. */
/* TODO: Make this dynamic with MSG_PEEK. */
file.name = malloc(TAG_BUFFER);
if (!file.name)
error("recvfd: failed to allocate file.tag buffer\n");
/*
* We need to "recieve" the non-ancillary data even though we don't
* plan to use it at all. Otherwise, things won't work as expected.
* See unix(7) and other well-hidden documentation.
*/
iov[0].iov_base = file.name;
iov[0].iov_len = TAG_BUFFER;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_control = u.buf;
msg.msg_controllen = sizeof(u.buf);
ssize_t ret = recvmsg(sockfd, &msg, 0);
if (ret < 0)
goto err;
cmsg = CMSG_FIRSTHDR(&msg);
if (!cmsg)
error("recvfd: got NULL from CMSG_FIRSTHDR");
if (cmsg->cmsg_level != SOL_SOCKET)
error("recvfd: expected SOL_SOCKET in cmsg: %d", cmsg->cmsg_level);
if (cmsg->cmsg_type != SCM_RIGHTS)
error("recvfd: expected SCM_RIGHTS in cmsg: %d", cmsg->cmsg_type);
if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
error("recvfd: expected correct CMSG_LEN in cmsg: %lu", (unsigned long)cmsg->cmsg_len);
fdptr = (int *) CMSG_DATA(cmsg);
if (!fdptr || *fdptr < 0)
error("recvfd: recieved invalid pointer");
file.fd = *fdptr;
return file;
err:
olderrno = errno;
free(file.name);
errno = olderrno;
return (struct file_t){0};
}

View file

@ -0,0 +1,57 @@
// +build linux
package utils
/*
* Copyright 2016 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
#include <errno.h>
#include <stdlib.h>
#include "cmsg.h"
*/
import "C"
import (
"os"
"unsafe"
)
// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
// socket. The file name of the remote file descriptor will be recreated
// locally (it is sent as non-auxiliary data in the same payload).
func RecvFd(socket *os.File) (*os.File, error) {
file, err := C.recvfd(C.int(socket.Fd()))
if err != nil {
return nil, err
}
defer C.free(unsafe.Pointer(file.name))
return os.NewFile(uintptr(file.fd), C.GoString(file.name)), nil
}
// SendFd sends a file descriptor over the given AF_UNIX socket. In
// addition, the file.Name() of the given file will also be sent as
// non-auxiliary data in the same payload (allowing to send contextual
// information for a file descriptor).
func SendFd(socket, file *os.File) error {
var cfile C.struct_file_t
cfile.fd = C.int(file.Fd())
cfile.name = C.CString(file.Name())
defer C.free(unsafe.Pointer(cfile.name))
_, err := C.sendfd(C.int(socket.Fd()), cfile)
return err
}

View file

@ -0,0 +1,36 @@
/*
* Copyright 2016 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#if !defined(CMSG_H)
#define CMSG_H
#include <sys/types.h>
/* TODO: Implement this properly with MSG_PEEK. */
#define TAG_BUFFER 4096
/* This mirrors Go's (*os.File). */
struct file_t {
char *name;
int fd;
};
struct file_t recvfd(int sockfd);
ssize_t sendfd(int sockfd, struct file_t file);
#endif /* !defined(CMSG_H) */

View file

@ -9,6 +9,7 @@ import (
"path/filepath"
"strings"
"syscall"
"unsafe"
)
const (
@ -100,3 +101,26 @@ func SearchLabels(labels []string, query string) string {
}
return ""
}
// Annotations returns the bundle path and user defined annotations from the
// libcontainer state. We need to remove the bundle because that is a label
// added by libcontainer.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
userAnnotations = make(map[string]string)
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == "bundle" {
bundle = parts[1]
} else {
userAnnotations[parts[0]] = parts[1]
}
}
return
}
func GetIntSize() int {
return int(unsafe.Sizeof(1))
}

View file

@ -133,21 +133,21 @@ func TestWriteJSON(t *testing.T) {
func TestCleanPath(t *testing.T) {
path := CleanPath("")
if path != "" {
t.Errorf("expected to received empty string and received %s", path)
t.Errorf("expected to receive empty string and received %s", path)
}
path = CleanPath("rootfs")
if path != "rootfs" {
t.Errorf("expected to received 'rootfs' and received %s", path)
t.Errorf("expected to receive 'rootfs' and received %s", path)
}
path = CleanPath("../../../var")
if path != "var" {
t.Errorf("expected to received 'var' and received %s", path)
t.Errorf("expected to receive 'var' and received %s", path)
}
path = CleanPath("/../../../var")
if path != "/var" {
t.Errorf("expected to received '/var' and received %s", path)
t.Errorf("expected to receive '/var' and received %s", path)
}
}