update vendor

Signed-off-by: Jess Frazelle <acidburn@microsoft.com>
This commit is contained in:
Jess Frazelle 2018-09-25 12:27:46 -04:00
parent 19a32db84d
commit 94d1cfbfbf
No known key found for this signature in database
GPG key ID: 18F3685C0022BFF3
10501 changed files with 2307943 additions and 29279 deletions

View file

@ -3,7 +3,6 @@
package fs
import (
"errors"
"fmt"
"io"
"io/ioutil"
@ -14,6 +13,8 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
var (
@ -35,7 +36,7 @@ var (
HugePageSizes, _ = cgroups.GetHugePageSize()
)
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist")
type subsystemSet []subsystem
@ -62,9 +63,10 @@ type subsystem interface {
}
type Manager struct {
mu sync.Mutex
Cgroups *configs.Cgroup
Paths map[string]string
mu sync.Mutex
Cgroups *configs.Cgroup
Rootless bool
Paths map[string]string
}
// The absolute path to the root of the cgroup hierarchies.
@ -100,6 +102,33 @@ type cgroupData struct {
pid int
}
// isIgnorableError reports whether err may be skipped over because it is, in
// a broad sense, a permission failure hit by a rootless caller. Besides the
// regular EPERM this also covers EROFS and EACCES, which for an unprivileged
// user amount to the same thing. When rootless is false nothing is ignorable
// and the result is always false.
func isIgnorableError(rootless bool, err error) bool {
	// Root gets no leniency: every error counts.
	if !rootless {
		return false
	}

	// Look at the underlying cause rather than any wrapping added on top.
	cause := errors.Cause(err)

	// The common case: the os package already recognises it as a
	// permission problem.
	if os.IsPermission(cause) {
		return true
	}

	// Otherwise dig the raw errno out of the os error wrappers; any other
	// error type leaves errno nil and falls through to false.
	var errno error
	switch wrapped := cause.(type) {
	case *os.PathError:
		errno = wrapped.Err
	case *os.LinkError:
		errno = wrapped.Err
	case *os.SyscallError:
		errno = wrapped.Err
	}

	switch errno {
	case unix.EROFS, unix.EPERM, unix.EACCES:
		return true
	}
	return false
}
func (m *Manager) Apply(pid int) (err error) {
if m.Cgroups == nil {
return nil
@ -145,11 +174,11 @@ func (m *Manager) Apply(pid int) (err error) {
m.Paths[sys.Name()] = p
if err := sys.Apply(d); err != nil {
if os.IsPermission(err) && m.Cgroups.Path == "" {
// If we didn't set a cgroup path, then let's defer the error here
// until we know whether we have set limits or not.
// If we hadn't set limits, then it's ok that we couldn't join this cgroup, because
// it will have the same limits as its parent.
// In the case of rootless, where an explicit cgroup path hasn't
// been set, we don't bail on error in case of permission problems.
// Cases where limits have been set (and we couldn't create our own
// cgroup) are handled by Set.
if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" {
delete(m.Paths, sys.Name())
continue
}
@ -208,8 +237,9 @@ func (m *Manager) Set(container *configs.Config) error {
path := paths[sys.Name()]
if err := sys.Set(path, container.Cgroups); err != nil {
if path == "" {
// cgroup never applied
return fmt.Errorf("cannot set limits on the %s cgroup, as the container has not joined it", sys.Name())
// We never created a path for this cgroup, so we cannot set
// limits for it (though we have already tried at this point).
return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
}
return err
}

View file

@ -77,7 +77,7 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
// The logic is, if user specified cpuset configs, use these
// specified configs, otherwise, inherit from parent. This makes
// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
// keep backward compatbility.
// keep backward compatibility.
if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
return err
}

View file

@ -86,7 +86,7 @@ func expectMemoryStatEquals(t *testing.T, expected, actual cgroups.MemoryStats)
expectMemoryDataEquals(t, expected.KernelUsage, actual.KernelUsage)
if expected.UseHierarchy != actual.UseHierarchy {
logrus.Printf("Expected memory use hiearchy %v, but found %v\n", expected.UseHierarchy, actual.UseHierarchy)
logrus.Printf("Expected memory use hierarchy %v, but found %v\n", expected.UseHierarchy, actual.UseHierarchy)
t.Fail()
}

View file

@ -5,6 +5,7 @@ package systemd
import (
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strings"
@ -75,7 +76,8 @@ var (
hasStartTransientUnit bool
hasStartTransientSliceUnit bool
hasTransientDefaultDependencies bool
hasDelegate bool
hasDelegateScope bool
hasDelegateSlice bool
)
func newProp(name string, units interface{}) systemdDbus.Property {
@ -150,12 +152,12 @@ func UseSystemd() bool {
theConn.StopUnit(scope, "replace", nil)
// Assume StartTransientUnit on a scope allows Delegate
hasDelegate = true
dl := newProp("Delegate", true)
if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil {
hasDelegateScope = true
dlScope := newProp("Delegate", true)
if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dlScope}, nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
hasDelegate = false
hasDelegateScope = false
}
}
}
@ -187,6 +189,22 @@ func UseSystemd() bool {
time.Sleep(time.Millisecond)
}
// Not critical because of the stop unit logic above.
theConn.StopUnit(slice, "replace", nil)
// Assume StartTransientUnit on a slice allows Delegate
hasDelegateSlice = true
dlSlice := newProp("Delegate", true)
if _, err := theConn.StartTransientUnit(slice, "replace", []systemdDbus.Property{dlSlice}, nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
// Starting with systemd v237, Delegate is not even a property of slices anymore,
// so the D-Bus call fails with "InvalidArgs" error.
if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") || strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.InvalidArgs") {
hasDelegateSlice = false
}
}
}
// Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil)
theConn.StopUnit(slice, "replace", nil)
@ -242,9 +260,16 @@ func (m *Manager) Apply(pid int) error {
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
}
if hasDelegate {
// This is only supported on systemd versions 218 and above.
properties = append(properties, newProp("Delegate", true))
// Check if we can delegate. This is only supported on systemd versions 218 and above.
if strings.HasSuffix(unitName, ".slice") {
if hasDelegateSlice {
// systemd 237 and above no longer allows delegation on a slice
properties = append(properties, newProp("Delegate", true))
}
} else {
if hasDelegateScope {
properties = append(properties, newProp("Delegate", true))
}
}
// Always enable accounting, this gets us the same behaviour as the fs implementation,
@ -271,13 +296,19 @@ func (m *Manager) Apply(pid int) error {
// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
cpuQuotaPerSecUSec := uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
// (integer percentage of CPU) internally. This means that if a fractional percent of
// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
if cpuQuotaPerSecUSec%10000 != 0 {
cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
// corresponds to USEC_INFINITY in systemd
// if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
// always setting a property value ensures we can apply a quota and remove it later
cpuQuotaPerSecUSec := uint64(math.MaxUint64)
if c.Resources.CpuQuota > 0 {
// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
// (integer percentage of CPU) internally. This means that if a fractional percent of
// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
if cpuQuotaPerSecUSec%10000 != 0 {
cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
}
}
properties = append(properties,
newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
@ -296,17 +327,17 @@ func (m *Manager) Apply(pid int) error {
}
}
statusChan := make(chan string)
if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err != nil && !isUnitExists(err) {
statusChan := make(chan string, 1)
if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
select {
case <-statusChan:
case <-time.After(time.Second):
logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
}
} else if !isUnitExists(err) {
return err
}
select {
case <-statusChan:
case <-time.After(time.Second):
logrus.Warnf("Timed out while waiting for StartTransientUnit completion signal from dbus. Continuing...")
}
if err := joinCgroups(c, pid); err != nil {
return err
}

View file

@ -13,7 +13,7 @@ import (
"strings"
"time"
"github.com/docker/go-units"
units "github.com/docker/go-units"
)
const (
@ -103,7 +103,7 @@ func FindCgroupMountpointDir() (string, error) {
}
if postSeparatorFields[0] == "cgroup" {
// Check that the mount is properly formated.
// Check that the mount is properly formatted.
if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
}
@ -151,19 +151,20 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
Root: fields[3],
}
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if !ss[opt] {
seen, known := ss[opt]
if !known || (!all && seen) {
continue
}
ss[opt] = true
if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
} else {
m.Subsystems = append(m.Subsystems, opt)
}
if !all {
numFound++
opt = opt[len(cgroupNamePrefix):]
}
m.Subsystems = append(m.Subsystems, opt)
numFound++
}
if len(m.Subsystems) > 0 || all {
res = append(res, m)
}
res = append(res, m)
}
if err := scanner.Err(); err != nil {
return nil, err
@ -187,7 +188,7 @@ func GetCgroupMounts(all bool) ([]Mount, error) {
allMap := make(map[string]bool)
for s := range allSubsystems {
allMap[s] = true
allMap[s] = false
}
return getCgroupMountsHelper(allMap, f, all)
}
@ -262,7 +263,7 @@ func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
}
// This is needed for nested containers, because in /proc/self/cgroup we
// see pathes from host, which don't exist in container.
// see paths from host, which don't exist in container.
relCgroup, err := filepath.Rel(root, cgroup)
if err != nil {
return "", err

View file

@ -93,6 +93,62 @@ const systemdMountinfo = `115 83 0:32 / / rw,relatime - aufs none rw,si=c0bd3d3,
136 117 0:12 /1 /dev/console rw,nosuid,noexec,relatime - devpts none rw,gid=5,mode=620,ptmxmode=000
84 115 0:40 / /tmp rw,relatime - tmpfs none rw`
const bedrockMountinfo = `120 17 0:28 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
124 28 0:28 / /bedrock/strata/arch/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
123 53 0:28 / /bedrock/strata/fallback/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
122 71 0:28 / /bedrock/strata/gentoo/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
121 89 0:28 / /bedrock/strata/kde/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
125 120 0:29 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
129 124 0:29 / /bedrock/strata/arch/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
128 123 0:29 / /bedrock/strata/fallback/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
127 122 0:29 / /bedrock/strata/gentoo/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
126 121 0:29 / /bedrock/strata/kde/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
140 120 0:32 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
144 124 0:32 / /bedrock/strata/arch/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
143 123 0:32 / /bedrock/strata/fallback/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
142 122 0:32 / /bedrock/strata/gentoo/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
141 121 0:32 / /bedrock/strata/kde/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
145 120 0:33 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
149 124 0:33 / /bedrock/strata/arch/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
148 123 0:33 / /bedrock/strata/fallback/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
147 122 0:33 / /bedrock/strata/gentoo/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
146 121 0:33 / /bedrock/strata/kde/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
150 120 0:34 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
154 124 0:34 / /bedrock/strata/arch/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
153 123 0:34 / /bedrock/strata/fallback/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
152 122 0:34 / /bedrock/strata/gentoo/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
151 121 0:34 / /bedrock/strata/kde/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
155 120 0:35 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
159 124 0:35 / /bedrock/strata/arch/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
158 123 0:35 / /bedrock/strata/fallback/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
157 122 0:35 / /bedrock/strata/gentoo/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
156 121 0:35 / /bedrock/strata/kde/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
160 120 0:36 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
164 124 0:36 / /bedrock/strata/arch/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
163 123 0:36 / /bedrock/strata/fallback/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
162 122 0:36 / /bedrock/strata/gentoo/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
161 121 0:36 / /bedrock/strata/kde/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
165 120 0:37 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
169 124 0:37 / /bedrock/strata/arch/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
168 123 0:37 / /bedrock/strata/fallback/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
167 122 0:37 / /bedrock/strata/gentoo/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
166 121 0:37 / /bedrock/strata/kde/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
170 120 0:38 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
174 124 0:38 / /bedrock/strata/arch/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
173 123 0:38 / /bedrock/strata/fallback/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
172 122 0:38 / /bedrock/strata/gentoo/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
171 121 0:38 / /bedrock/strata/kde/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
175 120 0:39 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
179 124 0:39 / /bedrock/strata/arch/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
178 123 0:39 / /bedrock/strata/fallback/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
177 122 0:39 / /bedrock/strata/gentoo/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
176 121 0:39 / /bedrock/strata/kde/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
180 120 0:40 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
184 124 0:40 / /bedrock/strata/arch/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
183 123 0:40 / /bedrock/strata/fallback/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
182 122 0:40 / /bedrock/strata/gentoo/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
181 121 0:40 / /bedrock/strata/kde/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event`
const cgroup2Mountinfo = `18 64 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel
19 64 0:4 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
20 64 0:6 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8171204k,nr_inodes=2042801,mode=755
@ -132,31 +188,46 @@ func TestGetCgroupMounts(t *testing.T) {
mountInfo: fedoraMountinfo,
root: "/",
subsystems: map[string]bool{
"cpuset": true,
"cpu": true,
"cpuacct": true,
"memory": true,
"devices": true,
"freezer": true,
"net_cls": true,
"blkio": true,
"perf_event": true,
"hugetlb": true,
"cpuset": false,
"cpu": false,
"cpuacct": false,
"memory": false,
"devices": false,
"freezer": false,
"net_cls": false,
"blkio": false,
"perf_event": false,
"hugetlb": false,
},
},
{
mountInfo: systemdMountinfo,
root: "/system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope",
subsystems: map[string]bool{
"cpuset": true,
"cpu": true,
"cpuacct": true,
"memory": true,
"devices": true,
"freezer": true,
"net_cls": true,
"blkio": true,
"perf_event": true,
"cpuset": false,
"cpu": false,
"cpuacct": false,
"memory": false,
"devices": false,
"freezer": false,
"net_cls": false,
"blkio": false,
"perf_event": false,
},
},
{
mountInfo: bedrockMountinfo,
root: "/",
subsystems: map[string]bool{
"cpuset": false,
"cpu": false,
"cpuacct": false,
"memory": false,
"devices": false,
"freezer": false,
"net_cls": false,
"blkio": false,
"perf_event": false,
},
},
}
@ -199,16 +270,16 @@ func TestGetCgroupMounts(t *testing.T) {
func BenchmarkGetCgroupMounts(b *testing.B) {
subsystems := map[string]bool{
"cpuset": true,
"cpu": true,
"cpuacct": true,
"memory": true,
"devices": true,
"freezer": true,
"net_cls": true,
"blkio": true,
"perf_event": true,
"hugetlb": true,
"cpuset": false,
"cpu": false,
"cpuacct": false,
"memory": false,
"devices": false,
"freezer": false,
"net_cls": false,
"blkio": false,
"perf_event": false,
"hugetlb": false,
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
@ -276,17 +347,17 @@ func TestParseCgroupString(t *testing.T) {
func TestIgnoreCgroup2Mount(t *testing.T) {
subsystems := map[string]bool{
"cpuset": true,
"cpu": true,
"cpuacct": true,
"memory": true,
"devices": true,
"freezer": true,
"net_cls": true,
"blkio": true,
"perf_event": true,
"pids": true,
"name=systemd": true,
"cpuset": false,
"cpu": false,
"cpuacct": false,
"memory": false,
"devices": false,
"freezer": false,
"net_cls": false,
"blkio": false,
"perf_event": false,
"pids": false,
"name=systemd": false,
}
mi := bytes.NewBufferString(cgroup2Mountinfo)

View file

@ -141,9 +141,10 @@ type Config struct {
// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
// for a process. Valid values are between the range [-1000, '1000'], where processes with
// higher scores are preferred for being killed.
// higher scores are preferred for being killed. If it is unset then we don't touch the current
// value.
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj int `json:"oom_score_adj"`
OomScoreAdj *int `json:"oom_score_adj,omitempty"`
// UidMappings is an array of User ID mappings for User Namespaces
UidMappings []IDMap `json:"uid_mappings"`

View file

@ -43,13 +43,12 @@ func rootlessMappings(config *configs.Config) error {
if !config.Namespaces.Contains(configs.NEWUSER) {
return fmt.Errorf("rootless containers require user namespaces")
}
}
if len(config.UidMappings) == 0 {
return fmt.Errorf("rootless containers requires at least one UID mapping")
}
if len(config.GidMappings) == 0 {
return fmt.Errorf("rootless containers requires at least one UID mapping")
if len(config.UidMappings) == 0 {
return fmt.Errorf("rootless containers requires at least one UID mapping")
}
if len(config.GidMappings) == 0 {
return fmt.Errorf("rootless containers requires at least one GID mapping")
}
}
return nil

View file

@ -151,6 +151,16 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s)
}
}
if config.Namespaces.Contains(configs.NEWUTS) {
switch s {
case "kernel.domainname":
// This is namespaced and there's no explicit OCI field for it.
continue
case "kernel.hostname":
// This is namespaced but there's a conflicting (dedicated) OCI field for it.
return fmt.Errorf("sysctl %q is not allowed as it conflicts with the OCI %q field", s, "hostname")
}
}
return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s)
}

View file

@ -28,7 +28,6 @@ import (
"github.com/golang/protobuf/proto"
"github.com/sirupsen/logrus"
"github.com/syndtr/gocapability/capability"
"github.com/vishvananda/netlink/nl"
"golang.org/x/sys/unix"
)
@ -225,17 +224,13 @@ func (c *linuxContainer) Set(config configs.Config) error {
func (c *linuxContainer) Start(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status == Stopped {
if process.Init {
if err := c.createExecFifo(); err != nil {
return err
}
}
if err := c.start(process, status == Stopped); err != nil {
if status == Stopped {
if err := c.start(process); err != nil {
if process.Init {
c.deleteExecFifo()
}
return err
@ -244,17 +239,10 @@ func (c *linuxContainer) Start(process *Process) error {
}
func (c *linuxContainer) Run(process *Process) error {
c.m.Lock()
status, err := c.currentStatus()
if err != nil {
c.m.Unlock()
return err
}
c.m.Unlock()
if err := c.Start(process); err != nil {
return err
}
if status == Stopped {
if process.Init {
return c.exec()
}
return nil
@ -335,8 +323,8 @@ type openResult struct {
err error
}
func (c *linuxContainer) start(process *Process, isInit bool) error {
parent, err := c.newParentProcess(process, isInit)
func (c *linuxContainer) start(process *Process) error {
parent, err := c.newParentProcess(process)
if err != nil {
return newSystemErrorWithCause(err, "creating new parent process")
}
@ -349,7 +337,7 @@ func (c *linuxContainer) start(process *Process, isInit bool) error {
}
// generate a timestamp indicating when the container was started
c.created = time.Now().UTC()
if isInit {
if process.Init {
c.state = &createdState{
c: c,
}
@ -377,10 +365,6 @@ func (c *linuxContainer) start(process *Process, isInit bool) error {
}
}
}
} else {
c.state = &runningState{
c: c,
}
}
return nil
}
@ -443,7 +427,7 @@ func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
return nil
}
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
parentPipe, childPipe, err := utils.NewSockPair("init")
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new init pipe")
@ -452,7 +436,7 @@ func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProces
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new command template")
}
if !doInit {
if !p.Init {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
}
@ -477,6 +461,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &syscall.SysProcAttr{}
}
cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS")))
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
if p.ConsoleSocket != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
@ -672,7 +657,7 @@ func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.
Features: criuFeat,
}
err := c.criuSwrk(nil, req, criuOpts, false)
err := c.criuSwrk(nil, req, criuOpts, false, nil)
if err != nil {
logrus.Debugf("%s", err)
return fmt.Errorf("CRIU feature check failed")
@ -785,7 +770,7 @@ func (c *linuxContainer) checkCriuVersion(minVersion int) error {
Type: &t,
}
err := c.criuSwrk(nil, req, nil, false)
err := c.criuSwrk(nil, req, nil, false, nil)
if err != nil {
return fmt.Errorf("CRIU version check failed: %s", err)
}
@ -943,6 +928,33 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
LazyPages: proto.Bool(criuOpts.LazyPages),
}
// If the container is running in a network namespace and has
// a path to the network namespace configured, we will dump
// that network namespace as an external namespace and we
// will expect that the namespace exists during restore.
// This basically means that CRIU will ignore the namespace
// and expect to be setup correctly.
nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
if nsPath != "" {
// For this to work we need at least criu 3.11.0 => 31100.
// As there was already a successful version check we will
// not error out if it fails. runc will just behave as it used
// to do and ignore external network namespaces.
err := c.checkCriuVersion(31100)
if err == nil {
// CRIU expects the information about an external namespace
// like this: --external net[<inode>]:<key>
// This <key> is always 'extRootNetNS'.
var netns syscall.Stat_t
err = syscall.Stat(nsPath, &netns)
if err != nil {
return err
}
criuExternal := fmt.Sprintf("net[%d]:extRootNetNS", netns.Ino)
rpcOpts.External = append(rpcOpts.External, criuExternal)
}
}
fcg := c.cgroupManager.GetPaths()["freezer"]
if fcg != "" {
rpcOpts.FreezeCgroup = proto.String(fcg)
@ -1047,7 +1059,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
}
}
err = c.criuSwrk(nil, req, criuOpts, false)
err = c.criuSwrk(nil, req, criuOpts, false, nil)
if err != nil {
return err
}
@ -1091,6 +1103,8 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
var extraFiles []*os.File
// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
// support for unprivileged restore at the moment.
if c.config.Rootless {
@ -1165,6 +1179,38 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
},
}
// Same as during checkpointing. If the container has a specific network namespace
// assigned to it, this now expects that the checkpoint will be restored in an
// already created network namespace.
nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
if nsPath != "" {
	// For this to work we need at least criu 3.11.0 => 31100.
	// As there was already a successful version check we will
	// not error out if it fails. runc will just behave as it used
	// to do and ignore external network namespaces.
	err := c.checkCriuVersion(31100)
	if err == nil {
		// CRIU wants the information about an existing network namespace
		// like this: --inherit-fd fd[<fd>]:<key>
		// The <key> needs to be the same as during checkpointing.
		// We are always using 'extRootNetNS' as the key in this.
		netns, err := os.Open(nsPath)
		if err != nil {
			// Errorf (not Error) so that the %s verb is actually expanded.
			logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
			return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
		}
		// Register the close only once the open is known to have succeeded.
		// The deferred call runs when the enclosing function returns, so the
		// descriptor is still open while it is handed over via extraFiles.
		defer netns.Close()

		inheritFd := new(criurpc.InheritFd)
		inheritFd.Key = proto.String("extRootNetNS")
		// The offset of four is necessary because 0, 1, 2 and 3 is already
		// used by stdin, stdout, stderr, 'criu swrk' socket.
		inheritFd.Fd = proto.Int32(int32(4 + len(extraFiles)))
		req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
		// All open FDs need to be transferred to CRIU via extraFiles
		extraFiles = append(extraFiles, netns)
	}
}
for _, m := range c.config.Mounts {
switch m.Device {
case "bind":
@ -1223,7 +1269,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
}
}
return c.criuSwrk(process, req, criuOpts, true)
return c.criuSwrk(process, req, criuOpts, true, extraFiles)
}
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
@ -1253,7 +1299,7 @@ func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
return nil
}
func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error {
func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool, extraFiles []*os.File) error {
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
if err != nil {
return err
@ -1294,6 +1340,9 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
cmd.Stderr = process.Stderr
}
cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
if extraFiles != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
}
if err := cmd.Start(); err != nil {
return err
@ -1801,28 +1850,22 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
Value: []byte(c.newgidmapPath),
})
}
// The following only applies if we are root.
if !c.config.Rootless {
// check if we have CAP_SETGID to setgroup properly
pid, err := capability.NewPid(0)
if err != nil {
return nil, err
}
if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
r.AddData(&Boolmsg{
Type: SetgroupAttr,
Value: true,
})
}
if requiresRootOrMappingTool(c.config) {
r.AddData(&Boolmsg{
Type: SetgroupAttr,
Value: true,
})
}
}
}
// write oom_score_adj
r.AddData(&Bytemsg{
Type: OomScoreAdjAttr,
Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)),
})
if c.config.OomScoreAdj != nil {
// write oom_score_adj
r.AddData(&Bytemsg{
Type: OomScoreAdjAttr,
Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)),
})
}
// write rootless
r.AddData(&Boolmsg{
@ -1847,3 +1890,10 @@ func ignoreTerminateErrors(err error) error {
}
return err
}
// requiresRootOrMappingTool reports whether the config's GID mappings are
// anything other than a single entry mapping container GID 0 to the calling
// process's effective GID with size 1.
func requiresRootOrMappingTool(c *configs.Config) bool {
	// The one mapping this function treats as the "plain" case.
	selfOnly := []configs.IDMap{{
		ContainerID: 0,
		HostID:      os.Getegid(),
		Size:        1,
	}}
	if reflect.DeepEqual(c.GidMappings, selfOnly) {
		return false
	}
	return true
}

View file

@ -159,7 +159,7 @@ func TestGetContainerStats(t *testing.T) {
t.Fatal("cgroup stats are nil")
}
if stats.CgroupStats.MemoryStats.Usage.Usage != 1024 {
t.Fatalf("expected memory usage 1024 but recevied %d", stats.CgroupStats.MemoryStats.Usage.Usage)
t.Fatalf("expected memory usage 1024 but received %d", stats.CgroupStats.MemoryStats.Usage.Usage)
}
if intelrdt.IsEnabled() {
if stats.IntelRdtStats == nil {

View file

@ -0,0 +1,105 @@
package devices
import (
"errors"
"io/ioutil"
"os"
"path/filepath"
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
var (
// ErrNotADevice is the error DeviceFromPath returns for a path it does not
// consider to be a device node.
ErrNotADevice = errors.New("not a device node")
)
// Testing dependencies
//
// The functions below are called through package-level variables so that
// tests in this package can substitute their own implementations.
var (
// unixLstat is used by DeviceFromPath to stat the device path.
unixLstat = unix.Lstat
// ioutilReadDir is used by getDevices to list a directory.
ioutilReadDir = ioutil.ReadDir
)
// DeviceFromPath looks up the device node at path (without following a
// trailing symlink) and returns its description as a configs.Device.
//
// permissions is the cgroup permission string to record for the device; it is
// copied verbatim into the result because it cannot easily be queried from the
// system.
//
// It returns the lstat error unchanged when the path cannot be inspected, and
// ErrNotADevice when the node has a major number of zero or is neither a block
// nor a character device.
func DeviceFromPath(path, permissions string) (*configs.Device, error) {
	var stat unix.Stat_t
	if err := unixLstat(path, &stat); err != nil {
		return nil, err
	}

	var (
		devNumber = uint64(stat.Rdev)
		major     = unix.Major(devNumber)
		minor     = unix.Minor(devNumber)
	)
	if major == 0 {
		return nil, ErrNotADevice
	}

	var (
		devType rune
		mode    = stat.Mode
	)
	// The file type is an enumerated value stored in the S_IFMT bits, not a
	// set of independent flags: S_IFBLK includes the S_IFCHR bit, and so do
	// other types such as S_IFLNK. Mask the field out and compare it exactly
	// instead of testing individual bits.
	switch mode & unix.S_IFMT {
	case unix.S_IFBLK:
		devType = 'b'
	case unix.S_IFCHR:
		devType = 'c'
	default:
		// Anything else is not a device node; do not hand back a Device with
		// an unset Type.
		return nil, ErrNotADevice
	}

	return &configs.Device{
		Type:        devType,
		Path:        path,
		Major:       int64(major),
		Minor:       int64(minor),
		Permissions: permissions,
		FileMode:    os.FileMode(mode),
		Uid:         stat.Uid,
		Gid:         stat.Gid,
	}, nil
}
// HostDevices returns the devices found by scanning the /dev directory tree
// with getDevices.
func HostDevices() ([]*configs.Device, error) {
	const hostDevDir = "/dev"
	return getDevices(hostDevDir)
}
// getDevices lists the directory at path and returns a Device for every entry
// that DeviceFromPath accepts, descending recursively into subdirectories.
//
// A fixed set of subdirectories and the entry named "console" are skipped.
// Entries that are not device nodes, or that no longer exist by the time they
// are inspected, are ignored; any other error aborts the walk.
func getDevices(path string) ([]*configs.Device, error) {
	entries, err := ioutilReadDir(path)
	if err != nil {
		return nil, err
	}

	found := []*configs.Device{}
	for _, entry := range entries {
		name := entry.Name()
		full := filepath.Join(path, name)

		if entry.IsDir() {
			switch name {
			// ".lxc" & ".lxd-mounts" added to address https://github.com/lxc/lxd/issues/2825
			case "pts", "shm", "fd", "mqueue", ".lxc", ".lxd-mounts":
				continue
			}
			nested, err := getDevices(full)
			if err != nil {
				return nil, err
			}
			found = append(found, nested...)
			continue
		}

		if name == "console" {
			continue
		}

		dev, err := DeviceFromPath(full, "rwm")
		if err != nil {
			if err == ErrNotADevice || os.IsNotExist(err) {
				continue
			}
			return nil, err
		}
		found = append(found, dev)
	}
	return found, nil
}

View file

@ -0,0 +1,63 @@
package devices
import (
"errors"
"os"
"testing"
"golang.org/x/sys/unix"
)
// TestDeviceFromPathLstatFailure checks that DeviceFromPath returns the error
// produced by the lstat call unchanged.
func TestDeviceFromPathLstatFailure(t *testing.T) {
	testError := errors.New("test error")

	// Override unix.Lstat to inject error. The previous implementation is put
	// back when the test finishes so the stub does not leak into other tests
	// in the package.
	origLstat := unixLstat
	defer func() { unixLstat = origLstat }()
	unixLstat = func(path string, stat *unix.Stat_t) error {
		return testError
	}

	_, err := DeviceFromPath("", "")
	if err != testError {
		t.Fatalf("Unexpected error %v, expected %v", err, testError)
	}
}
// TestHostDevicesIoutilReadDirFailure checks that HostDevices returns the
// error produced by the top-level directory listing unchanged.
func TestHostDevicesIoutilReadDirFailure(t *testing.T) {
	testError := errors.New("test error")

	// Override ioutil.ReadDir to inject error. The previous implementation is
	// put back when the test finishes so the stub does not leak into other
	// tests in the package.
	origReadDir := ioutilReadDir
	defer func() { ioutilReadDir = origReadDir }()
	ioutilReadDir = func(dirname string) ([]os.FileInfo, error) {
		return nil, testError
	}

	_, err := HostDevices()
	if err != testError {
		t.Fatalf("Unexpected error %v, expected %v", err, testError)
	}
}
// TestHostDevicesIoutilReadDirDeepFailure checks that an error from a nested
// directory listing is propagated by HostDevices: the first listing succeeds
// and reports a single directory entry, the second one fails.
func TestHostDevicesIoutilReadDirDeepFailure(t *testing.T) {
	testError := errors.New("test error")
	called := false

	// Override ioutil.ReadDir to inject error after the first call. The
	// previous implementation is put back when the test finishes so the stub
	// does not leak into other tests in the package.
	origReadDir := ioutilReadDir
	defer func() { ioutilReadDir = origReadDir }()
	ioutilReadDir = func(dirname string) ([]os.FileInfo, error) {
		if called {
			return nil, testError
		}
		called = true

		// Provoke a second call.
		fi, err := os.Lstat("/tmp")
		if err != nil {
			t.Fatalf("Unexpected error %v", err)
		}

		return []os.FileInfo{fi}, nil
	}

	_, err := HostDevices()
	if err != testError {
		t.Fatalf("Unexpected error %v, expected %v", err, testError)
	}
}

View file

@ -11,6 +11,7 @@ import (
"runtime/debug"
"strconv"
"github.com/cyphar/filepath-securejoin"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
@ -59,9 +60,9 @@ func SystemdCgroups(l *LinuxFactory) error {
return nil
}
// Cgroupfs is an options func to configure a LinuxFactory to return
// containers that use the native cgroups filesystem implementation to
// create and manage cgroups.
// Cgroupfs is an options func to configure a LinuxFactory to return containers
// that use the native cgroups filesystem implementation to create and manage
// cgroups.
func Cgroupfs(l *LinuxFactory) error {
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return &fs.Manager{
@ -72,6 +73,23 @@ func Cgroupfs(l *LinuxFactory) error {
return nil
}
// RootlessCgroupfs is an options func that makes a LinuxFactory hand out
// containers managed through the native cgroups filesystem implementation,
// exactly like Cgroupfs, except that the created fs.Manager has Rootless set.
// That lets the manager transparently handle permission errors that occur
// during rootless container setup (while still allowing cgroup usage if
// they've been set up properly).
func RootlessCgroupfs(l *LinuxFactory) error {
	newRootlessManager := func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
		mgr := &fs.Manager{
			Cgroups:  config,
			Rootless: true,
			Paths:    paths,
		}
		return mgr
	}
	l.NewCgroupsManager = newRootlessManager
	return nil
}
// IntelRdtfs is an options func to configure a LinuxFactory to return
// containers that use the Intel RDT "resource control" filesystem to
// create and manage Intel Xeon platform shared resources (e.g., L3 cache).
@ -178,7 +196,10 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid)
}
containerRoot := filepath.Join(l.Root, id)
containerRoot, err := securejoin.SecureJoin(l.Root, id)
if err != nil {
return nil, err
}
if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
} else if !os.IsNotExist(err) {
@ -212,7 +233,14 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
if l.Root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
}
containerRoot := filepath.Join(l.Root, id)
//when load, we need to check id is valid or not.
if err := l.validateID(id); err != nil {
return nil, err
}
containerRoot, err := securejoin.SecureJoin(l.Root, id)
if err != nil {
return nil, err
}
state, err := l.loadState(containerRoot, id)
if err != nil {
return nil, err
@ -322,7 +350,11 @@ func (l *LinuxFactory) StartInitialization() (err error) {
}
func (l *LinuxFactory) loadState(root, id string) (*State, error) {
f, err := os.Open(filepath.Join(root, stateFilename))
stateFilePath, err := securejoin.SecureJoin(root, stateFilename)
if err != nil {
return nil, err
}
f, err := os.Open(stateFilePath)
if err != nil {
if os.IsNotExist(err) {
return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists)
@ -338,7 +370,7 @@ func (l *LinuxFactory) loadState(root, id string) (*State, error) {
}
func (l *LinuxFactory) validateID(id string) error {
if !idRegex.MatchString(id) {
if !idRegex.MatchString(id) || string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) {
return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat)
}

View file

@ -20,6 +20,7 @@ import (
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/user"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
)
@ -121,7 +122,7 @@ func finalizeNamespace(config *initConfig) error {
// inherited are marked close-on-exec so they stay out of the
// container
if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
return err
return errors.Wrap(err, "close exec fds")
}
capabilities := &configs.Capabilities{}
@ -136,20 +137,20 @@ func finalizeNamespace(config *initConfig) error {
}
// drop capabilities in bounding set before changing user
if err := w.ApplyBoundingSet(); err != nil {
return err
return errors.Wrap(err, "apply bounding set")
}
// preserve existing capabilities while we change users
if err := system.SetKeepCaps(); err != nil {
return err
return errors.Wrap(err, "set keep caps")
}
if err := setupUser(config); err != nil {
return err
return errors.Wrap(err, "setup user")
}
if err := system.ClearKeepCaps(); err != nil {
return err
return errors.Wrap(err, "clear keep caps")
}
if err := w.ApplyCaps(); err != nil {
return err
return errors.Wrap(err, "apply caps")
}
if config.Cwd != "" {
if err := unix.Chdir(config.Cwd); err != nil {

View file

@ -0,0 +1,257 @@
package integration
import (
"bufio"
"bytes"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"strings"
"testing"
"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
// showFile writes the contents of fname to the test log, line by line, framed
// by a header naming the file and an END marker.
//
// It returns the error from opening the file (which is also logged) or from
// scanning it; in both cases the END marker is not written.
func showFile(t *testing.T, fname string) error {
	t.Logf("=== %s ===\n", fname)

	file, openErr := os.Open(fname)
	if openErr != nil {
		t.Log(openErr)
		return openErr
	}
	defer file.Close()

	lines := bufio.NewScanner(file)
	for lines.Scan() {
		t.Log(lines.Text())
	}
	if scanErr := lines.Err(); scanErr != nil {
		return scanErr
	}

	t.Logf("=== END ===\n")
	return nil
}
// TestUsernsCheckpoint runs the checkpoint/restore scenario with a user
// namespace. It is skipped when /proc/self/ns/user does not exist or when
// `criu check --feature userns` fails.
func TestUsernsCheckpoint(t *testing.T) {
	_, statErr := os.Stat("/proc/self/ns/user")
	if os.IsNotExist(statErr) {
		t.Skip("userns is unsupported")
	}

	criuCheck := exec.Command("criu", "check", "--feature", "userns")
	if runErr := criuCheck.Run(); runErr != nil {
		t.Skip("Unable to c/r a container with userns")
	}

	testCheckpoint(t, true)
}
// TestCheckpoint runs the checkpoint/restore scenario without a user
// namespace.
func TestCheckpoint(t *testing.T) {
	const userns = false
	testCheckpoint(t, userns)
}
// testCheckpoint exercises a pre-dump, a full checkpoint and a restore of a
// container running `cat`, and verifies that data written to the restored
// process' stdin shows up on its stdout.
//
// When userns is true the container is additionally configured with a user
// namespace and identity UID/GID mappings of size 1000. The test does nothing
// in -short mode.
func testCheckpoint(t *testing.T, userns bool) {
	if testing.Short() {
		return
	}
	root, err := newTestRoot()
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(root)

	rootfs, err := newRootfs()
	if err != nil {
		t.Fatal(err)
	}
	defer remove(rootfs)

	config := newTemplateConfig(rootfs)

	config.Mounts = append(config.Mounts, &configs.Mount{
		Destination: "/sys/fs/cgroup",
		Device:      "cgroup",
		Flags:       defaultMountFlags | unix.MS_RDONLY,
	})

	if userns {
		config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
		config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
		config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
	}

	factory, err := libcontainer.New(root, libcontainer.Cgroupfs)
	if err != nil {
		t.Fatal(err)
	}

	container, err := factory.Create("test", config)
	if err != nil {
		t.Fatal(err)
	}
	defer container.Destroy()

	stdinR, stdinW, err := os.Pipe()
	if err != nil {
		t.Fatal(err)
	}

	var stdout bytes.Buffer

	pconfig := libcontainer.Process{
		Cwd:    "/",
		Args:   []string{"cat"},
		Env:    standardEnvironment,
		Stdin:  stdinR,
		Stdout: &stdout,
		Init:   true,
	}

	err = container.Run(&pconfig)
	// The read end now belongs to the container process; the write end is
	// kept open so that `cat` keeps running until it is closed below.
	stdinR.Close()
	defer stdinW.Close()
	if err != nil {
		t.Fatal(err)
	}

	pid, err := pconfig.Pid()
	if err != nil {
		t.Fatal(err)
	}

	process, err := os.FindProcess(pid)
	if err != nil {
		t.Fatal(err)
	}

	// Pre-dump into its own directory; the container must keep running.
	parentDir, err := ioutil.TempDir("", "criu-parent")
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(parentDir)

	preDumpOpts := &libcontainer.CriuOpts{
		ImagesDirectory: parentDir,
		WorkDirectory:   parentDir,
		PreDump:         true,
	}
	preDumpLog := filepath.Join(preDumpOpts.WorkDirectory, "dump.log")

	if err := container.Checkpoint(preDumpOpts); err != nil {
		showFile(t, preDumpLog)
		t.Fatal(err)
	}

	state, err := container.Status()
	if err != nil {
		t.Fatal(err)
	}

	if state != libcontainer.Running {
		t.Fatal("Unexpected preDump state: ", state)
	}

	imagesDir, err := ioutil.TempDir("", "criu")
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(imagesDir)

	// ioutil.TempDir appends a random suffix to the "criu-parent" prefix, so
	// a fixed "../criu-parent" never names the pre-dump directory. Derive the
	// path of the pre-dump directory relative to the images directory instead.
	// NOTE(review): this assumes CRIU resolves the parent image path relative
	// to the images directory — confirm against the CRIU documentation.
	parentImage, err := filepath.Rel(imagesDir, parentDir)
	if err != nil {
		t.Fatal(err)
	}

	checkpointOpts := &libcontainer.CriuOpts{
		ImagesDirectory: imagesDir,
		WorkDirectory:   imagesDir,
		ParentImage:     parentImage,
	}
	dumpLog := filepath.Join(checkpointOpts.WorkDirectory, "dump.log")
	restoreLog := filepath.Join(checkpointOpts.WorkDirectory, "restore.log")

	if err := container.Checkpoint(checkpointOpts); err != nil {
		showFile(t, dumpLog)
		t.Fatal(err)
	}

	state, err = container.Status()
	if err != nil {
		t.Fatal(err)
	}

	if state != libcontainer.Stopped {
		t.Fatal("Unexpected state checkpoint: ", state)
	}

	stdinW.Close()
	_, err = process.Wait()
	if err != nil {
		t.Fatal(err)
	}

	// reload the container
	container, err = factory.Load("test")
	if err != nil {
		t.Fatal(err)
	}

	restoreStdinR, restoreStdinW, err := os.Pipe()
	if err != nil {
		t.Fatal(err)
	}

	restoreProcessConfig := &libcontainer.Process{
		Cwd:    "/",
		Stdin:  restoreStdinR,
		Stdout: &stdout,
		Init:   true,
	}

	err = container.Restore(restoreProcessConfig, checkpointOpts)
	restoreStdinR.Close()
	defer restoreStdinW.Close()
	if err != nil {
		showFile(t, restoreLog)
		t.Fatal(err)
	}

	state, err = container.Status()
	if err != nil {
		t.Fatal(err)
	}
	if state != libcontainer.Running {
		t.Fatal("Unexpected restore state: ", state)
	}

	pid, err = restoreProcessConfig.Pid()
	if err != nil {
		t.Fatal(err)
	}

	process, err = os.FindProcess(pid)
	if err != nil {
		t.Fatal(err)
	}

	_, err = restoreStdinW.WriteString("Hello!")
	if err != nil {
		t.Fatal(err)
	}

	// Closing stdin lets the restored `cat` see EOF and exit.
	restoreStdinW.Close()
	s, err := process.Wait()
	if err != nil {
		t.Fatal(err)
	}

	if !s.Success() {
		t.Fatal(s.String(), pid)
	}

	output := string(stdout.Bytes())
	if !strings.Contains(output, "Hello!") {
		t.Fatal("Did not restore the pipe correctly:", output)
	}
}

View file

@ -0,0 +1,2 @@
// Package integration is used for integration testing of libcontainer.
package integration

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,608 @@
package integration
import (
"bytes"
"fmt"
"io"
"os"
"strconv"
"strings"
"testing"
"time"
"github.com/containerd/console"
"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
"golang.org/x/sys/unix"
)
// TestExecIn starts `cat` as the container's init process, runs `ps` as an
// additional process, and checks that the `ps` output lists both commands and
// contains no carriage returns. The test does nothing in -short mode.
func TestExecIn(t *testing.T) {
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)

	container, err := newContainer(newTemplateConfig(rootfs))
	ok(t, err)
	defer container.Destroy()

	// Execute a first process in the container
	initStdinR, initStdinW, err := os.Pipe()
	ok(t, err)
	initProc := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: initStdinR,
		Init:  true,
	}
	err = container.Run(initProc)
	initStdinR.Close()
	defer initStdinW.Close()
	ok(t, err)

	bufs := newStdBuffers()
	psProc := &libcontainer.Process{
		Cwd:    "/",
		Args:   []string{"ps"},
		Env:    standardEnvironment,
		Stdin:  bufs.Stdin,
		Stdout: bufs.Stdout,
		Stderr: bufs.Stderr,
	}
	err = container.Run(psProc)
	ok(t, err)
	waitProcess(psProc, t)

	// Closing stdin makes `cat` exit so that it can be reaped.
	initStdinW.Close()
	waitProcess(initProc, t)

	listing := bufs.Stdout.String()
	sawCat := strings.Contains(listing, "cat")
	sawPs := strings.Contains(listing, "ps")
	if !(sawCat && sawPs) {
		t.Fatalf("unexpected running process, output %q", listing)
	}
	if strings.Contains(listing, "\r") {
		t.Fatalf("unexpected carriage-return in output")
	}
}
// TestExecInUsernsRlimit runs the per-process rlimit scenario inside a user
// namespace. It is skipped when /proc/self/ns/user does not exist.
func TestExecInUsernsRlimit(t *testing.T) {
	_, statErr := os.Stat("/proc/self/ns/user")
	if os.IsNotExist(statErr) {
		t.Skip("userns is unsupported")
	}

	testExecInRlimit(t, true)
}
// TestExecInRlimit runs the per-process rlimit scenario without a user
// namespace.
func TestExecInRlimit(t *testing.T) {
	const userns = false
	testExecInRlimit(t, userns)
}
// testExecInRlimit starts `cat` in a container, then runs `ulimit -n` as a
// second process configured with an RLIMIT_NOFILE of 1026 and checks that the
// reported limit is exactly that value.
//
// When userns is true the container is additionally configured with a user
// namespace and identity UID/GID mappings of size 1000. The test does nothing
// in -short mode.
func testExecInRlimit(t *testing.T, userns bool) {
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)

	config := newTemplateConfig(rootfs)
	if userns {
		identity := configs.IDMap{HostID: 0, ContainerID: 0, Size: 1000}
		config.UidMappings = []configs.IDMap{identity}
		config.GidMappings = []configs.IDMap{identity}
		config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
	}

	container, err := newContainer(config)
	ok(t, err)
	defer container.Destroy()

	initStdinR, initStdinW, err := os.Pipe()
	ok(t, err)
	initProc := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: initStdinR,
		Init:  true,
	}
	err = container.Run(initProc)
	initStdinR.Close()
	defer initStdinW.Close()
	ok(t, err)

	bufs := newStdBuffers()
	ulimitProc := &libcontainer.Process{
		Cwd:    "/",
		Args:   []string{"/bin/sh", "-c", "ulimit -n"},
		Env:    standardEnvironment,
		Stdin:  bufs.Stdin,
		Stdout: bufs.Stdout,
		Stderr: bufs.Stderr,
		Rlimits: []configs.Rlimit{
			// increase process rlimit higher than container rlimit to test per-process limit
			{Type: unix.RLIMIT_NOFILE, Hard: 1026, Soft: 1026},
		},
		Init: true,
	}
	err = container.Run(ulimitProc)
	ok(t, err)
	waitProcess(ulimitProc, t)

	initStdinW.Close()
	waitProcess(initProc, t)

	limit := strings.TrimSpace(bufs.Stdout.String())
	if limit != "1026" {
		t.Fatalf("expected rlimit to be 1026, got %s", limit)
	}
}
// TestExecInAdditionalGroups starts `cat` in a container, runs `id` as a
// second process configured with the additional groups "plugdev" and "audio",
// and checks that both group names appear in its output. The test does
// nothing in -short mode.
func TestExecInAdditionalGroups(t *testing.T) {
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)

	container, err := newContainer(newTemplateConfig(rootfs))
	ok(t, err)
	defer container.Destroy()

	// Execute a first process in the container
	initStdinR, initStdinW, err := os.Pipe()
	ok(t, err)
	initProc := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: initStdinR,
		Init:  true,
	}
	err = container.Run(initProc)
	initStdinR.Close()
	defer initStdinW.Close()
	ok(t, err)

	var groupsOut bytes.Buffer
	idProc := libcontainer.Process{
		Cwd: "/",
		// NOTE(review): with `sh -c`, the word after the command string is
		// taken as $0, so "-Gn" is presumably not passed to `id` and plain
		// `id` runs — confirm whether "id -Gn" was intended.
		Args:             []string{"sh", "-c", "id", "-Gn"},
		Env:              standardEnvironment,
		Stdin:            nil,
		Stdout:           &groupsOut,
		AdditionalGroups: []string{"plugdev", "audio"},
	}
	err = container.Run(&idProc)
	ok(t, err)

	// Wait for process
	waitProcess(&idProc, t)

	initStdinW.Close()
	waitProcess(initProc, t)

	outputGroups := groupsOut.String()

	// Check that the groups output has the groups that we specified
	if !strings.Contains(outputGroups, "audio") {
		t.Fatalf("Listed groups do not contain the audio group as expected: %v", outputGroups)
	}
	if !strings.Contains(outputGroups, "plugdev") {
		t.Fatalf("Listed groups do not contain the plugdev group as expected: %v", outputGroups)
	}
}
// TestExecInError starts `cat` in a container and then repeatedly (42 times)
// tries to run a non-existent executable, checking each time that Run fails
// with an "executable file not found" error and that the same message was
// written to the process' stderr. The test does nothing in -short mode.
func TestExecInError(t *testing.T) {
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)

	container, err := newContainer(newTemplateConfig(rootfs))
	ok(t, err)
	defer container.Destroy()

	// Execute a first process in the container
	initStdinR, initStdinW, err := os.Pipe()
	ok(t, err)
	initProc := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: initStdinR,
		Init:  true,
	}
	err = container.Run(initProc)
	initStdinR.Close()
	// On exit, let `cat` see EOF and reap it; a wait failure is only logged.
	defer func() {
		initStdinW.Close()
		if _, waitErr := initProc.Wait(); waitErr != nil {
			t.Log(waitErr)
		}
	}()
	ok(t, err)

	const (
		attempts = 42
		notFound = "executable file not found"
	)
	for attempt := 0; attempt < attempts; attempt++ {
		var stderr bytes.Buffer
		missing := &libcontainer.Process{
			Cwd:    "/",
			Args:   []string{"unexistent"},
			Env:    standardEnvironment,
			Stderr: &stderr,
		}
		err = container.Run(missing)
		if err == nil {
			t.Fatal("Should be an error")
		}
		if !strings.Contains(err.Error(), notFound) {
			t.Fatalf("Should be error about not found executable, got %s", err)
		}
		if !bytes.Contains(stderr.Bytes(), []byte(notFound)) {
			t.Fatalf("executable file not found error not delivered to stdio:\n%s", stderr.String())
		}
	}
}
// TestExecInTTY starts `cat` in a container and runs `ps` as a second process
// whose console is handed back over a socket pair. The console output is
// copied into a buffer (with a 5 second timeout) and must list both commands
// and contain no carriage returns. The test does nothing in -short mode.
func TestExecInTTY(t *testing.T) {
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)

	container, err := newContainer(newTemplateConfig(rootfs))
	ok(t, err)
	defer container.Destroy()

	// Execute a first process in the container
	initStdinR, initStdinW, err := os.Pipe()
	ok(t, err)
	initProc := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: initStdinR,
		Init:  true,
	}
	err = container.Run(initProc)
	initStdinR.Close()
	defer initStdinW.Close()
	ok(t, err)

	var ttyOut bytes.Buffer
	psProc := &libcontainer.Process{
		Cwd:  "/",
		Args: []string{"ps"},
		Env:  standardEnvironment,
	}

	parent, child, err := utils.NewSockPair("console")
	if err != nil {
		ok(t, err)
	}
	defer parent.Close()
	defer child.Close()
	psProc.ConsoleSocket = child

	// consoleResult carries either the received console or the error that
	// prevented obtaining it.
	type consoleResult struct {
		c   console.Console
		err error
	}
	results := make(chan *consoleResult, 1)
	go func() {
		// Receive the console file descriptor from the parent end of the pair.
		f, recvErr := utils.RecvFd(parent)
		if recvErr != nil {
			results <- &consoleResult{err: recvErr}
			return
		}
		c, convErr := console.ConsoleFromFile(f)
		if convErr != nil {
			results <- &consoleResult{err: convErr}
			return
		}
		console.ClearONLCR(c.Fd())
		results <- &consoleResult{c: c}
	}()

	err = container.Run(psProc)
	ok(t, err)

	received := <-results
	if received.err != nil {
		ok(t, received.err)
	}
	pty := received.c

	copied := make(chan struct{})
	go func() {
		io.Copy(&ttyOut, pty)
		close(copied)
	}()
	ok(t, err)

	select {
	case <-time.After(5 * time.Second):
		t.Fatal("Waiting for copy timed out")
	case <-copied:
	}
	waitProcess(psProc, t)

	initStdinW.Close()
	waitProcess(initProc, t)

	listing := ttyOut.String()
	if !strings.Contains(listing, "cat") || !strings.Contains(listing, "ps") {
		t.Fatalf("unexpected running process, output %q", listing)
	}
	if strings.Contains(listing, "\r") {
		t.Fatalf("unexpected carriage-return in output")
	}
}
// TestExecInEnvironment starts `cat` in a container and runs `env` as a second
// process with an environment that sets DEBUG twice. The output must contain
// DEBUG=false, ENV=test, HOME=/root and the configured PATH, and must not
// contain DEBUG=true. The test does nothing in -short mode.
func TestExecInEnvironment(t *testing.T) {
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)

	container, err := newContainer(newTemplateConfig(rootfs))
	ok(t, err)
	defer container.Destroy()

	// Execute a first process in the container
	initStdinR, initStdinW, err := os.Pipe()
	ok(t, err)
	initProc := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: initStdinR,
		Init:  true,
	}
	err = container.Run(initProc)
	initStdinR.Close()
	defer initStdinW.Close()
	ok(t, err)

	bufs := newStdBuffers()
	envProc := &libcontainer.Process{
		Cwd:  "/",
		Args: []string{"env"},
		Env: []string{
			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
			"DEBUG=true",
			"DEBUG=false",
			"ENV=test",
		},
		Stdin:  bufs.Stdin,
		Stdout: bufs.Stdout,
		Stderr: bufs.Stderr,
		Init:   true,
	}
	err = container.Run(envProc)
	ok(t, err)
	waitProcess(envProc, t)

	initStdinW.Close()
	waitProcess(initProc, t)

	// check execin's process environment
	envDump := bufs.Stdout.String()
	hasAllExpected := strings.Contains(envDump, "DEBUG=false") &&
		strings.Contains(envDump, "ENV=test") &&
		strings.Contains(envDump, "HOME=/root") &&
		strings.Contains(envDump, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin")
	if !hasAllExpected || strings.Contains(envDump, "DEBUG=true") {
		t.Fatalf("unexpected running process, output %q", envDump)
	}
}
// TestExecinPassExtraFiles starts `cat` in a container and runs a shell as a
// second process with two extra pipe write ends attached. The shell lists its
// open file descriptors and writes "1" and "2" to descriptors 3 and 4; the
// test checks the descriptor listing and that each pipe received its byte.
// The test does nothing in -short mode.
func TestExecinPassExtraFiles(t *testing.T) {
	if testing.Short() {
		return
	}
	rootfs, err := newRootfs()
	if err != nil {
		t.Fatal(err)
	}
	defer remove(rootfs)
	config := newTemplateConfig(rootfs)
	container, err := newContainer(config)
	if err != nil {
		t.Fatal(err)
	}
	defer container.Destroy()

	// Execute a first process in the container
	stdinR, stdinW, err := os.Pipe()
	if err != nil {
		t.Fatal(err)
	}
	process := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: stdinR,
		Init:  true,
	}
	err = container.Run(process)
	stdinR.Close()
	defer stdinW.Close()
	if err != nil {
		t.Fatal(err)
	}

	var stdout bytes.Buffer
	pipeout1, pipein1, err := os.Pipe()
	if err != nil {
		t.Fatal(err)
	}
	// Release both ends on every exit path so the descriptors do not leak
	// into the rest of the test binary.
	defer pipeout1.Close()
	defer pipein1.Close()
	pipeout2, pipein2, err := os.Pipe()
	if err != nil {
		t.Fatal(err)
	}
	defer pipeout2.Close()
	defer pipein2.Close()
	inprocess := &libcontainer.Process{
		Cwd:        "/",
		Args:       []string{"sh", "-c", "cd /proc/$$/fd; echo -n *; echo -n 1 >3; echo -n 2 >4"},
		Env:        []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"},
		ExtraFiles: []*os.File{pipein1, pipein2},
		Stdin:      nil,
		Stdout:     &stdout,
	}
	err = container.Run(inprocess)
	if err != nil {
		t.Fatal(err)
	}

	waitProcess(inprocess, t)
	stdinW.Close()
	waitProcess(process, t)

	// The container processes are done with the pipes. Drop our copies of the
	// write ends so that the reads below report EOF instead of blocking
	// forever if the shell never wrote to them.
	pipein1.Close()
	pipein2.Close()

	out := stdout.String()
	// fd 5 is the directory handle for /proc/$$/fd
	if out != "0 1 2 3 4 5" {
		t.Fatalf("expected to have the file descriptors '0 1 2 3 4 5' passed to exec, got '%s'", out)
	}
	var buf = []byte{0}
	_, err = pipeout1.Read(buf)
	if err != nil {
		t.Fatal(err)
	}
	out1 := string(buf)
	if out1 != "1" {
		t.Fatalf("expected first pipe to receive '1', got '%s'", out1)
	}

	_, err = pipeout2.Read(buf)
	if err != nil {
		t.Fatal(err)
	}
	out2 := string(buf)
	if out2 != "2" {
		t.Fatalf("expected second pipe to receive '2', got '%s'", out2)
	}
}
// TestExecInOomScoreAdj configures the container with an OomScoreAdj of 200,
// starts `cat`, and checks that a second process reading
// /proc/self/oom_score_adj reports the configured value. The test does
// nothing in -short mode.
func TestExecInOomScoreAdj(t *testing.T) {
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)

	config := newTemplateConfig(rootfs)
	config.OomScoreAdj = ptrInt(200)

	container, err := newContainer(config)
	ok(t, err)
	defer container.Destroy()

	initStdinR, initStdinW, err := os.Pipe()
	ok(t, err)
	initProc := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: initStdinR,
		Init:  true,
	}
	err = container.Run(initProc)
	initStdinR.Close()
	defer initStdinW.Close()
	ok(t, err)

	bufs := newStdBuffers()
	readerProc := &libcontainer.Process{
		Cwd:    "/",
		Args:   []string{"/bin/sh", "-c", "cat /proc/self/oom_score_adj"},
		Env:    standardEnvironment,
		Stdin:  bufs.Stdin,
		Stdout: bufs.Stdout,
		Stderr: bufs.Stderr,
	}
	err = container.Run(readerProc)
	ok(t, err)
	waitProcess(readerProc, t)

	initStdinW.Close()
	waitProcess(initProc, t)

	oomScoreAdj := strings.TrimSpace(bufs.Stdout.String())
	if oomScoreAdj != strconv.Itoa(*config.OomScoreAdj) {
		t.Fatalf("expected oomScoreAdj to be %d, got %s", *config.OomScoreAdj, oomScoreAdj)
	}
}
// TestExecInUserns starts `cat` in a container that has its own user
// namespace and checks that a second process sees the same
// /proc/<pid>/ns/user link target as the init process. It is skipped when
// /proc/self/ns/user does not exist and does nothing in -short mode.
func TestExecInUserns(t *testing.T) {
	_, statErr := os.Stat("/proc/self/ns/user")
	if os.IsNotExist(statErr) {
		t.Skip("userns is unsupported")
	}
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)

	config := newTemplateConfig(rootfs)
	identity := configs.IDMap{HostID: 0, ContainerID: 0, Size: 1000}
	config.UidMappings = []configs.IDMap{identity}
	config.GidMappings = []configs.IDMap{identity}
	config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})

	container, err := newContainer(config)
	ok(t, err)
	defer container.Destroy()

	// Execute a first process in the container
	initStdinR, initStdinW, err := os.Pipe()
	ok(t, err)
	initProc := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: initStdinR,
		Init:  true,
	}
	err = container.Run(initProc)
	initStdinR.Close()
	defer initStdinW.Close()
	ok(t, err)

	// Record the user namespace of the init process as seen from the host.
	initPID, err := initProc.Pid()
	ok(t, err)
	initUserns, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/user", initPID))
	ok(t, err)

	bufs := newStdBuffers()
	readlinkProc := &libcontainer.Process{
		Cwd:  "/",
		Args: []string{"readlink", "/proc/self/ns/user"},
		Env: []string{
			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
		},
		Stdout: bufs.Stdout,
		Stderr: os.Stderr,
	}
	err = container.Run(readlinkProc)
	ok(t, err)
	waitProcess(readlinkProc, t)

	initStdinW.Close()
	waitProcess(initProc, t)

	execUserns := strings.TrimSpace(bufs.Stdout.String())
	if execUserns != initUserns {
		t.Errorf("execin userns(%s), wanted %s", execUserns, initUserns)
	}
}

View file

@ -0,0 +1,61 @@
package integration
import (
"os"
"runtime"
"testing"
"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
_ "github.com/opencontainers/runc/libcontainer/nsenter"
"github.com/sirupsen/logrus"
)
// init runs the libcontainer initialization code when the test binary is
// re-executed with "init" as its first argument. This busybox-style dispatch
// is needed to work around the go runtime and the issues with forking. For
// any other invocation it does nothing.
func init() {
	reexecAsInit := len(os.Args) >= 2 && os.Args[1] == "init"
	if !reexecAsInit {
		return
	}

	runtime.GOMAXPROCS(1)
	runtime.LockOSThread()

	initFactory, err := libcontainer.New("")
	if err != nil {
		logrus.Fatalf("unable to initialize for container: %s", err)
	}
	if initErr := initFactory.StartInitialization(); initErr != nil {
		logrus.Fatal(initErr)
	}
}
// Factories shared by the tests in this package; both are assigned in
// TestMain.
var (
// factory is created with the libcontainer.Cgroupfs option.
factory libcontainer.Factory
// systemdFactory is created with the libcontainer.SystemdCgroups option and
// is only set when systemd.UseSystemd() reports true.
systemdFactory libcontainer.Factory
)
// TestMain configures logging, creates the package-level factories rooted at
// /run/libctTests, runs the tests and exits with their result. It exits with
// status 1 if a factory cannot be created.
func TestMain(m *testing.M) {
	const testRoot = "/run/libctTests"

	logrus.SetOutput(os.Stderr)
	logrus.SetLevel(logrus.InfoLevel)

	// Plain assignment (not :=) so that the package-level variables are set.
	var err error
	factory, err = libcontainer.New(testRoot, libcontainer.Cgroupfs)
	if err != nil {
		logrus.Error(err)
		os.Exit(1)
	}

	if systemd.UseSystemd() {
		systemdFactory, err = libcontainer.New(testRoot, libcontainer.SystemdCgroups)
		if err != nil {
			logrus.Error(err)
			os.Exit(1)
		}
	}

	os.Exit(m.Run())
}

View file

@ -0,0 +1,422 @@
// +build linux,cgo,seccomp
package integration
import (
"strings"
"syscall"
"testing"
"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/configs"
libseccomp "github.com/seccomp/libseccomp-golang"
)
// TestSeccompDenyGetcwd runs `pwd` in a container whose seccomp profile makes
// the getcwd syscall return an error, and checks that the process fails with
// a non-zero exit code and the expected message on stderr. The test does
// nothing in -short mode.
func TestSeccompDenyGetcwd(t *testing.T) {
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	if err != nil {
		t.Fatal(err)
	}
	defer remove(rootfs)

	denyGetcwd := &configs.Syscall{
		Name:   "getcwd",
		Action: configs.Errno,
	}
	config := newTemplateConfig(rootfs)
	config.Seccomp = &configs.Seccomp{
		DefaultAction: configs.Allow,
		Syscalls:      []*configs.Syscall{denyGetcwd},
	}

	container, err := newContainer(config)
	if err != nil {
		t.Fatal(err)
	}
	defer container.Destroy()

	bufs := newStdBuffers()
	pwd := &libcontainer.Process{
		Cwd:    "/",
		Args:   []string{"pwd"},
		Env:    standardEnvironment,
		Stdin:  bufs.Stdin,
		Stdout: bufs.Stdout,
		Stderr: bufs.Stderr,
		Init:   true,
	}

	if runErr := container.Run(pwd); runErr != nil {
		t.Fatal(runErr)
	}
	ps, err := pwd.Wait()
	if err == nil {
		t.Fatal("Expecting error (negative return code); instead exited cleanly!")
	}

	// Derive an exit code: the exit status, or the negated signal number.
	var exitCode int
	status := ps.Sys().(syscall.WaitStatus)
	switch {
	case status.Exited():
		exitCode = status.ExitStatus()
	case status.Signaled():
		exitCode = -int(status.Signal())
	default:
		t.Fatalf("Unrecognized exit reason!")
	}

	if exitCode == 0 {
		t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
	}

	expected := "pwd: getcwd: Operation not permitted"
	actual := strings.Trim(bufs.Stderr.String(), "\n")
	if actual != expected {
		t.Fatalf("Expected output %s but got %s\n", expected, actual)
	}
}
// TestSeccompPermitWriteConditional runs `busybox ls /` in a container whose
// seccomp profile fails write calls only when their first argument equals 2,
// and checks that the process still completes successfully. The test does
// nothing in -short mode.
func TestSeccompPermitWriteConditional(t *testing.T) {
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	if err != nil {
		t.Fatal(err)
	}
	defer remove(rootfs)

	denyWriteToFd2 := &configs.Syscall{
		Name:   "write",
		Action: configs.Errno,
		Args: []*configs.Arg{
			{
				Index: 0,
				Value: 2,
				Op:    configs.EqualTo,
			},
		},
	}
	config := newTemplateConfig(rootfs)
	config.Seccomp = &configs.Seccomp{
		DefaultAction: configs.Allow,
		Syscalls:      []*configs.Syscall{denyWriteToFd2},
	}

	container, err := newContainer(config)
	if err != nil {
		t.Fatal(err)
	}
	defer container.Destroy()

	bufs := newStdBuffers()
	lsProc := &libcontainer.Process{
		Cwd:    "/",
		Args:   []string{"busybox", "ls", "/"},
		Env:    standardEnvironment,
		Stdin:  bufs.Stdin,
		Stdout: bufs.Stdout,
		Stderr: bufs.Stderr,
		Init:   true,
	}

	if runErr := container.Run(lsProc); runErr != nil {
		t.Fatal(runErr)
	}
	_, waitErr := lsProc.Wait()
	if waitErr != nil {
		t.Fatalf("%s: %s", waitErr, bufs.Stderr)
	}
}
// TestSeccompDenyWriteConditional runs `busybox ls does_not_exist` in a
// container whose seccomp profile fails write calls whose first argument
// equals 2. The process must fail with a non-zero exit code and stderr must
// stay empty. The test does nothing in -short mode or with a libseccomp 2.x
// release older than 2.2.1.
func TestSeccompDenyWriteConditional(t *testing.T) {
	if testing.Short() {
		return
	}

	// Only test if library version is v2.2.1 or higher
	// Conditional filtering will always error in v2.2.0 and lower
	major, minor, micro := libseccomp.GetLibraryVersion()
	if major == 2 && (minor < 2 || (minor == 2 && micro < 1)) {
		return
	}

	rootfs, err := newRootfs()
	if err != nil {
		t.Fatal(err)
	}
	defer remove(rootfs)

	denyWriteToFd2 := &configs.Syscall{
		Name:   "write",
		Action: configs.Errno,
		Args: []*configs.Arg{
			{
				Index: 0,
				Value: 2,
				Op:    configs.EqualTo,
			},
		},
	}
	config := newTemplateConfig(rootfs)
	config.Seccomp = &configs.Seccomp{
		DefaultAction: configs.Allow,
		Syscalls:      []*configs.Syscall{denyWriteToFd2},
	}

	container, err := newContainer(config)
	if err != nil {
		t.Fatal(err)
	}
	defer container.Destroy()

	bufs := newStdBuffers()
	lsProc := &libcontainer.Process{
		Cwd:    "/",
		Args:   []string{"busybox", "ls", "does_not_exist"},
		Env:    standardEnvironment,
		Stdin:  bufs.Stdin,
		Stdout: bufs.Stdout,
		Stderr: bufs.Stderr,
		Init:   true,
	}

	if runErr := container.Run(lsProc); runErr != nil {
		t.Fatal(runErr)
	}

	ps, err := lsProc.Wait()
	if err == nil {
		t.Fatal("Expecting negative return, instead got 0!")
	}

	// Derive an exit code: the exit status, or the negated signal number.
	var exitCode int
	status := ps.Sys().(syscall.WaitStatus)
	switch {
	case status.Exited():
		exitCode = status.ExitStatus()
	case status.Signaled():
		exitCode = -int(status.Signal())
	default:
		t.Fatalf("Unrecognized exit reason!")
	}

	if exitCode == 0 {
		t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode)
	}

	// We're denying write to stderr, so we expect an empty buffer
	expected := ""
	actual := strings.Trim(bufs.Stderr.String(), "\n")
	if actual != expected {
		t.Fatalf("Expected output %s but got %s\n", expected, actual)
	}
}
// TestSeccompPermitWriteMultipleConditions runs `ls /` under a seccomp rule
// that fails write calls matching two argument conditions (argument 0 equal
// to 2, argument 2 not equal to 0) and checks that the command still exits
// with code 0 and produces output on stdout. The test does nothing in -short
// mode.
func TestSeccompPermitWriteMultipleConditions(t *testing.T) {
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	if err != nil {
		t.Fatal(err)
	}
	defer remove(rootfs)

	conditionalWriteDeny := &configs.Syscall{
		Name:   "write",
		Action: configs.Errno,
		Args: []*configs.Arg{
			{
				Index: 0,
				Value: 2,
				Op:    configs.EqualTo,
			},
			{
				Index: 2,
				Value: 0,
				Op:    configs.NotEqualTo,
			},
		},
	}
	config := newTemplateConfig(rootfs)
	config.Seccomp = &configs.Seccomp{
		DefaultAction: configs.Allow,
		Syscalls:      []*configs.Syscall{conditionalWriteDeny},
	}

	buffers, exitCode, err := runContainer(config, "", "ls", "/")
	if err != nil {
		t.Fatalf("%s: %s", buffers, err)
	}
	if exitCode != 0 {
		t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers)
	}

	// We don't need to verify the actual thing printed
	// Just that something was written to stdout
	if buffers.Stdout.String() == "" {
		t.Fatalf("Nothing was written to stdout, write call failed!\n")
	}
}
// TestSeccompDenyWriteMultipleConditions installs an errno rule for write
// with two argument conditions (arg 0 == 2 and arg 2 != 0), runs `ls` on a
// path that does not exist and expects a failing run whose captured stderr is
// empty.
func TestSeccompDenyWriteMultipleConditions(t *testing.T) {
	if testing.Short() {
		return
	}

	// Only test if library version is v2.2.1 or higher
	// Conditional filtering will always error in v2.2.0 and lower
	// (any major version below 2 is also "lower", so it is skipped as well).
	major, minor, micro := libseccomp.GetLibraryVersion()
	if major < 2 || (major == 2 && (minor < 2 || (minor == 2 && micro < 1))) {
		return
	}

	rootfs, err := newRootfs()
	if err != nil {
		t.Fatal(err)
	}
	defer remove(rootfs)

	config := newTemplateConfig(rootfs)
	config.Seccomp = &configs.Seccomp{
		DefaultAction: configs.Allow,
		Syscalls: []*configs.Syscall{
			{
				Name:   "write",
				Action: configs.Errno,
				Args: []*configs.Arg{
					{
						Index: 0,
						Value: 2,
						Op:    configs.EqualTo,
					},
					{
						Index: 2,
						Value: 0,
						Op:    configs.NotEqualTo,
					},
				},
			},
		},
	}

	buffers, exitCode, err := runContainer(config, "", "ls", "/does_not_exist")
	if err == nil {
		t.Fatalf("Expecting error return, instead got 0")
	}
	if exitCode == 0 {
		t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode)
	}

	// The error message ls would print must not have reached stderr.
	expected := ""
	actual := strings.Trim(buffers.Stderr.String(), "\n")
	if actual != expected {
		t.Fatalf("Expected output %s but got %s\n", expected, actual)
	}
}
// TestSeccompMultipleConditionSameArgDeniesStdout attaches two conditions on
// the same argument (arg 0 == 1 and arg 0 == 2) to an errno rule for write,
// runs `ls /` and expects a zero exit code with nothing captured on stdout.
func TestSeccompMultipleConditionSameArgDeniesStdout(t *testing.T) {
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	if err != nil {
		t.Fatal(err)
	}
	defer remove(rootfs)

	// Prevent writing to both stdout and stderr
	deniedFds := []*configs.Arg{
		{Index: 0, Value: 1, Op: configs.EqualTo},
		{Index: 0, Value: 2, Op: configs.EqualTo},
	}
	writeRule := &configs.Syscall{
		Name:   "write",
		Action: configs.Errno,
		Args:   deniedFds,
	}

	config := newTemplateConfig(rootfs)
	config.Seccomp = &configs.Seccomp{
		DefaultAction: configs.Allow,
		Syscalls:      []*configs.Syscall{writeRule},
	}

	buffers, exitCode, err := runContainer(config, "", "ls", "/")
	switch {
	case err != nil:
		t.Fatalf("%s: %s", buffers, err)
	case exitCode != 0:
		t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers)
	}

	// Verify that nothing was printed
	if buffers.Stdout.String() != "" {
		t.Fatalf("Something was written to stdout, write call succeeded!\n")
	}
}
// TestSeccompMultipleConditionSameArgDeniesStderr attaches two conditions on
// the same argument (arg 0 == 1 and arg 0 == 2) to an errno rule for write,
// runs `ls` on a path that does not exist and expects a failing run with
// nothing captured on stderr.
func TestSeccompMultipleConditionSameArgDeniesStderr(t *testing.T) {
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	if err != nil {
		t.Fatal(err)
	}
	defer remove(rootfs)

	// Prevent writing to both stdout and stderr
	deniedFds := []*configs.Arg{
		{Index: 0, Value: 1, Op: configs.EqualTo},
		{Index: 0, Value: 2, Op: configs.EqualTo},
	}
	writeRule := &configs.Syscall{
		Name:   "write",
		Action: configs.Errno,
		Args:   deniedFds,
	}

	config := newTemplateConfig(rootfs)
	config.Seccomp = &configs.Seccomp{
		DefaultAction: configs.Allow,
		Syscalls:      []*configs.Syscall{writeRule},
	}

	buffers, exitCode, err := runContainer(config, "", "ls", "/does_not_exist")
	if err == nil {
		t.Fatalf("Expecting error return, instead got 0")
	}
	if exitCode == 0 {
		t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode)
	}

	// Verify nothing was printed
	if buffers.Stderr.String() != "" {
		t.Fatalf("Something was written to stderr, write call succeeded!\n")
	}
}

View file

@ -0,0 +1,191 @@
package integration
import (
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
// standardEnvironment is the environment handed (through the Env field) to
// the processes that the integration tests start inside containers.
var standardEnvironment = []string{
"HOME=/root",
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"HOSTNAME=integration",
"TERM=xterm",
}
// defaultMountFlags is the flag combination (MS_NOEXEC, MS_NOSUID, MS_NODEV)
// reused by several mounts of the template configuration.
const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
// newTemplateConfig builds the baseline container configuration used by the
// integration tests, rooted at rootfs.
//
// Networking consists of a single loopback interface and devices use the
// defaults exported by the configs package.
func newTemplateConfig(rootfs string) *configs.Config {
	allowAll := false

	// defaultCaps returns a new slice on every call, so the five capability
	// sets below each own their backing array.
	defaultCaps := func() []string {
		return []string{
			"CAP_CHOWN",
			"CAP_DAC_OVERRIDE",
			"CAP_FSETID",
			"CAP_FOWNER",
			"CAP_MKNOD",
			"CAP_NET_RAW",
			"CAP_SETGID",
			"CAP_SETUID",
			"CAP_SETFCAP",
			"CAP_SETPCAP",
			"CAP_NET_BIND_SERVICE",
			"CAP_SYS_CHROOT",
			"CAP_KILL",
			"CAP_AUDIT_WRITE",
		}
	}

	capabilities := &configs.Capabilities{
		Bounding:    defaultCaps(),
		Permitted:   defaultCaps(),
		Inheritable: defaultCaps(),
		Ambient:     defaultCaps(),
		Effective:   defaultCaps(),
	}

	namespaces := configs.Namespaces([]configs.Namespace{
		{Type: configs.NEWNS},
		{Type: configs.NEWUTS},
		{Type: configs.NEWIPC},
		{Type: configs.NEWPID},
		{Type: configs.NEWNET},
	})

	cgroup := &configs.Cgroup{
		Path: "integration/test",
		Resources: &configs.Resources{
			MemorySwappiness: nil,
			AllowAllDevices:  &allowAll,
			AllowedDevices:   configs.DefaultAllowedDevices,
		},
	}

	mounts := []*configs.Mount{
		{
			Source:      "proc",
			Destination: "/proc",
			Device:      "proc",
			Flags:       defaultMountFlags,
		},
		{
			Source:      "tmpfs",
			Destination: "/dev",
			Device:      "tmpfs",
			Flags:       unix.MS_NOSUID | unix.MS_STRICTATIME,
			Data:        "mode=755",
		},
		{
			Source:      "devpts",
			Destination: "/dev/pts",
			Device:      "devpts",
			Flags:       unix.MS_NOSUID | unix.MS_NOEXEC,
			Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
		},
		{
			Device:      "tmpfs",
			Source:      "shm",
			Destination: "/dev/shm",
			Data:        "mode=1777,size=65536k",
			Flags:       defaultMountFlags,
		},
		/*
			            CI is broken on the debian based kernels with this
						{
							Source:      "mqueue",
							Destination: "/dev/mqueue",
							Device:      "mqueue",
							Flags:       defaultMountFlags,
						},
		*/
		{
			Source:      "sysfs",
			Destination: "/sys",
			Device:      "sysfs",
			Flags:       defaultMountFlags | unix.MS_RDONLY,
		},
	}

	return &configs.Config{
		Rootfs:       rootfs,
		Capabilities: capabilities,
		Namespaces:   namespaces,
		Cgroups:      cgroup,
		MaskPaths: []string{
			"/proc/kcore",
			"/sys/firmware",
		},
		ReadonlyPaths: []string{
			"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
		},
		Devices:  configs.DefaultAutoCreatedDevices,
		Hostname: "integration",
		Mounts:   mounts,
		Networks: []*configs.Network{
			{
				Type:    "loopback",
				Address: "127.0.0.1/0",
				Gateway: "localhost",
			},
		},
		Rlimits: []configs.Rlimit{
			{
				Type: unix.RLIMIT_NOFILE,
				Hard: uint64(1025),
				Soft: uint64(1025),
			},
		},
	}
}

View file

@ -0,0 +1,175 @@
package integration
import (
"bytes"
"crypto/md5"
"encoding/hex"
"fmt"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"syscall"
"testing"
"time"
"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/configs"
)
// ptrInt returns a pointer to a freshly allocated int holding v.
func ptrInt(v int) *int {
	p := new(int)
	*p = v
	return p
}
// stdBuffers groups the in-memory buffers that stand in for a process's
// stdin, stdout and stderr.
type stdBuffers struct {
	Stdin  *bytes.Buffer
	Stdout *bytes.Buffer
	Stderr *bytes.Buffer
}

// newStdBuffers returns a stdBuffers whose three buffers are allocated and
// empty.
func newStdBuffers() *stdBuffers {
	b := new(stdBuffers)
	b.Stdin = new(bytes.Buffer)
	b.Stdout = new(bytes.Buffer)
	b.Stderr = new(bytes.Buffer)
	return b
}

// String renders the captured output as "<stderr>|<stdout>". A nil buffer is
// left out entirely (together with its separator).
func (b *stdBuffers) String() string {
	parts := make([]string, 0, 2)
	for _, stream := range []*bytes.Buffer{b.Stderr, b.Stdout} {
		if stream != nil {
			parts = append(parts, stream.String())
		}
	}
	return strings.Join(parts, "|")
}
// ok aborts the test through t.Fatalf when err is non-nil, prefixing the
// message with the base file name and line of ok's caller.
func ok(t testing.TB, err error) {
	if err == nil {
		return
	}
	_, callerFile, callerLine, _ := runtime.Caller(1)
	t.Fatalf("%s:%d: unexpected error: %s\n\n", filepath.Base(callerFile), callerLine, err.Error())
}
// waitProcess waits for p and aborts the test when waiting fails or when the
// resulting state does not report success. Failure messages carry the base
// file name and line of waitProcess's caller.
func waitProcess(p *libcontainer.Process, t *testing.T) {
	_, callerFile, callerLine, _ := runtime.Caller(1)
	where := filepath.Base(callerFile)

	state, err := p.Wait()
	switch {
	case err != nil:
		t.Fatalf("%s:%d: unexpected error: %s\n\n", where, callerLine, err.Error())
	case !state.Success():
		t.Fatalf("%s:%d: unexpected status: %s\n\n", where, callerLine, state.String())
	}
}
// newTestRoot creates a new temporary directory whose name starts with
// "libcontainer" and returns its path.
//
// ioutil.TempDir already creates the directory, so no additional MkdirAll is
// needed (the former extra call could only fail and leave the directory
// behind). On failure the returned path is empty.
func newTestRoot() (string, error) {
	return ioutil.TempDir("", "libcontainer")
}
// newTestBundle creates a new temporary directory whose name starts with
// "bundle" and returns its path.
//
// ioutil.TempDir already creates the directory, so no additional MkdirAll is
// needed (the former extra call could only fail and leave the directory
// behind). On failure the returned path is empty.
func newTestBundle() (string, error) {
	return ioutil.TempDir("", "bundle")
}
// newRootfs creates a new temporary directory, copies the busybox root
// filesystem into it with copyBusybox and returns the directory path.
//
// If the copy fails the directory is removed again so that a failed call
// leaves nothing behind; the copy error is what gets returned.
func newRootfs() (string, error) {
	// ioutil.TempDir creates the directory itself; no MkdirAll is required.
	dir, err := ioutil.TempDir("", "")
	if err != nil {
		return "", err
	}
	if err := copyBusybox(dir); err != nil {
		// Best-effort cleanup; the copy failure is the error worth reporting.
		_ = os.RemoveAll(dir)
		return "", err
	}
	return dir, nil
}
// remove deletes dir and everything below it. The error reported by
// os.RemoveAll is discarded.
func remove(dir string) {
	_ = os.RemoveAll(dir)
}
// copyBusybox copies the contents of /busybox into dest by running
// `cp -a /busybox/* <dest>/` through `sh -c`. When the command fails, the
// returned error carries both the command error and its combined output.
func copyBusybox(dest string) error {
	script := fmt.Sprintf("cp -a /busybox/* %s/", dest)
	if out, err := exec.Command("sh", "-c", script).CombinedOutput(); err != nil {
		return fmt.Errorf("copy error %q: %q", err, out)
	}
	return nil
}
// newContainer creates a container from config under a generated name: the
// hex-encoded MD5 digest of the current time's string form.
func newContainer(config *configs.Config) (libcontainer.Container, error) {
	digest := md5.Sum([]byte(time.Now().String()))
	name := hex.EncodeToString(digest[:])
	return newContainerWithName(name, config)
}
// newContainerWithName creates a container called name from config. The
// systemd factory is used when the config's cgroup parent is "system.slice";
// every other config goes through the default factory.
func newContainerWithName(name string, config *configs.Config) (libcontainer.Container, error) {
	if cg := config.Cgroups; cg != nil && cg.Parent == "system.slice" {
		return systemdFactory.Create(name, config)
	}
	return factory.Create(name, config)
}
// runContainer runs the container with the specific config and arguments
//
// buffers are returned containing the STDOUT and STDERR output for the run
// along with the exit code and any go error.
//
// The exit code is the process's exit status when it exited normally and the
// negated signal number when it was killed by a signal. Whenever an error is
// returned the exit code is -1; buffers is nil only when the container could
// not be created. A wait status that is neither "exited" nor "signaled" is
// reported as an error instead of being returned as -1 with a nil error.
func runContainer(config *configs.Config, console string, args ...string) (buffers *stdBuffers, exitCode int, err error) {
	container, err := newContainer(config)
	if err != nil {
		return nil, -1, err
	}
	defer container.Destroy()

	buffers = newStdBuffers()
	process := &libcontainer.Process{
		Cwd:    "/",
		Args:   args,
		Env:    standardEnvironment,
		Stdin:  buffers.Stdin,
		Stdout: buffers.Stdout,
		Stderr: buffers.Stderr,
		Init:   true,
	}

	if err = container.Run(process); err != nil {
		return buffers, -1, err
	}
	ps, err := process.Wait()
	if err != nil {
		return buffers, -1, err
	}

	status := ps.Sys().(syscall.WaitStatus)
	switch {
	case status.Exited():
		return buffers, status.ExitStatus(), nil
	case status.Signaled():
		return buffers, -int(status.Signal()), nil
	default:
		// err is nil here, so returning it would hide the problem.
		return buffers, -1, fmt.Errorf("unrecognized exit reason (wait status %d)", status)
	}
}

View file

@ -177,7 +177,7 @@ func findIntelRdtMountpointDir() (string, error) {
}
if postSeparatorFields[0] == "resctrl" {
// Check that the mount is properly formated.
// Check that the mount is properly formatted.
if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
}

View file

@ -77,13 +77,13 @@ func (msg *Boolmsg) Serialize() []byte {
native.PutUint16(buf[0:2], uint16(msg.Len()))
native.PutUint16(buf[2:4], msg.Type)
if msg.Value {
buf[4] = 1
native.PutUint32(buf[4:8], uint32(1))
} else {
buf[4] = 0
native.PutUint32(buf[4:8], uint32(0))
}
return buf
}
func (msg *Boolmsg) Len() int {
return unix.NLA_HDRLEN + 1
return unix.NLA_HDRLEN + 4 // alignment
}

View file

@ -5,18 +5,15 @@ package libcontainer
import (
"fmt"
"io/ioutil"
"net"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/vishvananda/netlink"
)
var strategies = map[string]networkStrategy{
"veth": &veth{},
"loopback": &loopback{},
}
@ -103,157 +100,3 @@ func (l *loopback) attach(n *configs.Network) (err error) {
// detach is a no-op for the loopback strategy: it ignores n and always
// returns nil.
func (l *loopback) detach(n *configs.Network) (err error) {
return nil
}
// veth is a network strategy that uses a bridge and creates a veth pair:
// one end is attached to the bridge on the host and the other end is placed
// inside the container's namespace.
type veth struct {
}
// detach calls netlink.LinkSetMaster with a nil master for the device named
// n.HostInterfaceName and returns the result of that call.
func (v *veth) detach(n *configs.Network) (err error) {
return netlink.LinkSetMaster(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil)
}
// attach connects the host-side interface of a container network to an
// external network: it looks up the bridge n.Bridge and the interface
// n.HostInterfaceName, makes the bridge the interface's master, applies
// n.Mtu, enables hairpin mode when n.HairpinMode is set, and finally brings
// the interface up. The first failing step's error is returned.
func (v *veth) attach(n *configs.Network) (err error) {
	bridgeLink, err := netlink.LinkByName(n.Bridge)
	if err != nil {
		return err
	}
	bridge, isBridge := bridgeLink.(*netlink.Bridge)
	if !isBridge {
		return fmt.Errorf("Wrong device type %T", bridgeLink)
	}

	hostLink, err := netlink.LinkByName(n.HostInterfaceName)
	if err != nil {
		return err
	}
	if err = netlink.LinkSetMaster(hostLink, bridge); err != nil {
		return err
	}
	if err = netlink.LinkSetMTU(hostLink, n.Mtu); err != nil {
		return err
	}
	if n.HairpinMode {
		if err = netlink.LinkSetHairpin(hostLink, true); err != nil {
			return err
		}
	}
	return netlink.LinkSetUp(hostLink)
}
// create builds the veth pair for n: it generates a temporary peer name
// (stored in n.TempVethPeerName), adds the pair, attaches the host side via
// attach and moves the peer into the namespace of process nspid. If any step
// after the pair has been added fails, the pair is deleted again.
func (v *veth) create(n *network, nspid int) (err error) {
	peerName, err := v.generateTempPeerName()
	if err != nil {
		return err
	}
	n.TempVethPeerName = peerName
	if n.Bridge == "" {
		return fmt.Errorf("bridge is not specified")
	}

	pair := &netlink.Veth{
		LinkAttrs: netlink.LinkAttrs{
			Name:   n.HostInterfaceName,
			TxQLen: n.TxQueueLen,
		},
		PeerName: n.TempVethPeerName,
	}
	if err = netlink.LinkAdd(pair); err != nil {
		return err
	}
	// The named result err is inspected on return so a later failure rolls
	// the freshly added pair back.
	defer func() {
		if err != nil {
			netlink.LinkDel(pair)
		}
	}()

	if err = v.attach(&n.Network); err != nil {
		return err
	}
	peer, err := netlink.LinkByName(n.TempVethPeerName)
	if err != nil {
		return err
	}
	return netlink.LinkSetNsPid(peer, nspid)
}
// generateTempPeerName returns the result of
// utils.GenerateRandomName("veth", 7); create uses it as the temporary name
// of the veth peer.
func (v *veth) generateTempPeerName() (string, error) {
return utils.GenerateRandomName("veth", 7)
}
// initialize configures the veth peer named config.TempVethPeerName: the
// link is taken down, renamed to config.Name, given its optional MAC
// address, its address and optional IPv6 address, its MTU, and brought up
// again. Afterwards a route is added for each of config.Gateway and
// config.IPv6Gateway that is non-empty. The first failing step's error is
// returned.
func (v *veth) initialize(config *network) error {
	peerName := config.TempVethPeerName
	if peerName == "" {
		return fmt.Errorf("peer is not specified")
	}

	link, err := netlink.LinkByName(peerName)
	if err != nil {
		return err
	}
	if err := netlink.LinkSetDown(link); err != nil {
		return err
	}
	if err := netlink.LinkSetName(link, config.Name); err != nil {
		return err
	}
	// get the interface again after we changed the name as the index also changes.
	if link, err = netlink.LinkByName(config.Name); err != nil {
		return err
	}

	if config.MacAddress != "" {
		hwAddr, err := net.ParseMAC(config.MacAddress)
		if err != nil {
			return err
		}
		if err := netlink.LinkSetHardwareAddr(link, hwAddr); err != nil {
			return err
		}
	}

	addr, err := netlink.ParseAddr(config.Address)
	if err != nil {
		return err
	}
	if err := netlink.AddrAdd(link, addr); err != nil {
		return err
	}
	if config.IPv6Address != "" {
		addr6, err := netlink.ParseAddr(config.IPv6Address)
		if err != nil {
			return err
		}
		if err := netlink.AddrAdd(link, addr6); err != nil {
			return err
		}
	}

	if err := netlink.LinkSetMTU(link, config.Mtu); err != nil {
		return err
	}
	if err := netlink.LinkSetUp(link); err != nil {
		return err
	}

	// The first gateway is handled first, then the IPv6 one.
	for _, gateway := range []string{config.Gateway, config.IPv6Gateway} {
		if gateway == "" {
			continue
		}
		route := &netlink.Route{
			Scope:     netlink.SCOPE_UNIVERSE,
			LinkIndex: link.Attrs().Index,
			Gw:        net.ParseIP(gateway),
		}
		if err := netlink.RouteAdd(route); err != nil {
			return err
		}
	}
	return nil
}

View file

@ -10,8 +10,8 @@ The `nsenter` package will `import "C"` and it uses [cgo](https://golang.org/cmd
package. In cgo, if the import of "C" is immediately preceded by a comment, that comment,
called the preamble, is used as a header when compiling the C parts of the package.
So every time we import package `nsenter`, the C code function `nsexec()` would be
called. And package `nsenter` is now only imported in `main_unix.go`, so every time
before we call `cmd.Start` on linux, that C code would run.
called. And package `nsenter` is only imported in `init.go`, so every time the runc
`init` command is invoked, that C code is run.
Because `nsexec()` must be run before the Go runtime in order to use the
Linux kernel namespace, you must `import` this library into a package if
@ -37,7 +37,7 @@ the parent `nsexec()` will exit and the child `nsexec()` process will
return to allow the Go runtime to take over.
NOTE: We do both `setns(2)` and `clone(2)` even if we don't have any
CLONE_NEW* clone flags because we must fork a new process in order to
`CLONE_NEW*` clone flags because we must fork a new process in order to
enter the PID namespace.

View file

@ -211,7 +211,7 @@ static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len)
/*
* If @app is NULL, execve will segfault. Just check it here and bail (if
* we're in this path, the caller is already getting desparate and there
* we're in this path, the caller is already getting desperate and there
* isn't a backup to this failing). This usually would be a configuration
* or programming issue.
*/
@ -505,7 +505,8 @@ void join_namespaces(char *nslist)
ns->fd = fd;
ns->ns = nsflag(namespace);
strncpy(ns->path, path, PATH_MAX);
strncpy(ns->path, path, PATH_MAX - 1);
ns->path[PATH_MAX - 1] = '\0';
} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
/*
@ -678,17 +679,15 @@ void nsexec(void)
/*
* Enable setgroups(2) if we've been asked to. But we also
* have to explicitly disable setgroups(2) if we're
* creating a rootless container (this is required since
* Linux 3.19).
* creating a rootless container for single-entry mapping.
* i.e. config.is_setgroup == false.
* (this is required since Linux 3.19).
*
* For rootless multi-entry mapping, config.is_setgroup shall be true and
* newuidmap/newgidmap shall be used.
*/
if (config.is_rootless && config.is_setgroup) {
kill(child, SIGKILL);
bail("cannot allow setgroup in an unprivileged user namespace setup");
}
if (config.is_setgroup)
update_setgroups(child, SETGROUPS_ALLOW);
if (config.is_rootless)
if (config.is_rootless && !config.is_setgroup)
update_setgroups(child, SETGROUPS_DENY);
/* Set up mappings. */
@ -809,25 +808,30 @@ void nsexec(void)
if (config.namespaces)
join_namespaces(config.namespaces);
/*
* Unshare all of the namespaces. Now, it should be noted that this
* ordering might break in the future (especially with rootless
* containers). But for now, it's not possible to split this into
* CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
*
* Note that we don't merge this with clone() because there were
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway.
*/
if (unshare(config.cloneflags) < 0)
bail("failed to unshare namespaces");
/*
* Deal with user namespaces first. They are quite special, as they
* affect our ability to unshare other namespaces and are used as
* context for privilege checks.
*
* We don't unshare all namespaces in one go. The reason for this
* is that, while the kernel documentation may claim otherwise,
* there are certain cases where unsharing all namespaces at once
* will result in namespace objects being owned incorrectly.
* Ideally we should just fix these kernel bugs, but it's better to
* be safe than sorry, and fix them separately.
*
* A specific case of this is that the SELinux label of the
* internal kern-mount that mqueue uses will be incorrect if the
* UTS namespace is cloned before the USER namespace is mapped.
* I've also heard of similar problems with the network namespace
* in some scenarios. This also mirrors how LXC deals with this
* problem.
*/
if (config.cloneflags & CLONE_NEWUSER) {
if (unshare(CLONE_NEWUSER) < 0)
bail("failed to unshare user namespace");
config.cloneflags &= ~CLONE_NEWUSER;
/*
* We don't have the privileges to do any mapping here (see the
* clone_parent rant). So signal our parent to hook us up.
@ -853,8 +857,21 @@ void nsexec(void)
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
bail("failed to set process as dumpable");
}
/* Become root in the namespace proper. */
if (setresuid(0, 0, 0) < 0)
bail("failed to become root in user namespace");
}
/*
* Unshare all of the namespaces. Note that we don't merge this
* with clone() because there were some old kernel versions where
* clone(CLONE_PARENT | CLONE_NEWPID) was broken, so we'll just do
* it the long way.
*/
if (unshare(config.cloneflags) < 0)
bail("failed to unshare namespaces");
/*
* TODO: What about non-namespace clone flags that we're dropping here?
*

View file

@ -72,6 +72,9 @@ type Process struct {
// ConsoleSocket provides the masterfd console.
ConsoleSocket *os.File
// Init specifies whether the process is the first process in the container.
Init bool
ops processOperations
}

View file

@ -537,7 +537,7 @@ func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
}
fds = append(fds, r.Fd(), w.Fd())
p.Stderr, i.Stderr = w, r
// change ownership of the pipes incase we are in a user namespace
// change ownership of the pipes in case we are in a user namespace
for _, fd := range fds {
if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
return nil, err

View file

@ -152,6 +152,26 @@ func finalizeRootfs(config *configs.Config) (err error) {
return nil
}
// /tmp has to be mounted as private to allow MS_MOVE to work in all situations
//
// prepareTmp creates a new "runctop" directory below topTmpDir, bind-mounts
// it onto itself and then marks that mount MS_PRIVATE. It returns the path of
// the directory. If a mount step fails, whatever was set up so far (the bind
// mount and the directory) is torn down again before the error is returned,
// so a failed call leaves nothing behind.
func prepareTmp(topTmpDir string) (string, error) {
	tmpdir, err := ioutil.TempDir(topTmpDir, "runctop")
	if err != nil {
		return "", err
	}
	if err := unix.Mount(tmpdir, tmpdir, "bind", unix.MS_BIND, ""); err != nil {
		// Best-effort cleanup; the mount failure is the error to report.
		_ = os.RemoveAll(tmpdir)
		return "", err
	}
	if err := unix.Mount("", tmpdir, "", uintptr(unix.MS_PRIVATE), ""); err != nil {
		// Undo the bind mount before removing the directory.
		_ = unix.Unmount(tmpdir, 0)
		_ = os.RemoveAll(tmpdir)
		return "", err
	}
	return tmpdir, nil
}
// cleanupTmp unmounts tmpdir and then removes it together with its contents.
// The result of the unmount is ignored; only the error from os.RemoveAll is
// returned.
func cleanupTmp(tmpdir string) error {
unix.Unmount(tmpdir, 0)
return os.RemoveAll(tmpdir)
}
func mountCmd(cmd configs.Command) error {
command := exec.Command(cmd.Path, cmd.Args[:]...)
command.Env = cmd.Env
@ -199,7 +219,12 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
}
}
if copyUp {
tmpDir, err = ioutil.TempDir("/tmp", "runctmpdir")
tmpdir, err := prepareTmp("/tmp")
if err != nil {
return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
}
defer cleanupTmp(tmpdir)
tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir")
if err != nil {
return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
}
@ -396,6 +421,7 @@ func checkMountDestination(rootfs, dest string) error {
"/proc/stat",
"/proc/swaps",
"/proc/uptime",
"/proc/loadavg",
"/proc/net/dev",
}
for _, valid := range validDestinations {
@ -412,7 +438,7 @@ func checkMountDestination(rootfs, dest string) error {
if err != nil {
return err
}
if path == "." || !strings.HasPrefix(path, "..") {
if path != "." && !strings.HasPrefix(path, "..") {
return fmt.Errorf("%q cannot be mounted because it is located inside %q", dest, invalid)
}
}

View file

@ -9,13 +9,21 @@ import (
)
func TestCheckMountDestOnProc(t *testing.T) {
dest := "/rootfs/proc/"
dest := "/rootfs/proc/sys"
err := checkMountDestination("/rootfs", dest)
if err == nil {
t.Fatal("destination inside proc should return an error")
}
}
// TestCheckMountDestOnProcChroot checks that checkMountDestination accepts
// the proc directory itself ("/rootfs/proc/") as a destination under the
// rootfs "/rootfs".
func TestCheckMountDestOnProcChroot(t *testing.T) {
	if err := checkMountDestination("/rootfs", "/rootfs/proc/"); err != nil {
		t.Fatal("destination inside proc when using chroot should not return an error")
	}
}
func TestCheckMountDestInSys(t *testing.T) {
dest := "/rootfs//sys/fs/cgroup"
err := checkMountDestination("/rootfs", dest)

View file

@ -0,0 +1,47 @@
Name: cat
State: R (running)
Tgid: 19383
Ngid: 0
Pid: 19383
PPid: 19275
TracerPid: 0
Uid: 1000 1000 1000 1000
Gid: 1000 1000 1000 1000
FDSize: 256
Groups: 24 25 27 29 30 44 46 102 104 108 111 1000 1001
NStgid: 19383
NSpid: 19383
NSpgid: 19383
NSsid: 19275
VmPeak: 5944 kB
VmSize: 5944 kB
VmLck: 0 kB
VmPin: 0 kB
VmHWM: 744 kB
VmRSS: 744 kB
VmData: 324 kB
VmStk: 136 kB
VmExe: 48 kB
VmLib: 1776 kB
VmPTE: 32 kB
VmPMD: 12 kB
VmSwap: 0 kB
Threads: 1
SigQ: 0/30067
SigPnd: 0000000000000000
ShdPnd: 0000000000000000
SigBlk: 0000000000000000
SigIgn: 0000000000000080
SigCgt: 0000000000000000
CapInh: 0000000000000000
CapPrm: 0000000000000000
CapEff: 0000000000000000
CapBnd: 0000003fffffffff
CapAmb: 0000000000000000
Seccomp: 0
Cpus_allowed: f
Cpus_allowed_list: 0-3
Mems_allowed: 00000000,00000001
Mems_allowed_list: 0
voluntary_ctxt_switches: 0
nonvoluntary_ctxt_switches: 1

View file

@ -216,7 +216,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
}
config.Namespaces.Add(t, ns.Path)
}
if config.Namespaces.Contains(configs.NEWNET) {
if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" {
config.Networks = []*configs.Network{
{
Type: "loopback",
@ -233,7 +233,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
config.MountLabel = spec.Linux.MountLabel
config.Sysctl = spec.Linux.Sysctl
if spec.Linux.Seccomp != nil {
seccomp, err := setupSeccomp(spec.Linux.Seccomp)
seccomp, err := SetupSeccomp(spec.Linux.Seccomp)
if err != nil {
return nil, err
}
@ -243,8 +243,8 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
if spec.Process.SelinuxLabel != "" {
config.ProcessLabel = spec.Process.SelinuxLabel
}
if spec.Process != nil && spec.Process.OOMScoreAdj != nil {
config.OomScoreAdj = *spec.Process.OOMScoreAdj
if spec.Process != nil {
config.OomScoreAdj = spec.Process.OOMScoreAdj
}
if spec.Process.Capabilities != nil {
config.Capabilities = &configs.Capabilities{
@ -269,13 +269,17 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
flags, pgflags, data, ext := parseMountOptions(m.Options)
source := m.Source
if m.Type == "bind" {
device := m.Type
if flags&unix.MS_BIND != 0 {
if device == "" {
device = "bind"
}
if !filepath.IsAbs(source) {
source = filepath.Join(cwd, m.Source)
}
}
return &configs.Mount{
Device: m.Type,
Device: device,
Source: source,
Destination: m.Destination,
Data: data,
@ -732,7 +736,7 @@ func parseMountOptions(options []string) (int, []int, string, int) {
return flag, pgflag, strings.Join(data, ","), extFlags
}
func setupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) {
func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) {
if config == nil {
return nil, nil
}

View file

@ -123,7 +123,7 @@ func TestSetupSeccomp(t *testing.T) {
},
},
}
seccomp, err := setupSeccomp(conf)
seccomp, err := SetupSeccomp(conf)
if err != nil {
t.Errorf("Couldn't create Seccomp config: %v", err)

View file

@ -19,7 +19,7 @@ func TestCaptureTestFunc(t *testing.T) {
// the first frame is the caller
frame := stack.Frames[0]
if expected := "captureFunc"; frame.Function != expected {
t.Fatalf("expected function %q but recevied %q", expected, frame.Function)
t.Fatalf("expected function %q but received %q", expected, frame.Function)
}
expected := "/runc/libcontainer/stacktrace"
if !strings.HasSuffix(frame.Package, expected) {

View file

@ -14,6 +14,7 @@ import (
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
@ -49,11 +50,11 @@ func (l *linuxStandardInit) Init() error {
// Do not inherit the parent's session keyring.
sessKeyId, err := keys.JoinSessionKeyring(ringname)
if err != nil {
return err
return errors.Wrap(err, "join session keyring")
}
// Make session keyring searcheable.
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return err
return errors.Wrap(err, "mod keyring permissions")
}
}
@ -76,7 +77,7 @@ func (l *linuxStandardInit) Init() error {
return err
}
if err := system.Setctty(); err != nil {
return err
return errors.Wrap(err, "setctty")
}
}
@ -89,45 +90,45 @@ func (l *linuxStandardInit) Init() error {
if hostname := l.config.Config.Hostname; hostname != "" {
if err := unix.Sethostname([]byte(hostname)); err != nil {
return err
return errors.Wrap(err, "sethostname")
}
}
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
return errors.Wrap(err, "apply apparmor profile")
}
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
return errors.Wrap(err, "set process label")
}
for key, value := range l.config.Config.Sysctl {
if err := writeSystemProperty(key, value); err != nil {
return err
return errors.Wrapf(err, "write sysctl key %s", key)
}
}
for _, path := range l.config.Config.ReadonlyPaths {
if err := readonlyPath(path); err != nil {
return err
return errors.Wrapf(err, "readonly path %s", path)
}
}
for _, path := range l.config.Config.MaskPaths {
if err := maskPath(path, l.config.Config.MountLabel); err != nil {
return err
return errors.Wrapf(err, "mask path %s", path)
}
}
pdeath, err := system.GetParentDeathSignal()
if err != nil {
return err
return errors.Wrap(err, "get pdeath signal")
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
return errors.Wrap(err, "set nonewprivileges")
}
}
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return err
return errors.Wrap(err, "sync ready")
}
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
@ -143,7 +144,7 @@ func (l *linuxStandardInit) Init() error {
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := pdeath.Restore(); err != nil {
return err
return errors.Wrap(err, "restore pdeath signal")
}
// Compare the parent from the initial start of the init process and make
// sure that it did not change. if the parent changes that means it died

View file

@ -3,13 +3,12 @@
package system
import (
"bufio"
"fmt"
"os"
"os/exec"
"syscall" // only for exec
"unsafe"
"github.com/opencontainers/runc/libcontainer/user"
"golang.org/x/sys/unix"
)
@ -102,34 +101,43 @@ func Setctty() error {
}
// RunningInUserNS detects whether we are currently running in a user namespace.
// Copied from github.com/lxc/lxd/shared/util.go
// Originally copied from github.com/lxc/lxd/shared/util.go
func RunningInUserNS() bool {
file, err := os.Open("/proc/self/uid_map")
uidmap, err := user.CurrentProcessUIDMap()
if err != nil {
// This kernel-provided file only exists if user namespaces are supported
return false
}
defer file.Close()
return UIDMapInUserNS(uidmap)
}
buf := bufio.NewReader(file)
l, _, err := buf.ReadLine()
if err != nil {
return false
}
line := string(l)
var a, b, c int64
fmt.Sscanf(line, "%d %d %d", &a, &b, &c)
func UIDMapInUserNS(uidmap []user.IDMap) bool {
/*
* We assume we are in the initial user namespace if we have a full
* range - 4294967295 uids starting at uid 0.
*/
if a == 0 && b == 0 && c == 4294967295 {
if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
return false
}
return true
}
// GetParentNSeuid returns the euid within the parent user namespace.
//
// The effective uid is looked up in the current process's uid map; the first
// entry whose range contains it yields ParentID plus the offset into that
// range. When the map cannot be read, or no entry matches, the plain
// effective uid is returned.
func GetParentNSeuid() int64 {
	euid := int64(os.Geteuid())

	uidmap, err := user.CurrentProcessUIDMap()
	if err != nil {
		// This kernel-provided file only exists if user namespaces are supported
		return euid
	}

	for _, entry := range uidmap {
		offset := euid - entry.ID
		if offset >= 0 && offset < entry.Count {
			return entry.ParentID + offset
		}
	}
	return euid
}
// SetSubreaper sets the value i as the subreaper setting for the calling process
func SetSubreaper(i int) error {
return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)

View file

@ -0,0 +1,45 @@
// +build linux
package system
import (
"strings"
"testing"
"github.com/opencontainers/runc/libcontainer/user"
)
// TestUIDMapInUserNS parses several uid_map contents with user.ParseIDMap and
// checks the verdict UIDMapInUserNS gives for each of them.
func TestUIDMapInUserNS(t *testing.T) {
	type testCase struct {
		s        string
		expected bool
	}

	cases := []testCase{
		{s: " 0 0 4294967295\n", expected: false},
		{s: " 0 0 1\n", expected: true},
		{s: " 0 1001 1\n 1 231072 65536\n", expected: true},
		// file exist but empty (the initial state when userns is created. see man 7 user_namespaces)
		{s: "", expected: true},
	}

	for i := range cases {
		tc := cases[i]
		uidmap, err := user.ParseIDMap(strings.NewReader(tc.s))
		if err != nil {
			t.Fatal(err)
		}
		if actual := UIDMapInUserNS(uidmap); actual != tc.expected {
			t.Fatalf("expected %v, got %v for %q", tc.expected, actual, tc.s)
		}
	}
}

View file

@ -2,8 +2,26 @@
package system
import (
"os"
"github.com/opencontainers/runc/libcontainer/user"
)
// RunningInUserNS is a stub for non-Linux systems.
// It performs no detection and always returns false.
func RunningInUserNS() bool {
return false
}
// UIDMapInUserNS is a stub for non-Linux systems.
// The uidmap argument is ignored and the result is always false.
func UIDMapInUserNS(uidmap []user.IDMap) bool {
return false
}
// GetParentNSeuid returns the euid within the parent user namespace.
// On non-Linux systems no uid mapping is consulted: the value reported by
// os.Geteuid is returned as is.
func GetParentNSeuid() int {
	euid := os.Geteuid()
	return euid
}

View file

@ -114,3 +114,29 @@ func CurrentUser() (User, error) {
// CurrentGroup looks up the Group entry for the gid reported by unix.Getgid
// for the calling process.
func CurrentGroup() (Group, error) {
	gid := unix.Getgid()
	return LookupGid(gid)
}
// CurrentUserSubUIDs returns the /etc/subuid entries whose name equals the
// name of the current user. An error from looking up the current user or
// from reading the file is returned as is.
func CurrentUserSubUIDs() ([]SubID, error) {
	current, err := CurrentUser()
	if err != nil {
		return nil, err
	}

	// Keep only the entries registered under the current user's name.
	ownedByCurrent := func(entry SubID) bool {
		return entry.Name == current.Name
	}
	return ParseSubIDFileFilter("/etc/subuid", ownedByCurrent)
}
// CurrentGroupSubGIDs returns the /etc/subgid entries whose name equals the
// name of the current group. An error from looking up the current group or
// from reading the file is returned as is.
func CurrentGroupSubGIDs() ([]SubID, error) {
	current, err := CurrentGroup()
	if err != nil {
		return nil, err
	}

	// Keep only the entries registered under the current group's name.
	ownedByCurrent := func(entry SubID) bool {
		return entry.Name == current.Name
	}
	return ParseSubIDFileFilter("/etc/subgid", ownedByCurrent)
}
// CurrentProcessUIDMap parses /proc/self/uid_map and returns its entries.
func CurrentProcessUIDMap() ([]IDMap, error) {
	const uidMapPath = "/proc/self/uid_map"
	return ParseIDMapFile(uidMapPath)
}
// CurrentProcessGIDMap parses /proc/self/gid_map and returns its entries.
func CurrentProcessGIDMap() ([]IDMap, error) {
	const gidMapPath = "/proc/self/gid_map"
	return ParseIDMapFile(gidMapPath)
}

View file

@ -75,12 +75,29 @@ func groupFromOS(g *user.Group) (Group, error) {
return newGroup, nil
}
// SubID represents an entry in /etc/sub{u,g}id.
// The fields follow the order of the colon-separated columns of one line
// (see man 5 subuid).
type SubID struct {
	Name  string // first column: the name the entry is registered under
	SubID int64  // second column: first subordinate id of the range
	Count int64  // third column: number of ids in the range
}
// IDMap represents an entry in /proc/PID/{u,g}id_map.
// The fields follow the order of the whitespace-separated columns of one
// line (see man 7 user_namespaces).
type IDMap struct {
	ID       int64 // first column: start of the id range inside the namespace
	ParentID int64 // second column: id in the parent namespace that ID maps to
	Count    int64 // third column: length of the mapped range
}
func parseLine(line string, v ...interface{}) {
if line == "" {
parseParts(strings.Split(line, ":"), v...)
}
func parseParts(parts []string, v ...interface{}) {
if len(parts) == 0 {
return
}
parts := strings.Split(line, ":")
for i, p := range parts {
// Ignore cases where we don't have enough fields to populate the arguments.
// Some configuration files like to misbehave.
@ -96,6 +113,8 @@ func parseLine(line string, v ...interface{}) {
case *int:
// "numbers", with conversion errors ignored because of some misbehaving configuration files.
*e, _ = strconv.Atoi(p)
case *int64:
*e, _ = strconv.ParseInt(p, 10, 64)
case *[]string:
// Comma-separated lists.
if p != "" {
@ -105,7 +124,7 @@ func parseLine(line string, v ...interface{}) {
}
default:
// Someone goof'd when writing code using this function. Scream so they can hear us.
panic(fmt.Sprintf("parseLine only accepts {*string, *int, *[]string} as arguments! %#v is not a pointer!", e))
panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e))
}
}
}
@ -479,3 +498,111 @@ func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int
}
return GetAdditionalGroups(additionalGroups, group)
}
// ParseSubIDFile opens the file at path and returns every subid entry found
// in it. The file is closed before returning.
func ParseSubIDFile(path string) ([]SubID, error) {
	f, openErr := os.Open(path)
	if openErr != nil {
		return nil, openErr
	}
	defer f.Close()

	return ParseSubID(f)
}
// ParseSubID returns every subid entry read from subid, without filtering.
func ParseSubID(subid io.Reader) ([]SubID, error) {
	// A nil filter tells ParseSubIDFilter to keep all entries.
	var keepAll func(SubID) bool
	return ParseSubIDFilter(subid, keepAll)
}
// ParseSubIDFileFilter opens the file at path and returns the subid entries
// accepted by filter. The file is closed before returning.
func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) {
	f, openErr := os.Open(path)
	if openErr != nil {
		return nil, openErr
	}
	defer f.Close()

	return ParseSubIDFilter(f, filter)
}
// ParseSubIDFilter reads subid-formatted data (see man 5 subuid) from r line
// by line and returns the entries accepted by filter. A nil filter accepts
// every entry. Lines that are empty after trimming whitespace are skipped.
//
// It returns an error if r is nil or if reading from r fails; in the latter
// case no partial result is returned.
func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) {
	if r == nil {
		return nil, fmt.Errorf("nil source for subid-formatted data")
	}

	var (
		s   = bufio.NewScanner(r)
		out = []SubID{}
	)

	for s.Scan() {
		line := strings.TrimSpace(s.Text())
		if line == "" {
			continue
		}

		// see: man 5 subuid
		p := SubID{}
		parseLine(line, &p.Name, &p.SubID, &p.Count)
		if filter == nil || filter(p) {
			out = append(out, p)
		}
	}
	// Scan signals a failure only by returning false, so the error has to be
	// inspected once the loop is over; checking it inside the loop body can
	// never observe it and would silently return truncated data.
	if err := s.Err(); err != nil {
		return nil, err
	}

	return out, nil
}
// ParseIDMapFile opens the file at path and returns every id-map entry found
// in it. The file is closed before returning.
func ParseIDMapFile(path string) ([]IDMap, error) {
	f, openErr := os.Open(path)
	if openErr != nil {
		return nil, openErr
	}
	defer f.Close()

	return ParseIDMap(f)
}
// ParseIDMap returns every id-map entry read from r, without filtering.
func ParseIDMap(r io.Reader) ([]IDMap, error) {
	// A nil filter tells ParseIDMapFilter to keep all entries.
	var keepAll func(IDMap) bool
	return ParseIDMapFilter(r, keepAll)
}
// ParseIDMapFileFilter opens the file at path and returns the id-map entries
// accepted by filter. The file is closed before returning.
func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) {
	f, openErr := os.Open(path)
	if openErr != nil {
		return nil, openErr
	}
	defer f.Close()

	return ParseIDMapFilter(f, filter)
}
// ParseIDMapFilter reads id-map-formatted data (see man 7 user_namespaces)
// from r line by line and returns the entries accepted by filter. A nil
// filter accepts every entry. Lines that are empty after trimming whitespace
// are skipped; the remaining lines are split on whitespace into ID, ParentID
// and Count.
//
// It returns an error if r is nil or if reading from r fails; in the latter
// case no partial result is returned.
func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) {
	if r == nil {
		return nil, fmt.Errorf("nil source for idmap-formatted data")
	}

	var (
		s   = bufio.NewScanner(r)
		out = []IDMap{}
	)

	for s.Scan() {
		line := strings.TrimSpace(s.Text())
		if line == "" {
			continue
		}

		// see: man 7 user_namespaces
		p := IDMap{}
		parseParts(strings.Fields(line), &p.ID, &p.ParentID, &p.Count)
		if filter == nil || filter(p) {
			out = append(out, p)
		}
	}
	// Scan signals a failure only by returning false, so the error has to be
	// inspected once the loop is over; checking it inside the loop body can
	// never observe it and would silently return truncated data.
	if err := s.Err(); err != nil {
		return nil, err
	}

	return out, nil
}

View file

@ -1,8 +1,6 @@
package utils
import (
"crypto/rand"
"encoding/hex"
"encoding/json"
"io"
"os"
@ -17,19 +15,6 @@ const (
exitSignalOffset = 128
)
// GenerateRandomName builds a name made of prefix followed by the first size
// characters of the hex encoding of 32 random bytes. Since that encoding is
// 64 characters long, any size above 64 is treated as 64. size is used
// directly as a slice bound, so it must not be negative. An error is
// returned only when the random source cannot be read.
func GenerateRandomName(prefix string, size int) (string, error) {
	var raw [32]byte
	if _, err := io.ReadFull(rand.Reader, raw[:]); err != nil {
		return "", err
	}

	encoded := hex.EncodeToString(raw[:])
	if size > len(encoded) {
		size = len(encoded)
	}
	return prefix + encoded[:size], nil
}
// ResolveRootfs ensures that the current working directory is
// not a symlink and returns the absolute path to the rootfs
func ResolveRootfs(uncleanRootfs string) (string, error) {

View file

@ -10,28 +10,6 @@ import (
"golang.org/x/sys/unix"
)
// TestGenerateName checks the length of names produced by GenerateRandomName,
// both for a small size and for a size above the 64-character cap.
func TestGenerateName(t *testing.T) {
	const prefix = "veth"

	// requested is the size passed in; random is the number of generated
	// characters expected after the prefix.
	checks := []struct {
		requested int
		random    int
	}{
		{requested: 5, random: 5},
		{requested: 65, random: 64},
	}

	for _, c := range checks {
		name, err := GenerateRandomName(prefix, c.requested)
		if err != nil {
			t.Fatal(err)
		}

		expected := c.random + len(prefix)
		if len(name) != expected {
			t.Fatalf("expected name to be %d chars but received %d", expected, len(name))
		}
	}
}
var labelTest = []struct {
labels []string
query string
@ -151,4 +129,14 @@ func TestCleanPath(t *testing.T) {
if path != "/var" {
t.Errorf("expected to receive '/var' and received %s", path)
}
path = CleanPath("/foo/bar/")
if path != "/foo/bar" {
t.Errorf("expected to receive '/foo/bar' and received %s", path)
}
path = CleanPath("/foo/bar/../")
if path != "/foo" {
t.Errorf("expected to receive '/foo' and received %s", path)
}
}