Merge pull request #4327 from crosbymichael/add-libcontainer
Add native execution driver to docker and make it the default
This commit is contained in:
commit
0ef83adf9f
29 changed files with 2083 additions and 4 deletions
|
@ -5,10 +5,23 @@ import (
|
|||
"fmt"
|
||||
"github.com/dotcloud/docker/pkg/mount"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Cgroup describes where a container's process is placed in the cgroup
// hierarchy and which resource limits are applied to it.
type Cgroup struct {
	Name   string `json:"name,omitempty"`   // last path element of the cgroup directory
	Parent string `json:"parent,omitempty"` // name of parent cgroup or slice

	DeviceAccess bool  `json:"device_access,omitempty"` // when false, all devices are denied except a small allow list
	Memory       int64 `json:"memory,omitempty"`        // Memory limit (in bytes)
	MemorySwap   int64 `json:"memory_swap,omitempty"`   // Total memory usage (memory + swap); set `-1' to disable swap
	CpuShares    int64 `json:"cpu_shares,omitempty"`    // CPU shares (relative weight vs. other containers)
}
|
||||
|
||||
// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt
|
||||
func FindCgroupMountpoint(subsystem string) (string, error) {
|
||||
mounts, err := mount.GetMounts()
|
||||
|
@ -25,7 +38,6 @@ func FindCgroupMountpoint(subsystem string) (string, error) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("cgroup mountpoint not found for %s", subsystem)
|
||||
}
|
||||
|
||||
|
@ -40,18 +52,199 @@ func GetThisCgroupDir(subsystem string) (string, error) {
|
|||
return parseCgroupFile(subsystem, f)
|
||||
}
|
||||
|
||||
func GetInitCgroupDir(subsystem string) (string, error) {
|
||||
f, err := os.Open("/proc/1/cgroup")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
return parseCgroupFile(subsystem, f)
|
||||
}
|
||||
|
||||
func (c *Cgroup) Path(root, subsystem string) (string, error) {
|
||||
cgroup := c.Name
|
||||
if c.Parent != "" {
|
||||
cgroup = filepath.Join(c.Parent, cgroup)
|
||||
}
|
||||
initPath, err := GetInitCgroupDir(subsystem)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return filepath.Join(root, subsystem, initPath, cgroup), nil
|
||||
}
|
||||
|
||||
func (c *Cgroup) Join(root, subsystem string, pid int) (string, error) {
|
||||
path, err := c.Path(root, subsystem)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) {
|
||||
return "", err
|
||||
}
|
||||
if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return path, nil
|
||||
}
|
||||
|
||||
func (c *Cgroup) Cleanup(root string) error {
|
||||
get := func(subsystem string) string {
|
||||
path, _ := c.Path(root, subsystem)
|
||||
return path
|
||||
}
|
||||
|
||||
for _, path := range []string{
|
||||
get("memory"),
|
||||
get("devices"),
|
||||
get("cpu"),
|
||||
} {
|
||||
os.RemoveAll(path)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// parseCgroupFile scans a /proc/<pid>/cgroup style listing from r and returns
// the cgroup path of the first entry whose controller list contains subsystem.
//
// Each entry has the form "id:controller[,controller...]:path". The line is
// split into at most three fields so that a path containing ':' is returned
// intact. Lines with fewer than three fields are skipped instead of causing an
// out-of-range panic.
//
// An error is returned if reading from r fails or if no entry lists subsystem.
func parseCgroupFile(subsystem string, r io.Reader) (string, error) {
	s := bufio.NewScanner(r)

	for s.Scan() {
		parts := strings.SplitN(s.Text(), ":", 3)
		if len(parts) < 3 {
			continue
		}
		// Several controllers may be co-mounted, e.g. "cpu,cpuacct".
		for _, subs := range strings.Split(parts[1], ",") {
			if subs == subsystem {
				return parts[2], nil
			}
		}
	}
	// Scanner errors are only observable once Scan has returned false.
	if err := s.Err(); err != nil {
		return "", err
	}
	return "", fmt.Errorf("cgroup '%s' not found in /proc/self/cgroup", subsystem)
}
|
||||
|
||||
// writeFile writes data to the file named file inside dir, using mode 0700
// if the file has to be created.
func writeFile(dir, file, data string) error {
	target := filepath.Join(dir, file)
	return ioutil.WriteFile(target, []byte(data), 0700)
}
|
||||
|
||||
func (c *Cgroup) Apply(pid int) error {
|
||||
// We have two implementation of cgroups support, one is based on
|
||||
// systemd and the dbus api, and one is based on raw cgroup fs operations
|
||||
// following the pre-single-writer model docs at:
|
||||
// http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/
|
||||
//
|
||||
// we can pick any subsystem to find the root
|
||||
cgroupRoot, err := FindCgroupMountpoint("cpu")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cgroupRoot = filepath.Dir(cgroupRoot)
|
||||
|
||||
if _, err := os.Stat(cgroupRoot); err != nil {
|
||||
return fmt.Errorf("cgroups fs not found")
|
||||
}
|
||||
if err := c.setupDevices(cgroupRoot, pid); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := c.setupMemory(cgroupRoot, pid); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := c.setupCpu(cgroupRoot, pid); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Cgroup) setupDevices(cgroupRoot string, pid int) (err error) {
|
||||
if !c.DeviceAccess {
|
||||
dir, err := c.Join(cgroupRoot, "devices", pid)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err != nil {
|
||||
os.RemoveAll(dir)
|
||||
}
|
||||
}()
|
||||
|
||||
if err := writeFile(dir, "devices.deny", "a"); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
allow := []string{
|
||||
// /dev/null, zero, full
|
||||
"c 1:3 rwm",
|
||||
"c 1:5 rwm",
|
||||
"c 1:7 rwm",
|
||||
|
||||
// consoles
|
||||
"c 5:1 rwm",
|
||||
"c 5:0 rwm",
|
||||
"c 4:0 rwm",
|
||||
"c 4:1 rwm",
|
||||
|
||||
// /dev/urandom,/dev/random
|
||||
"c 1:9 rwm",
|
||||
"c 1:8 rwm",
|
||||
|
||||
// /dev/pts/ - pts namespaces are "coming soon"
|
||||
"c 136:* rwm",
|
||||
"c 5:2 rwm",
|
||||
|
||||
// tuntap
|
||||
"c 10:200 rwm",
|
||||
}
|
||||
|
||||
for _, val := range allow {
|
||||
if err := writeFile(dir, "devices.allow", val); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Cgroup) setupMemory(cgroupRoot string, pid int) (err error) {
|
||||
if c.Memory != 0 || c.MemorySwap != 0 {
|
||||
dir, err := c.Join(cgroupRoot, "memory", pid)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer func() {
|
||||
if err != nil {
|
||||
os.RemoveAll(dir)
|
||||
}
|
||||
}()
|
||||
|
||||
if c.Memory != 0 {
|
||||
if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
// By default, MemorySwap is set to twice the size of RAM.
|
||||
// If you want to omit MemorySwap, set it to `-1'.
|
||||
if c.MemorySwap != -1 {
|
||||
if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Cgroup) setupCpu(cgroupRoot string, pid int) (err error) {
|
||||
// We always want to join the cpu group, to allow fair cpu scheduling
|
||||
// on a container basis
|
||||
dir, err := c.Join(cgroupRoot, "cpu", pid)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if c.CpuShares != 0 {
|
||||
if err := writeFile(dir, "cpu.shares", strconv.FormatInt(c.CpuShares, 10)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
2
libcontainer/MAINTAINERS
Normal file
2
libcontainer/MAINTAINERS
Normal file
|
@ -0,0 +1,2 @@
|
|||
Michael Crosby <michael@crosbymichael.com> (@crosbymichael)
|
||||
Guillaume Charmes <guillaume@dotcloud.com> (@creack)
|
90
libcontainer/README.md
Normal file
90
libcontainer/README.md
Normal file
|
@ -0,0 +1,90 @@
|
|||
## libcontainer - reference implementation for containers
|
||||
|
||||
#### background
|
||||
|
||||
libcontainer specifies configuration options for what a container is. It provides a native Go implementation
|
||||
for using linux namespaces with no external dependencies. libcontainer provides many convenience functions for working with namespaces, networking, and management.
|
||||
|
||||
|
||||
#### container
|
||||
A container is a self contained directory that is able to run one or more processes without
|
||||
affecting the host system. The directory is usually a full system tree. Inside the directory
|
||||
a `container.json` file is placed with the runtime configuration for how the processes
|
||||
should be contained and run. Environment, networking, and different capabilities for the
|
||||
process are specified in this file. The configuration is used for each process executed inside the container.
|
||||
|
||||
Sample `container.json` file:
|
||||
```json
|
||||
{
|
||||
"hostname": "koye",
|
||||
"tty": true,
|
||||
"environment": [
|
||||
"HOME=/",
|
||||
"PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin",
|
||||
"container=docker",
|
||||
"TERM=xterm-256color"
|
||||
],
|
||||
"namespaces": [
|
||||
"NEWIPC",
|
||||
"NEWNS",
|
||||
"NEWPID",
|
||||
"NEWUTS",
|
||||
"NEWNET"
|
||||
],
|
||||
"capabilities": [
|
||||
"SETPCAP",
|
||||
"SYS_MODULE",
|
||||
"SYS_RAWIO",
|
||||
"SYS_PACCT",
|
||||
"SYS_ADMIN",
|
||||
"SYS_NICE",
|
||||
"SYS_RESOURCE",
|
||||
"SYS_TIME",
|
||||
"SYS_TTY_CONFIG",
|
||||
"MKNOD",
|
||||
"AUDIT_WRITE",
|
||||
"AUDIT_CONTROL",
|
||||
"MAC_OVERRIDE",
|
||||
"MAC_ADMIN",
|
||||
"NET_ADMIN"
|
||||
],
|
||||
"networks": [{
|
||||
"type": "veth",
|
||||
"context": {
|
||||
"bridge": "docker0",
|
||||
"prefix": "dock"
|
||||
},
|
||||
"address": "172.17.0.100/16",
|
||||
"gateway": "172.17.42.1",
|
||||
"mtu": 1500
|
||||
}
|
||||
],
|
||||
"cgroups": {
|
||||
"name": "docker-koye",
|
||||
"parent": "docker",
|
||||
"memory": 5248000
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Using this configuration and the current directory holding the rootfs for a process, one can use libcontainer to exec the container. During the life of the namespace, a `pid` file
|
||||
is written to the current directory with the pid of the namespaced process to the external world. A client can use this pid to wait, kill, or perform other operations with the container. If a user tries to run a new process inside an existing container with a live namespace, the namespace will be joined by the new process.
|
||||
|
||||
|
||||
You may also specify an alternate root place where the `container.json` file is read and where the `pid` file will be saved.
|
||||
|
||||
#### nsinit
|
||||
|
||||
`nsinit` is a cli application used as the reference implementation of libcontainer. It is able to
|
||||
spawn or join new containers given the current directory. To use `nsinit` cd into a linux
|
||||
rootfs and copy a `container.json` file into the directory with your specified configuration.
|
||||
|
||||
To execute `/bin/bash` in the current directory as a container just run:
|
||||
```bash
|
||||
nsinit exec /bin/bash
|
||||
```
|
||||
|
||||
If you wish to spawn another process inside the container while your current bash session is
|
||||
running just run the exact same command again to get another bash shell or change the command. If the original process dies, PID 1, all other processes spawned inside the container will also be killed and the namespace will be removed.
|
||||
|
||||
You can identify if a process is running in a container by looking to see if `pid` is in the root of the directory.
|
17
libcontainer/TODO.md
Normal file
17
libcontainer/TODO.md
Normal file
|
@ -0,0 +1,17 @@
|
|||
#### goals
|
||||
* small and simple - line count is not everything but less code is better
|
||||
* clean lines between what we do in the pkg
|
||||
* provide primitives for working with namespaces not cater to every option
|
||||
* extend via configuration not by features - host networking, no networking, veth network can be accomplished via adjusting the container.json, nothing to do with code
|
||||
|
||||
#### tasks
|
||||
* proper tty for a new process in an existing container
|
||||
* use exec or raw syscalls for new process in existing container
|
||||
* setup proper user in namespace if specified
|
||||
* implement hook or clean interface for cgroups
|
||||
* example configs for different setups (host networking, boot init)
|
||||
* improve pkg documentation with comments
|
||||
* testing - this is hard in a low level pkg but we could do some, maybe
|
||||
* pivot root
|
||||
* selinux
|
||||
* apparmor
|
33
libcontainer/capabilities/capabilities.go
Normal file
33
libcontainer/capabilities/capabilities.go
Normal file
|
@ -0,0 +1,33 @@
|
|||
package capabilities
|
||||
|
||||
import (
|
||||
"github.com/dotcloud/docker/pkg/libcontainer"
|
||||
"github.com/syndtr/gocapability/capability"
|
||||
"os"
|
||||
)
|
||||
|
||||
// DropCapabilities drops capabilities for the current process based
|
||||
// on the container's configuration.
|
||||
func DropCapabilities(container *libcontainer.Container) error {
|
||||
if drop := getCapabilities(container); len(drop) > 0 {
|
||||
c, err := capability.NewPid(os.Getpid())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
c.Unset(capability.CAPS|capability.BOUNDS, drop...)
|
||||
|
||||
if err := c.Apply(capability.CAPS | capability.BOUNDS); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// getCapabilities returns the specific cap values for the libcontainer types
|
||||
func getCapabilities(container *libcontainer.Container) []capability.Cap {
|
||||
drop := []capability.Cap{}
|
||||
for _, c := range container.Capabilities {
|
||||
drop = append(drop, c.Value)
|
||||
}
|
||||
return drop
|
||||
}
|
36
libcontainer/container.go
Normal file
36
libcontainer/container.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
package libcontainer
|
||||
|
||||
import (
|
||||
"github.com/dotcloud/docker/pkg/cgroups"
|
||||
)
|
||||
|
||||
// Context is a generic key value pair that allows
|
||||
// arbitrary data to be sent
|
||||
type Context map[string]string
|
||||
|
||||
// Container defines configuration options for how a
|
||||
// container is setup inside a directory and how a process should be executed
|
||||
type Container struct {
|
||||
Hostname string `json:"hostname,omitempty"` // hostname
|
||||
ReadonlyFs bool `json:"readonly_fs,omitempty"` // set the containers rootfs as readonly
|
||||
User string `json:"user,omitempty"` // user to execute the process as
|
||||
WorkingDir string `json:"working_dir,omitempty"` // current working directory
|
||||
Env []string `json:"environment,omitempty"` // environment to set
|
||||
Tty bool `json:"tty,omitempty"` // setup a proper tty or not
|
||||
Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply
|
||||
Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop
|
||||
Networks []*Network `json:"networks,omitempty"` // nil for host's network stack
|
||||
Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"`
|
||||
}
|
||||
|
||||
// Network defines configuration for a container's networking stack
|
||||
//
|
||||
// The network configuration can be omited from a container causing the
|
||||
// container to be setup with the host's networking stack
|
||||
type Network struct {
|
||||
Type string `json:"type,omitempty"` // type of networking to setup i.e. veth, macvlan, etc
|
||||
Context Context `json:"context,omitempty"` // generic context for type specific networking options
|
||||
Address string `json:"address,omitempty"`
|
||||
Gateway string `json:"gateway,omitempty"`
|
||||
Mtu int `json:"mtu,omitempty"`
|
||||
}
|
50
libcontainer/container.json
Normal file
50
libcontainer/container.json
Normal file
|
@ -0,0 +1,50 @@
|
|||
{
|
||||
"hostname": "koye",
|
||||
"tty": true,
|
||||
"environment": [
|
||||
"HOME=/",
|
||||
"PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin",
|
||||
"container=docker",
|
||||
"TERM=xterm-256color"
|
||||
],
|
||||
"namespaces": [
|
||||
"NEWIPC",
|
||||
"NEWNS",
|
||||
"NEWPID",
|
||||
"NEWUTS",
|
||||
"NEWNET"
|
||||
],
|
||||
"capabilities": [
|
||||
"SETPCAP",
|
||||
"SYS_MODULE",
|
||||
"SYS_RAWIO",
|
||||
"SYS_PACCT",
|
||||
"SYS_ADMIN",
|
||||
"SYS_NICE",
|
||||
"SYS_RESOURCE",
|
||||
"SYS_TIME",
|
||||
"SYS_TTY_CONFIG",
|
||||
"MKNOD",
|
||||
"AUDIT_WRITE",
|
||||
"AUDIT_CONTROL",
|
||||
"MAC_OVERRIDE",
|
||||
"MAC_ADMIN",
|
||||
"NET_ADMIN"
|
||||
],
|
||||
"networks": [{
|
||||
"type": "veth",
|
||||
"context": {
|
||||
"bridge": "docker0",
|
||||
"prefix": "dock"
|
||||
},
|
||||
"address": "172.17.0.100/16",
|
||||
"gateway": "172.17.42.1",
|
||||
"mtu": 1500
|
||||
}
|
||||
],
|
||||
"cgroups": {
|
||||
"name": "docker-koye",
|
||||
"parent": "docker",
|
||||
"memory": 5248000
|
||||
}
|
||||
}
|
78
libcontainer/network/network.go
Normal file
78
libcontainer/network/network.go
Normal file
|
@ -0,0 +1,78 @@
|
|||
package network
|
||||
|
||||
import (
|
||||
"github.com/dotcloud/docker/pkg/netlink"
|
||||
"net"
|
||||
)
|
||||
|
||||
func InterfaceUp(name string) error {
|
||||
iface, err := net.InterfaceByName(name)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return netlink.NetworkLinkUp(iface)
|
||||
}
|
||||
|
||||
func InterfaceDown(name string) error {
|
||||
iface, err := net.InterfaceByName(name)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return netlink.NetworkLinkDown(iface)
|
||||
}
|
||||
|
||||
func ChangeInterfaceName(old, newName string) error {
|
||||
iface, err := net.InterfaceByName(old)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return netlink.NetworkChangeName(iface, newName)
|
||||
}
|
||||
|
||||
func CreateVethPair(name1, name2 string) error {
|
||||
return netlink.NetworkCreateVethPair(name1, name2)
|
||||
}
|
||||
|
||||
func SetInterfaceInNamespacePid(name string, nsPid int) error {
|
||||
iface, err := net.InterfaceByName(name)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return netlink.NetworkSetNsPid(iface, nsPid)
|
||||
}
|
||||
|
||||
func SetInterfaceMaster(name, master string) error {
|
||||
iface, err := net.InterfaceByName(name)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
masterIface, err := net.InterfaceByName(master)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return netlink.NetworkSetMaster(iface, masterIface)
|
||||
}
|
||||
|
||||
func SetDefaultGateway(ip string) error {
|
||||
return netlink.AddDefaultGw(net.ParseIP(ip))
|
||||
}
|
||||
|
||||
func SetInterfaceIp(name string, rawIp string) error {
|
||||
iface, err := net.InterfaceByName(name)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
ip, ipNet, err := net.ParseCIDR(rawIp)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return netlink.NetworkLinkAddIp(iface, ip, ipNet)
|
||||
}
|
||||
|
||||
func SetMtu(name string, mtu int) error {
|
||||
iface, err := net.InterfaceByName(name)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return netlink.NetworkSetMTU(iface, mtu)
|
||||
}
|
32
libcontainer/network/strategy.go
Normal file
32
libcontainer/network/strategy.go
Normal file
|
@ -0,0 +1,32 @@
|
|||
package network
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"github.com/dotcloud/docker/pkg/libcontainer"
|
||||
)
|
||||
|
||||
var (
|
||||
ErrNotValidStrategyType = errors.New("not a valid network strategy type")
|
||||
)
|
||||
|
||||
var strategies = map[string]NetworkStrategy{
|
||||
"veth": &Veth{},
|
||||
}
|
||||
|
||||
// NetworkStrategy represents a specific network configuration for
|
||||
// a container's networking stack
|
||||
type NetworkStrategy interface {
|
||||
Create(*libcontainer.Network, int, libcontainer.Context) error
|
||||
Initialize(*libcontainer.Network, libcontainer.Context) error
|
||||
}
|
||||
|
||||
// GetStrategy returns the specific network strategy for the
|
||||
// provided type. If no strategy is registered for the type an
|
||||
// ErrNotValidStrategyType is returned.
|
||||
func GetStrategy(tpe string) (NetworkStrategy, error) {
|
||||
s, exists := strategies[tpe]
|
||||
if !exists {
|
||||
return nil, ErrNotValidStrategyType
|
||||
}
|
||||
return s, nil
|
||||
}
|
100
libcontainer/network/veth.go
Normal file
100
libcontainer/network/veth.go
Normal file
|
@ -0,0 +1,100 @@
|
|||
package network
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/dotcloud/docker/pkg/libcontainer"
|
||||
"github.com/dotcloud/docker/pkg/libcontainer/utils"
|
||||
)
|
||||
|
||||
// Veth is a bridge-based network strategy. It creates a veth pair: one end
// stays on the host, the other end is moved into the container's namespace.
type Veth struct{}
|
||||
|
||||
func (v *Veth) Create(n *libcontainer.Network, nspid int, context libcontainer.Context) error {
|
||||
var (
|
||||
bridge string
|
||||
prefix string
|
||||
exists bool
|
||||
)
|
||||
if bridge, exists = n.Context["bridge"]; !exists {
|
||||
return fmt.Errorf("bridge does not exist in network context")
|
||||
}
|
||||
if prefix, exists = n.Context["prefix"]; !exists {
|
||||
return fmt.Errorf("veth prefix does not exist in network context")
|
||||
}
|
||||
name1, name2, err := createVethPair(prefix)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
context["veth-host"] = name1
|
||||
context["veth-child"] = name2
|
||||
if err := SetInterfaceMaster(name1, bridge); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := SetMtu(name1, n.Mtu); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := InterfaceUp(name1); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := SetInterfaceInNamespacePid(name2, nspid); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (v *Veth) Initialize(config *libcontainer.Network, context libcontainer.Context) error {
|
||||
var (
|
||||
vethChild string
|
||||
exists bool
|
||||
)
|
||||
if vethChild, exists = context["veth-child"]; !exists {
|
||||
return fmt.Errorf("vethChild does not exist in network context")
|
||||
}
|
||||
if err := InterfaceDown(vethChild); err != nil {
|
||||
return fmt.Errorf("interface down %s %s", vethChild, err)
|
||||
}
|
||||
if err := ChangeInterfaceName(vethChild, "eth0"); err != nil {
|
||||
return fmt.Errorf("change %s to eth0 %s", vethChild, err)
|
||||
}
|
||||
if err := SetInterfaceIp("eth0", config.Address); err != nil {
|
||||
return fmt.Errorf("set eth0 ip %s", err)
|
||||
}
|
||||
if err := SetMtu("eth0", config.Mtu); err != nil {
|
||||
return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err)
|
||||
}
|
||||
if err := InterfaceUp("eth0"); err != nil {
|
||||
return fmt.Errorf("eth0 up %s", err)
|
||||
}
|
||||
if err := SetMtu("lo", config.Mtu); err != nil {
|
||||
return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err)
|
||||
}
|
||||
if err := InterfaceUp("lo"); err != nil {
|
||||
return fmt.Errorf("lo up %s", err)
|
||||
}
|
||||
if config.Gateway != "" {
|
||||
if err := SetDefaultGateway(config.Gateway); err != nil {
|
||||
return fmt.Errorf("set gateway to %s %s", config.Gateway, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// createVethPair will automatically generage two random names for
|
||||
// the veth pair and ensure that they have been created
|
||||
func createVethPair(prefix string) (name1 string, name2 string, err error) {
|
||||
name1, err = utils.GenerateRandomName(prefix, 4)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
name2, err = utils.GenerateRandomName(prefix, 4)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if err = CreateVethPair(name1, name2); err != nil {
|
||||
return
|
||||
}
|
||||
return
|
||||
}
|
45
libcontainer/nsinit/command.go
Normal file
45
libcontainer/nsinit/command.go
Normal file
|
@ -0,0 +1,45 @@
|
|||
package nsinit
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/dotcloud/docker/pkg/libcontainer"
|
||||
"github.com/dotcloud/docker/pkg/system"
|
||||
"os"
|
||||
"os/exec"
|
||||
)
|
||||
|
||||
// CommandFactory takes the container's configuration and options passed by the
|
||||
// parent processes and creates an *exec.Cmd that will be used to fork/exec the
|
||||
// namespaced init process
|
||||
type CommandFactory interface {
|
||||
Create(container *libcontainer.Container, console string, syncFd uintptr, args []string) *exec.Cmd
|
||||
}
|
||||
|
||||
// DefaultCommandFactory is a CommandFactory that re-executes the current
// binary as the init process.
type DefaultCommandFactory struct {
	Root string // passed to the init process as the value of the -root flag
}
|
||||
|
||||
// Create will return an exec.Cmd with the Cloneflags set to the proper namespaces
|
||||
// defined on the container's configuration and use the current binary as the init with the
|
||||
// args provided
|
||||
func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console string, pipe uintptr, args []string) *exec.Cmd {
|
||||
// get our binary name from arg0 so we can always reexec ourself
|
||||
command := exec.Command(os.Args[0], append([]string{
|
||||
"-console", console,
|
||||
"-pipe", fmt.Sprint(pipe),
|
||||
"-root", c.Root,
|
||||
"init"}, args...)...)
|
||||
|
||||
system.SetCloneFlags(command, uintptr(GetNamespaceFlags(container.Namespaces)))
|
||||
command.Env = container.Env
|
||||
return command
|
||||
}
|
||||
|
||||
// GetNamespaceFlags parses the container's Namespaces options to set the correct
|
||||
// flags on clone, unshare, and setns
|
||||
func GetNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) {
|
||||
for _, ns := range namespaces {
|
||||
flag |= ns.Value
|
||||
}
|
||||
return flag
|
||||
}
|
96
libcontainer/nsinit/exec.go
Normal file
96
libcontainer/nsinit/exec.go
Normal file
|
@ -0,0 +1,96 @@
|
|||
// +build linux
|
||||
|
||||
package nsinit
|
||||
|
||||
import (
|
||||
"github.com/dotcloud/docker/pkg/libcontainer"
|
||||
"github.com/dotcloud/docker/pkg/libcontainer/network"
|
||||
"github.com/dotcloud/docker/pkg/system"
|
||||
"os"
|
||||
"os/exec"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// Exec performes setup outside of a namespace so that a container can be
|
||||
// executed. Exec is a high level function for working with container namespaces.
|
||||
func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) {
|
||||
var (
|
||||
master *os.File
|
||||
console string
|
||||
err error
|
||||
)
|
||||
|
||||
// create a pipe so that we can syncronize with the namespaced process and
|
||||
// pass the veth name to the child
|
||||
syncPipe, err := NewSyncPipe()
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
|
||||
if container.Tty {
|
||||
master, console, err = system.CreateMasterAndConsole()
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
term.SetMaster(master)
|
||||
}
|
||||
|
||||
command := ns.commandFactory.Create(container, console, syncPipe.child.Fd(), args)
|
||||
if err := term.Attach(command); err != nil {
|
||||
return -1, err
|
||||
}
|
||||
defer term.Close()
|
||||
|
||||
if err := command.Start(); err != nil {
|
||||
return -1, err
|
||||
}
|
||||
if err := ns.stateWriter.WritePid(command.Process.Pid); err != nil {
|
||||
command.Process.Kill()
|
||||
return -1, err
|
||||
}
|
||||
defer ns.stateWriter.DeletePid()
|
||||
|
||||
// Do this before syncing with child so that no children
|
||||
// can escape the cgroup
|
||||
if err := ns.SetupCgroups(container, command.Process.Pid); err != nil {
|
||||
command.Process.Kill()
|
||||
return -1, err
|
||||
}
|
||||
if err := ns.InitializeNetworking(container, command.Process.Pid, syncPipe); err != nil {
|
||||
command.Process.Kill()
|
||||
return -1, err
|
||||
}
|
||||
|
||||
// Sync with child
|
||||
syncPipe.Close()
|
||||
|
||||
if err := command.Wait(); err != nil {
|
||||
if _, ok := err.(*exec.ExitError); !ok {
|
||||
return -1, err
|
||||
}
|
||||
}
|
||||
return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil
|
||||
}
|
||||
|
||||
func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) error {
|
||||
if container.Cgroups != nil {
|
||||
if err := container.Cgroups.Apply(nspid); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error {
|
||||
context := libcontainer.Context{}
|
||||
for _, config := range container.Networks {
|
||||
strategy, err := network.GetStrategy(config.Type)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := strategy.Create(config, nspid, context); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return pipe.SendToChild(context)
|
||||
}
|
94
libcontainer/nsinit/execin.go
Normal file
94
libcontainer/nsinit/execin.go
Normal file
|
@ -0,0 +1,94 @@
|
|||
// +build linux
|
||||
|
||||
package nsinit
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/dotcloud/docker/pkg/libcontainer"
|
||||
"github.com/dotcloud/docker/pkg/system"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// ExecIn uses an existing pid and joins the pid's namespaces with the new command.
|
||||
func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) {
|
||||
for _, ns := range container.Namespaces {
|
||||
if err := system.Unshare(ns.Value); err != nil {
|
||||
return -1, err
|
||||
}
|
||||
}
|
||||
fds, err := ns.getNsFds(nspid, container)
|
||||
closeFds := func() {
|
||||
for _, f := range fds {
|
||||
system.Closefd(f)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
closeFds()
|
||||
return -1, err
|
||||
}
|
||||
|
||||
// foreach namespace fd, use setns to join an existing container's namespaces
|
||||
for _, fd := range fds {
|
||||
if fd > 0 {
|
||||
if err := system.Setns(fd, 0); err != nil {
|
||||
closeFds()
|
||||
return -1, fmt.Errorf("setns %s", err)
|
||||
}
|
||||
}
|
||||
system.Closefd(fd)
|
||||
}
|
||||
|
||||
// if the container has a new pid and mount namespace we need to
|
||||
// remount proc and sys to pick up the changes
|
||||
if container.Namespaces.Contains("NEWNS") && container.Namespaces.Contains("NEWPID") {
|
||||
pid, err := system.Fork()
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
if pid == 0 {
|
||||
// TODO: make all raw syscalls to be fork safe
|
||||
if err := system.Unshare(syscall.CLONE_NEWNS); err != nil {
|
||||
return -1, err
|
||||
}
|
||||
if err := remountProc(); err != nil {
|
||||
return -1, fmt.Errorf("remount proc %s", err)
|
||||
}
|
||||
if err := remountSys(); err != nil {
|
||||
return -1, fmt.Errorf("remount sys %s", err)
|
||||
}
|
||||
goto dropAndExec
|
||||
}
|
||||
proc, err := os.FindProcess(pid)
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
state, err := proc.Wait()
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
os.Exit(state.Sys().(syscall.WaitStatus).ExitStatus())
|
||||
}
|
||||
dropAndExec:
|
||||
if err := finalizeNamespace(container); err != nil {
|
||||
return -1, err
|
||||
}
|
||||
if err := system.Execv(args[0], args[0:], container.Env); err != nil {
|
||||
return -1, err
|
||||
}
|
||||
panic("unreachable")
|
||||
}
|
||||
|
||||
func (ns *linuxNs) getNsFds(pid int, container *libcontainer.Container) ([]uintptr, error) {
|
||||
fds := make([]uintptr, len(container.Namespaces))
|
||||
for i, ns := range container.Namespaces {
|
||||
f, err := os.OpenFile(filepath.Join("/proc/", strconv.Itoa(pid), "ns", ns.File), os.O_RDONLY, 0)
|
||||
if err != nil {
|
||||
return fds, err
|
||||
}
|
||||
fds[i] = f.Fd()
|
||||
}
|
||||
return fds, nil
|
||||
}
|
153
libcontainer/nsinit/init.go
Normal file
153
libcontainer/nsinit/init.go
Normal file
|
@ -0,0 +1,153 @@
|
|||
// +build linux
|
||||
|
||||
package nsinit
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/dotcloud/docker/pkg/libcontainer"
|
||||
"github.com/dotcloud/docker/pkg/libcontainer/capabilities"
|
||||
"github.com/dotcloud/docker/pkg/libcontainer/network"
|
||||
"github.com/dotcloud/docker/pkg/libcontainer/utils"
|
||||
"github.com/dotcloud/docker/pkg/system"
|
||||
"github.com/dotcloud/docker/pkg/user"
|
||||
"os"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// Init is the init process that first runs inside a new namespace to setup mounts, users, networking,
|
||||
// and other options required for the new container.
|
||||
func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error {
|
||||
rootfs, err := utils.ResolveRootfs(uncleanRootfs)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// We always read this as it is a way to sync with the parent as well
|
||||
context, err := syncPipe.ReadFromParent()
|
||||
if err != nil {
|
||||
syncPipe.Close()
|
||||
return err
|
||||
}
|
||||
syncPipe.Close()
|
||||
|
||||
if console != "" {
|
||||
// close pipes so that we can replace it with the pty
|
||||
closeStdPipes()
|
||||
slave, err := system.OpenTerminal(console, syscall.O_RDWR)
|
||||
if err != nil {
|
||||
return fmt.Errorf("open terminal %s", err)
|
||||
}
|
||||
if err := dupSlave(slave); err != nil {
|
||||
return fmt.Errorf("dup2 slave %s", err)
|
||||
}
|
||||
}
|
||||
if _, err := system.Setsid(); err != nil {
|
||||
return fmt.Errorf("setsid %s", err)
|
||||
}
|
||||
if console != "" {
|
||||
if err := system.Setctty(); err != nil {
|
||||
return fmt.Errorf("setctty %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
if err := system.ParentDeathSignal(); err != nil {
|
||||
return fmt.Errorf("parent death signal %s", err)
|
||||
}
|
||||
*/
|
||||
if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil {
|
||||
return fmt.Errorf("setup mount namespace %s", err)
|
||||
}
|
||||
if err := setupNetwork(container, context); err != nil {
|
||||
return fmt.Errorf("setup networking %s", err)
|
||||
}
|
||||
if err := system.Sethostname(container.Hostname); err != nil {
|
||||
return fmt.Errorf("sethostname %s", err)
|
||||
}
|
||||
if err := finalizeNamespace(container); err != nil {
|
||||
return fmt.Errorf("finalize namespace %s", err)
|
||||
}
|
||||
return system.Execv(args[0], args[0:], container.Env)
|
||||
}
|
||||
|
||||
// closeStdPipes closes the process's stdin, stdout and stderr, in that order.
// Errors from Close are ignored.
func closeStdPipes() {
	for _, stream := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
		stream.Close()
	}
}
|
||||
|
||||
func setupUser(container *libcontainer.Container) error {
|
||||
switch container.User {
|
||||
case "root", "":
|
||||
if err := system.Setgroups(nil); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := system.Setresgid(0, 0, 0); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := system.Setresuid(0, 0, 0); err != nil {
|
||||
return err
|
||||
}
|
||||
default:
|
||||
uid, gid, suppGids, err := user.GetUserGroupSupplementary(container.User, syscall.Getuid(), syscall.Getgid())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := system.Setgroups(suppGids); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := system.Setgid(gid); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := system.Setuid(uid); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// dupSlave dup2 the pty slave's fd into stdout and stdin and ensures that
|
||||
// the slave's fd is 0, or stdin
|
||||
func dupSlave(slave *os.File) error {
|
||||
if slave.Fd() != 0 {
|
||||
return fmt.Errorf("slave fd not 0 %d", slave.Fd())
|
||||
}
|
||||
if err := system.Dup2(slave.Fd(), 1); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := system.Dup2(slave.Fd(), 2); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// setupVethNetwork uses the Network config if it is not nil to initialize
|
||||
// the new veth interface inside the container for use by changing the name to eth0
|
||||
// setting the MTU and IP address along with the default gateway
|
||||
func setupNetwork(container *libcontainer.Container, context libcontainer.Context) error {
|
||||
for _, config := range container.Networks {
|
||||
strategy, err := network.GetStrategy(config.Type)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return strategy.Initialize(config, context)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// finalizeNamespace drops the caps and sets the correct user
|
||||
// and working dir before execing the command inside the namespace
|
||||
func finalizeNamespace(container *libcontainer.Container) error {
|
||||
if err := capabilities.DropCapabilities(container); err != nil {
|
||||
return fmt.Errorf("drop capabilities %s", err)
|
||||
}
|
||||
if err := setupUser(container); err != nil {
|
||||
return fmt.Errorf("setup user %s", err)
|
||||
}
|
||||
if container.WorkingDir != "" {
|
||||
if err := system.Chdir(container.WorkingDir); err != nil {
|
||||
return fmt.Errorf("chdir to %s %s", container.WorkingDir, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
254
libcontainer/nsinit/mount.go
Normal file
254
libcontainer/nsinit/mount.go
Normal file
|
@ -0,0 +1,254 @@
|
|||
// +build linux
|
||||
|
||||
package nsinit
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/dotcloud/docker/pkg/system"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// defaultMountFlags are the flags used for the proc, sysfs and shm mounts in
// this file: no setuid binaries, no device nodes, no executables.
const defaultMountFlags = syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC
|
||||
|
||||
// setupNewMountNamespace is used to initialize a new mount namespace for an new
|
||||
// container in the rootfs that is specified.
|
||||
//
|
||||
// There is no need to unmount the new mounts because as soon as the mount namespace
|
||||
// is no longer in use, the mounts will be removed automatically
|
||||
func setupNewMountNamespace(rootfs, console string, readonly bool) error {
|
||||
// mount as slave so that the new mounts do not propagate to the host
|
||||
if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
|
||||
return fmt.Errorf("mounting / as slave %s", err)
|
||||
}
|
||||
if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
|
||||
return fmt.Errorf("mouting %s as bind %s", rootfs, err)
|
||||
}
|
||||
if readonly {
|
||||
if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil {
|
||||
return fmt.Errorf("mounting %s as readonly %s", rootfs, err)
|
||||
}
|
||||
}
|
||||
if err := mountSystem(rootfs); err != nil {
|
||||
return fmt.Errorf("mount system %s", err)
|
||||
}
|
||||
if err := copyDevNodes(rootfs); err != nil {
|
||||
return fmt.Errorf("copy dev nodes %s", err)
|
||||
}
|
||||
if err := setupLoopbackDevices(rootfs); err != nil {
|
||||
return fmt.Errorf("setup loopback devices %s", err)
|
||||
}
|
||||
if err := setupDev(rootfs); err != nil {
|
||||
return err
|
||||
}
|
||||
if console != "" {
|
||||
if err := setupPtmx(rootfs, console); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err := system.Chdir(rootfs); err != nil {
|
||||
return fmt.Errorf("chdir into %s %s", rootfs, err)
|
||||
}
|
||||
if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil {
|
||||
return fmt.Errorf("mount move %s into / %s", rootfs, err)
|
||||
}
|
||||
if err := system.Chroot("."); err != nil {
|
||||
return fmt.Errorf("chroot . %s", err)
|
||||
}
|
||||
if err := system.Chdir("/"); err != nil {
|
||||
return fmt.Errorf("chdir / %s", err)
|
||||
}
|
||||
|
||||
system.Umask(0022)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// copyDevNodes mknods the hosts devices so the new container has access to them
|
||||
func copyDevNodes(rootfs string) error {
|
||||
oldMask := system.Umask(0000)
|
||||
defer system.Umask(oldMask)
|
||||
|
||||
for _, node := range []string{
|
||||
"null",
|
||||
"zero",
|
||||
"full",
|
||||
"random",
|
||||
"urandom",
|
||||
"tty",
|
||||
} {
|
||||
if err := copyDevNode(rootfs, node); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func setupLoopbackDevices(rootfs string) error {
|
||||
for i := 0; ; i++ {
|
||||
var (
|
||||
device = fmt.Sprintf("loop%d", i)
|
||||
source = filepath.Join("/dev", device)
|
||||
dest = filepath.Join(rootfs, "dev", device)
|
||||
)
|
||||
|
||||
if _, err := os.Stat(source); err != nil {
|
||||
if !os.IsNotExist(err) {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat(dest); err == nil {
|
||||
os.Remove(dest)
|
||||
}
|
||||
f, err := os.Create(dest)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
f.Close()
|
||||
if err := system.Mount(source, dest, "none", syscall.MS_BIND, ""); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func copyDevNode(rootfs, node string) error {
|
||||
stat, err := os.Stat(filepath.Join("/dev", node))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var (
|
||||
dest = filepath.Join(rootfs, "dev", node)
|
||||
st = stat.Sys().(*syscall.Stat_t)
|
||||
)
|
||||
if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) {
|
||||
return fmt.Errorf("copy %s %s", node, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// setupDev creates the conventional /dev symlinks inside the container's
// rootfs: core, fd, stdin, stdout and stderr, each pointing at its /proc
// counterpart. Whatever already exists at a link's location is removed first.
func setupDev(rootfs string) error {
	type devLink struct {
		target string // what the symlink points at
		name   string // location of the symlink, relative to rootfs
	}
	links := []devLink{
		{"/proc/kcore", "/dev/core"},
		{"/proc/self/fd", "/dev/fd"},
		{"/proc/self/fd/0", "/dev/stdin"},
		{"/proc/self/fd/1", "/dev/stdout"},
		{"/proc/self/fd/2", "/dev/stderr"},
	}
	for i := range links {
		path := filepath.Join(rootfs, links[i].name)
		if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
			return fmt.Errorf("remove %s %s", path, err)
		}
		if err := os.Symlink(links[i].target, path); err != nil {
			return fmt.Errorf("symlink %s %s", path, err)
		}
	}
	return nil
}
|
||||
|
||||
// setupConsole ensures that the container has a proper /dev/console setup
|
||||
func setupConsole(rootfs, console string) error {
|
||||
oldMask := system.Umask(0000)
|
||||
defer system.Umask(oldMask)
|
||||
|
||||
stat, err := os.Stat(console)
|
||||
if err != nil {
|
||||
return fmt.Errorf("stat console %s %s", console, err)
|
||||
}
|
||||
var (
|
||||
st = stat.Sys().(*syscall.Stat_t)
|
||||
dest = filepath.Join(rootfs, "dev/console")
|
||||
)
|
||||
if err := os.Remove(dest); err != nil && !os.IsNotExist(err) {
|
||||
return fmt.Errorf("remove %s %s", dest, err)
|
||||
}
|
||||
if err := os.Chmod(console, 0600); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.Chown(console, 0, 0); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil {
|
||||
return fmt.Errorf("mknod %s %s", dest, err)
|
||||
}
|
||||
if err := system.Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil {
|
||||
return fmt.Errorf("bind %s to %s %s", console, dest, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts
|
||||
// inside the mount namespace
|
||||
func mountSystem(rootfs string) error {
|
||||
for _, m := range []struct {
|
||||
source string
|
||||
path string
|
||||
device string
|
||||
flags int
|
||||
data string
|
||||
}{
|
||||
{source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags},
|
||||
{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags},
|
||||
{source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: "mode=755"},
|
||||
{source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: "mode=1777"},
|
||||
{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"},
|
||||
{source: "tmpfs", path: filepath.Join(rootfs, "run"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_STRICTATIME, data: "mode=755"},
|
||||
} {
|
||||
if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) {
|
||||
return fmt.Errorf("mkdirall %s %s", m.path, err)
|
||||
}
|
||||
if err := system.Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil {
|
||||
return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// setupPtmx adds a symlink to pts/ptmx for /dev/ptmx and
|
||||
// finishes setting up /dev/console
|
||||
func setupPtmx(rootfs, console string) error {
|
||||
ptmx := filepath.Join(rootfs, "dev/ptmx")
|
||||
if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
|
||||
return err
|
||||
}
|
||||
if err := os.Symlink("pts/ptmx", ptmx); err != nil {
|
||||
return fmt.Errorf("symlink dev ptmx %s", err)
|
||||
}
|
||||
if err := setupConsole(rootfs, console); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// remountProc is used to detach and remount the proc filesystem
|
||||
// commonly needed with running a new process inside an existing container
|
||||
func remountProc() error {
|
||||
if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := system.Mount("proc", "/proc", "proc", uintptr(defaultMountFlags), ""); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func remountSys() error {
|
||||
if err := system.Unmount("/sys", syscall.MNT_DETACH); err != nil {
|
||||
if err != syscall.EINVAL {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaultMountFlags), ""); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
26
libcontainer/nsinit/nsinit.go
Normal file
26
libcontainer/nsinit/nsinit.go
Normal file
|
@ -0,0 +1,26 @@
|
|||
package nsinit
|
||||
|
||||
import (
|
||||
"github.com/dotcloud/docker/pkg/libcontainer"
|
||||
)
|
||||
|
||||
// NsInit is an interface with the public facing methods to provide high level
|
||||
// exec operations on a container
|
||||
type NsInit interface {
|
||||
Exec(container *libcontainer.Container, term Terminal, args []string) (int, error)
|
||||
ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error)
|
||||
Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error
|
||||
}
|
||||
|
||||
type linuxNs struct {
|
||||
root string
|
||||
commandFactory CommandFactory
|
||||
stateWriter StateWriter
|
||||
}
|
||||
|
||||
func NewNsInit(command CommandFactory, state StateWriter) NsInit {
|
||||
return &linuxNs{
|
||||
commandFactory: command,
|
||||
stateWriter: state,
|
||||
}
|
||||
}
|
110
libcontainer/nsinit/nsinit/main.go
Normal file
110
libcontainer/nsinit/nsinit/main.go
Normal file
|
@ -0,0 +1,110 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"github.com/dotcloud/docker/pkg/libcontainer"
|
||||
"github.com/dotcloud/docker/pkg/libcontainer/nsinit"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// Command line options, populated by registerFlags.
var (
	// root is the directory holding container.json and the pid file.
	root string
	// console is the path of the pty slave, empty when no tty is used.
	console string
	// pipeFd is the descriptor number of the sync pipe passed to "init".
	pipeFd int
)
|
||||
|
||||
func registerFlags() {
|
||||
flag.StringVar(&console, "console", "", "console (pty slave) path")
|
||||
flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd")
|
||||
flag.StringVar(&root, "root", ".", "root for storing configuration data")
|
||||
|
||||
flag.Parse()
|
||||
}
|
||||
|
||||
func main() {
|
||||
registerFlags()
|
||||
|
||||
if flag.NArg() < 1 {
|
||||
log.Fatalf("wrong number of argments %d", flag.NArg())
|
||||
}
|
||||
container, err := loadContainer()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
ns, err := newNsInit()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
switch flag.Arg(0) {
|
||||
case "exec": // this is executed outside of the namespace in the cwd
|
||||
var exitCode int
|
||||
nspid, err := readPid()
|
||||
if err != nil {
|
||||
if !os.IsNotExist(err) {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
if nspid > 0 {
|
||||
exitCode, err = ns.ExecIn(container, nspid, flag.Args()[1:])
|
||||
} else {
|
||||
term := nsinit.NewTerminal(os.Stdin, os.Stdout, os.Stderr, container.Tty)
|
||||
exitCode, err = ns.Exec(container, term, flag.Args()[1:])
|
||||
}
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
os.Exit(exitCode)
|
||||
case "init": // this is executed inside of the namespace to setup the container
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
if flag.NArg() < 2 {
|
||||
log.Fatalf("wrong number of argments %d", flag.NArg())
|
||||
}
|
||||
syncPipe, err := nsinit.NewSyncPipeFromFd(0, uintptr(pipeFd))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
if err := ns.Init(container, cwd, console, syncPipe, flag.Args()[1:]); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
default:
|
||||
log.Fatalf("command not supported for nsinit %s", flag.Arg(0))
|
||||
}
|
||||
}
|
||||
|
||||
func loadContainer() (*libcontainer.Container, error) {
|
||||
f, err := os.Open(filepath.Join(root, "container.json"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var container *libcontainer.Container
|
||||
if err := json.NewDecoder(f).Decode(&container); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return container, nil
|
||||
}
|
||||
|
||||
func readPid() (int, error) {
|
||||
data, err := ioutil.ReadFile(filepath.Join(root, "pid"))
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
pid, err := strconv.Atoi(string(data))
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
return pid, nil
|
||||
}
|
||||
|
||||
func newNsInit() (nsinit.NsInit, error) {
|
||||
return nsinit.NewNsInit(&nsinit.DefaultCommandFactory{root}, &nsinit.DefaultStateWriter{root}), nil
|
||||
}
|
28
libcontainer/nsinit/state.go
Normal file
28
libcontainer/nsinit/state.go
Normal file
|
@ -0,0 +1,28 @@
|
|||
package nsinit
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// StateWriter handles writing and deleting the pid file on disk.
type StateWriter interface {
	// WritePid records pid.
	WritePid(pid int) error
	// DeletePid removes the recorded pid.
	DeletePid() error
}
|
||||
|
||||
// DefaultStateWriter keeps the container's pid in a file named "pid"
// directly under Root.
type DefaultStateWriter struct {
	Root string
}

// WritePid writes the namespaced process's pid, in decimal and without a
// trailing newline, to <Root>/pid. The file is created as a plain data file
// (mode 0644); it was previously created with 0655, which set execute bits
// for group and others.
func (d *DefaultStateWriter) WritePid(pid int) error {
	return ioutil.WriteFile(filepath.Join(d.Root, "pid"), []byte(fmt.Sprint(pid)), 0644)
}

// DeletePid removes <Root>/pid. It returns the error from os.Remove, which
// includes the case where the file does not exist.
func (d *DefaultStateWriter) DeletePid() error {
	return os.Remove(filepath.Join(d.Root, "pid"))
}
|
73
libcontainer/nsinit/sync_pipe.go
Normal file
73
libcontainer/nsinit/sync_pipe.go
Normal file
|
@ -0,0 +1,73 @@
|
|||
package nsinit
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"github.com/dotcloud/docker/pkg/libcontainer"
|
||||
"github.com/dotcloud/docker/pkg/system"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
)
|
||||
|
||||
// SyncPipe allows communication between a child process and its parent and
// lets the two independent processes synchronize their state.
type SyncPipe struct {
	// parent is the end written to by SendToChild.
	parent *os.File
	// child is the end read from by ReadFromParent.
	child *os.File
}
|
||||
|
||||
func NewSyncPipe() (s *SyncPipe, err error) {
|
||||
s = &SyncPipe{}
|
||||
s.child, s.parent, err = os.Pipe()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
system.UsetCloseOnExec(s.child.Fd())
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func NewSyncPipeFromFd(parendFd, childFd uintptr) (*SyncPipe, error) {
|
||||
s := &SyncPipe{}
|
||||
if parendFd > 0 {
|
||||
s.parent = os.NewFile(parendFd, "parendPipe")
|
||||
} else if childFd > 0 {
|
||||
s.child = os.NewFile(childFd, "childPipe")
|
||||
} else {
|
||||
return nil, fmt.Errorf("no valid sync pipe fd specified")
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func (s *SyncPipe) SendToChild(context libcontainer.Context) error {
|
||||
data, err := json.Marshal(context)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.parent.Write(data)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *SyncPipe) ReadFromParent() (libcontainer.Context, error) {
|
||||
data, err := ioutil.ReadAll(s.child)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading from sync pipe %s", err)
|
||||
}
|
||||
var context libcontainer.Context
|
||||
if len(data) > 0 {
|
||||
if err := json.Unmarshal(data, &context); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return context, nil
|
||||
|
||||
}
|
||||
|
||||
func (s *SyncPipe) Close() error {
|
||||
if s.parent != nil {
|
||||
s.parent.Close()
|
||||
}
|
||||
if s.child != nil {
|
||||
s.child.Close()
|
||||
}
|
||||
return nil
|
||||
}
|
118
libcontainer/nsinit/term.go
Normal file
118
libcontainer/nsinit/term.go
Normal file
|
@ -0,0 +1,118 @@
|
|||
package nsinit
|
||||
|
||||
import (
|
||||
"github.com/dotcloud/docker/pkg/term"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
)
|
||||
|
||||
// Terminal abstracts how a container command's stdio is connected to the
// caller, with or without a pty.
type Terminal interface {
	io.Closer

	// SetMaster hands the pty master to the terminal.
	SetMaster(*os.File)
	// Attach connects the terminal's streams to the command.
	Attach(*exec.Cmd) error
	// Resize sets the terminal size to h rows by w columns.
	Resize(h, w int) error
}
|
||||
|
||||
func NewTerminal(stdin io.Reader, stdout, stderr io.Writer, tty bool) Terminal {
|
||||
if tty {
|
||||
return &TtyTerminal{
|
||||
stdin: stdin,
|
||||
stdout: stdout,
|
||||
stderr: stderr,
|
||||
}
|
||||
}
|
||||
return &StdTerminal{
|
||||
stdin: stdin,
|
||||
stdout: stdout,
|
||||
stderr: stderr,
|
||||
}
|
||||
}
|
||||
|
||||
type TtyTerminal struct {
|
||||
stdin io.Reader
|
||||
stdout, stderr io.Writer
|
||||
master *os.File
|
||||
state *term.State
|
||||
}
|
||||
|
||||
func (t *TtyTerminal) Resize(h, w int) error {
|
||||
return term.SetWinsize(t.master.Fd(), &term.Winsize{Height: uint16(h), Width: uint16(w)})
|
||||
}
|
||||
|
||||
func (t *TtyTerminal) SetMaster(master *os.File) {
|
||||
t.master = master
|
||||
}
|
||||
|
||||
func (t *TtyTerminal) Attach(command *exec.Cmd) error {
|
||||
go io.Copy(t.stdout, t.master)
|
||||
go io.Copy(t.master, t.stdin)
|
||||
|
||||
state, err := t.setupWindow(t.master, os.Stdin)
|
||||
if err != nil {
|
||||
command.Process.Kill()
|
||||
return err
|
||||
}
|
||||
t.state = state
|
||||
return err
|
||||
}
|
||||
|
||||
// SetupWindow gets the parent window size and sets the master
|
||||
// pty to the current size and set the parents mode to RAW
|
||||
func (t *TtyTerminal) setupWindow(master, parent *os.File) (*term.State, error) {
|
||||
ws, err := term.GetWinsize(parent.Fd())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := term.SetWinsize(master.Fd(), ws); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return term.SetRawTerminal(parent.Fd())
|
||||
}
|
||||
|
||||
func (t *TtyTerminal) Close() error {
|
||||
term.RestoreTerminal(os.Stdin.Fd(), t.state)
|
||||
return t.master.Close()
|
||||
}
|
||||
|
||||
// StdTerminal is the Terminal used when no pty is requested; it connects the
// command's stdio pipes directly to the configured streams.
type StdTerminal struct {
	stdin  io.Reader
	stdout io.Writer
	stderr io.Writer
}

// SetMaster is a no-op: there is no pty master without a tty.
func (s *StdTerminal) SetMaster(*os.File) {
}

// Close is a no-op and always returns nil.
func (s *StdTerminal) Close() error {
	return nil
}

// Resize is a no-op and always returns nil.
func (s *StdTerminal) Resize(h, w int) error {
	return nil
}

// Attach requests the command's stdin, stdout and stderr pipes and starts
// goroutines copying between them and the terminal's streams. The command's
// stdin pipe is closed once the terminal's stdin is exhausted. An error from
// creating any of the pipes is returned before any copying starts.
func (s *StdTerminal) Attach(command *exec.Cmd) error {
	toChild, err := command.StdinPipe()
	if err != nil {
		return err
	}
	fromChildOut, err := command.StdoutPipe()
	if err != nil {
		return err
	}
	fromChildErr, err := command.StderrPipe()
	if err != nil {
		return err
	}

	go func() {
		defer toChild.Close()
		io.Copy(toChild, s.stdin)
	}()
	go io.Copy(s.stdout, fromChildOut)
	go io.Copy(s.stderr, fromChildErr)

	return nil
}
|
19
libcontainer/nsinit/unsupported.go
Normal file
19
libcontainer/nsinit/unsupported.go
Normal file
|
@ -0,0 +1,19 @@
|
|||
// +build !linux
|
||||
|
||||
package nsinit
|
||||
|
||||
import (
|
||||
"github.com/dotcloud/docker/pkg/libcontainer"
|
||||
)
|
||||
|
||||
func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) {
|
||||
return -1, libcontainer.ErrUnsupported
|
||||
}
|
||||
|
||||
func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) {
|
||||
return -1, libcontainer.ErrUnsupported
|
||||
}
|
||||
|
||||
func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error {
|
||||
return libcontainer.ErrUnsupported
|
||||
}
|
134
libcontainer/types.go
Normal file
134
libcontainer/types.go
Normal file
|
@ -0,0 +1,134 @@
|
|||
package libcontainer
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"github.com/syndtr/gocapability/capability"
|
||||
"os"
|
||||
)
|
||||
|
||||
// Sentinel errors returned by this package.
var (
	// ErrUnkownNamespace is returned when a namespace name is not registered.
	ErrUnkownNamespace = errors.New("Unknown namespace")
	// ErrUnkownCapability is returned when a capability name is not registered.
	ErrUnkownCapability = errors.New("Unknown capability")
	// ErrUnsupported is returned by operations that are not implemented on
	// the current platform.
	ErrUnsupported = errors.New("Unsupported method")
)
|
||||
|
||||
// namespaceList is used to convert the libcontainer types
|
||||
// into the names of the files located in /proc/<pid>/ns/* for
|
||||
// each namespace
|
||||
var (
|
||||
namespaceList = Namespaces{}
|
||||
|
||||
capabilityList = Capabilities{
|
||||
{Key: "SETPCAP", Value: capability.CAP_SETPCAP},
|
||||
{Key: "SYS_MODULE", Value: capability.CAP_SYS_MODULE},
|
||||
{Key: "SYS_RAWIO", Value: capability.CAP_SYS_RAWIO},
|
||||
{Key: "SYS_PACCT", Value: capability.CAP_SYS_PACCT},
|
||||
{Key: "SYS_ADMIN", Value: capability.CAP_SYS_ADMIN},
|
||||
{Key: "SYS_NICE", Value: capability.CAP_SYS_NICE},
|
||||
{Key: "SYS_RESOURCE", Value: capability.CAP_SYS_RESOURCE},
|
||||
{Key: "SYS_TIME", Value: capability.CAP_SYS_TIME},
|
||||
{Key: "SYS_TTY_CONFIG", Value: capability.CAP_SYS_TTY_CONFIG},
|
||||
{Key: "MKNOD", Value: capability.CAP_MKNOD},
|
||||
{Key: "AUDIT_WRITE", Value: capability.CAP_AUDIT_WRITE},
|
||||
{Key: "AUDIT_CONTROL", Value: capability.CAP_AUDIT_CONTROL},
|
||||
{Key: "MAC_OVERRIDE", Value: capability.CAP_MAC_OVERRIDE},
|
||||
{Key: "MAC_ADMIN", Value: capability.CAP_MAC_ADMIN},
|
||||
{Key: "NET_ADMIN", Value: capability.CAP_NET_ADMIN},
|
||||
}
|
||||
)
|
||||
|
||||
// Namespace describes one kernel namespace.
type Namespace struct {
	// Key is the libcontainer name, e.g. "NEWNS".
	Key string
	// Value is the clone flag for the namespace.
	Value int
	// File is the entry name below /proc/<pid>/ns.
	File string
}

// Namespaces is a list of namespaces.
type Namespaces []*Namespace

// String returns the namespace's key.
func (ns *Namespace) String() string {
	return ns.Key
}

// MarshalJSON encodes the namespace as a JSON string holding only its key.
func (ns *Namespace) MarshalJSON() ([]byte, error) {
	key := ns.Key
	return json.Marshal(key)
}
|
||||
|
||||
func (ns *Namespace) UnmarshalJSON(src []byte) error {
|
||||
var nsName string
|
||||
if err := json.Unmarshal(src, &nsName); err != nil {
|
||||
return err
|
||||
}
|
||||
ret := GetNamespace(nsName)
|
||||
if ret == nil {
|
||||
return ErrUnkownNamespace
|
||||
}
|
||||
*ns = *ret
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetNamespace(key string) *Namespace {
|
||||
for _, ns := range namespaceList {
|
||||
if ns.Key == key {
|
||||
return ns
|
||||
}
|
||||
}
|
||||
if os.Getenv("DEBUG") != "" {
|
||||
panic("Unreachable: Namespace not found")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Contains returns true if the specified Namespace is
|
||||
// in the slice
|
||||
func (n Namespaces) Contains(ns string) bool {
|
||||
return GetNamespace(ns) != nil
|
||||
}
|
||||
|
||||
type (
|
||||
Capability struct {
|
||||
Key string
|
||||
Value capability.Cap
|
||||
}
|
||||
Capabilities []*Capability
|
||||
)
|
||||
|
||||
func (c *Capability) String() string {
|
||||
return c.Key
|
||||
}
|
||||
|
||||
func (c *Capability) MarshalJSON() ([]byte, error) {
|
||||
return json.Marshal(c.Key)
|
||||
}
|
||||
|
||||
func (c *Capability) UnmarshalJSON(src []byte) error {
|
||||
var capName string
|
||||
if err := json.Unmarshal(src, &capName); err != nil {
|
||||
return err
|
||||
}
|
||||
ret := GetCapability(capName)
|
||||
if ret == nil {
|
||||
return ErrUnkownCapability
|
||||
}
|
||||
*c = *ret
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetCapability(key string) *Capability {
|
||||
for _, capp := range capabilityList {
|
||||
if capp.Key == key {
|
||||
return capp
|
||||
}
|
||||
}
|
||||
if os.Getenv("DEBUG") != "" {
|
||||
panic("Unreachable: Capability not found")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Contains returns true if the specified Capability is
|
||||
// in the slice
|
||||
func (c Capabilities) Contains(capp string) bool {
|
||||
return GetCapability(capp) != nil
|
||||
}
|
16
libcontainer/types_linux.go
Normal file
16
libcontainer/types_linux.go
Normal file
|
@ -0,0 +1,16 @@
|
|||
package libcontainer
|
||||
|
||||
import (
|
||||
"syscall"
|
||||
)
|
||||
|
||||
func init() {
|
||||
namespaceList = Namespaces{
|
||||
{Key: "NEWNS", Value: syscall.CLONE_NEWNS, File: "mnt"},
|
||||
{Key: "NEWUTS", Value: syscall.CLONE_NEWUTS, File: "uts"},
|
||||
{Key: "NEWIPC", Value: syscall.CLONE_NEWIPC, File: "ipc"},
|
||||
{Key: "NEWUSER", Value: syscall.CLONE_NEWUSER, File: "user"},
|
||||
{Key: "NEWPID", Value: syscall.CLONE_NEWPID, File: "pid"},
|
||||
{Key: "NEWNET", Value: syscall.CLONE_NEWNET, File: "net"},
|
||||
}
|
||||
}
|
28
libcontainer/utils/utils.go
Normal file
28
libcontainer/utils/utils.go
Normal file
|
@ -0,0 +1,28 @@
|
|||
package utils
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"io"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// GenerateRandomName returns prefix followed by size random lowercase
// hexadecimal characters read from crypto/rand.
//
// At least 32 random bytes are generated, as before, and more when size
// exceeds the 64 characters those bytes provide; previously any size above 64
// panicked. A negative size is a programming error and still panics.
func GenerateRandomName(prefix string, size int) (string, error) {
	byteCount := 32
	// Two hex characters per byte, rounded up for odd sizes.
	if needed := (size + 1) / 2; needed > byteCount {
		byteCount = needed
	}
	id := make([]byte, byteCount)
	if _, err := io.ReadFull(rand.Reader, id); err != nil {
		return "", err
	}
	return prefix + hex.EncodeToString(id)[:size], nil
}
|
||||
|
||||
// ResolveRootfs returns the absolute path of uncleanRootfs with all symbolic
// links evaluated, so the result never refers to a symlink.
func ResolveRootfs(uncleanRootfs string) (string, error) {
	absolute, err := filepath.Abs(uncleanRootfs)
	if err != nil {
		return "", err
	}
	return filepath.EvalSymlinks(absolute)
}
|
145
system/calls_linux.go
Normal file
145
system/calls_linux.go
Normal file
|
@ -0,0 +1,145 @@
|
|||
package system
|
||||
|
||||
import (
|
||||
"os/exec"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// Chroot changes the root directory of the calling process to dir by calling
// syscall.Chroot.
func Chroot(dir string) error {
	return syscall.Chroot(dir)
}
|
||||
|
||||
// Chdir changes the current working directory to dir by calling
// syscall.Chdir.
func Chdir(dir string) error {
	return syscall.Chdir(dir)
}
|
||||
|
||||
// Exec replaces the current process image with cmd, passing args and env, by
// calling syscall.Exec. It only returns when the exec fails.
func Exec(cmd string, args []string, env []string) error {
	return syscall.Exec(cmd, args, env)
}
|
||||
|
||||
func Execv(cmd string, args []string, env []string) error {
|
||||
name, err := exec.LookPath(cmd)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return Exec(name, args, env)
|
||||
}
|
||||
|
||||
// Fork issues a raw fork system call while holding syscall.ForkLock. It
// returns the system call's result as the pid, or -1 and the errno when the
// call fails.
func Fork() (int, error) {
	syscall.ForkLock.Lock()
	pid, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0)
	syscall.ForkLock.Unlock()
	if errno == 0 {
		return int(pid), nil
	}
	return -1, errno
}
|
||||
|
||||
// Mount mounts source on target with the given filesystem type, flags and
// data by calling syscall.Mount.
func Mount(source, target, fstype string, flags uintptr, data string) error {
	return syscall.Mount(source, target, fstype, flags, data)
}
|
||||
|
||||
// Unmount unmounts target with the given flags by calling syscall.Unmount.
func Unmount(target string, flags int) error {
	return syscall.Unmount(target, flags)
}
|
||||
|
||||
// Pivotroot calls syscall.PivotRoot with newroot and putold.
func Pivotroot(newroot, putold string) error {
	return syscall.PivotRoot(newroot, putold)
}
|
||||
|
||||
// Unshare calls syscall.Unshare with the given namespace flags.
func Unshare(flags int) error {
	return syscall.Unshare(flags)
}
|
||||
|
||||
// Clone issues a raw clone system call with the given flags while holding
// syscall.ForkLock. It returns the system call's result as the pid, or -1 and
// the errno when the call fails.
func Clone(flags uintptr) (int, error) {
	syscall.ForkLock.Lock()
	pid, _, errno := syscall.RawSyscall(syscall.SYS_CLONE, flags, 0, 0)
	syscall.ForkLock.Unlock()
	if errno == 0 {
		return int(pid), nil
	}
	return -1, errno
}
|
||||
|
||||
// UsetCloseOnExec issues fcntl(F_SETFD) on fd with a flag value of 0 and
// returns the errno if the call fails.
func UsetCloseOnExec(fd uintptr) error {
	_, _, errno := syscall.Syscall(syscall.SYS_FCNTL, fd, syscall.F_SETFD, 0)
	if errno == 0 {
		return nil
	}
	return errno
}
|
||||
|
||||
// Setgroups calls syscall.Setgroups with gids and returns its error.
func Setgroups(gids []int) error {
	err := syscall.Setgroups(gids)
	return err
}
|
||||
|
||||
// Setresgid calls syscall.Setresgid with the given real, effective and saved
// group ids and returns its error.
func Setresgid(rgid, egid, sgid int) error {
	if err := syscall.Setresgid(rgid, egid, sgid); err != nil {
		return err
	}
	return nil
}
|
||||
|
||||
// Setresuid calls syscall.Setresuid with the given real, effective and saved
// user ids and returns its error.
func Setresuid(ruid, euid, suid int) error {
	if err := syscall.Setresuid(ruid, euid, suid); err != nil {
		return err
	}
	return nil
}
|
||||
|
||||
// Setgid calls syscall.Setgid with gid and returns its error.
func Setgid(gid int) error {
	err := syscall.Setgid(gid)
	return err
}
|
||||
|
||||
// Setuid calls syscall.Setuid with uid and returns its error.
func Setuid(uid int) error {
	err := syscall.Setuid(uid)
	return err
}
|
||||
|
||||
// Sethostname converts name to a byte slice and passes it to
// syscall.Sethostname, returning its error.
func Sethostname(name string) error {
	hostname := []byte(name)
	return syscall.Sethostname(hostname)
}
|
||||
|
||||
// Setsid calls syscall.Setsid and returns both of its results unchanged.
func Setsid() (int, error) {
	id, err := syscall.Setsid()
	return id, err
}
|
||||
|
||||
// Ioctl issues SYS_IOCTL on fd with the request number flag and the argument
// data. It returns the errno when the call fails and nil otherwise.
func Ioctl(fd uintptr, flag, data uintptr) error {
	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data)
	if errno == 0 {
		return nil
	}
	return errno
}
|
||||
|
||||
// Closefd converts fd to an int and closes it with syscall.Close.
func Closefd(fd uintptr) error {
	descriptor := int(fd)
	return syscall.Close(descriptor)
}
|
||||
|
||||
// Dup2 converts both descriptors to int and calls syscall.Dup2(fd1, fd2),
// returning its error.
func Dup2(fd1, fd2 uintptr) error {
	from, to := int(fd1), int(fd2)
	return syscall.Dup2(from, to)
}
|
||||
|
||||
// Mknod calls syscall.Mknod with path, mode and dev and returns its error.
func Mknod(path string, mode uint32, dev int) error {
	if err := syscall.Mknod(path, mode, dev); err != nil {
		return err
	}
	return nil
}
|
||||
|
||||
// ParentDeathSignal issues prctl(PR_SET_PDEATHSIG, SIGKILL) as a raw system
// call and returns the errno if the call fails.
func ParentDeathSignal() error {
	sig := uintptr(syscall.SIGKILL)
	_, _, errno := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, sig, 0)
	if errno == 0 {
		return nil
	}
	return errno
}
|
||||
|
||||
// Setctty issues the TIOCSCTTY ioctl on file descriptor 0 as a raw system
// call and returns the errno if the call fails.
func Setctty() error {
	request := uintptr(syscall.TIOCSCTTY)
	_, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, 0, request, 0)
	if errno == 0 {
		return nil
	}
	return errno
}
|
||||
|
||||
// Mkfifo calls syscall.Mkfifo with name and mode and returns its error.
func Mkfifo(name string, mode uint32) error {
	err := syscall.Mkfifo(name, mode)
	return err
}
|
||||
|
||||
// Umask calls syscall.Umask with mask and returns the value it reports.
func Umask(mask int) int {
	previous := syscall.Umask(mask)
	return previous
}
|
||||
|
||||
// SetCloneFlags stores flag in cmd.SysProcAttr.Cloneflags, allocating an
// empty SysProcAttr first when cmd does not have one yet. Any other fields of
// an existing SysProcAttr are left untouched.
func SetCloneFlags(cmd *exec.Cmd, flag uintptr) {
	attr := cmd.SysProcAttr
	if attr == nil {
		attr = new(syscall.SysProcAttr)
		cmd.SysProcAttr = attr
	}
	attr.Cloneflags = flag
}
|
9
system/errors.go
Normal file
9
system/errors.go
Normal file
|
@ -0,0 +1,9 @@
|
|||
package system
|
||||
|
||||
import (
|
||||
"errors"
|
||||
)
|
||||
|
||||
// ErrNotSupportedPlatform is the sentinel error for operations that are not
// available on the current platform and architecture.
var ErrNotSupportedPlatform = errors.New("platform and architecture is not supported")
|
58
system/pty_linux.go
Normal file
58
system/pty_linux.go
Normal file
|
@ -0,0 +1,58 @@
|
|||
package system
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// Unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f.
|
||||
// Unlockpt should be called before opening the slave side of a pseudoterminal.
|
||||
func Unlockpt(f *os.File) error {
|
||||
var u int
|
||||
return Ioctl(f.Fd(), syscall.TIOCSPTLCK, uintptr(unsafe.Pointer(&u)))
|
||||
}
|
||||
|
||||
// Ptsname retrieves the name of the first available pts for the given master.
|
||||
func Ptsname(f *os.File) (string, error) {
|
||||
var n int
|
||||
|
||||
if err := Ioctl(f.Fd(), syscall.TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return fmt.Sprintf("/dev/pts/%d", n), nil
|
||||
}
|
||||
|
||||
// CreateMasterAndConsole will open /dev/ptmx on the host and retreive the
|
||||
// pts name for use as the pty slave inside the container
|
||||
func CreateMasterAndConsole() (*os.File, string, error) {
|
||||
master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
console, err := Ptsname(master)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
if err := Unlockpt(master); err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
return master, console, nil
|
||||
}
|
||||
|
||||
// OpenPtmx opens /dev/ptmx, i.e. the PTY master, with the flags
// O_RDONLY, O_NOCTTY and O_CLOEXEC.
func OpenPtmx() (*os.File, error) {
	// The os package does not provide O_NOCTTY and O_CLOEXEC, so every flag
	// is taken from the syscall package.
	const flags = syscall.O_RDONLY | syscall.O_NOCTTY | syscall.O_CLOEXEC
	return os.OpenFile("/dev/ptmx", flags, 0)
}
|
||||
|
||||
// OpenTerminal is a clone of os.OpenFile without the O_CLOEXEC
// used to open the pty slave inside the container namespace.
//
// name is opened with syscall.Open using exactly the given flag and a mode of
// 0. On failure the error is wrapped in an *os.PathError with Op "open" and
// Path name; on success the descriptor is wrapped with os.NewFile.
func OpenTerminal(name string, flag int) (*os.File, error) {
	fd, err := syscall.Open(name, flag, 0)
	if err != nil {
		// Keyed fields keep the literal valid if os.PathError ever changes
		// shape and satisfy go vet's composite-literal check.
		return nil, &os.PathError{Op: "open", Path: name, Err: err}
	}
	return os.NewFile(uintptr(fd), name), nil
}
|
27
system/setns_linux.go
Normal file
27
system/setns_linux.go
Normal file
|
@ -0,0 +1,27 @@
|
|||
package system
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"runtime"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// setNsMap maps a "GOOS/GOARCH" pair to the setns syscall number used on
// that platform.
//
// We need different setns values for the different platforms and arch.
// We are declaring the number here because the SETNS syscall does not exist
// in the stdlib.
//
// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092
var setNsMap = map[string]uintptr{"linux/amd64": 308}
|
||||
|
||||
func Setns(fd uintptr, flags uintptr) error {
|
||||
ns, exists := setNsMap[fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH)]
|
||||
if !exists {
|
||||
return ErrNotSupportedPlatform
|
||||
}
|
||||
_, _, err := syscall.RawSyscall(ns, fd, flags, 0)
|
||||
if err != 0 {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
15
system/unsupported.go
Normal file
15
system/unsupported.go
Normal file
|
@ -0,0 +1,15 @@
|
|||
// +build !linux
|
||||
|
||||
package system
|
||||
|
||||
import (
|
||||
"os/exec"
|
||||
)
|
||||
|
||||
// SetCloneFlags does nothing in this build; cmd and flag are ignored.
func SetCloneFlags(cmd *exec.Cmd, flag uintptr) {
	// Intentionally empty.
}
|
||||
|
||||
func UsetCloseOnExec(fd uintptr) error {
|
||||
return ErrNotSupportedPlatform
|
||||
}
|
Loading…
Reference in a new issue