Merge pull request #4327 from crosbymichael/add-libcontainer

Add native execution driver to docker and make it the default
This commit is contained in:
Guillaume J. Charmes 2014-03-03 16:34:20 -08:00
commit 0ef83adf9f
29 changed files with 2083 additions and 4 deletions

View file

@ -5,10 +5,23 @@ import (
"fmt" "fmt"
"github.com/dotcloud/docker/pkg/mount" "github.com/dotcloud/docker/pkg/mount"
"io" "io"
"io/ioutil"
"os" "os"
"path/filepath"
"strconv"
"strings" "strings"
) )
// Cgroup describes where a container's cgroup lives and which resource
// limits are written to it.
type Cgroup struct {
Name string `json:"name,omitempty"` // name of the cgroup; joined under Parent when Parent is set
Parent string `json:"parent,omitempty"` // name of parent cgroup or slice
DeviceAccess bool `json:"device_access,omitempty"` // when false, setupDevices denies all devices except a fixed allow list
Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes)
MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap
CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers)
}
// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt // https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt
func FindCgroupMountpoint(subsystem string) (string, error) { func FindCgroupMountpoint(subsystem string) (string, error) {
mounts, err := mount.GetMounts() mounts, err := mount.GetMounts()
@ -25,7 +38,6 @@ func FindCgroupMountpoint(subsystem string) (string, error) {
} }
} }
} }
return "", fmt.Errorf("cgroup mountpoint not found for %s", subsystem) return "", fmt.Errorf("cgroup mountpoint not found for %s", subsystem)
} }
@ -40,18 +52,199 @@ func GetThisCgroupDir(subsystem string) (string, error) {
return parseCgroupFile(subsystem, f) return parseCgroupFile(subsystem, f)
} }
// GetInitCgroupDir returns the cgroup directory of pid 1 for the given
// subsystem, as listed in /proc/1/cgroup.
func GetInitCgroupDir(subsystem string) (string, error) {
	initCgroups, openErr := os.Open("/proc/1/cgroup")
	if openErr != nil {
		return "", openErr
	}
	defer initCgroups.Close()

	return parseCgroupFile(subsystem, initCgroups)
}
// Path returns the directory used for this cgroup inside the given
// subsystem: root/subsystem/<cgroup dir of pid 1>/[Parent/]Name.
func (c *Cgroup) Path(root, subsystem string) (string, error) {
	initDir, err := GetInitCgroupDir(subsystem)
	if err != nil {
		return "", err
	}

	name := c.Name
	if c.Parent != "" {
		name = filepath.Join(c.Parent, name)
	}
	return filepath.Join(root, subsystem, initDir, name), nil
}
// Join creates the cgroup directory for the subsystem (if needed) and writes
// pid into its "tasks" file. It returns the cgroup directory on success.
func (c *Cgroup) Join(root, subsystem string, pid int) (string, error) {
	dir, err := c.Path(root, subsystem)
	if err != nil {
		return "", err
	}

	if mkErr := os.MkdirAll(dir, 0755); mkErr != nil && !os.IsExist(mkErr) {
		return "", mkErr
	}

	if wrErr := writeFile(dir, "tasks", strconv.Itoa(pid)); wrErr != nil {
		return "", wrErr
	}
	return dir, nil
}
// Cleanup removes this cgroup's directories under the memory, devices and
// cpu subsystems. It is best-effort: path lookup and removal errors are
// ignored and nil is always returned.
func (c *Cgroup) Cleanup(root string) error {
	subsystems := []string{"memory", "devices", "cpu"}

	// Resolve every path before removing anything.
	paths := make([]string, 0, len(subsystems))
	for _, subsystem := range subsystems {
		// A failed lookup yields an empty path; the error is deliberately dropped.
		dir, _ := c.Path(root, subsystem)
		paths = append(paths, dir)
	}

	for _, dir := range paths {
		os.RemoveAll(dir)
	}
	return nil
}
// parseCgroupFile scans r, which must be in /proc/<pid>/cgroup format
// ("id:subsys1,subsys2:path" per line), and returns the path of the first
// line whose subsystem list contains subsystem.
//
// Lines that do not have all three fields are skipped. An error is returned
// if reading fails or if no line lists the subsystem.
func parseCgroupFile(subsystem string, r io.Reader) (string, error) {
	s := bufio.NewScanner(r)
	for s.Scan() {
		// Split into at most three fields so that a ':' inside the path
		// is preserved instead of truncating it.
		parts := strings.SplitN(s.Text(), ":", 3)
		if len(parts) < 3 {
			continue
		}
		for _, subs := range strings.Split(parts[1], ",") {
			if subs == subsystem {
				return parts[2], nil
			}
		}
	}
	// Scan reports read failures only after it returns false.
	if err := s.Err(); err != nil {
		return "", err
	}
	return "", fmt.Errorf("cgroup '%s' not found in /proc/self/cgroup", subsystem)
}
// writeFile writes data to the file named file inside dir, creating it with
// mode 0700 if it does not exist and truncating it otherwise.
func writeFile(dir, file, data string) error {
	target := filepath.Join(dir, file)
	payload := []byte(data)
	return ioutil.WriteFile(target, payload, 0700)
}
// Apply places pid into this cgroup's devices, memory and cpu groups and
// writes the configured limits, stopping at the first error.
func (c *Cgroup) Apply(pid int) error {
	// There are two ways to manage cgroups: through systemd and its dbus
	// api, or through raw cgroup fs operations following the
	// pre-single-writer model described at:
	// http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/
	//
	// Any subsystem can be used to locate the cgroup root; its mountpoint's
	// parent directory is the root.
	cpuMount, err := FindCgroupMountpoint("cpu")
	if err != nil {
		return err
	}

	root := filepath.Dir(cpuMount)
	if _, statErr := os.Stat(root); statErr != nil {
		return fmt.Errorf("cgroups fs not found")
	}

	setups := []func(string, int) error{
		c.setupDevices,
		c.setupMemory,
		c.setupCpu,
	}
	for _, setup := range setups {
		if err := setup(root, pid); err != nil {
			return err
		}
	}
	return nil
}
// setupDevices joins pid to the devices cgroup and, unless DeviceAccess is
// set, denies access to all devices and then re-allows a fixed list.
//
// If any write fails after the cgroup has been joined, the cgroup directory
// is removed again. The named result err is assigned directly (never
// shadowed with :=) so that the deferred cleanup observes the failure.
func (c *Cgroup) setupDevices(cgroupRoot string, pid int) (err error) {
	if c.DeviceAccess {
		return nil
	}

	var dir string
	if dir, err = c.Join(cgroupRoot, "devices", pid); err != nil {
		return err
	}
	defer func() {
		if err != nil {
			os.RemoveAll(dir)
		}
	}()

	if err = writeFile(dir, "devices.deny", "a"); err != nil {
		return err
	}

	allow := []string{
		// /dev/null, zero, full
		"c 1:3 rwm",
		"c 1:5 rwm",
		"c 1:7 rwm",

		// consoles
		"c 5:1 rwm",
		"c 5:0 rwm",
		"c 4:0 rwm",
		"c 4:1 rwm",

		// /dev/urandom,/dev/random
		"c 1:9 rwm",
		"c 1:8 rwm",

		// /dev/pts/ - pts namespaces are "coming soon"
		"c 136:* rwm",
		"c 5:2 rwm",

		// tuntap
		"c 10:200 rwm",
	}
	for _, val := range allow {
		if err = writeFile(dir, "devices.allow", val); err != nil {
			return err
		}
	}
	return nil
}
// setupMemory joins pid to the memory cgroup and writes the memory limits.
// It does nothing when both Memory and MemorySwap are zero.
//
// If any write fails after the cgroup has been joined, the cgroup directory
// is removed again. The named result err is assigned directly (never
// shadowed with :=) so that the deferred cleanup observes the failure.
func (c *Cgroup) setupMemory(cgroupRoot string, pid int) (err error) {
	if c.Memory == 0 && c.MemorySwap == 0 {
		return nil
	}

	var dir string
	if dir, err = c.Join(cgroupRoot, "memory", pid); err != nil {
		return err
	}
	defer func() {
		if err != nil {
			os.RemoveAll(dir)
		}
	}()

	if c.Memory != 0 {
		limit := strconv.FormatInt(c.Memory, 10)
		if err = writeFile(dir, "memory.limit_in_bytes", limit); err != nil {
			return err
		}
		if err = writeFile(dir, "memory.soft_limit_in_bytes", limit); err != nil {
			return err
		}
	}

	// By default, MemorySwap is set to twice the size of RAM.
	// If you want to omit MemorySwap, set it to `-1'.
	// NOTE(review): the value of MemorySwap is only compared against -1;
	// the limit written is always Memory*2 — confirm this is intended.
	if c.MemorySwap != -1 {
		if err = writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil {
			return err
		}
	}
	return nil
}
// setupCpu joins pid to the cpu cgroup and, when CpuShares is non-zero,
// writes it to cpu.shares.
func (c *Cgroup) setupCpu(cgroupRoot string, pid int) (err error) {
	// The cpu group is always joined, to allow fair cpu scheduling
	// on a container basis.
	dir, err := c.Join(cgroupRoot, "cpu", pid)
	if err != nil {
		return err
	}
	if c.CpuShares == 0 {
		return nil
	}
	shares := strconv.FormatInt(c.CpuShares, 10)
	return writeFile(dir, "cpu.shares", shares)
}

2
libcontainer/MAINTAINERS Normal file
View file

@ -0,0 +1,2 @@
Michael Crosby <michael@crosbymichael.com> (@crosbymichael)
Guillaume Charmes <guillaume@dotcloud.com> (@creack)

90
libcontainer/README.md Normal file
View file

@ -0,0 +1,90 @@
## libcontainer - reference implementation for containers
#### background
libcontainer specifies configuration options for what a container is. It provides a native Go implementation
for using linux namespaces with no external dependencies. libcontainer provides many convenience functions for working with namespaces, networking, and management.
#### container
A container is a self contained directory that is able to run one or more processes without
affecting the host system. The directory is usually a full system tree. Inside the directory
a `container.json` file is placed with the runtime configuration for how the processes
should be contained and ran. Environment, networking, and different capabilities for the
process are specified in this file. The configuration is used for each process executed inside the container.
Sample `container.json` file:
```json
{
"hostname": "koye",
"tty": true,
"environment": [
"HOME=/",
"PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin",
"container=docker",
"TERM=xterm-256color"
],
"namespaces": [
"NEWIPC",
"NEWNS",
"NEWPID",
"NEWUTS",
"NEWNET"
],
"capabilities": [
"SETPCAP",
"SYS_MODULE",
"SYS_RAWIO",
"SYS_PACCT",
"SYS_ADMIN",
"SYS_NICE",
"SYS_RESOURCE",
"SYS_TIME",
"SYS_TTY_CONFIG",
"MKNOD",
"AUDIT_WRITE",
"AUDIT_CONTROL",
"MAC_OVERRIDE",
"MAC_ADMIN",
"NET_ADMIN"
],
"networks": [{
"type": "veth",
"context": {
"bridge": "docker0",
"prefix": "dock"
},
"address": "172.17.0.100/16",
"gateway": "172.17.42.1",
"mtu": 1500
}
],
"cgroups": {
"name": "docker-koye",
"parent": "docker",
"memory": 5248000
}
}
```
Using this configuration and the current directory holding the rootfs for a process, one can use libcontainer to exec the container. During the life of the namespace, a `pid` file
is written to the current directory with the pid of the namespaced process to the external world. A client can use this pid to wait, kill, or perform other operations with the container. If a user tries to run a new process inside an existing container with a live namespace, the namespace will be joined by the new process.
You may also specify an alternate root place where the `container.json` file is read and where the `pid` file will be saved.
#### nsinit
`nsinit` is a cli application used as the reference implementation of libcontainer. It is able to
spawn or join new containers given the current directory. To use `nsinit` cd into a linux
rootfs and copy a `container.json` file into the directory with your specified configuration.
To execute `/bin/bash` in the current directory as a container just run:
```bash
nsinit exec /bin/bash
```
If you wish to spawn another process inside the container while your current bash session is
running just run the exact same command again to get another bash shell or change the command. If the original process dies, PID 1, all other processes spawned inside the container will also be killed and the namespace will be removed.
You can identify if a process is running in a container by looking to see if `pid` is in the root of the directory.

17
libcontainer/TODO.md Normal file
View file

@ -0,0 +1,17 @@
#### goals
* small and simple - line count is not everything but less code is better
* clean lines between what we do in the pkg
* provide primitives for working with namespaces not cater to every option
* extend via configuration not by features - host networking, no networking, veth network can be accomplished via adjusting the container.json, nothing to do with code
#### tasks
* proper tty for a new process in an existing container
* use exec or raw syscalls for new process in existing container
* setup proper user in namespace if specified
* implement hook or clean interface for cgroups
* example configs for different setups (host networking, boot init)
* improve pkg documentation with comments
* testing - this is hard in a low level pkg but we could do some, maybe
* pivot root
* selinux
* apparmor

View file

@ -0,0 +1,33 @@
package capabilities
import (
"github.com/dotcloud/docker/pkg/libcontainer"
"github.com/syndtr/gocapability/capability"
"os"
)
// DropCapabilities removes the capabilities listed in the container's
// configuration from the current process. It is a no-op when the
// configuration lists none.
func DropCapabilities(container *libcontainer.Container) error {
	drop := getCapabilities(container)
	if len(drop) == 0 {
		return nil
	}

	caps, err := capability.NewPid(os.Getpid())
	if err != nil {
		return err
	}

	caps.Unset(capability.CAPS|capability.BOUNDS, drop...)
	if applyErr := caps.Apply(capability.CAPS | capability.BOUNDS); applyErr != nil {
		return applyErr
	}
	return nil
}
// getCapabilities collects the capability.Cap value of every entry in the
// container's Capabilities list. The result is never nil.
func getCapabilities(container *libcontainer.Container) []capability.Cap {
	values := make([]capability.Cap, 0)
	for _, entry := range container.Capabilities {
		values = append(values, entry.Value)
	}
	return values
}

36
libcontainer/container.go Normal file
View file

@ -0,0 +1,36 @@
package libcontainer
import (
"github.com/dotcloud/docker/pkg/cgroups"
)
// Context is a generic key value pair that allows
// arbitrary data to be sent
type Context map[string]string
// Container defines configuration options for how a
// container is setup inside a directory and how a process should be executed
type Container struct {
Hostname string `json:"hostname,omitempty"` // hostname
ReadonlyFs bool `json:"readonly_fs,omitempty"` // set the containers rootfs as readonly
User string `json:"user,omitempty"` // user to execute the process as
WorkingDir string `json:"working_dir,omitempty"` // current working directory
Env []string `json:"environment,omitempty"` // environment to set
Tty bool `json:"tty,omitempty"` // setup a proper tty or not
Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply
Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop
Networks []*Network `json:"networks,omitempty"` // nil for host's network stack
Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"` // cgroup placement and limits; nil means no cgroup setup
}
// Network defines configuration for a container's networking stack
//
// The network configuration can be omitted from a container causing the
// container to be setup with the host's networking stack
type Network struct {
Type string `json:"type,omitempty"` // type of networking to setup i.e. veth, macvlan, etc
Context Context `json:"context,omitempty"` // generic context for type specific networking options
Address string `json:"address,omitempty"` // interface address in CIDR notation, e.g. 172.17.0.100/16
Gateway string `json:"gateway,omitempty"` // default gateway IP; left unset when empty
Mtu int `json:"mtu,omitempty"` // MTU applied to the container's interfaces
}

View file

@ -0,0 +1,50 @@
{
"hostname": "koye",
"tty": true,
"environment": [
"HOME=/",
"PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin",
"container=docker",
"TERM=xterm-256color"
],
"namespaces": [
"NEWIPC",
"NEWNS",
"NEWPID",
"NEWUTS",
"NEWNET"
],
"capabilities": [
"SETPCAP",
"SYS_MODULE",
"SYS_RAWIO",
"SYS_PACCT",
"SYS_ADMIN",
"SYS_NICE",
"SYS_RESOURCE",
"SYS_TIME",
"SYS_TTY_CONFIG",
"MKNOD",
"AUDIT_WRITE",
"AUDIT_CONTROL",
"MAC_OVERRIDE",
"MAC_ADMIN",
"NET_ADMIN"
],
"networks": [{
"type": "veth",
"context": {
"bridge": "docker0",
"prefix": "dock"
},
"address": "172.17.0.100/16",
"gateway": "172.17.42.1",
"mtu": 1500
}
],
"cgroups": {
"name": "docker-koye",
"parent": "docker",
"memory": 5248000
}
}

View file

@ -0,0 +1,78 @@
package network
import (
"github.com/dotcloud/docker/pkg/netlink"
"net"
)
// InterfaceUp looks up the named network interface and brings it up.
func InterfaceUp(name string) error {
	link, lookupErr := net.InterfaceByName(name)
	if lookupErr != nil {
		return lookupErr
	}
	return netlink.NetworkLinkUp(link)
}
// InterfaceDown looks up the named network interface and brings it down.
func InterfaceDown(name string) error {
	link, lookupErr := net.InterfaceByName(name)
	if lookupErr != nil {
		return lookupErr
	}
	return netlink.NetworkLinkDown(link)
}
// ChangeInterfaceName renames the interface currently called old to newName.
func ChangeInterfaceName(old, newName string) error {
	link, lookupErr := net.InterfaceByName(old)
	if lookupErr != nil {
		return lookupErr
	}
	return netlink.NetworkChangeName(link, newName)
}
// CreateVethPair creates a veth pair whose two ends are named name1 and name2.
func CreateVethPair(name1, name2 string) error {
	err := netlink.NetworkCreateVethPair(name1, name2)
	return err
}
// SetInterfaceInNamespacePid moves the named interface into the network
// namespace identified by the process id nsPid.
func SetInterfaceInNamespacePid(name string, nsPid int) error {
	link, lookupErr := net.InterfaceByName(name)
	if lookupErr != nil {
		return lookupErr
	}
	return netlink.NetworkSetNsPid(link, nsPid)
}
// SetInterfaceMaster attaches the interface called name to the interface
// called master. Both interfaces must already exist.
func SetInterfaceMaster(name, master string) error {
	child, err := net.InterfaceByName(name)
	if err != nil {
		return err
	}

	parent, err := net.InterfaceByName(master)
	if err != nil {
		return err
	}

	return netlink.NetworkSetMaster(child, parent)
}
// SetDefaultGateway installs ip as the default gateway.
//
// ip must be a textual IPv4 or IPv6 address. An unparsable address is
// reported as a *net.ParseError instead of handing a nil IP to netlink.
func SetDefaultGateway(ip string) error {
	gateway := net.ParseIP(ip)
	if gateway == nil {
		return &net.ParseError{Type: "IP address", Text: ip}
	}
	return netlink.AddDefaultGw(gateway)
}
// SetInterfaceIp assigns the address rawIp, given in CIDR notation, to the
// named interface.
func SetInterfaceIp(name string, rawIp string) error {
	link, err := net.InterfaceByName(name)
	if err != nil {
		return err
	}

	addr, network, err := net.ParseCIDR(rawIp)
	if err != nil {
		return err
	}

	return netlink.NetworkLinkAddIp(link, addr, network)
}
// SetMtu sets the MTU of the named interface to mtu.
func SetMtu(name string, mtu int) error {
	link, lookupErr := net.InterfaceByName(name)
	if lookupErr != nil {
		return lookupErr
	}
	return netlink.NetworkSetMTU(link, mtu)
}

View file

@ -0,0 +1,32 @@
package network
import (
"errors"
"github.com/dotcloud/docker/pkg/libcontainer"
)
var (
// ErrNotValidStrategyType is returned by GetStrategy when no strategy
// is registered for the requested type.
ErrNotValidStrategyType = errors.New("not a valid network strategy type")
)
// strategies maps a network type name to the strategy that implements it.
var strategies = map[string]NetworkStrategy{
"veth": &Veth{},
}
// NetworkStrategy represents a specific network configuration for
// a container's networking stack
type NetworkStrategy interface {
// Create is called with the network config, the pid of the namespaced
// process, and a context the strategy may fill in for the child.
Create(*libcontainer.Network, int, libcontainer.Context) error
// Initialize is called with the network config and the context
// received from the parent.
Initialize(*libcontainer.Network, libcontainer.Context) error
}
// GetStrategy looks up the network strategy registered under tpe.
// ErrNotValidStrategyType is returned when nothing is registered for it.
func GetStrategy(tpe string) (NetworkStrategy, error) {
	if strategy, ok := strategies[tpe]; ok {
		return strategy, nil
	}
	return nil, ErrNotValidStrategyType
}

View file

@ -0,0 +1,100 @@
package network
import (
"fmt"
"github.com/dotcloud/docker/pkg/libcontainer"
"github.com/dotcloud/docker/pkg/libcontainer/utils"
)
// Veth is a network strategy that uses a bridge and creates
// a veth pair; one end stays outside on the host and the other
// is placed inside the container's namespace
type Veth struct {
}
// Create builds a veth pair named with the "prefix" from the network
// context, records both names in context under "veth-host" and
// "veth-child", attaches the host end to the "bridge" from the network
// context, sets its MTU, brings it up, and moves the child end into the
// namespace of nspid.
func (v *Veth) Create(n *libcontainer.Network, nspid int, context libcontainer.Context) error {
	bridge, ok := n.Context["bridge"]
	if !ok {
		return fmt.Errorf("bridge does not exist in network context")
	}
	prefix, ok := n.Context["prefix"]
	if !ok {
		return fmt.Errorf("veth prefix does not exist in network context")
	}

	hostEnd, childEnd, err := createVethPair(prefix)
	if err != nil {
		return err
	}
	context["veth-host"] = hostEnd
	context["veth-child"] = childEnd

	if err := SetInterfaceMaster(hostEnd, bridge); err != nil {
		return err
	}
	if err := SetMtu(hostEnd, n.Mtu); err != nil {
		return err
	}
	if err := InterfaceUp(hostEnd); err != nil {
		return err
	}
	return SetInterfaceInNamespacePid(childEnd, nspid)
}
// Initialize configures the veth end named by context["veth-child"]:
// it is taken down, renamed to eth0, given the configured address and MTU
// and brought up. The loopback interface gets the same MTU and is brought
// up, and the default gateway is set when one is configured.
func (v *Veth) Initialize(config *libcontainer.Network, context libcontainer.Context) error {
var (
vethChild string
exists bool
)
if vethChild, exists = context["veth-child"]; !exists {
return fmt.Errorf("vethChild does not exist in network context")
}
// the interface is taken down before it is renamed
if err := InterfaceDown(vethChild); err != nil {
return fmt.Errorf("interface down %s %s", vethChild, err)
}
if err := ChangeInterfaceName(vethChild, "eth0"); err != nil {
return fmt.Errorf("change %s to eth0 %s", vethChild, err)
}
if err := SetInterfaceIp("eth0", config.Address); err != nil {
return fmt.Errorf("set eth0 ip %s", err)
}
if err := SetMtu("eth0", config.Mtu); err != nil {
return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err)
}
if err := InterfaceUp("eth0"); err != nil {
return fmt.Errorf("eth0 up %s", err)
}
if err := SetMtu("lo", config.Mtu); err != nil {
return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err)
}
if err := InterfaceUp("lo"); err != nil {
return fmt.Errorf("lo up %s", err)
}
// the gateway is optional
if config.Gateway != "" {
if err := SetDefaultGateway(config.Gateway); err != nil {
return fmt.Errorf("set gateway to %s %s", config.Gateway, err)
}
}
return nil
}
// createVethPair generates two random names from prefix and creates a veth
// pair with them. The names generated so far are returned together with
// any error.
func createVethPair(prefix string) (name1 string, name2 string, err error) {
	if name1, err = utils.GenerateRandomName(prefix, 4); err != nil {
		return name1, name2, err
	}
	if name2, err = utils.GenerateRandomName(prefix, 4); err != nil {
		return name1, name2, err
	}
	err = CreateVethPair(name1, name2)
	return name1, name2, err
}

View file

@ -0,0 +1,45 @@
package nsinit
import (
"fmt"
"github.com/dotcloud/docker/pkg/libcontainer"
"github.com/dotcloud/docker/pkg/system"
"os"
"os/exec"
)
// CommandFactory takes the container's configuration and options passed by the
// parent processes and creates an *exec.Cmd that will be used to fork/exec the
// namespaced init process
type CommandFactory interface {
// Create receives the container config, the console path (empty when
// no tty was set up), the child's sync pipe fd, and the user's args.
Create(container *libcontainer.Container, console string, syncFd uintptr, args []string) *exec.Cmd
}
// DefaultCommandFactory is a CommandFactory that re-executes the current
// binary as the init process.
type DefaultCommandFactory struct {
Root string // passed to the init process as the value of the -root flag
}
// Create builds an exec.Cmd that re-executes the current binary
// (os.Args[0]) with the -console, -pipe and -root flags followed by "init"
// and args. The clone flags are taken from the container's namespaces and
// the environment from the container's Env.
func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console string, pipe uintptr, args []string) *exec.Cmd {
	initArgs := []string{
		"-console", console,
		"-pipe", fmt.Sprint(pipe),
		"-root", c.Root,
		"init",
	}
	initArgs = append(initArgs, args...)

	// arg0 is used as the binary so that we can always reexec ourself
	cmd := exec.Command(os.Args[0], initArgs...)

	cloneFlags := uintptr(GetNamespaceFlags(container.Namespaces))
	system.SetCloneFlags(cmd, cloneFlags)
	cmd.Env = container.Env
	return cmd
}
// GetNamespaceFlags ORs together the Value of every namespace in the list,
// producing the flags to use with clone, unshare, and setns.
func GetNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) {
	for _, namespace := range namespaces {
		flag = flag | namespace.Value
	}
	return flag
}

View file

@ -0,0 +1,96 @@
// +build linux
package nsinit
import (
"github.com/dotcloud/docker/pkg/libcontainer"
"github.com/dotcloud/docker/pkg/libcontainer/network"
"github.com/dotcloud/docker/pkg/system"
"os"
"os/exec"
"syscall"
)
// Exec performs setup outside of a namespace so that a container can be
// executed. Exec is a high level function for working with container namespaces.
//
// It returns the exit status of the namespaced process, or -1 and an error
// if setup or waiting failed.
func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) {
var (
master *os.File
console string
err error
)
// create a pipe so that we can synchronize with the namespaced process and
// pass the veth name to the child
syncPipe, err := NewSyncPipe()
if err != nil {
return -1, err
}
if container.Tty {
master, console, err = system.CreateMasterAndConsole()
if err != nil {
return -1, err
}
term.SetMaster(master)
}
command := ns.commandFactory.Create(container, console, syncPipe.child.Fd(), args)
if err := term.Attach(command); err != nil {
return -1, err
}
defer term.Close()
if err := command.Start(); err != nil {
return -1, err
}
// record the pid; the deferred DeletePid removes it again on return
if err := ns.stateWriter.WritePid(command.Process.Pid); err != nil {
command.Process.Kill()
return -1, err
}
defer ns.stateWriter.DeletePid()
// Do this before syncing with child so that no children
// can escape the cgroup
if err := ns.SetupCgroups(container, command.Process.Pid); err != nil {
command.Process.Kill()
return -1, err
}
if err := ns.InitializeNetworking(container, command.Process.Pid, syncPipe); err != nil {
command.Process.Kill()
return -1, err
}
// Sync with child
syncPipe.Close()
// a non-zero exit (*exec.ExitError) is not an error here; the status is returned below
if err := command.Wait(); err != nil {
if _, ok := err.(*exec.ExitError); !ok {
return -1, err
}
}
return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil
}
// SetupCgroups applies the container's cgroup configuration to nspid.
// It does nothing when the container has no cgroup configuration.
func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) error {
	if container.Cgroups == nil {
		return nil
	}
	return container.Cgroups.Apply(nspid)
}
// InitializeNetworking runs the Create step of the strategy for each
// configured network, collecting their output in one shared context, and
// then sends that context to the child over pipe.
func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error {
	shared := libcontainer.Context{}
	for _, netConfig := range container.Networks {
		strategy, err := network.GetStrategy(netConfig.Type)
		if err != nil {
			return err
		}
		if createErr := strategy.Create(netConfig, nspid, shared); createErr != nil {
			return createErr
		}
	}
	return pipe.SendToChild(shared)
}

View file

@ -0,0 +1,94 @@
// +build linux
package nsinit
import (
"fmt"
"github.com/dotcloud/docker/pkg/libcontainer"
"github.com/dotcloud/docker/pkg/system"
"os"
"path/filepath"
"strconv"
"syscall"
)
// ExecIn uses an existing pid and joins the pid's namespaces with the new command.
//
// On success the current process is replaced via Execv and this function
// does not return; on failure it returns -1 and the error.
func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) {
// the loop variable shadows the receiver only inside this loop
for _, ns := range container.Namespaces {
if err := system.Unshare(ns.Value); err != nil {
return -1, err
}
}
fds, err := ns.getNsFds(nspid, container)
closeFds := func() {
for _, f := range fds {
system.Closefd(f)
}
}
if err != nil {
closeFds()
return -1, err
}
// foreach namespace fd, use setns to join an existing container's namespaces
for _, fd := range fds {
if fd > 0 {
if err := system.Setns(fd, 0); err != nil {
// NOTE(review): closeFds also closes descriptors this loop already closed — confirm this is harmless
closeFds()
return -1, fmt.Errorf("setns %s", err)
}
}
system.Closefd(fd)
}
// if the container has a new pid and mount namespace we need to
// remount proc and sys to pick up the changes
if container.Namespaces.Contains("NEWNS") && container.Namespaces.Contains("NEWPID") {
pid, err := system.Fork()
if err != nil {
return -1, err
}
// pid == 0: this is the forked child
if pid == 0 {
// TODO: make all raw syscalls to be fork safe
if err := system.Unshare(syscall.CLONE_NEWNS); err != nil {
return -1, err
}
if err := remountProc(); err != nil {
return -1, fmt.Errorf("remount proc %s", err)
}
if err := remountSys(); err != nil {
return -1, fmt.Errorf("remount sys %s", err)
}
goto dropAndExec
}
// parent: wait for the child and exit with its status
proc, err := os.FindProcess(pid)
if err != nil {
return -1, err
}
state, err := proc.Wait()
if err != nil {
return -1, err
}
os.Exit(state.Sys().(syscall.WaitStatus).ExitStatus())
}
dropAndExec:
if err := finalizeNamespace(container); err != nil {
return -1, err
}
if err := system.Execv(args[0], args[0:], container.Env); err != nil {
return -1, err
}
panic("unreachable")
}
// getNsFds opens /proc/<pid>/ns/<file> for every namespace in the
// container's configuration and returns the raw file descriptors, in the
// same order as container.Namespaces.
//
// The descriptors are opened with syscall.Open rather than os.OpenFile: an
// *os.File that is dropped after calling Fd() may be garbage collected, and
// its finalizer would close the descriptor while the caller still uses it.
//
// On error only the descriptors opened so far are returned, so the caller
// can close them without touching unrelated descriptors such as fd 0.
func (ns *linuxNs) getNsFds(pid int, container *libcontainer.Container) ([]uintptr, error) {
	fds := make([]uintptr, 0, len(container.Namespaces))
	for _, namespace := range container.Namespaces {
		path := filepath.Join("/proc/", strconv.Itoa(pid), "ns", namespace.File)
		fd, err := syscall.Open(path, syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
		if err != nil {
			return fds, &os.PathError{Op: "open", Path: path, Err: err}
		}
		fds = append(fds, uintptr(fd))
	}
	return fds, nil
}

153
libcontainer/nsinit/init.go Normal file
View file

@ -0,0 +1,153 @@
// +build linux
package nsinit
import (
"fmt"
"github.com/dotcloud/docker/pkg/libcontainer"
"github.com/dotcloud/docker/pkg/libcontainer/capabilities"
"github.com/dotcloud/docker/pkg/libcontainer/network"
"github.com/dotcloud/docker/pkg/libcontainer/utils"
"github.com/dotcloud/docker/pkg/system"
"github.com/dotcloud/docker/pkg/user"
"os"
"syscall"
)
// Init is the init process that first runs inside a new namespace to setup mounts, users, networking,
// and other options required for the new container.
//
// It ends by calling Execv with args; any earlier failure is returned as an error.
func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error {
rootfs, err := utils.ResolveRootfs(uncleanRootfs)
if err != nil {
return err
}
// We always read this as it is a way to sync with the parent as well
context, err := syncPipe.ReadFromParent()
if err != nil {
syncPipe.Close()
return err
}
syncPipe.Close()
if console != "" {
// close pipes so that we can replace it with the pty
closeStdPipes()
slave, err := system.OpenTerminal(console, syscall.O_RDWR)
if err != nil {
return fmt.Errorf("open terminal %s", err)
}
if err := dupSlave(slave); err != nil {
return fmt.Errorf("dup2 slave %s", err)
}
}
if _, err := system.Setsid(); err != nil {
return fmt.Errorf("setsid %s", err)
}
if console != "" {
if err := system.Setctty(); err != nil {
return fmt.Errorf("setctty %s", err)
}
}
/*
if err := system.ParentDeathSignal(); err != nil {
return fmt.Errorf("parent death signal %s", err)
}
*/
if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil {
return fmt.Errorf("setup mount namespace %s", err)
}
// context is the data received from the parent over the sync pipe
if err := setupNetwork(container, context); err != nil {
return fmt.Errorf("setup networking %s", err)
}
if err := system.Sethostname(container.Hostname); err != nil {
return fmt.Errorf("sethostname %s", err)
}
if err := finalizeNamespace(container); err != nil {
return fmt.Errorf("finalize namespace %s", err)
}
return system.Execv(args[0], args[0:], container.Env)
}
// closeStdPipes closes stdin, stdout and stderr, in that order. Close
// errors are ignored.
func closeStdPipes() {
	for _, stream := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
		stream.Close()
	}
}
// setupUser switches the process to the container's configured user.
// "root" or an empty user clears the supplementary groups and sets the
// real, effective and saved gid/uid to 0. Any other user is resolved with
// user.GetUserGroupSupplementary and applied via Setgroups, Setgid, Setuid.
func setupUser(container *libcontainer.Container) error {
	if container.User == "root" || container.User == "" {
		if err := system.Setgroups(nil); err != nil {
			return err
		}
		if err := system.Setresgid(0, 0, 0); err != nil {
			return err
		}
		if err := system.Setresuid(0, 0, 0); err != nil {
			return err
		}
		return nil
	}

	uid, gid, suppGids, err := user.GetUserGroupSupplementary(container.User, syscall.Getuid(), syscall.Getgid())
	if err != nil {
		return err
	}
	// groups first, then gid, then uid
	if err := system.Setgroups(suppGids); err != nil {
		return err
	}
	if err := system.Setgid(gid); err != nil {
		return err
	}
	if err := system.Setuid(uid); err != nil {
		return err
	}
	return nil
}
// dupSlave dup2 the pty slave's fd into stdout and stderr and ensures that
// the slave's fd is 0, or stdin
func dupSlave(slave *os.File) error {
// the slave must already be fd 0; it is not duplicated onto stdin here
if slave.Fd() != 0 {
return fmt.Errorf("slave fd not 0 %d", slave.Fd())
}
if err := system.Dup2(slave.Fd(), 1); err != nil {
return err
}
if err := system.Dup2(slave.Fd(), 2); err != nil {
return err
}
return nil
}
// setupNetwork runs the Initialize step of the matching network strategy
// for every network in the container's configuration, passing the context
// received from the parent. It stops at the first error and does nothing
// when no networks are configured.
func setupNetwork(container *libcontainer.Container, context libcontainer.Context) error {
	for _, config := range container.Networks {
		strategy, err := network.GetStrategy(config.Type)
		if err != nil {
			return err
		}
		// Only return on failure so that the remaining networks are
		// initialized as well.
		if err := strategy.Initialize(config, context); err != nil {
			return err
		}
	}
	return nil
}
// finalizeNamespace performs the last steps before exec: it drops the
// configured capabilities, switches to the configured user, and changes
// into the working directory when one is set.
func finalizeNamespace(container *libcontainer.Container) error {
	if dropErr := capabilities.DropCapabilities(container); dropErr != nil {
		return fmt.Errorf("drop capabilities %s", dropErr)
	}
	if userErr := setupUser(container); userErr != nil {
		return fmt.Errorf("setup user %s", userErr)
	}
	if container.WorkingDir == "" {
		return nil
	}
	if chdirErr := system.Chdir(container.WorkingDir); chdirErr != nil {
		return fmt.Errorf("chdir to %s %s", container.WorkingDir, chdirErr)
	}
	return nil
}

View file

@ -0,0 +1,254 @@
// +build linux
package nsinit
import (
"fmt"
"github.com/dotcloud/docker/pkg/system"
"os"
"path/filepath"
"syscall"
)
// default mount point flags
const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
// setupNewMountNamespace is used to initialize a new mount namespace for a new
// container in the rootfs that is specified.
//
// There is no need to unmount the new mounts because as soon as the mount namespace
// is no longer in use, the mounts will be removed automatically
func setupNewMountNamespace(rootfs, console string, readonly bool) error {
// mount as slave so that the new mounts do not propagate to the host
if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
return fmt.Errorf("mounting / as slave %s", err)
}
// bind mount the rootfs onto itself
if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
return fmt.Errorf("mouting %s as bind %s", rootfs, err)
}
if readonly {
if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil {
return fmt.Errorf("mounting %s as readonly %s", rootfs, err)
}
}
if err := mountSystem(rootfs); err != nil {
return fmt.Errorf("mount system %s", err)
}
if err := copyDevNodes(rootfs); err != nil {
return fmt.Errorf("copy dev nodes %s", err)
}
if err := setupLoopbackDevices(rootfs); err != nil {
return fmt.Errorf("setup loopback devices %s", err)
}
if err := setupDev(rootfs); err != nil {
return err
}
if console != "" {
if err := setupPtmx(rootfs, console); err != nil {
return err
}
}
// move the rootfs mount onto / and chroot into it
if err := system.Chdir(rootfs); err != nil {
return fmt.Errorf("chdir into %s %s", rootfs, err)
}
if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil {
return fmt.Errorf("mount move %s into / %s", rootfs, err)
}
if err := system.Chroot("."); err != nil {
return fmt.Errorf("chroot . %s", err)
}
if err := system.Chdir("/"); err != nil {
return fmt.Errorf("chdir / %s", err)
}
system.Umask(0022)
return nil
}
// copyDevNodes recreates a fixed set of the host's /dev nodes inside the
// container's rootfs. The umask is cleared for the duration of the call and
// restored afterwards.
func copyDevNodes(rootfs string) error {
	previous := system.Umask(0000)
	defer system.Umask(previous)

	nodes := []string{"null", "zero", "full", "random", "urandom", "tty"}
	for i := 0; i < len(nodes); i++ {
		if err := copyDevNode(rootfs, nodes[i]); err != nil {
			return err
		}
	}
	return nil
}
// setupLoopbackDevices bind mounts /dev/loop0, /dev/loop1, ... from the
// host into the rootfs, stopping at the first index that does not exist on
// the host.
func setupLoopbackDevices(rootfs string) error {
	for index := 0; ; index++ {
		name := fmt.Sprintf("loop%d", index)
		hostPath := filepath.Join("/dev", name)
		target := filepath.Join(rootfs, "dev", name)

		if _, err := os.Stat(hostPath); err != nil {
			if os.IsNotExist(err) {
				// no more loop devices on the host
				return nil
			}
			return err
		}

		// replace anything already at the target with an empty file to mount over
		if _, err := os.Stat(target); err == nil {
			os.Remove(target)
		}
		placeholder, err := os.Create(target)
		if err != nil {
			return err
		}
		placeholder.Close()

		if err := system.Mount(hostPath, target, "none", syscall.MS_BIND, ""); err != nil {
			return err
		}
	}
}
// copyDevNode creates rootfs/dev/<node> with the same mode and device
// number as the host's /dev/<node>. An already existing destination is not
// an error.
func copyDevNode(rootfs, node string) error {
	info, err := os.Stat(filepath.Join("/dev", node))
	if err != nil {
		return err
	}

	hostStat := info.Sys().(*syscall.Stat_t)
	target := filepath.Join(rootfs, "dev", node)

	mkErr := system.Mknod(target, hostStat.Mode, int(hostStat.Rdev))
	if mkErr != nil && !os.IsExist(mkErr) {
		return fmt.Errorf("copy %s %s", node, mkErr)
	}
	return nil
}
// setupDev creates the conventional /dev symlinks (core, fd, stdin, stdout,
// stderr) inside rootfs, each pointing at its /proc counterpart. An entry
// already present at a link location is removed first.
func setupDev(rootfs string) error {
	type devLink struct {
		target string
		name   string
	}
	links := []devLink{
		{target: "/proc/kcore", name: "/dev/core"},
		{target: "/proc/self/fd", name: "/dev/fd"},
		{target: "/proc/self/fd/0", name: "/dev/stdin"},
		{target: "/proc/self/fd/1", name: "/dev/stdout"},
		{target: "/proc/self/fd/2", name: "/dev/stderr"},
	}
	for i := range links {
		linkPath := filepath.Join(rootfs, links[i].name)
		err := os.Remove(linkPath)
		if err != nil && !os.IsNotExist(err) {
			return fmt.Errorf("remove %s %s", linkPath, err)
		}
		if err = os.Symlink(links[i].target, linkPath); err != nil {
			return fmt.Errorf("symlink %s %s", linkPath, err)
		}
	}
	return nil
}
// setupConsole makes rootfs/dev/console refer to the given console path. The
// console is chmod'ed to 0600 and chown'ed to 0:0, a node with the console's
// device number is created at rootfs/dev/console, and the console is then
// bind mounted over that node. The umask is cleared while this runs and
// restored on return.
func setupConsole(rootfs, console string) error {
	previous := system.Umask(0000)
	defer system.Umask(previous)

	info, err := os.Stat(console)
	if err != nil {
		return fmt.Errorf("stat console %s %s", console, err)
	}
	sysStat := info.Sys().(*syscall.Stat_t)
	target := filepath.Join(rootfs, "dev/console")

	// Remove whatever currently sits at the target; absence is not an error.
	if err = os.Remove(target); err != nil && !os.IsNotExist(err) {
		return fmt.Errorf("remove %s %s", target, err)
	}
	if err = os.Chmod(console, 0600); err != nil {
		return err
	}
	if err = os.Chown(console, 0, 0); err != nil {
		return err
	}
	// Keep the file-type bits of the original mode, force permissions to 0600.
	nodeMode := (sysStat.Mode &^ 07777) | 0600
	if err = system.Mknod(target, nodeMode, int(sysStat.Rdev)); err != nil {
		return fmt.Errorf("mknod %s %s", target, err)
	}
	if err = system.Mount(console, target, "bind", syscall.MS_BIND, ""); err != nil {
		return fmt.Errorf("bind %s to %s %s", console, target, err)
	}
	return nil
}
// mountSystem creates and mounts the standard pseudo filesystems under
// rootfs, in order: proc, sysfs, a tmpfs on /dev, a tmpfs on /dev/shm, devpts
// on /dev/pts and a tmpfs on /run. Each mount point directory is created
// before it is mounted.
func mountSystem(rootfs string) error {
	type sysMount struct {
		source, path, device string
		flags                int
		data                 string
	}
	mounts := []sysMount{
		{source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags},
		{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags},
		{source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: "mode=755"},
		{source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: "mode=1777"},
		{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"},
		{source: "tmpfs", path: filepath.Join(rootfs, "run"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_STRICTATIME, data: "mode=755"},
	}
	for i := range mounts {
		mnt := &mounts[i]
		if err := os.MkdirAll(mnt.path, 0755); err != nil && !os.IsExist(err) {
			return fmt.Errorf("mkdirall %s %s", mnt.path, err)
		}
		err := system.Mount(mnt.source, mnt.path, mnt.device, uintptr(mnt.flags), mnt.data)
		if err != nil {
			return fmt.Errorf("mounting %s into %s %s", mnt.source, mnt.path, err)
		}
	}
	return nil
}
// setupPtmx replaces rootfs/dev/ptmx with a relative symlink to pts/ptmx and
// then hands off to setupConsole to prepare /dev/console.
func setupPtmx(rootfs, console string) error {
	ptmxPath := filepath.Join(rootfs, "dev/ptmx")
	err := os.Remove(ptmxPath)
	if err != nil && !os.IsNotExist(err) {
		return err
	}
	if err = os.Symlink("pts/ptmx", ptmxPath); err != nil {
		return fmt.Errorf("symlink dev ptmx %s", err)
	}
	return setupConsole(rootfs, console)
}
// remountProc lazily detaches whatever is mounted on /proc and mounts a fresh
// proc filesystem there with the default mount flags. This is commonly needed
// when running a new process inside an existing container.
func remountProc() error {
	err := system.Unmount("/proc", syscall.MNT_DETACH)
	if err != nil {
		return err
	}
	return system.Mount("proc", "/proc", "proc", uintptr(defaultMountFlags), "")
}
// remountSys lazily detaches /sys and, when that succeeds, mounts a fresh
// sysfs there with the default mount flags. An EINVAL from the unmount is
// ignored and no remount is attempted in that case; any other unmount error is
// returned.
func remountSys() error {
	switch err := system.Unmount("/sys", syscall.MNT_DETACH); {
	case err == nil:
		return system.Mount("sysfs", "/sys", "sysfs", uintptr(defaultMountFlags), "")
	case err != syscall.EINVAL:
		return err
	}
	return nil
}

View file

@ -0,0 +1,26 @@
package nsinit
import (
"github.com/dotcloud/docker/pkg/libcontainer"
)
// NsInit is an interface with the public facing methods to provide high level
// exec operations on a container
type NsInit interface {
// Exec runs args for the container using the supplied terminal and returns
// an exit code.
Exec(container *libcontainer.Container, term Terminal, args []string) (int, error)
// ExecIn runs args against the already running process identified by nspid
// and returns an exit code.
ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error)
// Init performs the in-namespace setup of the container before running args.
// NOTE(review): the exact contract of uncleanRootfs, console and syncPipe is
// defined by the implementation, which is not part of this file.
Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error
}
// linuxNs is the NsInit implementation returned by NewNsInit.
type linuxNs struct {
// root is not populated by NewNsInit.
root string
// commandFactory and stateWriter are the collaborators injected by NewNsInit.
commandFactory CommandFactory
stateWriter StateWriter
}
// NewNsInit returns an NsInit backed by a linuxNs that uses the given command
// factory and state writer.
func NewNsInit(command CommandFactory, state StateWriter) NsInit {
	ns := new(linuxNs)
	ns.commandFactory = command
	ns.stateWriter = state
	return ns
}

View file

@ -0,0 +1,110 @@
package main
import (
"encoding/json"
"flag"
"github.com/dotcloud/docker/pkg/libcontainer"
"github.com/dotcloud/docker/pkg/libcontainer/nsinit"
"io/ioutil"
"log"
"os"
"path/filepath"
"strconv"
)
// Command line options, populated by registerFlags.
var (
// root is the directory holding container.json and the pid file;
// console is the -console flag value (pty slave path).
root, console string
// pipeFd is the file descriptor number of the sync pipe (-pipe flag).
pipeFd int
)
func registerFlags() {
flag.StringVar(&console, "console", "", "console (pty slave) path")
flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd")
flag.StringVar(&root, "root", ".", "root for storing configuration data")
flag.Parse()
}
// main is the nsinit entry point. After parsing flags and loading
// container.json from the root directory it dispatches on the first
// positional argument:
//
//   - "exec": runs the remaining arguments for the container. When a pid file
//     with a positive pid exists the command is run via ExecIn against that
//     process, otherwise a new container is started via Exec. The process
//     exits with the returned exit code.
//   - "init": runs inside the namespace to set up the container, using the
//     current working directory as the rootfs and the -pipe fd as sync pipe.
//
// Any failure is reported through log.Fatal.
func main() {
	registerFlags()

	if flag.NArg() < 1 {
		log.Fatalf("wrong number of arguments %d", flag.NArg())
	}
	container, err := loadContainer()
	if err != nil {
		log.Fatal(err)
	}
	ns, err := newNsInit()
	if err != nil {
		log.Fatal(err)
	}

	switch flag.Arg(0) {
	case "exec": // this is executed outside of the namespace in the cwd
		var exitCode int
		nspid, err := readPid()
		if err != nil {
			// A missing pid file just means no container is running yet.
			if !os.IsNotExist(err) {
				log.Fatal(err)
			}
		}
		if nspid > 0 {
			exitCode, err = ns.ExecIn(container, nspid, flag.Args()[1:])
		} else {
			term := nsinit.NewTerminal(os.Stdin, os.Stdout, os.Stderr, container.Tty)
			exitCode, err = ns.Exec(container, term, flag.Args()[1:])
		}
		if err != nil {
			log.Fatal(err)
		}
		os.Exit(exitCode)
	case "init": // this is executed inside of the namespace to setup the container
		cwd, err := os.Getwd()
		if err != nil {
			log.Fatal(err)
		}
		if flag.NArg() < 2 {
			log.Fatalf("wrong number of arguments %d", flag.NArg())
		}
		syncPipe, err := nsinit.NewSyncPipeFromFd(0, uintptr(pipeFd))
		if err != nil {
			log.Fatal(err)
		}
		if err := ns.Init(container, cwd, console, syncPipe, flag.Args()[1:]); err != nil {
			log.Fatal(err)
		}
	default:
		log.Fatalf("command not supported for nsinit %s", flag.Arg(0))
	}
}
// loadContainer decodes <root>/container.json into a libcontainer.Container.
func loadContainer() (*libcontainer.Container, error) {
	configPath := filepath.Join(root, "container.json")
	file, err := os.Open(configPath)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	var container *libcontainer.Container
	decoder := json.NewDecoder(file)
	if err = decoder.Decode(&container); err != nil {
		return nil, err
	}
	return container, nil
}
// readPid parses the decimal pid stored in <root>/pid. It returns -1 together
// with the underlying error when the file cannot be read or its content is
// not an integer.
func readPid() (int, error) {
	pidFile := filepath.Join(root, "pid")
	raw, err := ioutil.ReadFile(pidFile)
	if err != nil {
		return -1, err
	}
	pid, convErr := strconv.Atoi(string(raw))
	if convErr != nil {
		return -1, convErr
	}
	return pid, nil
}
// newNsInit builds the NsInit used by main, wiring in the default command
// factory and state writer, both rooted at the -root directory. The returned
// error is always nil.
func newNsInit() (nsinit.NsInit, error) {
	factory := &nsinit.DefaultCommandFactory{root}
	writer := &nsinit.DefaultStateWriter{root}
	return nsinit.NewNsInit(factory, writer), nil
}

View file

@ -0,0 +1,28 @@
package nsinit
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
)
// StateWriter handles writing and deleting the pid file
// on disk
type StateWriter interface {
// WritePid persists pid.
WritePid(pid int) error
// DeletePid removes the previously persisted pid.
DeletePid() error
}
// DefaultStateWriter is a StateWriter that keeps the pid in a file named
// "pid" inside the Root directory.
type DefaultStateWriter struct {
	Root string
}

// WritePid writes the namespaced process's pid, as decimal text without a
// trailing newline, to <Root>/pid. The file is created with mode 0644: it is
// plain data, so no execute bits are requested.
func (d *DefaultStateWriter) WritePid(pid int) error {
	return ioutil.WriteFile(filepath.Join(d.Root, "pid"), []byte(fmt.Sprint(pid)), 0644)
}

// DeletePid removes <Root>/pid. It returns the error from os.Remove, which is
// non-nil when the file does not exist.
func (d *DefaultStateWriter) DeletePid() error {
	return os.Remove(filepath.Join(d.Root, "pid"))
}

View file

@ -0,0 +1,73 @@
package nsinit
import (
"encoding/json"
"fmt"
"github.com/dotcloud/docker/pkg/libcontainer"
"github.com/dotcloud/docker/pkg/system"
"io/ioutil"
"os"
)
// SyncPipe allows communication between a child process and
// its parent and allows the two independent processes to
// synchronize their state.
type SyncPipe struct {
// parent is written to by SendToChild; child is read by ReadFromParent.
parent, child *os.File
}
// NewSyncPipe creates an OS pipe and wraps it in a SyncPipe: the read end
// becomes the child side and the write end the parent side. Close-on-exec is
// cleared on the child end; the result of that call is not checked.
func NewSyncPipe() (s *SyncPipe, err error) {
	readEnd, writeEnd, err := os.Pipe()
	if err != nil {
		return nil, err
	}
	system.UsetCloseOnExec(readEnd.Fd())
	return &SyncPipe{parent: writeEnd, child: readEnd}, nil
}
// NewSyncPipeFromFd wraps an already open file descriptor in a SyncPipe.
// A non-zero parendFd takes precedence and populates only the parent side;
// otherwise a non-zero childFd populates only the child side. When both are
// zero an error is returned.
func NewSyncPipeFromFd(parendFd, childFd uintptr) (*SyncPipe, error) {
	pipe := new(SyncPipe)
	switch {
	case parendFd > 0:
		pipe.parent = os.NewFile(parendFd, "parendPipe")
	case childFd > 0:
		pipe.child = os.NewFile(childFd, "childPipe")
	default:
		return nil, fmt.Errorf("no valid sync pipe fd specified")
	}
	return pipe, nil
}
// SendToChild JSON-encodes context and writes it to the parent side of the
// pipe. It returns the marshalling error or the write error, if any, so a
// failed send is no longer reported as success.
func (s *SyncPipe) SendToChild(context libcontainer.Context) error {
	data, err := json.Marshal(context)
	if err != nil {
		return err
	}
	if _, err := s.parent.Write(data); err != nil {
		return err
	}
	return nil
}
// ReadFromParent reads the child side of the pipe until EOF and decodes the
// bytes as a JSON libcontainer.Context. When nothing was written the zero
// Context is returned without error.
func (s *SyncPipe) ReadFromParent() (libcontainer.Context, error) {
	raw, err := ioutil.ReadAll(s.child)
	if err != nil {
		return nil, fmt.Errorf("error reading from sync pipe %s", err)
	}
	var context libcontainer.Context
	if len(raw) == 0 {
		return context, nil
	}
	if err := json.Unmarshal(raw, &context); err != nil {
		return nil, err
	}
	return context, nil
}
// Close closes whichever ends of the pipe are set. Both ends are always
// attempted; the first error encountered is returned instead of being
// silently discarded.
func (s *SyncPipe) Close() error {
	var firstErr error
	if s.parent != nil {
		if err := s.parent.Close(); err != nil {
			firstErr = err
		}
	}
	if s.child != nil {
		if err := s.child.Close(); err != nil && firstErr == nil {
			firstErr = err
		}
	}
	return firstErr
}

118
libcontainer/nsinit/term.go Normal file
View file

@ -0,0 +1,118 @@
package nsinit
import (
"github.com/dotcloud/docker/pkg/term"
"io"
"os"
"os/exec"
)
// Terminal abstracts how a command's standard streams are connected, with a
// tty-backed implementation (TtyTerminal) and a plain pipe-backed one
// (StdTerminal); see NewTerminal.
type Terminal interface {
io.Closer
// SetMaster hands the pty master to the terminal.
SetMaster(*os.File)
// Attach connects the terminal's streams to the command.
Attach(*exec.Cmd) error
// Resize sets the terminal size to h rows by w columns.
Resize(h, w int) error
}
// NewTerminal returns a TtyTerminal when tty is true and a StdTerminal
// otherwise, both wired to the given streams.
func NewTerminal(stdin io.Reader, stdout, stderr io.Writer, tty bool) Terminal {
	if !tty {
		return &StdTerminal{stdin: stdin, stdout: stdout, stderr: stderr}
	}
	return &TtyTerminal{stdin: stdin, stdout: stdout, stderr: stderr}
}
// TtyTerminal is the Terminal implementation used when the container has a
// tty: it copies data between the caller's streams and the pty master.
type TtyTerminal struct {
stdin io.Reader
stdout, stderr io.Writer
// master is the pty master supplied through SetMaster.
master *os.File
// state is the terminal state returned by setupWindow during Attach and
// passed to term.RestoreTerminal in Close.
state *term.State
}
// Resize sets the window size of the pty master to h rows by w columns.
// h and w are converted to uint16.
func (t *TtyTerminal) Resize(h, w int) error {
return term.SetWinsize(t.master.Fd(), &term.Winsize{Height: uint16(h), Width: uint16(w)})
}
// SetMaster stores the pty master used by Attach, Resize and Close.
func (t *TtyTerminal) SetMaster(master *os.File) {
t.master = master
}
// Attach starts copying master -> stdout and stdin -> master in background
// goroutines, then sizes the master like os.Stdin and switches os.Stdin to
// raw mode via setupWindow. If that fails the command's process is killed and
// the error is returned; otherwise the returned terminal state is remembered
// for Close.
func (t *TtyTerminal) Attach(command *exec.Cmd) error {
	go io.Copy(t.stdout, t.master)
	go io.Copy(t.master, t.stdin)

	state, err := t.setupWindow(t.master, os.Stdin)
	if err == nil {
		t.state = state
		return nil
	}
	command.Process.Kill()
	return err
}
// setupWindow reads the parent's window size, applies it to the master pty
// and then puts the parent into raw mode, returning the state reported by
// term.SetRawTerminal.
func (t *TtyTerminal) setupWindow(master, parent *os.File) (*term.State, error) {
	size, err := term.GetWinsize(parent.Fd())
	if err != nil {
		return nil, err
	}
	if err = term.SetWinsize(master.Fd(), size); err != nil {
		return nil, err
	}
	return term.SetRawTerminal(parent.Fd())
}
// Close restores os.Stdin to the state saved by Attach and closes the pty
// master, returning the master's close error. The result of the restore is
// not checked.
// NOTE(review): t.state is only set by a successful Attach; confirm that
// term.RestoreTerminal tolerates a nil state.
func (t *TtyTerminal) Close() error {
	stdinFd := os.Stdin.Fd()
	term.RestoreTerminal(stdinFd, t.state)
	return t.master.Close()
}
// StdTerminal is the Terminal implementation used without a tty: Attach
// connects the command's stdin, stdout and stderr pipes to these streams.
type StdTerminal struct {
stdin io.Reader
stdout, stderr io.Writer
}
// SetMaster is a no-op; it exists to satisfy the Terminal interface.
func (s *StdTerminal) SetMaster(*os.File) {
// no need to set master on non tty
}
// Close is a no-op and always returns nil.
func (s *StdTerminal) Close() error {
return nil
}
// Resize is a no-op and always returns nil.
func (s *StdTerminal) Resize(h, w int) error {
return nil
}
// Attach obtains the command's stdin, stdout and stderr pipes and starts
// three background goroutines that copy s.stdin into the command (closing its
// stdin when the copy ends) and the command's stdout/stderr into s.stdout and
// s.stderr. It returns as soon as the copies are started; it does not wait
// for them.
func (s *StdTerminal) Attach(command *exec.Cmd) error {
	cmdIn, err := command.StdinPipe()
	if err != nil {
		return err
	}
	cmdOut, err := command.StdoutPipe()
	if err != nil {
		return err
	}
	cmdErr, err := command.StderrPipe()
	if err != nil {
		return err
	}

	go func() {
		defer cmdIn.Close()
		io.Copy(cmdIn, s.stdin)
	}()
	go io.Copy(s.stdout, cmdOut)
	go io.Copy(s.stderr, cmdErr)
	return nil
}

View file

@ -0,0 +1,19 @@
// +build !linux
package nsinit
import (
"github.com/dotcloud/docker/pkg/libcontainer"
)
// Exec is the stub used on non-linux builds; it always returns -1 and
// libcontainer.ErrUnsupported.
func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) {
return -1, libcontainer.ErrUnsupported
}
// ExecIn is the stub used on non-linux builds; it always returns -1 and
// libcontainer.ErrUnsupported.
func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) {
return -1, libcontainer.ErrUnsupported
}
// Init is the stub used on non-linux builds; it always returns
// libcontainer.ErrUnsupported.
func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error {
return libcontainer.ErrUnsupported
}

134
libcontainer/types.go Normal file
View file

@ -0,0 +1,134 @@
package libcontainer
import (
"encoding/json"
"errors"
"github.com/syndtr/gocapability/capability"
"os"
)
// Sentinel errors returned by this package. The "Unkown" spelling in the
// identifiers is part of the exported API and is kept as is.
var (
// ErrUnkownNamespace is returned when a namespace name cannot be resolved.
ErrUnkownNamespace = errors.New("Unknown namespace")
// ErrUnkownCapability is returned when a capability name cannot be resolved.
ErrUnkownCapability = errors.New("Unknown capability")
// ErrUnsupported is returned by operations that are not available.
ErrUnsupported = errors.New("Unsupported method")
)
// namespaceList is used to convert the libcontainer types
// into the names of the files located in /proc/<pid>/ns/* for
// each namespace. It starts out empty here and is filled in by an init
// function in this package.
//
// capabilityList maps the capability names accepted in configuration to
// their gocapability values; GetCapability looks names up in it.
var (
namespaceList = Namespaces{}
capabilityList = Capabilities{
{Key: "SETPCAP", Value: capability.CAP_SETPCAP},
{Key: "SYS_MODULE", Value: capability.CAP_SYS_MODULE},
{Key: "SYS_RAWIO", Value: capability.CAP_SYS_RAWIO},
{Key: "SYS_PACCT", Value: capability.CAP_SYS_PACCT},
{Key: "SYS_ADMIN", Value: capability.CAP_SYS_ADMIN},
{Key: "SYS_NICE", Value: capability.CAP_SYS_NICE},
{Key: "SYS_RESOURCE", Value: capability.CAP_SYS_RESOURCE},
{Key: "SYS_TIME", Value: capability.CAP_SYS_TIME},
{Key: "SYS_TTY_CONFIG", Value: capability.CAP_SYS_TTY_CONFIG},
{Key: "MKNOD", Value: capability.CAP_MKNOD},
{Key: "AUDIT_WRITE", Value: capability.CAP_AUDIT_WRITE},
{Key: "AUDIT_CONTROL", Value: capability.CAP_AUDIT_CONTROL},
{Key: "MAC_OVERRIDE", Value: capability.CAP_MAC_OVERRIDE},
{Key: "MAC_ADMIN", Value: capability.CAP_MAC_ADMIN},
{Key: "NET_ADMIN", Value: capability.CAP_NET_ADMIN},
}
)
type (
// Namespace describes one kernel namespace: Key is its symbolic name
// (also its JSON form), Value the associated clone flag and File the
// entry name under /proc/<pid>/ns.
Namespace struct {
Key string
Value int
File string
}
// Namespaces is a list of Namespace pointers.
Namespaces []*Namespace
)
// String returns the namespace's Key.
func (ns *Namespace) String() string {
return ns.Key
}
// MarshalJSON encodes the namespace as a JSON string holding only its Key.
func (ns *Namespace) MarshalJSON() ([]byte, error) {
return json.Marshal(ns.Key)
}
// UnmarshalJSON decodes a JSON string holding a namespace key and copies the
// matching entry found by GetNamespace into ns. It returns
// ErrUnkownNamespace when the key is not known.
func (ns *Namespace) UnmarshalJSON(src []byte) error {
	var name string
	if err := json.Unmarshal(src, &name); err != nil {
		return err
	}
	if found := GetNamespace(name); found != nil {
		*ns = *found
		return nil
	}
	return ErrUnkownNamespace
}
// GetNamespace returns the entry of namespaceList whose Key equals key, or
// nil when there is none. When the DEBUG environment variable is non-empty a
// failed lookup panics instead of returning nil.
func GetNamespace(key string) *Namespace {
	for i := range namespaceList {
		if candidate := namespaceList[i]; candidate.Key == key {
			return candidate
		}
	}
	if debug := os.Getenv("DEBUG"); debug == "" {
		return nil
	}
	panic("Unreachable: Namespace not found")
}
// Contains returns true if a Namespace whose Key equals ns is
// in the slice. Only the receiver's own entries are consulted, so a known
// namespace that is not part of n yields false. Nil entries are skipped.
func (n Namespaces) Contains(ns string) bool {
	for _, candidate := range n {
		if candidate != nil && candidate.Key == ns {
			return true
		}
	}
	return false
}
type (
// Capability pairs a capability's symbolic name (Key, also its JSON
// form) with its gocapability value.
Capability struct {
Key string
Value capability.Cap
}
// Capabilities is a list of Capability pointers.
Capabilities []*Capability
)
// String returns the capability's Key.
func (c *Capability) String() string {
return c.Key
}
// MarshalJSON encodes the capability as a JSON string holding only its Key.
func (c *Capability) MarshalJSON() ([]byte, error) {
return json.Marshal(c.Key)
}
// UnmarshalJSON decodes a JSON string holding a capability key and copies the
// matching entry found by GetCapability into c. It returns
// ErrUnkownCapability when the key is not known.
func (c *Capability) UnmarshalJSON(src []byte) error {
	var name string
	if err := json.Unmarshal(src, &name); err != nil {
		return err
	}
	if found := GetCapability(name); found != nil {
		*c = *found
		return nil
	}
	return ErrUnkownCapability
}
// GetCapability returns the entry of capabilityList whose Key equals key, or
// nil when there is none. When the DEBUG environment variable is non-empty a
// failed lookup panics instead of returning nil.
func GetCapability(key string) *Capability {
	for i := range capabilityList {
		if candidate := capabilityList[i]; candidate.Key == key {
			return candidate
		}
	}
	if debug := os.Getenv("DEBUG"); debug == "" {
		return nil
	}
	panic("Unreachable: Capability not found")
}
// Contains returns true if a Capability whose Key equals capp is
// in the slice. Only the receiver's own entries are consulted, so a known
// capability that is not part of c yields false. Nil entries are skipped.
func (c Capabilities) Contains(capp string) bool {
	for _, candidate := range c {
		if candidate != nil && candidate.Key == capp {
			return true
		}
	}
	return false
}

View file

@ -0,0 +1,16 @@
package libcontainer
import (
"syscall"
)
// init fills namespaceList with the namespaces known to this build, pairing
// each key with its syscall CLONE_NEW* flag and the name of its file under
// /proc/<pid>/ns.
func init() {
namespaceList = Namespaces{
{Key: "NEWNS", Value: syscall.CLONE_NEWNS, File: "mnt"},
{Key: "NEWUTS", Value: syscall.CLONE_NEWUTS, File: "uts"},
{Key: "NEWIPC", Value: syscall.CLONE_NEWIPC, File: "ipc"},
{Key: "NEWUSER", Value: syscall.CLONE_NEWUSER, File: "user"},
{Key: "NEWPID", Value: syscall.CLONE_NEWPID, File: "pid"},
{Key: "NEWNET", Value: syscall.CLONE_NEWNET, File: "net"},
}
}

View file

@ -0,0 +1,28 @@
package utils
import (
	"crypto/rand"
	"encoding/hex"
	"fmt"
	"io"
	"path/filepath"
)
// GenerateRandomName returns prefix followed by size random lowercase
// hexadecimal characters read from crypto/rand.
//
// Any non-negative size is supported; a negative size yields an error
// instead of a slice-bounds panic. An error is also returned when the random
// source cannot be read.
func GenerateRandomName(prefix string, size int) (string, error) {
	if size < 0 {
		return "", fmt.Errorf("invalid random name size %d", size)
	}
	// Each random byte encodes to two hex characters; round up so odd sizes
	// are covered, then truncate to exactly size characters.
	id := make([]byte, (size+1)/2)
	if _, err := io.ReadFull(rand.Reader, id); err != nil {
		return "", err
	}
	return prefix + hex.EncodeToString(id)[:size], nil
}
// ResolveRootfs turns uncleanRootfs into an absolute path and resolves any
// symlinks in it, returning the resulting path.
func ResolveRootfs(uncleanRootfs string) (string, error) {
	absolute, err := filepath.Abs(uncleanRootfs)
	if err == nil {
		return filepath.EvalSymlinks(absolute)
	}
	return "", err
}

145
system/calls_linux.go Normal file
View file

@ -0,0 +1,145 @@
package system
import (
"os/exec"
"syscall"
)
// Chroot is a thin wrapper around syscall.Chroot.
func Chroot(dir string) error {
return syscall.Chroot(dir)
}
// Chdir is a thin wrapper around syscall.Chdir.
func Chdir(dir string) error {
return syscall.Chdir(dir)
}
// Exec is a thin wrapper around syscall.Exec; cmd is used as given, without
// any PATH lookup.
func Exec(cmd string, args []string, env []string) error {
return syscall.Exec(cmd, args, env)
}
// Execv resolves cmd with exec.LookPath and then calls Exec with the
// resolved path. It returns the lookup error when cmd cannot be found.
func Execv(cmd string, args []string, env []string) error {
name, err := exec.LookPath(cmd)
if err != nil {
return err
}
return Exec(name, args, env)
}
// Fork issues the SYS_FORK system call while holding syscall.ForkLock. It
// returns the value reported by the call, or -1 and the errno on failure.
func Fork() (int, error) {
syscall.ForkLock.Lock()
pid, _, err := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0)
syscall.ForkLock.Unlock()
if err != 0 {
return -1, err
}
return int(pid), nil
}
// Mount is a thin wrapper around syscall.Mount.
func Mount(source, target, fstype string, flags uintptr, data string) error {
return syscall.Mount(source, target, fstype, flags, data)
}
// Unmount is a thin wrapper around syscall.Unmount.
func Unmount(target string, flags int) error {
return syscall.Unmount(target, flags)
}
// Pivotroot is a thin wrapper around syscall.PivotRoot.
func Pivotroot(newroot, putold string) error {
return syscall.PivotRoot(newroot, putold)
}
// Unshare is a thin wrapper around syscall.Unshare.
func Unshare(flags int) error {
return syscall.Unshare(flags)
}
// Clone issues the raw SYS_CLONE system call with the given flags while
// holding syscall.ForkLock. It returns the value reported by the call, or -1
// and the errno on failure.
func Clone(flags uintptr) (int, error) {
syscall.ForkLock.Lock()
pid, _, err := syscall.RawSyscall(syscall.SYS_CLONE, flags, 0, 0)
syscall.ForkLock.Unlock()
if err != 0 {
return -1, err
}
return int(pid), nil
}
// UsetCloseOnExec issues fcntl(fd, F_SETFD, 0), setting the descriptor flags
// of fd to zero and thereby clearing close-on-exec. It returns the errno on
// failure.
func UsetCloseOnExec(fd uintptr) error {
if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, fd, syscall.F_SETFD, 0); err != 0 {
return err
}
return nil
}
// Setgroups is a thin wrapper around syscall.Setgroups.
func Setgroups(gids []int) error {
return syscall.Setgroups(gids)
}
// Setresgid is a thin wrapper around syscall.Setresgid.
func Setresgid(rgid, egid, sgid int) error {
return syscall.Setresgid(rgid, egid, sgid)
}
// Setresuid is a thin wrapper around syscall.Setresuid.
func Setresuid(ruid, euid, suid int) error {
return syscall.Setresuid(ruid, euid, suid)
}
// Setgid is a thin wrapper around syscall.Setgid.
func Setgid(gid int) error {
return syscall.Setgid(gid)
}
// Setuid is a thin wrapper around syscall.Setuid.
func Setuid(uid int) error {
return syscall.Setuid(uid)
}
// Sethostname passes name, converted to bytes, to syscall.Sethostname.
func Sethostname(name string) error {
return syscall.Sethostname([]byte(name))
}
// Setsid is a thin wrapper around syscall.Setsid.
func Setsid() (int, error) {
return syscall.Setsid()
}
// Ioctl issues the SYS_IOCTL system call on fd with the given request (flag)
// and argument (data). It returns the errno on failure.
func Ioctl(fd uintptr, flag, data uintptr) error {
if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 {
return err
}
return nil
}
// Closefd closes the file descriptor fd via syscall.Close.
func Closefd(fd uintptr) error {
return syscall.Close(int(fd))
}
// Dup2 is a thin wrapper around syscall.Dup2, called with fd1 as the first
// argument and fd2 as the second.
func Dup2(fd1, fd2 uintptr) error {
return syscall.Dup2(int(fd1), int(fd2))
}
// Mknod is a thin wrapper around syscall.Mknod.
func Mknod(path string, mode uint32, dev int) error {
return syscall.Mknod(path, mode, dev)
}
// ParentDeathSignal issues prctl(PR_SET_PDEATHSIG, SIGKILL) for the calling
// process. It returns the errno on failure.
func ParentDeathSignal() error {
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0); err != 0 {
return err
}
return nil
}
// Setctty issues the TIOCSCTTY ioctl on file descriptor 0 with a zero
// argument. It returns the errno on failure.
func Setctty() error {
if _, _, err := syscall.RawSyscall(syscall.SYS_IOCTL, 0, uintptr(syscall.TIOCSCTTY), 0); err != 0 {
return err
}
return nil
}
// Mkfifo is a thin wrapper around syscall.Mkfifo.
func Mkfifo(name string, mode uint32) error {
return syscall.Mkfifo(name, mode)
}
// Umask is a thin wrapper around syscall.Umask; it returns that call's
// result, which callers in this codebase treat as the previous mask.
func Umask(mask int) int {
return syscall.Umask(mask)
}
// SetCloneFlags sets cmd.SysProcAttr.Cloneflags to flag, allocating
// SysProcAttr first when it is nil. Any previously set clone flags are
// overwritten.
func SetCloneFlags(cmd *exec.Cmd, flag uintptr) {
if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &syscall.SysProcAttr{}
}
cmd.SysProcAttr.Cloneflags = flag
}

9
system/errors.go Normal file
View file

@ -0,0 +1,9 @@
package system
import (
"errors"
)
var (
// ErrNotSupportedPlatform is returned by functions in this package that
// have no implementation for the current platform or architecture.
ErrNotSupportedPlatform = errors.New("platform and architecture is not supported")
)

58
system/pty_linux.go Normal file
View file

@ -0,0 +1,58 @@
package system
import (
"fmt"
"os"
"syscall"
"unsafe"
)
// Unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f.
// Unlockpt should be called before opening the slave side of a pseudoterminal.
func Unlockpt(f *os.File) error {
	// TIOCSPTLCK reads a C int through the pointer, so pass a 32-bit value
	// rather than a Go int, whose width depends on the architecture. Zero
	// clears the lock.
	var u int32
	return Ioctl(f.Fd(), syscall.TIOCSPTLCK, uintptr(unsafe.Pointer(&u)))
}
// Ptsname returns the path of the pts slave device (/dev/pts/N) that belongs
// to the given pty master.
func Ptsname(f *os.File) (string, error) {
	// TIOCGPTN stores a C unsigned int through the pointer, so receive it in
	// a 32-bit value; with a 64-bit Go int only half of the variable would be
	// written, which yields a wrong number on big-endian machines.
	var n uint32
	if err := Ioctl(f.Fd(), syscall.TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil {
		return "", err
	}
	return fmt.Sprintf("/dev/pts/%d", n), nil
}
// CreateMasterAndConsole will open /dev/ptmx on the host and retrieve the
// pts name for use as the pty slave inside the container. On success the
// caller owns the returned master and must close it; on failure the master is
// closed here so the descriptor does not leak.
func CreateMasterAndConsole() (*os.File, string, error) {
	master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
	if err != nil {
		return nil, "", err
	}
	console, err := Ptsname(master)
	if err != nil {
		master.Close()
		return nil, "", err
	}
	if err := Unlockpt(master); err != nil {
		master.Close()
		return nil, "", err
	}
	return master, console, nil
}
// OpenPtmx opens /dev/ptmx, i.e. the PTY master.
// NOTE(review): the master is opened O_RDONLY here, whereas
// CreateMasterAndConsole opens it O_RDWR — confirm that read-only is intended.
func OpenPtmx() (*os.File, error) {
// O_NOCTTY and O_CLOEXEC are not present in os package so we use the syscall's one for all.
return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
}
// OpenTerminal opens name with syscall.Open using exactly the given flags, so
// unlike os.OpenFile it does not add O_CLOEXEC. It is used to open the pty
// slave inside the container namespace. Failures are reported as an
// *os.PathError with Op "open".
func OpenTerminal(name string, flag int) (*os.File, error) {
	fd, err := syscall.Open(name, flag, 0)
	if err != nil {
		return nil, &os.PathError{Op: "open", Path: name, Err: err}
	}
	return os.NewFile(uintptr(fd), name), nil
}

27
system/setns_linux.go Normal file
View file

@ -0,0 +1,27 @@
package system
import (
"fmt"
"runtime"
"syscall"
)
// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092
//
// We need different setns values for the different platforms and architectures.
// We are declaring the syscall number here because the SETNS syscall does not exist in the stdlib.
// Keys have the form "<GOOS>/<GOARCH>".
var setNsMap = map[string]uintptr{
"linux/amd64": 308,
}
// Setns invokes the setns system call with fd and flags, using the syscall
// number registered in setNsMap for the running GOOS/GOARCH. It returns
// ErrNotSupportedPlatform when no number is registered, or the errno when the
// call fails.
func Setns(fd uintptr, flags uintptr) error {
	platform := fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH)
	sysno, ok := setNsMap[platform]
	if !ok {
		return ErrNotSupportedPlatform
	}
	if _, _, errno := syscall.RawSyscall(sysno, fd, flags, 0); errno != 0 {
		return errno
	}
	return nil
}

15
system/unsupported.go Normal file
View file

@ -0,0 +1,15 @@
// +build !linux
package system
import (
"os/exec"
)
// SetCloneFlags is a no-op on non-linux builds.
func SetCloneFlags(cmd *exec.Cmd, flag uintptr) {
}
// UsetCloseOnExec is not implemented on non-linux builds and always returns
// ErrNotSupportedPlatform.
func UsetCloseOnExec(fd uintptr) error {
return ErrNotSupportedPlatform
}