diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index 91ac384..b40e1a3 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -5,10 +5,23 @@ import ( "fmt" "github.com/dotcloud/docker/pkg/mount" "io" + "io/ioutil" "os" + "path/filepath" + "strconv" "strings" ) +type Cgroup struct { + Name string `json:"name,omitempty"` + Parent string `json:"parent,omitempty"` + + DeviceAccess bool `json:"device_access,omitempty"` // name of parent cgroup or slice + Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) + MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap + CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) +} + // https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt func FindCgroupMountpoint(subsystem string) (string, error) { mounts, err := mount.GetMounts() @@ -25,7 +38,6 @@ func FindCgroupMountpoint(subsystem string) (string, error) { } } } - return "", fmt.Errorf("cgroup mountpoint not found for %s", subsystem) } @@ -40,18 +52,199 @@ func GetThisCgroupDir(subsystem string) (string, error) { return parseCgroupFile(subsystem, f) } +func GetInitCgroupDir(subsystem string) (string, error) { + f, err := os.Open("/proc/1/cgroup") + if err != nil { + return "", err + } + defer f.Close() + + return parseCgroupFile(subsystem, f) +} + +func (c *Cgroup) Path(root, subsystem string) (string, error) { + cgroup := c.Name + if c.Parent != "" { + cgroup = filepath.Join(c.Parent, cgroup) + } + initPath, err := GetInitCgroupDir(subsystem) + if err != nil { + return "", err + } + return filepath.Join(root, subsystem, initPath, cgroup), nil +} + +func (c *Cgroup) Join(root, subsystem string, pid int) (string, error) { + path, err := c.Path(root, subsystem) + if err != nil { + return "", err + } + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return "", err + } + if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil { + return "", err + } + return path, nil +} + +func (c *Cgroup) Cleanup(root string) error { + get := func(subsystem string) string { + path, _ := c.Path(root, subsystem) + return path + } + + for _, path := range []string{ + get("memory"), + get("devices"), + get("cpu"), + } { + os.RemoveAll(path) + } + return nil +} + func parseCgroupFile(subsystem string, r io.Reader) (string, error) { s := bufio.NewScanner(r) - for s.Scan() { if err := s.Err(); err != nil { return "", err } text := s.Text() parts := strings.Split(text, ":") - if parts[1] == subsystem { - return parts[2], nil + for _, subs := range strings.Split(parts[1], ",") { + if subs == subsystem { + return parts[2], nil + } } } return "", fmt.Errorf("cgroup '%s' not found in /proc/self/cgroup", subsystem) } + +func writeFile(dir, file, data string) error { + return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) +} + +func (c *Cgroup) Apply(pid int) error { + // We have two implementation of cgroups support, one is based on + // systemd and the dbus api, and one is based on raw cgroup fs operations + // following the pre-single-writer model docs at: + // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ + // + // we can pick any subsystem to find the root + cgroupRoot, err := FindCgroupMountpoint("cpu") + if err != nil { + return err + } + cgroupRoot = filepath.Dir(cgroupRoot) + + if _, err := os.Stat(cgroupRoot); err != nil { + return fmt.Errorf("cgroups fs not found") + } + if err := c.setupDevices(cgroupRoot, pid); err != nil { + return err + } + if err := c.setupMemory(cgroupRoot, pid); err != nil { + return err + } + if err := c.setupCpu(cgroupRoot, pid); err != nil { + return err + } + return nil +} + +func (c *Cgroup) setupDevices(cgroupRoot string, pid int) (err error) { + if !c.DeviceAccess { + dir, err := c.Join(cgroupRoot, "devices", pid) + if err != nil { + return err + } + + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if err := writeFile(dir, "devices.deny", "a"); err != nil { + return err + } + + allow := []string{ + // /dev/null, zero, full + "c 1:3 rwm", + "c 1:5 rwm", + "c 1:7 rwm", + + // consoles + "c 5:1 rwm", + "c 5:0 rwm", + "c 4:0 rwm", + "c 4:1 rwm", + + // /dev/urandom,/dev/random + "c 1:9 rwm", + "c 1:8 rwm", + + // /dev/pts/ - pts namespaces are "coming soon" + "c 136:* rwm", + "c 5:2 rwm", + + // tuntap + "c 10:200 rwm", + } + + for _, val := range allow { + if err := writeFile(dir, "devices.allow", val); err != nil { + return err + } + } + } + return nil +} + +func (c *Cgroup) setupMemory(cgroupRoot string, pid int) (err error) { + if c.Memory != 0 || c.MemorySwap != 0 { + dir, err := c.Join(cgroupRoot, "memory", pid) + if err != nil { + return err + } + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if c.Memory != 0 { + if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { + return err + } + if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { + return err + } + } + // By default, MemorySwap is set to twice the size of RAM. + // If you want to omit MemorySwap, set it to `-1'. + if c.MemorySwap != -1 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil { + return err + } + } + } + return nil +} + +func (c *Cgroup) setupCpu(cgroupRoot string, pid int) (err error) { + // We always want to join the cpu group, to allow fair cpu scheduling + // on a container basis + dir, err := c.Join(cgroupRoot, "cpu", pid) + if err != nil { + return err + } + if c.CpuShares != 0 { + if err := writeFile(dir, "cpu.shares", strconv.FormatInt(c.CpuShares, 10)); err != nil { + return err + } + } + return nil +} diff --git a/graphdb/graphdb.go b/graphdb/graphdb.go index 9e2466b..46a23b1 100644 --- a/graphdb/graphdb.go +++ b/graphdb/graphdb.go @@ -4,6 +4,7 @@ import ( "database/sql" "fmt" "path" + "strings" "sync" ) @@ -51,6 +52,21 @@ type Database struct { mux sync.RWMutex } +func IsNonUniqueNameError(err error) bool { + str := err.Error() + // sqlite 3.7.17-1ubuntu1 returns: + // Set failure: Abort due to constraint violation: columns parent_id, name are not unique + if strings.HasSuffix(str, "name are not unique") { + return true + } + // sqlite-3.8.3-1.fc20 returns: + // Set failure: Abort due to constraint violation: UNIQUE constraint failed: edge.parent_id, edge.name + if strings.Contains(str, "UNIQUE constraint failed") && strings.Contains(str, "edge.name") { + return true + } + return false +} + // Create a new graph database initialized with a root entity func NewDatabase(conn *sql.DB, init bool) (*Database, error) { if conn == nil { diff --git a/libcontainer/MAINTAINERS b/libcontainer/MAINTAINERS new file mode 100644 index 0000000..e53d933 --- /dev/null +++ b/libcontainer/MAINTAINERS @@ -0,0 +1,2 @@ +Michael Crosby (@crosbymichael) +Guillaume Charmes (@creack) diff --git a/libcontainer/README.md b/libcontainer/README.md new file mode 100644 index 0000000..d6e4ded --- /dev/null +++ b/libcontainer/README.md @@ -0,0 +1,90 @@ +## libcontainer - reference implementation for containers + +#### background + +libcontainer specifies configuration options for what a container is. It provides a native Go implementation +for using linux namespaces with no external dependencies. libcontainer provides many convience functions for working with namespaces, networking, and management. + + +#### container +A container is a self contained directory that is able to run one or more processes without +affecting the host system. The directory is usually a full system tree. Inside the directory +a `container.json` file is placed with the runtime configuration for how the processes +should be contained and ran. Environment, networking, and different capabilities for the +process are specified in this file. The configuration is used for each process executed inside the container. + +Sample `container.json` file: +```json +{ + "hostname": "koye", + "tty": true, + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm-256color" + ], + "namespaces": [ + "NEWIPC", + "NEWNS", + "NEWPID", + "NEWUTS", + "NEWNET" + ], + "capabilities": [ + "SETPCAP", + "SYS_MODULE", + "SYS_RAWIO", + "SYS_PACCT", + "SYS_ADMIN", + "SYS_NICE", + "SYS_RESOURCE", + "SYS_TIME", + "SYS_TTY_CONFIG", + "MKNOD", + "AUDIT_WRITE", + "AUDIT_CONTROL", + "MAC_OVERRIDE", + "MAC_ADMIN", + "NET_ADMIN" + ], + "networks": [{ + "type": "veth", + "context": { + "bridge": "docker0", + "prefix": "dock" + }, + "address": "172.17.0.100/16", + "gateway": "172.17.42.1", + "mtu": 1500 + } + ], + "cgroups": { + "name": "docker-koye", + "parent": "docker", + "memory": 5248000 + } +} +``` + +Using this configuration and the current directory holding the rootfs for a process, one can use libcontainer to exec the container. Running the life of the namespace, a `pid` file +is written to the current directory with the pid of the namespaced process to the external world. A client can use this pid to wait, kill, or perform other operation with the container. If a user tries to run an new process inside an existing container with a live namespace the namespace will be joined by the new process. + + +You may also specify an alternate root place where the `container.json` file is read and where the `pid` file will be saved. + +#### nsinit + +`nsinit` is a cli application used as the reference implementation of libcontainer. It is able to +spawn or join new containers giving the current directory. To use `nsinit` cd into a linux +rootfs and copy a `container.json` file into the directory with your specified configuration. + +To execute `/bin/bash` in the current directory as a container just run: +```bash +nsinit exec /bin/bash +``` + +If you wish to spawn another process inside the container while your current bash session is +running just run the exact same command again to get another bash shell or change the command. If the original process dies, PID 1, all other processes spawned inside the container will also be killed and the namespace will be removed. + +You can identify if a process is running in a container by looking to see if `pid` is in the root of the directory. diff --git a/libcontainer/TODO.md b/libcontainer/TODO.md new file mode 100644 index 0000000..f18c0b4 --- /dev/null +++ b/libcontainer/TODO.md @@ -0,0 +1,17 @@ +#### goals +* small and simple - line count is not everything but less code is better +* clean lines between what we do in the pkg +* provide primitives for working with namespaces not cater to every option +* extend via configuration not by features - host networking, no networking, veth network can be accomplished via adjusting the container.json, nothing to do with code + +#### tasks +* proper tty for a new process in an existing container +* use exec or raw syscalls for new process in existing container +* setup proper user in namespace if specified +* implement hook or clean interface for cgroups +* example configs for different setups (host networking, boot init) +* improve pkg documentation with comments +* testing - this is hard in a low level pkg but we could do some, maybe +* pivot root +* selinux +* apparmor diff --git a/libcontainer/apparmor/apparmor.go b/libcontainer/apparmor/apparmor.go new file mode 100644 index 0000000..a6d57d4 --- /dev/null +++ b/libcontainer/apparmor/apparmor.go @@ -0,0 +1,31 @@ +// +build apparmor,linux,amd64 + +package apparmor + +// #cgo LDFLAGS: -lapparmor +// #include +// #include +import "C" +import ( + "io/ioutil" + "unsafe" +) + +func IsEnabled() bool { + buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled") + return err == nil && len(buf) > 1 && buf[0] == 'Y' +} + +func ApplyProfile(pid int, name string) error { + if !IsEnabled() || name == "" { + return nil + } + + cName := C.CString(name) + defer C.free(unsafe.Pointer(cName)) + + if _, err := C.aa_change_onexec(cName); err != nil { + return err + } + return nil +} diff --git a/libcontainer/apparmor/apparmor_disabled.go b/libcontainer/apparmor/apparmor_disabled.go new file mode 100644 index 0000000..77543e4 --- /dev/null +++ b/libcontainer/apparmor/apparmor_disabled.go @@ -0,0 +1,13 @@ +// +build !apparmor !linux !amd64 + +package apparmor + +import () + +func IsEnabled() bool { + return false +} + +func ApplyProfile(pid int, name string) error { + return nil +} diff --git a/libcontainer/apparmor/setup.go b/libcontainer/apparmor/setup.go new file mode 100644 index 0000000..e07759c --- /dev/null +++ b/libcontainer/apparmor/setup.go @@ -0,0 +1,97 @@ +package apparmor + +import ( + "fmt" + "io/ioutil" + "os" + "os/exec" +) + +const DefaultProfilePath = "/etc/apparmor.d/docker" +const DefaultProfile = ` +# AppArmor profile from lxc for containers. +@{HOME}=@{HOMEDIRS}/*/ /root/ +@{HOMEDIRS}=/home/ +#@{HOMEDIRS}+= +@{multiarch}=*-linux-gnu* +@{PROC}=/proc/ + +profile docker-default flags=(attach_disconnected,mediate_deleted) { + network, + capability, + file, + umount, + + # ignore DENIED message on / remount + deny mount options=(ro, remount) -> /, + + # allow tmpfs mounts everywhere + mount fstype=tmpfs, + + # allow mqueue mounts everywhere + mount fstype=mqueue, + + # allow fuse mounts everywhere + mount fstype=fuse.*, + + # allow bind mount of /lib/init/fstab for lxcguest + mount options=(rw, bind) /lib/init/fstab.lxc/ -> /lib/init/fstab/, + + # deny writes in /proc/sys/fs but allow binfmt_misc to be mounted + mount fstype=binfmt_misc -> /proc/sys/fs/binfmt_misc/, + deny @{PROC}/sys/fs/** wklx, + + # allow efivars to be mounted, writing to it will be blocked though + mount fstype=efivarfs -> /sys/firmware/efi/efivars/, + + # block some other dangerous paths + deny @{PROC}/sysrq-trigger rwklx, + deny @{PROC}/mem rwklx, + deny @{PROC}/kmem rwklx, + deny @{PROC}/sys/kernel/[^s][^h][^m]* wklx, + deny @{PROC}/sys/kernel/*/** wklx, + + # deny writes in /sys except for /sys/fs/cgroup, also allow + # fusectl, securityfs and debugfs to be mounted there (read-only) + mount fstype=fusectl -> /sys/fs/fuse/connections/, + mount fstype=securityfs -> /sys/kernel/security/, + mount fstype=debugfs -> /sys/kernel/debug/, + deny mount fstype=debugfs -> /var/lib/ureadahead/debugfs/, + mount fstype=proc -> /proc/, + mount fstype=sysfs -> /sys/, + deny /sys/[^f]*/** wklx, + deny /sys/f[^s]*/** wklx, + deny /sys/fs/[^c]*/** wklx, + deny /sys/fs/c[^g]*/** wklx, + deny /sys/fs/cg[^r]*/** wklx, + deny /sys/firmware/efi/efivars/** rwklx, + deny /sys/kernel/security/** rwklx, + mount options=(move) /sys/fs/cgroup/cgmanager/ -> /sys/fs/cgroup/cgmanager.lower/, + + # the container may never be allowed to mount devpts. If it does, it + # will remount the host's devpts. We could allow it to do it with + # the newinstance option (but, right now, we don't). + deny mount fstype=devpts, +} +` + +func InstallDefaultProfile() error { + if !IsEnabled() { + return nil + } + + // If the profile already exists, let it be. + if _, err := os.Stat(DefaultProfilePath); err == nil { + return nil + } + + if err := ioutil.WriteFile(DefaultProfilePath, []byte(DefaultProfile), 0644); err != nil { + return err + } + + output, err := exec.Command("/lib/init/apparmor-profile-load", "docker").CombinedOutput() + if err != nil { + return fmt.Errorf("Error loading docker profile: %s (%s)", err, output) + } + return nil +} diff --git a/libcontainer/capabilities/capabilities.go b/libcontainer/capabilities/capabilities.go new file mode 100644 index 0000000..3c6d752 --- /dev/null +++ b/libcontainer/capabilities/capabilities.go @@ -0,0 +1,33 @@ +package capabilities + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/syndtr/gocapability/capability" + "os" +) + +// DropCapabilities drops capabilities for the current process based +// on the container's configuration. +func DropCapabilities(container *libcontainer.Container) error { + if drop := getCapabilities(container); len(drop) > 0 { + c, err := capability.NewPid(os.Getpid()) + if err != nil { + return err + } + c.Unset(capability.CAPS|capability.BOUNDS, drop...) + + if err := c.Apply(capability.CAPS | capability.BOUNDS); err != nil { + return err + } + } + return nil +} + +// getCapabilities returns the specific cap values for the libcontainer types +func getCapabilities(container *libcontainer.Container) []capability.Cap { + drop := []capability.Cap{} + for _, c := range container.Capabilities { + drop = append(drop, c.Value) + } + return drop +} diff --git a/libcontainer/container.go b/libcontainer/container.go new file mode 100644 index 0000000..a777da5 --- /dev/null +++ b/libcontainer/container.go @@ -0,0 +1,38 @@ +package libcontainer + +import ( + "github.com/dotcloud/docker/pkg/cgroups" +) + +// Context is a generic key value pair that allows +// arbatrary data to be sent +type Context map[string]string + +// Container defines configuration options for how a +// container is setup inside a directory and how a process should be executed +type Container struct { + Hostname string `json:"hostname,omitempty"` // hostname + ReadonlyFs bool `json:"readonly_fs,omitempty"` // set the containers rootfs as readonly + NoPivotRoot bool `json:"no_pivot_root,omitempty"` // this can be enabled if you are running in ramdisk + User string `json:"user,omitempty"` // user to execute the process as + WorkingDir string `json:"working_dir,omitempty"` // current working directory + Env []string `json:"environment,omitempty"` // environment to set + Tty bool `json:"tty,omitempty"` // setup a proper tty or not + Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply + Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop + Networks []*Network `json:"networks,omitempty"` // nil for host's network stack + Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"` // cgroups + Context Context `json:"context,omitempty"` // generic context for specific options (apparmor, selinux) +} + +// Network defines configuration for a container's networking stack +// +// The network configuration can be omited from a container causing the +// container to be setup with the host's networking stack +type Network struct { + Type string `json:"type,omitempty"` // type of networking to setup i.e. veth, macvlan, etc + Context Context `json:"context,omitempty"` // generic context for type specific networking options + Address string `json:"address,omitempty"` + Gateway string `json:"gateway,omitempty"` + Mtu int `json:"mtu,omitempty"` +} diff --git a/libcontainer/container.json b/libcontainer/container.json new file mode 100644 index 0000000..83e4074 --- /dev/null +++ b/libcontainer/container.json @@ -0,0 +1,50 @@ +{ + "hostname": "koye", + "tty": true, + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm-256color" + ], + "namespaces": [ + "NEWIPC", + "NEWNS", + "NEWPID", + "NEWUTS", + "NEWNET" + ], + "capabilities": [ + "SETPCAP", + "SYS_MODULE", + "SYS_RAWIO", + "SYS_PACCT", + "SYS_ADMIN", + "SYS_NICE", + "SYS_RESOURCE", + "SYS_TIME", + "SYS_TTY_CONFIG", + "MKNOD", + "AUDIT_WRITE", + "AUDIT_CONTROL", + "MAC_OVERRIDE", + "MAC_ADMIN", + "NET_ADMIN" + ], + "networks": [{ + "type": "veth", + "context": { + "bridge": "docker0", + "prefix": "dock" + }, + "address": "172.17.0.100/16", + "gateway": "172.17.42.1", + "mtu": 1500 + } + ], + "cgroups": { + "name": "docker-koye", + "parent": "docker", + "memory": 5248000 + } +} diff --git a/libcontainer/network/network.go b/libcontainer/network/network.go new file mode 100644 index 0000000..8c7a4b6 --- /dev/null +++ b/libcontainer/network/network.go @@ -0,0 +1,78 @@ +package network + +import ( + "github.com/dotcloud/docker/pkg/netlink" + "net" +) + +func InterfaceUp(name string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkLinkUp(iface) +} + +func InterfaceDown(name string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkLinkDown(iface) +} + +func ChangeInterfaceName(old, newName string) error { + iface, err := net.InterfaceByName(old) + if err != nil { + return err + } + return netlink.NetworkChangeName(iface, newName) +} + +func CreateVethPair(name1, name2 string) error { + return netlink.NetworkCreateVethPair(name1, name2) +} + +func SetInterfaceInNamespacePid(name string, nsPid int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetNsPid(iface, nsPid) +} + +func SetInterfaceMaster(name, master string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + masterIface, err := net.InterfaceByName(master) + if err != nil { + return err + } + return netlink.NetworkSetMaster(iface, masterIface) +} + +func SetDefaultGateway(ip string) error { + return netlink.AddDefaultGw(net.ParseIP(ip)) +} + +func SetInterfaceIp(name string, rawIp string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + ip, ipNet, err := net.ParseCIDR(rawIp) + if err != nil { + return err + } + return netlink.NetworkLinkAddIp(iface, ip, ipNet) +} + +func SetMtu(name string, mtu int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetMTU(iface, mtu) +} diff --git a/libcontainer/network/strategy.go b/libcontainer/network/strategy.go new file mode 100644 index 0000000..234fcc0 --- /dev/null +++ b/libcontainer/network/strategy.go @@ -0,0 +1,32 @@ +package network + +import ( + "errors" + "github.com/dotcloud/docker/pkg/libcontainer" +) + +var ( + ErrNotValidStrategyType = errors.New("not a valid network strategy type") +) + +var strategies = map[string]NetworkStrategy{ + "veth": &Veth{}, +} + +// NetworkStrategy represents a specific network configuration for +// a container's networking stack +type NetworkStrategy interface { + Create(*libcontainer.Network, int, libcontainer.Context) error + Initialize(*libcontainer.Network, libcontainer.Context) error +} + +// GetStrategy returns the specific network strategy for the +// provided type. If no strategy is registered for the type an +// ErrNotValidStrategyType is returned. +func GetStrategy(tpe string) (NetworkStrategy, error) { + s, exists := strategies[tpe] + if !exists { + return nil, ErrNotValidStrategyType + } + return s, nil +} diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go new file mode 100644 index 0000000..3ab1b23 --- /dev/null +++ b/libcontainer/network/veth.go @@ -0,0 +1,100 @@ +package network + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/utils" +) + +// Veth is a network strategy that uses a bridge and creates +// a veth pair, one that stays outside on the host and the other +// is placed inside the container's namespace +type Veth struct { +} + +func (v *Veth) Create(n *libcontainer.Network, nspid int, context libcontainer.Context) error { + var ( + bridge string + prefix string + exists bool + ) + if bridge, exists = n.Context["bridge"]; !exists { + return fmt.Errorf("bridge does not exist in network context") + } + if prefix, exists = n.Context["prefix"]; !exists { + return fmt.Errorf("veth prefix does not exist in network context") + } + name1, name2, err := createVethPair(prefix) + if err != nil { + return err + } + context["veth-host"] = name1 + context["veth-child"] = name2 + if err := SetInterfaceMaster(name1, bridge); err != nil { + return err + } + if err := SetMtu(name1, n.Mtu); err != nil { + return err + } + if err := InterfaceUp(name1); err != nil { + return err + } + if err := SetInterfaceInNamespacePid(name2, nspid); err != nil { + return err + } + return nil +} + +func (v *Veth) Initialize(config *libcontainer.Network, context libcontainer.Context) error { + var ( + vethChild string + exists bool + ) + if vethChild, exists = context["veth-child"]; !exists { + return fmt.Errorf("vethChild does not exist in network context") + } + if err := InterfaceDown(vethChild); err != nil { + return fmt.Errorf("interface down %s %s", vethChild, err) + } + if err := ChangeInterfaceName(vethChild, "eth0"); err != nil { + return fmt.Errorf("change %s to eth0 %s", vethChild, err) + } + if err := SetInterfaceIp("eth0", config.Address); err != nil { + return fmt.Errorf("set eth0 ip %s", err) + } + if err := SetMtu("eth0", config.Mtu); err != nil { + return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) + } + if err := InterfaceUp("eth0"); err != nil { + return fmt.Errorf("eth0 up %s", err) + } + if err := SetMtu("lo", config.Mtu); err != nil { + return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) + } + if err := InterfaceUp("lo"); err != nil { + return fmt.Errorf("lo up %s", err) + } + if config.Gateway != "" { + if err := SetDefaultGateway(config.Gateway); err != nil { + return fmt.Errorf("set gateway to %s %s", config.Gateway, err) + } + } + return nil +} + +// createVethPair will automatically generage two random names for +// the veth pair and ensure that they have been created +func createVethPair(prefix string) (name1 string, name2 string, err error) { + name1, err = utils.GenerateRandomName(prefix, 4) + if err != nil { + return + } + name2, err = utils.GenerateRandomName(prefix, 4) + if err != nil { + return + } + if err = CreateVethPair(name1, name2); err != nil { + return + } + return +} diff --git a/libcontainer/nsinit/command.go b/libcontainer/nsinit/command.go new file mode 100644 index 0000000..5546065 --- /dev/null +++ b/libcontainer/nsinit/command.go @@ -0,0 +1,45 @@ +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/system" + "os" + "os/exec" +) + +// CommandFactory takes the container's configuration and options passed by the +// parent processes and creates an *exec.Cmd that will be used to fork/exec the +// namespaced init process +type CommandFactory interface { + Create(container *libcontainer.Container, console string, syncFd *os.File, args []string) *exec.Cmd +} + +type DefaultCommandFactory struct { + Root string +} + +// Create will return an exec.Cmd with the Cloneflags set to the proper namespaces +// defined on the container's configuration and use the current binary as the init with the +// args provided +func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console string, pipe *os.File, args []string) *exec.Cmd { + // get our binary name from arg0 so we can always reexec ourself + command := exec.Command(os.Args[0], append([]string{ + "-console", console, + "-pipe", "3", + "-root", c.Root, + "init"}, args...)...) + + system.SetCloneFlags(command, uintptr(GetNamespaceFlags(container.Namespaces))) + command.Env = container.Env + command.ExtraFiles = []*os.File{pipe} + return command +} + +// GetNamespaceFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare, and setns +func GetNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { + for _, ns := range namespaces { + flag |= ns.Value + } + return flag +} diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go new file mode 100644 index 0000000..4963f12 --- /dev/null +++ b/libcontainer/nsinit/exec.go @@ -0,0 +1,96 @@ +// +build linux + +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/network" + "github.com/dotcloud/docker/pkg/system" + "os" + "os/exec" + "syscall" +) + +// Exec performes setup outside of a namespace so that a container can be +// executed. Exec is a high level function for working with container namespaces. +func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) { + var ( + master *os.File + console string + err error + ) + + // create a pipe so that we can syncronize with the namespaced process and + // pass the veth name to the child + syncPipe, err := NewSyncPipe() + if err != nil { + return -1, err + } + + if container.Tty { + master, console, err = system.CreateMasterAndConsole() + if err != nil { + return -1, err + } + term.SetMaster(master) + } + + command := ns.commandFactory.Create(container, console, syncPipe.child, args) + if err := term.Attach(command); err != nil { + return -1, err + } + defer term.Close() + + if err := command.Start(); err != nil { + return -1, err + } + if err := ns.stateWriter.WritePid(command.Process.Pid); err != nil { + command.Process.Kill() + return -1, err + } + defer ns.stateWriter.DeletePid() + + // Do this before syncing with child so that no children + // can escape the cgroup + if err := ns.SetupCgroups(container, command.Process.Pid); err != nil { + command.Process.Kill() + return -1, err + } + if err := ns.InitializeNetworking(container, command.Process.Pid, syncPipe); err != nil { + command.Process.Kill() + return -1, err + } + + // Sync with child + syncPipe.Close() + + if err := command.Wait(); err != nil { + if _, ok := err.(*exec.ExitError); !ok { + return -1, err + } + } + return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil +} + +func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) error { + if container.Cgroups != nil { + if err := container.Cgroups.Apply(nspid); err != nil { + return err + } + } + return nil +} + +func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error { + context := libcontainer.Context{} + for _, config := range container.Networks { + strategy, err := network.GetStrategy(config.Type) + if err != nil { + return err + } + if err := strategy.Create(config, nspid, context); err != nil { + return err + } + } + return pipe.SendToChild(context) +} diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go new file mode 100644 index 0000000..488fe0e --- /dev/null +++ b/libcontainer/nsinit/execin.go @@ -0,0 +1,94 @@ +// +build linux + +package nsinit + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/system" + "os" + "path/filepath" + "strconv" + "syscall" +) + +// ExecIn uses an existing pid and joins the pid's namespaces with the new command. +func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) { + for _, ns := range container.Namespaces { + if err := system.Unshare(ns.Value); err != nil { + return -1, err + } + } + fds, err := ns.getNsFds(nspid, container) + closeFds := func() { + for _, f := range fds { + system.Closefd(f) + } + } + if err != nil { + closeFds() + return -1, err + } + + // foreach namespace fd, use setns to join an existing container's namespaces + for _, fd := range fds { + if fd > 0 { + if err := system.Setns(fd, 0); err != nil { + closeFds() + return -1, fmt.Errorf("setns %s", err) + } + } + system.Closefd(fd) + } + + // if the container has a new pid and mount namespace we need to + // remount proc and sys to pick up the changes + if container.Namespaces.Contains("NEWNS") && container.Namespaces.Contains("NEWPID") { + pid, err := system.Fork() + if err != nil { + return -1, err + } + if pid == 0 { + // TODO: make all raw syscalls to be fork safe + if err := system.Unshare(syscall.CLONE_NEWNS); err != nil { + return -1, err + } + if err := remountProc(); err != nil { + return -1, fmt.Errorf("remount proc %s", err) + } + if err := remountSys(); err != nil { + return -1, fmt.Errorf("remount sys %s", err) + } + goto dropAndExec + } + proc, err := os.FindProcess(pid) + if err != nil { + return -1, err + } + state, err := proc.Wait() + if err != nil { + return -1, err + } + os.Exit(state.Sys().(syscall.WaitStatus).ExitStatus()) + } +dropAndExec: + if err := finalizeNamespace(container); err != nil { + return -1, err + } + if err := system.Execv(args[0], args[0:], container.Env); err != nil { + return -1, err + } + panic("unreachable") +} + +func (ns *linuxNs) getNsFds(pid int, container *libcontainer.Container) ([]uintptr, error) { + fds := make([]uintptr, len(container.Namespaces)) + for i, ns := range container.Namespaces { + f, err := os.OpenFile(filepath.Join("/proc/", strconv.Itoa(pid), "ns", ns.File), os.O_RDONLY, 0) + if err != nil { + return fds, err + } + fds[i] = f.Fd() + } + return fds, nil +} diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go new file mode 100644 index 0000000..336fc1e --- /dev/null +++ b/libcontainer/nsinit/init.go @@ -0,0 +1,147 @@ +// +build linux + +package nsinit + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/apparmor" + "github.com/dotcloud/docker/pkg/libcontainer/capabilities" + "github.com/dotcloud/docker/pkg/libcontainer/network" + "github.com/dotcloud/docker/pkg/libcontainer/utils" + "github.com/dotcloud/docker/pkg/system" + "github.com/dotcloud/docker/pkg/user" + "os" + "syscall" +) + +// Init is the init process that first runs inside a new namespace to setup mounts, users, networking, +// and other options required for the new container. +func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { + rootfs, err := utils.ResolveRootfs(uncleanRootfs) + if err != nil { + return err + } + + // We always read this as it is a way to sync with the parent as well + context, err := syncPipe.ReadFromParent() + if err != nil { + syncPipe.Close() + return err + } + syncPipe.Close() + + if console != "" { + slave, err := system.OpenTerminal(console, syscall.O_RDWR) + if err != nil { + return fmt.Errorf("open terminal %s", err) + } + if err := dupSlave(slave); err != nil { + return fmt.Errorf("dup2 slave %s", err) + } + } + if _, err := system.Setsid(); err != nil { + return fmt.Errorf("setsid %s", err) + } + if console != "" { + if err := system.Setctty(); err != nil { + return fmt.Errorf("setctty %s", err) + } + } + if err := system.ParentDeathSignal(); err != nil { + return fmt.Errorf("parent death signal %s", err) + } + if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs, container.NoPivotRoot); err != nil { + return fmt.Errorf("setup mount namespace %s", err) + } + if err := setupNetwork(container, context); err != nil { + return fmt.Errorf("setup networking %s", err) + } + if err := system.Sethostname(container.Hostname); err != nil { + return fmt.Errorf("sethostname %s", err) + } + if err := finalizeNamespace(container); err != nil { + return fmt.Errorf("finalize namespace %s", err) + } + + if err := apparmor.ApplyProfile(os.Getpid(), container.Context["apparmor_profile"]); err != nil { + return err + } + return system.Execv(args[0], args[0:], container.Env) +} + +func setupUser(container *libcontainer.Container) error { + switch container.User { + case "root", "": + if err := system.Setgroups(nil); err != nil { + return err + } + if err := system.Setresgid(0, 0, 0); err != nil { + return err + } + if err := system.Setresuid(0, 0, 0); err != nil { + return err + } + default: + uid, gid, suppGids, err := user.GetUserGroupSupplementary(container.User, syscall.Getuid(), syscall.Getgid()) + if err != nil { + return err + } + if err := system.Setgroups(suppGids); err != nil { + return err + } + if err := system.Setgid(gid); err != nil { + return err + } + if err := system.Setuid(uid); err != nil { + return err + } + } + return nil +} + +// dupSlave dup2 the pty slave's fd into stdout and stdin and ensures that +// the slave's fd is 0, or stdin +func dupSlave(slave *os.File) error { + if err := system.Dup2(slave.Fd(), 0); err != nil { + return err + } + if err := system.Dup2(slave.Fd(), 1); err != nil { + return err + } + if err := system.Dup2(slave.Fd(), 2); err != nil { + return err + } + return nil +} + +// setupVethNetwork uses the Network config if it is not nil to initialize +// the new veth interface inside the container for use by changing the name to eth0 +// setting the MTU and IP address along with the default gateway +func setupNetwork(container *libcontainer.Container, context libcontainer.Context) error { + for _, config := range container.Networks { + strategy, err := network.GetStrategy(config.Type) + if err != nil { + return err + } + return strategy.Initialize(config, context) + } + return nil +} + +// finalizeNamespace drops the caps and sets the correct user +// and working dir before execing the command inside the namespace +func finalizeNamespace(container *libcontainer.Container) error { + if err := capabilities.DropCapabilities(container); err != nil { + return fmt.Errorf("drop capabilities %s", err) + } + if err := setupUser(container); err != nil { + return fmt.Errorf("setup user %s", err) + } + if container.WorkingDir != "" { + if err := system.Chdir(container.WorkingDir); err != nil { + return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) + } + } + return nil +} diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go new file mode 100644 index 0000000..83577cf --- /dev/null +++ b/libcontainer/nsinit/mount.go @@ -0,0 +1,277 @@ +// +build linux + +package nsinit + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/system" + "io/ioutil" + "os" + "path/filepath" + "syscall" +) + +// default mount point flags +const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV + +// setupNewMountNamespace is used to initialize a new mount namespace for an new +// container in the rootfs that is specified. +// +// There is no need to unmount the new mounts because as soon as the mount namespace +// is no longer in use, the mounts will be removed automatically +func setupNewMountNamespace(rootfs, console string, readonly, noPivotRoot bool) error { + flag := syscall.MS_PRIVATE + if noPivotRoot { + flag = syscall.MS_SLAVE + } + if err := system.Mount("", "/", "", uintptr(flag|syscall.MS_REC), ""); err != nil { + return fmt.Errorf("mounting / as slave %s", err) + } + if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mouting %s as bind %s", rootfs, err) + } + if readonly { + if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mounting %s as readonly %s", rootfs, err) + } + } + if err := mountSystem(rootfs); err != nil { + return fmt.Errorf("mount system %s", err) + } + if err := copyDevNodes(rootfs); err != nil { + return fmt.Errorf("copy dev nodes %s", err) + } + // In non-privileged mode, this fails. Discard the error. + setupLoopbackDevices(rootfs) + if err := setupDev(rootfs); err != nil { + return err + } + if console != "" { + if err := setupPtmx(rootfs, console); err != nil { + return err + } + } + if err := system.Chdir(rootfs); err != nil { + return fmt.Errorf("chdir into %s %s", rootfs, err) + } + + if noPivotRoot { + if err := rootMsMove(rootfs); err != nil { + return err + } + } else { + if err := rootPivot(rootfs); err != nil { + return err + } + } + + system.Umask(0022) + + return nil +} + +// use a pivot root to setup the rootfs +func rootPivot(rootfs string) error { + pivotDir, err := ioutil.TempDir(rootfs, ".pivot_root") + if err != nil { + return fmt.Errorf("can't create pivot_root dir %s", pivotDir, err) + } + if err := system.Pivotroot(rootfs, pivotDir); err != nil { + return fmt.Errorf("pivot_root %s", err) + } + if err := system.Chdir("/"); err != nil { + return fmt.Errorf("chdir / %s", err) + } + // path to pivot dir now changed, update + pivotDir = filepath.Join("/", filepath.Base(pivotDir)) + if err := system.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { + return fmt.Errorf("unmount pivot_root dir %s", err) + } + if err := os.Remove(pivotDir); err != nil { + return fmt.Errorf("remove pivot_root dir %s", err) + } + return nil +} + +// use MS_MOVE and chroot to setup the rootfs +func rootMsMove(rootfs string) error { + if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { + return fmt.Errorf("mount move %s into / %s", rootfs, err) + } + if err := system.Chroot("."); err != nil { + return fmt.Errorf("chroot . %s", err) + } + if err := system.Chdir("/"); err != nil { + return fmt.Errorf("chdir / %s", err) + } + return nil +} + +// copyDevNodes mknods the hosts devices so the new container has access to them +func copyDevNodes(rootfs string) error { + oldMask := system.Umask(0000) + defer system.Umask(oldMask) + + for _, node := range []string{ + "null", + "zero", + "full", + "random", + "urandom", + "tty", + } { + if err := copyDevNode(rootfs, node); err != nil { + return err + } + } + return nil +} + +func setupLoopbackDevices(rootfs string) error { + for i := 0; ; i++ { + if err := copyDevNode(rootfs, fmt.Sprintf("loop%d", i)); err != nil { + if !os.IsNotExist(err) { + return err + } + break + } + + } + return nil +} + +func copyDevNode(rootfs, node string) error { + stat, err := os.Stat(filepath.Join("/dev", node)) + if err != nil { + return err + } + var ( + dest = filepath.Join(rootfs, "dev", node) + st = stat.Sys().(*syscall.Stat_t) + ) + if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { + return fmt.Errorf("copy %s %s", node, err) + } + return nil +} + +// setupDev symlinks the current processes pipes into the +// appropriate destination on the containers rootfs +func setupDev(rootfs string) error { + for _, link := range []struct { + from string + to string + }{ + {"/proc/kcore", "/dev/core"}, + {"/proc/self/fd", "/dev/fd"}, + {"/proc/self/fd/0", "/dev/stdin"}, + {"/proc/self/fd/1", "/dev/stdout"}, + {"/proc/self/fd/2", "/dev/stderr"}, + } { + dest := filepath.Join(rootfs, link.to) + if err := os.Remove(dest); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("remove %s %s", dest, err) + } + if err := os.Symlink(link.from, dest); err != nil { + return fmt.Errorf("symlink %s %s", dest, err) + } + } + return nil +} + +// setupConsole ensures that the container has a proper /dev/console setup +func setupConsole(rootfs, console string) error { + oldMask := system.Umask(0000) + defer system.Umask(oldMask) + + stat, err := os.Stat(console) + if err != nil { + return fmt.Errorf("stat console %s %s", console, err) + } + var ( + st = stat.Sys().(*syscall.Stat_t) + dest = filepath.Join(rootfs, "dev/console") + ) + if err := os.Remove(dest); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("remove %s %s", dest, err) + } + if err := os.Chmod(console, 0600); err != nil { + return err + } + if err := os.Chown(console, 0, 0); err != nil { + return err + } + if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { + return fmt.Errorf("mknod %s %s", dest, err) + } + if err := system.Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { + return fmt.Errorf("bind %s to %s %s", console, dest, err) + } + return nil +} + +// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts +// inside the mount namespace +func mountSystem(rootfs string) error { + for _, m := range []struct { + source string + path string + device string + flags int + data string + }{ + {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, + {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}, + {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: "mode=1777,size=65536k"}, + {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"}, + } { + if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { + return fmt.Errorf("mkdirall %s %s", m.path, err) + } + if err := system.Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { + return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err) + } + } + return nil +} + +// setupPtmx adds a symlink to pts/ptmx for /dev/ptmx and +// finishes setting up /dev/console +func setupPtmx(rootfs, console string) error { + ptmx := filepath.Join(rootfs, "dev/ptmx") + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Symlink("pts/ptmx", ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + if err := setupConsole(rootfs, console); err != nil { + return err + } + return nil +} + +// remountProc is used to detach and remount the proc filesystem +// commonly needed with running a new process inside an existing container +func remountProc() error { + if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil { + return err + } + if err := system.Mount("proc", "/proc", "proc", uintptr(defaultMountFlags), ""); err != nil { + return err + } + return nil +} + +func remountSys() error { + if err := system.Unmount("/sys", syscall.MNT_DETACH); err != nil { + if err != syscall.EINVAL { + return err + } + } else { + if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaultMountFlags), ""); err != nil { + return err + } + } + return nil +} diff --git a/libcontainer/nsinit/nsinit.go b/libcontainer/nsinit/nsinit.go new file mode 100644 index 0000000..f09a130 --- /dev/null +++ b/libcontainer/nsinit/nsinit.go @@ -0,0 +1,26 @@ +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" +) + +// NsInit is an interface with the public facing methods to provide high level +// exec operations on a container +type NsInit interface { + Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) + ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) + Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error +} + +type linuxNs struct { + root string + commandFactory CommandFactory + stateWriter StateWriter +} + +func NewNsInit(command CommandFactory, state StateWriter) NsInit { + return &linuxNs{ + commandFactory: command, + stateWriter: state, + } +} diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go new file mode 100644 index 0000000..61921c5 --- /dev/null +++ b/libcontainer/nsinit/nsinit/main.go @@ -0,0 +1,110 @@ +package main + +import ( + "encoding/json" + "flag" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/nsinit" + "io/ioutil" + "log" + "os" + "path/filepath" + "strconv" +) + +var ( + root, console string + pipeFd int +) + +func registerFlags() { + flag.StringVar(&console, "console", "", "console (pty slave) path") + flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") + flag.StringVar(&root, "root", ".", "root for storing configuration data") + + flag.Parse() +} + +func main() { + registerFlags() + + if flag.NArg() < 1 { + log.Fatalf("wrong number of argments %d", flag.NArg()) + } + container, err := loadContainer() + if err != nil { + log.Fatal(err) + } + ns, err := newNsInit() + if err != nil { + log.Fatal(err) + } + + switch flag.Arg(0) { + case "exec": // this is executed outside of the namespace in the cwd + var exitCode int + nspid, err := readPid() + if err != nil { + if !os.IsNotExist(err) { + log.Fatal(err) + } + } + if nspid > 0 { + exitCode, err = ns.ExecIn(container, nspid, flag.Args()[1:]) + } else { + term := nsinit.NewTerminal(os.Stdin, os.Stdout, os.Stderr, container.Tty) + exitCode, err = ns.Exec(container, term, flag.Args()[1:]) + } + if err != nil { + log.Fatal(err) + } + os.Exit(exitCode) + case "init": // this is executed inside of the namespace to setup the container + cwd, err := os.Getwd() + if err != nil { + log.Fatal(err) + } + if flag.NArg() < 2 { + log.Fatalf("wrong number of argments %d", flag.NArg()) + } + syncPipe, err := nsinit.NewSyncPipeFromFd(0, uintptr(pipeFd)) + if err != nil { + log.Fatal(err) + } + if err := ns.Init(container, cwd, console, syncPipe, flag.Args()[1:]); err != nil { + log.Fatal(err) + } + default: + log.Fatalf("command not supported for nsinit %s", flag.Arg(0)) + } +} + +func loadContainer() (*libcontainer.Container, error) { + f, err := os.Open(filepath.Join(root, "container.json")) + if err != nil { + return nil, err + } + defer f.Close() + + var container *libcontainer.Container + if err := json.NewDecoder(f).Decode(&container); err != nil { + return nil, err + } + return container, nil +} + +func readPid() (int, error) { + data, err := ioutil.ReadFile(filepath.Join(root, "pid")) + if err != nil { + return -1, err + } + pid, err := strconv.Atoi(string(data)) + if err != nil { + return -1, err + } + return pid, nil +} + +func newNsInit() (nsinit.NsInit, error) { + return nsinit.NewNsInit(&nsinit.DefaultCommandFactory{root}, &nsinit.DefaultStateWriter{root}), nil +} diff --git a/libcontainer/nsinit/state.go b/libcontainer/nsinit/state.go new file mode 100644 index 0000000..af38008 --- /dev/null +++ b/libcontainer/nsinit/state.go @@ -0,0 +1,28 @@ +package nsinit + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" +) + +// StateWriter handles writing and deleting the pid file +// on disk +type StateWriter interface { + WritePid(pid int) error + DeletePid() error +} + +type DefaultStateWriter struct { + Root string +} + +// writePidFile writes the namespaced processes pid to pid in the rootfs for the container +func (d *DefaultStateWriter) WritePid(pid int) error { + return ioutil.WriteFile(filepath.Join(d.Root, "pid"), []byte(fmt.Sprint(pid)), 0655) +} + +func (d *DefaultStateWriter) DeletePid() error { + return os.Remove(filepath.Join(d.Root, "pid")) +} diff --git a/libcontainer/nsinit/sync_pipe.go b/libcontainer/nsinit/sync_pipe.go new file mode 100644 index 0000000..f724f52 --- /dev/null +++ b/libcontainer/nsinit/sync_pipe.go @@ -0,0 +1,71 @@ +package nsinit + +import ( + "encoding/json" + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "io/ioutil" + "os" +) + +// SyncPipe allows communication to and from the child processes +// to it's parent and allows the two independent processes to +// syncronize their state. +type SyncPipe struct { + parent, child *os.File +} + +func NewSyncPipe() (s *SyncPipe, err error) { + s = &SyncPipe{} + s.child, s.parent, err = os.Pipe() + if err != nil { + return nil, err + } + return s, nil +} + +func NewSyncPipeFromFd(parendFd, childFd uintptr) (*SyncPipe, error) { + s := &SyncPipe{} + if parendFd > 0 { + s.parent = os.NewFile(parendFd, "parendPipe") + } else if childFd > 0 { + s.child = os.NewFile(childFd, "childPipe") + } else { + return nil, fmt.Errorf("no valid sync pipe fd specified") + } + return s, nil +} + +func (s *SyncPipe) SendToChild(context libcontainer.Context) error { + data, err := json.Marshal(context) + if err != nil { + return err + } + s.parent.Write(data) + return nil +} + +func (s *SyncPipe) ReadFromParent() (libcontainer.Context, error) { + data, err := ioutil.ReadAll(s.child) + if err != nil { + return nil, fmt.Errorf("error reading from sync pipe %s", err) + } + var context libcontainer.Context + if len(data) > 0 { + if err := json.Unmarshal(data, &context); err != nil { + return nil, err + } + } + return context, nil + +} + +func (s *SyncPipe) Close() error { + if s.parent != nil { + s.parent.Close() + } + if s.child != nil { + s.child.Close() + } + return nil +} diff --git a/libcontainer/nsinit/term.go b/libcontainer/nsinit/term.go new file mode 100644 index 0000000..58dccab --- /dev/null +++ b/libcontainer/nsinit/term.go @@ -0,0 +1,118 @@ +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/term" + "io" + "os" + "os/exec" +) + +type Terminal interface { + io.Closer + SetMaster(*os.File) + Attach(*exec.Cmd) error + Resize(h, w int) error +} + +func NewTerminal(stdin io.Reader, stdout, stderr io.Writer, tty bool) Terminal { + if tty { + return &TtyTerminal{ + stdin: stdin, + stdout: stdout, + stderr: stderr, + } + } + return &StdTerminal{ + stdin: stdin, + stdout: stdout, + stderr: stderr, + } +} + +type TtyTerminal struct { + stdin io.Reader + stdout, stderr io.Writer + master *os.File + state *term.State +} + +func (t *TtyTerminal) Resize(h, w int) error { + return term.SetWinsize(t.master.Fd(), &term.Winsize{Height: uint16(h), Width: uint16(w)}) +} + +func (t *TtyTerminal) SetMaster(master *os.File) { + t.master = master +} + +func (t *TtyTerminal) Attach(command *exec.Cmd) error { + go io.Copy(t.stdout, t.master) + go io.Copy(t.master, t.stdin) + + state, err := t.setupWindow(t.master, os.Stdin) + if err != nil { + command.Process.Kill() + return err + } + t.state = state + return err +} + +// SetupWindow gets the parent window size and sets the master +// pty to the current size and set the parents mode to RAW +func (t *TtyTerminal) setupWindow(master, parent *os.File) (*term.State, error) { + ws, err := term.GetWinsize(parent.Fd()) + if err != nil { + return nil, err + } + if err := term.SetWinsize(master.Fd(), ws); err != nil { + return nil, err + } + return term.SetRawTerminal(parent.Fd()) +} + +func (t *TtyTerminal) Close() error { + term.RestoreTerminal(os.Stdin.Fd(), t.state) + return t.master.Close() +} + +type StdTerminal struct { + stdin io.Reader + stdout, stderr io.Writer +} + +func (s *StdTerminal) SetMaster(*os.File) { + // no need to set master on non tty +} + +func (s *StdTerminal) Close() error { + return nil +} + +func (s *StdTerminal) Resize(h, w int) error { + return nil +} + +func (s *StdTerminal) Attach(command *exec.Cmd) error { + inPipe, err := command.StdinPipe() + if err != nil { + return err + } + outPipe, err := command.StdoutPipe() + if err != nil { + return err + } + errPipe, err := command.StderrPipe() + if err != nil { + return err + } + + go func() { + defer inPipe.Close() + io.Copy(inPipe, s.stdin) + }() + + go io.Copy(s.stdout, outPipe) + go io.Copy(s.stderr, errPipe) + + return nil +} diff --git a/libcontainer/nsinit/unsupported.go b/libcontainer/nsinit/unsupported.go new file mode 100644 index 0000000..2412223 --- /dev/null +++ b/libcontainer/nsinit/unsupported.go @@ -0,0 +1,19 @@ +// +build !linux + +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" +) + +func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) { + return -1, libcontainer.ErrUnsupported +} + +func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) { + return -1, libcontainer.ErrUnsupported +} + +func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { + return libcontainer.ErrUnsupported +} diff --git a/libcontainer/types.go b/libcontainer/types.go new file mode 100644 index 0000000..94fe876 --- /dev/null +++ b/libcontainer/types.go @@ -0,0 +1,137 @@ +package libcontainer + +import ( + "encoding/json" + "errors" + "github.com/syndtr/gocapability/capability" +) + +var ( + ErrUnkownNamespace = errors.New("Unknown namespace") + ErrUnkownCapability = errors.New("Unknown capability") + ErrUnsupported = errors.New("Unsupported method") +) + +// namespaceList is used to convert the libcontainer types +// into the names of the files located in /proc//ns/* for +// each namespace +var ( + namespaceList = Namespaces{} + + capabilityList = Capabilities{ + {Key: "SETPCAP", Value: capability.CAP_SETPCAP}, + {Key: "SYS_MODULE", Value: capability.CAP_SYS_MODULE}, + {Key: "SYS_RAWIO", Value: capability.CAP_SYS_RAWIO}, + {Key: "SYS_PACCT", Value: capability.CAP_SYS_PACCT}, + {Key: "SYS_ADMIN", Value: capability.CAP_SYS_ADMIN}, + {Key: "SYS_NICE", Value: capability.CAP_SYS_NICE}, + {Key: "SYS_RESOURCE", Value: capability.CAP_SYS_RESOURCE}, + {Key: "SYS_TIME", Value: capability.CAP_SYS_TIME}, + {Key: "SYS_TTY_CONFIG", Value: capability.CAP_SYS_TTY_CONFIG}, + {Key: "MKNOD", Value: capability.CAP_MKNOD}, + {Key: "AUDIT_WRITE", Value: capability.CAP_AUDIT_WRITE}, + {Key: "AUDIT_CONTROL", Value: capability.CAP_AUDIT_CONTROL}, + {Key: "MAC_OVERRIDE", Value: capability.CAP_MAC_OVERRIDE}, + {Key: "MAC_ADMIN", Value: capability.CAP_MAC_ADMIN}, + {Key: "NET_ADMIN", Value: capability.CAP_NET_ADMIN}, + } +) + +type ( + Namespace struct { + Key string + Value int + File string + } + Namespaces []*Namespace +) + +func (ns *Namespace) String() string { + return ns.Key +} + +func (ns *Namespace) MarshalJSON() ([]byte, error) { + return json.Marshal(ns.Key) +} + +func (ns *Namespace) UnmarshalJSON(src []byte) error { + var nsName string + if err := json.Unmarshal(src, &nsName); err != nil { + return err + } + ret := GetNamespace(nsName) + if ret == nil { + return ErrUnkownNamespace + } + *ns = *ret + return nil +} + +func GetNamespace(key string) *Namespace { + for _, ns := range namespaceList { + if ns.Key == key { + return ns + } + } + return nil +} + +// Contains returns true if the specified Namespace is +// in the slice +func (n Namespaces) Contains(ns string) bool { + for _, nsp := range n { + if nsp.Key == ns { + return true + } + } + return false +} + +type ( + Capability struct { + Key string + Value capability.Cap + } + Capabilities []*Capability +) + +func (c *Capability) String() string { + return c.Key +} + +func (c *Capability) MarshalJSON() ([]byte, error) { + return json.Marshal(c.Key) +} + +func (c *Capability) UnmarshalJSON(src []byte) error { + var capName string + if err := json.Unmarshal(src, &capName); err != nil { + return err + } + ret := GetCapability(capName) + if ret == nil { + return ErrUnkownCapability + } + *c = *ret + return nil +} + +func GetCapability(key string) *Capability { + for _, capp := range capabilityList { + if capp.Key == key { + return capp + } + } + return nil +} + +// Contains returns true if the specified Capability is +// in the slice +func (c Capabilities) Contains(capp string) bool { + for _, cap := range c { + if cap.Key == capp { + return true + } + } + return false +} diff --git a/libcontainer/types_linux.go b/libcontainer/types_linux.go new file mode 100644 index 0000000..c14531d --- /dev/null +++ b/libcontainer/types_linux.go @@ -0,0 +1,16 @@ +package libcontainer + +import ( + "syscall" +) + +func init() { + namespaceList = Namespaces{ + {Key: "NEWNS", Value: syscall.CLONE_NEWNS, File: "mnt"}, + {Key: "NEWUTS", Value: syscall.CLONE_NEWUTS, File: "uts"}, + {Key: "NEWIPC", Value: syscall.CLONE_NEWIPC, File: "ipc"}, + {Key: "NEWUSER", Value: syscall.CLONE_NEWUSER, File: "user"}, + {Key: "NEWPID", Value: syscall.CLONE_NEWPID, File: "pid"}, + {Key: "NEWNET", Value: syscall.CLONE_NEWNET, File: "net"}, + } +} diff --git a/libcontainer/types_test.go b/libcontainer/types_test.go new file mode 100644 index 0000000..52b85a4 --- /dev/null +++ b/libcontainer/types_test.go @@ -0,0 +1,35 @@ +package libcontainer + +import ( + "testing" +) + +func TestNamespacesContains(t *testing.T) { + ns := Namespaces{ + GetNamespace("NEWPID"), + GetNamespace("NEWNS"), + GetNamespace("NEWUTS"), + } + + if ns.Contains("NEWNET") { + t.Fatal("namespaces should not contain NEWNET") + } + + if !ns.Contains("NEWPID") { + t.Fatal("namespaces should contain NEWPID but does not") + } +} + +func TestCapabilitiesContains(t *testing.T) { + caps := Capabilities{ + GetCapability("MKNOD"), + GetCapability("SETPCAP"), + } + + if caps.Contains("SYS_ADMIN") { + t.Fatal("capabilities should not contain SYS_ADMIN") + } + if !caps.Contains("MKNOD") { + t.Fatal("capabilities should container MKNOD but does not") + } +} diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go new file mode 100644 index 0000000..0d919bc --- /dev/null +++ b/libcontainer/utils/utils.go @@ -0,0 +1,28 @@ +package utils + +import ( + "crypto/rand" + "encoding/hex" + "io" + "path/filepath" +) + +// GenerateRandomName returns a new name joined with a prefix. This size +// specified is used to truncate the randomly generated value +func GenerateRandomName(prefix string, size int) (string, error) { + id := make([]byte, 32) + if _, err := io.ReadFull(rand.Reader, id); err != nil { + return "", err + } + return prefix + hex.EncodeToString(id)[:size], nil +} + +// ResolveRootfs ensures that the current working directory is +// not a symlink and returns the absolute path to the rootfs +func ResolveRootfs(uncleanRootfs string) (string, error) { + rootfs, err := filepath.Abs(uncleanRootfs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} diff --git a/listenbuffer/buffer.go b/listenbuffer/buffer.go new file mode 100644 index 0000000..c350805 --- /dev/null +++ b/listenbuffer/buffer.go @@ -0,0 +1,61 @@ +/* + Package to allow go applications to immediately start + listening on a socket, unix, tcp, udp but hold connections + until the application has booted and is ready to accept them +*/ +package listenbuffer + +import ( + "fmt" + "net" + "time" +) + +// NewListenBuffer returns a listener listening on addr with the protocol. It sets the +// timeout to wait on first connection before an error is returned +func NewListenBuffer(proto, addr string, activate chan struct{}, timeout time.Duration) (net.Listener, error) { + wrapped, err := net.Listen(proto, addr) + if err != nil { + return nil, err + } + + return &defaultListener{ + wrapped: wrapped, + activate: activate, + timeout: timeout, + }, nil +} + +type defaultListener struct { + wrapped net.Listener // the real listener to wrap + ready bool // is the listner ready to start accpeting connections + activate chan struct{} + timeout time.Duration // how long to wait before we consider this an error +} + +func (l *defaultListener) Close() error { + return l.wrapped.Close() +} + +func (l *defaultListener) Addr() net.Addr { + return l.wrapped.Addr() +} + +func (l *defaultListener) Accept() (net.Conn, error) { + // if the listen has been told it is ready then we can go ahead and + // start returning connections + if l.ready { + return l.wrapped.Accept() + } + + select { + case <-time.After(l.timeout): + // close the connection so any clients are disconnected + l.Close() + return nil, fmt.Errorf("timeout (%s) reached waiting for listener to become ready", l.timeout.String()) + case <-l.activate: + l.ready = true + return l.Accept() + } + panic("unreachable") +} diff --git a/mflag/example/example.go b/mflag/example/example.go index fa26c97..ed940e8 100644 --- a/mflag/example/example.go +++ b/mflag/example/example.go @@ -12,9 +12,10 @@ var ( ) func init() { + flag.Bool([]string{"#hp", "#-halp"}, false, "display the halp") flag.BoolVar(&b, []string{"b"}, false, "a simple bool") - flag.BoolVar(&b2, []string{"-bool"}, false, "a simple bool") - flag.IntVar(&i, []string{"#integer", "-integer"}, -1, "a simple integer") + flag.BoolVar(&b2, []string{"#-bool"}, false, "a simple bool") + flag.IntVar(&i, []string{"-integer", "-number"}, -1, "a simple integer") flag.StringVar(&str, []string{"s", "#hidden", "-string"}, "", "a simple string") //-s -hidden and --string will work, but -hidden won't be in the usage flag.BoolVar(&h, []string{"h", "#help", "-help"}, false, "display the help") flag.Parse() @@ -27,4 +28,5 @@ func main() { fmt.Printf("b: %b\n", b) fmt.Printf("-bool: %b\n", b2) fmt.Printf("s/#hidden/-string(via lookup): %s\n", flag.Lookup("s").Value.String()) + fmt.Printf("ARGS: %v\n", flag.Args()) } diff --git a/mflag/flag.go b/mflag/flag.go index f721e04..7125c03 100644 --- a/mflag/flag.go +++ b/mflag/flag.go @@ -77,6 +77,9 @@ import ( // ErrHelp is the error returned if the flag -help is invoked but no such flag is defined. var ErrHelp = errors.New("flag: help requested") +// ErrRetry is the error returned if you need to try letter by letter +var ErrRetry = errors.New("flag: retry") + // -- bool Value type boolValue bool @@ -287,13 +290,13 @@ type Flag struct { func sortFlags(flags map[string]*Flag) []*Flag { var list sort.StringSlice for _, f := range flags { + fName := strings.TrimPrefix(f.Names[0], "#") if len(f.Names) == 1 { - list = append(list, f.Names[0]) + list = append(list, fName) continue } found := false - fName := strings.TrimPrefix(strings.TrimPrefix(f.Names[0], "#"), "-") for _, name := range list { if name == fName { found = true @@ -401,7 +404,9 @@ func (f *FlagSet) PrintDefaults() { names = append(names, name) } } - fmt.Fprintf(f.out(), format, strings.Join(names, ", -"), flag.DefValue, flag.Usage) + if len(names) > 0 { + fmt.Fprintf(f.out(), format, strings.Join(names, ", -"), flag.DefValue, flag.Usage) + } }) } @@ -733,21 +738,21 @@ func (f *FlagSet) usage() { } // parseOne parses one flag. It reports whether a flag was seen. -func (f *FlagSet) parseOne() (bool, error) { +func (f *FlagSet) parseOne() (bool, string, error) { if len(f.args) == 0 { - return false, nil + return false, "", nil } s := f.args[0] if len(s) == 0 || s[0] != '-' || len(s) == 1 { - return false, nil + return false, "", nil } if s[1] == '-' && len(s) == 2 { // "--" terminates the flags f.args = f.args[1:] - return false, nil + return false, "", nil } name := s[1:] if len(name) == 0 || name[0] == '=' { - return false, f.failf("bad flag syntax: %s", s) + return false, "", f.failf("bad flag syntax: %s", s) } // it's a flag. does it have an argument? @@ -767,14 +772,17 @@ func (f *FlagSet) parseOne() (bool, error) { if !alreadythere { if name == "-help" || name == "help" || name == "h" { // special case for nice help message. f.usage() - return false, ErrHelp + return false, "", ErrHelp } - return false, f.failf("flag provided but not defined: -%s", name) + if len(name) > 0 && name[0] == '-' { + return false, "", f.failf("flag provided but not defined: -%s", name) + } + return false, name, ErrRetry } if fv, ok := flag.Value.(boolFlag); ok && fv.IsBoolFlag() { // special case: doesn't need an arg if has_value { if err := fv.Set(value); err != nil { - return false, f.failf("invalid boolean value %q for -%s: %v", value, name, err) + return false, "", f.failf("invalid boolean value %q for -%s: %v", value, name, err) } } else { fv.Set("true") @@ -787,17 +795,22 @@ func (f *FlagSet) parseOne() (bool, error) { value, f.args = f.args[0], f.args[1:] } if !has_value { - return false, f.failf("flag needs an argument: -%s", name) + return false, "", f.failf("flag needs an argument: -%s", name) } if err := flag.Value.Set(value); err != nil { - return false, f.failf("invalid value %q for flag -%s: %v", value, name, err) + return false, "", f.failf("invalid value %q for flag -%s: %v", value, name, err) } } if f.actual == nil { f.actual = make(map[string]*Flag) } f.actual[name] = flag - return true, nil + for _, n := range flag.Names { + if n == fmt.Sprintf("#%s", name) { + fmt.Fprintf(f.out(), "Warning: '-%s' is deprecated, it will be removed soon. See usage.\n", name) + } + } + return true, "", nil } // Parse parses flag definitions from the argument list, which should not @@ -808,13 +821,34 @@ func (f *FlagSet) Parse(arguments []string) error { f.parsed = true f.args = arguments for { - seen, err := f.parseOne() + seen, name, err := f.parseOne() if seen { continue } if err == nil { break } + if err == ErrRetry { + if len(name) > 1 { + err = nil + for _, letter := range strings.Split(name, "") { + f.args = append([]string{"-" + letter}, f.args...) + seen2, _, err2 := f.parseOne() + if seen2 { + continue + } + if err2 != nil { + err = f.failf("flag provided but not defined: -%s", name) + break + } + } + if err == nil { + continue + } + } else { + err = f.failf("flag provided but not defined: -%s", name) + } + } switch f.errorHandling { case ContinueOnError: return err diff --git a/netlink/MAINTAINERS b/netlink/MAINTAINERS new file mode 100644 index 0000000..e53d933 --- /dev/null +++ b/netlink/MAINTAINERS @@ -0,0 +1,2 @@ +Michael Crosby (@crosbymichael) +Guillaume Charmes (@creack) diff --git a/netlink/netlink.go b/netlink/netlink.go index 5098b4b..5cc7562 100644 --- a/netlink/netlink.go +++ b/netlink/netlink.go @@ -5,7 +5,15 @@ // netlink_darwin.go package netlink -import "net" +import ( + "errors" + "net" +) + +var ( + ErrWrongSockType = errors.New("Wrong socket type") + ErrShortResponse = errors.New("Got short response from netlink") +) // A Route is a subnet associated with the interface to reach it. type Route struct { diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index 0ea5b4d..f8bb6ba 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -10,6 +10,15 @@ import ( "unsafe" ) +const ( + IFNAMSIZ = 16 + DEFAULT_CHANGE = 0xFFFFFFFF + IFLA_INFO_KIND = 1 + IFLA_INFO_DATA = 2 + VETH_INFO_PEER = 1 + IFLA_NET_NS_FD = 28 +) + var nextSeqNr int func nativeEndian() binary.ByteOrder { @@ -36,6 +45,7 @@ func getIpFamily(ip net.IP) int { } type NetlinkRequestData interface { + Len() int ToWireFormat() []byte } @@ -44,21 +54,24 @@ type IfInfomsg struct { } func newIfInfomsg(family int) *IfInfomsg { - msg := &IfInfomsg{} - msg.Family = uint8(family) - msg.Type = uint16(0) - msg.Index = int32(0) - msg.Flags = uint32(0) - msg.Change = uint32(0) + return &IfInfomsg{ + IfInfomsg: syscall.IfInfomsg{ + Family: uint8(family), + }, + } +} +func newIfInfomsgChild(parent *RtAttr, family int) *IfInfomsg { + msg := newIfInfomsg(family) + parent.children = append(parent.children, msg) return msg } func (msg *IfInfomsg) ToWireFormat() []byte { native := nativeEndian() - len := syscall.SizeofIfInfomsg - b := make([]byte, len) + length := syscall.SizeofIfInfomsg + b := make([]byte, length) b[0] = msg.Family b[1] = 0 native.PutUint16(b[2:4], msg.Type) @@ -68,26 +81,27 @@ func (msg *IfInfomsg) ToWireFormat() []byte { return b } +func (msg *IfInfomsg) Len() int { + return syscall.SizeofIfInfomsg +} + type IfAddrmsg struct { syscall.IfAddrmsg } func newIfAddrmsg(family int) *IfAddrmsg { - msg := &IfAddrmsg{} - msg.Family = uint8(family) - msg.Prefixlen = uint8(0) - msg.Flags = uint8(0) - msg.Scope = uint8(0) - msg.Index = uint32(0) - - return msg + return &IfAddrmsg{ + IfAddrmsg: syscall.IfAddrmsg{ + Family: uint8(family), + }, + } } func (msg *IfAddrmsg) ToWireFormat() []byte { native := nativeEndian() - len := syscall.SizeofIfAddrmsg - b := make([]byte, len) + length := syscall.SizeofIfAddrmsg + b := make([]byte, length) b[0] = msg.Family b[1] = msg.Prefixlen b[2] = msg.Flags @@ -96,26 +110,31 @@ func (msg *IfAddrmsg) ToWireFormat() []byte { return b } +func (msg *IfAddrmsg) Len() int { + return syscall.SizeofIfAddrmsg +} + type RtMsg struct { syscall.RtMsg } func newRtMsg(family int) *RtMsg { - msg := &RtMsg{} - msg.Family = uint8(family) - msg.Table = syscall.RT_TABLE_MAIN - msg.Scope = syscall.RT_SCOPE_UNIVERSE - msg.Protocol = syscall.RTPROT_BOOT - msg.Type = syscall.RTN_UNICAST - - return msg + return &RtMsg{ + RtMsg: syscall.RtMsg{ + Family: uint8(family), + Table: syscall.RT_TABLE_MAIN, + Scope: syscall.RT_SCOPE_UNIVERSE, + Protocol: syscall.RTPROT_BOOT, + Type: syscall.RTN_UNICAST, + }, + } } func (msg *RtMsg) ToWireFormat() []byte { native := nativeEndian() - len := syscall.SizeofRtMsg - b := make([]byte, len) + length := syscall.SizeofRtMsg + b := make([]byte, length) b[0] = msg.Family b[1] = msg.Dst_len b[2] = msg.Src_len @@ -128,35 +147,70 @@ func (msg *RtMsg) ToWireFormat() []byte { return b } +func (msg *RtMsg) Len() int { + return syscall.SizeofRtMsg +} + func rtaAlignOf(attrlen int) int { return (attrlen + syscall.RTA_ALIGNTO - 1) & ^(syscall.RTA_ALIGNTO - 1) } type RtAttr struct { syscall.RtAttr - Data []byte + Data []byte + children []NetlinkRequestData } func newRtAttr(attrType int, data []byte) *RtAttr { - attr := &RtAttr{} - attr.Type = uint16(attrType) - attr.Data = data + return &RtAttr{ + RtAttr: syscall.RtAttr{ + Type: uint16(attrType), + }, + children: []NetlinkRequestData{}, + Data: data, + } +} +func newRtAttrChild(parent *RtAttr, attrType int, data []byte) *RtAttr { + attr := newRtAttr(attrType, data) + parent.children = append(parent.children, attr) return attr } -func (attr *RtAttr) ToWireFormat() []byte { +func (a *RtAttr) Len() int { + l := 0 + for _, child := range a.children { + l += child.Len() + syscall.SizeofRtAttr + } + if l == 0 { + l++ + } + return rtaAlignOf(l + len(a.Data)) +} + +func (a *RtAttr) ToWireFormat() []byte { native := nativeEndian() - len := syscall.SizeofRtAttr + len(attr.Data) - b := make([]byte, rtaAlignOf(len)) - native.PutUint16(b[0:2], uint16(len)) - native.PutUint16(b[2:4], attr.Type) - for i, d := range attr.Data { - b[4+i] = d + length := a.Len() + buf := make([]byte, rtaAlignOf(length+syscall.SizeofRtAttr)) + + if a.Data != nil { + copy(buf[4:], a.Data) + } else { + next := 4 + for _, child := range a.children { + childBuf := child.ToWireFormat() + copy(buf[next:], childBuf) + next += rtaAlignOf(len(childBuf)) + } } - return b + if l := uint16(rtaAlignOf(length)); l != 0 { + native.PutUint16(buf[0:2], l+1) + } + native.PutUint16(buf[2:4], a.Type) + + return buf } type NetlinkRequest struct { @@ -171,7 +225,7 @@ func (rr *NetlinkRequest) ToWireFormat() []byte { dataBytes := make([][]byte, len(rr.Data)) for i, data := range rr.Data { dataBytes[i] = data.ToWireFormat() - length = length + uint32(len(dataBytes[i])) + length += uint32(len(dataBytes[i])) } b := make([]byte, length) native.PutUint32(b[0:4], length) @@ -180,27 +234,29 @@ func (rr *NetlinkRequest) ToWireFormat() []byte { native.PutUint32(b[8:12], rr.Seq) native.PutUint32(b[12:16], rr.Pid) - i := 16 + next := 16 for _, data := range dataBytes { - for _, dataByte := range data { - b[i] = dataByte - i = i + 1 - } + copy(b[next:], data) + next += len(data) } return b } func (rr *NetlinkRequest) AddData(data NetlinkRequestData) { - rr.Data = append(rr.Data, data) + if data != nil { + rr.Data = append(rr.Data, data) + } } func newNetlinkRequest(proto, flags int) *NetlinkRequest { - rr := &NetlinkRequest{} - rr.Len = uint32(syscall.NLMSG_HDRLEN) - rr.Type = uint16(proto) - rr.Flags = syscall.NLM_F_REQUEST | uint16(flags) - rr.Seq = uint32(getSeq()) - return rr + return &NetlinkRequest{ + NlMsghdr: syscall.NlMsghdr{ + Len: uint32(syscall.NLMSG_HDRLEN), + Type: uint16(proto), + Flags: syscall.NLM_F_REQUEST | uint16(flags), + Seq: uint32(getSeq()), + }, + } } type NetlinkSocket struct { @@ -243,7 +299,7 @@ func (s *NetlinkSocket) Receive() ([]syscall.NetlinkMessage, error) { return nil, err } if nr < syscall.NLMSG_HDRLEN { - return nil, fmt.Errorf("Got short response from netlink") + return nil, ErrShortResponse } rb = rb[:nr] return syscall.ParseNetlinkMessage(rb) @@ -258,7 +314,7 @@ func (s *NetlinkSocket) GetPid() (uint32, error) { case *syscall.SockaddrNetlink: return v.Pid, nil } - return 0, fmt.Errorf("Wrong socket type") + return 0, ErrWrongSockType } func (s *NetlinkSocket) HandleAck(seq uint32) error { @@ -355,6 +411,28 @@ func NetworkLinkUp(iface *net.Interface) error { return s.HandleAck(wb.Seq) } +func NetworkLinkDown(iface *net.Interface) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_NEWLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Change = syscall.IFF_UP + msg.Flags = 0 & ^syscall.IFF_UP + msg.Index = int32(iface.Index) + wb.AddData(msg) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + func NetworkSetMTU(iface *net.Interface, mtu int) error { s, err := getNetlinkSocket() if err != nil { @@ -368,7 +446,7 @@ func NetworkSetMTU(iface *net.Interface, mtu int) error { msg.Type = syscall.RTM_SETLINK msg.Flags = syscall.NLM_F_REQUEST msg.Index = int32(iface.Index) - msg.Change = 0xFFFFFFFF + msg.Change = DEFAULT_CHANGE wb.AddData(msg) var ( @@ -386,6 +464,103 @@ func NetworkSetMTU(iface *net.Interface, mtu int) error { return s.HandleAck(wb.Seq) } +// same as ip link set $name master $master +func NetworkSetMaster(iface, master *net.Interface) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_SETLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Type = syscall.RTM_SETLINK + msg.Flags = syscall.NLM_F_REQUEST + msg.Index = int32(iface.Index) + msg.Change = DEFAULT_CHANGE + wb.AddData(msg) + + var ( + b = make([]byte, 4) + native = nativeEndian() + ) + native.PutUint32(b, uint32(master.Index)) + + data := newRtAttr(syscall.IFLA_MASTER, b) + wb.AddData(data) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + +func NetworkSetNsPid(iface *net.Interface, nspid int) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_SETLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Type = syscall.RTM_SETLINK + msg.Flags = syscall.NLM_F_REQUEST + msg.Index = int32(iface.Index) + msg.Change = DEFAULT_CHANGE + wb.AddData(msg) + + var ( + b = make([]byte, 4) + native = nativeEndian() + ) + native.PutUint32(b, uint32(nspid)) + + data := newRtAttr(syscall.IFLA_NET_NS_PID, b) + wb.AddData(data) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + +func NetworkSetNsFd(iface *net.Interface, fd int) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_SETLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Type = syscall.RTM_SETLINK + msg.Flags = syscall.NLM_F_REQUEST + msg.Index = int32(iface.Index) + msg.Change = DEFAULT_CHANGE + wb.AddData(msg) + + var ( + b = make([]byte, 4) + native = nativeEndian() + ) + native.PutUint32(b, uint32(fd)) + + data := newRtAttr(IFLA_NET_NS_FD, b) + wb.AddData(data) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + // Add an Ip address to an interface. This is identical to: // ip addr add $ip/$ipNet dev $iface func NetworkLinkAddIp(iface *net.Interface, ip net.IP, ipNet *net.IPNet) error { @@ -426,20 +601,11 @@ func NetworkLinkAddIp(iface *net.Interface, ip net.IP, ipNet *net.IPNet) error { } func zeroTerminated(s string) []byte { - bytes := make([]byte, len(s)+1) - for i := 0; i < len(s); i++ { - bytes[i] = s[i] - } - bytes[len(s)] = 0 - return bytes + return []byte(s + "\000") } func nonZeroTerminated(s string) []byte { - bytes := make([]byte, len(s)) - for i := 0; i < len(s); i++ { - bytes[i] = s[i] - } - return bytes + return []byte(s) } // Add a new network link of a specified type. This is identical to @@ -456,10 +622,10 @@ func NetworkLinkAdd(name string, linkType string) error { msg := newIfInfomsg(syscall.AF_UNSPEC) wb.AddData(msg) - nameData := newRtAttr(syscall.IFLA_IFNAME, zeroTerminated(name)) - wb.AddData(nameData) - - IFLA_INFO_KIND := 1 + if name != "" { + nameData := newRtAttr(syscall.IFLA_IFNAME, zeroTerminated(name)) + wb.AddData(nameData) + } kindData := newRtAttr(IFLA_INFO_KIND, nonZeroTerminated(linkType)) @@ -576,3 +742,69 @@ done: return res, nil } + +func getIfSocket() (fd int, err error) { + for _, socket := range []int{ + syscall.AF_INET, + syscall.AF_PACKET, + syscall.AF_INET6, + } { + if fd, err = syscall.Socket(socket, syscall.SOCK_DGRAM, 0); err == nil { + break + } + } + if err == nil { + return fd, nil + } + return -1, err +} + +func NetworkChangeName(iface *net.Interface, newName string) error { + fd, err := getIfSocket() + if err != nil { + return err + } + defer syscall.Close(fd) + + data := [IFNAMSIZ * 2]byte{} + // the "-1"s here are very important for ensuring we get proper null + // termination of our new C strings + copy(data[:IFNAMSIZ-1], iface.Name) + copy(data[IFNAMSIZ:IFNAMSIZ*2-1], newName) + + if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.SIOCSIFNAME, uintptr(unsafe.Pointer(&data[0]))); errno != 0 { + return errno + } + return nil +} + +func NetworkCreateVethPair(name1, name2 string) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_NEWLINK, syscall.NLM_F_CREATE|syscall.NLM_F_EXCL|syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + wb.AddData(msg) + + nameData := newRtAttr(syscall.IFLA_IFNAME, zeroTerminated(name1)) + wb.AddData(nameData) + + nest1 := newRtAttr(syscall.IFLA_LINKINFO, nil) + newRtAttrChild(nest1, IFLA_INFO_KIND, zeroTerminated("veth")) + nest2 := newRtAttrChild(nest1, IFLA_INFO_DATA, nil) + nest3 := newRtAttrChild(nest2, VETH_INFO_PEER, nil) + + newIfInfomsgChild(nest3, syscall.AF_UNSPEC) + newRtAttrChild(nest3, syscall.IFLA_IFNAME, zeroTerminated(name2)) + + wb.AddData(nest1) + + if err := s.Send(wb); err != nil { + return err + } + return s.HandleAck(wb.Seq) +} diff --git a/netlink/netlink_unsupported.go b/netlink/netlink_unsupported.go index cd796b3..bd9e962 100644 --- a/netlink/netlink_unsupported.go +++ b/netlink/netlink_unsupported.go @@ -3,31 +3,59 @@ package netlink import ( - "fmt" + "errors" "net" ) +var ( + ErrNotImplemented = errors.New("not implemented") +) + func NetworkGetRoutes() ([]Route, error) { - return nil, fmt.Errorf("Not implemented") + return nil, ErrNotImplemented } func NetworkLinkAdd(name string, linkType string) error { - return fmt.Errorf("Not implemented") + return ErrNotImplemented } func NetworkLinkUp(iface *net.Interface) error { - return fmt.Errorf("Not implemented") + return ErrNotImplemented } func NetworkLinkAddIp(iface *net.Interface, ip net.IP, ipNet *net.IPNet) error { - return fmt.Errorf("Not implemented") + return ErrNotImplemented } func AddDefaultGw(ip net.IP) error { - return fmt.Errorf("Not implemented") + return ErrNotImplemented } func NetworkSetMTU(iface *net.Interface, mtu int) error { - return fmt.Errorf("Not implemented") + return ErrNotImplemented +} + +func NetworkCreateVethPair(name1, name2 string) error { + return ErrNotImplemented +} + +func NetworkChangeName(iface *net.Interface, newName string) error { + return ErrNotImplemented +} + +func NetworkSetNsFd(iface *net.Interface, fd int) error { + return ErrNotImplemented +} + +func NetworkSetNsPid(iface *net.Interface, nspid int) error { + return ErrNotImplemented +} + +func NetworkSetMaster(iface, master *net.Interface) error { + return ErrNotImplemented +} + +func NetworkLinkDown(iface *net.Interface) error { + return ErrNotImplemented } diff --git a/opts/opts.go b/opts/opts.go new file mode 100644 index 0000000..a1b8752 --- /dev/null +++ b/opts/opts.go @@ -0,0 +1,148 @@ +package opts + +import ( + "fmt" + "github.com/dotcloud/docker/utils" + "os" + "path/filepath" + "regexp" + "strings" +) + +// ListOpts type +type ListOpts struct { + values []string + validator ValidatorFctType +} + +func NewListOpts(validator ValidatorFctType) ListOpts { + return ListOpts{ + validator: validator, + } +} + +func (opts *ListOpts) String() string { + return fmt.Sprintf("%v", []string(opts.values)) +} + +// Set validates if needed the input value and add it to the +// internal slice. +func (opts *ListOpts) Set(value string) error { + if opts.validator != nil { + v, err := opts.validator(value) + if err != nil { + return err + } + value = v + } + opts.values = append(opts.values, value) + return nil +} + +// Delete remove the given element from the slice. +func (opts *ListOpts) Delete(key string) { + for i, k := range opts.values { + if k == key { + opts.values = append(opts.values[:i], opts.values[i+1:]...) + return + } + } +} + +// GetMap returns the content of values in a map in order to avoid +// duplicates. +// FIXME: can we remove this? +func (opts *ListOpts) GetMap() map[string]struct{} { + ret := make(map[string]struct{}) + for _, k := range opts.values { + ret[k] = struct{}{} + } + return ret +} + +// GetAll returns the values' slice. +// FIXME: Can we remove this? +func (opts *ListOpts) GetAll() []string { + return opts.values +} + +// Get checks the existence of the given key. +func (opts *ListOpts) Get(key string) bool { + for _, k := range opts.values { + if k == key { + return true + } + } + return false +} + +// Len returns the amount of element in the slice. +func (opts *ListOpts) Len() int { + return len(opts.values) +} + +// Validators +type ValidatorFctType func(val string) (string, error) + +func ValidateAttach(val string) (string, error) { + if val != "stdin" && val != "stdout" && val != "stderr" { + return val, fmt.Errorf("Unsupported stream name: %s", val) + } + return val, nil +} + +func ValidateLink(val string) (string, error) { + if _, err := parseLink(val); err != nil { + return val, err + } + return val, nil +} + +// FIXME: this is a duplicate of docker.utils.parseLink. +// it can't be moved to a separate links/ package because +// links depends on Container which is defined in the core. +// +// Links come in the format of +// name:alias +func parseLink(rawLink string) (map[string]string, error) { + return utils.PartParser("name:alias", rawLink) +} + +func ValidatePath(val string) (string, error) { + var containerPath string + + if strings.Count(val, ":") > 2 { + return val, fmt.Errorf("bad format for volumes: %s", val) + } + + splited := strings.SplitN(val, ":", 2) + if len(splited) == 1 { + containerPath = splited[0] + val = filepath.Clean(splited[0]) + } else { + containerPath = splited[1] + val = fmt.Sprintf("%s:%s", splited[0], filepath.Clean(splited[1])) + } + + if !filepath.IsAbs(containerPath) { + return val, fmt.Errorf("%s is not an absolute path", containerPath) + } + return val, nil +} + +func ValidateEnv(val string) (string, error) { + arr := strings.Split(val, "=") + if len(arr) > 1 { + return val, nil + } + return fmt.Sprintf("%s=%s", val, os.Getenv(val)), nil +} + +func ValidateIp4Address(val string) (string, error) { + re := regexp.MustCompile(`^(([0-9]+\.){3}([0-9]+))\s*$`) + var ns = re.FindSubmatch([]byte(val)) + if len(ns) > 0 { + return string(ns[1]), nil + } + return "", fmt.Errorf("%s is not an ip4 address", val) +} diff --git a/opts/opts_test.go b/opts/opts_test.go new file mode 100644 index 0000000..a5c1fac --- /dev/null +++ b/opts/opts_test.go @@ -0,0 +1,24 @@ +package opts + +import ( + "testing" +) + +func TestValidateIP4(t *testing.T) { + if ret, err := ValidateIp4Address(`1.2.3.4`); err != nil || ret == "" { + t.Fatalf("ValidateIp4Address(`1.2.3.4`) got %s %s", ret, err) + } + + if ret, err := ValidateIp4Address(`127.0.0.1`); err != nil || ret == "" { + t.Fatalf("ValidateIp4Address(`127.0.0.1`) got %s %s", ret, err) + } + + if ret, err := ValidateIp4Address(`127`); err == nil || ret != "" { + t.Fatalf("ValidateIp4Address(`127`) got %s %s", ret, err) + } + + if ret, err := ValidateIp4Address(`random invalid string`); err == nil || ret != "" { + t.Fatalf("ValidateIp4Address(`random invalid string`) got %s %s", ret, err) + } + +} diff --git a/proxy/MAINTAINERS b/proxy/MAINTAINERS new file mode 100644 index 0000000..1e998f8 --- /dev/null +++ b/proxy/MAINTAINERS @@ -0,0 +1 @@ +Michael Crosby (@crosbymichael) diff --git a/proxy/network_proxy_test.go b/proxy/network_proxy_test.go new file mode 100644 index 0000000..9e38256 --- /dev/null +++ b/proxy/network_proxy_test.go @@ -0,0 +1,216 @@ +package proxy + +import ( + "bytes" + "fmt" + "io" + "net" + "strings" + "testing" + "time" +) + +var testBuf = []byte("Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo") +var testBufSize = len(testBuf) + +type EchoServer interface { + Run() + Close() + LocalAddr() net.Addr +} + +type TCPEchoServer struct { + listener net.Listener + testCtx *testing.T +} + +type UDPEchoServer struct { + conn net.PacketConn + testCtx *testing.T +} + +func NewEchoServer(t *testing.T, proto, address string) EchoServer { + var server EchoServer + if strings.HasPrefix(proto, "tcp") { + listener, err := net.Listen(proto, address) + if err != nil { + t.Fatal(err) + } + server = &TCPEchoServer{listener: listener, testCtx: t} + } else { + socket, err := net.ListenPacket(proto, address) + if err != nil { + t.Fatal(err) + } + server = &UDPEchoServer{conn: socket, testCtx: t} + } + return server +} + +func (server *TCPEchoServer) Run() { + go func() { + for { + client, err := server.listener.Accept() + if err != nil { + return + } + go func(client net.Conn) { + if _, err := io.Copy(client, client); err != nil { + server.testCtx.Logf("can't echo to the client: %v\n", err.Error()) + } + client.Close() + }(client) + } + }() +} + +func (server *TCPEchoServer) LocalAddr() net.Addr { return server.listener.Addr() } +func (server *TCPEchoServer) Close() { server.listener.Addr() } + +func (server *UDPEchoServer) Run() { + go func() { + readBuf := make([]byte, 1024) + for { + read, from, err := server.conn.ReadFrom(readBuf) + if err != nil { + return + } + for i := 0; i != read; { + written, err := server.conn.WriteTo(readBuf[i:read], from) + if err != nil { + break + } + i += written + } + } + }() +} + +func (server *UDPEchoServer) LocalAddr() net.Addr { return server.conn.LocalAddr() } +func (server *UDPEchoServer) Close() { server.conn.Close() } + +func testProxyAt(t *testing.T, proto string, proxy Proxy, addr string) { + defer proxy.Close() + go proxy.Run() + client, err := net.Dial(proto, addr) + if err != nil { + t.Fatalf("Can't connect to the proxy: %v", err) + } + defer client.Close() + client.SetDeadline(time.Now().Add(10 * time.Second)) + if _, err = client.Write(testBuf); err != nil { + t.Fatal(err) + } + recvBuf := make([]byte, testBufSize) + if _, err = client.Read(recvBuf); err != nil { + t.Fatal(err) + } + if !bytes.Equal(testBuf, recvBuf) { + t.Fatal(fmt.Errorf("Expected [%v] but got [%v]", testBuf, recvBuf)) + } +} + +func testProxy(t *testing.T, proto string, proxy Proxy) { + testProxyAt(t, proto, proxy, proxy.FrontendAddr().String()) +} + +func TestTCP4Proxy(t *testing.T) { + backend := NewEchoServer(t, "tcp", "127.0.0.1:0") + defer backend.Close() + backend.Run() + frontendAddr := &net.TCPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 0} + proxy, err := NewProxy(frontendAddr, backend.LocalAddr()) + if err != nil { + t.Fatal(err) + } + testProxy(t, "tcp", proxy) +} + +func TestTCP6Proxy(t *testing.T) { + backend := NewEchoServer(t, "tcp", "[::1]:0") + defer backend.Close() + backend.Run() + frontendAddr := &net.TCPAddr{IP: net.IPv6loopback, Port: 0} + proxy, err := NewProxy(frontendAddr, backend.LocalAddr()) + if err != nil { + t.Fatal(err) + } + testProxy(t, "tcp", proxy) +} + +func TestTCPDualStackProxy(t *testing.T) { + // If I understand `godoc -src net favoriteAddrFamily` (used by the + // net.Listen* functions) correctly this should work, but it doesn't. + t.Skip("No support for dual stack yet") + backend := NewEchoServer(t, "tcp", "[::1]:0") + defer backend.Close() + backend.Run() + frontendAddr := &net.TCPAddr{IP: net.IPv6loopback, Port: 0} + proxy, err := NewProxy(frontendAddr, backend.LocalAddr()) + if err != nil { + t.Fatal(err) + } + ipv4ProxyAddr := &net.TCPAddr{ + IP: net.IPv4(127, 0, 0, 1), + Port: proxy.FrontendAddr().(*net.TCPAddr).Port, + } + testProxyAt(t, "tcp", proxy, ipv4ProxyAddr.String()) +} + +func TestUDP4Proxy(t *testing.T) { + backend := NewEchoServer(t, "udp", "127.0.0.1:0") + defer backend.Close() + backend.Run() + frontendAddr := &net.UDPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 0} + proxy, err := NewProxy(frontendAddr, backend.LocalAddr()) + if err != nil { + t.Fatal(err) + } + testProxy(t, "udp", proxy) +} + +func TestUDP6Proxy(t *testing.T) { + backend := NewEchoServer(t, "udp", "[::1]:0") + defer backend.Close() + backend.Run() + frontendAddr := &net.UDPAddr{IP: net.IPv6loopback, Port: 0} + proxy, err := NewProxy(frontendAddr, backend.LocalAddr()) + if err != nil { + t.Fatal(err) + } + testProxy(t, "udp", proxy) +} + +func TestUDPWriteError(t *testing.T) { + frontendAddr := &net.UDPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 0} + // Hopefully, this port will be free: */ + backendAddr := &net.UDPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 25587} + proxy, err := NewProxy(frontendAddr, backendAddr) + if err != nil { + t.Fatal(err) + } + defer proxy.Close() + go proxy.Run() + client, err := net.Dial("udp", "127.0.0.1:25587") + if err != nil { + t.Fatalf("Can't connect to the proxy: %v", err) + } + defer client.Close() + // Make sure the proxy doesn't stop when there is no actual backend: + client.Write(testBuf) + client.Write(testBuf) + backend := NewEchoServer(t, "udp", "127.0.0.1:25587") + defer backend.Close() + backend.Run() + client.SetDeadline(time.Now().Add(10 * time.Second)) + if _, err = client.Write(testBuf); err != nil { + t.Fatal(err) + } + recvBuf := make([]byte, testBufSize) + if _, err = client.Read(recvBuf); err != nil { + t.Fatal(err) + } + if !bytes.Equal(testBuf, recvBuf) { + t.Fatal(fmt.Errorf("Expected [%v] but got [%v]", testBuf, recvBuf)) + } +} diff --git a/proxy/proxy.go b/proxy/proxy.go new file mode 100644 index 0000000..7a711f6 --- /dev/null +++ b/proxy/proxy.go @@ -0,0 +1,29 @@ +package proxy + +import ( + "fmt" + "net" +) + +type Proxy interface { + // Start forwarding traffic back and forth the front and back-end + // addresses. + Run() + // Stop forwarding traffic and close both ends of the Proxy. + Close() + // Return the address on which the proxy is listening. + FrontendAddr() net.Addr + // Return the proxied address. + BackendAddr() net.Addr +} + +func NewProxy(frontendAddr, backendAddr net.Addr) (Proxy, error) { + switch frontendAddr.(type) { + case *net.UDPAddr: + return NewUDPProxy(frontendAddr.(*net.UDPAddr), backendAddr.(*net.UDPAddr)) + case *net.TCPAddr: + return NewTCPProxy(frontendAddr.(*net.TCPAddr), backendAddr.(*net.TCPAddr)) + default: + panic(fmt.Errorf("Unsupported protocol")) + } +} diff --git a/proxy/stub_proxy.go b/proxy/stub_proxy.go new file mode 100644 index 0000000..7684427 --- /dev/null +++ b/proxy/stub_proxy.go @@ -0,0 +1,22 @@ +package proxy + +import ( + "net" +) + +type StubProxy struct { + frontendAddr net.Addr + backendAddr net.Addr +} + +func (p *StubProxy) Run() {} +func (p *StubProxy) Close() {} +func (p *StubProxy) FrontendAddr() net.Addr { return p.frontendAddr } +func (p *StubProxy) BackendAddr() net.Addr { return p.backendAddr } + +func NewStubProxy(frontendAddr, backendAddr net.Addr) (Proxy, error) { + return &StubProxy{ + frontendAddr: frontendAddr, + backendAddr: backendAddr, + }, nil +} diff --git a/proxy/tcp_proxy.go b/proxy/tcp_proxy.go new file mode 100644 index 0000000..1aa6d9f --- /dev/null +++ b/proxy/tcp_proxy.go @@ -0,0 +1,89 @@ +package proxy + +import ( + "io" + "log" + "net" + "syscall" +) + +type TCPProxy struct { + listener *net.TCPListener + frontendAddr *net.TCPAddr + backendAddr *net.TCPAddr +} + +func NewTCPProxy(frontendAddr, backendAddr *net.TCPAddr) (*TCPProxy, error) { + listener, err := net.ListenTCP("tcp", frontendAddr) + if err != nil { + return nil, err + } + // If the port in frontendAddr was 0 then ListenTCP will have a picked + // a port to listen on, hence the call to Addr to get that actual port: + return &TCPProxy{ + listener: listener, + frontendAddr: listener.Addr().(*net.TCPAddr), + backendAddr: backendAddr, + }, nil +} + +func (proxy *TCPProxy) clientLoop(client *net.TCPConn, quit chan bool) { + backend, err := net.DialTCP("tcp", nil, proxy.backendAddr) + if err != nil { + log.Printf("Can't forward traffic to backend tcp/%v: %s\n", proxy.backendAddr, err) + client.Close() + return + } + + event := make(chan int64) + var broker = func(to, from *net.TCPConn) { + written, err := io.Copy(to, from) + if err != nil { + // If the socket we are writing to is shutdown with + // SHUT_WR, forward it to the other end of the pipe: + if err, ok := err.(*net.OpError); ok && err.Err == syscall.EPIPE { + from.CloseWrite() + } + } + to.CloseRead() + event <- written + } + + go broker(client, backend) + go broker(backend, client) + + var transferred int64 = 0 + for i := 0; i < 2; i++ { + select { + case written := <-event: + transferred += written + case <-quit: + // Interrupt the two brokers and "join" them. + client.Close() + backend.Close() + for ; i < 2; i++ { + transferred += <-event + } + return + } + } + client.Close() + backend.Close() +} + +func (proxy *TCPProxy) Run() { + quit := make(chan bool) + defer close(quit) + for { + client, err := proxy.listener.Accept() + if err != nil { + log.Printf("Stopping proxy on tcp/%v for tcp/%v (%s)", proxy.frontendAddr, proxy.backendAddr, err) + return + } + go proxy.clientLoop(client.(*net.TCPConn), quit) + } +} + +func (proxy *TCPProxy) Close() { proxy.listener.Close() } +func (proxy *TCPProxy) FrontendAddr() net.Addr { return proxy.frontendAddr } +func (proxy *TCPProxy) BackendAddr() net.Addr { return proxy.backendAddr } diff --git a/proxy/udp_proxy.go b/proxy/udp_proxy.go new file mode 100644 index 0000000..14f2306 --- /dev/null +++ b/proxy/udp_proxy.go @@ -0,0 +1,156 @@ +package proxy + +import ( + "encoding/binary" + "log" + "net" + "strings" + "sync" + "syscall" + "time" +) + +const ( + UDPConnTrackTimeout = 90 * time.Second + UDPBufSize = 2048 +) + +// A net.Addr where the IP is split into two fields so you can use it as a key +// in a map: +type connTrackKey struct { + IPHigh uint64 + IPLow uint64 + Port int +} + +func newConnTrackKey(addr *net.UDPAddr) *connTrackKey { + if len(addr.IP) == net.IPv4len { + return &connTrackKey{ + IPHigh: 0, + IPLow: uint64(binary.BigEndian.Uint32(addr.IP)), + Port: addr.Port, + } + } + return &connTrackKey{ + IPHigh: binary.BigEndian.Uint64(addr.IP[:8]), + IPLow: binary.BigEndian.Uint64(addr.IP[8:]), + Port: addr.Port, + } +} + +type connTrackMap map[connTrackKey]*net.UDPConn + +type UDPProxy struct { + listener *net.UDPConn + frontendAddr *net.UDPAddr + backendAddr *net.UDPAddr + connTrackTable connTrackMap + connTrackLock sync.Mutex +} + +func NewUDPProxy(frontendAddr, backendAddr *net.UDPAddr) (*UDPProxy, error) { + listener, err := net.ListenUDP("udp", frontendAddr) + if err != nil { + return nil, err + } + return &UDPProxy{ + listener: listener, + frontendAddr: listener.LocalAddr().(*net.UDPAddr), + backendAddr: backendAddr, + connTrackTable: make(connTrackMap), + }, nil +} + +func (proxy *UDPProxy) replyLoop(proxyConn *net.UDPConn, clientAddr *net.UDPAddr, clientKey *connTrackKey) { + defer func() { + proxy.connTrackLock.Lock() + delete(proxy.connTrackTable, *clientKey) + proxy.connTrackLock.Unlock() + proxyConn.Close() + }() + + readBuf := make([]byte, UDPBufSize) + for { + proxyConn.SetReadDeadline(time.Now().Add(UDPConnTrackTimeout)) + again: + read, err := proxyConn.Read(readBuf) + if err != nil { + if err, ok := err.(*net.OpError); ok && err.Err == syscall.ECONNREFUSED { + // This will happen if the last write failed + // (e.g: nothing is actually listening on the + // proxied port on the container), ignore it + // and continue until UDPConnTrackTimeout + // expires: + goto again + } + return + } + for i := 0; i != read; { + written, err := proxy.listener.WriteToUDP(readBuf[i:read], clientAddr) + if err != nil { + return + } + i += written + } + } +} + +func (proxy *UDPProxy) Run() { + readBuf := make([]byte, UDPBufSize) + for { + read, from, err := proxy.listener.ReadFromUDP(readBuf) + if err != nil { + // NOTE: Apparently ReadFrom doesn't return + // ECONNREFUSED like Read do (see comment in + // UDPProxy.replyLoop) + if !isClosedError(err) { + log.Printf("Stopping proxy on udp/%v for udp/%v (%s)", proxy.frontendAddr, proxy.backendAddr, err) + } + break + } + + fromKey := newConnTrackKey(from) + proxy.connTrackLock.Lock() + proxyConn, hit := proxy.connTrackTable[*fromKey] + if !hit { + proxyConn, err = net.DialUDP("udp", nil, proxy.backendAddr) + if err != nil { + log.Printf("Can't proxy a datagram to udp/%s: %s\n", proxy.backendAddr, err) + continue + } + proxy.connTrackTable[*fromKey] = proxyConn + go proxy.replyLoop(proxyConn, from, fromKey) + } + proxy.connTrackLock.Unlock() + for i := 0; i != read; { + written, err := proxyConn.Write(readBuf[i:read]) + if err != nil { + log.Printf("Can't proxy a datagram to udp/%s: %s\n", proxy.backendAddr, err) + break + } + i += written + } + } +} + +func (proxy *UDPProxy) Close() { + proxy.listener.Close() + proxy.connTrackLock.Lock() + defer proxy.connTrackLock.Unlock() + for _, conn := range proxy.connTrackTable { + conn.Close() + } +} + +func (proxy *UDPProxy) FrontendAddr() net.Addr { return proxy.frontendAddr } +func (proxy *UDPProxy) BackendAddr() net.Addr { return proxy.backendAddr } + +func isClosedError(err error) bool { + /* This comparison is ugly, but unfortunately, net.go doesn't export errClosing. + * See: + * http://golang.org/src/pkg/net/net.go + * https://code.google.com/p/go/issues/detail?id=4337 + * https://groups.google.com/forum/#!msg/golang-nuts/0_aaCvBmOcM/SptmDyX1XJMJ + */ + return strings.HasSuffix(err.Error(), "use of closed network connection") +} diff --git a/system/calls_linux.go b/system/calls_linux.go new file mode 100644 index 0000000..bf667c5 --- /dev/null +++ b/system/calls_linux.go @@ -0,0 +1,145 @@ +package system + +import ( + "os/exec" + "syscall" +) + +func Chroot(dir string) error { + return syscall.Chroot(dir) +} + +func Chdir(dir string) error { + return syscall.Chdir(dir) +} + +func Exec(cmd string, args []string, env []string) error { + return syscall.Exec(cmd, args, env) +} + +func Execv(cmd string, args []string, env []string) error { + name, err := exec.LookPath(cmd) + if err != nil { + return err + } + return Exec(name, args, env) +} + +func Fork() (int, error) { + syscall.ForkLock.Lock() + pid, _, err := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) + syscall.ForkLock.Unlock() + if err != 0 { + return -1, err + } + return int(pid), nil +} + +func Mount(source, target, fstype string, flags uintptr, data string) error { + return syscall.Mount(source, target, fstype, flags, data) +} + +func Unmount(target string, flags int) error { + return syscall.Unmount(target, flags) +} + +func Pivotroot(newroot, putold string) error { + return syscall.PivotRoot(newroot, putold) +} + +func Unshare(flags int) error { + return syscall.Unshare(flags) +} + +func Clone(flags uintptr) (int, error) { + syscall.ForkLock.Lock() + pid, _, err := syscall.RawSyscall(syscall.SYS_CLONE, flags, 0, 0) + syscall.ForkLock.Unlock() + if err != 0 { + return -1, err + } + return int(pid), nil +} + +func UsetCloseOnExec(fd uintptr) error { + if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, fd, syscall.F_SETFD, 0); err != 0 { + return err + } + return nil +} + +func Setgroups(gids []int) error { + return syscall.Setgroups(gids) +} + +func Setresgid(rgid, egid, sgid int) error { + return syscall.Setresgid(rgid, egid, sgid) +} + +func Setresuid(ruid, euid, suid int) error { + return syscall.Setresuid(ruid, euid, suid) +} + +func Setgid(gid int) error { + return syscall.Setgid(gid) +} + +func Setuid(uid int) error { + return syscall.Setuid(uid) +} + +func Sethostname(name string) error { + return syscall.Sethostname([]byte(name)) +} + +func Setsid() (int, error) { + return syscall.Setsid() +} + +func Ioctl(fd uintptr, flag, data uintptr) error { + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 { + return err + } + return nil +} + +func Closefd(fd uintptr) error { + return syscall.Close(int(fd)) +} + +func Dup2(fd1, fd2 uintptr) error { + return syscall.Dup2(int(fd1), int(fd2)) +} + +func Mknod(path string, mode uint32, dev int) error { + return syscall.Mknod(path, mode, dev) +} + +func ParentDeathSignal() error { + if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0); err != 0 { + return err + } + return nil +} + +func Setctty() error { + if _, _, err := syscall.RawSyscall(syscall.SYS_IOCTL, 0, uintptr(syscall.TIOCSCTTY), 0); err != 0 { + return err + } + return nil +} + +func Mkfifo(name string, mode uint32) error { + return syscall.Mkfifo(name, mode) +} + +func Umask(mask int) int { + return syscall.Umask(mask) +} + +func SetCloneFlags(cmd *exec.Cmd, flag uintptr) { + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + cmd.SysProcAttr.Cloneflags = flag +} diff --git a/system/errors.go b/system/errors.go new file mode 100644 index 0000000..6304518 --- /dev/null +++ b/system/errors.go @@ -0,0 +1,9 @@ +package system + +import ( + "errors" +) + +var ( + ErrNotSupportedPlatform = errors.New("platform and architecture is not supported") +) diff --git a/system/pty_linux.go b/system/pty_linux.go new file mode 100644 index 0000000..ca588d8 --- /dev/null +++ b/system/pty_linux.go @@ -0,0 +1,58 @@ +package system + +import ( + "fmt" + "os" + "syscall" + "unsafe" +) + +// Unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f. +// Unlockpt should be called before opening the slave side of a pseudoterminal. +func Unlockpt(f *os.File) error { + var u int + return Ioctl(f.Fd(), syscall.TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) +} + +// Ptsname retrieves the name of the first available pts for the given master. +func Ptsname(f *os.File) (string, error) { + var n int + + if err := Ioctl(f.Fd(), syscall.TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { + return "", err + } + return fmt.Sprintf("/dev/pts/%d", n), nil +} + +// CreateMasterAndConsole will open /dev/ptmx on the host and retreive the +// pts name for use as the pty slave inside the container +func CreateMasterAndConsole() (*os.File, string, error) { + master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) + if err != nil { + return nil, "", err + } + console, err := Ptsname(master) + if err != nil { + return nil, "", err + } + if err := Unlockpt(master); err != nil { + return nil, "", err + } + return master, console, nil +} + +// OpenPtmx opens /dev/ptmx, i.e. the PTY master. +func OpenPtmx() (*os.File, error) { + // O_NOCTTY and O_CLOEXEC are not present in os package so we use the syscall's one for all. + return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) +} + +// OpenTerminal is a clone of os.OpenFile without the O_CLOEXEC +// used to open the pty slave inside the container namespace +func OpenTerminal(name string, flag int) (*os.File, error) { + r, e := syscall.Open(name, flag, 0) + if e != nil { + return nil, &os.PathError{"open", name, e} + } + return os.NewFile(uintptr(r), name), nil +} diff --git a/system/setns_linux.go b/system/setns_linux.go new file mode 100644 index 0000000..2b6f9e7 --- /dev/null +++ b/system/setns_linux.go @@ -0,0 +1,27 @@ +package system + +import ( + "fmt" + "runtime" + "syscall" +) + +// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 +// +// We need different setns values for the different platforms and arch +// We are declaring the macro here because the SETNS syscall does not exist in th stdlib +var setNsMap = map[string]uintptr{ + "linux/amd64": 308, +} + +func Setns(fd uintptr, flags uintptr) error { + ns, exists := setNsMap[fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH)] + if !exists { + return ErrNotSupportedPlatform + } + _, _, err := syscall.RawSyscall(ns, fd, flags, 0) + if err != 0 { + return err + } + return nil +} diff --git a/system/stat_linux.go b/system/stat_linux.go new file mode 100644 index 0000000..e702200 --- /dev/null +++ b/system/stat_linux.go @@ -0,0 +1,13 @@ +package system + +import ( + "syscall" +) + +func GetLastAccess(stat *syscall.Stat_t) syscall.Timespec { + return stat.Atim +} + +func GetLastModification(stat *syscall.Stat_t) syscall.Timespec { + return stat.Mtim +} diff --git a/system/stat_unsupported.go b/system/stat_unsupported.go new file mode 100644 index 0000000..4686a4c --- /dev/null +++ b/system/stat_unsupported.go @@ -0,0 +1,13 @@ +// +build !linux + +package system + +import "syscall" + +func GetLastAccess(stat *syscall.Stat_t) syscall.Timespec { + return stat.Atimespec +} + +func GetLastModification(stat *syscall.Stat_t) syscall.Timespec { + return stat.Mtimespec +} diff --git a/system/unsupported.go b/system/unsupported.go new file mode 100644 index 0000000..eb3ec7e --- /dev/null +++ b/system/unsupported.go @@ -0,0 +1,15 @@ +// +build !linux + +package system + +import ( + "os/exec" +) + +func SetCloneFlags(cmd *exec.Cmd, flag uintptr) { + +} + +func UsetCloseOnExec(fd uintptr) error { + return ErrNotSupportedPlatform +} diff --git a/system/utimes_linux.go b/system/utimes_linux.go new file mode 100644 index 0000000..c00f402 --- /dev/null +++ b/system/utimes_linux.go @@ -0,0 +1,31 @@ +package system + +import ( + "syscall" + "unsafe" +) + +func LUtimesNano(path string, ts []syscall.Timespec) error { + // These are not currently available in syscall + AT_FDCWD := -100 + AT_SYMLINK_NOFOLLOW := 0x100 + + var _path *byte + _path, err := syscall.BytePtrFromString(path) + if err != nil { + return err + } + + if _, _, err := syscall.Syscall6(syscall.SYS_UTIMENSAT, uintptr(AT_FDCWD), uintptr(unsafe.Pointer(_path)), uintptr(unsafe.Pointer(&ts[0])), uintptr(AT_SYMLINK_NOFOLLOW), 0, 0); err != 0 && err != syscall.ENOSYS { + return err + } + + return nil +} + +func UtimesNano(path string, ts []syscall.Timespec) error { + if err := syscall.UtimesNano(path, ts); err != nil { + return err + } + return nil +} diff --git a/system/utimes_unsupported.go b/system/utimes_unsupported.go new file mode 100644 index 0000000..d247ba2 --- /dev/null +++ b/system/utimes_unsupported.go @@ -0,0 +1,13 @@ +// +build !linux + +package system + +import "syscall" + +func LUtimesNano(path string, ts []syscall.Timespec) error { + return ErrNotSupportedPlatform +} + +func UtimesNano(path string, ts []syscall.Timespec) error { + return ErrNotSupportedPlatform +} diff --git a/system/xattrs_linux.go b/system/xattrs_linux.go new file mode 100644 index 0000000..00edb20 --- /dev/null +++ b/system/xattrs_linux.go @@ -0,0 +1,59 @@ +package system + +import ( + "syscall" + "unsafe" +) + +// Returns a nil slice and nil error if the xattr is not set +func Lgetxattr(path string, attr string) ([]byte, error) { + pathBytes, err := syscall.BytePtrFromString(path) + if err != nil { + return nil, err + } + attrBytes, err := syscall.BytePtrFromString(attr) + if err != nil { + return nil, err + } + + dest := make([]byte, 128) + destBytes := unsafe.Pointer(&dest[0]) + sz, _, errno := syscall.Syscall6(syscall.SYS_LGETXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(unsafe.Pointer(attrBytes)), uintptr(destBytes), uintptr(len(dest)), 0, 0) + if errno == syscall.ENODATA { + return nil, nil + } + if errno == syscall.ERANGE { + dest = make([]byte, sz) + destBytes := unsafe.Pointer(&dest[0]) + sz, _, errno = syscall.Syscall6(syscall.SYS_LGETXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(unsafe.Pointer(attrBytes)), uintptr(destBytes), uintptr(len(dest)), 0, 0) + } + if errno != 0 { + return nil, errno + } + + return dest[:sz], nil +} + +var _zero uintptr + +func Lsetxattr(path string, attr string, data []byte, flags int) error { + pathBytes, err := syscall.BytePtrFromString(path) + if err != nil { + return err + } + attrBytes, err := syscall.BytePtrFromString(attr) + if err != nil { + return err + } + var dataBytes unsafe.Pointer + if len(data) > 0 { + dataBytes = unsafe.Pointer(&data[0]) + } else { + dataBytes = unsafe.Pointer(&_zero) + } + _, _, errno := syscall.Syscall6(syscall.SYS_LSETXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(unsafe.Pointer(attrBytes)), uintptr(dataBytes), uintptr(len(data)), uintptr(flags), 0) + if errno != 0 { + return errno + } + return nil +} diff --git a/system/xattrs_unsupported.go b/system/xattrs_unsupported.go new file mode 100644 index 0000000..0060c16 --- /dev/null +++ b/system/xattrs_unsupported.go @@ -0,0 +1,11 @@ +// +build !linux + +package system + +func Lgetxattr(path string, attr string) ([]byte, error) { + return nil, ErrNotSupportedPlatform +} + +func Lsetxattr(path string, attr string, data []byte, flags int) error { + return ErrNotSupportedPlatform +} diff --git a/systemd/MAINTAINERS b/systemd/MAINTAINERS new file mode 100644 index 0000000..51228b3 --- /dev/null +++ b/systemd/MAINTAINERS @@ -0,0 +1 @@ +Brandon Philips (@philips) diff --git a/user/MAINTAINERS b/user/MAINTAINERS new file mode 100644 index 0000000..18e05a3 --- /dev/null +++ b/user/MAINTAINERS @@ -0,0 +1 @@ +Tianon Gravi (@tianon) diff --git a/user/user.go b/user/user.go new file mode 100644 index 0000000..1672f7e --- /dev/null +++ b/user/user.go @@ -0,0 +1,241 @@ +package user + +import ( + "bufio" + "fmt" + "io" + "os" + "strconv" + "strings" +) + +type User struct { + Name string + Pass string + Uid int + Gid int + Gecos string + Home string + Shell string +} + +type Group struct { + Name string + Pass string + Gid int + List []string +} + +func parseLine(line string, v ...interface{}) { + if line == "" { + return + } + + parts := strings.Split(line, ":") + for i, p := range parts { + if len(v) <= i { + // if we have more "parts" than we have places to put them, bail for great "tolerance" of naughty configuration files + break + } + + switch e := v[i].(type) { + case *string: + // "root", "adm", "/bin/bash" + *e = p + case *int: + // "0", "4", "1000" + // ignore string to int conversion errors, for great "tolerance" of naughty configuration files + *e, _ = strconv.Atoi(p) + case *[]string: + // "", "root", "root,adm,daemon" + if p != "" { + *e = strings.Split(p, ",") + } else { + *e = []string{} + } + default: + // panic, because this is a programming/logic error, not a runtime one + panic("parseLine expects only pointers! argument " + strconv.Itoa(i) + " is not a pointer!") + } + } +} + +func ParsePasswd() ([]*User, error) { + return ParsePasswdFilter(nil) +} + +func ParsePasswdFilter(filter func(*User) bool) ([]*User, error) { + f, err := os.Open("/etc/passwd") + if err != nil { + return nil, err + } + defer f.Close() + return parsePasswdFile(f, filter) +} + +func parsePasswdFile(r io.Reader, filter func(*User) bool) ([]*User, error) { + var ( + s = bufio.NewScanner(r) + out = []*User{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + text := strings.TrimSpace(s.Text()) + if text == "" { + continue + } + + // see: man 5 passwd + // name:password:UID:GID:GECOS:directory:shell + // Name:Pass:Uid:Gid:Gecos:Home:Shell + // root:x:0:0:root:/root:/bin/bash + // adm:x:3:4:adm:/var/adm:/bin/false + p := &User{} + parseLine( + text, + &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell, + ) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +func ParseGroup() ([]*Group, error) { + return ParseGroupFilter(nil) +} + +func ParseGroupFilter(filter func(*Group) bool) ([]*Group, error) { + f, err := os.Open("/etc/group") + if err != nil { + return nil, err + } + defer f.Close() + return parseGroupFile(f, filter) +} + +func parseGroupFile(r io.Reader, filter func(*Group) bool) ([]*Group, error) { + var ( + s = bufio.NewScanner(r) + out = []*Group{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + text := s.Text() + if text == "" { + continue + } + + // see: man 5 group + // group_name:password:GID:user_list + // Name:Pass:Gid:List + // root:x:0:root + // adm:x:4:root,adm,daemon + p := &Group{} + parseLine( + text, + &p.Name, &p.Pass, &p.Gid, &p.List, + ) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +// Given a string like "user", "1000", "user:group", "1000:1000", returns the uid, gid, and list of supplementary group IDs, if possible. +func GetUserGroupSupplementary(userSpec string, defaultUid int, defaultGid int) (int, int, []int, error) { + var ( + uid = defaultUid + gid = defaultGid + suppGids = []int{} + + userArg, groupArg string + ) + + // allow for userArg to have either "user" syntax, or optionally "user:group" syntax + parseLine(userSpec, &userArg, &groupArg) + + users, err := ParsePasswdFilter(func(u *User) bool { + if userArg == "" { + return u.Uid == uid + } + return u.Name == userArg || strconv.Itoa(u.Uid) == userArg + }) + if err != nil && !os.IsNotExist(err) { + if userArg == "" { + userArg = strconv.Itoa(uid) + } + return 0, 0, nil, fmt.Errorf("Unable to find user %v: %v", userArg, err) + } + + haveUser := users != nil && len(users) > 0 + if haveUser { + // if we found any user entries that matched our filter, let's take the first one as "correct" + uid = users[0].Uid + gid = users[0].Gid + } else if userArg != "" { + // we asked for a user but didn't find them... let's check to see if we wanted a numeric user + uid, err = strconv.Atoi(userArg) + if err != nil { + // not numeric - we have to bail + return 0, 0, nil, fmt.Errorf("Unable to find user %v", userArg) + } + + // if userArg couldn't be found in /etc/passwd but is numeric, just roll with it - this is legit + } + + if groupArg != "" || (haveUser && users[0].Name != "") { + groups, err := ParseGroupFilter(func(g *Group) bool { + if groupArg != "" { + return g.Name == groupArg || strconv.Itoa(g.Gid) == groupArg + } + for _, u := range g.List { + if u == users[0].Name { + return true + } + } + return false + }) + if err != nil && !os.IsNotExist(err) { + return 0, 0, nil, fmt.Errorf("Unable to find groups for user %v: %v", users[0].Name, err) + } + + haveGroup := groups != nil && len(groups) > 0 + if groupArg != "" { + if haveGroup { + // if we found any group entries that matched our filter, let's take the first one as "correct" + gid = groups[0].Gid + } else { + // we asked for a group but didn't find id... let's check to see if we wanted a numeric group + gid, err = strconv.Atoi(groupArg) + if err != nil { + // not numeric - we have to bail + return 0, 0, nil, fmt.Errorf("Unable to find group %v", groupArg) + } + + // if groupArg couldn't be found in /etc/group but is numeric, just roll with it - this is legit + } + } else if haveGroup { + suppGids = make([]int, len(groups)) + for i, group := range groups { + suppGids[i] = group.Gid + } + } + } + + return uid, gid, suppGids, nil +} diff --git a/user/user_test.go b/user/user_test.go new file mode 100644 index 0000000..136632c --- /dev/null +++ b/user/user_test.go @@ -0,0 +1,94 @@ +package user + +import ( + "strings" + "testing" +) + +func TestUserParseLine(t *testing.T) { + var ( + a, b string + c []string + d int + ) + + parseLine("", &a, &b) + if a != "" || b != "" { + t.Fatalf("a and b should be empty ('%v', '%v')", a, b) + } + + parseLine("a", &a, &b) + if a != "a" || b != "" { + t.Fatalf("a should be 'a' and b should be empty ('%v', '%v')", a, b) + } + + parseLine("bad boys:corny cows", &a, &b) + if a != "bad boys" || b != "corny cows" { + t.Fatalf("a should be 'bad boys' and b should be 'corny cows' ('%v', '%v')", a, b) + } + + parseLine("", &c) + if len(c) != 0 { + t.Fatalf("c should be empty (%#v)", c) + } + + parseLine("d,e,f:g:h:i,j,k", &c, &a, &b, &c) + if a != "g" || b != "h" || len(c) != 3 || c[0] != "i" || c[1] != "j" || c[2] != "k" { + t.Fatalf("a should be 'g', b should be 'h', and c should be ['i','j','k'] ('%v', '%v', '%#v')", a, b, c) + } + + parseLine("::::::::::", &a, &b, &c) + if a != "" || b != "" || len(c) != 0 { + t.Fatalf("a, b, and c should all be empty ('%v', '%v', '%#v')", a, b, c) + } + + parseLine("not a number", &d) + if d != 0 { + t.Fatalf("d should be 0 (%v)", d) + } + + parseLine("b:12:c", &a, &d, &b) + if a != "b" || b != "c" || d != 12 { + t.Fatalf("a should be 'b' and b should be 'c', and d should be 12 ('%v', '%v', %v)", a, b, d) + } +} + +func TestUserParsePasswd(t *testing.T) { + users, err := parsePasswdFile(strings.NewReader(` +root:x:0:0:root:/root:/bin/bash +adm:x:3:4:adm:/var/adm:/bin/false +this is just some garbage data +`), nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + if len(users) != 3 { + t.Fatalf("Expected 3 users, got %v", len(users)) + } + if users[0].Uid != 0 || users[0].Name != "root" { + t.Fatalf("Expected users[0] to be 0 - root, got %v - %v", users[0].Uid, users[0].Name) + } + if users[1].Uid != 3 || users[1].Name != "adm" { + t.Fatalf("Expected users[1] to be 3 - adm, got %v - %v", users[1].Uid, users[1].Name) + } +} + +func TestUserParseGroup(t *testing.T) { + groups, err := parseGroupFile(strings.NewReader(` +root:x:0:root +adm:x:4:root,adm,daemon +this is just some garbage data +`), nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + if len(groups) != 3 { + t.Fatalf("Expected 3 groups, got %v", len(groups)) + } + if groups[0].Gid != 0 || groups[0].Name != "root" || len(groups[0].List) != 1 { + t.Fatalf("Expected groups[0] to be 0 - root - 1 member, got %v - %v - %v", groups[0].Gid, groups[0].Name, len(groups[0].List)) + } + if groups[1].Gid != 4 || groups[1].Name != "adm" || len(groups[1].List) != 3 { + t.Fatalf("Expected groups[1] to be 4 - adm - 3 members, got %v - %v - %v", groups[1].Gid, groups[1].Name, len(groups[1].List)) + } +} diff --git a/version/version.go b/version/version.go new file mode 100644 index 0000000..3721d64 --- /dev/null +++ b/version/version.go @@ -0,0 +1,52 @@ +package version + +import ( + "strconv" + "strings" +) + +type Version string + +func (me Version) compareTo(other string) int { + var ( + meTab = strings.Split(string(me), ".") + otherTab = strings.Split(other, ".") + ) + for i, s := range meTab { + var meInt, otherInt int + meInt, _ = strconv.Atoi(s) + if len(otherTab) > i { + otherInt, _ = strconv.Atoi(otherTab[i]) + } + if meInt > otherInt { + return 1 + } + if otherInt > meInt { + return -1 + } + } + if len(otherTab) > len(meTab) { + return -1 + } + return 0 +} + +func (me Version) LessThan(other string) bool { + return me.compareTo(other) == -1 +} + +func (me Version) LessThanOrEqualTo(other string) bool { + return me.compareTo(other) <= 0 +} + +func (me Version) GreaterThan(other string) bool { + return me.compareTo(other) == 1 +} + +func (me Version) GreaterThanOrEqualTo(other string) bool { + return me.compareTo(other) >= 0 +} + +func (me Version) Equal(other string) bool { + return me.compareTo(other) == 0 +} diff --git a/version/version_test.go b/version/version_test.go new file mode 100644 index 0000000..4bebd0c --- /dev/null +++ b/version/version_test.go @@ -0,0 +1,25 @@ +package version + +import ( + "testing" +) + +func assertVersion(t *testing.T, a, b string, result int) { + if r := Version(a).compareTo(b); r != result { + t.Fatalf("Unexpected version comparison result. Found %d, expected %d", r, result) + } +} + +func TestCompareVersion(t *testing.T) { + assertVersion(t, "1.12", "1.12", 0) + assertVersion(t, "1.05.00.0156", "1.0.221.9289", 1) + assertVersion(t, "1", "1.0.1", -1) + assertVersion(t, "1.0.1", "1", 1) + assertVersion(t, "1.0.1", "1.0.2", -1) + assertVersion(t, "1.0.2", "1.0.3", -1) + assertVersion(t, "1.0.3", "1.1", -1) + assertVersion(t, "1.1", "1.1.1", -1) + assertVersion(t, "1.1.1", "1.1.2", -1) + assertVersion(t, "1.1.2", "1.2", -1) + +}