From ac97c2a2f19f5b6dae4302f2d24cebc4991538fe Mon Sep 17 00:00:00 2001 From: Tianon Gravi Date: Fri, 27 Dec 2013 10:47:42 -0700 Subject: [PATCH 001/117] Move UserLookup functionality into a separate pkg/user submodule that implements proper parsing of /etc/passwd and /etc/group, and use that to add support for "docker run -u user:group" and for getting supplementary groups (if ":group" is not specified) Docker-DCO-1.1-Signed-off-by: Andrew Page (github: tianon) --- user/MAINTAINERS | 1 + user/user.go | 245 ++++++++++++++++++++++++++++++++++++++++++++++ user/user_test.go | 94 ++++++++++++++++++ 3 files changed, 340 insertions(+) create mode 100644 user/MAINTAINERS create mode 100644 user/user.go create mode 100644 user/user_test.go diff --git a/user/MAINTAINERS b/user/MAINTAINERS new file mode 100644 index 0000000..18e05a3 --- /dev/null +++ b/user/MAINTAINERS @@ -0,0 +1 @@ +Tianon Gravi (@tianon) diff --git a/user/user.go b/user/user.go new file mode 100644 index 0000000..30fc90f --- /dev/null +++ b/user/user.go @@ -0,0 +1,245 @@ +package user + +import ( + "bufio" + "fmt" + "io" + "os" + "reflect" + "strconv" + "strings" +) + +type User struct { + Name string + Pass string + Uid int + Gid int + Gecos string + Home string + Shell string +} + +type Group struct { + Name string + Pass string + Gid int + List []string +} + +func parseLine(line string, v ...interface{}) { + if line == "" { + return + } + + parts := strings.Split(line, ":") + for i, p := range parts { + if len(v) <= i { + // if we have more "parts" than we have places to put them, bail for great "tolerance" of naughty configuration files + break + } + + t := reflect.TypeOf(v[i]) + if t.Kind() != reflect.Ptr { + // panic, because this is a programming/logic error, not a runtime one + panic("parseLine expects only pointers! 
argument " + strconv.Itoa(i) + " is not a pointer!") + } + + switch t.Elem().Kind() { + case reflect.String: + // "root", "adm", "/bin/bash" + *v[i].(*string) = p + case reflect.Int: + // "0", "4", "1000" + *v[i].(*int), _ = strconv.Atoi(p) + // ignore string to int conversion errors, for great "tolerance" of naughty configuration files + case reflect.Slice, reflect.Array: + // "", "root", "root,adm,daemon" + list := []string{} + if p != "" { + list = strings.Split(p, ",") + } + *v[i].(*[]string) = list + } + } +} + +func ParsePasswd() ([]*User, error) { + return ParsePasswdFilter(nil) +} + +func ParsePasswdFilter(filter func(*User) bool) ([]*User, error) { + f, err := os.Open("/etc/passwd") + if err != nil { + return nil, err + } + defer f.Close() + return parsePasswdFile(f, filter) +} + +func parsePasswdFile(r io.Reader, filter func(*User) bool) ([]*User, error) { + var ( + s = bufio.NewScanner(r) + out = []*User{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + text := strings.TrimSpace(s.Text()) + if text == "" { + continue + } + + // see: man 5 passwd + // name:password:UID:GID:GECOS:directory:shell + // Name:Pass:Uid:Gid:Gecos:Home:Shell + // root:x:0:0:root:/root:/bin/bash + // adm:x:3:4:adm:/var/adm:/bin/false + p := &User{} + parseLine( + text, + &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell, + ) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +func ParseGroup() ([]*Group, error) { + return ParseGroupFilter(nil) +} + +func ParseGroupFilter(filter func(*Group) bool) ([]*Group, error) { + f, err := os.Open("/etc/group") + if err != nil { + return nil, err + } + defer f.Close() + return parseGroupFile(f, filter) +} + +func parseGroupFile(r io.Reader, filter func(*Group) bool) ([]*Group, error) { + var ( + s = bufio.NewScanner(r) + out = []*Group{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + text := s.Text() + if text == "" { + 
continue + } + + // see: man 5 group + // group_name:password:GID:user_list + // Name:Pass:Gid:List + // root:x:0:root + // adm:x:4:root,adm,daemon + p := &Group{} + parseLine( + text, + &p.Name, &p.Pass, &p.Gid, &p.List, + ) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +// Given a string like "user", "1000", "user:group", "1000:1000", returns the uid, gid, and list of supplementary group IDs, if possible. +func GetUserGroupSupplementary(userSpec string, defaultUid int, defaultGid int) (int, int, []int, error) { + var ( + uid = defaultUid + gid = defaultGid + suppGids = []int{} + + userArg, groupArg string + ) + + // allow for userArg to have either "user" syntax, or optionally "user:group" syntax + parseLine(userSpec, &userArg, &groupArg) + + users, err := ParsePasswdFilter(func(u *User) bool { + if userArg == "" { + return u.Uid == uid + } + return u.Name == userArg || strconv.Itoa(u.Uid) == userArg + }) + if err != nil && !os.IsNotExist(err) { + if userArg == "" { + userArg = strconv.Itoa(uid) + } + return 0, 0, nil, fmt.Errorf("Unable to find user %v: %v", userArg, err) + } + + haveUser := users != nil && len(users) > 0 + if haveUser { + // if we found any user entries that matched our filter, let's take the first one as "correct" + uid = users[0].Uid + gid = users[0].Gid + } else if userArg != "" { + // we asked for a user but didn't find them... 
let's check to see if we wanted a numeric user + uid, err = strconv.Atoi(userArg) + if err != nil { + // not numeric - we have to bail + return 0, 0, nil, fmt.Errorf("Unable to find user %v", userArg) + } + + // if userArg couldn't be found in /etc/passwd but is numeric, just roll with it - this is legit + } + + if groupArg != "" || (haveUser && users[0].Name != "") { + groups, err := ParseGroupFilter(func(g *Group) bool { + if groupArg != "" { + return g.Name == groupArg || strconv.Itoa(g.Gid) == groupArg + } + for _, u := range g.List { + if u == users[0].Name { + return true + } + } + return false + }) + if err != nil && !os.IsNotExist(err) { + return 0, 0, nil, fmt.Errorf("Unable to find groups for user %v: %v", users[0].Name, err) + } + + haveGroup := groups != nil && len(groups) > 0 + if groupArg != "" { + if haveGroup { + // if we found any group entries that matched our filter, let's take the first one as "correct" + gid = groups[0].Gid + } else { + // we asked for a group but didn't find id... 
let's check to see if we wanted a numeric group + gid, err = strconv.Atoi(groupArg) + if err != nil { + // not numeric - we have to bail + return 0, 0, nil, fmt.Errorf("Unable to find group %v", groupArg) + } + + // if groupArg couldn't be found in /etc/group but is numeric, just roll with it - this is legit + } + } else if haveGroup { + suppGids = make([]int, len(groups)) + for i, group := range groups { + suppGids[i] = group.Gid + } + } + } + + return uid, gid, suppGids, nil +} diff --git a/user/user_test.go b/user/user_test.go new file mode 100644 index 0000000..136632c --- /dev/null +++ b/user/user_test.go @@ -0,0 +1,94 @@ +package user + +import ( + "strings" + "testing" +) + +func TestUserParseLine(t *testing.T) { + var ( + a, b string + c []string + d int + ) + + parseLine("", &a, &b) + if a != "" || b != "" { + t.Fatalf("a and b should be empty ('%v', '%v')", a, b) + } + + parseLine("a", &a, &b) + if a != "a" || b != "" { + t.Fatalf("a should be 'a' and b should be empty ('%v', '%v')", a, b) + } + + parseLine("bad boys:corny cows", &a, &b) + if a != "bad boys" || b != "corny cows" { + t.Fatalf("a should be 'bad boys' and b should be 'corny cows' ('%v', '%v')", a, b) + } + + parseLine("", &c) + if len(c) != 0 { + t.Fatalf("c should be empty (%#v)", c) + } + + parseLine("d,e,f:g:h:i,j,k", &c, &a, &b, &c) + if a != "g" || b != "h" || len(c) != 3 || c[0] != "i" || c[1] != "j" || c[2] != "k" { + t.Fatalf("a should be 'g', b should be 'h', and c should be ['i','j','k'] ('%v', '%v', '%#v')", a, b, c) + } + + parseLine("::::::::::", &a, &b, &c) + if a != "" || b != "" || len(c) != 0 { + t.Fatalf("a, b, and c should all be empty ('%v', '%v', '%#v')", a, b, c) + } + + parseLine("not a number", &d) + if d != 0 { + t.Fatalf("d should be 0 (%v)", d) + } + + parseLine("b:12:c", &a, &d, &b) + if a != "b" || b != "c" || d != 12 { + t.Fatalf("a should be 'b' and b should be 'c', and d should be 12 ('%v', '%v', %v)", a, b, d) + } +} + +func TestUserParsePasswd(t *testing.T) 
{ + users, err := parsePasswdFile(strings.NewReader(` +root:x:0:0:root:/root:/bin/bash +adm:x:3:4:adm:/var/adm:/bin/false +this is just some garbage data +`), nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + if len(users) != 3 { + t.Fatalf("Expected 3 users, got %v", len(users)) + } + if users[0].Uid != 0 || users[0].Name != "root" { + t.Fatalf("Expected users[0] to be 0 - root, got %v - %v", users[0].Uid, users[0].Name) + } + if users[1].Uid != 3 || users[1].Name != "adm" { + t.Fatalf("Expected users[1] to be 3 - adm, got %v - %v", users[1].Uid, users[1].Name) + } +} + +func TestUserParseGroup(t *testing.T) { + groups, err := parseGroupFile(strings.NewReader(` +root:x:0:root +adm:x:4:root,adm,daemon +this is just some garbage data +`), nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + if len(groups) != 3 { + t.Fatalf("Expected 3 groups, got %v", len(groups)) + } + if groups[0].Gid != 0 || groups[0].Name != "root" || len(groups[0].List) != 1 { + t.Fatalf("Expected groups[0] to be 0 - root - 1 member, got %v - %v - %v", groups[0].Gid, groups[0].Name, len(groups[0].List)) + } + if groups[1].Gid != 4 || groups[1].Name != "adm" || len(groups[1].List) != 3 { + t.Fatalf("Expected groups[1] to be 4 - adm - 3 members, got %v - %v - %v", groups[1].Gid, groups[1].Name, len(groups[1].List)) + } +} From 91ac5f5f609d12f5aa5ea10a1c0542da57f6cecb Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 17 Jan 2014 13:41:38 -0800 Subject: [PATCH 002/117] Use type switch instead of reflection Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- user/user.go | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/user/user.go b/user/user.go index 30fc90f..1672f7e 100644 --- a/user/user.go +++ b/user/user.go @@ -5,7 +5,6 @@ import ( "fmt" "io" "os" - "reflect" "strconv" "strings" ) @@ -39,28 +38,25 @@ func parseLine(line string, v ...interface{}) { break } - t := 
reflect.TypeOf(v[i]) - if t.Kind() != reflect.Ptr { + switch e := v[i].(type) { + case *string: + // "root", "adm", "/bin/bash" + *e = p + case *int: + // "0", "4", "1000" + // ignore string to int conversion errors, for great "tolerance" of naughty configuration files + *e, _ = strconv.Atoi(p) + case *[]string: + // "", "root", "root,adm,daemon" + if p != "" { + *e = strings.Split(p, ",") + } else { + *e = []string{} + } + default: // panic, because this is a programming/logic error, not a runtime one panic("parseLine expects only pointers! argument " + strconv.Itoa(i) + " is not a pointer!") } - - switch t.Elem().Kind() { - case reflect.String: - // "root", "adm", "/bin/bash" - *v[i].(*string) = p - case reflect.Int: - // "0", "4", "1000" - *v[i].(*int), _ = strconv.Atoi(p) - // ignore string to int conversion errors, for great "tolerance" of naughty configuration files - case reflect.Slice, reflect.Array: - // "", "root", "root,adm,daemon" - list := []string{} - if p != "" { - list = strings.Split(p, ",") - } - *v[i].(*[]string) = list - } } } From 1eb74699b50017fbc33ef173322010c1b8ecfea9 Mon Sep 17 00:00:00 2001 From: Brandon Philips Date: Thu, 6 Feb 2014 11:34:25 -0800 Subject: [PATCH 003/117] pkg: systemd: add initial MAINTAINERS I volunteered for pkg/systemd MAINTAINER and there were no objections during the #docker-dev meeting. For context I wrote most of the stuff in here and wrote the dependent calls in api.go. Plus, I actively test the code via CoreOS. 
Docker-DCO-1.1-Signed-off-by: Brandon Philips (github: philips) --- systemd/MAINTAINERS | 1 + 1 file changed, 1 insertion(+) create mode 100644 systemd/MAINTAINERS diff --git a/systemd/MAINTAINERS b/systemd/MAINTAINERS new file mode 100644 index 0000000..51228b3 --- /dev/null +++ b/systemd/MAINTAINERS @@ -0,0 +1 @@ +Brandon Philips (@philips) From a5bf8abb7723ae7d47188ab9f97f06129a53f726 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Sat, 8 Feb 2014 09:53:04 -0800 Subject: [PATCH 004/117] Add set master for interface Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- netlink/netlink_linux.go | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index 0ea5b4d..01b3ff0 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -386,6 +386,39 @@ func NetworkSetMTU(iface *net.Interface, mtu int) error { return s.HandleAck(wb.Seq) } +// same as ip link set $name master $master +func NetworkSetMaster(iface, master *net.Interface) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_SETLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Type = syscall.RTM_SETLINK + msg.Flags = syscall.NLM_F_REQUEST + msg.Index = int32(iface.Index) + msg.Change = 0xFFFFFFFF + wb.AddData(msg) + + var ( + b = make([]byte, 4) + native = nativeEndian() + ) + native.PutUint32(b, uint32(master.Index)) + + data := newRtAttr(syscall.IFLA_MASTER, b) + wb.AddData(data) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + // Add an Ip address to an interface. 
This is identical to: // ip addr add $ip/$ipNet dev $iface func NetworkLinkAddIp(iface *net.Interface, ip net.IP, ipNet *net.IPNet) error { From 708c7be9d213cdf85c5279d082f480eaf072e6cd Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Sat, 8 Feb 2014 10:03:16 -0800 Subject: [PATCH 005/117] Add network set interface in namespace by pid Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- netlink/netlink_linux.go | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index 01b3ff0..e69635e 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -419,6 +419,38 @@ func NetworkSetMaster(iface, master *net.Interface) error { return s.HandleAck(wb.Seq) } +func NetworkSetNsPid(iface *net.Interface, nspid int) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_SETLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Type = syscall.RTM_SETLINK + msg.Flags = syscall.NLM_F_REQUEST + msg.Index = int32(iface.Index) + msg.Change = 0xFFFFFFFF + wb.AddData(msg) + + var ( + b = make([]byte, 4) + native = nativeEndian() + ) + native.PutUint32(b, uint32(nspid)) + + data := newRtAttr(syscall.IFLA_NET_NS_PID, b) + wb.AddData(data) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + // Add an Ip address to an interface. 
This is identical to: // ip addr add $ip/$ipNet dev $iface func NetworkLinkAddIp(iface *net.Interface, ip net.IP, ipNet *net.IPNet) error { From 971834148cb0c2e688740f2bdac13b1ebae65cd7 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Sat, 8 Feb 2014 20:44:04 -0800 Subject: [PATCH 006/117] Use c to change interface name Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- netlink/netlink_linux.go | 66 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index e69635e..77ebfcd 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -2,6 +2,60 @@ package netlink +/* +#include +#include +#include +#include + +static int get_socket(void) { + int s_errno; + int fd; + + fd = socket(PF_INET, SOCK_DGRAM, 0); + if (fd >= 0) { + return fd; + } + s_errno = errno; + + fd = socket(PF_PACKET, SOCK_DGRAM, 0); + if (fd >= 0) { + return fd; + } + + fd = socket(PF_INET6, SOCK_DGRAM, 0); + if (fd >= 0) { + return fd; + } + errno = s_errno; + return -1; +} + + +static int change_name(const char *old_name, const char *new_name) { + struct ifreq ifr; + int err; + int fd; + + fd = get_socket(); + if (fd < 0) { + return -1; + } + + strncpy(ifr.ifr_name, old_name, IFNAMSIZ); + strncpy(ifr.ifr_newname, new_name, IFNAMSIZ); + + err = ioctl(fd, SIOCSIFNAME, &ifr); + if (err) { + close(fd); + return -1; + } + close(fd); + return err; +} +*/ +import "C" + import ( "encoding/binary" "fmt" @@ -641,3 +695,15 @@ done: return res, nil } + +func NetworkChangeName(oldName, newName string) error { + var ( + cold = C.CString(oldName) + cnew = C.CString(newName) + ) + + if errno := int(C.change_name(cold, cnew)); errno != 0 { + return fmt.Errorf("unable to change name %d", errno) + } + return nil +} From 4f9817a3caabaeba27086187aabf63231034d978 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Sun, 9 Feb 2014 05:54:13 -0800 Subject: [PATCH 007/117] Replace my C code with tianons 
Go code Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- netlink/netlink_linux.go | 97 +++++++++++++++------------------------- 1 file changed, 35 insertions(+), 62 deletions(-) diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index 77ebfcd..46bd3d8 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -2,60 +2,6 @@ package netlink -/* -#include -#include -#include -#include - -static int get_socket(void) { - int s_errno; - int fd; - - fd = socket(PF_INET, SOCK_DGRAM, 0); - if (fd >= 0) { - return fd; - } - s_errno = errno; - - fd = socket(PF_PACKET, SOCK_DGRAM, 0); - if (fd >= 0) { - return fd; - } - - fd = socket(PF_INET6, SOCK_DGRAM, 0); - if (fd >= 0) { - return fd; - } - errno = s_errno; - return -1; -} - - -static int change_name(const char *old_name, const char *new_name) { - struct ifreq ifr; - int err; - int fd; - - fd = get_socket(); - if (fd < 0) { - return -1; - } - - strncpy(ifr.ifr_name, old_name, IFNAMSIZ); - strncpy(ifr.ifr_newname, new_name, IFNAMSIZ); - - err = ioctl(fd, SIOCSIFNAME, &ifr); - if (err) { - close(fd); - return -1; - } - close(fd); - return err; -} -*/ -import "C" - import ( "encoding/binary" "fmt" @@ -696,14 +642,41 @@ done: return res, nil } -func NetworkChangeName(oldName, newName string) error { - var ( - cold = C.CString(oldName) - cnew = C.CString(newName) - ) - - if errno := int(C.change_name(cold, cnew)); errno != 0 { - return fmt.Errorf("unable to change name %d", errno) +func getIfSocket() (int, error) { + fd, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_DGRAM, 0) + if err == nil { + return fd, err } + sErr := err + + fd, err = syscall.Socket(syscall.AF_PACKET, syscall.SOCK_DGRAM, 0) + if err == nil { + return fd, err + } + + fd, err = syscall.Socket(syscall.AF_INET6, syscall.SOCK_DGRAM, 0) + if err == nil { + return fd, err + } + + return -1, sErr +} + +func NetworkChangeName(oldName, newName string) error { + fd, err := getIfSocket() + if err != nil { + 
return err + } + defer syscall.Close(fd) + IFNAMSIZ := 16 + + data := [32]byte{} + copy(data[:IFNAMSIZ-1], oldName) + copy(data[IFNAMSIZ:IFNAMSIZ*2-1], newName) + + if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.SIOCSIFNAME, uintptr(unsafe.Pointer(&data[0]))); errno != 0 { + return errno + } + return nil } From e44920cbe6770b7df8d50c489dd74ef90b1f8006 Mon Sep 17 00:00:00 2001 From: Tianon Gravi Date: Sun, 9 Feb 2014 18:12:43 -0700 Subject: [PATCH 008/117] Update NetworkChangeName to be more similar to my original (moving IFNAMSIZ constant outside the function like it should've been) Docker-DCO-1.1-Signed-off-by: Andrew Page (github: tianon) --- netlink/netlink_linux.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index 46bd3d8..b94cbf6 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -662,15 +662,17 @@ func getIfSocket() (int, error) { return -1, sErr } +// from +const IFNAMSIZ = 16 + func NetworkChangeName(oldName, newName string) error { fd, err := getIfSocket() if err != nil { return err } defer syscall.Close(fd) - IFNAMSIZ := 16 - data := [32]byte{} + data := [IFNAMSIZ * 2]byte{} copy(data[:IFNAMSIZ-1], oldName) copy(data[IFNAMSIZ:IFNAMSIZ*2-1], newName) From 4b4a12a4c3fcab1c06524f38854faa6b25c00e64 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 10 Feb 2014 11:36:23 -0800 Subject: [PATCH 009/117] Improve get if socket loop Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- netlink/netlink_linux.go | 43 +++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index b94cbf6..4e091ef 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -10,6 +10,12 @@ import ( "unsafe" ) +const ( + IFNAMSIZ = 16 + DEFAULT_CHANGE = 0xFFFFFFFF + IFLA_INFO_KIND = 1 +) + var nextSeqNr int func nativeEndian() 
binary.ByteOrder { @@ -368,7 +374,7 @@ func NetworkSetMTU(iface *net.Interface, mtu int) error { msg.Type = syscall.RTM_SETLINK msg.Flags = syscall.NLM_F_REQUEST msg.Index = int32(iface.Index) - msg.Change = 0xFFFFFFFF + msg.Change = DEFAULT_CHANGE wb.AddData(msg) var ( @@ -400,7 +406,7 @@ func NetworkSetMaster(iface, master *net.Interface) error { msg.Type = syscall.RTM_SETLINK msg.Flags = syscall.NLM_F_REQUEST msg.Index = int32(iface.Index) - msg.Change = 0xFFFFFFFF + msg.Change = DEFAULT_CHANGE wb.AddData(msg) var ( @@ -432,7 +438,7 @@ func NetworkSetNsPid(iface *net.Interface, nspid int) error { msg.Type = syscall.RTM_SETLINK msg.Flags = syscall.NLM_F_REQUEST msg.Index = int32(iface.Index) - msg.Change = 0xFFFFFFFF + msg.Change = DEFAULT_CHANGE wb.AddData(msg) var ( @@ -524,8 +530,6 @@ func NetworkLinkAdd(name string, linkType string) error { nameData := newRtAttr(syscall.IFLA_IFNAME, zeroTerminated(name)) wb.AddData(nameData) - IFLA_INFO_KIND := 1 - kindData := newRtAttr(IFLA_INFO_KIND, nonZeroTerminated(linkType)) infoData := newRtAttr(syscall.IFLA_LINKINFO, kindData.ToWireFormat()) @@ -642,29 +646,22 @@ done: return res, nil } -func getIfSocket() (int, error) { - fd, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_DGRAM, 0) - if err == nil { - return fd, err +func getIfSocket() (fd int, err error) { + for _, socket := range []int{ + syscall.AF_INET, + syscall.AF_PACKET, + syscall.AF_INET6, + } { + if fd, err = syscall.Socket(socket, syscall.SOCK_DGRAM, 0); err == nil { + break + } } - sErr := err - - fd, err = syscall.Socket(syscall.AF_PACKET, syscall.SOCK_DGRAM, 0) if err == nil { - return fd, err + return fd, nil } - - fd, err = syscall.Socket(syscall.AF_INET6, syscall.SOCK_DGRAM, 0) - if err == nil { - return fd, err - } - - return -1, sErr + return -1, err } -// from -const IFNAMSIZ = 16 - func NetworkChangeName(oldName, newName string) error { fd, err := getIfSocket() if err != nil { From f62dcac1c5ccc9a9a32df6689f7689d181348727 Mon Sep 17 00:00:00 
2001 From: Michael Crosby Date: Mon, 10 Feb 2014 13:37:16 -0800 Subject: [PATCH 010/117] Create veth pair via netlink Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- netlink/netlink_linux.go | 43 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index 4e091ef..52c77a2 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -14,6 +14,8 @@ const ( IFNAMSIZ = 16 DEFAULT_CHANGE = 0xFFFFFFFF IFLA_INFO_KIND = 1 + IFLA_INFO_DATA = 2 + VETH_PEER = 1 ) var nextSeqNr int @@ -197,7 +199,9 @@ func (rr *NetlinkRequest) ToWireFormat() []byte { } func (rr *NetlinkRequest) AddData(data NetlinkRequestData) { - rr.Data = append(rr.Data, data) + if data != nil { + rr.Data = append(rr.Data, data) + } } func newNetlinkRequest(proto, flags int) *NetlinkRequest { @@ -676,6 +680,41 @@ func NetworkChangeName(oldName, newName string) error { if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.SIOCSIFNAME, uintptr(unsafe.Pointer(&data[0]))); errno != 0 { return errno } - return nil } + +func NetworkCreateVethPair(name1, name2 string) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_NEWLINK, syscall.NLM_F_CREATE|syscall.NLM_F_EXCL|syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + wb.AddData(msg) + + kindData := newRtAttr(IFLA_INFO_KIND, nonZeroTerminated("veth")) + info := newRtAttr(syscall.IFLA_LINKINFO, kindData.ToWireFormat()) + // wb.AddData(info) + + peerName := newRtAttr(syscall.IFLA_IFNAME, nonZeroTerminated(name2)) + peer := newRtAttr(VETH_PEER, peerName.ToWireFormat()) + // wb.AddData(peer) + + b := []byte{} + b = append(b, peer.ToWireFormat()...) + b = append(b, info.ToWireFormat()...) 
+ + infoData := newRtAttr(IFLA_INFO_DATA, b) + wb.AddData(infoData) + + nameData := newRtAttr(syscall.IFLA_IFNAME, zeroTerminated(name1)) + wb.AddData(nameData) + + if err := s.Send(wb); err != nil { + return err + } + return s.HandleAck(wb.Seq) +} From a37785b64f003c7e55346a14d48018cceee54e33 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 10 Feb 2014 16:41:16 -0800 Subject: [PATCH 011/117] Allow add of empty name Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- netlink/netlink_linux.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index 52c77a2..23dba0c 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -531,8 +531,10 @@ func NetworkLinkAdd(name string, linkType string) error { msg := newIfInfomsg(syscall.AF_UNSPEC) wb.AddData(msg) - nameData := newRtAttr(syscall.IFLA_IFNAME, zeroTerminated(name)) - wb.AddData(nameData) + if name != "" { + nameData := newRtAttr(syscall.IFLA_IFNAME, zeroTerminated(name)) + wb.AddData(nameData) + } kindData := newRtAttr(IFLA_INFO_KIND, nonZeroTerminated(linkType)) From 1d3028efa35fc58204236c31dbb6293a1ea782d0 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 10 Feb 2014 22:32:07 -0800 Subject: [PATCH 012/117] Add more netlink functions for set ns by fd and bring iface down Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- netlink/netlink_linux.go | 61 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index 23dba0c..3b21a38 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -15,7 +15,8 @@ const ( DEFAULT_CHANGE = 0xFFFFFFFF IFLA_INFO_KIND = 1 IFLA_INFO_DATA = 2 - VETH_PEER = 1 + VETH_INFO_PEER = 1 + IFLA_NET_NS_FD = 28 ) var nextSeqNr int @@ -365,6 +366,28 @@ func NetworkLinkUp(iface *net.Interface) error { return s.HandleAck(wb.Seq) } +func 
NetworkLinkDown(iface *net.Interface) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_NEWLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Change = syscall.IFF_UP + msg.Flags = 0 & ^syscall.IFF_UP + msg.Index = int32(iface.Index) + wb.AddData(msg) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + func NetworkSetMTU(iface *net.Interface, mtu int) error { s, err := getNetlinkSocket() if err != nil { @@ -461,6 +484,38 @@ func NetworkSetNsPid(iface *net.Interface, nspid int) error { return s.HandleAck(wb.Seq) } +func NetworkSetNsFd(iface *net.Interface, fd int) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_SETLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Type = syscall.RTM_SETLINK + msg.Flags = syscall.NLM_F_REQUEST + msg.Index = int32(iface.Index) + msg.Change = DEFAULT_CHANGE + wb.AddData(msg) + + var ( + b = make([]byte, 4) + native = nativeEndian() + ) + native.PutUint32(b, uint32(fd)) + + data := newRtAttr(IFLA_NET_NS_FD, b) + wb.AddData(data) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + // Add an Ip address to an interface. 
This is identical to: // ip addr add $ip/$ipNet dev $iface func NetworkLinkAddIp(iface *net.Interface, ip net.IP, ipNet *net.IPNet) error { @@ -668,7 +723,7 @@ func getIfSocket() (fd int, err error) { return -1, err } -func NetworkChangeName(oldName, newName string) error { +func NetworkChangeName(iface *net.Interface, newName string) error { fd, err := getIfSocket() if err != nil { return err @@ -676,7 +731,7 @@ func NetworkChangeName(oldName, newName string) error { defer syscall.Close(fd) data := [IFNAMSIZ * 2]byte{} - copy(data[:IFNAMSIZ-1], oldName) + copy(data[:IFNAMSIZ-1], iface.Name) copy(data[IFNAMSIZ:IFNAMSIZ*2-1], newName) if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.SIOCSIFNAME, uintptr(unsafe.Pointer(&data[0]))); errno != 0 { From f697f1d648b5340474b056cdef61bd350ae21493 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 11 Feb 2014 03:32:35 -0800 Subject: [PATCH 013/117] Exec out to ip right now for creating the veth pair Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- netlink/netlink_linux.go | 36 ++++-------------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index 3b21a38..b9e04a3 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -6,6 +6,7 @@ import ( "encoding/binary" "fmt" "net" + "os/exec" "syscall" "unsafe" ) @@ -741,37 +742,8 @@ func NetworkChangeName(iface *net.Interface, newName string) error { } func NetworkCreateVethPair(name1, name2 string) error { - s, err := getNetlinkSocket() - if err != nil { - return err + if data, err := exec.Command("ip", "link", "add", name1, "type", "veth", "peer", "name", name2).Output(); err != nil { + return fmt.Errorf("%s %s", data, err) } - defer s.Close() - - wb := newNetlinkRequest(syscall.RTM_NEWLINK, syscall.NLM_F_CREATE|syscall.NLM_F_EXCL|syscall.NLM_F_ACK) - - msg := newIfInfomsg(syscall.AF_UNSPEC) - wb.AddData(msg) - - kindData := 
newRtAttr(IFLA_INFO_KIND, nonZeroTerminated("veth")) - info := newRtAttr(syscall.IFLA_LINKINFO, kindData.ToWireFormat()) - // wb.AddData(info) - - peerName := newRtAttr(syscall.IFLA_IFNAME, nonZeroTerminated(name2)) - peer := newRtAttr(VETH_PEER, peerName.ToWireFormat()) - // wb.AddData(peer) - - b := []byte{} - b = append(b, peer.ToWireFormat()...) - b = append(b, info.ToWireFormat()...) - - infoData := newRtAttr(IFLA_INFO_DATA, b) - wb.AddData(infoData) - - nameData := newRtAttr(syscall.IFLA_IFNAME, zeroTerminated(name1)) - wb.AddData(nameData) - - if err := s.Send(wb); err != nil { - return err - } - return s.HandleAck(wb.Seq) + return nil } From 6675d69513ce71e2b4acf35e008922afa132d47d Mon Sep 17 00:00:00 2001 From: Tianon Gravi Date: Mon, 10 Feb 2014 15:11:17 -0700 Subject: [PATCH 014/117] Add comment clarifying null termination Docker-DCO-1.1-Signed-off-by: Andrew Page (github: tianon) --- netlink/netlink_linux.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index 3b21a38..85dac51 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -731,6 +731,8 @@ func NetworkChangeName(iface *net.Interface, newName string) error { defer syscall.Close(fd) data := [IFNAMSIZ * 2]byte{} + // the "-1"s here are very important for ensuring we get proper null + // termination of our new C strings copy(data[:IFNAMSIZ-1], iface.Name) copy(data[IFNAMSIZ:IFNAMSIZ*2-1], newName) From cf90100c6cf091f284af627f7419dc772b289c81 Mon Sep 17 00:00:00 2001 From: Solomon Hykes Date: Tue, 11 Feb 2014 18:46:55 -0800 Subject: [PATCH 015/117] pkg/opts: a collection of custom value parsers implementing flag.Value This facilitates the refactoring of commands.go. 
Docker-DCO-1.1-Signed-off-by: Solomon Hykes (github: shykes) --- opts/opts.go | 148 ++++++++++++++++++++++++++++++++++++++++++++++ opts/opts_test.go | 24 ++++++++ 2 files changed, 172 insertions(+) create mode 100644 opts/opts.go create mode 100644 opts/opts_test.go diff --git a/opts/opts.go b/opts/opts.go new file mode 100644 index 0000000..a1b8752 --- /dev/null +++ b/opts/opts.go @@ -0,0 +1,148 @@ +package opts + +import ( + "fmt" + "github.com/dotcloud/docker/utils" + "os" + "path/filepath" + "regexp" + "strings" +) + +// ListOpts type +type ListOpts struct { + values []string + validator ValidatorFctType +} + +func NewListOpts(validator ValidatorFctType) ListOpts { + return ListOpts{ + validator: validator, + } +} + +func (opts *ListOpts) String() string { + return fmt.Sprintf("%v", []string(opts.values)) +} + +// Set validates if needed the input value and add it to the +// internal slice. +func (opts *ListOpts) Set(value string) error { + if opts.validator != nil { + v, err := opts.validator(value) + if err != nil { + return err + } + value = v + } + opts.values = append(opts.values, value) + return nil +} + +// Delete remove the given element from the slice. +func (opts *ListOpts) Delete(key string) { + for i, k := range opts.values { + if k == key { + opts.values = append(opts.values[:i], opts.values[i+1:]...) + return + } + } +} + +// GetMap returns the content of values in a map in order to avoid +// duplicates. +// FIXME: can we remove this? +func (opts *ListOpts) GetMap() map[string]struct{} { + ret := make(map[string]struct{}) + for _, k := range opts.values { + ret[k] = struct{}{} + } + return ret +} + +// GetAll returns the values' slice. +// FIXME: Can we remove this? +func (opts *ListOpts) GetAll() []string { + return opts.values +} + +// Get checks the existence of the given key. 
+func (opts *ListOpts) Get(key string) bool { + for _, k := range opts.values { + if k == key { + return true + } + } + return false +} + +// Len returns the amount of element in the slice. +func (opts *ListOpts) Len() int { + return len(opts.values) +} + +// Validators +type ValidatorFctType func(val string) (string, error) + +func ValidateAttach(val string) (string, error) { + if val != "stdin" && val != "stdout" && val != "stderr" { + return val, fmt.Errorf("Unsupported stream name: %s", val) + } + return val, nil +} + +func ValidateLink(val string) (string, error) { + if _, err := parseLink(val); err != nil { + return val, err + } + return val, nil +} + +// FIXME: this is a duplicate of docker.utils.parseLink. +// it can't be moved to a separate links/ package because +// links depends on Container which is defined in the core. +// +// Links come in the format of +// name:alias +func parseLink(rawLink string) (map[string]string, error) { + return utils.PartParser("name:alias", rawLink) +} + +func ValidatePath(val string) (string, error) { + var containerPath string + + if strings.Count(val, ":") > 2 { + return val, fmt.Errorf("bad format for volumes: %s", val) + } + + splited := strings.SplitN(val, ":", 2) + if len(splited) == 1 { + containerPath = splited[0] + val = filepath.Clean(splited[0]) + } else { + containerPath = splited[1] + val = fmt.Sprintf("%s:%s", splited[0], filepath.Clean(splited[1])) + } + + if !filepath.IsAbs(containerPath) { + return val, fmt.Errorf("%s is not an absolute path", containerPath) + } + return val, nil +} + +func ValidateEnv(val string) (string, error) { + arr := strings.Split(val, "=") + if len(arr) > 1 { + return val, nil + } + return fmt.Sprintf("%s=%s", val, os.Getenv(val)), nil +} + +func ValidateIp4Address(val string) (string, error) { + re := regexp.MustCompile(`^(([0-9]+\.){3}([0-9]+))\s*$`) + var ns = re.FindSubmatch([]byte(val)) + if len(ns) > 0 { + return string(ns[1]), nil + } + return "", fmt.Errorf("%s is not an ip4 
address", val) +} diff --git a/opts/opts_test.go b/opts/opts_test.go new file mode 100644 index 0000000..a5c1fac --- /dev/null +++ b/opts/opts_test.go @@ -0,0 +1,24 @@ +package opts + +import ( + "testing" +) + +func TestValidateIP4(t *testing.T) { + if ret, err := ValidateIp4Address(`1.2.3.4`); err != nil || ret == "" { + t.Fatalf("ValidateIp4Address(`1.2.3.4`) got %s %s", ret, err) + } + + if ret, err := ValidateIp4Address(`127.0.0.1`); err != nil || ret == "" { + t.Fatalf("ValidateIp4Address(`127.0.0.1`) got %s %s", ret, err) + } + + if ret, err := ValidateIp4Address(`127`); err == nil || ret != "" { + t.Fatalf("ValidateIp4Address(`127`) got %s %s", ret, err) + } + + if ret, err := ValidateIp4Address(`random invalid string`); err == nil || ret != "" { + t.Fatalf("ValidateIp4Address(`random invalid string`) got %s %s", ret, err) + } + +} From 000df043485ae009044172ecd64cd712075f8143 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Wed, 12 Feb 2014 04:09:56 -0800 Subject: [PATCH 016/117] Implement create veth Docker-DCO-1.1-Signed-off-by: Guillaume J. 
Charmes (github: creack) --- netlink/netlink_linux.go | 84 +++++++++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 31 deletions(-) diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index 23dba0c..b16dec0 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -142,29 +142,61 @@ func rtaAlignOf(attrlen int) int { type RtAttr struct { syscall.RtAttr - Data []byte + Data []byte + children []*RtAttr + prefix int } func newRtAttr(attrType int, data []byte) *RtAttr { - attr := &RtAttr{} + attr := &RtAttr{ + children: []*RtAttr{}, + } attr.Type = uint16(attrType) attr.Data = data return attr } -func (attr *RtAttr) ToWireFormat() []byte { +func newRtAttrChild(parent *RtAttr, attrType int, data []byte) *RtAttr { + attr := newRtAttr(attrType, data) + parent.children = append(parent.children, attr) + return attr +} + +func (a *RtAttr) length() int { + l := 0 + for _, child := range a.children { + l += child.length() + syscall.SizeofRtAttr + child.prefix + } + if l == 0 { + l++ + } + return rtaAlignOf(l + len(a.Data)) +} + +func (a *RtAttr) ToWireFormat() []byte { native := nativeEndian() - len := syscall.SizeofRtAttr + len(attr.Data) - b := make([]byte, rtaAlignOf(len)) - native.PutUint16(b[0:2], uint16(len)) - native.PutUint16(b[2:4], attr.Type) - for i, d := range attr.Data { - b[4+i] = d + length := a.length() + buf := make([]byte, rtaAlignOf(length+syscall.SizeofRtAttr)) + + if a.Data != nil { + copy(buf[4:], a.Data) + } else { + next := 4 + for _, child := range a.children { + childBuf := child.ToWireFormat() + copy(buf[next+child.prefix:], childBuf) + next += rtaAlignOf(len(childBuf)) + } } - return b + if l := uint16(rtaAlignOf(length)); l != 0 { + native.PutUint16(buf[0:2], l+1) + } + native.PutUint16(buf[2:4], a.Type) + + return buf } type NetlinkRequest struct { @@ -501,12 +533,7 @@ func NetworkLinkAddIp(iface *net.Interface, ip net.IP, ipNet *net.IPNet) error { } func zeroTerminated(s string) []byte { - bytes := 
make([]byte, len(s)+1) - for i := 0; i < len(s); i++ { - bytes[i] = s[i] - } - bytes[len(s)] = 0 - return bytes + return []byte(s + "\000") } func nonZeroTerminated(s string) []byte { @@ -697,24 +724,19 @@ func NetworkCreateVethPair(name1, name2 string) error { msg := newIfInfomsg(syscall.AF_UNSPEC) wb.AddData(msg) - kindData := newRtAttr(IFLA_INFO_KIND, nonZeroTerminated("veth")) - info := newRtAttr(syscall.IFLA_LINKINFO, kindData.ToWireFormat()) - // wb.AddData(info) - - peerName := newRtAttr(syscall.IFLA_IFNAME, nonZeroTerminated(name2)) - peer := newRtAttr(VETH_PEER, peerName.ToWireFormat()) - // wb.AddData(peer) - - b := []byte{} - b = append(b, peer.ToWireFormat()...) - b = append(b, info.ToWireFormat()...) - - infoData := newRtAttr(IFLA_INFO_DATA, b) - wb.AddData(infoData) - nameData := newRtAttr(syscall.IFLA_IFNAME, zeroTerminated(name1)) wb.AddData(nameData) + nest1 := newRtAttr(syscall.IFLA_LINKINFO, nil) + newRtAttrChild(nest1, IFLA_INFO_KIND, zeroTerminated("veth")) + nest2 := newRtAttrChild(nest1, IFLA_INFO_DATA, nil) + nest3 := newRtAttrChild(nest2, VETH_PEER, nil) + + last := newRtAttrChild(nest3, syscall.IFLA_IFNAME, zeroTerminated(name2)) + last.prefix = syscall.SizeofIfInfomsg + + wb.AddData(nest1) + if err := s.Send(wb); err != nil { return err } From a890f18e9aa26470d435b875c826a2bf1966ee35 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Wed, 12 Feb 2014 09:29:06 -0800 Subject: [PATCH 017/117] Simplify code + Allow more generic attr children + remove prefix Docker-DCO-1.1-Signed-off-by: Guillaume J. 
Charmes (github: creack) --- netlink/netlink.go | 10 ++- netlink/netlink_linux.go | 131 +++++++++++++++++++++------------------ 2 files changed, 79 insertions(+), 62 deletions(-) diff --git a/netlink/netlink.go b/netlink/netlink.go index 5098b4b..5cc7562 100644 --- a/netlink/netlink.go +++ b/netlink/netlink.go @@ -5,7 +5,15 @@ // netlink_darwin.go package netlink -import "net" +import ( + "errors" + "net" +) + +var ( + ErrWrongSockType = errors.New("Wrong socket type") + ErrShortResponse = errors.New("Got short response from netlink") +) // A Route is a subnet associated with the interface to reach it. type Route struct { diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index 1f48a64..f8bb6ba 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -45,6 +45,7 @@ func getIpFamily(ip net.IP) int { } type NetlinkRequestData interface { + Len() int ToWireFormat() []byte } @@ -53,21 +54,24 @@ type IfInfomsg struct { } func newIfInfomsg(family int) *IfInfomsg { - msg := &IfInfomsg{} - msg.Family = uint8(family) - msg.Type = uint16(0) - msg.Index = int32(0) - msg.Flags = uint32(0) - msg.Change = uint32(0) + return &IfInfomsg{ + IfInfomsg: syscall.IfInfomsg{ + Family: uint8(family), + }, + } +} +func newIfInfomsgChild(parent *RtAttr, family int) *IfInfomsg { + msg := newIfInfomsg(family) + parent.children = append(parent.children, msg) return msg } func (msg *IfInfomsg) ToWireFormat() []byte { native := nativeEndian() - len := syscall.SizeofIfInfomsg - b := make([]byte, len) + length := syscall.SizeofIfInfomsg + b := make([]byte, length) b[0] = msg.Family b[1] = 0 native.PutUint16(b[2:4], msg.Type) @@ -77,26 +81,27 @@ func (msg *IfInfomsg) ToWireFormat() []byte { return b } +func (msg *IfInfomsg) Len() int { + return syscall.SizeofIfInfomsg +} + type IfAddrmsg struct { syscall.IfAddrmsg } func newIfAddrmsg(family int) *IfAddrmsg { - msg := &IfAddrmsg{} - msg.Family = uint8(family) - msg.Prefixlen = uint8(0) - msg.Flags = uint8(0) - msg.Scope 
= uint8(0) - msg.Index = uint32(0) - - return msg + return &IfAddrmsg{ + IfAddrmsg: syscall.IfAddrmsg{ + Family: uint8(family), + }, + } } func (msg *IfAddrmsg) ToWireFormat() []byte { native := nativeEndian() - len := syscall.SizeofIfAddrmsg - b := make([]byte, len) + length := syscall.SizeofIfAddrmsg + b := make([]byte, length) b[0] = msg.Family b[1] = msg.Prefixlen b[2] = msg.Flags @@ -105,26 +110,31 @@ func (msg *IfAddrmsg) ToWireFormat() []byte { return b } +func (msg *IfAddrmsg) Len() int { + return syscall.SizeofIfAddrmsg +} + type RtMsg struct { syscall.RtMsg } func newRtMsg(family int) *RtMsg { - msg := &RtMsg{} - msg.Family = uint8(family) - msg.Table = syscall.RT_TABLE_MAIN - msg.Scope = syscall.RT_SCOPE_UNIVERSE - msg.Protocol = syscall.RTPROT_BOOT - msg.Type = syscall.RTN_UNICAST - - return msg + return &RtMsg{ + RtMsg: syscall.RtMsg{ + Family: uint8(family), + Table: syscall.RT_TABLE_MAIN, + Scope: syscall.RT_SCOPE_UNIVERSE, + Protocol: syscall.RTPROT_BOOT, + Type: syscall.RTN_UNICAST, + }, + } } func (msg *RtMsg) ToWireFormat() []byte { native := nativeEndian() - len := syscall.SizeofRtMsg - b := make([]byte, len) + length := syscall.SizeofRtMsg + b := make([]byte, length) b[0] = msg.Family b[1] = msg.Dst_len b[2] = msg.Src_len @@ -137,6 +147,10 @@ func (msg *RtMsg) ToWireFormat() []byte { return b } +func (msg *RtMsg) Len() int { + return syscall.SizeofRtMsg +} + func rtaAlignOf(attrlen int) int { return (attrlen + syscall.RTA_ALIGNTO - 1) & ^(syscall.RTA_ALIGNTO - 1) } @@ -144,18 +158,17 @@ func rtaAlignOf(attrlen int) int { type RtAttr struct { syscall.RtAttr Data []byte - children []*RtAttr - prefix int + children []NetlinkRequestData } func newRtAttr(attrType int, data []byte) *RtAttr { - attr := &RtAttr{ - children: []*RtAttr{}, + return &RtAttr{ + RtAttr: syscall.RtAttr{ + Type: uint16(attrType), + }, + children: []NetlinkRequestData{}, + Data: data, } - attr.Type = uint16(attrType) - attr.Data = data - - return attr } func 
newRtAttrChild(parent *RtAttr, attrType int, data []byte) *RtAttr { @@ -164,10 +177,10 @@ func newRtAttrChild(parent *RtAttr, attrType int, data []byte) *RtAttr { return attr } -func (a *RtAttr) length() int { +func (a *RtAttr) Len() int { l := 0 for _, child := range a.children { - l += child.length() + syscall.SizeofRtAttr + child.prefix + l += child.Len() + syscall.SizeofRtAttr } if l == 0 { l++ @@ -178,7 +191,7 @@ func (a *RtAttr) length() int { func (a *RtAttr) ToWireFormat() []byte { native := nativeEndian() - length := a.length() + length := a.Len() buf := make([]byte, rtaAlignOf(length+syscall.SizeofRtAttr)) if a.Data != nil { @@ -187,7 +200,7 @@ func (a *RtAttr) ToWireFormat() []byte { next := 4 for _, child := range a.children { childBuf := child.ToWireFormat() - copy(buf[next+child.prefix:], childBuf) + copy(buf[next:], childBuf) next += rtaAlignOf(len(childBuf)) } } @@ -212,7 +225,7 @@ func (rr *NetlinkRequest) ToWireFormat() []byte { dataBytes := make([][]byte, len(rr.Data)) for i, data := range rr.Data { dataBytes[i] = data.ToWireFormat() - length = length + uint32(len(dataBytes[i])) + length += uint32(len(dataBytes[i])) } b := make([]byte, length) native.PutUint32(b[0:4], length) @@ -221,12 +234,10 @@ func (rr *NetlinkRequest) ToWireFormat() []byte { native.PutUint32(b[8:12], rr.Seq) native.PutUint32(b[12:16], rr.Pid) - i := 16 + next := 16 for _, data := range dataBytes { - for _, dataByte := range data { - b[i] = dataByte - i = i + 1 - } + copy(b[next:], data) + next += len(data) } return b } @@ -238,12 +249,14 @@ func (rr *NetlinkRequest) AddData(data NetlinkRequestData) { } func newNetlinkRequest(proto, flags int) *NetlinkRequest { - rr := &NetlinkRequest{} - rr.Len = uint32(syscall.NLMSG_HDRLEN) - rr.Type = uint16(proto) - rr.Flags = syscall.NLM_F_REQUEST | uint16(flags) - rr.Seq = uint32(getSeq()) - return rr + return &NetlinkRequest{ + NlMsghdr: syscall.NlMsghdr{ + Len: uint32(syscall.NLMSG_HDRLEN), + Type: uint16(proto), + Flags: 
syscall.NLM_F_REQUEST | uint16(flags), + Seq: uint32(getSeq()), + }, + } } type NetlinkSocket struct { @@ -286,7 +299,7 @@ func (s *NetlinkSocket) Receive() ([]syscall.NetlinkMessage, error) { return nil, err } if nr < syscall.NLMSG_HDRLEN { - return nil, fmt.Errorf("Got short response from netlink") + return nil, ErrShortResponse } rb = rb[:nr] return syscall.ParseNetlinkMessage(rb) @@ -301,7 +314,7 @@ func (s *NetlinkSocket) GetPid() (uint32, error) { case *syscall.SockaddrNetlink: return v.Pid, nil } - return 0, fmt.Errorf("Wrong socket type") + return 0, ErrWrongSockType } func (s *NetlinkSocket) HandleAck(seq uint32) error { @@ -592,11 +605,7 @@ func zeroTerminated(s string) []byte { } func nonZeroTerminated(s string) []byte { - bytes := make([]byte, len(s)) - for i := 0; i < len(s); i++ { - bytes[i] = s[i] - } - return bytes + return []byte(s) } // Add a new network link of a specified type. This is identical to @@ -789,8 +798,8 @@ func NetworkCreateVethPair(name1, name2 string) error { nest2 := newRtAttrChild(nest1, IFLA_INFO_DATA, nil) nest3 := newRtAttrChild(nest2, VETH_INFO_PEER, nil) - last := newRtAttrChild(nest3, syscall.IFLA_IFNAME, zeroTerminated(name2)) - last.prefix = syscall.SizeofIfInfomsg + newIfInfomsgChild(nest3, syscall.AF_UNSPEC) + newRtAttrChild(nest3, syscall.IFLA_IFNAME, zeroTerminated(name2)) wb.AddData(nest1) From 3b7915ab05d3c22d860b2b5fe6e796a0af33c2fe Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 14 Feb 2014 12:12:35 -0800 Subject: [PATCH 018/117] Add new functions to unsupported file Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- netlink/netlink_unsupported.go | 42 ++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/netlink/netlink_unsupported.go b/netlink/netlink_unsupported.go index cd796b3..bd9e962 100644 --- a/netlink/netlink_unsupported.go +++ b/netlink/netlink_unsupported.go @@ -3,31 +3,59 @@ package netlink import ( - "fmt" + "errors" "net" ) 
+var ( + ErrNotImplemented = errors.New("not implemented") +) + func NetworkGetRoutes() ([]Route, error) { - return nil, fmt.Errorf("Not implemented") + return nil, ErrNotImplemented } func NetworkLinkAdd(name string, linkType string) error { - return fmt.Errorf("Not implemented") + return ErrNotImplemented } func NetworkLinkUp(iface *net.Interface) error { - return fmt.Errorf("Not implemented") + return ErrNotImplemented } func NetworkLinkAddIp(iface *net.Interface, ip net.IP, ipNet *net.IPNet) error { - return fmt.Errorf("Not implemented") + return ErrNotImplemented } func AddDefaultGw(ip net.IP) error { - return fmt.Errorf("Not implemented") + return ErrNotImplemented } func NetworkSetMTU(iface *net.Interface, mtu int) error { - return fmt.Errorf("Not implemented") + return ErrNotImplemented +} + +func NetworkCreateVethPair(name1, name2 string) error { + return ErrNotImplemented +} + +func NetworkChangeName(iface *net.Interface, newName string) error { + return ErrNotImplemented +} + +func NetworkSetNsFd(iface *net.Interface, fd int) error { + return ErrNotImplemented +} + +func NetworkSetNsPid(iface *net.Interface, nspid int) error { + return ErrNotImplemented +} + +func NetworkSetMaster(iface, master *net.Interface) error { + return ErrNotImplemented +} + +func NetworkLinkDown(iface *net.Interface) error { + return ErrNotImplemented } From d3da259b24cf729d15b73b202f256d31e4255e9c Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 14 Feb 2014 15:25:25 -0800 Subject: [PATCH 019/117] Move proxy into pkg Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- proxy/MAINTAINERS | 1 + proxy/network_proxy_test.go | 216 ++++++++++++++++++++++++++++++++++++ proxy/proxy.go | 29 +++++ proxy/stub_proxy.go | 22 ++++ proxy/tcp_proxy.go | 93 ++++++++++++++++ proxy/udp_proxy.go | 162 +++++++++++++++++++++++++++ 6 files changed, 523 insertions(+) create mode 100644 proxy/MAINTAINERS create mode 100644 proxy/network_proxy_test.go create mode 100644 
proxy/proxy.go create mode 100644 proxy/stub_proxy.go create mode 100644 proxy/tcp_proxy.go create mode 100644 proxy/udp_proxy.go diff --git a/proxy/MAINTAINERS b/proxy/MAINTAINERS new file mode 100644 index 0000000..1e998f8 --- /dev/null +++ b/proxy/MAINTAINERS @@ -0,0 +1 @@ +Michael Crosby (@crosbymichael) diff --git a/proxy/network_proxy_test.go b/proxy/network_proxy_test.go new file mode 100644 index 0000000..9e38256 --- /dev/null +++ b/proxy/network_proxy_test.go @@ -0,0 +1,216 @@ +package proxy + +import ( + "bytes" + "fmt" + "io" + "net" + "strings" + "testing" + "time" +) + +var testBuf = []byte("Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo") +var testBufSize = len(testBuf) + +type EchoServer interface { + Run() + Close() + LocalAddr() net.Addr +} + +type TCPEchoServer struct { + listener net.Listener + testCtx *testing.T +} + +type UDPEchoServer struct { + conn net.PacketConn + testCtx *testing.T +} + +func NewEchoServer(t *testing.T, proto, address string) EchoServer { + var server EchoServer + if strings.HasPrefix(proto, "tcp") { + listener, err := net.Listen(proto, address) + if err != nil { + t.Fatal(err) + } + server = &TCPEchoServer{listener: listener, testCtx: t} + } else { + socket, err := net.ListenPacket(proto, address) + if err != nil { + t.Fatal(err) + } + server = &UDPEchoServer{conn: socket, testCtx: t} + } + return server +} + +func (server *TCPEchoServer) Run() { + go func() { + for { + client, err := server.listener.Accept() + if err != nil { + return + } + go func(client net.Conn) { + if _, err := io.Copy(client, client); err != nil { + server.testCtx.Logf("can't echo to the client: %v\n", err.Error()) + } + client.Close() + }(client) + } + }() +} + +func (server *TCPEchoServer) LocalAddr() net.Addr { return server.listener.Addr() } +func (server *TCPEchoServer) Close() { server.listener.Addr() } + +func (server *UDPEchoServer) Run() { + go func() { + readBuf := make([]byte, 1024) + for { + read, from, err := 
server.conn.ReadFrom(readBuf) + if err != nil { + return + } + for i := 0; i != read; { + written, err := server.conn.WriteTo(readBuf[i:read], from) + if err != nil { + break + } + i += written + } + } + }() +} + +func (server *UDPEchoServer) LocalAddr() net.Addr { return server.conn.LocalAddr() } +func (server *UDPEchoServer) Close() { server.conn.Close() } + +func testProxyAt(t *testing.T, proto string, proxy Proxy, addr string) { + defer proxy.Close() + go proxy.Run() + client, err := net.Dial(proto, addr) + if err != nil { + t.Fatalf("Can't connect to the proxy: %v", err) + } + defer client.Close() + client.SetDeadline(time.Now().Add(10 * time.Second)) + if _, err = client.Write(testBuf); err != nil { + t.Fatal(err) + } + recvBuf := make([]byte, testBufSize) + if _, err = client.Read(recvBuf); err != nil { + t.Fatal(err) + } + if !bytes.Equal(testBuf, recvBuf) { + t.Fatal(fmt.Errorf("Expected [%v] but got [%v]", testBuf, recvBuf)) + } +} + +func testProxy(t *testing.T, proto string, proxy Proxy) { + testProxyAt(t, proto, proxy, proxy.FrontendAddr().String()) +} + +func TestTCP4Proxy(t *testing.T) { + backend := NewEchoServer(t, "tcp", "127.0.0.1:0") + defer backend.Close() + backend.Run() + frontendAddr := &net.TCPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 0} + proxy, err := NewProxy(frontendAddr, backend.LocalAddr()) + if err != nil { + t.Fatal(err) + } + testProxy(t, "tcp", proxy) +} + +func TestTCP6Proxy(t *testing.T) { + backend := NewEchoServer(t, "tcp", "[::1]:0") + defer backend.Close() + backend.Run() + frontendAddr := &net.TCPAddr{IP: net.IPv6loopback, Port: 0} + proxy, err := NewProxy(frontendAddr, backend.LocalAddr()) + if err != nil { + t.Fatal(err) + } + testProxy(t, "tcp", proxy) +} + +func TestTCPDualStackProxy(t *testing.T) { + // If I understand `godoc -src net favoriteAddrFamily` (used by the + // net.Listen* functions) correctly this should work, but it doesn't. 
+ t.Skip("No support for dual stack yet") + backend := NewEchoServer(t, "tcp", "[::1]:0") + defer backend.Close() + backend.Run() + frontendAddr := &net.TCPAddr{IP: net.IPv6loopback, Port: 0} + proxy, err := NewProxy(frontendAddr, backend.LocalAddr()) + if err != nil { + t.Fatal(err) + } + ipv4ProxyAddr := &net.TCPAddr{ + IP: net.IPv4(127, 0, 0, 1), + Port: proxy.FrontendAddr().(*net.TCPAddr).Port, + } + testProxyAt(t, "tcp", proxy, ipv4ProxyAddr.String()) +} + +func TestUDP4Proxy(t *testing.T) { + backend := NewEchoServer(t, "udp", "127.0.0.1:0") + defer backend.Close() + backend.Run() + frontendAddr := &net.UDPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 0} + proxy, err := NewProxy(frontendAddr, backend.LocalAddr()) + if err != nil { + t.Fatal(err) + } + testProxy(t, "udp", proxy) +} + +func TestUDP6Proxy(t *testing.T) { + backend := NewEchoServer(t, "udp", "[::1]:0") + defer backend.Close() + backend.Run() + frontendAddr := &net.UDPAddr{IP: net.IPv6loopback, Port: 0} + proxy, err := NewProxy(frontendAddr, backend.LocalAddr()) + if err != nil { + t.Fatal(err) + } + testProxy(t, "udp", proxy) +} + +func TestUDPWriteError(t *testing.T) { + frontendAddr := &net.UDPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 0} + // Hopefully, this port will be free: */ + backendAddr := &net.UDPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 25587} + proxy, err := NewProxy(frontendAddr, backendAddr) + if err != nil { + t.Fatal(err) + } + defer proxy.Close() + go proxy.Run() + client, err := net.Dial("udp", "127.0.0.1:25587") + if err != nil { + t.Fatalf("Can't connect to the proxy: %v", err) + } + defer client.Close() + // Make sure the proxy doesn't stop when there is no actual backend: + client.Write(testBuf) + client.Write(testBuf) + backend := NewEchoServer(t, "udp", "127.0.0.1:25587") + defer backend.Close() + backend.Run() + client.SetDeadline(time.Now().Add(10 * time.Second)) + if _, err = client.Write(testBuf); err != nil { + t.Fatal(err) + } + recvBuf := make([]byte, testBufSize) + if _, err = 
client.Read(recvBuf); err != nil { + t.Fatal(err) + } + if !bytes.Equal(testBuf, recvBuf) { + t.Fatal(fmt.Errorf("Expected [%v] but got [%v]", testBuf, recvBuf)) + } +} diff --git a/proxy/proxy.go b/proxy/proxy.go new file mode 100644 index 0000000..7a711f6 --- /dev/null +++ b/proxy/proxy.go @@ -0,0 +1,29 @@ +package proxy + +import ( + "fmt" + "net" +) + +type Proxy interface { + // Start forwarding traffic back and forth the front and back-end + // addresses. + Run() + // Stop forwarding traffic and close both ends of the Proxy. + Close() + // Return the address on which the proxy is listening. + FrontendAddr() net.Addr + // Return the proxied address. + BackendAddr() net.Addr +} + +func NewProxy(frontendAddr, backendAddr net.Addr) (Proxy, error) { + switch frontendAddr.(type) { + case *net.UDPAddr: + return NewUDPProxy(frontendAddr.(*net.UDPAddr), backendAddr.(*net.UDPAddr)) + case *net.TCPAddr: + return NewTCPProxy(frontendAddr.(*net.TCPAddr), backendAddr.(*net.TCPAddr)) + default: + panic(fmt.Errorf("Unsupported protocol")) + } +} diff --git a/proxy/stub_proxy.go b/proxy/stub_proxy.go new file mode 100644 index 0000000..7684427 --- /dev/null +++ b/proxy/stub_proxy.go @@ -0,0 +1,22 @@ +package proxy + +import ( + "net" +) + +type StubProxy struct { + frontendAddr net.Addr + backendAddr net.Addr +} + +func (p *StubProxy) Run() {} +func (p *StubProxy) Close() {} +func (p *StubProxy) FrontendAddr() net.Addr { return p.frontendAddr } +func (p *StubProxy) BackendAddr() net.Addr { return p.backendAddr } + +func NewStubProxy(frontendAddr, backendAddr net.Addr) (Proxy, error) { + return &StubProxy{ + frontendAddr: frontendAddr, + backendAddr: backendAddr, + }, nil +} diff --git a/proxy/tcp_proxy.go b/proxy/tcp_proxy.go new file mode 100644 index 0000000..b84483e --- /dev/null +++ b/proxy/tcp_proxy.go @@ -0,0 +1,93 @@ +package proxy + +import ( + "io" + "log" + "net" + "syscall" +) + +type TCPProxy struct { + listener *net.TCPListener + frontendAddr *net.TCPAddr + 
backendAddr *net.TCPAddr +} + +func NewTCPProxy(frontendAddr, backendAddr *net.TCPAddr) (*TCPProxy, error) { + listener, err := net.ListenTCP("tcp", frontendAddr) + if err != nil { + return nil, err + } + // If the port in frontendAddr was 0 then ListenTCP will have a picked + // a port to listen on, hence the call to Addr to get that actual port: + return &TCPProxy{ + listener: listener, + frontendAddr: listener.Addr().(*net.TCPAddr), + backendAddr: backendAddr, + }, nil +} + +func (proxy *TCPProxy) clientLoop(client *net.TCPConn, quit chan bool) { + backend, err := net.DialTCP("tcp", nil, proxy.backendAddr) + if err != nil { + log.Printf("Can't forward traffic to backend tcp/%v: %v\n", proxy.backendAddr, err.Error()) + client.Close() + return + } + + event := make(chan int64) + var broker = func(to, from *net.TCPConn) { + written, err := io.Copy(to, from) + if err != nil { + // If the socket we are writing to is shutdown with + // SHUT_WR, forward it to the other end of the pipe: + if err, ok := err.(*net.OpError); ok && err.Err == syscall.EPIPE { + from.CloseWrite() + } + } + to.CloseRead() + event <- written + } + + log.Printf("Forwarding traffic between tcp/%v and tcp/%v", client.RemoteAddr(), backend.RemoteAddr()) + go broker(client, backend) + go broker(backend, client) + + var transferred int64 = 0 + for i := 0; i < 2; i++ { + select { + case written := <-event: + transferred += written + case <-quit: + // Interrupt the two brokers and "join" them. 
+ client.Close() + backend.Close() + for ; i < 2; i++ { + transferred += <-event + } + goto done + } + } + client.Close() + backend.Close() +done: + log.Printf("%v bytes transferred between tcp/%v and tcp/%v", transferred, client.RemoteAddr(), backend.RemoteAddr()) +} + +func (proxy *TCPProxy) Run() { + quit := make(chan bool) + defer close(quit) + log.Printf("Starting proxy on tcp/%v for tcp/%v", proxy.frontendAddr, proxy.backendAddr) + for { + client, err := proxy.listener.Accept() + if err != nil { + log.Printf("Stopping proxy on tcp/%v for tcp/%v (%v)", proxy.frontendAddr, proxy.backendAddr, err.Error()) + return + } + go proxy.clientLoop(client.(*net.TCPConn), quit) + } +} + +func (proxy *TCPProxy) Close() { proxy.listener.Close() } +func (proxy *TCPProxy) FrontendAddr() net.Addr { return proxy.frontendAddr } +func (proxy *TCPProxy) BackendAddr() net.Addr { return proxy.backendAddr } diff --git a/proxy/udp_proxy.go b/proxy/udp_proxy.go new file mode 100644 index 0000000..9395516 --- /dev/null +++ b/proxy/udp_proxy.go @@ -0,0 +1,162 @@ +package proxy + +import ( + "encoding/binary" + "log" + "net" + "strings" + "sync" + "syscall" + "time" +) + +const ( + UDPConnTrackTimeout = 90 * time.Second + UDPBufSize = 2048 +) + +// A net.Addr where the IP is split into two fields so you can use it as a key +// in a map: +type connTrackKey struct { + IPHigh uint64 + IPLow uint64 + Port int +} + +func newConnTrackKey(addr *net.UDPAddr) *connTrackKey { + if len(addr.IP) == net.IPv4len { + return &connTrackKey{ + IPHigh: 0, + IPLow: uint64(binary.BigEndian.Uint32(addr.IP)), + Port: addr.Port, + } + } + return &connTrackKey{ + IPHigh: binary.BigEndian.Uint64(addr.IP[:8]), + IPLow: binary.BigEndian.Uint64(addr.IP[8:]), + Port: addr.Port, + } +} + +type connTrackMap map[connTrackKey]*net.UDPConn + +type UDPProxy struct { + listener *net.UDPConn + frontendAddr *net.UDPAddr + backendAddr *net.UDPAddr + connTrackTable connTrackMap + connTrackLock sync.Mutex +} + +func 
NewUDPProxy(frontendAddr, backendAddr *net.UDPAddr) (*UDPProxy, error) { + listener, err := net.ListenUDP("udp", frontendAddr) + if err != nil { + return nil, err + } + return &UDPProxy{ + listener: listener, + frontendAddr: listener.LocalAddr().(*net.UDPAddr), + backendAddr: backendAddr, + connTrackTable: make(connTrackMap), + }, nil +} + +func (proxy *UDPProxy) replyLoop(proxyConn *net.UDPConn, clientAddr *net.UDPAddr, clientKey *connTrackKey) { + defer func() { + proxy.connTrackLock.Lock() + delete(proxy.connTrackTable, *clientKey) + proxy.connTrackLock.Unlock() + log.Printf("Done proxying between udp/%v and udp/%v", clientAddr.String(), proxy.backendAddr.String()) + proxyConn.Close() + }() + + readBuf := make([]byte, UDPBufSize) + for { + proxyConn.SetReadDeadline(time.Now().Add(UDPConnTrackTimeout)) + again: + read, err := proxyConn.Read(readBuf) + if err != nil { + if err, ok := err.(*net.OpError); ok && err.Err == syscall.ECONNREFUSED { + // This will happen if the last write failed + // (e.g: nothing is actually listening on the + // proxied port on the container), ignore it + // and continue until UDPConnTrackTimeout + // expires: + goto again + } + return + } + for i := 0; i != read; { + written, err := proxy.listener.WriteToUDP(readBuf[i:read], clientAddr) + if err != nil { + return + } + i += written + log.Printf("Forwarded %v/%v bytes to udp/%v", i, read, clientAddr.String()) + } + } +} + +func (proxy *UDPProxy) Run() { + readBuf := make([]byte, UDPBufSize) + log.Printf("Starting proxy on udp/%v for udp/%v", proxy.frontendAddr, proxy.backendAddr) + for { + read, from, err := proxy.listener.ReadFromUDP(readBuf) + if err != nil { + // NOTE: Apparently ReadFrom doesn't return + // ECONNREFUSED like Read do (see comment in + // UDPProxy.replyLoop) + if isClosedError(err) { + log.Printf("Stopping proxy on udp/%v for udp/%v (socket was closed)", proxy.frontendAddr, proxy.backendAddr) + } else { + log.Printf("Stopping proxy on udp/%v for udp/%v (%v)", 
proxy.frontendAddr, proxy.backendAddr, err.Error()) + } + break + } + + fromKey := newConnTrackKey(from) + proxy.connTrackLock.Lock() + proxyConn, hit := proxy.connTrackTable[*fromKey] + if !hit { + proxyConn, err = net.DialUDP("udp", nil, proxy.backendAddr) + if err != nil { + log.Printf("Can't proxy a datagram to udp/%s: %v\n", proxy.backendAddr.String(), err) + continue + } + proxy.connTrackTable[*fromKey] = proxyConn + go proxy.replyLoop(proxyConn, from, fromKey) + } + proxy.connTrackLock.Unlock() + for i := 0; i != read; { + written, err := proxyConn.Write(readBuf[i:read]) + if err != nil { + log.Printf("Can't proxy a datagram to udp/%s: %v\n", proxy.backendAddr.String(), err) + break + } + i += written + log.Printf("Forwarded %v/%v bytes to udp/%v", i, read, proxy.backendAddr.String()) + } + } +} + +func (proxy *UDPProxy) Close() { + proxy.listener.Close() + proxy.connTrackLock.Lock() + defer proxy.connTrackLock.Unlock() + for _, conn := range proxy.connTrackTable { + conn.Close() + } +} + +func (proxy *UDPProxy) FrontendAddr() net.Addr { return proxy.frontendAddr } +func (proxy *UDPProxy) BackendAddr() net.Addr { return proxy.backendAddr } + +func isClosedError(err error) bool { + /* This comparison is ugly, but unfortunately, net.go doesn't export errClosing. 
+ * See: + * http://golang.org/src/pkg/net/net.go + * https://code.google.com/p/go/issues/detail?id=4337 + * https://groups.google.com/forum/#!msg/golang-nuts/0_aaCvBmOcM/SptmDyX1XJMJ + */ + return strings.HasSuffix(err.Error(), "use of closed network connection") +} From fb500991ec0443dc3d4d74520c558137d10c4e3e Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Sat, 15 Feb 2014 20:02:54 -0800 Subject: [PATCH 020/117] Add socket activation for go apps Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- socketactivation/activation.go | 61 ++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 socketactivation/activation.go diff --git a/socketactivation/activation.go b/socketactivation/activation.go new file mode 100644 index 0000000..0edbcaa --- /dev/null +++ b/socketactivation/activation.go @@ -0,0 +1,61 @@ +/* + Package to allow go applications to immediately start + listening on a socket, unix, tcp, udp but hold connections + until the application has booted and is ready to accept them +*/ +package socketactivation + +import ( + "fmt" + "net" + "time" +) + +// NewActivationListener returns a listener listening on addr with the protocol. 
It sets the +// timeout to wait on first connection before an error is returned +func NewActivationListener(proto, addr string, activate chan struct{}, timeout time.Duration) (net.Listener, error) { + wrapped, err := net.Listen(proto, addr) + if err != nil { + return nil, err + } + + return &defaultListener{ + wrapped: wrapped, + activate: activate, + timeout: timeout, + }, nil +} + +type defaultListener struct { + wrapped net.Listener // the real listener to wrap + ready bool // is the listner ready to start accpeting connections + activate chan struct{} + timeout time.Duration // how long to wait before we consider this an error +} + +func (l *defaultListener) Close() error { + return l.wrapped.Close() +} + +func (l *defaultListener) Addr() net.Addr { + return l.wrapped.Addr() +} + +func (l *defaultListener) Accept() (net.Conn, error) { + // if the listen has been told it is ready then we can go ahead and + // start returning connections + if l.ready { + return l.wrapped.Accept() + } + + select { + case <-time.After(l.timeout): + // close the connection so any clients are disconnected + l.Close() + return nil, fmt.Errorf("timeout (%s) reached waiting for listener to become ready", l.timeout.String()) + case <-l.activate: + l.ready = true + return l.Accept() + } + panic("unreachable") +} From 16db4da0517edb6c74e1a40dad52417d6d333210 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Sat, 15 Feb 2014 21:10:37 -0800 Subject: [PATCH 021/117] Change name to listenbuffer Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- socketactivation/activation.go => listenbuffer/buffer.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename socketactivation/activation.go => listenbuffer/buffer.go (85%) diff --git a/socketactivation/activation.go b/listenbuffer/buffer.go similarity index 85% rename from socketactivation/activation.go rename to listenbuffer/buffer.go index 0edbcaa..c350805 100644 --- a/socketactivation/activation.go +++ 
b/listenbuffer/buffer.go @@ -3,7 +3,7 @@ listening on a socket, unix, tcp, udp but hold connections until the application has booted and is ready to accept them */ -package socketactivation +package listenbuffer import ( "fmt" @@ -11,9 +11,9 @@ import ( "time" ) -// NewActivationListener returns a listener listening on addr with the protocol. It sets the +// NewListenBuffer returns a listener listening on addr with the protocol. It sets the // timeout to wait on first connection before an error is returned -func NewActivationListener(proto, addr string, activate chan struct{}, timeout time.Duration) (net.Listener, error) { +func NewListenBuffer(proto, addr string, activate chan struct{}, timeout time.Duration) (net.Listener, error) { wrapped, err := net.Listen(proto, addr) if err != nil { return nil, err From 0c6b7299a412f47b9b09ae0e610ba0cff17f3594 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 17 Feb 2014 11:17:05 -0800 Subject: [PATCH 022/117] Make crosbymichael and creack netlink maintainers Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- netlink/MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 netlink/MAINTAINERS diff --git a/netlink/MAINTAINERS b/netlink/MAINTAINERS new file mode 100644 index 0000000..e53d933 --- /dev/null +++ b/netlink/MAINTAINERS @@ -0,0 +1,2 @@ +Michael Crosby (@crosbymichael) +Guillaume Charmes (@creack) From dbf686ccc3bb38950fdcf3f3036f81a1cf25a598 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 17 Feb 2014 13:31:13 -0800 Subject: [PATCH 023/117] Remove verbose logging for non errors Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- proxy/tcp_proxy.go | 10 +++------- proxy/udp_proxy.go | 14 ++++---------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/proxy/tcp_proxy.go b/proxy/tcp_proxy.go index b84483e..1aa6d9f 100644 --- a/proxy/tcp_proxy.go +++ b/proxy/tcp_proxy.go @@ -30,7 +30,7 @@ func NewTCPProxy(frontendAddr, backendAddr 
*net.TCPAddr) (*TCPProxy, error) { func (proxy *TCPProxy) clientLoop(client *net.TCPConn, quit chan bool) { backend, err := net.DialTCP("tcp", nil, proxy.backendAddr) if err != nil { - log.Printf("Can't forward traffic to backend tcp/%v: %v\n", proxy.backendAddr, err.Error()) + log.Printf("Can't forward traffic to backend tcp/%v: %s\n", proxy.backendAddr, err) client.Close() return } @@ -49,7 +49,6 @@ func (proxy *TCPProxy) clientLoop(client *net.TCPConn, quit chan bool) { event <- written } - log.Printf("Forwarding traffic between tcp/%v and tcp/%v", client.RemoteAddr(), backend.RemoteAddr()) go broker(client, backend) go broker(backend, client) @@ -65,23 +64,20 @@ func (proxy *TCPProxy) clientLoop(client *net.TCPConn, quit chan bool) { for ; i < 2; i++ { transferred += <-event } - goto done + return } } client.Close() backend.Close() -done: - log.Printf("%v bytes transferred between tcp/%v and tcp/%v", transferred, client.RemoteAddr(), backend.RemoteAddr()) } func (proxy *TCPProxy) Run() { quit := make(chan bool) defer close(quit) - log.Printf("Starting proxy on tcp/%v for tcp/%v", proxy.frontendAddr, proxy.backendAddr) for { client, err := proxy.listener.Accept() if err != nil { - log.Printf("Stopping proxy on tcp/%v for tcp/%v (%v)", proxy.frontendAddr, proxy.backendAddr, err.Error()) + log.Printf("Stopping proxy on tcp/%v for tcp/%v (%s)", proxy.frontendAddr, proxy.backendAddr, err) return } go proxy.clientLoop(client.(*net.TCPConn), quit) diff --git a/proxy/udp_proxy.go b/proxy/udp_proxy.go index 9395516..14f2306 100644 --- a/proxy/udp_proxy.go +++ b/proxy/udp_proxy.go @@ -66,7 +66,6 @@ func (proxy *UDPProxy) replyLoop(proxyConn *net.UDPConn, clientAddr *net.UDPAddr proxy.connTrackLock.Lock() delete(proxy.connTrackTable, *clientKey) proxy.connTrackLock.Unlock() - log.Printf("Done proxying between udp/%v and udp/%v", clientAddr.String(), proxy.backendAddr.String()) proxyConn.Close() }() @@ -92,24 +91,20 @@ func (proxy *UDPProxy) replyLoop(proxyConn 
*net.UDPConn, clientAddr *net.UDPAddr return } i += written - log.Printf("Forwarded %v/%v bytes to udp/%v", i, read, clientAddr.String()) } } } func (proxy *UDPProxy) Run() { readBuf := make([]byte, UDPBufSize) - log.Printf("Starting proxy on udp/%v for udp/%v", proxy.frontendAddr, proxy.backendAddr) for { read, from, err := proxy.listener.ReadFromUDP(readBuf) if err != nil { // NOTE: Apparently ReadFrom doesn't return // ECONNREFUSED like Read do (see comment in // UDPProxy.replyLoop) - if isClosedError(err) { - log.Printf("Stopping proxy on udp/%v for udp/%v (socket was closed)", proxy.frontendAddr, proxy.backendAddr) - } else { - log.Printf("Stopping proxy on udp/%v for udp/%v (%v)", proxy.frontendAddr, proxy.backendAddr, err.Error()) + if !isClosedError(err) { + log.Printf("Stopping proxy on udp/%v for udp/%v (%s)", proxy.frontendAddr, proxy.backendAddr, err) } break } @@ -120,7 +115,7 @@ func (proxy *UDPProxy) Run() { if !hit { proxyConn, err = net.DialUDP("udp", nil, proxy.backendAddr) if err != nil { - log.Printf("Can't proxy a datagram to udp/%s: %v\n", proxy.backendAddr.String(), err) + log.Printf("Can't proxy a datagram to udp/%s: %s\n", proxy.backendAddr, err) continue } proxy.connTrackTable[*fromKey] = proxyConn @@ -130,11 +125,10 @@ func (proxy *UDPProxy) Run() { for i := 0; i != read; { written, err := proxyConn.Write(readBuf[i:read]) if err != nil { - log.Printf("Can't proxy a datagram to udp/%s: %v\n", proxy.backendAddr.String(), err) + log.Printf("Can't proxy a datagram to udp/%s: %s\n", proxy.backendAddr, err) break } i += written - log.Printf("Forwarded %v/%v bytes to udp/%v", i, read, proxy.backendAddr.String()) } } } From 81d2c6749227e2959d48da20de1f5b050d6da292 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 18 Feb 2014 16:56:11 -0800 Subject: [PATCH 024/117] Initial commit of libcontainer Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/MAINTAINERS | 2 + libcontainer/README.md | 63 +++++ 
libcontainer/capabilities/capabilities.go | 49 ++++ libcontainer/cli/main.go | 171 ++++++++++++++ libcontainer/container.go | 27 +++ libcontainer/container.json | 38 ++++ libcontainer/errors.go | 9 + libcontainer/namespaces/calls_linux.go | 164 +++++++++++++ libcontainer/namespaces/exec.go | 266 ++++++++++++++++++++++ libcontainer/namespaces/linux_x86_64.go | 7 + libcontainer/namespaces/mount.go | 207 +++++++++++++++++ libcontainer/namespaces/namespaces.go | 70 ++++++ libcontainer/namespaces/ns_linux.go | 35 +++ libcontainer/namespaces/utils.go | 108 +++++++++ libcontainer/network/network.go | 104 +++++++++ libcontainer/network/veth.go | 85 +++++++ libcontainer/privileged.json | 22 ++ libcontainer/types.go | 49 ++++ libcontainer/ubuntu.json | 22 ++ libcontainer/utils/utils.go | 33 +++ 20 files changed, 1531 insertions(+) create mode 100644 libcontainer/MAINTAINERS create mode 100644 libcontainer/README.md create mode 100644 libcontainer/capabilities/capabilities.go create mode 100644 libcontainer/cli/main.go create mode 100644 libcontainer/container.go create mode 100644 libcontainer/container.json create mode 100644 libcontainer/errors.go create mode 100644 libcontainer/namespaces/calls_linux.go create mode 100644 libcontainer/namespaces/exec.go create mode 100644 libcontainer/namespaces/linux_x86_64.go create mode 100644 libcontainer/namespaces/mount.go create mode 100644 libcontainer/namespaces/namespaces.go create mode 100644 libcontainer/namespaces/ns_linux.go create mode 100644 libcontainer/namespaces/utils.go create mode 100644 libcontainer/network/network.go create mode 100644 libcontainer/network/veth.go create mode 100644 libcontainer/privileged.json create mode 100644 libcontainer/types.go create mode 100644 libcontainer/ubuntu.json create mode 100644 libcontainer/utils/utils.go diff --git a/libcontainer/MAINTAINERS b/libcontainer/MAINTAINERS new file mode 100644 index 0000000..e53d933 --- /dev/null +++ b/libcontainer/MAINTAINERS @@ -0,0 +1,2 @@ +Michael 
Crosby (@crosbymichael) +Guillaume Charmes (@creack) diff --git a/libcontainer/README.md b/libcontainer/README.md new file mode 100644 index 0000000..91d7478 --- /dev/null +++ b/libcontainer/README.md @@ -0,0 +1,63 @@ +## libcontainer - reference implementation for containers + +#### playground + + +Use the cli package to test out functionality + +First setup a container configuration. You will need a root fs, better go the path to a +stopped docker container and use that. + + +```json +{ + "id": "koye", + "namespace_pid": 12265, + "command": { + "args": [ + "/bin/bash" + ], + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm" + ] + }, + "rootfs": "/root/development/gocode/src/github.com/docker/libcontainer/namespaces/ubuntu", + "network": null, + "user": "", + "working_dir": "", + "namespaces": [ + "NEWNET", + "NEWIPC", + "NEWNS", + "NEWPID", + "NEWUTS" + ], + "capabilities": [ + "SETPCAP", + "SYS_MODULE", + "SYS_RAWIO", + "SYS_PACCT", + "SYS_ADMIN", + "SYS_NICE", + "SYS_RESOURCE", + "SYS_TIME", + "SYS_TTY_CONFIG", + "MKNOD", + "AUDIT_WRITE", + "AUDIT_CONTROL", + "MAC_OVERRIDE", + "MAC_ADMIN" + ] +} +``` + +After you have a json file and a rootfs path to use just run: +`./cli exec container.json` + + +If you want to attach to an existing namespace just use the same json +file with the container still running and do: +`./cli execin container.json` diff --git a/libcontainer/capabilities/capabilities.go b/libcontainer/capabilities/capabilities.go new file mode 100644 index 0000000..3301e10 --- /dev/null +++ b/libcontainer/capabilities/capabilities.go @@ -0,0 +1,49 @@ +package capabilities + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/syndtr/gocapability/capability" + "os" +) + +var capMap = map[libcontainer.Capability]capability.Cap{ + libcontainer.CAP_SETPCAP: capability.CAP_SETPCAP, + libcontainer.CAP_SYS_MODULE: capability.CAP_SYS_MODULE, + libcontainer.CAP_SYS_RAWIO: 
capability.CAP_SYS_RAWIO, + libcontainer.CAP_SYS_PACCT: capability.CAP_SYS_PACCT, + libcontainer.CAP_SYS_ADMIN: capability.CAP_SYS_ADMIN, + libcontainer.CAP_SYS_NICE: capability.CAP_SYS_NICE, + libcontainer.CAP_SYS_RESOURCE: capability.CAP_SYS_RESOURCE, + libcontainer.CAP_SYS_TIME: capability.CAP_SYS_TIME, + libcontainer.CAP_SYS_TTY_CONFIG: capability.CAP_SYS_TTY_CONFIG, + libcontainer.CAP_MKNOD: capability.CAP_MKNOD, + libcontainer.CAP_AUDIT_WRITE: capability.CAP_AUDIT_WRITE, + libcontainer.CAP_AUDIT_CONTROL: capability.CAP_AUDIT_CONTROL, + libcontainer.CAP_MAC_OVERRIDE: capability.CAP_MAC_OVERRIDE, + libcontainer.CAP_MAC_ADMIN: capability.CAP_MAC_ADMIN, +} + +// DropCapabilities drops capabilities for the current process based +// on the container's configuration. +func DropCapabilities(container *libcontainer.Container) error { + if drop := getCapabilities(container); len(drop) > 0 { + c, err := capability.NewPid(os.Getpid()) + if err != nil { + return err + } + c.Unset(capability.CAPS|capability.BOUNDS, drop...) 
+ + if err := c.Apply(capability.CAPS | capability.BOUNDS); err != nil { + return err + } + } + return nil +} + +func getCapabilities(container *libcontainer.Container) []capability.Cap { + drop := []capability.Cap{} + for _, c := range container.Capabilities { + drop = append(drop, capMap[c]) + } + return drop +} diff --git a/libcontainer/cli/main.go b/libcontainer/cli/main.go new file mode 100644 index 0000000..490135e --- /dev/null +++ b/libcontainer/cli/main.go @@ -0,0 +1,171 @@ +package main + +import ( + "encoding/json" + "flag" + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/namespaces" + "github.com/dotcloud/docker/pkg/libcontainer/network" + "github.com/dotcloud/docker/pkg/libcontainer/utils" + "os" +) + +var ( + displayPid bool + newCommand string + usrNet bool +) + +func init() { + flag.BoolVar(&displayPid, "pid", false, "display the pid before waiting") + flag.StringVar(&newCommand, "cmd", "/bin/bash", "command to run in the existing namespace") + flag.BoolVar(&usrNet, "net", false, "user a net namespace") + flag.Parse() +} + +func exec(container *libcontainer.Container) error { + var ( + netFile *os.File + err error + ) + container.NetNsFd = 0 + + if usrNet { + netFile, err = os.Open("/root/nsroot/test") + if err != nil { + return err + } + container.NetNsFd = netFile.Fd() + } + + pid, err := namespaces.Exec(container) + if err != nil { + return fmt.Errorf("error exec container %s", err) + } + + if displayPid { + fmt.Println(pid) + } + + exitcode, err := utils.WaitOnPid(pid) + if err != nil { + return fmt.Errorf("error waiting on child %s", err) + } + fmt.Println(exitcode) + if usrNet { + netFile.Close() + if err := network.DeleteNetworkNamespace("/root/nsroot/test"); err != nil { + return err + } + } + os.Exit(exitcode) + return nil +} + +func execIn(container *libcontainer.Container) error { + f, err := os.Open("/root/nsroot/test") + if err != nil { + return err + } + container.NetNsFd = f.Fd() + 
pid, err := namespaces.ExecIn(container, &libcontainer.Command{ + Env: container.Command.Env, + Args: []string{ + newCommand, + }, + }) + if err != nil { + return fmt.Errorf("error exexin container %s", err) + } + exitcode, err := utils.WaitOnPid(pid) + if err != nil { + return fmt.Errorf("error waiting on child %s", err) + } + os.Exit(exitcode) + return nil +} + +func createNet(config *libcontainer.Network) error { + root := "/root/nsroot" + if err := network.SetupNamespaceMountDir(root); err != nil { + return err + } + + nspath := root + "/test" + if err := network.CreateNetworkNamespace(nspath); err != nil { + return nil + } + if err := network.CreateVethPair("veth0", config.TempVethName); err != nil { + return err + } + if err := network.SetInterfaceMaster("veth0", config.Bridge); err != nil { + return err + } + if err := network.InterfaceUp("veth0"); err != nil { + return err + } + + f, err := os.Open(nspath) + if err != nil { + return err + } + defer f.Close() + + if err := network.SetInterfaceInNamespaceFd("veth1", int(f.Fd())); err != nil { + return err + } + + /* + if err := network.SetupVethInsideNamespace(f.Fd(), config); err != nil { + return err + } + */ + return nil +} + +func printErr(err error) { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) +} + +func main() { + var ( + err error + cliCmd = flag.Arg(0) + config = flag.Arg(1) + ) + f, err := os.Open(config) + if err != nil { + printErr(err) + } + + dec := json.NewDecoder(f) + var container *libcontainer.Container + + if err := dec.Decode(&container); err != nil { + printErr(err) + } + f.Close() + + switch cliCmd { + case "exec": + err = exec(container) + case "execin": + err = execIn(container) + case "net": + err = createNet(&libcontainer.Network{ + TempVethName: "veth1", + IP: "172.17.0.100/16", + Gateway: "172.17.42.1", + Mtu: 1500, + Bridge: "docker0", + }) + default: + err = fmt.Errorf("command not supported: %s", cliCmd) + } + + if err != nil { + printErr(err) + } +} diff --git 
a/libcontainer/container.go b/libcontainer/container.go new file mode 100644 index 0000000..b77890f --- /dev/null +++ b/libcontainer/container.go @@ -0,0 +1,27 @@ +package libcontainer + +type Container struct { + ID string `json:"id,omitempty"` + NsPid int `json:"namespace_pid,omitempty"` + Command *Command `json:"command,omitempty"` + RootFs string `json:"rootfs,omitempty"` + ReadonlyFs bool `json:"readonly_fs,omitempty"` + NetNsFd uintptr `json:"network_namespace_fd,omitempty"` + User string `json:"user,omitempty"` + WorkingDir string `json:"working_dir,omitempty"` + Namespaces Namespaces `json:"namespaces,omitempty"` + Capabilities Capabilities `json:"capabilities,omitempty"` +} + +type Command struct { + Args []string `json:"args,omitempty"` + Env []string `json:"environment,omitempty"` +} + +type Network struct { + TempVethName string `json:"temp_veth,omitempty"` + IP string `json:"ip,omitempty"` + Gateway string `json:"gateway,omitempty"` + Bridge string `json:"bridge,omitempty"` + Mtu int `json:"mtu,omitempty"` +} diff --git a/libcontainer/container.json b/libcontainer/container.json new file mode 100644 index 0000000..ed8eb1b --- /dev/null +++ b/libcontainer/container.json @@ -0,0 +1,38 @@ +{ + "id": "koye", + "namespace_pid": 3117, + "command": { + "args": [ + "/bin/bash" + ], + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm" + ] + }, + "rootfs": "/root/main/mycontainer", + "namespaces": [ + "NEWIPC", + "NEWNS", + "NEWPID", + "NEWUTS" + ], + "capabilities": [ + "SETPCAP", + "SYS_MODULE", + "SYS_RAWIO", + "SYS_PACCT", + "SYS_ADMIN", + "SYS_NICE", + "SYS_RESOURCE", + "SYS_TIME", + "SYS_TTY_CONFIG", + "MKNOD", + "AUDIT_WRITE", + "AUDIT_CONTROL", + "MAC_OVERRIDE", + "MAC_ADMIN" + ] +} diff --git a/libcontainer/errors.go b/libcontainer/errors.go new file mode 100644 index 0000000..c6964ee --- /dev/null +++ b/libcontainer/errors.go @@ -0,0 +1,9 @@ +package libcontainer + +import ( + "errors" 
+) + +var ( + ErrInvalidPid = errors.New("no ns pid found") +) diff --git a/libcontainer/namespaces/calls_linux.go b/libcontainer/namespaces/calls_linux.go new file mode 100644 index 0000000..793e940 --- /dev/null +++ b/libcontainer/namespaces/calls_linux.go @@ -0,0 +1,164 @@ +package namespaces + +import ( + "fmt" + "os" + "syscall" + "unsafe" +) + +const ( + TIOCGPTN = 0x80045430 + TIOCSPTLCK = 0x40045431 +) + +func chroot(dir string) error { + return syscall.Chroot(dir) +} + +func chdir(dir string) error { + return syscall.Chdir(dir) +} + +func exec(cmd string, args []string, env []string) error { + return syscall.Exec(cmd, args, env) +} + +func fork() (int, error) { + syscall.ForkLock.Lock() + pid, _, err := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) + syscall.ForkLock.Unlock() + if err != 0 { + return -1, err + } + return int(pid), nil +} + +func vfork() (int, error) { + syscall.ForkLock.Lock() + pid, _, err := syscall.Syscall(syscall.SYS_VFORK, 0, 0, 0) + syscall.ForkLock.Unlock() + if err != 0 { + return -1, err + } + return int(pid), nil +} + +func mount(source, target, fstype string, flags uintptr, data string) error { + return syscall.Mount(source, target, fstype, flags, data) +} + +func unmount(target string, flags int) error { + return syscall.Unmount(target, flags) +} + +func pivotroot(newroot, putold string) error { + return syscall.PivotRoot(newroot, putold) +} + +func unshare(flags int) error { + return syscall.Unshare(flags) +} + +func clone(flags uintptr) (int, error) { + syscall.ForkLock.Lock() + pid, _, err := syscall.RawSyscall(syscall.SYS_CLONE, flags, 0, 0) + syscall.ForkLock.Unlock() + if err != 0 { + return -1, err + } + return int(pid), nil +} + +func setns(fd uintptr, flags uintptr) error { + _, _, err := syscall.RawSyscall(SYS_SETNS, fd, flags, 0) + if err != 0 { + return err + } + return nil +} + +func usetCloseOnExec(fd uintptr) error { + if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, fd, syscall.F_SETFD, 0); err != 0 { + return 
err + } + return nil +} + +func setgroups(gids []int) error { + return syscall.Setgroups(gids) +} + +func setresgid(rgid, egid, sgid int) error { + return syscall.Setresgid(rgid, egid, sgid) +} + +func setresuid(ruid, euid, suid int) error { + return syscall.Setresuid(ruid, euid, suid) +} + +func sethostname(name string) error { + return syscall.Sethostname([]byte(name)) +} + +func setsid() (int, error) { + return syscall.Setsid() +} + +func ioctl(fd uintptr, flag, data uintptr) error { + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 { + return err + } + return nil +} + +func openpmtx() (*os.File, error) { + return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) +} + +func unlockpt(f *os.File) error { + var u int + return ioctl(f.Fd(), TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) +} + +func ptsname(f *os.File) (string, error) { + var n int + if err := ioctl(f.Fd(), TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { + return "", err + } + return fmt.Sprintf("/dev/pts/%d", n), nil +} + +func closefd(fd uintptr) error { + return syscall.Close(int(fd)) +} + +func dup2(fd1, fd2 uintptr) error { + return syscall.Dup2(int(fd1), int(fd2)) +} + +func mknod(path string, mode uint32, dev int) error { + return syscall.Mknod(path, mode, dev) +} + +func parentDeathSignal() error { + if _, _, err := syscall.RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0, 0, 0, 0); err != 0 { + return err + } + return nil +} + +func setctty() error { + if _, _, err := syscall.RawSyscall(syscall.SYS_IOCTL, 0, uintptr(syscall.TIOCSCTTY), 0); err != 0 { + return err + } + return nil +} + +func mkfifo(name string, mode uint32) error { + return syscall.Mkfifo(name, mode) +} + +func umask(mask int) int { + return syscall.Umask(mask) +} diff --git a/libcontainer/namespaces/exec.go b/libcontainer/namespaces/exec.go new file mode 100644 index 0000000..893b302 --- /dev/null +++ 
b/libcontainer/namespaces/exec.go @@ -0,0 +1,266 @@ +/* + Higher level convience functions for setting up a container +*/ + +package namespaces + +import ( + "errors" + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/capabilities" + "github.com/dotcloud/docker/pkg/libcontainer/utils" + "io" + "log" + "os" + "path/filepath" + "syscall" +) + +var ( + ErrExistingNetworkNamespace = errors.New("specified both CLONE_NEWNET and an existing network namespace") +) + +// Exec will spawn new namespaces with the specified Container configuration +// in the RootFs path and return the pid of the new containerized process. +// +// If an existing network namespace is specified the container +// will join that namespace. If an existing network namespace is not specified but CLONE_NEWNET is, +// the container will be spawned with a new network namespace with no configuration. Omiting an +// existing network namespace and the CLONE_NEWNET option in the container configuration will allow +// the container to the the host's networking options and configuration. 
+func Exec(container *libcontainer.Container) (pid int, err error) { + // a user cannot pass CLONE_NEWNET and an existing net namespace fd to join + if container.NetNsFd > 0 && container.Namespaces.Contains(libcontainer.CLONE_NEWNET) { + return -1, ErrExistingNetworkNamespace + } + + rootfs, err := resolveRootfs(container) + if err != nil { + return -1, err + } + + master, console, err := createMasterAndConsole() + if err != nil { + return -1, err + } + + logger, err := os.OpenFile("/root/logs", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0755) + if err != nil { + return -1, err + } + log.SetOutput(logger) + + // we need CLONE_VFORK so we can wait on the child + flag := getNamespaceFlags(container.Namespaces) | CLONE_VFORK + + if pid, err = clone(uintptr(flag | SIGCHLD)); err != nil { + return -1, fmt.Errorf("error cloning process: %s", err) + } + + if pid == 0 { + // welcome to your new namespace ;) + // + // any errors encoutered inside the namespace we should write + // out to a log or a pipe to our parent and exit(1) + // because writing to stderr will not work after we close + if err := closeMasterAndStd(master); err != nil { + writeError("close master and std %s", err) + } + slave, err := openTerminal(console, syscall.O_RDWR) + if err != nil { + writeError("open terminal %s", err) + } + if err := dupSlave(slave); err != nil { + writeError("dup2 slave %s", err) + } + + if container.NetNsFd > 0 { + if err := JoinExistingNamespace(container.NetNsFd, libcontainer.CLONE_NEWNET); err != nil { + writeError("join existing net namespace %s", err) + } + } + + if _, err := setsid(); err != nil { + writeError("setsid %s", err) + } + if err := setctty(); err != nil { + writeError("setctty %s", err) + } + if err := parentDeathSignal(); err != nil { + writeError("parent deth signal %s", err) + } + if err := SetupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { + writeError("setup mount namespace %s", err) + } + if err := sethostname(container.ID); err != nil { 
+ writeError("sethostname %s", err) + } + if err := capabilities.DropCapabilities(container); err != nil { + writeError("drop capabilities %s", err) + } + if err := setupUser(container); err != nil { + writeError("setup user %s", err) + } + if container.WorkingDir != "" { + if err := chdir(container.WorkingDir); err != nil { + writeError("chdir to %s %s", container.WorkingDir, err) + } + } + if err := exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + writeError("exec %s", err) + } + panic("unreachable") + } + + go func() { + if _, err := io.Copy(os.Stdout, master); err != nil { + log.Println(err) + } + }() + go func() { + if _, err := io.Copy(master, os.Stdin); err != nil { + log.Println(err) + } + }() + return pid, nil +} + +// ExecIn will spawn a new command inside an existing container's namespaces. The existing container's +// pid and namespace configuration is needed along with the specific capabilities that should +// be dropped once inside the namespace. 
+func ExecIn(container *libcontainer.Container, cmd *libcontainer.Command) (int, error) { + if container.NsPid <= 0 { + return -1, libcontainer.ErrInvalidPid + } + + fds, err := getNsFds(container) + if err != nil { + return -1, err + } + + if container.NetNsFd > 0 { + fds = append(fds, container.NetNsFd) + } + + pid, err := fork() + if err != nil { + for _, fd := range fds { + syscall.Close(int(fd)) + } + return -1, err + } + + if pid == 0 { + for _, fd := range fds { + if fd > 0 { + if err := JoinExistingNamespace(fd, ""); err != nil { + for _, fd := range fds { + syscall.Close(int(fd)) + } + writeError("join existing namespace for %d %s", fd, err) + } + } + syscall.Close(int(fd)) + } + + if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && + container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { + // important: + // + // we need to fork and unshare so that re can remount proc and sys within + // the namespace so the CLONE_NEWPID namespace will take effect + // if we don't fork we would end up unmounting proc and sys for the entire + // namespace + child, err := fork() + if err != nil { + writeError("fork child %s", err) + } + + if child == 0 { + if err := unshare(CLONE_NEWNS); err != nil { + writeError("unshare newns %s", err) + } + if err := remountProc(); err != nil { + writeError("remount proc %s", err) + } + if err := remountSys(); err != nil { + writeError("remount sys %s", err) + } + if err := capabilities.DropCapabilities(container); err != nil { + writeError("drop caps %s", err) + } + if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { + writeError("exec %s", err) + } + panic("unreachable") + } + exit, err := utils.WaitOnPid(child) + if err != nil { + writeError("wait on child %s", err) + } + os.Exit(exit) + } + if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { + writeError("exec %s", err) + } + panic("unreachable") + } + return pid, err +} + +func resolveRootfs(container *libcontainer.Container) (string, error) { 
+ rootfs, err := filepath.Abs(container.RootFs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} + +func createMasterAndConsole() (*os.File, string, error) { + master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) + if err != nil { + return nil, "", err + } + + console, err := ptsname(master) + if err != nil { + return nil, "", err + } + + if err := unlockpt(master); err != nil { + return nil, "", err + } + return master, console, nil +} + +func closeMasterAndStd(master *os.File) error { + closefd(master.Fd()) + closefd(0) + closefd(1) + closefd(2) + + return nil +} + +func dupSlave(slave *os.File) error { + // we close Stdin,etc so our pty slave should have fd 0 + if slave.Fd() != 0 { + return fmt.Errorf("slave fd not 0 %d", slave.Fd()) + } + if err := dup2(slave.Fd(), 1); err != nil { + return err + } + if err := dup2(slave.Fd(), 2); err != nil { + return err + } + return nil +} + +func openTerminal(name string, flag int) (*os.File, error) { + r, e := syscall.Open(name, flag, 0) + if e != nil { + return nil, &os.PathError{"open", name, e} + } + return os.NewFile(uintptr(r), name), nil +} diff --git a/libcontainer/namespaces/linux_x86_64.go b/libcontainer/namespaces/linux_x86_64.go new file mode 100644 index 0000000..ac9a014 --- /dev/null +++ b/libcontainer/namespaces/linux_x86_64.go @@ -0,0 +1,7 @@ +// +build linux,x86_64 +package namespaces + +// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 +const ( + SYS_SETNS = 308 +) diff --git a/libcontainer/namespaces/mount.go b/libcontainer/namespaces/mount.go new file mode 100644 index 0000000..6d867c9 --- /dev/null +++ b/libcontainer/namespaces/mount.go @@ -0,0 +1,207 @@ +package namespaces + +import ( + "fmt" + "log" + "os" + "path/filepath" + "syscall" +) + +var ( + // default mount point options + defaults = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV +) + +func 
SetupNewMountNamespace(rootfs, console string, readonly bool) error { + if err := mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mounting / as slave %s", err) + } + + if err := mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mouting %s as bind %s", rootfs, err) + } + + if readonly { + if err := mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mounting %s as readonly %s", rootfs, err) + } + } + + if err := mountSystem(rootfs); err != nil { + return fmt.Errorf("mount system %s", err) + } + + if err := copyDevNodes(rootfs); err != nil { + return fmt.Errorf("copy dev nodes %s", err) + } + + ptmx := filepath.Join(rootfs, "dev/ptmx") + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Symlink(filepath.Join(rootfs, "pts/ptmx"), ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + + if err := setupDev(rootfs); err != nil { + return err + } + + if err := setupConsole(rootfs, console); err != nil { + return err + } + + if err := chdir(rootfs); err != nil { + return fmt.Errorf("chdir into %s %s", rootfs, err) + } + + if err := mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { + return fmt.Errorf("mount move %s into / %s", rootfs, err) + } + + if err := chroot("."); err != nil { + return fmt.Errorf("chroot . 
%s", err) + } + + if err := chdir("/"); err != nil { + return fmt.Errorf("chdir / %s", err) + } + + umask(0022) + + return nil +} + +func copyDevNodes(rootfs string) error { + umask(0000) + + for _, node := range []string{ + "null", + "zero", + "full", + "random", + "urandom", + "tty", + } { + stat, err := os.Stat(filepath.Join("/dev", node)) + if err != nil { + return err + } + + var ( + dest = filepath.Join(rootfs, "dev", node) + st = stat.Sys().(*syscall.Stat_t) + ) + + log.Printf("copy %s to %s %d\n", node, dest, st.Rdev) + if err := mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { + return fmt.Errorf("copy %s %s", node, err) + } + } + return nil +} + +func setupDev(rootfs string) error { + for _, link := range []struct { + from string + to string + }{ + {"/proc/kcore", "/dev/core"}, + {"/proc/self/fd", "/dev/fd"}, + {"/proc/self/fd/0", "/dev/stdin"}, + {"/proc/self/fd/1", "/dev/stdout"}, + {"/proc/self/fd/2", "/dev/stderr"}, + } { + dest := filepath.Join(rootfs, link.to) + if err := os.Remove(dest); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("remove %s %s", dest, err) + } + if err := os.Symlink(link.from, dest); err != nil { + return fmt.Errorf("symlink %s %s", dest, err) + } + } + return nil +} + +func setupConsole(rootfs, console string) error { + umask(0000) + + stat, err := os.Stat(console) + if err != nil { + return fmt.Errorf("stat console %s %s", console, err) + } + st := stat.Sys().(*syscall.Stat_t) + + dest := filepath.Join(rootfs, "dev/console") + if err := os.Remove(dest); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("remove %s %s", dest, err) + } + + if err := os.Chmod(console, 0600); err != nil { + return err + } + if err := os.Chown(console, 0, 0); err != nil { + return err + } + + if err := mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { + return fmt.Errorf("mknod %s %s", dest, err) + } + + if err := mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { + return 
fmt.Errorf("bind %s to %s %s", console, dest, err) + } + return nil +} + +// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts +// inside the mount namespace +func mountSystem(rootfs string) error { + mounts := []struct { + source string + path string + device string + flags int + data string + }{ + {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaults}, + {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaults}, + {source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: "mode=755"}, + {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaults, data: "mode=1777"}, + {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"}, + {source: "tmpfs", path: filepath.Join(rootfs, "run"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_STRICTATIME, data: "mode=755"}, + } + for _, m := range mounts { + if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { + return fmt.Errorf("mkdirall %s %s", m.path, err) + } + if err := mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { + return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err) + } + } + return nil +} + +func remountProc() error { + if err := unmount("/proc", syscall.MNT_DETACH); err != nil { + return err + } + if err := mount("proc", "/proc", "proc", uintptr(defaults), ""); err != nil { + return err + } + return nil +} + +func remountSys() error { + if err := unmount("/sys", syscall.MNT_DETACH); err != nil { + if err != syscall.EINVAL { + return err + } + } else { + if err := mount("sysfs", "/sys", "sysfs", uintptr(defaults), ""); err != nil { + return err + } + } + return nil +} diff --git a/libcontainer/namespaces/namespaces.go 
b/libcontainer/namespaces/namespaces.go new file mode 100644 index 0000000..2a50847 --- /dev/null +++ b/libcontainer/namespaces/namespaces.go @@ -0,0 +1,70 @@ +/* + TODO + pivot root + cgroups + more mount stuff that I probably am forgetting + apparmor +*/ + +package namespaces + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/utils" + "os" + "path/filepath" + "syscall" +) + +// CreateNewNamespace creates a new namespace and binds it's fd to the specified path +func CreateNewNamespace(namespace libcontainer.Namespace, bindTo string) error { + var ( + flag = namespaceMap[namespace] + name = namespaceFileMap[namespace] + nspath = filepath.Join("/proc/self/ns", name) + ) + // TODO: perform validation on name and flag + + pid, err := fork() + if err != nil { + return err + } + + if pid == 0 { + if err := unshare(flag); err != nil { + writeError("unshare %s", err) + } + if err := mount(nspath, bindTo, "none", syscall.MS_BIND, ""); err != nil { + writeError("bind mount %s", err) + } + os.Exit(0) + } + exit, err := utils.WaitOnPid(pid) + if err != nil { + return err + } + if exit != 0 { + return fmt.Errorf("exit status %d", exit) + } + return err +} + +// JoinExistingNamespace uses the fd of an existing linux namespace and +// has the current process join that namespace or the spacespace specified by ns +func JoinExistingNamespace(fd uintptr, ns libcontainer.Namespace) error { + flag := namespaceMap[ns] + if err := setns(fd, uintptr(flag)); err != nil { + return err + } + return nil +} + +// getNamespaceFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare, and setns +func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { + for _, ns := range namespaces { + flag |= namespaceMap[ns] + } + return +} diff --git a/libcontainer/namespaces/ns_linux.go b/libcontainer/namespaces/ns_linux.go new file mode 100644 index 0000000..b0e5119 --- /dev/null +++ 
b/libcontainer/namespaces/ns_linux.go @@ -0,0 +1,35 @@ +package namespaces + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" +) + +const ( + SIGCHLD = 0x14 + CLONE_VFORK = 0x00004000 + CLONE_NEWNS = 0x00020000 + CLONE_NEWUTS = 0x04000000 + CLONE_NEWIPC = 0x08000000 + CLONE_NEWUSER = 0x10000000 + CLONE_NEWPID = 0x20000000 + CLONE_NEWNET = 0x40000000 +) + +var namespaceMap = map[libcontainer.Namespace]int{ + "": 0, + libcontainer.CLONE_NEWNS: CLONE_NEWNS, + libcontainer.CLONE_NEWUTS: CLONE_NEWUTS, + libcontainer.CLONE_NEWIPC: CLONE_NEWIPC, + libcontainer.CLONE_NEWUSER: CLONE_NEWUSER, + libcontainer.CLONE_NEWPID: CLONE_NEWPID, + libcontainer.CLONE_NEWNET: CLONE_NEWNET, +} + +var namespaceFileMap = map[libcontainer.Namespace]string{ + libcontainer.CLONE_NEWNS: "mnt", + libcontainer.CLONE_NEWUTS: "uts", + libcontainer.CLONE_NEWIPC: "ipc", + libcontainer.CLONE_NEWUSER: "user", + libcontainer.CLONE_NEWPID: "pid", + libcontainer.CLONE_NEWNET: "net", +} diff --git a/libcontainer/namespaces/utils.go b/libcontainer/namespaces/utils.go new file mode 100644 index 0000000..438d896 --- /dev/null +++ b/libcontainer/namespaces/utils.go @@ -0,0 +1,108 @@ +package namespaces + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "os" + "path/filepath" + "strconv" + "strings" + "syscall" +) + +func addEnvIfNotSet(container *libcontainer.Container, key, value string) { + jv := fmt.Sprintf("%s=%s", key, value) + if len(container.Command.Env) == 0 { + container.Command.Env = []string{jv} + return + } + + for _, v := range container.Command.Env { + parts := strings.Split(v, "=") + if parts[0] == key { + return + } + } + container.Command.Env = append(container.Command.Env, jv) +} + +// print and error to stderr and exit(1) +func writeError(format string, v ...interface{}) { + fmt.Fprintf(os.Stderr, format, v...) + os.Exit(1) +} + +// getNsFds inspects the container's namespace configuration and opens the fds to +// each of the namespaces. 
+func getNsFds(container *libcontainer.Container) ([]uintptr, error) { + var ( + namespaces = []string{} + fds = []uintptr{} + ) + + for _, ns := range container.Namespaces { + namespaces = append(namespaces, namespaceFileMap[ns]) + } + + for _, ns := range namespaces { + fd, err := getNsFd(container.NsPid, ns) + if err != nil { + for _, fd = range fds { + syscall.Close(int(fd)) + } + return nil, err + } + fds = append(fds, fd) + } + return fds, nil +} + +// getNsFd returns the fd for a specific pid and namespace option +func getNsFd(pid int, ns string) (uintptr, error) { + nspath := filepath.Join("/proc", strconv.Itoa(pid), "ns", ns) + // OpenFile adds closOnExec + f, err := os.OpenFile(nspath, os.O_RDONLY, 0666) + if err != nil { + return 0, err + } + return f.Fd(), nil +} + +// setupEnvironment adds additional environment variables to the container's +// Command such as USER, LOGNAME, container, and TERM +func setupEnvironment(container *libcontainer.Container) { + addEnvIfNotSet(container, "container", "docker") + // TODO: check if pty + addEnvIfNotSet(container, "TERM", "xterm") + // TODO: get username from container + addEnvIfNotSet(container, "USER", "root") + addEnvIfNotSet(container, "LOGNAME", "root") +} + +func setupUser(container *libcontainer.Container) error { + // TODO: honor user passed on container + if err := setgroups(nil); err != nil { + return err + } + if err := setresgid(0, 0, 0); err != nil { + return err + } + if err := setresuid(0, 0, 0); err != nil { + return err + } + return nil +} + +func getMasterAndConsole(container *libcontainer.Container) (string, *os.File, error) { + master, err := openpmtx() + if err != nil { + return "", nil, err + } + + console, err := ptsname(master) + if err != nil { + master.Close() + return "", nil, err + } + return console, master, nil +} diff --git a/libcontainer/network/network.go b/libcontainer/network/network.go new file mode 100644 index 0000000..31c5d32 --- /dev/null +++ 
b/libcontainer/network/network.go @@ -0,0 +1,104 @@ +package network + +import ( + "errors" + "github.com/dotcloud/docker/pkg/netlink" + "net" +) + +var ( + ErrNoDefaultRoute = errors.New("no default network route found") +) + +func InterfaceUp(name string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkLinkUp(iface) +} + +func InterfaceDown(name string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkLinkDown(iface) +} + +func ChangeInterfaceName(old, newName string) error { + iface, err := net.InterfaceByName(old) + if err != nil { + return err + } + return netlink.NetworkChangeName(iface, newName) +} + +func CreateVethPair(name1, name2 string) error { + return netlink.NetworkCreateVethPair(name1, name2) +} + +func SetInterfaceInNamespacePid(name string, nsPid int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetNsPid(iface, nsPid) +} + +func SetInterfaceInNamespaceFd(name string, fd int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetNsFd(iface, fd) +} + +func SetInterfaceMaster(name, master string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + masterIface, err := net.InterfaceByName(master) + if err != nil { + return err + } + return netlink.NetworkSetMaster(iface, masterIface) +} + +func SetDefaultGateway(ip string) error { + return netlink.AddDefaultGw(net.ParseIP(ip)) +} + +func SetInterfaceIp(name string, rawIp string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + ip, ipNet, err := net.ParseCIDR(rawIp) + if err != nil { + return err + } + return netlink.NetworkLinkAddIp(iface, ip, ipNet) +} + +func SetMtu(name string, mtu int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return 
netlink.NetworkSetMTU(iface, mtu) +} + +func GetDefaultMtu() (int, error) { + routes, err := netlink.NetworkGetRoutes() + if err != nil { + return -1, err + } + for _, r := range routes { + if r.Default { + return r.Iface.MTU, nil + } + } + return -1, ErrNoDefaultRoute +} diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go new file mode 100644 index 0000000..dc207b3 --- /dev/null +++ b/libcontainer/network/veth.go @@ -0,0 +1,85 @@ +package network + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/namespaces" + "os" + "syscall" +) + +// SetupVeth sets up an existing network namespace with the specified +// network configuration. +func SetupVeth(config *libcontainer.Network) error { + if err := InterfaceDown(config.TempVethName); err != nil { + return fmt.Errorf("interface down %s %s", config.TempVethName, err) + } + if err := ChangeInterfaceName(config.TempVethName, "eth0"); err != nil { + return fmt.Errorf("change %s to eth0 %s", config.TempVethName, err) + } + if err := SetInterfaceIp("eth0", config.IP); err != nil { + return fmt.Errorf("set eth0 ip %s", err) + } + + if err := SetMtu("eth0", config.Mtu); err != nil { + return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) + } + if err := InterfaceUp("eth0"); err != nil { + return fmt.Errorf("eth0 up %s", err) + } + + if err := SetMtu("lo", config.Mtu); err != nil { + return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) + } + if err := InterfaceUp("lo"); err != nil { + return fmt.Errorf("lo up %s", err) + } + + if config.Gateway != "" { + if err := SetDefaultGateway(config.Gateway); err != nil { + return fmt.Errorf("set gateway to %s %s", config.Gateway, err) + } + } + return nil +} + +// SetupNamespaceMountDir prepares a new root for use as a mount +// source for bind mounting namespace fd to an outside path +func SetupNamespaceMountDir(root string) error { + if err := os.MkdirAll(root, 0666); err != nil { + return err 
+ } + // make sure mounts are not unmounted by other mnt namespaces + if err := syscall.Mount("", root, "none", syscall.MS_SHARED|syscall.MS_REC, ""); err != nil && err != syscall.EINVAL { + return err + } + if err := syscall.Mount(root, root, "none", syscall.MS_BIND, ""); err != nil { + return err + } + return nil +} + +// CreateNetworkNamespace creates a new network namespace and binds it's fd +// at the binding path +func CreateNetworkNamespace(bindingPath string) error { + f, err := os.OpenFile(bindingPath, os.O_RDONLY|os.O_CREATE|os.O_EXCL, 0) + if err != nil { + return err + } + f.Close() + + if err := namespaces.CreateNewNamespace(libcontainer.CLONE_NEWNET, bindingPath); err != nil { + return err + } + return nil +} + +// DeleteNetworkNamespace unmounts the binding path and removes the +// file so that no references to the fd are present and the network +// namespace is automatically cleaned up +func DeleteNetworkNamespace(bindingPath string) error { + if err := syscall.Unmount(bindingPath, 0); err != nil { + return err + } + return os.Remove(bindingPath) +} diff --git a/libcontainer/privileged.json b/libcontainer/privileged.json new file mode 100644 index 0000000..be877ad --- /dev/null +++ b/libcontainer/privileged.json @@ -0,0 +1,22 @@ +{ + "id": "koye", + "namespace_pid": 3745, + "command": { + "args": [ + "/usr/lib/systemd/systemd" + ], + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=" + ] + }, + "rootfs": "/root/main/mycontainer", + "namespaces": [ + "NEWIPC", + "NEWNS", + "NEWPID", + "NEWUTS" + ] +} diff --git a/libcontainer/types.go b/libcontainer/types.go new file mode 100644 index 0000000..db1c3b9 --- /dev/null +++ b/libcontainer/types.go @@ -0,0 +1,49 @@ +package libcontainer + +type Namespace string +type Namespaces []Namespace + +func (n Namespaces) Contains(ns Namespace) bool { + for _, nns := range n { + if nns == ns { + return true + } + } + return false +} + +type Capability 
string +type Capabilities []Capability + +func (c Capabilities) Contains(capp Capability) bool { + for _, cc := range c { + if cc == capp { + return true + } + } + return false +} + +const ( + CAP_SETPCAP Capability = "SETPCAP" + CAP_SYS_MODULE Capability = "SYS_MODULE" + CAP_SYS_RAWIO Capability = "SYS_RAWIO" + CAP_SYS_PACCT Capability = "SYS_PACCT" + CAP_SYS_ADMIN Capability = "SYS_ADMIN" + CAP_SYS_NICE Capability = "SYS_NICE" + CAP_SYS_RESOURCE Capability = "SYS_RESOURCE" + CAP_SYS_TIME Capability = "SYS_TIME" + CAP_SYS_TTY_CONFIG Capability = "SYS_TTY_CONFIG" + CAP_MKNOD Capability = "MKNOD" + CAP_AUDIT_WRITE Capability = "AUDIT_WRITE" + CAP_AUDIT_CONTROL Capability = "AUDIT_CONTROL" + CAP_MAC_OVERRIDE Capability = "MAC_OVERRIDE" + CAP_MAC_ADMIN Capability = "MAC_ADMIN" + + CLONE_NEWNS Namespace = "NEWNS" // mount + CLONE_NEWUTS Namespace = "NEWUTS" // utsname + CLONE_NEWIPC Namespace = "NEWIPC" // ipc + CLONE_NEWUSER Namespace = "NEWUSER" // user + CLONE_NEWPID Namespace = "NEWPID" // pid + CLONE_NEWNET Namespace = "NEWNET" // network +) diff --git a/libcontainer/ubuntu.json b/libcontainer/ubuntu.json new file mode 100644 index 0000000..0a450ae --- /dev/null +++ b/libcontainer/ubuntu.json @@ -0,0 +1,22 @@ +{ + "id": "koye", + "namespace_pid": 3745, + "command": { + "args": [ + "/sbin/init" + ], + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm" + ] + }, + "rootfs": "/var/lib/docker/btrfs/subvolumes/7c0f15df1ad2e2fe04d7a6e079aec17406e9465a6a37dd16cb0dd754fc0167b3", + "namespaces": [ + "NEWIPC", + "NEWNS", + "NEWPID", + "NEWUTS" + ] +} diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go new file mode 100644 index 0000000..7289fec --- /dev/null +++ b/libcontainer/utils/utils.go @@ -0,0 +1,33 @@ +package utils + +import ( + "crypto/rand" + "encoding/hex" + "io" + "os" + "syscall" +) + +func WaitOnPid(pid int) (exitcode int, err error) { + child, err := os.FindProcess(pid) + if 
err != nil { + return -1, err + } + state, err := child.Wait() + if err != nil { + return -1, err + } + return getExitCode(state), nil +} + +func getExitCode(state *os.ProcessState) int { + return state.Sys().(syscall.WaitStatus).ExitStatus() +} + +func GenerateRandomName(size int) (string, error) { + id := make([]byte, size) + if _, err := io.ReadFull(rand.Reader, id); err != nil { + return "", err + } + return hex.EncodeToString(id), nil +} From b0cdb601b9728975e697a4762b886acc58342b66 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 18 Feb 2014 17:52:06 -0800 Subject: [PATCH 025/117] Make separate nsinit pkg for a dockerinit like init Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/container.go | 2 + libcontainer/namespaces/calls_linux.go | 80 ++++---- libcontainer/namespaces/exec.go | 270 ++++++++----------------- libcontainer/namespaces/mount.go | 36 ++-- libcontainer/namespaces/namespaces.go | 40 +--- libcontainer/namespaces/nsinit/init.go | 140 +++++++++++++ libcontainer/namespaces/utils.go | 24 +-- 7 files changed, 285 insertions(+), 307 deletions(-) create mode 100644 libcontainer/namespaces/nsinit/init.go diff --git a/libcontainer/container.go b/libcontainer/container.go index b77890f..dd5e728 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -11,6 +11,8 @@ type Container struct { WorkingDir string `json:"working_dir,omitempty"` Namespaces Namespaces `json:"namespaces,omitempty"` Capabilities Capabilities `json:"capabilities,omitempty"` + Master uintptr `json:"master"` + Console string `json:"console"` } type Command struct { diff --git a/libcontainer/namespaces/calls_linux.go b/libcontainer/namespaces/calls_linux.go index 793e940..f006d56 100644 --- a/libcontainer/namespaces/calls_linux.go +++ b/libcontainer/namespaces/calls_linux.go @@ -12,19 +12,19 @@ const ( TIOCSPTLCK = 0x40045431 ) -func chroot(dir string) error { +func Chroot(dir string) error { return syscall.Chroot(dir) } -func 
chdir(dir string) error { +func Chdir(dir string) error { return syscall.Chdir(dir) } -func exec(cmd string, args []string, env []string) error { +func Exec(cmd string, args []string, env []string) error { return syscall.Exec(cmd, args, env) } -func fork() (int, error) { +func Fork() (int, error) { syscall.ForkLock.Lock() pid, _, err := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) syscall.ForkLock.Unlock() @@ -34,33 +34,23 @@ func fork() (int, error) { return int(pid), nil } -func vfork() (int, error) { - syscall.ForkLock.Lock() - pid, _, err := syscall.Syscall(syscall.SYS_VFORK, 0, 0, 0) - syscall.ForkLock.Unlock() - if err != 0 { - return -1, err - } - return int(pid), nil -} - -func mount(source, target, fstype string, flags uintptr, data string) error { +func Mount(source, target, fstype string, flags uintptr, data string) error { return syscall.Mount(source, target, fstype, flags, data) } -func unmount(target string, flags int) error { +func Unmount(target string, flags int) error { return syscall.Unmount(target, flags) } -func pivotroot(newroot, putold string) error { +func Pivotroot(newroot, putold string) error { return syscall.PivotRoot(newroot, putold) } -func unshare(flags int) error { +func Unshare(flags int) error { return syscall.Unshare(flags) } -func clone(flags uintptr) (int, error) { +func Clone(flags uintptr) (int, error) { syscall.ForkLock.Lock() pid, _, err := syscall.RawSyscall(syscall.SYS_CLONE, flags, 0, 0) syscall.ForkLock.Unlock() @@ -70,7 +60,7 @@ func clone(flags uintptr) (int, error) { return int(pid), nil } -func setns(fd uintptr, flags uintptr) error { +func Setns(fd uintptr, flags uintptr) error { _, _, err := syscall.RawSyscall(SYS_SETNS, fd, flags, 0) if err != 0 { return err @@ -78,87 +68,87 @@ func setns(fd uintptr, flags uintptr) error { return nil } -func usetCloseOnExec(fd uintptr) error { +func UsetCloseOnExec(fd uintptr) error { if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, fd, syscall.F_SETFD, 0); err != 0 { return err } 
return nil } -func setgroups(gids []int) error { +func Setgroups(gids []int) error { return syscall.Setgroups(gids) } -func setresgid(rgid, egid, sgid int) error { +func Setresgid(rgid, egid, sgid int) error { return syscall.Setresgid(rgid, egid, sgid) } -func setresuid(ruid, euid, suid int) error { +func Setresuid(ruid, euid, suid int) error { return syscall.Setresuid(ruid, euid, suid) } -func sethostname(name string) error { +func Sethostname(name string) error { return syscall.Sethostname([]byte(name)) } -func setsid() (int, error) { +func Setsid() (int, error) { return syscall.Setsid() } -func ioctl(fd uintptr, flag, data uintptr) error { +func Unlockpt(f *os.File) error { + var u int + return Ioctl(f.Fd(), TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) +} + +func Ioctl(fd uintptr, flag, data uintptr) error { if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 { return err } return nil } -func openpmtx() (*os.File, error) { - return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) -} - -func unlockpt(f *os.File) error { - var u int - return ioctl(f.Fd(), TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) -} - -func ptsname(f *os.File) (string, error) { +func Ptsname(f *os.File) (string, error) { var n int - if err := ioctl(f.Fd(), TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { + if err := Ioctl(f.Fd(), TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { return "", err } return fmt.Sprintf("/dev/pts/%d", n), nil } -func closefd(fd uintptr) error { +func Openpmtx() (*os.File, error) { + return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) +} + +func Closefd(fd uintptr) error { return syscall.Close(int(fd)) } -func dup2(fd1, fd2 uintptr) error { +func Dup2(fd1, fd2 uintptr) error { return syscall.Dup2(int(fd1), int(fd2)) } -func mknod(path string, mode uint32, dev int) error { +func Mknod(path string, mode uint32, dev int) error { return syscall.Mknod(path, mode, dev) } -func 
parentDeathSignal() error { +func ParentDeathSignal() error { if _, _, err := syscall.RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0, 0, 0, 0); err != 0 { return err } return nil } -func setctty() error { +func Setctty() error { if _, _, err := syscall.RawSyscall(syscall.SYS_IOCTL, 0, uintptr(syscall.TIOCSCTTY), 0); err != 0 { return err } return nil } -func mkfifo(name string, mode uint32) error { +func Mkfifo(name string, mode uint32) error { return syscall.Mkfifo(name, mode) } -func umask(mask int) int { +func Umask(mask int) int { return syscall.Umask(mask) } diff --git a/libcontainer/namespaces/exec.go b/libcontainer/namespaces/exec.go index 893b302..0077a0b 100644 --- a/libcontainer/namespaces/exec.go +++ b/libcontainer/namespaces/exec.go @@ -8,12 +8,10 @@ import ( "errors" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/libcontainer/capabilities" - "github.com/dotcloud/docker/pkg/libcontainer/utils" "io" "log" "os" - "path/filepath" + "os/exec" "syscall" ) @@ -29,89 +27,31 @@ var ( // the container will be spawned with a new network namespace with no configuration. Omiting an // existing network namespace and the CLONE_NEWNET option in the container configuration will allow // the container to the the host's networking options and configuration. 
-func Exec(container *libcontainer.Container) (pid int, err error) { +func ExecContainer(container *libcontainer.Container) (pid int, err error) { // a user cannot pass CLONE_NEWNET and an existing net namespace fd to join if container.NetNsFd > 0 && container.Namespaces.Contains(libcontainer.CLONE_NEWNET) { return -1, ErrExistingNetworkNamespace } - rootfs, err := resolveRootfs(container) - if err != nil { - return -1, err - } - master, console, err := createMasterAndConsole() if err != nil { return -1, err } - - logger, err := os.OpenFile("/root/logs", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0755) - if err != nil { - return -1, err - } - log.SetOutput(logger) + container.Console = console + container.Master = master.Fd() // we need CLONE_VFORK so we can wait on the child - flag := getNamespaceFlags(container.Namespaces) | CLONE_VFORK + flag := uintptr(getNamespaceFlags(container.Namespaces) | CLONE_VFORK) - if pid, err = clone(uintptr(flag | SIGCHLD)); err != nil { - return -1, fmt.Errorf("error cloning process: %s", err) - } - - if pid == 0 { - // welcome to your new namespace ;) - // - // any errors encoutered inside the namespace we should write - // out to a log or a pipe to our parent and exit(1) - // because writing to stderr will not work after we close - if err := closeMasterAndStd(master); err != nil { - writeError("close master and std %s", err) - } - slave, err := openTerminal(console, syscall.O_RDWR) - if err != nil { - writeError("open terminal %s", err) - } - if err := dupSlave(slave); err != nil { - writeError("dup2 slave %s", err) - } - - if container.NetNsFd > 0 { - if err := JoinExistingNamespace(container.NetNsFd, libcontainer.CLONE_NEWNET); err != nil { - writeError("join existing net namespace %s", err) - } - } - - if _, err := setsid(); err != nil { - writeError("setsid %s", err) - } - if err := setctty(); err != nil { - writeError("setctty %s", err) - } - if err := parentDeathSignal(); err != nil { - writeError("parent deth signal %s", err) - } 
- if err := SetupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { - writeError("setup mount namespace %s", err) - } - if err := sethostname(container.ID); err != nil { - writeError("sethostname %s", err) - } - if err := capabilities.DropCapabilities(container); err != nil { - writeError("drop capabilities %s", err) - } - if err := setupUser(container); err != nil { - writeError("setup user %s", err) - } - if container.WorkingDir != "" { - if err := chdir(container.WorkingDir); err != nil { - writeError("chdir to %s %s", container.WorkingDir, err) - } - } - if err := exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { - writeError("exec %s", err) - } - panic("unreachable") + command := exec.Command("/.nsinit") + command.SysProcAttr = &syscall.SysProcAttr{} + command.SysProcAttr.Cloneflags = flag + command.SysProcAttr.Setctty = true + + if err := command.Start(); err != nil { + return -1, err } + pid = command.Process.Pid go func() { if _, err := io.Copy(os.Stdout, master); err != nil { @@ -130,91 +70,86 @@ func Exec(container *libcontainer.Container) (pid int, err error) { // pid and namespace configuration is needed along with the specific capabilities that should // be dropped once inside the namespace. 
func ExecIn(container *libcontainer.Container, cmd *libcontainer.Command) (int, error) { - if container.NsPid <= 0 { - return -1, libcontainer.ErrInvalidPid - } - - fds, err := getNsFds(container) - if err != nil { - return -1, err - } - - if container.NetNsFd > 0 { - fds = append(fds, container.NetNsFd) - } - - pid, err := fork() - if err != nil { - for _, fd := range fds { - syscall.Close(int(fd)) + return -1, fmt.Errorf("not implemented") + /* + if container.NsPid <= 0 { + return -1, libcontainer.ErrInvalidPid } - return -1, err - } - if pid == 0 { - for _, fd := range fds { - if fd > 0 { - if err := JoinExistingNamespace(fd, ""); err != nil { - for _, fd := range fds { - syscall.Close(int(fd)) + fds, err := getNsFds(container) + if err != nil { + return -1, err + } + + if container.NetNsFd > 0 { + fds = append(fds, container.NetNsFd) + } + + pid, err := fork() + if err != nil { + for _, fd := range fds { + syscall.Close(int(fd)) + } + return -1, err + } + + if pid == 0 { + for _, fd := range fds { + if fd > 0 { + if err := JoinExistingNamespace(fd, ""); err != nil { + for _, fd := range fds { + syscall.Close(int(fd)) + } + writeError("join existing namespace for %d %s", fd, err) } - writeError("join existing namespace for %d %s", fd, err) } - } - syscall.Close(int(fd)) - } - - if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && - container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { - // important: - // - // we need to fork and unshare so that re can remount proc and sys within - // the namespace so the CLONE_NEWPID namespace will take effect - // if we don't fork we would end up unmounting proc and sys for the entire - // namespace - child, err := fork() - if err != nil { - writeError("fork child %s", err) + syscall.Close(int(fd)) } - if child == 0 { - if err := unshare(CLONE_NEWNS); err != nil { - writeError("unshare newns %s", err) + if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && + 
container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { + // important: + // + // we need to fork and unshare so that re can remount proc and sys within + // the namespace so the CLONE_NEWPID namespace will take effect + // if we don't fork we would end up unmounting proc and sys for the entire + // namespace + child, err := fork() + if err != nil { + writeError("fork child %s", err) } - if err := remountProc(); err != nil { - writeError("remount proc %s", err) - } - if err := remountSys(); err != nil { - writeError("remount sys %s", err) - } - if err := capabilities.DropCapabilities(container); err != nil { - writeError("drop caps %s", err) - } - if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { - writeError("exec %s", err) - } - panic("unreachable") - } - exit, err := utils.WaitOnPid(child) - if err != nil { - writeError("wait on child %s", err) - } - os.Exit(exit) - } - if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { - writeError("exec %s", err) - } - panic("unreachable") - } - return pid, err -} -func resolveRootfs(container *libcontainer.Container) (string, error) { - rootfs, err := filepath.Abs(container.RootFs) - if err != nil { - return "", err - } - return filepath.EvalSymlinks(rootfs) + if child == 0 { + if err := unshare(CLONE_NEWNS); err != nil { + writeError("unshare newns %s", err) + } + if err := remountProc(); err != nil { + writeError("remount proc %s", err) + } + if err := remountSys(); err != nil { + writeError("remount sys %s", err) + } + if err := capabilities.DropCapabilities(container); err != nil { + writeError("drop caps %s", err) + } + if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { + writeError("exec %s", err) + } + panic("unreachable") + } + exit, err := utils.WaitOnPid(child) + if err != nil { + writeError("wait on child %s", err) + } + os.Exit(exit) + } + if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { + writeError("exec %s", err) + } + panic("unreachable") + } + 
return pid, err + */ } func createMasterAndConsole() (*os.File, string, error) { @@ -223,44 +158,13 @@ func createMasterAndConsole() (*os.File, string, error) { return nil, "", err } - console, err := ptsname(master) + console, err := Ptsname(master) if err != nil { return nil, "", err } - if err := unlockpt(master); err != nil { + if err := Unlockpt(master); err != nil { return nil, "", err } return master, console, nil } - -func closeMasterAndStd(master *os.File) error { - closefd(master.Fd()) - closefd(0) - closefd(1) - closefd(2) - - return nil -} - -func dupSlave(slave *os.File) error { - // we close Stdin,etc so our pty slave should have fd 0 - if slave.Fd() != 0 { - return fmt.Errorf("slave fd not 0 %d", slave.Fd()) - } - if err := dup2(slave.Fd(), 1); err != nil { - return err - } - if err := dup2(slave.Fd(), 2); err != nil { - return err - } - return nil -} - -func openTerminal(name string, flag int) (*os.File, error) { - r, e := syscall.Open(name, flag, 0) - if e != nil { - return nil, &os.PathError{"open", name, e} - } - return os.NewFile(uintptr(r), name), nil -} diff --git a/libcontainer/namespaces/mount.go b/libcontainer/namespaces/mount.go index 6d867c9..8e7c54b 100644 --- a/libcontainer/namespaces/mount.go +++ b/libcontainer/namespaces/mount.go @@ -14,16 +14,16 @@ var ( ) func SetupNewMountNamespace(rootfs, console string, readonly bool) error { - if err := mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + if err := Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting / as slave %s", err) } - if err := mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + if err := Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mouting %s as bind %s", rootfs, err) } if readonly { - if err := mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { + if err := 
Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting %s as readonly %s", rootfs, err) } } @@ -52,29 +52,29 @@ func SetupNewMountNamespace(rootfs, console string, readonly bool) error { return err } - if err := chdir(rootfs); err != nil { + if err := Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) } - if err := mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { + if err := Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { return fmt.Errorf("mount move %s into / %s", rootfs, err) } - if err := chroot("."); err != nil { + if err := Chroot("."); err != nil { return fmt.Errorf("chroot . %s", err) } - if err := chdir("/"); err != nil { + if err := Chdir("/"); err != nil { return fmt.Errorf("chdir / %s", err) } - umask(0022) + Umask(0022) return nil } func copyDevNodes(rootfs string) error { - umask(0000) + Umask(0000) for _, node := range []string{ "null", @@ -95,7 +95,7 @@ func copyDevNodes(rootfs string) error { ) log.Printf("copy %s to %s %d\n", node, dest, st.Rdev) - if err := mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { + if err := Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { return fmt.Errorf("copy %s %s", node, err) } } @@ -125,7 +125,7 @@ func setupDev(rootfs string) error { } func setupConsole(rootfs, console string) error { - umask(0000) + Umask(0000) stat, err := os.Stat(console) if err != nil { @@ -145,11 +145,11 @@ func setupConsole(rootfs, console string) error { return err } - if err := mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { + if err := Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { return fmt.Errorf("mknod %s %s", dest, err) } - if err := mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { + if err := Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { return fmt.Errorf("bind %s to %s %s", console, dest, err) 
} return nil @@ -176,7 +176,7 @@ func mountSystem(rootfs string) error { if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { return fmt.Errorf("mkdirall %s %s", m.path, err) } - if err := mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { + if err := Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err) } } @@ -184,22 +184,22 @@ func mountSystem(rootfs string) error { } func remountProc() error { - if err := unmount("/proc", syscall.MNT_DETACH); err != nil { + if err := Unmount("/proc", syscall.MNT_DETACH); err != nil { return err } - if err := mount("proc", "/proc", "proc", uintptr(defaults), ""); err != nil { + if err := Mount("proc", "/proc", "proc", uintptr(defaults), ""); err != nil { return err } return nil } func remountSys() error { - if err := unmount("/sys", syscall.MNT_DETACH); err != nil { + if err := Unmount("/sys", syscall.MNT_DETACH); err != nil { if err != syscall.EINVAL { return err } } else { - if err := mount("sysfs", "/sys", "sysfs", uintptr(defaults), ""); err != nil { + if err := Mount("sysfs", "/sys", "sysfs", uintptr(defaults), ""); err != nil { return err } } diff --git a/libcontainer/namespaces/namespaces.go b/libcontainer/namespaces/namespaces.go index 2a50847..05ef0ac 100644 --- a/libcontainer/namespaces/namespaces.go +++ b/libcontainer/namespaces/namespaces.go @@ -9,52 +9,14 @@ package namespaces import ( - "fmt" "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/libcontainer/utils" - "os" - "path/filepath" - "syscall" ) -// CreateNewNamespace creates a new namespace and binds it's fd to the specified path -func CreateNewNamespace(namespace libcontainer.Namespace, bindTo string) error { - var ( - flag = namespaceMap[namespace] - name = namespaceFileMap[namespace] - nspath = filepath.Join("/proc/self/ns", name) - ) - // TODO: perform validation on name and flag - - pid, err := fork() 
- if err != nil { - return err - } - - if pid == 0 { - if err := unshare(flag); err != nil { - writeError("unshare %s", err) - } - if err := mount(nspath, bindTo, "none", syscall.MS_BIND, ""); err != nil { - writeError("bind mount %s", err) - } - os.Exit(0) - } - exit, err := utils.WaitOnPid(pid) - if err != nil { - return err - } - if exit != 0 { - return fmt.Errorf("exit status %d", exit) - } - return err -} - // JoinExistingNamespace uses the fd of an existing linux namespace and // has the current process join that namespace or the spacespace specified by ns func JoinExistingNamespace(fd uintptr, ns libcontainer.Namespace) error { flag := namespaceMap[ns] - if err := setns(fd, uintptr(flag)); err != nil { + if err := Setns(fd, uintptr(flag)); err != nil { return err } return nil diff --git a/libcontainer/namespaces/nsinit/init.go b/libcontainer/namespaces/nsinit/init.go new file mode 100644 index 0000000..9a75636 --- /dev/null +++ b/libcontainer/namespaces/nsinit/init.go @@ -0,0 +1,140 @@ +package nsinit + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/capabilities" + "github.com/dotcloud/docker/pkg/libcontainer/namespaces" + "log" + "os" + "path/filepath" + "syscall" +) + +// InitNamespace should be run inside an existing namespace to setup +// common mounts, drop capabilities, and setup network interfaces +func InitNamespace(container *libcontainer.Container) error { + rootfs, err := resolveRootfs(container) + if err != nil { + return err + } + + // any errors encoutered inside the namespace we should write + // out to a log or a pipe to our parent and exit(1) + // because writing to stderr will not work after we close + if err := closeMasterAndStd(container.Master); err != nil { + log.Fatalf("close master and std %s", err) + return err + } + + slave, err := openTerminal(container.Console, syscall.O_RDWR) + if err != nil { + log.Fatalf("open terminal %s", err) + return err + } + if err := 
dupSlave(slave); err != nil { + log.Fatalf("dup2 slave %s", err) + return err + } + + /* + if container.NetNsFd > 0 { + if err := joinExistingNamespace(container.NetNsFd, libcontainer.CLONE_NEWNET); err != nil { + log.Fatalf("join existing net namespace %s", err) + } + } + */ + + if _, err := namespaces.Setsid(); err != nil { + log.Fatalf("setsid %s", err) + return err + } + if err := namespaces.Setctty(); err != nil { + log.Fatalf("setctty %s", err) + return err + } + if err := namespaces.ParentDeathSignal(); err != nil { + log.Fatalf("parent deth signal %s", err) + return err + } + if err := namespaces.SetupNewMountNamespace(rootfs, container.Console, container.ReadonlyFs); err != nil { + log.Fatalf("setup mount namespace %s", err) + return err + } + if err := namespaces.Sethostname(container.ID); err != nil { + log.Fatalf("sethostname %s", err) + return err + } + if err := capabilities.DropCapabilities(container); err != nil { + log.Fatalf("drop capabilities %s", err) + return err + } + if err := setupUser(container); err != nil { + log.Fatalf("setup user %s", err) + return err + } + if container.WorkingDir != "" { + if err := namespaces.Chdir(container.WorkingDir); err != nil { + log.Fatalf("chdir to %s %s", container.WorkingDir, err) + return err + } + } + if err := namespaces.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + log.Fatalf("exec %s", err) + return err + } + panic("unreachable") +} + +func resolveRootfs(container *libcontainer.Container) (string, error) { + rootfs, err := filepath.Abs(container.RootFs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} + +func closeMasterAndStd(master uintptr) error { + namespaces.Closefd(master) + namespaces.Closefd(0) + namespaces.Closefd(1) + namespaces.Closefd(2) + + return nil +} + +func setupUser(container *libcontainer.Container) error { + // TODO: honor user passed on container + if err := namespaces.Setgroups(nil); err != nil { 
+ return err + } + if err := namespaces.Setresgid(0, 0, 0); err != nil { + return err + } + if err := namespaces.Setresuid(0, 0, 0); err != nil { + return err + } + return nil +} + +func dupSlave(slave *os.File) error { + // we close Stdin,etc so our pty slave should have fd 0 + if slave.Fd() != 0 { + return fmt.Errorf("slave fd not 0 %d", slave.Fd()) + } + if err := namespaces.Dup2(slave.Fd(), 1); err != nil { + return err + } + if err := namespaces.Dup2(slave.Fd(), 2); err != nil { + return err + } + return nil +} + +func openTerminal(name string, flag int) (*os.File, error) { + r, e := syscall.Open(name, flag, 0) + if e != nil { + return nil, &os.PathError{"open", name, e} + } + return os.NewFile(uintptr(r), name), nil +} diff --git a/libcontainer/namespaces/utils.go b/libcontainer/namespaces/utils.go index 438d896..fd195c0 100644 --- a/libcontainer/namespaces/utils.go +++ b/libcontainer/namespaces/utils.go @@ -26,12 +26,6 @@ func addEnvIfNotSet(container *libcontainer.Container, key, value string) { container.Command.Env = append(container.Command.Env, jv) } -// print and error to stderr and exit(1) -func writeError(format string, v ...interface{}) { - fmt.Fprintf(os.Stderr, format, v...) - os.Exit(1) -} - // getNsFds inspects the container's namespace configuration and opens the fds to // each of the namespaces. 
func getNsFds(container *libcontainer.Container) ([]uintptr, error) { @@ -79,27 +73,13 @@ func setupEnvironment(container *libcontainer.Container) { addEnvIfNotSet(container, "LOGNAME", "root") } -func setupUser(container *libcontainer.Container) error { - // TODO: honor user passed on container - if err := setgroups(nil); err != nil { - return err - } - if err := setresgid(0, 0, 0); err != nil { - return err - } - if err := setresuid(0, 0, 0); err != nil { - return err - } - return nil -} - func getMasterAndConsole(container *libcontainer.Container) (string, *os.File, error) { - master, err := openpmtx() + master, err := Openpmtx() if err != nil { return "", nil, err } - console, err := ptsname(master) + console, err := Ptsname(master) if err != nil { master.Close() return "", nil, err From c2777d46113666f683bfee89960bfefa4da39ff1 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 18 Feb 2014 18:15:41 -0800 Subject: [PATCH 026/117] WIP moving to nsini Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/cli/main.go | 78 +++++++++++++++++++-------------- libcontainer/namespaces/exec.go | 10 +++-- libcontainer/network/veth.go | 16 ------- 3 files changed, 52 insertions(+), 52 deletions(-) diff --git a/libcontainer/cli/main.go b/libcontainer/cli/main.go index 490135e..0430e29 100644 --- a/libcontainer/cli/main.go +++ b/libcontainer/cli/main.go @@ -6,6 +6,7 @@ import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/namespaces" + "github.com/dotcloud/docker/pkg/libcontainer/namespaces/nsinit" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/libcontainer/utils" "os" @@ -15,15 +16,26 @@ var ( displayPid bool newCommand string usrNet bool + masterFd int + console string ) func init() { flag.BoolVar(&displayPid, "pid", false, "display the pid before waiting") flag.StringVar(&newCommand, "cmd", "/bin/bash", "command to run in the existing namespace") 
flag.BoolVar(&usrNet, "net", false, "user a net namespace") + flag.IntVar(&masterFd, "master", 0, "master fd") + flag.StringVar(&console, "console", "", "console path") flag.Parse() } +func nsinitFunc(container *libcontainer.Container) error { + container.Master = uintptr(masterFd) + container.Console = console + + return nsinit.InitNamespace(container) +} + func exec(container *libcontainer.Container) error { var ( netFile *os.File @@ -39,7 +51,7 @@ func exec(container *libcontainer.Container) error { container.NetNsFd = netFile.Fd() } - pid, err := namespaces.Exec(container) + pid, err := namespaces.ExecContainer(container) if err != nil { return fmt.Errorf("error exec container %s", err) } @@ -87,39 +99,39 @@ func execIn(container *libcontainer.Container) error { } func createNet(config *libcontainer.Network) error { - root := "/root/nsroot" - if err := network.SetupNamespaceMountDir(root); err != nil { - return err - } - - nspath := root + "/test" - if err := network.CreateNetworkNamespace(nspath); err != nil { - return nil - } - if err := network.CreateVethPair("veth0", config.TempVethName); err != nil { - return err - } - if err := network.SetInterfaceMaster("veth0", config.Bridge); err != nil { - return err - } - if err := network.InterfaceUp("veth0"); err != nil { - return err - } - - f, err := os.Open(nspath) - if err != nil { - return err - } - defer f.Close() - - if err := network.SetInterfaceInNamespaceFd("veth1", int(f.Fd())); err != nil { - return err - } - /* - if err := network.SetupVethInsideNamespace(f.Fd(), config); err != nil { + root := "/root/nsroot" + if err := network.SetupNamespaceMountDir(root); err != nil { return err } + + nspath := root + "/test" + if err := network.CreateNetworkNamespace(nspath); err != nil { + return nil + } + if err := network.CreateVethPair("veth0", config.TempVethName); err != nil { + return err + } + if err := network.SetInterfaceMaster("veth0", config.Bridge); err != nil { + return err + } + if err := 
network.InterfaceUp("veth0"); err != nil { + return err + } + + f, err := os.Open(nspath) + if err != nil { + return err + } + defer f.Close() + + if err := network.SetInterfaceInNamespaceFd("veth1", int(f.Fd())); err != nil { + return err + } + + if err := network.SetupVethInsideNamespace(f.Fd(), config); err != nil { + return err + } */ return nil } @@ -133,7 +145,7 @@ func main() { var ( err error cliCmd = flag.Arg(0) - config = flag.Arg(1) + config = "/root/development/gocode/src/github.com/dotcloud/docker/pkg/libcontainer/container.json" //flag.Arg(1) ) f, err := os.Open(config) if err != nil { @@ -149,6 +161,8 @@ func main() { f.Close() switch cliCmd { + case "init": + err = nsinitFunc(container) case "exec": err = exec(container) case "execin": diff --git a/libcontainer/namespaces/exec.go b/libcontainer/namespaces/exec.go index 0077a0b..93b155b 100644 --- a/libcontainer/namespaces/exec.go +++ b/libcontainer/namespaces/exec.go @@ -12,6 +12,8 @@ import ( "log" "os" "os/exec" + "path/filepath" + "strconv" "syscall" ) @@ -37,16 +39,15 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { if err != nil { return -1, err } - container.Console = console - container.Master = master.Fd() + nsinit := filepath.Join(container.RootFs, ".nsinit") // we need CLONE_VFORK so we can wait on the child flag := uintptr(getNamespaceFlags(container.Namespaces) | CLONE_VFORK) - command := exec.Command("/.nsinit") + command := exec.Command(nsinit, "init", "-master", strconv.Itoa(int(master.Fd())), "-console", console) command.SysProcAttr = &syscall.SysProcAttr{} command.SysProcAttr.Cloneflags = flag - command.SysProcAttr.Setctty = true + // command.SysProcAttr.Setctty = true if err := command.Start(); err != nil { return -1, err @@ -63,6 +64,7 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { log.Println(err) } }() + command.Wait() return pid, nil } diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go index 
dc207b3..2ecce22 100644 --- a/libcontainer/network/veth.go +++ b/libcontainer/network/veth.go @@ -3,7 +3,6 @@ package network import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/libcontainer/namespaces" "os" "syscall" ) @@ -59,21 +58,6 @@ func SetupNamespaceMountDir(root string) error { return nil } -// CreateNetworkNamespace creates a new network namespace and binds it's fd -// at the binding path -func CreateNetworkNamespace(bindingPath string) error { - f, err := os.OpenFile(bindingPath, os.O_RDONLY|os.O_CREATE|os.O_EXCL, 0) - if err != nil { - return err - } - f.Close() - - if err := namespaces.CreateNewNamespace(libcontainer.CLONE_NEWNET, bindingPath); err != nil { - return err - } - return nil -} - // DeleteNetworkNamespace unmounts the binding path and removes the // file so that no references to the fd are present and the network // namespace is automatically cleaned up From 593219d1914c263f1efaef230fdc51c7996d1c86 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 10:44:29 -0800 Subject: [PATCH 027/117] Use nsinit for setting up namespace Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/cli/main.go | 1 + libcontainer/container.go | 1 + libcontainer/namespaces/exec.go | 4 ++-- libcontainer/namespaces/nsinit/init.go | 13 +++++++++++++ 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/libcontainer/cli/main.go b/libcontainer/cli/main.go index 0430e29..ac0ea29 100644 --- a/libcontainer/cli/main.go +++ b/libcontainer/cli/main.go @@ -32,6 +32,7 @@ func init() { func nsinitFunc(container *libcontainer.Container) error { container.Master = uintptr(masterFd) container.Console = console + container.LogFile = "/root/logs" return nsinit.InitNamespace(container) } diff --git a/libcontainer/container.go b/libcontainer/container.go index dd5e728..c9a3f2e 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -13,6 +13,7 @@ type Container struct 
{ Capabilities Capabilities `json:"capabilities,omitempty"` Master uintptr `json:"master"` Console string `json:"console"` + LogFile string `json:"log_file"` } type Command struct { diff --git a/libcontainer/namespaces/exec.go b/libcontainer/namespaces/exec.go index 93b155b..7f4b4a6 100644 --- a/libcontainer/namespaces/exec.go +++ b/libcontainer/namespaces/exec.go @@ -44,9 +44,10 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { // we need CLONE_VFORK so we can wait on the child flag := uintptr(getNamespaceFlags(container.Namespaces) | CLONE_VFORK) - command := exec.Command(nsinit, "init", "-master", strconv.Itoa(int(master.Fd())), "-console", console) + command := exec.Command(nsinit, "-master", strconv.Itoa(int(master.Fd())), "-console", console, "init") command.SysProcAttr = &syscall.SysProcAttr{} command.SysProcAttr.Cloneflags = flag + command.ExtraFiles = []*os.File{master} // command.SysProcAttr.Setctty = true if err := command.Start(); err != nil { @@ -64,7 +65,6 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { log.Println(err) } }() - command.Wait() return pid, nil } diff --git a/libcontainer/namespaces/nsinit/init.go b/libcontainer/namespaces/nsinit/init.go index 9a75636..ae6159b 100644 --- a/libcontainer/namespaces/nsinit/init.go +++ b/libcontainer/namespaces/nsinit/init.go @@ -14,6 +14,10 @@ import ( // InitNamespace should be run inside an existing namespace to setup // common mounts, drop capabilities, and setup network interfaces func InitNamespace(container *libcontainer.Container) error { + if err := setLogFile(container); err != nil { + return err + } + rootfs, err := resolveRootfs(container) if err != nil { return err @@ -138,3 +142,12 @@ func openTerminal(name string, flag int) (*os.File, error) { } return os.NewFile(uintptr(r), name), nil } + +func setLogFile(container *libcontainer.Container) error { + f, err := os.OpenFile(container.LogFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0655) + 
if err != nil { + return err + } + log.SetOutput(f) + return nil +} From a304eab9d4da1a57b7592ca4ecf2b0bf51224b81 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Tue, 18 Feb 2014 23:13:36 -0800 Subject: [PATCH 028/117] Improve general quality of libcontainer Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/cli/main.go | 65 ++++++----- libcontainer/container.json | 2 +- libcontainer/namespaces/exec.go | 101 ++---------------- libcontainer/namespaces/mount.go | 44 ++++---- libcontainer/namespaces/namespaces.go | 32 ------ libcontainer/namespaces/ns_linux.go | 9 ++ libcontainer/namespaces/nsinit/init.go | 41 +++---- libcontainer/namespaces/utils.go | 14 --- .../namespaces => system}/calls_linux.go | 37 +------ system/pty_linux.go | 31 ++++++ system/setns_linux.go | 13 +++ system/setns_linux_amd64.go | 8 ++ 12 files changed, 159 insertions(+), 238 deletions(-) delete mode 100644 libcontainer/namespaces/namespaces.go rename {libcontainer/namespaces => system}/calls_linux.go (74%) create mode 100644 system/pty_linux.go create mode 100644 system/setns_linux.go create mode 100644 system/setns_linux_amd64.go diff --git a/libcontainer/cli/main.go b/libcontainer/cli/main.go index ac0ea29..93bb039 100644 --- a/libcontainer/cli/main.go +++ b/libcontainer/cli/main.go @@ -10,6 +10,9 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/libcontainer/utils" "os" + exec_ "os/exec" + "path" + "path/filepath" ) var ( @@ -52,6 +55,18 @@ func exec(container *libcontainer.Container) error { container.NetNsFd = netFile.Fd() } + self, err := exec_.LookPath(os.Args[0]) + if err != nil { + return err + } + if output, err := exec_.Command("cp", self, path.Join(container.RootFs, ".nsinit")).CombinedOutput(); err != nil { + return fmt.Errorf("Error exec cp: %s, (%s)", err, output) + } else { + println(self, container.RootFs) + fmt.Printf("-----> %s\n", output) + } + println("----") + pid, err := 
namespaces.ExecContainer(container) if err != nil { return fmt.Errorf("error exec container %s", err) @@ -77,25 +92,25 @@ func exec(container *libcontainer.Container) error { } func execIn(container *libcontainer.Container) error { - f, err := os.Open("/root/nsroot/test") - if err != nil { - return err - } - container.NetNsFd = f.Fd() - pid, err := namespaces.ExecIn(container, &libcontainer.Command{ - Env: container.Command.Env, - Args: []string{ - newCommand, - }, - }) - if err != nil { - return fmt.Errorf("error exexin container %s", err) - } - exitcode, err := utils.WaitOnPid(pid) - if err != nil { - return fmt.Errorf("error waiting on child %s", err) - } - os.Exit(exitcode) + // f, err := os.Open("/root/nsroot/test") + // if err != nil { + // return err + // } + // container.NetNsFd = f.Fd() + // pid, err := namespaces.ExecIn(container, &libcontainer.Command{ + // Env: container.Command.Env, + // Args: []string{ + // newCommand, + // }, + // }) + // if err != nil { + // return fmt.Errorf("error exexin container %s", err) + // } + // exitcode, err := utils.WaitOnPid(pid) + // if err != nil { + // return fmt.Errorf("error waiting on child %s", err) + // } + // os.Exit(exitcode) return nil } @@ -143,11 +158,13 @@ func printErr(err error) { } func main() { - var ( - err error - cliCmd = flag.Arg(0) - config = "/root/development/gocode/src/github.com/dotcloud/docker/pkg/libcontainer/container.json" //flag.Arg(1) - ) + cliCmd := flag.Arg(0) + + config, err := filepath.Abs(flag.Arg(1)) + if err != nil { + printErr(err) + } + println("cli:", cliCmd, "config:", config) f, err := os.Open(config) if err != nil { printErr(err) diff --git a/libcontainer/container.json b/libcontainer/container.json index ed8eb1b..6e4fda5 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -12,7 +12,7 @@ "TERM=xterm" ] }, - "rootfs": "/root/main/mycontainer", + "rootfs": 
"/var/lib/docker/containers/ee76122136d691d63e09d24168a91ddb2ef9fdcf210b4de5c50aa76354892f4b/root", "namespaces": [ "NEWIPC", "NEWNS", diff --git a/libcontainer/namespaces/exec.go b/libcontainer/namespaces/exec.go index 7f4b4a6..ea3d2ca 100644 --- a/libcontainer/namespaces/exec.go +++ b/libcontainer/namespaces/exec.go @@ -6,8 +6,8 @@ package namespaces import ( "errors" - "fmt" "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/system" "io" "log" "os" @@ -44,12 +44,15 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { // we need CLONE_VFORK so we can wait on the child flag := uintptr(getNamespaceFlags(container.Namespaces) | CLONE_VFORK) - command := exec.Command(nsinit, "-master", strconv.Itoa(int(master.Fd())), "-console", console, "init") + command := exec.Command(nsinit, "-master", strconv.Itoa(int(master.Fd())), "-console", console, "init", "container.json") + // command.Stdin = os.Stdin + // command.Stdout = os.Stdout + // command.Stderr = os.Stderr command.SysProcAttr = &syscall.SysProcAttr{} command.SysProcAttr.Cloneflags = flag - command.ExtraFiles = []*os.File{master} - // command.SysProcAttr.Setctty = true + //command.ExtraFiles = []*os.File{master} + println("vvvvvvvvv") if err := command.Start(); err != nil { return -1, err } @@ -68,104 +71,18 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { return pid, nil } -// ExecIn will spawn a new command inside an existing container's namespaces. The existing container's -// pid and namespace configuration is needed along with the specific capabilities that should -// be dropped once inside the namespace. 
-func ExecIn(container *libcontainer.Container, cmd *libcontainer.Command) (int, error) { - return -1, fmt.Errorf("not implemented") - /* - if container.NsPid <= 0 { - return -1, libcontainer.ErrInvalidPid - } - - fds, err := getNsFds(container) - if err != nil { - return -1, err - } - - if container.NetNsFd > 0 { - fds = append(fds, container.NetNsFd) - } - - pid, err := fork() - if err != nil { - for _, fd := range fds { - syscall.Close(int(fd)) - } - return -1, err - } - - if pid == 0 { - for _, fd := range fds { - if fd > 0 { - if err := JoinExistingNamespace(fd, ""); err != nil { - for _, fd := range fds { - syscall.Close(int(fd)) - } - writeError("join existing namespace for %d %s", fd, err) - } - } - syscall.Close(int(fd)) - } - - if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && - container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { - // important: - // - // we need to fork and unshare so that re can remount proc and sys within - // the namespace so the CLONE_NEWPID namespace will take effect - // if we don't fork we would end up unmounting proc and sys for the entire - // namespace - child, err := fork() - if err != nil { - writeError("fork child %s", err) - } - - if child == 0 { - if err := unshare(CLONE_NEWNS); err != nil { - writeError("unshare newns %s", err) - } - if err := remountProc(); err != nil { - writeError("remount proc %s", err) - } - if err := remountSys(); err != nil { - writeError("remount sys %s", err) - } - if err := capabilities.DropCapabilities(container); err != nil { - writeError("drop caps %s", err) - } - if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { - writeError("exec %s", err) - } - panic("unreachable") - } - exit, err := utils.WaitOnPid(child) - if err != nil { - writeError("wait on child %s", err) - } - os.Exit(exit) - } - if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil { - writeError("exec %s", err) - } - panic("unreachable") - } - return pid, err - */ -} - func 
createMasterAndConsole() (*os.File, string, error) { master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) if err != nil { return nil, "", err } - console, err := Ptsname(master) + console, err := system.Ptsname(master) if err != nil { return nil, "", err } - if err := Unlockpt(master); err != nil { + if err := system.Unlockpt(master); err != nil { return nil, "", err } return master, console, nil diff --git a/libcontainer/namespaces/mount.go b/libcontainer/namespaces/mount.go index 8e7c54b..a9b981e 100644 --- a/libcontainer/namespaces/mount.go +++ b/libcontainer/namespaces/mount.go @@ -2,6 +2,7 @@ package namespaces import ( "fmt" + "github.com/dotcloud/docker/pkg/system" "log" "os" "path/filepath" @@ -14,16 +15,16 @@ var ( ) func SetupNewMountNamespace(rootfs, console string, readonly bool) error { - if err := Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting / as slave %s", err) } - if err := Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mouting %s as bind %s", rootfs, err) } if readonly { - if err := Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { + if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting %s as readonly %s", rootfs, err) } } @@ -52,29 +53,30 @@ func SetupNewMountNamespace(rootfs, console string, readonly bool) error { return err } - if err := Chdir(rootfs); err != nil { + if err := system.Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) } - if err := Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { + if err := 
system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { return fmt.Errorf("mount move %s into / %s", rootfs, err) } - if err := Chroot("."); err != nil { + if err := system.Chroot("."); err != nil { return fmt.Errorf("chroot . %s", err) } - if err := Chdir("/"); err != nil { + if err := system.Chdir("/"); err != nil { return fmt.Errorf("chdir / %s", err) } - Umask(0022) + system.Umask(0022) return nil } func copyDevNodes(rootfs string) error { - Umask(0000) + oldMask := system.Umask(0000) + defer system.Umask(oldMask) for _, node := range []string{ "null", @@ -95,7 +97,7 @@ func copyDevNodes(rootfs string) error { ) log.Printf("copy %s to %s %d\n", node, dest, st.Rdev) - if err := Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { + if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { return fmt.Errorf("copy %s %s", node, err) } } @@ -125,7 +127,8 @@ func setupDev(rootfs string) error { } func setupConsole(rootfs, console string) error { - Umask(0000) + oldMask := system.Umask(0000) + defer system.Umask(oldMask) stat, err := os.Stat(console) if err != nil { @@ -145,11 +148,11 @@ func setupConsole(rootfs, console string) error { return err } - if err := Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { + if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { return fmt.Errorf("mknod %s %s", dest, err) } - if err := Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { + if err := system.Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { return fmt.Errorf("bind %s to %s %s", console, dest, err) } return nil @@ -158,7 +161,7 @@ func setupConsole(rootfs, console string) error { // mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts // inside the mount namespace func mountSystem(rootfs string) error { - mounts := []struct { + for _, m := range []struct { source string path string device string @@ -171,12 +174,11 @@ func 
mountSystem(rootfs string) error { {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaults, data: "mode=1777"}, {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"}, {source: "tmpfs", path: filepath.Join(rootfs, "run"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_STRICTATIME, data: "mode=755"}, - } - for _, m := range mounts { + } { if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { return fmt.Errorf("mkdirall %s %s", m.path, err) } - if err := Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { + if err := system.Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err) } } @@ -184,22 +186,22 @@ func mountSystem(rootfs string) error { } func remountProc() error { - if err := Unmount("/proc", syscall.MNT_DETACH); err != nil { + if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil { return err } - if err := Mount("proc", "/proc", "proc", uintptr(defaults), ""); err != nil { + if err := system.Mount("proc", "/proc", "proc", uintptr(defaults), ""); err != nil { return err } return nil } func remountSys() error { - if err := Unmount("/sys", syscall.MNT_DETACH); err != nil { + if err := system.Unmount("/sys", syscall.MNT_DETACH); err != nil { if err != syscall.EINVAL { return err } } else { - if err := Mount("sysfs", "/sys", "sysfs", uintptr(defaults), ""); err != nil { + if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaults), ""); err != nil { return err } } diff --git a/libcontainer/namespaces/namespaces.go b/libcontainer/namespaces/namespaces.go deleted file mode 100644 index 05ef0ac..0000000 --- a/libcontainer/namespaces/namespaces.go +++ /dev/null @@ -1,32 +0,0 @@ -/* - TODO - pivot root - cgroups - more mount stuff that I probably 
am forgetting - apparmor -*/ - -package namespaces - -import ( - "github.com/dotcloud/docker/pkg/libcontainer" -) - -// JoinExistingNamespace uses the fd of an existing linux namespace and -// has the current process join that namespace or the spacespace specified by ns -func JoinExistingNamespace(fd uintptr, ns libcontainer.Namespace) error { - flag := namespaceMap[ns] - if err := Setns(fd, uintptr(flag)); err != nil { - return err - } - return nil -} - -// getNamespaceFlags parses the container's Namespaces options to set the correct -// flags on clone, unshare, and setns -func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { - for _, ns := range namespaces { - flag |= namespaceMap[ns] - } - return -} diff --git a/libcontainer/namespaces/ns_linux.go b/libcontainer/namespaces/ns_linux.go index b0e5119..f612793 100644 --- a/libcontainer/namespaces/ns_linux.go +++ b/libcontainer/namespaces/ns_linux.go @@ -33,3 +33,12 @@ var namespaceFileMap = map[libcontainer.Namespace]string{ libcontainer.CLONE_NEWPID: "pid", libcontainer.CLONE_NEWNET: "net", } + +// getNamespaceFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare, and setns +func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { + for _, ns := range namespaces { + flag |= namespaceMap[ns] + } + return +} diff --git a/libcontainer/namespaces/nsinit/init.go b/libcontainer/namespaces/nsinit/init.go index ae6159b..7f85eba 100644 --- a/libcontainer/namespaces/nsinit/init.go +++ b/libcontainer/namespaces/nsinit/init.go @@ -5,6 +5,7 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/libcontainer/namespaces" + "github.com/dotcloud/docker/pkg/system" "log" "os" "path/filepath" @@ -14,10 +15,12 @@ import ( // InitNamespace should be run inside an existing namespace to setup // common mounts, drop capabilities, and setup network interfaces func 
InitNamespace(container *libcontainer.Container) error { + println("|||||||||||||") if err := setLogFile(container); err != nil { return err } - + println(container.LogFile) + log.Printf("--------->") rootfs, err := resolveRootfs(container) if err != nil { return err @@ -26,7 +29,7 @@ func InitNamespace(container *libcontainer.Container) error { // any errors encoutered inside the namespace we should write // out to a log or a pipe to our parent and exit(1) // because writing to stderr will not work after we close - if err := closeMasterAndStd(container.Master); err != nil { + if err := closeMasterAndStd(os.NewFile(container.Master, "/dev/ptmx")); err != nil { log.Fatalf("close master and std %s", err) return err } @@ -49,15 +52,15 @@ func InitNamespace(container *libcontainer.Container) error { } */ - if _, err := namespaces.Setsid(); err != nil { + if _, err := system.Setsid(); err != nil { log.Fatalf("setsid %s", err) return err } - if err := namespaces.Setctty(); err != nil { + if err := system.Setctty(); err != nil { log.Fatalf("setctty %s", err) return err } - if err := namespaces.ParentDeathSignal(); err != nil { + if err := system.ParentDeathSignal(); err != nil { log.Fatalf("parent deth signal %s", err) return err } @@ -65,7 +68,7 @@ func InitNamespace(container *libcontainer.Container) error { log.Fatalf("setup mount namespace %s", err) return err } - if err := namespaces.Sethostname(container.ID); err != nil { + if err := system.Sethostname(container.ID); err != nil { log.Fatalf("sethostname %s", err) return err } @@ -78,12 +81,12 @@ func InitNamespace(container *libcontainer.Container) error { return err } if container.WorkingDir != "" { - if err := namespaces.Chdir(container.WorkingDir); err != nil { + if err := system.Chdir(container.WorkingDir); err != nil { log.Fatalf("chdir to %s %s", container.WorkingDir, err) return err } } - if err := namespaces.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + if 
err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { log.Fatalf("exec %s", err) return err } @@ -98,24 +101,23 @@ func resolveRootfs(container *libcontainer.Container) (string, error) { return filepath.EvalSymlinks(rootfs) } -func closeMasterAndStd(master uintptr) error { - namespaces.Closefd(master) - namespaces.Closefd(0) - namespaces.Closefd(1) - namespaces.Closefd(2) - +func closeMasterAndStd(master *os.File) error { + master.Close() + os.Stdin.Close() + os.Stdout.Close() + os.Stderr.Close() return nil } func setupUser(container *libcontainer.Container) error { // TODO: honor user passed on container - if err := namespaces.Setgroups(nil); err != nil { + if err := system.Setgroups(nil); err != nil { return err } - if err := namespaces.Setresgid(0, 0, 0); err != nil { + if err := system.Setresgid(0, 0, 0); err != nil { return err } - if err := namespaces.Setresuid(0, 0, 0); err != nil { + if err := system.Setresuid(0, 0, 0); err != nil { return err } return nil @@ -126,15 +128,16 @@ func dupSlave(slave *os.File) error { if slave.Fd() != 0 { return fmt.Errorf("slave fd not 0 %d", slave.Fd()) } - if err := namespaces.Dup2(slave.Fd(), 1); err != nil { + if err := system.Dup2(slave.Fd(), 1); err != nil { return err } - if err := namespaces.Dup2(slave.Fd(), 2); err != nil { + if err := system.Dup2(slave.Fd(), 2); err != nil { return err } return nil } +// openTerminal is a clone of os.OpenFile without the O_CLOEXEC addition. 
func openTerminal(name string, flag int) (*os.File, error) { r, e := syscall.Open(name, flag, 0) if e != nil { diff --git a/libcontainer/namespaces/utils.go b/libcontainer/namespaces/utils.go index fd195c0..a5d677c 100644 --- a/libcontainer/namespaces/utils.go +++ b/libcontainer/namespaces/utils.go @@ -72,17 +72,3 @@ func setupEnvironment(container *libcontainer.Container) { addEnvIfNotSet(container, "USER", "root") addEnvIfNotSet(container, "LOGNAME", "root") } - -func getMasterAndConsole(container *libcontainer.Container) (string, *os.File, error) { - master, err := Openpmtx() - if err != nil { - return "", nil, err - } - - console, err := Ptsname(master) - if err != nil { - master.Close() - return "", nil, err - } - return console, master, nil -} diff --git a/libcontainer/namespaces/calls_linux.go b/system/calls_linux.go similarity index 74% rename from libcontainer/namespaces/calls_linux.go rename to system/calls_linux.go index f006d56..42afa34 100644 --- a/libcontainer/namespaces/calls_linux.go +++ b/system/calls_linux.go @@ -1,15 +1,7 @@ -package namespaces +package system import ( - "fmt" - "os" "syscall" - "unsafe" -) - -const ( - TIOCGPTN = 0x80045430 - TIOCSPTLCK = 0x40045431 ) func Chroot(dir string) error { @@ -60,14 +52,6 @@ func Clone(flags uintptr) (int, error) { return int(pid), nil } -func Setns(fd uintptr, flags uintptr) error { - _, _, err := syscall.RawSyscall(SYS_SETNS, fd, flags, 0) - if err != 0 { - return err - } - return nil -} - func UsetCloseOnExec(fd uintptr) error { if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, fd, syscall.F_SETFD, 0); err != 0 { return err @@ -95,11 +79,6 @@ func Setsid() (int, error) { return syscall.Setsid() } -func Unlockpt(f *os.File) error { - var u int - return Ioctl(f.Fd(), TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) -} - func Ioctl(fd uintptr, flag, data uintptr) error { if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 { return err @@ -107,18 +86,6 @@ func Ioctl(fd uintptr, flag, 
data uintptr) error { return nil } -func Ptsname(f *os.File) (string, error) { - var n int - if err := Ioctl(f.Fd(), TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { - return "", err - } - return fmt.Sprintf("/dev/pts/%d", n), nil -} - -func Openpmtx() (*os.File, error) { - return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) -} - func Closefd(fd uintptr) error { return syscall.Close(int(fd)) } @@ -132,7 +99,7 @@ func Mknod(path string, mode uint32, dev int) error { } func ParentDeathSignal() error { - if _, _, err := syscall.RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0, 0, 0, 0); err != 0 { + if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0); err != 0 { return err } return nil diff --git a/system/pty_linux.go b/system/pty_linux.go new file mode 100644 index 0000000..b281b71 --- /dev/null +++ b/system/pty_linux.go @@ -0,0 +1,31 @@ +package system + +import ( + "fmt" + "os" + "syscall" + "unsafe" +) + +// Unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f. +// Unlockpt should be called before opening the slave side of a pseudoterminal. +func Unlockpt(f *os.File) error { + var u int + return Ioctl(f.Fd(), syscall.TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) +} + +// Ptsname retrieves the name of the first available pts for the given master. +func Ptsname(f *os.File) (string, error) { + var n int + + if err := Ioctl(f.Fd(), syscall.TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { + return "", err + } + return fmt.Sprintf("/dev/pts/%d", n), nil +} + +// OpenPtmx opens /dev/ptmx, i.e. the PTY master. +func OpenPtmx() (*os.File, error) { + // O_NOCTTY and O_CLOEXEC are not present in os package so we use the syscall's one for all. 
+ return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) +} diff --git a/system/setns_linux.go b/system/setns_linux.go new file mode 100644 index 0000000..be6f3ed --- /dev/null +++ b/system/setns_linux.go @@ -0,0 +1,13 @@ +package system + +import ( + "syscall" +) + +func Setns(fd uintptr, flags uintptr) error { + _, _, err := syscall.RawSyscall(SYS_SETNS, fd, flags, 0) + if err != 0 { + return err + } + return nil +} diff --git a/system/setns_linux_amd64.go b/system/setns_linux_amd64.go new file mode 100644 index 0000000..4e30625 --- /dev/null +++ b/system/setns_linux_amd64.go @@ -0,0 +1,8 @@ +// +build linux,amd64 + +package system + +// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 +const ( + SYS_SETNS = 308 +) From d62cc1cc661bcf621a262aa2229524f6001b99ab Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Wed, 19 Feb 2014 12:47:01 -0800 Subject: [PATCH 029/117] Fix ptmx issue on libcontainer Docker-DCO-1.1-Signed-off-by: Guillaume J. 
Charmes (github: creack) --- libcontainer/namespaces/exec.go | 3 ++- libcontainer/namespaces/mount.go | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/libcontainer/namespaces/exec.go b/libcontainer/namespaces/exec.go index ea3d2ca..77550d6 100644 --- a/libcontainer/namespaces/exec.go +++ b/libcontainer/namespaces/exec.go @@ -50,7 +50,8 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { // command.Stderr = os.Stderr command.SysProcAttr = &syscall.SysProcAttr{} command.SysProcAttr.Cloneflags = flag - //command.ExtraFiles = []*os.File{master} + + command.ExtraFiles = []*os.File{master} println("vvvvvvvvv") if err := command.Start(); err != nil { diff --git a/libcontainer/namespaces/mount.go b/libcontainer/namespaces/mount.go index a9b981e..5c0b8ea 100644 --- a/libcontainer/namespaces/mount.go +++ b/libcontainer/namespaces/mount.go @@ -41,7 +41,7 @@ func SetupNewMountNamespace(rootfs, console string, readonly bool) error { if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { return err } - if err := os.Symlink(filepath.Join(rootfs, "pts/ptmx"), ptmx); err != nil { + if err := os.Symlink("pts/ptmx", ptmx); err != nil { return fmt.Errorf("symlink dev ptmx %s", err) } From c1f8606d508ae40f721bc6c20e5b2afab1911b42 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 14:33:25 -0800 Subject: [PATCH 030/117] Use nsinit as app Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/container.go | 10 +- libcontainer/namespaces/exec.go | 40 ++----- libcontainer/namespaces/linux_x86_64.go | 7 -- libcontainer/namespaces/ns_linux.go | 2 +- libcontainer/namespaces/nsinit/init.go | 112 ++++++++++-------- libcontainer/namespaces/{ => nsinit}/mount.go | 4 +- libcontainer/namespaces/utils.go | 26 ---- 7 files changed, 82 insertions(+), 119 deletions(-) delete mode 100644 libcontainer/namespaces/linux_x86_64.go rename libcontainer/namespaces/{ => nsinit}/mount.go (98%) diff --git 
a/libcontainer/container.go b/libcontainer/container.go index c9a3f2e..c288544 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -2,18 +2,14 @@ package libcontainer type Container struct { ID string `json:"id,omitempty"` - NsPid int `json:"namespace_pid,omitempty"` Command *Command `json:"command,omitempty"` - RootFs string `json:"rootfs,omitempty"` ReadonlyFs bool `json:"readonly_fs,omitempty"` - NetNsFd uintptr `json:"network_namespace_fd,omitempty"` User string `json:"user,omitempty"` WorkingDir string `json:"working_dir,omitempty"` Namespaces Namespaces `json:"namespaces,omitempty"` Capabilities Capabilities `json:"capabilities,omitempty"` - Master uintptr `json:"master"` - Console string `json:"console"` - LogFile string `json:"log_file"` + LogFile string `json:"log_file,omitempty"` + Network *Network `json:"network,omitempty"` } type Command struct { @@ -22,9 +18,9 @@ type Command struct { } type Network struct { - TempVethName string `json:"temp_veth,omitempty"` IP string `json:"ip,omitempty"` Gateway string `json:"gateway,omitempty"` Bridge string `json:"bridge,omitempty"` Mtu int `json:"mtu,omitempty"` + TempVethName string `json:"temp_veth,omitempty"` } diff --git a/libcontainer/namespaces/exec.go b/libcontainer/namespaces/exec.go index 77550d6..8e5bf68 100644 --- a/libcontainer/namespaces/exec.go +++ b/libcontainer/namespaces/exec.go @@ -1,27 +1,17 @@ -/* - Higher level convience functions for setting up a container -*/ - package namespaces import ( - "errors" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/system" + "github.com/dotcloud/docker/pkg/term" "io" "log" "os" "os/exec" - "path/filepath" - "strconv" "syscall" ) -var ( - ErrExistingNetworkNamespace = errors.New("specified both CLONE_NEWNET and an existing network namespace") -) - -// Exec will spawn new namespaces with the specified Container configuration +// ExecContainer will spawn new namespaces with the specified Container configuration 
// in the RootFs path and return the pid of the new containerized process. // // If an existing network namespace is specified the container @@ -30,30 +20,19 @@ var ( // existing network namespace and the CLONE_NEWNET option in the container configuration will allow // the container to the the host's networking options and configuration. func ExecContainer(container *libcontainer.Container) (pid int, err error) { - // a user cannot pass CLONE_NEWNET and an existing net namespace fd to join - if container.NetNsFd > 0 && container.Namespaces.Contains(libcontainer.CLONE_NEWNET) { - return -1, ErrExistingNetworkNamespace - } - master, console, err := createMasterAndConsole() if err != nil { return -1, err } - nsinit := filepath.Join(container.RootFs, ".nsinit") // we need CLONE_VFORK so we can wait on the child flag := uintptr(getNamespaceFlags(container.Namespaces) | CLONE_VFORK) - command := exec.Command(nsinit, "-master", strconv.Itoa(int(master.Fd())), "-console", console, "init", "container.json") - // command.Stdin = os.Stdin - // command.Stdout = os.Stdout - // command.Stderr = os.Stderr - command.SysProcAttr = &syscall.SysProcAttr{} - command.SysProcAttr.Cloneflags = flag + command := exec.Command("nsinit", console) + command.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: flag, + } - command.ExtraFiles = []*os.File{master} - - println("vvvvvvvvv") if err := command.Start(); err != nil { return -1, err } @@ -64,11 +43,18 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { log.Println(err) } }() + go func() { if _, err := io.Copy(master, os.Stdin); err != nil { log.Println(err) } }() + + term.SetRawTerminal(os.Stdin.Fd()) + + if err := command.Wait(); err != nil { + return pid, err + } return pid, nil } diff --git a/libcontainer/namespaces/linux_x86_64.go b/libcontainer/namespaces/linux_x86_64.go deleted file mode 100644 index ac9a014..0000000 --- a/libcontainer/namespaces/linux_x86_64.go +++ /dev/null @@ -1,7 +0,0 @@ -// +build 
linux,x86_64 -package namespaces - -// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 -const ( - SYS_SETNS = 308 -) diff --git a/libcontainer/namespaces/ns_linux.go b/libcontainer/namespaces/ns_linux.go index f612793..2c73e08 100644 --- a/libcontainer/namespaces/ns_linux.go +++ b/libcontainer/namespaces/ns_linux.go @@ -40,5 +40,5 @@ func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { for _, ns := range namespaces { flag |= namespaceMap[ns] } - return + return flag } diff --git a/libcontainer/namespaces/nsinit/init.go b/libcontainer/namespaces/nsinit/init.go index 7f85eba..523854e 100644 --- a/libcontainer/namespaces/nsinit/init.go +++ b/libcontainer/namespaces/nsinit/init.go @@ -1,6 +1,7 @@ -package nsinit +package main import ( + "encoding/json" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" @@ -12,103 +13,112 @@ import ( "syscall" ) -// InitNamespace should be run inside an existing namespace to setup -// common mounts, drop capabilities, and setup network interfaces -func InitNamespace(container *libcontainer.Container) error { - println("|||||||||||||") - if err := setLogFile(container); err != nil { - return err - } - println(container.LogFile) - log.Printf("--------->") - rootfs, err := resolveRootfs(container) +func loadContainer() (*libcontainer.Container, error) { + f, err := os.Open("container.json") if err != nil { - return err + return nil, err + } + defer f.Close() + + var container *libcontainer.Container + if err := json.NewDecoder(f).Decode(&container); err != nil { + return nil, err + } + return container, nil +} + +func main() { + container, err := loadContainer() + if err != nil { + log.Fatal(err) } - // any errors encoutered inside the namespace we should write - // out to a log or a pipe to our parent and exit(1) - // because writing to stderr will not work after we close - if err := 
closeMasterAndStd(os.NewFile(container.Master, "/dev/ptmx")); err != nil { - log.Fatalf("close master and std %s", err) - return err + if os.Args[1] == "exec" { + _, err := namespaces.ExecContainer(container) + if err != nil { + log.Fatal(err) + } + os.Exit(0) + } + console := os.Args[1] + + if err := setLogFile(container); err != nil { + log.Fatal(err) } - slave, err := openTerminal(container.Console, syscall.O_RDWR) + rootfs, err := resolveRootfs() + if err != nil { + log.Fatal(err) + } + + // close pipes so that we can replace it with the pty + os.Stdin.Close() + os.Stdout.Close() + os.Stderr.Close() + + slave, err := openTerminal(console, syscall.O_RDWR) if err != nil { log.Fatalf("open terminal %s", err) - return err + } + if slave.Fd() != 0 { + log.Fatalf("slave fd should be 0") } if err := dupSlave(slave); err != nil { log.Fatalf("dup2 slave %s", err) - return err } - /* - if container.NetNsFd > 0 { - if err := joinExistingNamespace(container.NetNsFd, libcontainer.CLONE_NEWNET); err != nil { - log.Fatalf("join existing net namespace %s", err) - } - } - */ - if _, err := system.Setsid(); err != nil { log.Fatalf("setsid %s", err) - return err } if err := system.Setctty(); err != nil { log.Fatalf("setctty %s", err) - return err } if err := system.ParentDeathSignal(); err != nil { log.Fatalf("parent deth signal %s", err) - return err } - if err := namespaces.SetupNewMountNamespace(rootfs, container.Console, container.ReadonlyFs); err != nil { + + if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { log.Fatalf("setup mount namespace %s", err) - return err } + + if container.Network != nil { + if err := setupNetworking(container); err != nil { + log.Fatalf("setup networking %s", err) + } + } + if err := system.Sethostname(container.ID); err != nil { log.Fatalf("sethostname %s", err) - return err } if err := capabilities.DropCapabilities(container); err != nil { log.Fatalf("drop capabilities %s", err) - return err } if err := 
setupUser(container); err != nil { log.Fatalf("setup user %s", err) - return err } if container.WorkingDir != "" { if err := system.Chdir(container.WorkingDir); err != nil { log.Fatalf("chdir to %s %s", container.WorkingDir, err) - return err } } if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { log.Fatalf("exec %s", err) - return err } panic("unreachable") } -func resolveRootfs(container *libcontainer.Container) (string, error) { - rootfs, err := filepath.Abs(container.RootFs) +func resolveRootfs() (string, error) { + cwd, err := os.Getwd() + if err != nil { + return "", err + } + rootfs, err := filepath.Abs(cwd) if err != nil { return "", err } return filepath.EvalSymlinks(rootfs) } -func closeMasterAndStd(master *os.File) error { - master.Close() - os.Stdin.Close() - os.Stdout.Close() - os.Stderr.Close() - return nil -} - func setupUser(container *libcontainer.Container) error { // TODO: honor user passed on container if err := system.Setgroups(nil); err != nil { @@ -154,3 +164,7 @@ func setLogFile(container *libcontainer.Container) error { log.SetOutput(f) return nil } + +func setupNetworking(conatiner *libcontainer.Container) error { + return nil +} diff --git a/libcontainer/namespaces/mount.go b/libcontainer/namespaces/nsinit/mount.go similarity index 98% rename from libcontainer/namespaces/mount.go rename to libcontainer/namespaces/nsinit/mount.go index 5c0b8ea..f9ee969 100644 --- a/libcontainer/namespaces/mount.go +++ b/libcontainer/namespaces/nsinit/mount.go @@ -1,4 +1,4 @@ -package namespaces +package main import ( "fmt" @@ -14,7 +14,7 @@ var ( defaults = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV ) -func SetupNewMountNamespace(rootfs, console string, readonly bool) error { +func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting / as slave %s", 
err) } diff --git a/libcontainer/namespaces/utils.go b/libcontainer/namespaces/utils.go index a5d677c..edc3ab5 100644 --- a/libcontainer/namespaces/utils.go +++ b/libcontainer/namespaces/utils.go @@ -7,7 +7,6 @@ import ( "path/filepath" "strconv" "strings" - "syscall" ) func addEnvIfNotSet(container *libcontainer.Container, key, value string) { @@ -26,31 +25,6 @@ func addEnvIfNotSet(container *libcontainer.Container, key, value string) { container.Command.Env = append(container.Command.Env, jv) } -// getNsFds inspects the container's namespace configuration and opens the fds to -// each of the namespaces. -func getNsFds(container *libcontainer.Container) ([]uintptr, error) { - var ( - namespaces = []string{} - fds = []uintptr{} - ) - - for _, ns := range container.Namespaces { - namespaces = append(namespaces, namespaceFileMap[ns]) - } - - for _, ns := range namespaces { - fd, err := getNsFd(container.NsPid, ns) - if err != nil { - for _, fd = range fds { - syscall.Close(int(fd)) - } - return nil, err - } - fds = append(fds, fd) - } - return fds, nil -} - // getNsFd returns the fd for a specific pid and namespace option func getNsFd(pid int, ns string) (uintptr, error) { nspath := filepath.Join("/proc", strconv.Itoa(pid), "ns", ns) From e25ebdd06c3dcb901fa9df7f8042ed0abd041337 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 14:55:34 -0800 Subject: [PATCH 031/117] Simplify namespaces with only nsinit Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/cli/main.go | 203 ------------------ libcontainer/namespaces/utils.go | 48 ----- libcontainer/{namespaces => nsinit}/exec.go | 26 ++- libcontainer/{namespaces => }/nsinit/init.go | 13 +- libcontainer/{namespaces => }/nsinit/mount.go | 0 .../{namespaces => nsinit}/ns_linux.go | 11 +- 6 files changed, 23 insertions(+), 278 deletions(-) delete mode 100644 libcontainer/cli/main.go delete mode 100644 libcontainer/namespaces/utils.go rename libcontainer/{namespaces => 
nsinit}/exec.go (63%) rename libcontainer/{namespaces => }/nsinit/init.go (93%) rename libcontainer/{namespaces => }/nsinit/mount.go (100%) rename libcontainer/{namespaces => nsinit}/ns_linux.go (74%) diff --git a/libcontainer/cli/main.go b/libcontainer/cli/main.go deleted file mode 100644 index 93bb039..0000000 --- a/libcontainer/cli/main.go +++ /dev/null @@ -1,203 +0,0 @@ -package main - -import ( - "encoding/json" - "flag" - "fmt" - "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/libcontainer/namespaces" - "github.com/dotcloud/docker/pkg/libcontainer/namespaces/nsinit" - "github.com/dotcloud/docker/pkg/libcontainer/network" - "github.com/dotcloud/docker/pkg/libcontainer/utils" - "os" - exec_ "os/exec" - "path" - "path/filepath" -) - -var ( - displayPid bool - newCommand string - usrNet bool - masterFd int - console string -) - -func init() { - flag.BoolVar(&displayPid, "pid", false, "display the pid before waiting") - flag.StringVar(&newCommand, "cmd", "/bin/bash", "command to run in the existing namespace") - flag.BoolVar(&usrNet, "net", false, "user a net namespace") - flag.IntVar(&masterFd, "master", 0, "master fd") - flag.StringVar(&console, "console", "", "console path") - flag.Parse() -} - -func nsinitFunc(container *libcontainer.Container) error { - container.Master = uintptr(masterFd) - container.Console = console - container.LogFile = "/root/logs" - - return nsinit.InitNamespace(container) -} - -func exec(container *libcontainer.Container) error { - var ( - netFile *os.File - err error - ) - container.NetNsFd = 0 - - if usrNet { - netFile, err = os.Open("/root/nsroot/test") - if err != nil { - return err - } - container.NetNsFd = netFile.Fd() - } - - self, err := exec_.LookPath(os.Args[0]) - if err != nil { - return err - } - if output, err := exec_.Command("cp", self, path.Join(container.RootFs, ".nsinit")).CombinedOutput(); err != nil { - return fmt.Errorf("Error exec cp: %s, (%s)", err, output) - } else { - 
println(self, container.RootFs) - fmt.Printf("-----> %s\n", output) - } - println("----") - - pid, err := namespaces.ExecContainer(container) - if err != nil { - return fmt.Errorf("error exec container %s", err) - } - - if displayPid { - fmt.Println(pid) - } - - exitcode, err := utils.WaitOnPid(pid) - if err != nil { - return fmt.Errorf("error waiting on child %s", err) - } - fmt.Println(exitcode) - if usrNet { - netFile.Close() - if err := network.DeleteNetworkNamespace("/root/nsroot/test"); err != nil { - return err - } - } - os.Exit(exitcode) - return nil -} - -func execIn(container *libcontainer.Container) error { - // f, err := os.Open("/root/nsroot/test") - // if err != nil { - // return err - // } - // container.NetNsFd = f.Fd() - // pid, err := namespaces.ExecIn(container, &libcontainer.Command{ - // Env: container.Command.Env, - // Args: []string{ - // newCommand, - // }, - // }) - // if err != nil { - // return fmt.Errorf("error exexin container %s", err) - // } - // exitcode, err := utils.WaitOnPid(pid) - // if err != nil { - // return fmt.Errorf("error waiting on child %s", err) - // } - // os.Exit(exitcode) - return nil -} - -func createNet(config *libcontainer.Network) error { - /* - root := "/root/nsroot" - if err := network.SetupNamespaceMountDir(root); err != nil { - return err - } - - nspath := root + "/test" - if err := network.CreateNetworkNamespace(nspath); err != nil { - return nil - } - if err := network.CreateVethPair("veth0", config.TempVethName); err != nil { - return err - } - if err := network.SetInterfaceMaster("veth0", config.Bridge); err != nil { - return err - } - if err := network.InterfaceUp("veth0"); err != nil { - return err - } - - f, err := os.Open(nspath) - if err != nil { - return err - } - defer f.Close() - - if err := network.SetInterfaceInNamespaceFd("veth1", int(f.Fd())); err != nil { - return err - } - - if err := network.SetupVethInsideNamespace(f.Fd(), config); err != nil { - return err - } - */ - return nil -} - -func 
printErr(err error) { - fmt.Fprintln(os.Stderr, err) - os.Exit(1) -} - -func main() { - cliCmd := flag.Arg(0) - - config, err := filepath.Abs(flag.Arg(1)) - if err != nil { - printErr(err) - } - println("cli:", cliCmd, "config:", config) - f, err := os.Open(config) - if err != nil { - printErr(err) - } - - dec := json.NewDecoder(f) - var container *libcontainer.Container - - if err := dec.Decode(&container); err != nil { - printErr(err) - } - f.Close() - - switch cliCmd { - case "init": - err = nsinitFunc(container) - case "exec": - err = exec(container) - case "execin": - err = execIn(container) - case "net": - err = createNet(&libcontainer.Network{ - TempVethName: "veth1", - IP: "172.17.0.100/16", - Gateway: "172.17.42.1", - Mtu: 1500, - Bridge: "docker0", - }) - default: - err = fmt.Errorf("command not supported: %s", cliCmd) - } - - if err != nil { - printErr(err) - } -} diff --git a/libcontainer/namespaces/utils.go b/libcontainer/namespaces/utils.go deleted file mode 100644 index edc3ab5..0000000 --- a/libcontainer/namespaces/utils.go +++ /dev/null @@ -1,48 +0,0 @@ -package namespaces - -import ( - "fmt" - "github.com/dotcloud/docker/pkg/libcontainer" - "os" - "path/filepath" - "strconv" - "strings" -) - -func addEnvIfNotSet(container *libcontainer.Container, key, value string) { - jv := fmt.Sprintf("%s=%s", key, value) - if len(container.Command.Env) == 0 { - container.Command.Env = []string{jv} - return - } - - for _, v := range container.Command.Env { - parts := strings.Split(v, "=") - if parts[0] == key { - return - } - } - container.Command.Env = append(container.Command.Env, jv) -} - -// getNsFd returns the fd for a specific pid and namespace option -func getNsFd(pid int, ns string) (uintptr, error) { - nspath := filepath.Join("/proc", strconv.Itoa(pid), "ns", ns) - // OpenFile adds closOnExec - f, err := os.OpenFile(nspath, os.O_RDONLY, 0666) - if err != nil { - return 0, err - } - return f.Fd(), nil -} - -// setupEnvironment adds additional environment 
variables to the container's -// Command such as USER, LOGNAME, container, and TERM -func setupEnvironment(container *libcontainer.Container) { - addEnvIfNotSet(container, "container", "docker") - // TODO: check if pty - addEnvIfNotSet(container, "TERM", "xterm") - // TODO: get username from container - addEnvIfNotSet(container, "USER", "root") - addEnvIfNotSet(container, "LOGNAME", "root") -} diff --git a/libcontainer/namespaces/exec.go b/libcontainer/nsinit/exec.go similarity index 63% rename from libcontainer/namespaces/exec.go rename to libcontainer/nsinit/exec.go index 8e5bf68..ef81b0e 100644 --- a/libcontainer/namespaces/exec.go +++ b/libcontainer/nsinit/exec.go @@ -1,4 +1,4 @@ -package namespaces +package main import ( "github.com/dotcloud/docker/pkg/libcontainer" @@ -11,15 +11,7 @@ import ( "syscall" ) -// ExecContainer will spawn new namespaces with the specified Container configuration -// in the RootFs path and return the pid of the new containerized process. -// -// If an existing network namespace is specified the container -// will join that namespace. If an existing network namespace is not specified but CLONE_NEWNET is, -// the container will be spawned with a new network namespace with no configuration. Omiting an -// existing network namespace and the CLONE_NEWNET option in the container configuration will allow -// the container to the the host's networking options and configuration. 
-func ExecContainer(container *libcontainer.Container) (pid int, err error) { +func execCommand(container *libcontainer.Container) (pid int, err error) { master, console, err := createMasterAndConsole() if err != nil { return -1, err @@ -50,7 +42,19 @@ func ExecContainer(container *libcontainer.Container) (pid int, err error) { } }() - term.SetRawTerminal(os.Stdin.Fd()) + ws, err := term.GetWinsize(os.Stdin.Fd()) + if err != nil { + return -1, err + } + if err := term.SetWinsize(master.Fd(), ws); err != nil { + return -1, err + } + state, err := term.SetRawTerminal(os.Stdin.Fd()) + if err != nil { + command.Process.Kill() + return -1, err + } + defer term.RestoreTerminal(os.Stdin.Fd(), state) if err := command.Wait(); err != nil { return pid, err diff --git a/libcontainer/namespaces/nsinit/init.go b/libcontainer/nsinit/init.go similarity index 93% rename from libcontainer/namespaces/nsinit/init.go rename to libcontainer/nsinit/init.go index 523854e..b4b7de4 100644 --- a/libcontainer/namespaces/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -5,7 +5,6 @@ import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" - "github.com/dotcloud/docker/pkg/libcontainer/namespaces" "github.com/dotcloud/docker/pkg/system" "log" "os" @@ -34,7 +33,7 @@ func main() { } if os.Args[1] == "exec" { - _, err := namespaces.ExecContainer(container) + _, err := execCommand(container) if err != nil { log.Fatal(err) } @@ -157,11 +156,13 @@ func openTerminal(name string, flag int) (*os.File, error) { } func setLogFile(container *libcontainer.Container) error { - f, err := os.OpenFile(container.LogFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0655) - if err != nil { - return err + if container.LogFile != "" { + f, err := os.OpenFile(container.LogFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0655) + if err != nil { + return err + } + log.SetOutput(f) } - log.SetOutput(f) return nil } diff --git a/libcontainer/namespaces/nsinit/mount.go 
b/libcontainer/nsinit/mount.go similarity index 100% rename from libcontainer/namespaces/nsinit/mount.go rename to libcontainer/nsinit/mount.go diff --git a/libcontainer/namespaces/ns_linux.go b/libcontainer/nsinit/ns_linux.go similarity index 74% rename from libcontainer/namespaces/ns_linux.go rename to libcontainer/nsinit/ns_linux.go index 2c73e08..b54bc2b 100644 --- a/libcontainer/namespaces/ns_linux.go +++ b/libcontainer/nsinit/ns_linux.go @@ -1,4 +1,4 @@ -package namespaces +package main import ( "github.com/dotcloud/docker/pkg/libcontainer" @@ -25,15 +25,6 @@ var namespaceMap = map[libcontainer.Namespace]int{ libcontainer.CLONE_NEWNET: CLONE_NEWNET, } -var namespaceFileMap = map[libcontainer.Namespace]string{ - libcontainer.CLONE_NEWNS: "mnt", - libcontainer.CLONE_NEWUTS: "uts", - libcontainer.CLONE_NEWIPC: "ipc", - libcontainer.CLONE_NEWUSER: "user", - libcontainer.CLONE_NEWPID: "pid", - libcontainer.CLONE_NEWNET: "net", -} - // getNamespaceFlags parses the container's Namespaces options to set the correct // flags on clone, unshare, and setns func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { From 8430fbf11e1a20464cdb48384f2c87a3c51af304 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 15:33:44 -0800 Subject: [PATCH 032/117] Implement init veth creation Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/container.go | 9 ++++----- libcontainer/container.json | 14 +++++++++---- libcontainer/network/veth.go | 38 +++++------------------------------- libcontainer/nsinit/exec.go | 33 +++++++++++++++++++++++++++++++ libcontainer/nsinit/init.go | 14 ++++++++++--- libcontainer/ubuntu.json | 22 --------------------- 6 files changed, 63 insertions(+), 67 deletions(-) delete mode 100644 libcontainer/ubuntu.json diff --git a/libcontainer/container.go b/libcontainer/container.go index c288544..3f3961d 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -18,9 +18,8 @@ type 
Command struct { } type Network struct { - IP string `json:"ip,omitempty"` - Gateway string `json:"gateway,omitempty"` - Bridge string `json:"bridge,omitempty"` - Mtu int `json:"mtu,omitempty"` - TempVethName string `json:"temp_veth,omitempty"` + IP string `json:"ip,omitempty"` + Gateway string `json:"gateway,omitempty"` + Bridge string `json:"bridge,omitempty"` + Mtu int `json:"mtu,omitempty"` } diff --git a/libcontainer/container.json b/libcontainer/container.json index 6e4fda5..8731170 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -1,6 +1,6 @@ { "id": "koye", - "namespace_pid": 3117, + "log_file": "/root/logs", "command": { "args": [ "/bin/bash" @@ -12,12 +12,12 @@ "TERM=xterm" ] }, - "rootfs": "/var/lib/docker/containers/ee76122136d691d63e09d24168a91ddb2ef9fdcf210b4de5c50aa76354892f4b/root", "namespaces": [ "NEWIPC", "NEWNS", "NEWPID", - "NEWUTS" + "NEWUTS", + "NEWNET" ], "capabilities": [ "SETPCAP", @@ -34,5 +34,11 @@ "AUDIT_CONTROL", "MAC_OVERRIDE", "MAC_ADMIN" - ] + ], + "network": { + "ip": "172.17.0.100/16", + "gateway": "172.17.42.1", + "bridge": "docker0", + "mtu": 1500 + } } diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go index 2ecce22..05512e6 100644 --- a/libcontainer/network/veth.go +++ b/libcontainer/network/veth.go @@ -3,18 +3,16 @@ package network import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" - "os" - "syscall" ) // SetupVeth sets up an existing network namespace with the specified // network configuration. 
-func SetupVeth(config *libcontainer.Network) error { - if err := InterfaceDown(config.TempVethName); err != nil { - return fmt.Errorf("interface down %s %s", config.TempVethName, err) +func SetupVeth(config *libcontainer.Network, tempVethName string) error { + if err := InterfaceDown(tempVethName); err != nil { + return fmt.Errorf("interface down %s %s", tempVethName, err) } - if err := ChangeInterfaceName(config.TempVethName, "eth0"); err != nil { - return fmt.Errorf("change %s to eth0 %s", config.TempVethName, err) + if err := ChangeInterfaceName(tempVethName, "eth0"); err != nil { + return fmt.Errorf("change %s to eth0 %s", tempVethName, err) } if err := SetInterfaceIp("eth0", config.IP); err != nil { return fmt.Errorf("set eth0 ip %s", err) @@ -41,29 +39,3 @@ func SetupVeth(config *libcontainer.Network) error { } return nil } - -// SetupNamespaceMountDir prepares a new root for use as a mount -// source for bind mounting namespace fd to an outside path -func SetupNamespaceMountDir(root string) error { - if err := os.MkdirAll(root, 0666); err != nil { - return err - } - // make sure mounts are not unmounted by other mnt namespaces - if err := syscall.Mount("", root, "none", syscall.MS_SHARED|syscall.MS_REC, ""); err != nil && err != syscall.EINVAL { - return err - } - if err := syscall.Mount(root, root, "none", syscall.MS_BIND, ""); err != nil { - return err - } - return nil -} - -// DeleteNetworkNamespace unmounts the binding path and removes the -// file so that no references to the fd are present and the network -// namespace is automatically cleaned up -func DeleteNetworkNamespace(bindingPath string) error { - if err := syscall.Unmount(bindingPath, 0); err != nil { - return err - } - return os.Remove(bindingPath) -} diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index ef81b0e..9cd1741 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -1,7 +1,9 @@ package main import ( + "fmt" 
"github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/term" "io" @@ -25,11 +27,34 @@ func execCommand(container *libcontainer.Container) (pid int, err error) { Cloneflags: flag, } + inPipe, err := command.StdinPipe() + if err != nil { + return -1, err + } + if err := command.Start(); err != nil { return -1, err } pid = command.Process.Pid + if container.Network != nil { + name1, name2, err := createVethPair() + if err != nil { + log.Fatal(err) + } + if err := network.SetInterfaceMaster(name1, container.Network.Bridge); err != nil { + log.Fatal(err) + } + if err := network.InterfaceUp(name1); err != nil { + log.Fatal(err) + } + if err := network.SetInterfaceInNamespacePid(name2, pid); err != nil { + log.Fatal(err) + } + fmt.Fprint(inPipe, name2) + inPipe.Close() + } + go func() { if _, err := io.Copy(os.Stdout, master); err != nil { log.Println(err) @@ -78,3 +103,11 @@ func createMasterAndConsole() (*os.File, string, error) { } return master, console, nil } + +func createVethPair() (name1 string, name2 string, err error) { + name1, name2 = "veth001", "veth002" + if err = network.CreateVethPair(name1, name2); err != nil { + return + } + return +} diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index b4b7de4..2804f01 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -5,7 +5,9 @@ import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" + "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" + "io/ioutil" "log" "os" "path/filepath" @@ -50,6 +52,12 @@ func main() { log.Fatal(err) } + data, err := ioutil.ReadAll(os.Stdin) + if err != nil { + log.Fatalf("error reading from stdin %s", err) + } + tempVethName := string(data) + // close pipes so that we can replace it with the pty os.Stdin.Close() 
os.Stdout.Close() @@ -81,7 +89,7 @@ func main() { } if container.Network != nil { - if err := setupNetworking(container); err != nil { + if err := setupNetworking(container, tempVethName); err != nil { log.Fatalf("setup networking %s", err) } } @@ -166,6 +174,6 @@ func setLogFile(container *libcontainer.Container) error { return nil } -func setupNetworking(conatiner *libcontainer.Container) error { - return nil +func setupNetworking(container *libcontainer.Container, tempVethName string) error { + return network.SetupVeth(container.Network, tempVethName) } diff --git a/libcontainer/ubuntu.json b/libcontainer/ubuntu.json deleted file mode 100644 index 0a450ae..0000000 --- a/libcontainer/ubuntu.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "id": "koye", - "namespace_pid": 3745, - "command": { - "args": [ - "/sbin/init" - ], - "environment": [ - "HOME=/", - "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", - "container=docker", - "TERM=xterm" - ] - }, - "rootfs": "/var/lib/docker/btrfs/subvolumes/7c0f15df1ad2e2fe04d7a6e079aec17406e9465a6a37dd16cb0dd754fc0167b3", - "namespaces": [ - "NEWIPC", - "NEWNS", - "NEWPID", - "NEWUTS" - ] -} From ab6864d0c0e4ca7de8a2646f4cf4de4348c682ea Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 15:54:53 -0800 Subject: [PATCH 033/117] Add dynamic veth name Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/network/veth.go | 41 -------------------------------- libcontainer/nsinit/exec.go | 10 +++++++- libcontainer/nsinit/init.go | 46 ++++++++++++++++++++++++++++-------- libcontainer/utils/utils.go | 24 +++---------------- 4 files changed, 48 insertions(+), 73 deletions(-) delete mode 100644 libcontainer/network/veth.go diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go deleted file mode 100644 index 05512e6..0000000 --- a/libcontainer/network/veth.go +++ /dev/null @@ -1,41 +0,0 @@ -package network - -import ( - "fmt" - "github.com/dotcloud/docker/pkg/libcontainer" -) - 
-// SetupVeth sets up an existing network namespace with the specified -// network configuration. -func SetupVeth(config *libcontainer.Network, tempVethName string) error { - if err := InterfaceDown(tempVethName); err != nil { - return fmt.Errorf("interface down %s %s", tempVethName, err) - } - if err := ChangeInterfaceName(tempVethName, "eth0"); err != nil { - return fmt.Errorf("change %s to eth0 %s", tempVethName, err) - } - if err := SetInterfaceIp("eth0", config.IP); err != nil { - return fmt.Errorf("set eth0 ip %s", err) - } - - if err := SetMtu("eth0", config.Mtu); err != nil { - return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) - } - if err := InterfaceUp("eth0"); err != nil { - return fmt.Errorf("eth0 up %s", err) - } - - if err := SetMtu("lo", config.Mtu); err != nil { - return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) - } - if err := InterfaceUp("lo"); err != nil { - return fmt.Errorf("lo up %s", err) - } - - if config.Gateway != "" { - if err := SetDefaultGateway(config.Gateway); err != nil { - return fmt.Errorf("set gateway to %s %s", config.Gateway, err) - } - } - return nil -} diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 9cd1741..e032407 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -4,6 +4,7 @@ import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/network" + "github.com/dotcloud/docker/pkg/libcontainer/utils" "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/term" "io" @@ -105,7 +106,14 @@ func createMasterAndConsole() (*os.File, string, error) { } func createVethPair() (name1 string, name2 string, err error) { - name1, name2 = "veth001", "veth002" + name1, err = utils.GenerateRandomName("dock", 4) + if err != nil { + return + } + name2, err = utils.GenerateRandomName("dock", 4) + if err != nil { + return + } if err = network.CreateVethPair(name1, name2); err != nil { return } diff --git 
a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 2804f01..fe8fd4b 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -52,11 +52,14 @@ func main() { log.Fatal(err) } - data, err := ioutil.ReadAll(os.Stdin) - if err != nil { - log.Fatalf("error reading from stdin %s", err) + var tempVethName string + if container.Network != nil { + data, err := ioutil.ReadAll(os.Stdin) + if err != nil { + log.Fatalf("error reading from stdin %s", err) + } + tempVethName = string(data) } - tempVethName := string(data) // close pipes so that we can replace it with the pty os.Stdin.Close() @@ -73,7 +76,6 @@ func main() { if err := dupSlave(slave); err != nil { log.Fatalf("dup2 slave %s", err) } - if _, err := system.Setsid(); err != nil { log.Fatalf("setsid %s", err) } @@ -83,13 +85,11 @@ func main() { if err := system.ParentDeathSignal(); err != nil { log.Fatalf("parent deth signal %s", err) } - if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { log.Fatalf("setup mount namespace %s", err) } - if container.Network != nil { - if err := setupNetworking(container, tempVethName); err != nil { + if err := setupNetworking(container.Network, tempVethName); err != nil { log.Fatalf("setup networking %s", err) } } @@ -174,6 +174,32 @@ func setLogFile(container *libcontainer.Container) error { return nil } -func setupNetworking(container *libcontainer.Container, tempVethName string) error { - return network.SetupVeth(container.Network, tempVethName) +func setupNetworking(config *libcontainer.Network, tempVethName string) error { + if err := network.InterfaceDown(tempVethName); err != nil { + return fmt.Errorf("interface down %s %s", tempVethName, err) + } + if err := network.ChangeInterfaceName(tempVethName, "eth0"); err != nil { + return fmt.Errorf("change %s to eth0 %s", tempVethName, err) + } + if err := network.SetInterfaceIp("eth0", config.IP); err != nil { + return fmt.Errorf("set eth0 ip %s", err) + } + if err := 
network.SetMtu("eth0", config.Mtu); err != nil { + return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) + } + if err := network.InterfaceUp("eth0"); err != nil { + return fmt.Errorf("eth0 up %s", err) + } + if err := network.SetMtu("lo", config.Mtu); err != nil { + return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) + } + if err := network.InterfaceUp("lo"); err != nil { + return fmt.Errorf("lo up %s", err) + } + if config.Gateway != "" { + if err := network.SetDefaultGateway(config.Gateway); err != nil { + return fmt.Errorf("set gateway to %s %s", config.Gateway, err) + } + } + return nil } diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go index 7289fec..d3223c3 100644 --- a/libcontainer/utils/utils.go +++ b/libcontainer/utils/utils.go @@ -4,30 +4,12 @@ import ( "crypto/rand" "encoding/hex" "io" - "os" - "syscall" ) -func WaitOnPid(pid int) (exitcode int, err error) { - child, err := os.FindProcess(pid) - if err != nil { - return -1, err - } - state, err := child.Wait() - if err != nil { - return -1, err - } - return getExitCode(state), nil -} - -func getExitCode(state *os.ProcessState) int { - return state.Sys().(syscall.WaitStatus).ExitStatus() -} - -func GenerateRandomName(size int) (string, error) { - id := make([]byte, size) +func GenerateRandomName(prefix string, size int) (string, error) { + id := make([]byte, 32) if _, err := io.ReadFull(rand.Reader, id); err != nil { return "", err } - return hex.EncodeToString(id), nil + return prefix + hex.EncodeToString(id)[:size], nil } From 84ba029e25539405212cb2b626c7f737e4bb0cfb Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 16:40:36 -0800 Subject: [PATCH 034/117] General cleanup of libcontainer Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/errors.go | 9 ----- libcontainer/network/network.go | 26 -------------- libcontainer/nsinit/exec.go | 51 ++++++++++++-------------- libcontainer/nsinit/init.go | 63 
+++++++++------------------------ libcontainer/nsinit/main.go | 42 ++++++++++++++++++++++ libcontainer/nsinit/mount.go | 61 ++++++++++++------------------- libcontainer/nsinit/ns_linux.go | 25 ++++--------- 7 files changed, 111 insertions(+), 166 deletions(-) delete mode 100644 libcontainer/errors.go create mode 100644 libcontainer/nsinit/main.go diff --git a/libcontainer/errors.go b/libcontainer/errors.go deleted file mode 100644 index c6964ee..0000000 --- a/libcontainer/errors.go +++ /dev/null @@ -1,9 +0,0 @@ -package libcontainer - -import ( - "errors" -) - -var ( - ErrInvalidPid = errors.New("no ns pid found") -) diff --git a/libcontainer/network/network.go b/libcontainer/network/network.go index 31c5d32..8c7a4b6 100644 --- a/libcontainer/network/network.go +++ b/libcontainer/network/network.go @@ -1,15 +1,10 @@ package network import ( - "errors" "github.com/dotcloud/docker/pkg/netlink" "net" ) -var ( - ErrNoDefaultRoute = errors.New("no default network route found") -) - func InterfaceUp(name string) error { iface, err := net.InterfaceByName(name) if err != nil { @@ -46,14 +41,6 @@ func SetInterfaceInNamespacePid(name string, nsPid int) error { return netlink.NetworkSetNsPid(iface, nsPid) } -func SetInterfaceInNamespaceFd(name string, fd int) error { - iface, err := net.InterfaceByName(name) - if err != nil { - return err - } - return netlink.NetworkSetNsFd(iface, fd) -} - func SetInterfaceMaster(name, master string) error { iface, err := net.InterfaceByName(name) if err != nil { @@ -89,16 +76,3 @@ func SetMtu(name string, mtu int) error { } return netlink.NetworkSetMTU(iface, mtu) } - -func GetDefaultMtu() (int, error) { - routes, err := netlink.NetworkGetRoutes() - if err != nil { - return -1, err - } - for _, r := range routes { - if r.Default { - return r.Iface.MTU, nil - } - } - return -1, ErrNoDefaultRoute -} diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index e032407..4ac070d 100644 --- a/libcontainer/nsinit/exec.go +++ 
b/libcontainer/nsinit/exec.go @@ -8,65 +8,54 @@ import ( "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/term" "io" - "log" + "io/ioutil" "os" "os/exec" "syscall" ) -func execCommand(container *libcontainer.Container) (pid int, err error) { +func execCommand(container *libcontainer.Container) (int, error) { master, console, err := createMasterAndConsole() if err != nil { return -1, err } - // we need CLONE_VFORK so we can wait on the child - flag := uintptr(getNamespaceFlags(container.Namespaces) | CLONE_VFORK) - - command := exec.Command("nsinit", console) + command := exec.Command("nsinit", "init", console) command.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: flag, + Cloneflags: uintptr(getNamespaceFlags(container.Namespaces) | syscall.CLONE_VFORK), // we need CLONE_VFORK so we can wait on the child } inPipe, err := command.StdinPipe() if err != nil { return -1, err } - if err := command.Start(); err != nil { return -1, err } - pid = command.Process.Pid + if err := writePidFile(command); err != nil { + return -1, err + } if container.Network != nil { name1, name2, err := createVethPair() if err != nil { - log.Fatal(err) + return -1, err } if err := network.SetInterfaceMaster(name1, container.Network.Bridge); err != nil { - log.Fatal(err) + return -1, err } if err := network.InterfaceUp(name1); err != nil { - log.Fatal(err) + return -1, err } - if err := network.SetInterfaceInNamespacePid(name2, pid); err != nil { - log.Fatal(err) + if err := network.SetInterfaceInNamespacePid(name2, command.Process.Pid); err != nil { + return -1, err } fmt.Fprint(inPipe, name2) inPipe.Close() } - go func() { - if _, err := io.Copy(os.Stdout, master); err != nil { - log.Println(err) - } - }() - - go func() { - if _, err := io.Copy(master, os.Stdin); err != nil { - log.Println(err) - } - }() + go io.Copy(os.Stdout, master) + go io.Copy(master, os.Stdin) ws, err := term.GetWinsize(os.Stdin.Fd()) if err != nil { @@ -83,9 +72,11 @@ func 
execCommand(container *libcontainer.Container) (pid int, err error) { defer term.RestoreTerminal(os.Stdin.Fd(), state) if err := command.Wait(); err != nil { - return pid, err + if _, ok := err.(*exec.ExitError); !ok { + return -1, err + } } - return pid, nil + return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } func createMasterAndConsole() (*os.File, string, error) { @@ -93,12 +84,10 @@ func createMasterAndConsole() (*os.File, string, error) { if err != nil { return nil, "", err } - console, err := system.Ptsname(master) if err != nil { return nil, "", err } - if err := system.Unlockpt(master); err != nil { return nil, "", err } @@ -119,3 +108,7 @@ func createVethPair() (name1 string, name2 string, err error) { } return } + +func writePidFile(command *exec.Cmd) error { + return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(command.Process.Pid)), 0655) +} diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index fe8fd4b..16a3081 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -1,7 +1,6 @@ package main import ( - "encoding/json" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" @@ -14,49 +13,21 @@ import ( "syscall" ) -func loadContainer() (*libcontainer.Container, error) { - f, err := os.Open("container.json") - if err != nil { - return nil, err - } - defer f.Close() - - var container *libcontainer.Container - if err := json.NewDecoder(f).Decode(&container); err != nil { - return nil, err - } - return container, nil -} - -func main() { - container, err := loadContainer() - if err != nil { - log.Fatal(err) - } - - if os.Args[1] == "exec" { - _, err := execCommand(container) - if err != nil { - log.Fatal(err) - } - os.Exit(0) - } - console := os.Args[1] - +func initCommand(container *libcontainer.Container, console string) error { if err := setLogFile(container); err != nil { - log.Fatal(err) + return err } rootfs, err := 
resolveRootfs() if err != nil { - log.Fatal(err) + return err } var tempVethName string if container.Network != nil { data, err := ioutil.ReadAll(os.Stdin) if err != nil { - log.Fatalf("error reading from stdin %s", err) + return fmt.Errorf("error reading from stdin %s", err) } tempVethName = string(data) } @@ -68,48 +39,48 @@ func main() { slave, err := openTerminal(console, syscall.O_RDWR) if err != nil { - log.Fatalf("open terminal %s", err) + return fmt.Errorf("open terminal %s", err) } if slave.Fd() != 0 { - log.Fatalf("slave fd should be 0") + return fmt.Errorf("slave fd should be 0") } if err := dupSlave(slave); err != nil { - log.Fatalf("dup2 slave %s", err) + return fmt.Errorf("dup2 slave %s", err) } if _, err := system.Setsid(); err != nil { - log.Fatalf("setsid %s", err) + return fmt.Errorf("setsid %s", err) } if err := system.Setctty(); err != nil { - log.Fatalf("setctty %s", err) + return fmt.Errorf("setctty %s", err) } if err := system.ParentDeathSignal(); err != nil { - log.Fatalf("parent deth signal %s", err) + return fmt.Errorf("parent deth signal %s", err) } if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { - log.Fatalf("setup mount namespace %s", err) + return fmt.Errorf("setup mount namespace %s", err) } if container.Network != nil { if err := setupNetworking(container.Network, tempVethName); err != nil { - log.Fatalf("setup networking %s", err) + return fmt.Errorf("setup networking %s", err) } } if err := system.Sethostname(container.ID); err != nil { - log.Fatalf("sethostname %s", err) + return fmt.Errorf("sethostname %s", err) } if err := capabilities.DropCapabilities(container); err != nil { - log.Fatalf("drop capabilities %s", err) + return fmt.Errorf("drop capabilities %s", err) } if err := setupUser(container); err != nil { - log.Fatalf("setup user %s", err) + return fmt.Errorf("setup user %s", err) } if container.WorkingDir != "" { if err := system.Chdir(container.WorkingDir); err != nil { - 
log.Fatalf("chdir to %s %s", container.WorkingDir, err) + return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) } } if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { - log.Fatalf("exec %s", err) + return fmt.Errorf("exec %s", err) } panic("unreachable") } diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go new file mode 100644 index 0000000..47abcce --- /dev/null +++ b/libcontainer/nsinit/main.go @@ -0,0 +1,42 @@ +package main + +import ( + "encoding/json" + "github.com/dotcloud/docker/pkg/libcontainer" + "log" + "os" +) + +func main() { + container, err := loadContainer() + if err != nil { + log.Fatal(err) + } + + switch os.Args[1] { + case "exec": + exitCode, err := execCommand(container) + if err != nil { + log.Fatal(err) + } + os.Exit(exitCode) + case "init": + if err := initCommand(container, os.Args[2]); err != nil { + log.Fatal(err) + } + } +} + +func loadContainer() (*libcontainer.Container, error) { + f, err := os.Open("container.json") + if err != nil { + return nil, err + } + defer f.Close() + + var container *libcontainer.Container + if err := json.NewDecoder(f).Decode(&container); err != nil { + return nil, err + } + return container, nil +} diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index f9ee969..13ee13e 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -3,68 +3,47 @@ package main import ( "fmt" "github.com/dotcloud/docker/pkg/system" - "log" "os" "path/filepath" "syscall" ) -var ( - // default mount point options - defaults = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV -) +// default mount point options +const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting / as slave %s", 
err) } - if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mouting %s as bind %s", rootfs, err) } - if readonly { if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting %s as readonly %s", rootfs, err) } } - if err := mountSystem(rootfs); err != nil { return fmt.Errorf("mount system %s", err) } - if err := copyDevNodes(rootfs); err != nil { return fmt.Errorf("copy dev nodes %s", err) } - - ptmx := filepath.Join(rootfs, "dev/ptmx") - if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { - return err - } - if err := os.Symlink("pts/ptmx", ptmx); err != nil { - return fmt.Errorf("symlink dev ptmx %s", err) - } - if err := setupDev(rootfs); err != nil { return err } - - if err := setupConsole(rootfs, console); err != nil { + if err := setupPtmx(rootfs, console); err != nil { return err } - if err := system.Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) } - if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { return fmt.Errorf("mount move %s into / %s", rootfs, err) } - if err := system.Chroot("."); err != nil { return fmt.Errorf("chroot . 
%s", err) } - if err := system.Chdir("/"); err != nil { return fmt.Errorf("chdir / %s", err) } @@ -90,13 +69,10 @@ func copyDevNodes(rootfs string) error { if err != nil { return err } - var ( dest = filepath.Join(rootfs, "dev", node) st = stat.Sys().(*syscall.Stat_t) ) - - log.Printf("copy %s to %s %d\n", node, dest, st.Rdev) if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { return fmt.Errorf("copy %s %s", node, err) } @@ -134,24 +110,22 @@ func setupConsole(rootfs, console string) error { if err != nil { return fmt.Errorf("stat console %s %s", console, err) } - st := stat.Sys().(*syscall.Stat_t) - - dest := filepath.Join(rootfs, "dev/console") + var ( + st = stat.Sys().(*syscall.Stat_t) + dest = filepath.Join(rootfs, "dev/console") + ) if err := os.Remove(dest); err != nil && !os.IsNotExist(err) { return fmt.Errorf("remove %s %s", dest, err) } - if err := os.Chmod(console, 0600); err != nil { return err } if err := os.Chown(console, 0, 0); err != nil { return err } - if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { return fmt.Errorf("mknod %s %s", dest, err) } - if err := system.Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { return fmt.Errorf("bind %s to %s %s", console, dest, err) } @@ -168,10 +142,10 @@ func mountSystem(rootfs string) error { flags int data string }{ - {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaults}, - {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaults}, + {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, + {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}, {source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: "mode=755"}, - {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaults, data: 
"mode=1777"}, + {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: "mode=1777"}, {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"}, {source: "tmpfs", path: filepath.Join(rootfs, "run"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_STRICTATIME, data: "mode=755"}, } { @@ -189,7 +163,7 @@ func remountProc() error { if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil { return err } - if err := system.Mount("proc", "/proc", "proc", uintptr(defaults), ""); err != nil { + if err := system.Mount("proc", "/proc", "proc", uintptr(defaultMountFlags), ""); err != nil { return err } return nil @@ -201,9 +175,20 @@ func remountSys() error { return err } } else { - if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaults), ""); err != nil { + if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaultMountFlags), ""); err != nil { return err } } return nil } + +func setupPtmx(rootfs, console string) error { + ptmx := filepath.Join(rootfs, "dev/ptmx") + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Symlink("pts/ptmx", ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + return setupConsole(rootfs, console) +} diff --git a/libcontainer/nsinit/ns_linux.go b/libcontainer/nsinit/ns_linux.go index b54bc2b..2392ffd 100644 --- a/libcontainer/nsinit/ns_linux.go +++ b/libcontainer/nsinit/ns_linux.go @@ -2,27 +2,16 @@ package main import ( "github.com/dotcloud/docker/pkg/libcontainer" -) - -const ( - SIGCHLD = 0x14 - CLONE_VFORK = 0x00004000 - CLONE_NEWNS = 0x00020000 - CLONE_NEWUTS = 0x04000000 - CLONE_NEWIPC = 0x08000000 - CLONE_NEWUSER = 0x10000000 - CLONE_NEWPID = 0x20000000 - CLONE_NEWNET = 0x40000000 + "syscall" ) var namespaceMap = map[libcontainer.Namespace]int{ - "": 0, - 
libcontainer.CLONE_NEWNS: CLONE_NEWNS, - libcontainer.CLONE_NEWUTS: CLONE_NEWUTS, - libcontainer.CLONE_NEWIPC: CLONE_NEWIPC, - libcontainer.CLONE_NEWUSER: CLONE_NEWUSER, - libcontainer.CLONE_NEWPID: CLONE_NEWPID, - libcontainer.CLONE_NEWNET: CLONE_NEWNET, + libcontainer.CLONE_NEWNS: syscall.CLONE_NEWNS, + libcontainer.CLONE_NEWUTS: syscall.CLONE_NEWUTS, + libcontainer.CLONE_NEWIPC: syscall.CLONE_NEWIPC, + libcontainer.CLONE_NEWUSER: syscall.CLONE_NEWUSER, + libcontainer.CLONE_NEWPID: syscall.CLONE_NEWPID, + libcontainer.CLONE_NEWNET: syscall.CLONE_NEWNET, } // getNamespaceFlags parses the container's Namespaces options to set the correct From b48bc85967d473743192951d3e9ed2f85bf5c0a2 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Wed, 19 Feb 2014 16:50:10 -0800 Subject: [PATCH 035/117] OSX compilation Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/nsinit/exec.go | 2 ++ libcontainer/nsinit/init.go | 2 ++ libcontainer/nsinit/main.go | 13 +++++++++++++ libcontainer/nsinit/mount.go | 2 ++ 4 files changed, 19 insertions(+) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 4ac070d..5b53be2 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -1,3 +1,5 @@ +// +build linux + package main import ( diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 16a3081..1c90ecc 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -1,3 +1,5 @@ +// +build linux + package main import ( diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go index 47abcce..c9f9d7b 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/main.go @@ -2,17 +2,27 @@ package main import ( "encoding/json" + "errors" "github.com/dotcloud/docker/pkg/libcontainer" "log" "os" ) +var ( + ErrUnsupported = errors.New("Unsupported method") + ErrWrongArguments = errors.New("Wrong argument count") +) + func main() { container, err := loadContainer() 
if err != nil { log.Fatal(err) } + argc := len(os.Args) + if argc < 2 { + log.Fatal(ErrWrongArguments) + } switch os.Args[1] { case "exec": exitCode, err := execCommand(container) @@ -21,6 +31,9 @@ func main() { } os.Exit(exitCode) case "init": + if argc != 3 { + log.Fatal(ErrWrongArguments) + } if err := initCommand(container, os.Args[2]); err != nil { log.Fatal(err) } diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 13ee13e..baa850f 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -1,3 +1,5 @@ +// +build linux + package main import ( From e3d5adc9e2b4a6d7e6009ce3534a1cb2c4cbf363 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 19:14:31 -0800 Subject: [PATCH 036/117] Refactor large funcs Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/container.go | 5 +-- libcontainer/container.json | 3 +- libcontainer/nsinit/exec.go | 60 ++++++++++++++++--------- libcontainer/nsinit/init.go | 87 +++++++++++++++++-------------------- libcontainer/types.go | 48 ++++++++++---------- 5 files changed, 107 insertions(+), 96 deletions(-) diff --git a/libcontainer/container.go b/libcontainer/container.go index 3f3961d..c8dbdd6 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -1,14 +1,13 @@ package libcontainer type Container struct { - ID string `json:"id,omitempty"` - Command *Command `json:"command,omitempty"` + Hostname string `json:"hostname,omitempty"` ReadonlyFs bool `json:"readonly_fs,omitempty"` User string `json:"user,omitempty"` WorkingDir string `json:"working_dir,omitempty"` + Command *Command `json:"command,omitempty"` Namespaces Namespaces `json:"namespaces,omitempty"` Capabilities Capabilities `json:"capabilities,omitempty"` - LogFile string `json:"log_file,omitempty"` Network *Network `json:"network,omitempty"` } diff --git a/libcontainer/container.json b/libcontainer/container.json index 8731170..2abf01a 100644 --- 
a/libcontainer/container.json +++ b/libcontainer/container.json @@ -1,6 +1,5 @@ { "id": "koye", - "log_file": "/root/logs", "command": { "args": [ "/bin/bash" @@ -9,7 +8,7 @@ "HOME=/", "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", "container=docker", - "TERM=xterm" + "TERM=xterm-256color" ] }, "namespaces": [ diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 5b53be2..4abebd2 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -27,6 +27,8 @@ func execCommand(container *libcontainer.Container) (int, error) { Cloneflags: uintptr(getNamespaceFlags(container.Namespaces) | syscall.CLONE_VFORK), // we need CLONE_VFORK so we can wait on the child } + // create a pipe so that we can syncronize with the namespaced process and + // pass the veth name to the child inPipe, err := command.StdinPipe() if err != nil { return -1, err @@ -39,34 +41,17 @@ func execCommand(container *libcontainer.Container) (int, error) { } if container.Network != nil { - name1, name2, err := createVethPair() + vethPair, err := setupVeth(container.Network.Bridge, command.Process.Pid) if err != nil { return -1, err } - if err := network.SetInterfaceMaster(name1, container.Network.Bridge); err != nil { - return -1, err - } - if err := network.InterfaceUp(name1); err != nil { - return -1, err - } - if err := network.SetInterfaceInNamespacePid(name2, command.Process.Pid); err != nil { - return -1, err - } - fmt.Fprint(inPipe, name2) - inPipe.Close() + sendVethName(vethPair, inPipe) } go io.Copy(os.Stdout, master) go io.Copy(master, os.Stdin) - ws, err := term.GetWinsize(os.Stdin.Fd()) - if err != nil { - return -1, err - } - if err := term.SetWinsize(master.Fd(), ws); err != nil { - return -1, err - } - state, err := term.SetRawTerminal(os.Stdin.Fd()) + state, err := setupWindow(master) if err != nil { command.Process.Kill() return -1, err @@ -81,6 +66,41 @@ func execCommand(container *libcontainer.Container) (int, error) { return 
command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } +func sendVethName(name string, pipe io.WriteCloser) { + // write the veth pair name to the child's stdin then close the + // pipe so that the child stops waiting + fmt.Fprint(pipe, name) + pipe.Close() +} + +func setupVeth(bridge string, nspid int) (string, error) { + name1, name2, err := createVethPair() + if err != nil { + return "", err + } + if err := network.SetInterfaceMaster(name1, bridge); err != nil { + return "", err + } + if err := network.InterfaceUp(name1); err != nil { + return "", err + } + if err := network.SetInterfaceInNamespacePid(name2, nspid); err != nil { + return "", err + } + return name2, nil +} + +func setupWindow(master *os.File) (*term.State, error) { + ws, err := term.GetWinsize(os.Stdin.Fd()) + if err != nil { + return nil, err + } + if err := term.SetWinsize(master.Fd(), ws); err != nil { + return nil, err + } + return term.SetRawTerminal(os.Stdin.Fd()) +} + func createMasterAndConsole() (*os.File, string, error) { master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) if err != nil { diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 1c90ecc..d853a32 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -9,17 +9,12 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" "io/ioutil" - "log" "os" "path/filepath" "syscall" ) func initCommand(container *libcontainer.Container, console string) error { - if err := setLogFile(container); err != nil { - return err - } - rootfs, err := resolveRootfs() if err != nil { return err @@ -27,11 +22,10 @@ func initCommand(container *libcontainer.Container, console string) error { var tempVethName string if container.Network != nil { - data, err := ioutil.ReadAll(os.Stdin) + tempVethName, err = getVethName() if err != nil { - return fmt.Errorf("error reading from stdin %s", err) + return err } - 
tempVethName = string(data) } // close pipes so that we can replace it with the pty @@ -61,13 +55,10 @@ func initCommand(container *libcontainer.Container, console string) error { if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { return fmt.Errorf("setup mount namespace %s", err) } - if container.Network != nil { - if err := setupNetworking(container.Network, tempVethName); err != nil { - return fmt.Errorf("setup networking %s", err) - } + if err := setupNetworking(container.Network, tempVethName); err != nil { + return fmt.Errorf("setup networking %s", err) } - - if err := system.Sethostname(container.ID); err != nil { + if err := system.Sethostname(container.Hostname); err != nil { return fmt.Errorf("sethostname %s", err) } if err := capabilities.DropCapabilities(container); err != nil { @@ -136,43 +127,45 @@ func openTerminal(name string, flag int) (*os.File, error) { return os.NewFile(uintptr(r), name), nil } -func setLogFile(container *libcontainer.Container) error { - if container.LogFile != "" { - f, err := os.OpenFile(container.LogFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0655) - if err != nil { - return err +func setupNetworking(config *libcontainer.Network, tempVethName string) error { + if config != nil { + if err := network.InterfaceDown(tempVethName); err != nil { + return fmt.Errorf("interface down %s %s", tempVethName, err) + } + if err := network.ChangeInterfaceName(tempVethName, "eth0"); err != nil { + return fmt.Errorf("change %s to eth0 %s", tempVethName, err) + } + if err := network.SetInterfaceIp("eth0", config.IP); err != nil { + return fmt.Errorf("set eth0 ip %s", err) + } + if err := network.SetMtu("eth0", config.Mtu); err != nil { + return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) + } + if err := network.InterfaceUp("eth0"); err != nil { + return fmt.Errorf("eth0 up %s", err) + } + if err := network.SetMtu("lo", config.Mtu); err != nil { + return fmt.Errorf("set lo mtu to %d %s", config.Mtu, 
err) + } + if err := network.InterfaceUp("lo"); err != nil { + return fmt.Errorf("lo up %s", err) + } + if config.Gateway != "" { + if err := network.SetDefaultGateway(config.Gateway); err != nil { + return fmt.Errorf("set gateway to %s %s", config.Gateway, err) + } } - log.SetOutput(f) } return nil } -func setupNetworking(config *libcontainer.Network, tempVethName string) error { - if err := network.InterfaceDown(tempVethName); err != nil { - return fmt.Errorf("interface down %s %s", tempVethName, err) +// getVethName reads from Stdin the temp veth name +// sent by the parent processes after the veth pair +// has been created and setup +func getVethName() (string, error) { + data, err := ioutil.ReadAll(os.Stdin) + if err != nil { + return "", fmt.Errorf("error reading from stdin %s", err) } - if err := network.ChangeInterfaceName(tempVethName, "eth0"); err != nil { - return fmt.Errorf("change %s to eth0 %s", tempVethName, err) - } - if err := network.SetInterfaceIp("eth0", config.IP); err != nil { - return fmt.Errorf("set eth0 ip %s", err) - } - if err := network.SetMtu("eth0", config.Mtu); err != nil { - return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) - } - if err := network.InterfaceUp("eth0"); err != nil { - return fmt.Errorf("eth0 up %s", err) - } - if err := network.SetMtu("lo", config.Mtu); err != nil { - return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) - } - if err := network.InterfaceUp("lo"); err != nil { - return fmt.Errorf("lo up %s", err) - } - if config.Gateway != "" { - if err := network.SetDefaultGateway(config.Gateway); err != nil { - return fmt.Errorf("set gateway to %s %s", config.Gateway, err) - } - } - return nil + return string(data), nil } diff --git a/libcontainer/types.go b/libcontainer/types.go index db1c3b9..b5d9932 100644 --- a/libcontainer/types.go +++ b/libcontainer/types.go @@ -1,29 +1,5 @@ package libcontainer -type Namespace string -type Namespaces []Namespace - -func (n Namespaces) Contains(ns Namespace) bool { 
- for _, nns := range n { - if nns == ns { - return true - } - } - return false -} - -type Capability string -type Capabilities []Capability - -func (c Capabilities) Contains(capp Capability) bool { - for _, cc := range c { - if cc == capp { - return true - } - } - return false -} - const ( CAP_SETPCAP Capability = "SETPCAP" CAP_SYS_MODULE Capability = "SYS_MODULE" @@ -47,3 +23,27 @@ const ( CLONE_NEWPID Namespace = "NEWPID" // pid CLONE_NEWNET Namespace = "NEWNET" // network ) + +type Namespace string +type Namespaces []Namespace + +func (n Namespaces) Contains(ns Namespace) bool { + for _, nns := range n { + if nns == ns { + return true + } + } + return false +} + +type Capability string +type Capabilities []Capability + +func (c Capabilities) Contains(capp Capability) bool { + for _, cc := range c { + if cc == capp { + return true + } + } + return false +} From 663518ba66f50b97144e8fb8da8a175b28f5372d Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 19:53:25 -0800 Subject: [PATCH 037/117] Add execin function to running a process in a namespace Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/container.json | 2 +- libcontainer/nsinit/execin.go | 115 ++++++++++++++++++++++++++++++++ libcontainer/nsinit/main.go | 8 +++ libcontainer/nsinit/ns_linux.go | 9 +++ 4 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 libcontainer/nsinit/execin.go diff --git a/libcontainer/container.json b/libcontainer/container.json index 2abf01a..c5807a7 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -1,5 +1,5 @@ { - "id": "koye", + "hostname": "koye", "command": { "args": [ "/bin/bash" diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go new file mode 100644 index 0000000..362cf5a --- /dev/null +++ b/libcontainer/nsinit/execin.go @@ -0,0 +1,115 @@ +package main + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + 
"github.com/dotcloud/docker/pkg/libcontainer/capabilities" + "github.com/dotcloud/docker/pkg/system" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "syscall" +) + +func execinCommand(container *libcontainer.Container) (int, error) { + nspid, err := readPid() + if err != nil { + return -1, err + } + + for _, ns := range container.Namespaces { + if err := system.Unshare(namespaceMap[ns]); err != nil { + return -1, err + } + } + fds, err := getNsFds(nspid, container) + closeFds := func() { + for _, f := range fds { + system.Closefd(f) + } + } + if err != nil { + closeFds() + return -1, err + } + + for _, fd := range fds { + if fd > 0 { + if err := system.Setns(fd, 0); err != nil { + closeFds() + return -1, fmt.Errorf("setns %s", err) + } + } + system.Closefd(fd) + } + + // if the container has a new pid and mount namespace we need to + // remount proc and sys to pick up the changes + if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && + container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { + + pid, err := system.Fork() + if err != nil { + return -1, err + } + if pid == 0 { + // TODO: make all raw syscalls to be fork safe + if err := system.Unshare(syscall.CLONE_NEWNS); err != nil { + return -1, err + } + if err := remountProc(); err != nil { + return -1, fmt.Errorf("remount proc %s", err) + } + if err := remountSys(); err != nil { + return -1, fmt.Errorf("remount sys %s", err) + } + if err := capabilities.DropCapabilities(container); err != nil { + return -1, fmt.Errorf("drop capabilities %s", err) + } + if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + return -1, err + } + } + proc, err := os.FindProcess(pid) + if err != nil { + return -1, err + } + state, err := proc.Wait() + if err != nil { + return -1, err + } + os.Exit(state.Sys().(syscall.WaitStatus).ExitStatus()) + } + if err := capabilities.DropCapabilities(container); err != nil { + return -1, fmt.Errorf("drop capabilities 
%s", err) + } + if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + return -1, err + } + panic("unreachable") +} + +func readPid() (int, error) { + data, err := ioutil.ReadFile(".nspid") + if err != nil { + return -1, err + } + pid, err := strconv.Atoi(string(data)) + if err != nil { + return -1, err + } + return pid, nil +} + +func getNsFds(pid int, container *libcontainer.Container) ([]uintptr, error) { + fds := make([]uintptr, len(container.Namespaces)) + for i, ns := range container.Namespaces { + f, err := os.OpenFile(filepath.Join("/proc/", strconv.Itoa(pid), "ns", namespaceFileMap[ns]), os.O_RDONLY, 0) + if err != nil { + return fds, err + } + fds[i] = f.Fd() + } + return fds, nil +} diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go index c9f9d7b..8fe700e 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/main.go @@ -37,6 +37,14 @@ func main() { if err := initCommand(container, os.Args[2]); err != nil { log.Fatal(err) } + case "execin": + exitCode, err := execinCommand(container) + if err != nil { + log.Fatal(err) + } + os.Exit(exitCode) + default: + log.Fatalf("command not supported for nsinit %s", os.Args[1]) } } diff --git a/libcontainer/nsinit/ns_linux.go b/libcontainer/nsinit/ns_linux.go index 2392ffd..a2809eb 100644 --- a/libcontainer/nsinit/ns_linux.go +++ b/libcontainer/nsinit/ns_linux.go @@ -14,6 +14,15 @@ var namespaceMap = map[libcontainer.Namespace]int{ libcontainer.CLONE_NEWNET: syscall.CLONE_NEWNET, } +var namespaceFileMap = map[libcontainer.Namespace]string{ + libcontainer.CLONE_NEWNS: "mnt", + libcontainer.CLONE_NEWUTS: "uts", + libcontainer.CLONE_NEWIPC: "ipc", + libcontainer.CLONE_NEWUSER: "user", + libcontainer.CLONE_NEWPID: "pid", + libcontainer.CLONE_NEWNET: "net", +} + // getNamespaceFlags parses the container's Namespaces options to set the correct // flags on clone, unshare, and setns func getNamespaceFlags(namespaces 
libcontainer.Namespaces) (flag int) { From bb59129b2f1190de245742bc12a7bed2c9c0b5b9 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 20:35:04 -0800 Subject: [PATCH 038/117] Refactor to remove cmd from container Pass the container's command via args Remove execin function and just look for an existing nspid file to join the namespace Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/container.go | 7 +------ libcontainer/container.json | 17 ++++++---------- libcontainer/nsinit/exec.go | 21 ++++++++++++++------ libcontainer/nsinit/execin.go | 24 +++-------------------- libcontainer/nsinit/init.go | 4 ++-- libcontainer/nsinit/main.go | 37 ++++++++++++++++++++++++++--------- 6 files changed, 55 insertions(+), 55 deletions(-) diff --git a/libcontainer/container.go b/libcontainer/container.go index c8dbdd6..763526f 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -5,17 +5,12 @@ type Container struct { ReadonlyFs bool `json:"readonly_fs,omitempty"` User string `json:"user,omitempty"` WorkingDir string `json:"working_dir,omitempty"` - Command *Command `json:"command,omitempty"` + Env []string `json:"environment,omitempty"` Namespaces Namespaces `json:"namespaces,omitempty"` Capabilities Capabilities `json:"capabilities,omitempty"` Network *Network `json:"network,omitempty"` } -type Command struct { - Args []string `json:"args,omitempty"` - Env []string `json:"environment,omitempty"` -} - type Network struct { IP string `json:"ip,omitempty"` Gateway string `json:"gateway,omitempty"` diff --git a/libcontainer/container.json b/libcontainer/container.json index c5807a7..ccc9abb 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -1,16 +1,11 @@ { "hostname": "koye", - "command": { - "args": [ - "/bin/bash" - ], - "environment": [ - "HOME=/", - "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", - "container=docker", - "TERM=xterm-256color" - ] - }, + "environment": [ + 
"HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm-256color" + ], "namespaces": [ "NEWIPC", "NEWNS", diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 4abebd2..67f907a 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -16,17 +16,13 @@ import ( "syscall" ) -func execCommand(container *libcontainer.Container) (int, error) { +func execCommand(container *libcontainer.Container, args []string) (int, error) { master, console, err := createMasterAndConsole() if err != nil { return -1, err } - command := exec.Command("nsinit", "init", console) - command.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: uintptr(getNamespaceFlags(container.Namespaces) | syscall.CLONE_VFORK), // we need CLONE_VFORK so we can wait on the child - } - + command := createCommand(container, console, args) // create a pipe so that we can syncronize with the namespaced process and // pass the veth name to the child inPipe, err := command.StdinPipe() @@ -39,6 +35,7 @@ func execCommand(container *libcontainer.Container) (int, error) { if err := writePidFile(command); err != nil { return -1, err } + defer deletePidFile() if container.Network != nil { vethPair, err := setupVeth(container.Network.Bridge, command.Process.Pid) @@ -134,3 +131,15 @@ func createVethPair() (name1 string, name2 string, err error) { func writePidFile(command *exec.Cmd) error { return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(command.Process.Pid)), 0655) } + +func deletePidFile() error { + return os.Remove(".nspid") +} + +func createCommand(container *libcontainer.Container, console string, args []string) *exec.Cmd { + command := exec.Command("nsinit", append([]string{"init", console}, args...)...) 
+ command.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: uintptr(getNamespaceFlags(container.Namespaces) | syscall.CLONE_VFORK), // we need CLONE_VFORK so we can wait on the child + } + return command +} diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 362cf5a..7f32620 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -5,19 +5,13 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/system" - "io/ioutil" "os" "path/filepath" "strconv" "syscall" ) -func execinCommand(container *libcontainer.Container) (int, error) { - nspid, err := readPid() - if err != nil { - return -1, err - } - +func execinCommand(container *libcontainer.Container, nspid int, args []string) (int, error) { for _, ns := range container.Namespaces { if err := system.Unshare(namespaceMap[ns]); err != nil { return -1, err @@ -67,7 +61,7 @@ func execinCommand(container *libcontainer.Container) (int, error) { if err := capabilities.DropCapabilities(container); err != nil { return -1, fmt.Errorf("drop capabilities %s", err) } - if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + if err := system.Exec(args[0], args[0:], container.Env); err != nil { return -1, err } } @@ -84,24 +78,12 @@ func execinCommand(container *libcontainer.Container) (int, error) { if err := capabilities.DropCapabilities(container); err != nil { return -1, fmt.Errorf("drop capabilities %s", err) } - if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + if err := system.Exec(args[0], args[0:], container.Env); err != nil { return -1, err } panic("unreachable") } -func readPid() (int, error) { - data, err := ioutil.ReadFile(".nspid") - if err != nil { - return -1, err - } - pid, err := strconv.Atoi(string(data)) - if err != nil { - return -1, err - } - 
return pid, nil -} - func getNsFds(pid int, container *libcontainer.Container) ([]uintptr, error) { fds := make([]uintptr, len(container.Namespaces)) for i, ns := range container.Namespaces { diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index d853a32..82706fd 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -14,7 +14,7 @@ import ( "syscall" ) -func initCommand(container *libcontainer.Container, console string) error { +func initCommand(container *libcontainer.Container, console string, args []string) error { rootfs, err := resolveRootfs() if err != nil { return err @@ -72,7 +72,7 @@ func initCommand(container *libcontainer.Container, console string) error { return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) } } - if err := system.Exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil { + if err := system.Exec(args[0], args[0:], container.Env); err != nil { return fmt.Errorf("exec %s", err) } panic("unreachable") diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go index 8fe700e..30c8b06 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/main.go @@ -4,8 +4,10 @@ import ( "encoding/json" "errors" "github.com/dotcloud/docker/pkg/libcontainer" + "io/ioutil" "log" "os" + "strconv" ) var ( @@ -25,24 +27,29 @@ func main() { } switch os.Args[1] { case "exec": - exitCode, err := execCommand(container) + var exitCode int + nspid, err := readPid() + if err != nil { + if !os.IsNotExist(err) { + log.Fatal(err) + } + } + if nspid > 0 { + exitCode, err = execinCommand(container, nspid, os.Args[2:]) + } else { + exitCode, err = execCommand(container, os.Args[2:]) + } if err != nil { log.Fatal(err) } os.Exit(exitCode) case "init": - if argc != 3 { + if argc < 3 { log.Fatal(ErrWrongArguments) } - if err := initCommand(container, os.Args[2]); err != nil { + if err := initCommand(container, os.Args[2], os.Args[3:]); err != nil { log.Fatal(err) } 
- case "execin": - exitCode, err := execinCommand(container) - if err != nil { - log.Fatal(err) - } - os.Exit(exitCode) default: log.Fatalf("command not supported for nsinit %s", os.Args[1]) } @@ -61,3 +68,15 @@ func loadContainer() (*libcontainer.Container, error) { } return container, nil } + +func readPid() (int, error) { + data, err := ioutil.ReadFile(".nspid") + if err != nil { + return -1, err + } + pid, err := strconv.Atoi(string(data)) + if err != nil { + return -1, err + } + return pid, nil +} From 3ded9b431a10ec555bc64bd313db56e0eb0766c3 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 21:15:44 -0800 Subject: [PATCH 039/117] Update readme and add TODO Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 75 +++++++++++++++++++++++++----------------- libcontainer/TODO.md | 17 ++++++++++ 2 files changed, 61 insertions(+), 31 deletions(-) create mode 100644 libcontainer/TODO.md diff --git a/libcontainer/README.md b/libcontainer/README.md index 91d7478..07fe4f7 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -1,39 +1,34 @@ ## libcontainer - reference implementation for containers -#### playground +#### background + +libcontainer specifies configuration options for what a container is. It provides a native Go implementation +for using linux namespaces with no external dependencies. libcontainer provides many convience functions for working with namespaces, networking, and management. -Use the cli package to test out functionality - -First setup a container configuration. You will need a root fs, better go the path to a -stopped docker container and use that. - +#### container +A container is a self contained directory that is able to run one or more processes inside without +affecting the host system. The directory is usually a full system tree. 
Inside the directory +a `container.json` file must be placed with the runtime configuration for how the process +should be contained and run. Environment, networking, and different capabilities for the +process are specified in this file. +Sample `container.json` file: ```json { - "id": "koye", - "namespace_pid": 12265, - "command": { - "args": [ - "/bin/bash" - ], - "environment": [ - "HOME=/", - "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", - "container=docker", - "TERM=xterm" - ] - }, - "rootfs": "/root/development/gocode/src/github.com/docker/libcontainer/namespaces/ubuntu", - "network": null, - "user": "", - "working_dir": "", + "hostname": "koye", + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm-256color" + ], "namespaces": [ - "NEWNET", "NEWIPC", "NEWNS", "NEWPID", - "NEWUTS" + "NEWUTS", + "NEWNET" ], "capabilities": [ "SETPCAP", @@ -50,14 +45,32 @@ stopped docker container and use that. "AUDIT_CONTROL", "MAC_OVERRIDE", "MAC_ADMIN" - ] + ], + "network": { + "ip": "172.17.0.100/16", + "gateway": "172.17.42.1", + "bridge": "docker0", + "mtu": 1500 + } } ``` -After you have a json file and a rootfs path to use just run: -`./cli exec container.json` +Using this configuration and the current directory holding the rootfs for a process to live, one can use libcontainer to exec the container. During the life of the namespace a `.nspid` file +is written to the current directory with the pid of the namespace'd process to the external world. A client can use this pid to wait, kill, or perform other operations with the container. If a user tries to run a new process inside an existing container with a live namespace, the namespace will be joined by the new process. 
-If you want to attach to an existing namespace just use the same json -file with the container still running and do: -`./cli execin container.json` +#### nsinit + +`nsinit` is a cli application used as the reference implementation of libcontainer. It is able to +spawn or join new containers given the current directory. To use `nsinit` cd into a linux +rootfs and copy a `container.json` file into the directory with your specified configuration. + +To execute `/bin/bash` in the current directory as a container just run: +```bash +nsinit exec /bin/bash +``` + +If you wish to spawn another process inside the container while your current bash session is +running just run the exact same command again to get another bash shell or change the command. If the original process dies, PID 1, all other processes spawned inside the container will also be killed and the namespace will be removed. + +You can identify if a process is running in a container by looking to see if `.nspid` is in the root of the directory. 
diff --git a/libcontainer/TODO.md b/libcontainer/TODO.md new file mode 100644 index 0000000..f18c0b4 --- /dev/null +++ b/libcontainer/TODO.md @@ -0,0 +1,17 @@ +#### goals +* small and simple - line count is not everything but less code is better +* clean lines between what we do in the pkg +* provide primitives for working with namespaces not cater to every option +* extend via configuration not by features - host networking, no networking, veth network can be accomplished via adjusting the container.json, nothing to do with code + +#### tasks +* proper tty for a new process in an existing container +* use exec or raw syscalls for new process in existing container +* setup proper user in namespace if specified +* implement hook or clean interface for cgroups +* example configs for different setups (host networking, boot init) +* improve pkg documentation with comments +* testing - this is hard in a low level pkg but we could do some, maybe +* pivot root +* selinux +* apparmor From ca0e7f087cadb8e20759aee81668dcf1f936c8a6 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 21:21:49 -0800 Subject: [PATCH 040/117] Add CAP_NET_ADMIN Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/capabilities/capabilities.go | 1 + libcontainer/types.go | 1 + 2 files changed, 2 insertions(+) diff --git a/libcontainer/capabilities/capabilities.go b/libcontainer/capabilities/capabilities.go index 3301e10..c19b719 100644 --- a/libcontainer/capabilities/capabilities.go +++ b/libcontainer/capabilities/capabilities.go @@ -21,6 +21,7 @@ var capMap = map[libcontainer.Capability]capability.Cap{ libcontainer.CAP_AUDIT_CONTROL: capability.CAP_AUDIT_CONTROL, libcontainer.CAP_MAC_OVERRIDE: capability.CAP_MAC_OVERRIDE, libcontainer.CAP_MAC_ADMIN: capability.CAP_MAC_ADMIN, + libcontainer.CAP_NET_ADMIN: capability.CAP_NET_ADMIN, } // DropCapabilities drops capabilities for the current process based diff --git a/libcontainer/types.go 
b/libcontainer/types.go index b5d9932..fcd00fd 100644 --- a/libcontainer/types.go +++ b/libcontainer/types.go @@ -15,6 +15,7 @@ const ( CAP_AUDIT_CONTROL Capability = "AUDIT_CONTROL" CAP_MAC_OVERRIDE Capability = "MAC_OVERRIDE" CAP_MAC_ADMIN Capability = "MAC_ADMIN" + CAP_NET_ADMIN Capability = "NET_ADMIN" CLONE_NEWNS Namespace = "NEWNS" // mount CLONE_NEWUTS Namespace = "NEWUTS" // utsname From c20c1dfb04c6b47f2febe60dda05d85309884a07 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 22:43:40 -0800 Subject: [PATCH 041/117] Add comments to many functions Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/capabilities/capabilities.go | 1 + libcontainer/container.go | 22 +++++++----- libcontainer/nsinit/exec.go | 23 ++++++++++--- libcontainer/nsinit/execin.go | 10 ++---- libcontainer/nsinit/init.go | 18 ++++++---- libcontainer/nsinit/main.go | 4 +-- libcontainer/nsinit/mount.go | 41 ++++++++++++++++------- libcontainer/nsinit/ns_linux.go | 3 ++ libcontainer/types.go | 18 +++++++--- libcontainer/utils/utils.go | 2 ++ 10 files changed, 97 insertions(+), 45 deletions(-) diff --git a/libcontainer/capabilities/capabilities.go b/libcontainer/capabilities/capabilities.go index c19b719..65fd455 100644 --- a/libcontainer/capabilities/capabilities.go +++ b/libcontainer/capabilities/capabilities.go @@ -41,6 +41,7 @@ func DropCapabilities(container *libcontainer.Container) error { return nil } +// getCapabilities returns the specific cap values for the libcontainer types func getCapabilities(container *libcontainer.Container) []capability.Cap { drop := []capability.Cap{} for _, c := range container.Capabilities { diff --git a/libcontainer/container.go b/libcontainer/container.go index 763526f..a6a57da 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -1,16 +1,22 @@ package libcontainer +// Container defines configuration options for how a +// container is setup inside a directory and how a process 
should be executed type Container struct { - Hostname string `json:"hostname,omitempty"` - ReadonlyFs bool `json:"readonly_fs,omitempty"` - User string `json:"user,omitempty"` - WorkingDir string `json:"working_dir,omitempty"` - Env []string `json:"environment,omitempty"` - Namespaces Namespaces `json:"namespaces,omitempty"` - Capabilities Capabilities `json:"capabilities,omitempty"` - Network *Network `json:"network,omitempty"` + Hostname string `json:"hostname,omitempty"` // hostname + ReadonlyFs bool `json:"readonly_fs,omitempty"` // set the containers rootfs as readonly + User string `json:"user,omitempty"` // user to execute the process as + WorkingDir string `json:"working_dir,omitempty"` // current working directory + Env []string `json:"environment,omitempty"` // environment to set + Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply + Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop + Network *Network `json:"network,omitempty"` // nil for host's network stack } +// Network defines configuration for a container's networking stack +// +// The network configuration can be omited from a container causing the +// container to be setup with the host's networking stack type Network struct { IP string `json:"ip,omitempty"` Gateway string `json:"gateway,omitempty"` diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 67f907a..202cfca 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -38,7 +38,7 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) defer deletePidFile() if container.Network != nil { - vethPair, err := setupVeth(container.Network.Bridge, command.Process.Pid) + vethPair, err := initializeContainerVeth(container.Network.Bridge, command.Process.Pid) if err != nil { return -1, err } @@ -63,14 +63,21 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) return 
command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } +// sendVethName writes the veth pair name to the child's stdin then closes the +// pipe so that the child stops waiting for more data func sendVethName(name string, pipe io.WriteCloser) { - // write the veth pair name to the child's stdin then close the - // pipe so that the child stops waiting fmt.Fprint(pipe, name) pipe.Close() } -func setupVeth(bridge string, nspid int) (string, error) { +// initializeContainerVeth will create a veth pair and setup the host's +// side of the pair by setting the specified bridge as the master and bringing +// up the interface. +// +// Then will with set the other side of the veth pair into the container's namespaced +// using the pid and returns the veth's interface name to provide to the container to +// finish setting up the interface inside the namespace +func initializeContainerVeth(bridge string, nspid int) (string, error) { name1, name2, err := createVethPair() if err != nil { return "", err @@ -98,6 +105,8 @@ func setupWindow(master *os.File) (*term.State, error) { return term.SetRawTerminal(os.Stdin.Fd()) } +// createMasterAndConsole will open /dev/ptmx on the host and retreive the +// pts name for use as the pty slave inside the container func createMasterAndConsole() (*os.File, string, error) { master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) if err != nil { @@ -113,6 +122,8 @@ func createMasterAndConsole() (*os.File, string, error) { return master, console, nil } +// createVethPair will automatically generage two random names for +// the veth pair and ensure that they have been created func createVethPair() (name1 string, name2 string, err error) { name1, err = utils.GenerateRandomName("dock", 4) if err != nil { @@ -128,6 +139,7 @@ func createVethPair() (name1 string, name2 string, err error) { return } +// writePidFile writes the namespaced processes pid to .nspid in the rootfs for the container func 
writePidFile(command *exec.Cmd) error { return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(command.Process.Pid)), 0655) } @@ -136,6 +148,9 @@ func deletePidFile() error { return os.Remove(".nspid") } +// createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces +// defined on the container's configuration and use the current binary as the init with the +// args provided func createCommand(container *libcontainer.Container, console string, args []string) *exec.Cmd { command := exec.Command("nsinit", append([]string{"init", console}, args...)...) command.SysProcAttr = &syscall.SysProcAttr{ diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 7f32620..d6224f9 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -28,6 +28,7 @@ func execinCommand(container *libcontainer.Container, nspid int, args []string) return -1, err } + // foreach namespace fd, use setns to join an existing container's namespaces for _, fd := range fds { if fd > 0 { if err := system.Setns(fd, 0); err != nil { @@ -42,7 +43,6 @@ func execinCommand(container *libcontainer.Container, nspid int, args []string) // remount proc and sys to pick up the changes if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { - pid, err := system.Fork() if err != nil { return -1, err @@ -58,12 +58,7 @@ func execinCommand(container *libcontainer.Container, nspid int, args []string) if err := remountSys(); err != nil { return -1, fmt.Errorf("remount sys %s", err) } - if err := capabilities.DropCapabilities(container); err != nil { - return -1, fmt.Errorf("drop capabilities %s", err) - } - if err := system.Exec(args[0], args[0:], container.Env); err != nil { - return -1, err - } + goto dropAndExec } proc, err := os.FindProcess(pid) if err != nil { @@ -75,6 +70,7 @@ func execinCommand(container *libcontainer.Container, nspid int, args []string) } 
os.Exit(state.Sys().(syscall.WaitStatus).ExitStatus()) } +dropAndExec: if err := capabilities.DropCapabilities(container); err != nil { return -1, fmt.Errorf("drop capabilities %s", err) } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 82706fd..c77fd90 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -37,9 +37,6 @@ func initCommand(container *libcontainer.Container, console string, args []strin if err != nil { return fmt.Errorf("open terminal %s", err) } - if slave.Fd() != 0 { - return fmt.Errorf("slave fd should be 0") - } if err := dupSlave(slave); err != nil { return fmt.Errorf("dup2 slave %s", err) } @@ -55,7 +52,7 @@ func initCommand(container *libcontainer.Container, console string, args []strin if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { return fmt.Errorf("setup mount namespace %s", err) } - if err := setupNetworking(container.Network, tempVethName); err != nil { + if err := setupVethNetwork(container.Network, tempVethName); err != nil { return fmt.Errorf("setup networking %s", err) } if err := system.Sethostname(container.Hostname); err != nil { @@ -78,6 +75,8 @@ func initCommand(container *libcontainer.Container, console string, args []strin panic("unreachable") } +// resolveRootfs ensures that the current working directory is +// not a symlink and returns the absolute path to the rootfs func resolveRootfs() (string, error) { cwd, err := os.Getwd() if err != nil { @@ -104,8 +103,9 @@ func setupUser(container *libcontainer.Container) error { return nil } +// dupSlave dup2 the pty slave's fd into stdout and stdin and ensures that +// the slave's fd is 0, or stdin func dupSlave(slave *os.File) error { - // we close Stdin,etc so our pty slave should have fd 0 if slave.Fd() != 0 { return fmt.Errorf("slave fd not 0 %d", slave.Fd()) } @@ -118,7 +118,8 @@ func dupSlave(slave *os.File) error { return nil } -// openTerminal is a clone of os.OpenFile without the 
O_CLOEXEC addition. +// openTerminal is a clone of os.OpenFile without the O_CLOEXEC +// used to open the pty slave inside the container namespace func openTerminal(name string, flag int) (*os.File, error) { r, e := syscall.Open(name, flag, 0) if e != nil { @@ -127,7 +128,10 @@ func openTerminal(name string, flag int) (*os.File, error) { return os.NewFile(uintptr(r), name), nil } -func setupNetworking(config *libcontainer.Network, tempVethName string) error { +// setupVethNetwork uses the Network config if it is not nil to initialize +// the new veth interface inside the container for use by changing the name to eth0 +// setting the MTU and IP address along with the default gateway +func setupVethNetwork(config *libcontainer.Network, tempVethName string) error { if config != nil { if err := network.InterfaceDown(tempVethName); err != nil { return fmt.Errorf("interface down %s %s", tempVethName, err) diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go index 30c8b06..f45fe55 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/main.go @@ -26,7 +26,7 @@ func main() { log.Fatal(ErrWrongArguments) } switch os.Args[1] { - case "exec": + case "exec": // this is executed outside of the namespace in the cwd var exitCode int nspid, err := readPid() if err != nil { @@ -43,7 +43,7 @@ func main() { log.Fatal(err) } os.Exit(exitCode) - case "init": + case "init": // this is executed inside of the namespace to setup the container if argc < 3 { log.Fatal(ErrWrongArguments) } diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index baa850f..6eb2e09 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -10,10 +10,16 @@ import ( "syscall" ) -// default mount point options +// default mount point flags const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV +// setupNewMountNamespace is used to initialize a new mount namespace for an new +// container in the rootfs that is 
specified. +// +// There is no need to unmount the new mounts because as soon as the mount namespace +// is no longer in use, the mounts will be removed automatically func setupNewMountNamespace(rootfs, console string, readonly bool) error { + // mount as slave so that the new mounts do not propagate to the host if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting / as slave %s", err) } @@ -55,6 +61,7 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { return nil } +// copyDevNodes mknods the hosts devices so the new container has access to them func copyDevNodes(rootfs string) error { oldMask := system.Umask(0000) defer system.Umask(oldMask) @@ -82,6 +89,8 @@ func copyDevNodes(rootfs string) error { return nil } +// setupDev symlinks the current processes pipes into the +// appropriate destination on the containers rootfs func setupDev(rootfs string) error { for _, link := range []struct { from string @@ -104,6 +113,7 @@ func setupDev(rootfs string) error { return nil } +// setupConsole ensures that the container has a proper /dev/console setup func setupConsole(rootfs, console string) error { oldMask := system.Umask(0000) defer system.Umask(oldMask) @@ -161,6 +171,24 @@ func mountSystem(rootfs string) error { return nil } +// setupPtmx adds a symlink to pts/ptmx for /dev/ptmx and +// finishes setting up /dev/console +func setupPtmx(rootfs, console string) error { + ptmx := filepath.Join(rootfs, "dev/ptmx") + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Symlink("pts/ptmx", ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + if err := setupConsole(rootfs, console); err != nil { + return err + } + return nil +} + +// remountProc is used to detach and remount the proc filesystem +// commonly needed with running a new process inside an existing container func remountProc() error { if err := 
system.Unmount("/proc", syscall.MNT_DETACH); err != nil { return err @@ -183,14 +211,3 @@ func remountSys() error { } return nil } - -func setupPtmx(rootfs, console string) error { - ptmx := filepath.Join(rootfs, "dev/ptmx") - if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { - return err - } - if err := os.Symlink("pts/ptmx", ptmx); err != nil { - return fmt.Errorf("symlink dev ptmx %s", err) - } - return setupConsole(rootfs, console) -} diff --git a/libcontainer/nsinit/ns_linux.go b/libcontainer/nsinit/ns_linux.go index a2809eb..481bdf7 100644 --- a/libcontainer/nsinit/ns_linux.go +++ b/libcontainer/nsinit/ns_linux.go @@ -14,6 +14,9 @@ var namespaceMap = map[libcontainer.Namespace]int{ libcontainer.CLONE_NEWNET: syscall.CLONE_NEWNET, } +// namespaceFileMap is used to convert the libcontainer types +// into the names of the files located in /proc//ns/* for +// each namespace var namespaceFileMap = map[libcontainer.Namespace]string{ libcontainer.CLONE_NEWNS: "mnt", libcontainer.CLONE_NEWUTS: "uts", diff --git a/libcontainer/types.go b/libcontainer/types.go index fcd00fd..bb54ff5 100644 --- a/libcontainer/types.go +++ b/libcontainer/types.go @@ -1,5 +1,8 @@ package libcontainer +// These constants are defined as string types so that +// it is clear when adding the configuration in config files +// instead of using ints or other types const ( CAP_SETPCAP Capability = "SETPCAP" CAP_SYS_MODULE Capability = "SYS_MODULE" @@ -25,9 +28,15 @@ const ( CLONE_NEWNET Namespace = "NEWNET" // network ) -type Namespace string -type Namespaces []Namespace +type ( + Namespace string + Namespaces []Namespace + Capability string + Capabilities []Capability +) +// Contains returns true if the specified Namespace is +// in the slice func (n Namespaces) Contains(ns Namespace) bool { for _, nns := range n { if nns == ns { @@ -37,9 +46,8 @@ func (n Namespaces) Contains(ns Namespace) bool { return false } -type Capability string -type Capabilities []Capability - +// Contains 
returns true if the specified Capability is +// in the slice func (c Capabilities) Contains(capp Capability) bool { for _, cc := range c { if cc == capp { diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go index d3223c3..5050997 100644 --- a/libcontainer/utils/utils.go +++ b/libcontainer/utils/utils.go @@ -6,6 +6,8 @@ import ( "io" ) +// GenerateRandomName returns a new name joined with a prefix. This size +// specified is used to truncate the randomly generated value func GenerateRandomName(prefix string, size int) (string, error) { id := make([]byte, 32) if _, err := io.ReadFull(rand.Reader, id); err != nil { From d68ba7742fdcb5d0dea815e7bd27c2035301660d Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 19 Feb 2014 22:46:02 -0800 Subject: [PATCH 042/117] Remove privileged.json config Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/privileged.json | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 libcontainer/privileged.json diff --git a/libcontainer/privileged.json b/libcontainer/privileged.json deleted file mode 100644 index be877ad..0000000 --- a/libcontainer/privileged.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "id": "koye", - "namespace_pid": 3745, - "command": { - "args": [ - "/usr/lib/systemd/systemd" - ], - "environment": [ - "HOME=/", - "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", - "container=docker", - "TERM=" - ] - }, - "rootfs": "/root/main/mycontainer", - "namespaces": [ - "NEWIPC", - "NEWNS", - "NEWPID", - "NEWUTS" - ] -} From 8590435fa025241ca28be9780026d6ca571c6816 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 12:00:54 -0800 Subject: [PATCH 043/117] WIP for setup kmsg Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/mount.go | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 
6eb2e09..67f9020 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -43,6 +43,9 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := setupPtmx(rootfs, console); err != nil { return err } + if err := setupKmsg(rootfs); err != nil { + return err + } if err := system.Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) } @@ -211,3 +214,32 @@ func remountSys() error { } return nil } + +func setupKmsg(rootfs string) error { + oldMask := system.Umask(0000) + defer system.Umask(oldMask) + + var ( + source = filepath.Join(rootfs, "dev/kmsg") + dest = filepath.Join(rootfs, "proc/kmsg") + ) + + if err := system.Mkfifo(source, 0600); err != nil { + return err + } + + os.Chmod(source, 0600) + os.Chown(source, 0, 0) + + if err := system.Mount(source, dest, "bind", syscall.MS_BIND, ""); err != nil { + return err + } + _, err := os.OpenFile(source, syscall.O_RDWR|syscall.O_NDELAY|syscall.O_CLOEXEC, 0) + if err != nil { + return err + } + if err := syscall.Unlink(source); err != nil { + return err + } + return nil +} From 3de41b34a2c49cea867223a4269e1c543bd66ec3 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Thu, 20 Feb 2014 23:12:08 +0100 Subject: [PATCH 044/117] libcontainer: Initial version of cgroups support This is a minimal version of raw cgroup support for libcontainer. It has only enough for what docker needs, and it has no support for systemd yet. 
Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- cgroups/cgroups.go | 16 ++- libcontainer/cgroup/cgroup.go | 177 ++++++++++++++++++++++++++++++++++ libcontainer/container.go | 7 ++ libcontainer/container.json | 5 +- libcontainer/nsinit/exec.go | 13 ++- libcontainer/nsinit/init.go | 10 +- 6 files changed, 218 insertions(+), 10 deletions(-) create mode 100644 libcontainer/cgroup/cgroup.go diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index 91ac384..b9318f9 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -40,6 +40,16 @@ func GetThisCgroupDir(subsystem string) (string, error) { return parseCgroupFile(subsystem, f) } +func GetInitCgroupDir(subsystem string) (string, error) { + f, err := os.Open("/proc/1/cgroup") + if err != nil { + return "", err + } + defer f.Close() + + return parseCgroupFile(subsystem, f) +} + func parseCgroupFile(subsystem string, r io.Reader) (string, error) { s := bufio.NewScanner(r) @@ -49,8 +59,10 @@ func parseCgroupFile(subsystem string, r io.Reader) (string, error) { } text := s.Text() parts := strings.Split(text, ":") - if parts[1] == subsystem { - return parts[2], nil + for _, subs := range strings.Split(parts[1], ",") { + if subs == subsystem { + return parts[2], nil + } } } return "", fmt.Errorf("cgroup '%s' not found in /proc/self/cgroup", subsystem) diff --git a/libcontainer/cgroup/cgroup.go b/libcontainer/cgroup/cgroup.go new file mode 100644 index 0000000..e30262c --- /dev/null +++ b/libcontainer/cgroup/cgroup.go @@ -0,0 +1,177 @@ +package cgroup + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/cgroups" + "github.com/dotcloud/docker/pkg/libcontainer" + "io/ioutil" + "os" + "path/filepath" + "strconv" +) + +// We have two implementation of cgroups support, one is based on +// systemd and the dbus api, and one is based on raw cgroup fs operations +// following the pre-single-writer model docs at: +// http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ +const ( + cgroupRoot = 
"/sys/fs/cgroup" +) + +func useSystemd() bool { + return false +} + +func applyCgroupSystemd(container *libcontainer.Container, pid int) error { + return fmt.Errorf("not supported yet") +} + +func writeFile(dir, file, data string) error { + return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) +} + +func getCgroup(subsystem string, container *libcontainer.Container) (string, error) { + cgroup := container.CgroupName + if container.CgroupParent != "" { + cgroup = filepath.Join(container.CgroupParent, cgroup) + } + + initPath, err := cgroups.GetInitCgroupDir(subsystem) + if err != nil { + return "", err + } + + path := filepath.Join(cgroupRoot, subsystem, initPath, cgroup) + + return path, nil +} + +func joinCgroup(subsystem string, container *libcontainer.Container, pid int) (string, error) { + path, err := getCgroup(subsystem, container) + if err != nil { + return "", err + } + + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return "", err + } + + if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil { + return "", err + } + + return path, nil +} + +func applyCgroupRaw(container *libcontainer.Container, pid int) (retErr error) { + if _, err := os.Stat(cgroupRoot); err != nil { + return fmt.Errorf("cgroups fs not found") + } + + if !container.DeviceAccess { + dir, err := joinCgroup("devices", container, pid) + if err != nil { + return err + } + defer func() { + if retErr != nil { + os.RemoveAll(dir) + } + }() + + if err := writeFile(dir, "devices.deny", "a"); err != nil { + return err + } + + allow := []string{ + // /dev/null, zero, full + "c 1:3 rwm", + "c 1:5 rwm", + "c 1:7 rwm", + + // consoles + "c 5:1 rwm", + "c 5:0 rwm", + "c 4:0 rwm", + "c 4:1 rwm", + + // /dev/urandom,/dev/random + "c 1:9 rwm", + "c 1:8 rwm", + + // /dev/pts/ - pts namespaces are "coming soon" + "c 136:* rwm", + "c 5:2 rwm", + + // tuntap + "c 10:200 rwm", + } + + for _, val := range allow { + if err := writeFile(dir, "devices.allow", val); 
err != nil { + return err + } + } + } + + if container.Memory != 0 || container.MemorySwap != 0 { + dir, err := joinCgroup("memory", container, pid) + if err != nil { + return err + } + defer func() { + if retErr != nil { + os.RemoveAll(dir) + } + }() + + if container.Memory != 0 { + if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil { + return err + } + if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil { + return err + } + } + if container.MemorySwap != 0 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(container.MemorySwap, 10)); err != nil { + return err + } + } + } + + // We always want to join the cpu group, to allow fair cpu scheduling + // on a container basis + dir, err := joinCgroup("cpu", container, pid) + if err != nil { + return err + } + if container.CpuShares != 0 { + if err := writeFile(dir, "cpu.shares", strconv.FormatInt(container.CpuShares, 10)); err != nil { + return err + } + } + return nil +} + +func CleanupCgroup(container *libcontainer.Container) error { + path, _ := getCgroup("memory", container) + os.RemoveAll(path) + path, _ = getCgroup("devices", container) + os.RemoveAll(path) + path, _ = getCgroup("cpu", container) + os.RemoveAll(path) + return nil +} + +func ApplyCgroup(container *libcontainer.Container, pid int) error { + if container.CgroupName == "" { + return nil + } + + if useSystemd() { + return applyCgroupSystemd(container, pid) + } else { + return applyCgroupRaw(container, pid) + } +} diff --git a/libcontainer/container.go b/libcontainer/container.go index a6a57da..b34ac8b 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -11,6 +11,13 @@ type Container struct { Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop Network *Network `json:"network,omitempty"` // nil 
for host's network stack + + CgroupName string `json:"cgroup_name,omitempty"` // name of cgroup + CgroupParent string `json:"cgroup_parent,omitempty"` // name of parent cgroup or slice + DeviceAccess bool `json:"device_access,omitempty"` // name of parent cgroup or slice + Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) + MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap + CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) } // Network defines configuration for a container's networking stack diff --git a/libcontainer/container.json b/libcontainer/container.json index ccc9abb..3e23600 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -34,5 +34,8 @@ "gateway": "172.17.42.1", "bridge": "docker0", "mtu": 1500 - } + }, + "cgroup_name": "docker-koye", + "cgroup_parent": "docker", + "memory": 524800 } diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 202cfca..acff647 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -5,6 +5,7 @@ package main import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/cgroup" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/libcontainer/utils" "github.com/dotcloud/docker/pkg/system" @@ -33,10 +34,18 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) return -1, err } if err := writePidFile(command); err != nil { + command.Process.Kill() return -1, err } defer deletePidFile() + // Do this before syncing with child so that no children + // can escape the cgroup + if err := cgroup.ApplyCgroup(container, command.Process.Pid); err != nil { + command.Process.Kill() + return -1, err + } + if container.Network != nil { vethPair, err := initializeContainerVeth(container.Network.Bridge, command.Process.Pid) if err != nil { @@ 
-45,6 +54,9 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) sendVethName(vethPair, inPipe) } + // Sync with child + inPipe.Close() + go io.Copy(os.Stdout, master) go io.Copy(master, os.Stdin) @@ -67,7 +79,6 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) // pipe so that the child stops waiting for more data func sendVethName(name string, pipe io.WriteCloser) { fmt.Fprint(pipe, name) - pipe.Close() } // initializeContainerVeth will create a veth pair and setup the host's diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index c77fd90..f619276 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -20,12 +20,10 @@ func initCommand(container *libcontainer.Container, console string, args []strin return err } - var tempVethName string - if container.Network != nil { - tempVethName, err = getVethName() - if err != nil { - return err - } + // We always read this as it is a way to sync with the parent as well + tempVethName, err := getVethName() + if err != nil { + return err } // close pipes so that we can replace it with the pty From 848fd7638b91f46b908e4afd90e96fc2c0155826 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 14:40:00 -0800 Subject: [PATCH 045/117] Revert "WIP for setup kmsg" This reverts commit 80db9a918337c4ae80ffa9a001da13bd24e848c8. 
Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/mount.go | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 67f9020..6eb2e09 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -43,9 +43,6 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := setupPtmx(rootfs, console); err != nil { return err } - if err := setupKmsg(rootfs); err != nil { - return err - } if err := system.Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) } @@ -214,32 +211,3 @@ func remountSys() error { } return nil } - -func setupKmsg(rootfs string) error { - oldMask := system.Umask(0000) - defer system.Umask(oldMask) - - var ( - source = filepath.Join(rootfs, "dev/kmsg") - dest = filepath.Join(rootfs, "proc/kmsg") - ) - - if err := system.Mkfifo(source, 0600); err != nil { - return err - } - - os.Chmod(source, 0600) - os.Chown(source, 0, 0) - - if err := system.Mount(source, dest, "bind", syscall.MS_BIND, ""); err != nil { - return err - } - _, err := os.OpenFile(source, syscall.O_RDWR|syscall.O_NDELAY|syscall.O_CLOEXEC, 0) - if err != nil { - return err - } - if err := syscall.Unlink(source); err != nil { - return err - } - return nil -} From b90aaf6828eab4f2bbdba82c3e0e4cd0d356b6ad Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 14:40:36 -0800 Subject: [PATCH 046/117] Remove clone_vfork Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index acff647..f73ad32 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -165,7 +165,7 @@ func deletePidFile() error { func createCommand(container *libcontainer.Container, console string, args []string) *exec.Cmd 
{ command := exec.Command("nsinit", append([]string{"init", console}, args...)...) command.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: uintptr(getNamespaceFlags(container.Namespaces) | syscall.CLONE_VFORK), // we need CLONE_VFORK so we can wait on the child + Cloneflags: uintptr(getNamespaceFlags(container.Namespaces)), } return command } From 1cecb000035a2d16a99a80a20871ea4d207d29b1 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 15:48:48 -0800 Subject: [PATCH 047/117] Refactory cgroups into general pkg Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- cgroups/cgroups.go | 61 +++++++++++++++- libcontainer/cgroup/cgroup.go | 131 ++++++++++++---------------------- libcontainer/container.go | 28 ++++---- libcontainer/container.json | 8 ++- 4 files changed, 124 insertions(+), 104 deletions(-) diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index b9318f9..1e96caa 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -5,10 +5,23 @@ import ( "fmt" "github.com/dotcloud/docker/pkg/mount" "io" + "io/ioutil" "os" + "path/filepath" + "strconv" "strings" ) +type Cgroup struct { + Name string `json:"name,omitempty"` + Parent string `json:"parent,omitempty"` + + DeviceAccess bool `json:"device_access,omitempty"` // name of parent cgroup or slice + Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) + MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap + CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. 
other containers) +} + // https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt func FindCgroupMountpoint(subsystem string) (string, error) { mounts, err := mount.GetMounts() @@ -25,7 +38,6 @@ func FindCgroupMountpoint(subsystem string) (string, error) { } } } - return "", fmt.Errorf("cgroup mountpoint not found for %s", subsystem) } @@ -50,9 +62,50 @@ func GetInitCgroupDir(subsystem string) (string, error) { return parseCgroupFile(subsystem, f) } +func (c *Cgroup) Path(root, subsystem string) (string, error) { + cgroup := c.Name + if c.Parent != "" { + cgroup = filepath.Join(c.Parent, cgroup) + } + initPath, err := GetInitCgroupDir(subsystem) + if err != nil { + return "", err + } + return filepath.Join(root, subsystem, initPath, cgroup), nil +} + +func (c *Cgroup) Join(root, subsystem string, pid int) (string, error) { + path, err := c.Path(root, subsystem) + if err != nil { + return "", err + } + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return "", err + } + if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil { + return "", err + } + return path, nil +} + +func (c *Cgroup) Cleanup(root string) error { + get := func(subsystem string) string { + path, _ := c.Path(root, subsystem) + return path + } + + for _, path := range []string{ + get("memory"), + get("devices"), + get("cpu"), + } { + os.RemoveAll(path) + } + return nil +} + func parseCgroupFile(subsystem string, r io.Reader) (string, error) { s := bufio.NewScanner(r) - for s.Scan() { if err := s.Err(); err != nil { return "", err @@ -67,3 +120,7 @@ func parseCgroupFile(subsystem string, r io.Reader) (string, error) { } return "", fmt.Errorf("cgroup '%s' not found in /proc/self/cgroup", subsystem) } + +func writeFile(dir, file, data string) error { + return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) +} diff --git a/libcontainer/cgroup/cgroup.go b/libcontainer/cgroup/cgroup.go index e30262c..5f27ac3 100644 --- a/libcontainer/cgroup/cgroup.go 
+++ b/libcontainer/cgroup/cgroup.go @@ -10,71 +10,46 @@ import ( "strconv" ) -// We have two implementation of cgroups support, one is based on -// systemd and the dbus api, and one is based on raw cgroup fs operations -// following the pre-single-writer model docs at: -// http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ -const ( - cgroupRoot = "/sys/fs/cgroup" -) - -func useSystemd() bool { - return false -} - -func applyCgroupSystemd(container *libcontainer.Container, pid int) error { - return fmt.Errorf("not supported yet") -} - -func writeFile(dir, file, data string) error { - return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) -} - -func getCgroup(subsystem string, container *libcontainer.Container) (string, error) { - cgroup := container.CgroupName - if container.CgroupParent != "" { - cgroup = filepath.Join(container.CgroupParent, cgroup) +func ApplyCgroup(container *libcontainer.Container, pid int) (err error) { + if container.Cgroups == nil { + return nil } - initPath, err := cgroups.GetInitCgroupDir(subsystem) + // We have two implementation of cgroups support, one is based on + // systemd and the dbus api, and one is based on raw cgroup fs operations + // following the pre-single-writer model docs at: + // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ + // + // we can pick any subsystem to find the root + cgroupRoot, err := cgroups.FindCgroupMountpoint("memory") if err != nil { - return "", err + return err } - - path := filepath.Join(cgroupRoot, subsystem, initPath, cgroup) - - return path, nil -} - -func joinCgroup(subsystem string, container *libcontainer.Container, pid int) (string, error) { - path, err := getCgroup(subsystem, container) - if err != nil { - return "", err - } - - if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { - return "", err - } - - if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil { - return "", err - } - - return path, nil -} - -func 
applyCgroupRaw(container *libcontainer.Container, pid int) (retErr error) { + cgroupRoot = filepath.Dir(cgroupRoot) if _, err := os.Stat(cgroupRoot); err != nil { return fmt.Errorf("cgroups fs not found") } + if err := setupDevices(container, cgroupRoot, pid); err != nil { + return err + } + if err := setupMemory(container, cgroupRoot, pid); err != nil { + return err + } + if err := setupCpu(container, cgroupRoot, pid); err != nil { + return err + } + return nil +} - if !container.DeviceAccess { - dir, err := joinCgroup("devices", container, pid) +func setupDevices(container *libcontainer.Container, cgroupRoot string, pid int) (err error) { + if !container.Cgroups.DeviceAccess { + dir, err := container.Cgroups.Join(cgroupRoot, "devices", pid) if err != nil { return err } + defer func() { - if retErr != nil { + if err != nil { os.RemoveAll(dir) } }() @@ -113,65 +88,53 @@ func applyCgroupRaw(container *libcontainer.Container, pid int) (retErr error) { } } } + return nil +} - if container.Memory != 0 || container.MemorySwap != 0 { - dir, err := joinCgroup("memory", container, pid) +func setupMemory(container *libcontainer.Container, cgroupRoot string, pid int) (err error) { + if container.Cgroups.Memory != 0 || container.Cgroups.MemorySwap != 0 { + dir, err := container.Cgroups.Join(cgroupRoot, "memory", pid) if err != nil { return err } defer func() { - if retErr != nil { + if err != nil { os.RemoveAll(dir) } }() - if container.Memory != 0 { - if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil { + if container.Cgroups.Memory != 0 { + if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(container.Cgroups.Memory, 10)); err != nil { return err } - if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil { + if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(container.Cgroups.Memory, 10)); err != nil { return err } } - if 
container.MemorySwap != 0 { - if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(container.MemorySwap, 10)); err != nil { + if container.Cgroups.MemorySwap != 0 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(container.Cgroups.MemorySwap, 10)); err != nil { return err } } } + return nil +} +func setupCpu(container *libcontainer.Container, cgroupRoot string, pid int) (err error) { // We always want to join the cpu group, to allow fair cpu scheduling // on a container basis - dir, err := joinCgroup("cpu", container, pid) + dir, err := container.Cgroups.Join(cgroupRoot, "cpu", pid) if err != nil { return err } - if container.CpuShares != 0 { - if err := writeFile(dir, "cpu.shares", strconv.FormatInt(container.CpuShares, 10)); err != nil { + if container.Cgroups.CpuShares != 0 { + if err := writeFile(dir, "cpu.shares", strconv.FormatInt(container.Cgroups.CpuShares, 10)); err != nil { return err } } return nil } -func CleanupCgroup(container *libcontainer.Container) error { - path, _ := getCgroup("memory", container) - os.RemoveAll(path) - path, _ = getCgroup("devices", container) - os.RemoveAll(path) - path, _ = getCgroup("cpu", container) - os.RemoveAll(path) - return nil -} - -func ApplyCgroup(container *libcontainer.Container, pid int) error { - if container.CgroupName == "" { - return nil - } - - if useSystemd() { - return applyCgroupSystemd(container, pid) - } else { - return applyCgroupRaw(container, pid) - } +func writeFile(dir, file, data string) error { + return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) } diff --git a/libcontainer/container.go b/libcontainer/container.go index b34ac8b..4c0e39a 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -1,23 +1,21 @@ package libcontainer +import ( + "github.com/dotcloud/docker/pkg/cgroups" +) + // Container defines configuration options for how a // container is setup inside a directory and how a process should be executed 
type Container struct { - Hostname string `json:"hostname,omitempty"` // hostname - ReadonlyFs bool `json:"readonly_fs,omitempty"` // set the containers rootfs as readonly - User string `json:"user,omitempty"` // user to execute the process as - WorkingDir string `json:"working_dir,omitempty"` // current working directory - Env []string `json:"environment,omitempty"` // environment to set - Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply - Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop - Network *Network `json:"network,omitempty"` // nil for host's network stack - - CgroupName string `json:"cgroup_name,omitempty"` // name of cgroup - CgroupParent string `json:"cgroup_parent,omitempty"` // name of parent cgroup or slice - DeviceAccess bool `json:"device_access,omitempty"` // name of parent cgroup or slice - Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) - MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap - CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. 
other containers) + Hostname string `json:"hostname,omitempty"` // hostname + ReadonlyFs bool `json:"readonly_fs,omitempty"` // set the containers rootfs as readonly + User string `json:"user,omitempty"` // user to execute the process as + WorkingDir string `json:"working_dir,omitempty"` // current working directory + Env []string `json:"environment,omitempty"` // environment to set + Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply + Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop + Network *Network `json:"network,omitempty"` // nil for host's network stack + Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"` } // Network defines configuration for a container's networking stack diff --git a/libcontainer/container.json b/libcontainer/container.json index 3e23600..2207543 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -35,7 +35,9 @@ "bridge": "docker0", "mtu": 1500 }, - "cgroup_name": "docker-koye", - "cgroup_parent": "docker", - "memory": 524800 + "cgroups": { + "name": "docker-koye", + "parent": "docker", + "memory": 524800 + } } From c4769ec624becd45e3dcc56c559f15b0db3a7a4c Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 15:50:55 -0800 Subject: [PATCH 048/117] Change IP to address because it includes the subnet Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 7 ++++++- libcontainer/container.go | 2 +- libcontainer/container.json | 2 +- libcontainer/nsinit/init.go | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/libcontainer/README.md b/libcontainer/README.md index 07fe4f7..163161c 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -47,10 +47,15 @@ Sample `container.json` file: "MAC_ADMIN" ], "network": { - "ip": "172.17.0.100/16", + "address": "172.17.0.100/16", "gateway": "172.17.42.1", "bridge": "docker0", "mtu": 1500 + }, + "cgroups": { + "name": "docker-koye", + 
"parent": "docker", + "memory": 524800 } } ``` diff --git a/libcontainer/container.go b/libcontainer/container.go index 4c0e39a..e6e4b47 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -23,7 +23,7 @@ type Container struct { // The network configuration can be omited from a container causing the // container to be setup with the host's networking stack type Network struct { - IP string `json:"ip,omitempty"` + Address string `json:"address,omitempty"` Gateway string `json:"gateway,omitempty"` Bridge string `json:"bridge,omitempty"` Mtu int `json:"mtu,omitempty"` diff --git a/libcontainer/container.json b/libcontainer/container.json index 2207543..c1a07dc 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -30,7 +30,7 @@ "MAC_ADMIN" ], "network": { - "ip": "172.17.0.100/16", + "address": "172.17.0.100/16", "gateway": "172.17.42.1", "bridge": "docker0", "mtu": 1500 diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index f619276..f89e539 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -137,7 +137,7 @@ func setupVethNetwork(config *libcontainer.Network, tempVethName string) error { if err := network.ChangeInterfaceName(tempVethName, "eth0"); err != nil { return fmt.Errorf("change %s to eth0 %s", tempVethName, err) } - if err := network.SetInterfaceIp("eth0", config.IP); err != nil { + if err := network.SetInterfaceIp("eth0", config.Address); err != nil { return fmt.Errorf("set eth0 ip %s", err) } if err := network.SetMtu("eth0", config.Mtu); err != nil { From ccc915b7b9ac0fcc4b3bc70e3c348357f6321249 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 16:11:22 -0800 Subject: [PATCH 049/117] Move rest of cgroups functions into cgroups pkg Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- cgroups/cgroups.go | 122 +++++++++++++++++++++++++++++ libcontainer/cgroup/cgroup.go | 140 ---------------------------------- 
libcontainer/nsinit/exec.go | 9 ++- 3 files changed, 127 insertions(+), 144 deletions(-) delete mode 100644 libcontainer/cgroup/cgroup.go diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index 1e96caa..96002f0 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -124,3 +124,125 @@ func parseCgroupFile(subsystem string, r io.Reader) (string, error) { func writeFile(dir, file, data string) error { return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) } + +func (c *Cgroup) Apply(pid int) error { + // We have two implementation of cgroups support, one is based on + // systemd and the dbus api, and one is based on raw cgroup fs operations + // following the pre-single-writer model docs at: + // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ + // + // we can pick any subsystem to find the root + cgroupRoot, err := FindCgroupMountpoint("memory") + if err != nil { + return err + } + cgroupRoot = filepath.Dir(cgroupRoot) + + if _, err := os.Stat(cgroupRoot); err != nil { + return fmt.Errorf("cgroups fs not found") + } + if err := c.setupDevices(cgroupRoot, pid); err != nil { + return err + } + if err := c.setupMemory(cgroupRoot, pid); err != nil { + return err + } + if err := c.setupCpu(cgroupRoot, pid); err != nil { + return err + } + return nil +} + +func (c *Cgroup) setupDevices(cgroupRoot string, pid int) (err error) { + if !c.DeviceAccess { + dir, err := c.Join(cgroupRoot, "devices", pid) + if err != nil { + return err + } + + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if err := writeFile(dir, "devices.deny", "a"); err != nil { + return err + } + + allow := []string{ + // /dev/null, zero, full + "c 1:3 rwm", + "c 1:5 rwm", + "c 1:7 rwm", + + // consoles + "c 5:1 rwm", + "c 5:0 rwm", + "c 4:0 rwm", + "c 4:1 rwm", + + // /dev/urandom,/dev/random + "c 1:9 rwm", + "c 1:8 rwm", + + // /dev/pts/ - pts namespaces are "coming soon" + "c 136:* rwm", + "c 5:2 rwm", + + // tuntap + "c 10:200 rwm", + } + + for 
_, val := range allow { + if err := writeFile(dir, "devices.allow", val); err != nil { + return err + } + } + } + return nil +} + +func (c *Cgroup) setupMemory(cgroupRoot string, pid int) (err error) { + if c.Memory != 0 || c.MemorySwap != 0 { + dir, err := c.Join(cgroupRoot, "memory", pid) + if err != nil { + return err + } + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if c.Memory != 0 { + if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { + return err + } + if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { + return err + } + } + if c.MemorySwap != 0 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.MemorySwap, 10)); err != nil { + return err + } + } + } + return nil +} + +func (c *Cgroup) setupCpu(cgroupRoot string, pid int) (err error) { + // We always want to join the cpu group, to allow fair cpu scheduling + // on a container basis + dir, err := c.Join(cgroupRoot, "cpu", pid) + if err != nil { + return err + } + if c.CpuShares != 0 { + if err := writeFile(dir, "cpu.shares", strconv.FormatInt(c.CpuShares, 10)); err != nil { + return err + } + } + return nil +} diff --git a/libcontainer/cgroup/cgroup.go b/libcontainer/cgroup/cgroup.go deleted file mode 100644 index 5f27ac3..0000000 --- a/libcontainer/cgroup/cgroup.go +++ /dev/null @@ -1,140 +0,0 @@ -package cgroup - -import ( - "fmt" - "github.com/dotcloud/docker/pkg/cgroups" - "github.com/dotcloud/docker/pkg/libcontainer" - "io/ioutil" - "os" - "path/filepath" - "strconv" -) - -func ApplyCgroup(container *libcontainer.Container, pid int) (err error) { - if container.Cgroups == nil { - return nil - } - - // We have two implementation of cgroups support, one is based on - // systemd and the dbus api, and one is based on raw cgroup fs operations - // following the pre-single-writer model docs at: - // 
http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ - // - // we can pick any subsystem to find the root - cgroupRoot, err := cgroups.FindCgroupMountpoint("memory") - if err != nil { - return err - } - cgroupRoot = filepath.Dir(cgroupRoot) - if _, err := os.Stat(cgroupRoot); err != nil { - return fmt.Errorf("cgroups fs not found") - } - if err := setupDevices(container, cgroupRoot, pid); err != nil { - return err - } - if err := setupMemory(container, cgroupRoot, pid); err != nil { - return err - } - if err := setupCpu(container, cgroupRoot, pid); err != nil { - return err - } - return nil -} - -func setupDevices(container *libcontainer.Container, cgroupRoot string, pid int) (err error) { - if !container.Cgroups.DeviceAccess { - dir, err := container.Cgroups.Join(cgroupRoot, "devices", pid) - if err != nil { - return err - } - - defer func() { - if err != nil { - os.RemoveAll(dir) - } - }() - - if err := writeFile(dir, "devices.deny", "a"); err != nil { - return err - } - - allow := []string{ - // /dev/null, zero, full - "c 1:3 rwm", - "c 1:5 rwm", - "c 1:7 rwm", - - // consoles - "c 5:1 rwm", - "c 5:0 rwm", - "c 4:0 rwm", - "c 4:1 rwm", - - // /dev/urandom,/dev/random - "c 1:9 rwm", - "c 1:8 rwm", - - // /dev/pts/ - pts namespaces are "coming soon" - "c 136:* rwm", - "c 5:2 rwm", - - // tuntap - "c 10:200 rwm", - } - - for _, val := range allow { - if err := writeFile(dir, "devices.allow", val); err != nil { - return err - } - } - } - return nil -} - -func setupMemory(container *libcontainer.Container, cgroupRoot string, pid int) (err error) { - if container.Cgroups.Memory != 0 || container.Cgroups.MemorySwap != 0 { - dir, err := container.Cgroups.Join(cgroupRoot, "memory", pid) - if err != nil { - return err - } - defer func() { - if err != nil { - os.RemoveAll(dir) - } - }() - - if container.Cgroups.Memory != 0 { - if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(container.Cgroups.Memory, 10)); err != nil { - return err - } - if 
err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(container.Cgroups.Memory, 10)); err != nil { - return err - } - } - if container.Cgroups.MemorySwap != 0 { - if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(container.Cgroups.MemorySwap, 10)); err != nil { - return err - } - } - } - return nil -} - -func setupCpu(container *libcontainer.Container, cgroupRoot string, pid int) (err error) { - // We always want to join the cpu group, to allow fair cpu scheduling - // on a container basis - dir, err := container.Cgroups.Join(cgroupRoot, "cpu", pid) - if err != nil { - return err - } - if container.Cgroups.CpuShares != 0 { - if err := writeFile(dir, "cpu.shares", strconv.FormatInt(container.Cgroups.CpuShares, 10)); err != nil { - return err - } - } - return nil -} - -func writeFile(dir, file, data string) error { - return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) -} diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index f73ad32..f04e9be 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -5,7 +5,6 @@ package main import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/libcontainer/cgroup" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/libcontainer/utils" "github.com/dotcloud/docker/pkg/system" @@ -41,9 +40,11 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) // Do this before syncing with child so that no children // can escape the cgroup - if err := cgroup.ApplyCgroup(container, command.Process.Pid); err != nil { - command.Process.Kill() - return -1, err + if container.Cgroups != nil { + if err := container.Cgroups.Apply(command.Process.Pid); err != nil { + command.Process.Kill() + return -1, err + } } if container.Network != nil { From 3677967f4e48fb39c6e915891188a85a374c6710 Mon Sep 17 00:00:00 2001 From: "Guillaume J. 
Charmes" Date: Thu, 20 Feb 2014 17:53:50 -0800 Subject: [PATCH 050/117] Use flag for init Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/nsinit/exec.go | 2 +- libcontainer/nsinit/main.go | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index f04e9be..6d87f3b 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -164,7 +164,7 @@ func deletePidFile() error { // defined on the container's configuration and use the current binary as the init with the // args provided func createCommand(container *libcontainer.Container, console string, args []string) *exec.Cmd { - command := exec.Command("nsinit", append([]string{"init", console}, args...)...) + command := exec.Command("nsinit", append([]string{"-console", console, "init"}, args...)...) command.SysProcAttr = &syscall.SysProcAttr{ Cloneflags: uintptr(getNamespaceFlags(container.Namespaces)), } diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go index f45fe55..e7240df 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/main.go @@ -3,6 +3,7 @@ package main import ( "encoding/json" "errors" + "flag" "github.com/dotcloud/docker/pkg/libcontainer" "io/ioutil" "log" @@ -16,16 +17,18 @@ var ( ) func main() { + console := flag.String("console", "", "Console (pty slave) name") + flag.Parse() + container, err := loadContainer() if err != nil { log.Fatal(err) } - argc := len(os.Args) - if argc < 2 { + if flag.NArg() < 1 { log.Fatal(ErrWrongArguments) } - switch os.Args[1] { + switch flag.Arg(0) { case "exec": // this is executed outside of the namespace in the cwd var exitCode int nspid, err := readPid() @@ -35,23 +38,23 @@ func main() { } } if nspid > 0 { - exitCode, err = execinCommand(container, nspid, os.Args[2:]) + exitCode, err = execinCommand(container, nspid, flag.Args()[1:]) } else { - exitCode, err = execCommand(container, 
os.Args[2:]) + exitCode, err = execCommand(container, flag.Args()[1:]) } if err != nil { log.Fatal(err) } os.Exit(exitCode) case "init": // this is executed inside of the namespace to setup the container - if argc < 3 { + if flag.NArg() < 2 { log.Fatal(ErrWrongArguments) } - if err := initCommand(container, os.Args[2], os.Args[3:]); err != nil { + if err := initCommand(container, *console, flag.Args()[1:]); err != nil { log.Fatal(err) } default: - log.Fatalf("command not supported for nsinit %s", os.Args[1]) + log.Fatalf("command not supported for nsinit %s", flag.Arg(0)) } } From b2e01cbe8cd0ce3b541fd0cd6250e21995cd4236 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Thu, 20 Feb 2014 17:58:13 -0800 Subject: [PATCH 051/117] Use a custom pipe instead of stdin for sync net namespace Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/nsinit/exec.go | 16 ++++++++++------ libcontainer/nsinit/init.go | 11 +++++++---- libcontainer/nsinit/main.go | 3 ++- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 6d87f3b..8007ed4 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -22,16 +22,20 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) return -1, err } - command := createCommand(container, console, args) // create a pipe so that we can syncronize with the namespaced process and // pass the veth name to the child - inPipe, err := command.StdinPipe() + r, w, err := os.Pipe() if err != nil { return -1, err } + system.UsetCloseOnExec(r.Fd()) + + command := createCommand(container, console, r.Fd(), args) + if err := command.Start(); err != nil { return -1, err } + if err := writePidFile(command); err != nil { command.Process.Kill() return -1, err @@ -52,11 +56,11 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) if err != nil { return -1, err } - 
sendVethName(vethPair, inPipe) + sendVethName(vethPair, w) } // Sync with child - inPipe.Close() + w.Close() go io.Copy(os.Stdout, master) go io.Copy(master, os.Stdin) @@ -163,8 +167,8 @@ func deletePidFile() error { // createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces // defined on the container's configuration and use the current binary as the init with the // args provided -func createCommand(container *libcontainer.Container, console string, args []string) *exec.Cmd { - command := exec.Command("nsinit", append([]string{"-console", console, "init"}, args...)...) +func createCommand(container *libcontainer.Container, console string, pipe uintptr, args []string) *exec.Cmd { + command := exec.Command("nsinit", append([]string{"-console", console, "-pipe", fmt.Sprint(pipe), "init"}, args...)...) command.SysProcAttr = &syscall.SysProcAttr{ Cloneflags: uintptr(getNamespaceFlags(container.Namespaces)), } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index f89e539..a0815ee 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -8,20 +8,21 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" + "io" "io/ioutil" "os" "path/filepath" "syscall" ) -func initCommand(container *libcontainer.Container, console string, args []string) error { +func initCommand(container *libcontainer.Container, console string, pipe io.ReadCloser, args []string) error { rootfs, err := resolveRootfs() if err != nil { return err } // We always read this as it is a way to sync with the parent as well - tempVethName, err := getVethName() + tempVethName, err := getVethName(pipe) if err != nil { return err } @@ -164,8 +165,10 @@ func setupVethNetwork(config *libcontainer.Network, tempVethName string) error { // getVethName reads from Stdin the temp veth name // sent by the parent processes after the veth pair 
// has been created and setup -func getVethName() (string, error) { - data, err := ioutil.ReadAll(os.Stdin) +func getVethName(pipe io.ReadCloser) (string, error) { + defer pipe.Close() + + data, err := ioutil.ReadAll(pipe) if err != nil { return "", fmt.Errorf("error reading from stdin %s", err) } diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go index e7240df..6f2825b 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/main.go @@ -18,6 +18,7 @@ var ( func main() { console := flag.String("console", "", "Console (pty slave) name") + pipeFd := flag.Int("pipe", 0, "sync pipe fd") flag.Parse() container, err := loadContainer() @@ -50,7 +51,7 @@ func main() { if flag.NArg() < 2 { log.Fatal(ErrWrongArguments) } - if err := initCommand(container, *console, flag.Args()[1:]); err != nil { + if err := initCommand(container, *console, os.NewFile(uintptr(*pipeFd), "pipe"), flag.Args()[1:]); err != nil { log.Fatal(err) } default: From 41696722fa70acd53db889d13a0a05b2413d36a3 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Thu, 20 Feb 2014 17:59:08 -0800 Subject: [PATCH 052/117] Minor cleanup Docker-DCO-1.1-Signed-off-by: Guillaume J. 
Charmes (github: creack) --- libcontainer/nsinit/exec.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 8007ed4..7f552c2 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -56,11 +56,12 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) if err != nil { return -1, err } - sendVethName(vethPair, w) + sendVethName(w, vethPair) } // Sync with child w.Close() + r.Close() go io.Copy(os.Stdout, master) go io.Copy(master, os.Stdin) @@ -82,7 +83,7 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) // sendVethName writes the veth pair name to the child's stdin then closes the // pipe so that the child stops waiting for more data -func sendVethName(name string, pipe io.WriteCloser) { +func sendVethName(pipe io.Writer, name string) { fmt.Fprint(pipe, name) } From 97738ffed35855bfe5ad91767a8e6a7c222937a3 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Thu, 20 Feb 2014 18:05:40 -0800 Subject: [PATCH 053/117] Handle non-tty mode Docker-DCO-1.1-Signed-off-by: Guillaume J. 
Charmes (github: creack) --- libcontainer/nsinit/exec.go | 55 ++++++++++++++++++++++++++++-------- libcontainer/nsinit/init.go | 30 +++++++++++--------- libcontainer/nsinit/main.go | 9 ++++-- libcontainer/nsinit/mount.go | 6 ++-- 4 files changed, 70 insertions(+), 30 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 7f552c2..b290ace 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -16,10 +16,21 @@ import ( "syscall" ) -func execCommand(container *libcontainer.Container, args []string) (int, error) { - master, console, err := createMasterAndConsole() - if err != nil { - return -1, err +func execCommand(container *libcontainer.Container, tty bool, args []string) (int, error) { + var ( + master *os.File + console string + err error + + inPipe io.WriteCloser + outPipe, errPipe io.ReadCloser + ) + + if tty { + master, console, err = createMasterAndConsole() + if err != nil { + return -1, err + } } // create a pipe so that we can syncronize with the namespaced process and @@ -32,6 +43,21 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) command := createCommand(container, console, r.Fd(), args) + if !tty { + inPipe, err = command.StdinPipe() + if err != nil { + return -1, err + } + outPipe, err = command.StdoutPipe() + if err != nil { + return -1, err + } + errPipe, err = command.StderrPipe() + if err != nil { + return -1, err + } + } + if err := command.Start(); err != nil { return -1, err } @@ -63,15 +89,20 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) w.Close() r.Close() - go io.Copy(os.Stdout, master) - go io.Copy(master, os.Stdin) - - state, err := setupWindow(master) - if err != nil { - command.Process.Kill() - return -1, err + if tty { + go io.Copy(os.Stdout, master) + go io.Copy(master, os.Stdin) + state, err := setupWindow(master) + if err != nil { + command.Process.Kill() + return -1, err + } + defer 
term.RestoreTerminal(os.Stdin.Fd(), state) + } else { + go io.Copy(inPipe, os.Stdin) + go io.Copy(os.Stdout, outPipe) + go io.Copy(os.Stderr, errPipe) } - defer term.RestoreTerminal(os.Stdin.Fd(), state) if err := command.Wait(); err != nil { if _, ok := err.(*exec.ExitError); !ok { diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index a0815ee..ef7fc4e 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -27,23 +27,27 @@ func initCommand(container *libcontainer.Container, console string, pipe io.Read return err } - // close pipes so that we can replace it with the pty - os.Stdin.Close() - os.Stdout.Close() - os.Stderr.Close() + if console != "" { + // close pipes so that we can replace it with the pty + os.Stdin.Close() + os.Stdout.Close() + os.Stderr.Close() + slave, err := openTerminal(console, syscall.O_RDWR) + if err != nil { + return fmt.Errorf("open terminal %s", err) + } + if err := dupSlave(slave); err != nil { + return fmt.Errorf("dup2 slave %s", err) + } + } - slave, err := openTerminal(console, syscall.O_RDWR) - if err != nil { - return fmt.Errorf("open terminal %s", err) - } - if err := dupSlave(slave); err != nil { - return fmt.Errorf("dup2 slave %s", err) - } if _, err := system.Setsid(); err != nil { return fmt.Errorf("setsid %s", err) } - if err := system.Setctty(); err != nil { - return fmt.Errorf("setctty %s", err) + if console != "" { + if err := system.Setctty(); err != nil { + return fmt.Errorf("setctty %s", err) + } } if err := system.ParentDeathSignal(); err != nil { return fmt.Errorf("parent deth signal %s", err) diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/main.go index 6f2825b..f66ff0d 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/main.go @@ -17,8 +17,11 @@ var ( ) func main() { - console := flag.String("console", "", "Console (pty slave) name") - pipeFd := flag.Int("pipe", 0, "sync pipe fd") + var ( + console = flag.String("console", "", "Console (pty 
slave) name") + tty = flag.Bool("tty", false, "Create a tty") + pipeFd = flag.Int("pipe", 0, "sync pipe fd") + ) flag.Parse() container, err := loadContainer() @@ -41,7 +44,7 @@ func main() { if nspid > 0 { exitCode, err = execinCommand(container, nspid, flag.Args()[1:]) } else { - exitCode, err = execCommand(container, flag.Args()[1:]) + exitCode, err = execCommand(container, *tty, flag.Args()[1:]) } if err != nil { log.Fatal(err) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 6eb2e09..9cf69f4 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -40,8 +40,10 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := setupDev(rootfs); err != nil { return err } - if err := setupPtmx(rootfs, console); err != nil { - return err + if console != "" { + if err := setupPtmx(rootfs, console); err != nil { + return err + } } if err := system.Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) From 52fa4de610a7e77da6b8afecebd09f759faa367d Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Thu, 20 Feb 2014 18:10:30 -0800 Subject: [PATCH 054/117] Make sure to close the pipe upon ctrl-d Docker-DCO-1.1-Signed-off-by: Guillaume J. 
Charmes (github: creack) --- libcontainer/nsinit/exec.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index b290ace..44d9aff 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -99,7 +99,10 @@ func execCommand(container *libcontainer.Container, tty bool, args []string) (in } defer term.RestoreTerminal(os.Stdin.Fd(), state) } else { - go io.Copy(inPipe, os.Stdin) + go func() { + defer inPipe.Close() + io.Copy(inPipe, os.Stdin) + }() go io.Copy(os.Stdout, outPipe) go io.Copy(os.Stderr, errPipe) } @@ -109,6 +112,7 @@ func execCommand(container *libcontainer.Container, tty bool, args []string) (in return -1, err } } + return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } From 5d71533d4eb75a96c9d163e6a2ec8181a91395fe Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 18:27:42 -0800 Subject: [PATCH 055/117] Make nsinit a proper go pkg and add the main in another dir Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 4 +++- libcontainer/nsinit/exec.go | 6 ++++-- libcontainer/nsinit/execin.go | 5 +++-- libcontainer/nsinit/init.go | 6 ++++-- libcontainer/nsinit/mount.go | 2 +- libcontainer/nsinit/ns_linux.go | 2 +- libcontainer/nsinit/{ => nsinit}/main.go | 7 ++++--- 7 files changed, 20 insertions(+), 12 deletions(-) rename libcontainer/nsinit/{ => nsinit}/main.go (87%) diff --git a/libcontainer/README.md b/libcontainer/README.md index 163161c..3a2a843 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -72,9 +72,11 @@ rootfs and copy a `container.json` file into the directory with your specified c To execution `/bin/bash` in the current directory as a container just run: ```bash -nsinit exec /bin/bash +nsinit -tty exec /bin/bash ``` +If you want a proper tty setup inside the new container you must use the `-tty` flag when running nsinit. 
+ If you wish to spawn another process inside the container while your current bash session is running just run the exact same command again to get another bash shell or change the command. If the original process dies, PID 1, all other processes spawned inside the container will also be killed and the namespace will be removed. diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 44d9aff..9d0f7ff 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -1,6 +1,6 @@ // +build linux -package main +package nsinit import ( "fmt" @@ -16,7 +16,9 @@ import ( "syscall" ) -func execCommand(container *libcontainer.Container, tty bool, args []string) (int, error) { +// Exec performes setup outside of a namespace so that a container can be +// executed. Exec is a high level function for working with container namespaces. +func Exec(container *libcontainer.Container, tty bool, args []string) (int, error) { var ( master *os.File console string diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index d6224f9..85a8990 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -1,4 +1,4 @@ -package main +package nsinit import ( "fmt" @@ -11,7 +11,8 @@ import ( "syscall" ) -func execinCommand(container *libcontainer.Container, nspid int, args []string) (int, error) { +// ExecIn uses an existing pid and joins the pid's namespaces with the new command. 
+func ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) { for _, ns := range container.Namespaces { if err := system.Unshare(namespaceMap[ns]); err != nil { return -1, err diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index ef7fc4e..f80d785 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -1,6 +1,6 @@ // +build linux -package main +package nsinit import ( "fmt" @@ -15,7 +15,9 @@ import ( "syscall" ) -func initCommand(container *libcontainer.Container, console string, pipe io.ReadCloser, args []string) error { +// Init is the init process that first runs inside a new namespace to setup mounts, users, networking, +// and other options required for the new container. +func Init(container *libcontainer.Container, console string, pipe io.ReadCloser, args []string) error { rootfs, err := resolveRootfs() if err != nil { return err diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 9cf69f4..a73e97e 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -1,6 +1,6 @@ // +build linux -package main +package nsinit import ( "fmt" diff --git a/libcontainer/nsinit/ns_linux.go b/libcontainer/nsinit/ns_linux.go index 481bdf7..e42d4b8 100644 --- a/libcontainer/nsinit/ns_linux.go +++ b/libcontainer/nsinit/ns_linux.go @@ -1,4 +1,4 @@ -package main +package nsinit import ( "github.com/dotcloud/docker/pkg/libcontainer" diff --git a/libcontainer/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go similarity index 87% rename from libcontainer/nsinit/main.go rename to libcontainer/nsinit/nsinit/main.go index f66ff0d..9d3c201 100644 --- a/libcontainer/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -5,6 +5,7 @@ import ( "errors" "flag" "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/nsinit" "io/ioutil" "log" "os" @@ -42,9 +43,9 @@ func main() { } } if nspid > 0 { - exitCode, err = 
execinCommand(container, nspid, flag.Args()[1:]) + exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) } else { - exitCode, err = execCommand(container, *tty, flag.Args()[1:]) + exitCode, err = nsinit.Exec(container, *tty, flag.Args()[1:]) } if err != nil { log.Fatal(err) @@ -54,7 +55,7 @@ func main() { if flag.NArg() < 2 { log.Fatal(ErrWrongArguments) } - if err := initCommand(container, *console, os.NewFile(uintptr(*pipeFd), "pipe"), flag.Args()[1:]); err != nil { + if err := nsinit.Init(container, *console, os.NewFile(uintptr(*pipeFd), "pipe"), flag.Args()[1:]); err != nil { log.Fatal(err) } default: From 6054bda2b8a5c51d2663b3c7a563ad024ac12e80 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 20 Feb 2014 18:38:28 -0800 Subject: [PATCH 056/117] Refactor the flag management for main Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 13 ++++--------- libcontainer/nsinit/init.go | 1 - libcontainer/nsinit/nsinit/main.go | 27 +++++++++++++++++---------- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 9d0f7ff..d2b87b6 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -44,18 +44,14 @@ func Exec(container *libcontainer.Container, tty bool, args []string) (int, erro system.UsetCloseOnExec(r.Fd()) command := createCommand(container, console, r.Fd(), args) - if !tty { - inPipe, err = command.StdinPipe() - if err != nil { + if inPipe, err = command.StdinPipe(); err != nil { return -1, err } - outPipe, err = command.StdoutPipe() - if err != nil { + if outPipe, err = command.StdoutPipe(); err != nil { return -1, err } - errPipe, err = command.StderrPipe() - if err != nil { + if errPipe, err = command.StderrPipe(); err != nil { return -1, err } } @@ -63,7 +59,6 @@ func Exec(container *libcontainer.Container, tty bool, args []string) (int, erro if err := command.Start(); err != nil { return -1, err } 
- if err := writePidFile(command); err != nil { command.Process.Kill() return -1, err @@ -94,6 +89,7 @@ func Exec(container *libcontainer.Container, tty bool, args []string) (int, erro if tty { go io.Copy(os.Stdout, master) go io.Copy(master, os.Stdin) + state, err := setupWindow(master) if err != nil { command.Process.Kill() @@ -114,7 +110,6 @@ func Exec(container *libcontainer.Container, tty bool, args []string) (int, erro return -1, err } } - return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index f80d785..88a5c3c 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -173,7 +173,6 @@ func setupVethNetwork(config *libcontainer.Network, tempVethName string) error { // has been created and setup func getVethName(pipe io.ReadCloser) (string, error) { defer pipe.Close() - data, err := ioutil.ReadAll(pipe) if err != nil { return "", fmt.Errorf("error reading from stdin %s", err) diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index 9d3c201..33a7747 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -12,27 +12,34 @@ import ( "strconv" ) +var ( + console string + tty bool + pipeFd int +) + var ( ErrUnsupported = errors.New("Unsupported method") ErrWrongArguments = errors.New("Wrong argument count") ) -func main() { - var ( - console = flag.String("console", "", "Console (pty slave) name") - tty = flag.Bool("tty", false, "Create a tty") - pipeFd = flag.Int("pipe", 0, "sync pipe fd") - ) - flag.Parse() +func init() { + flag.StringVar(&console, "console", "", "console (pty slave) path") + flag.BoolVar(&tty, "tty", false, "create a tty") + flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") + flag.Parse() +} + +func main() { container, err := loadContainer() if err != nil { log.Fatal(err) } - if flag.NArg() < 1 { log.Fatal(ErrWrongArguments) } + switch flag.Arg(0) { case "exec": // 
this is executed outside of the namespace in the cwd var exitCode int @@ -45,7 +52,7 @@ func main() { if nspid > 0 { exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) } else { - exitCode, err = nsinit.Exec(container, *tty, flag.Args()[1:]) + exitCode, err = nsinit.Exec(container, tty, flag.Args()[1:]) } if err != nil { log.Fatal(err) @@ -55,7 +62,7 @@ func main() { if flag.NArg() < 2 { log.Fatal(ErrWrongArguments) } - if err := nsinit.Init(container, *console, os.NewFile(uintptr(*pipeFd), "pipe"), flag.Args()[1:]); err != nil { + if err := nsinit.Init(container, console, os.NewFile(uintptr(pipeFd), "pipe"), flag.Args()[1:]); err != nil { log.Fatal(err) } default: From d67915851dc941c4dc59ea6375ec21a327bf48f4 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 13:53:11 -0800 Subject: [PATCH 057/117] Move tty into container.json Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 7 +++---- libcontainer/container.go | 1 + libcontainer/container.json | 3 ++- libcontainer/nsinit/exec.go | 8 ++++---- libcontainer/nsinit/nsinit/main.go | 4 +--- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/libcontainer/README.md b/libcontainer/README.md index 3a2a843..89a4ec0 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -17,6 +17,7 @@ Sample `container.json` file: ```json { "hostname": "koye", + "tty": true, "environment": [ "HOME=/", "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", @@ -55,7 +56,7 @@ Sample `container.json` file: "cgroups": { "name": "docker-koye", "parent": "docker", - "memory": 524800 + "memory": 5248000 } } ``` @@ -72,11 +73,9 @@ rootfs and copy a `container.json` file into the directory with your specified c To execution `/bin/bash` in the current directory as a container just run: ```bash -nsinit -tty exec /bin/bash +nsinit exec /bin/bash ``` -If you want a proper tty setup inside the new container you must use the `-tty` flag when running nsinit. 
- If you wish to spawn another process inside the container while your current bash session is running just run the exact same command again to get another bash shell or change the command. If the original process dies, PID 1, all other processes spawned inside the container will also be killed and the namespace will be removed. diff --git a/libcontainer/container.go b/libcontainer/container.go index e6e4b47..3c1b62b 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -12,6 +12,7 @@ type Container struct { User string `json:"user,omitempty"` // user to execute the process as WorkingDir string `json:"working_dir,omitempty"` // current working directory Env []string `json:"environment,omitempty"` // environment to set + Tty bool `json:"tty,omitempty"` // setup a proper tty or not Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop Network *Network `json:"network,omitempty"` // nil for host's network stack diff --git a/libcontainer/container.json b/libcontainer/container.json index c1a07dc..07e52df 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -1,5 +1,6 @@ { "hostname": "koye", + "tty": true, "environment": [ "HOME=/", "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", @@ -38,6 +39,6 @@ "cgroups": { "name": "docker-koye", "parent": "docker", - "memory": 524800 + "memory": 5248000 } } diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index d2b87b6..e2adf3d 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -18,7 +18,7 @@ import ( // Exec performes setup outside of a namespace so that a container can be // executed. Exec is a high level function for working with container namespaces. 
-func Exec(container *libcontainer.Container, tty bool, args []string) (int, error) { +func Exec(container *libcontainer.Container, args []string) (int, error) { var ( master *os.File console string @@ -28,7 +28,7 @@ func Exec(container *libcontainer.Container, tty bool, args []string) (int, erro outPipe, errPipe io.ReadCloser ) - if tty { + if container.Tty { master, console, err = createMasterAndConsole() if err != nil { return -1, err @@ -44,7 +44,7 @@ func Exec(container *libcontainer.Container, tty bool, args []string) (int, erro system.UsetCloseOnExec(r.Fd()) command := createCommand(container, console, r.Fd(), args) - if !tty { + if !container.Tty { if inPipe, err = command.StdinPipe(); err != nil { return -1, err } @@ -86,7 +86,7 @@ func Exec(container *libcontainer.Container, tty bool, args []string) (int, erro w.Close() r.Close() - if tty { + if container.Tty { go io.Copy(os.Stdout, master) go io.Copy(master, os.Stdin) diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index 33a7747..6508a3e 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -14,7 +14,6 @@ import ( var ( console string - tty bool pipeFd int ) @@ -25,7 +24,6 @@ var ( func init() { flag.StringVar(&console, "console", "", "console (pty slave) path") - flag.BoolVar(&tty, "tty", false, "create a tty") flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") flag.Parse() @@ -52,7 +50,7 @@ func main() { if nspid > 0 { exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) } else { - exitCode, err = nsinit.Exec(container, tty, flag.Args()[1:]) + exitCode, err = nsinit.Exec(container, flag.Args()[1:]) } if err != nil { log.Fatal(err) From d40fbbb69be7ee4e8aef1b2730a911f3fa93e36c Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 14:49:55 -0800 Subject: [PATCH 058/117] Add good logging support to both sides Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go 
| 35 ++++++++++++++++++++++----- libcontainer/nsinit/init.go | 23 ++++++++++-------- libcontainer/nsinit/nsinit/main.go | 39 ++++++++++++++++++++++++++---- 3 files changed, 76 insertions(+), 21 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index e2adf3d..24e722a 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -11,6 +11,7 @@ import ( "github.com/dotcloud/docker/pkg/term" "io" "io/ioutil" + "log" "os" "os/exec" "syscall" @@ -18,7 +19,7 @@ import ( // Exec performes setup outside of a namespace so that a container can be // executed. Exec is a high level function for working with container namespaces. -func Exec(container *libcontainer.Container, args []string) (int, error) { +func Exec(container *libcontainer.Container, logFile string, args []string) (int, error) { var ( master *os.File console string @@ -29,6 +30,7 @@ func Exec(container *libcontainer.Container, args []string) (int, error) { ) if container.Tty { + log.Printf("setting up master and console") master, console, err = createMasterAndConsole() if err != nil { return -1, err @@ -43,8 +45,9 @@ func Exec(container *libcontainer.Container, args []string) (int, error) { } system.UsetCloseOnExec(r.Fd()) - command := createCommand(container, console, r.Fd(), args) + command := createCommand(container, console, logFile, r.Fd(), args) if !container.Tty { + log.Printf("opening pipes on command") if inPipe, err = command.StdinPipe(); err != nil { return -1, err } @@ -56,9 +59,11 @@ func Exec(container *libcontainer.Container, args []string) (int, error) { } } + log.Printf("staring init") if err := command.Start(); err != nil { return -1, err } + log.Printf("writting state file") if err := writePidFile(command); err != nil { command.Process.Kill() return -1, err @@ -68,6 +73,7 @@ func Exec(container *libcontainer.Container, args []string) (int, error) { // Do this before syncing with child so that no children // can escape the cgroup if 
container.Cgroups != nil { + log.Printf("setting up cgroups") if err := container.Cgroups.Apply(command.Process.Pid); err != nil { command.Process.Kill() return -1, err @@ -75,18 +81,22 @@ func Exec(container *libcontainer.Container, args []string) (int, error) { } if container.Network != nil { - vethPair, err := initializeContainerVeth(container.Network.Bridge, command.Process.Pid) + log.Printf("creating veth pair") + vethPair, err := initializeContainerVeth(container.Network.Bridge, container.Network.Mtu, command.Process.Pid) if err != nil { return -1, err } + log.Printf("sending %s as veth pair name", vethPair) sendVethName(w, vethPair) } // Sync with child + log.Printf("closing sync pipes") w.Close() r.Close() if container.Tty { + log.Printf("starting copy for tty") go io.Copy(os.Stdout, master) go io.Copy(master, os.Stdin) @@ -97,6 +107,7 @@ func Exec(container *libcontainer.Container, args []string) (int, error) { } defer term.RestoreTerminal(os.Stdin.Fd(), state) } else { + log.Printf("starting copy for std pipes") go func() { defer inPipe.Close() io.Copy(inPipe, os.Stdin) @@ -105,11 +116,13 @@ func Exec(container *libcontainer.Container, args []string) (int, error) { go io.Copy(os.Stderr, errPipe) } + log.Printf("waiting on process") if err := command.Wait(); err != nil { if _, ok := err.(*exec.ExitError); !ok { return -1, err } } + log.Printf("process ended") return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } @@ -126,17 +139,22 @@ func sendVethName(pipe io.Writer, name string) { // Then will with set the other side of the veth pair into the container's namespaced // using the pid and returns the veth's interface name to provide to the container to // finish setting up the interface inside the namespace -func initializeContainerVeth(bridge string, nspid int) (string, error) { +func initializeContainerVeth(bridge string, mtu, nspid int) (string, error) { name1, name2, err := createVethPair() if err != nil { return "", err } + 
log.Printf("veth pair created %s <> %s", name1, name2) if err := network.SetInterfaceMaster(name1, bridge); err != nil { return "", err } + if err := network.SetMtu(name1, mtu); err != nil { + return "", err + } if err := network.InterfaceUp(name1); err != nil { return "", err } + log.Printf("setting %s inside %d namespace", name2, nspid) if err := network.SetInterfaceInNamespacePid(name2, nspid); err != nil { return "", err } @@ -200,8 +218,13 @@ func deletePidFile() error { // createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces // defined on the container's configuration and use the current binary as the init with the // args provided -func createCommand(container *libcontainer.Container, console string, pipe uintptr, args []string) *exec.Cmd { - command := exec.Command("nsinit", append([]string{"-console", console, "-pipe", fmt.Sprint(pipe), "init"}, args...)...) +func createCommand(container *libcontainer.Container, console, logFile string, pipe uintptr, args []string) *exec.Cmd { + command := exec.Command("nsinit", append([]string{ + "-console", console, + "-pipe", fmt.Sprint(pipe), + "-log", logFile, + "init"}, args...)...) + command.SysProcAttr = &syscall.SysProcAttr{ Cloneflags: uintptr(getNamespaceFlags(container.Namespaces)), } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 88a5c3c..8fc5f3d 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -10,6 +10,7 @@ import ( "github.com/dotcloud/docker/pkg/system" "io" "io/ioutil" + "log" "os" "path/filepath" "syscall" @@ -17,19 +18,23 @@ import ( // Init is the init process that first runs inside a new namespace to setup mounts, users, networking, // and other options required for the new container. 
-func Init(container *libcontainer.Container, console string, pipe io.ReadCloser, args []string) error { - rootfs, err := resolveRootfs() +func Init(container *libcontainer.Container, uncleanRootfs, console string, pipe io.ReadCloser, args []string) error { + rootfs, err := resolveRootfs(uncleanRootfs) if err != nil { return err } + log.Printf("initializing namespace at %s", rootfs) // We always read this as it is a way to sync with the parent as well tempVethName, err := getVethName(pipe) if err != nil { return err } - + if tempVethName != "" { + log.Printf("received veth name %s", tempVethName) + } if console != "" { + log.Printf("setting up console for %s", console) // close pipes so that we can replace it with the pty os.Stdin.Close() os.Stdout.Close() @@ -42,7 +47,6 @@ func Init(container *libcontainer.Container, console string, pipe io.ReadCloser, return fmt.Errorf("dup2 slave %s", err) } } - if _, err := system.Setsid(); err != nil { return fmt.Errorf("setsid %s", err) } @@ -63,9 +67,11 @@ func Init(container *libcontainer.Container, console string, pipe io.ReadCloser, if err := system.Sethostname(container.Hostname); err != nil { return fmt.Errorf("sethostname %s", err) } + log.Printf("dropping capabilities") if err := capabilities.DropCapabilities(container); err != nil { return fmt.Errorf("drop capabilities %s", err) } + log.Printf("setting user in namespace") if err := setupUser(container); err != nil { return fmt.Errorf("setup user %s", err) } @@ -74,6 +80,7 @@ func Init(container *libcontainer.Container, console string, pipe io.ReadCloser, return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) } } + log.Printf("execing %s goodbye", args[0]) if err := system.Exec(args[0], args[0:], container.Env); err != nil { return fmt.Errorf("exec %s", err) } @@ -82,12 +89,8 @@ func Init(container *libcontainer.Container, console string, pipe io.ReadCloser, // resolveRootfs ensures that the current working directory is // not a symlink and returns the 
absolute path to the rootfs -func resolveRootfs() (string, error) { - cwd, err := os.Getwd() - if err != nil { - return "", err - } - rootfs, err := filepath.Abs(cwd) +func resolveRootfs(uncleanRootfs string) (string, error) { + rootfs, err := filepath.Abs(uncleanRootfs) if err != nil { return "", err } diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index 6508a3e..0873c09 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -6,6 +6,7 @@ import ( "flag" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/nsinit" + "io" "io/ioutil" "log" "os" @@ -15,6 +16,7 @@ import ( var ( console string pipeFd int + logFile string ) var ( @@ -24,22 +26,27 @@ var ( func init() { flag.StringVar(&console, "console", "", "console (pty slave) path") + flag.StringVar(&logFile, "log", "none", "log options (none, stderr, or a file path)") flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") flag.Parse() } func main() { + if flag.NArg() < 1 { + log.Fatal(ErrWrongArguments) + } container, err := loadContainer() if err != nil { log.Fatal(err) } - if flag.NArg() < 1 { - log.Fatal(ErrWrongArguments) + if err := setupLogging(); err != nil { + log.Fatal(err) } - switch flag.Arg(0) { case "exec": // this is executed outside of the namespace in the cwd + log.SetPrefix("[nsinit exec] ") + var exitCode int nspid, err := readPid() if err != nil { @@ -50,17 +57,22 @@ func main() { if nspid > 0 { exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) } else { - exitCode, err = nsinit.Exec(container, flag.Args()[1:]) + exitCode, err = nsinit.Exec(container, logFile, flag.Args()[1:]) } if err != nil { log.Fatal(err) } os.Exit(exitCode) case "init": // this is executed inside of the namespace to setup the container + log.SetPrefix("[nsinit init] ") + cwd, err := os.Getwd() + if err != nil { + log.Fatal(err) + } if flag.NArg() < 2 { log.Fatal(ErrWrongArguments) } - if err := 
nsinit.Init(container, console, os.NewFile(uintptr(pipeFd), "pipe"), flag.Args()[1:]); err != nil { + if err := nsinit.Init(container, cwd, console, os.NewFile(uintptr(pipeFd), "pipe"), flag.Args()[1:]); err != nil { log.Fatal(err) } default: @@ -93,3 +105,20 @@ func readPid() (int, error) { } return pid, nil } + +func setupLogging() (err error) { + var writer io.Writer + switch logFile { + case "stderr": + writer = os.Stderr + case "none", "": + writer = ioutil.Discard + default: + writer, err = os.OpenFile(logFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0755) + if err != nil { + return err + } + } + log.SetOutput(writer) + return nil +} From e90b85bdc06ffaef70f801e00eb62158d016e2a7 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 15:32:50 -0800 Subject: [PATCH 059/117] User os.Args[0] as name to reexec Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 24e722a..ba548a2 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -219,7 +219,9 @@ func deletePidFile() error { // defined on the container's configuration and use the current binary as the init with the // args provided func createCommand(container *libcontainer.Container, console, logFile string, pipe uintptr, args []string) *exec.Cmd { - command := exec.Command("nsinit", append([]string{ + // get our binary name so we can always reexec ourself + name := os.Args[0] + command := exec.Command(name, append([]string{ "-console", console, "-pipe", fmt.Sprint(pipe), "-log", logFile, From b9bc36a8bb9cb928a0be940e22e77fb851175f87 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 16:17:18 -0800 Subject: [PATCH 060/117] Use lookup path for init Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 1 + libcontainer/nsinit/init.go | 
10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index ba548a2..80fe849 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -230,5 +230,6 @@ func createCommand(container *libcontainer.Container, console, logFile string, p command.SysProcAttr = &syscall.SysProcAttr{ Cloneflags: uintptr(getNamespaceFlags(container.Namespaces)), } + command.Env = container.Env return command } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 8fc5f3d..04716ba 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -12,6 +12,7 @@ import ( "io/ioutil" "log" "os" + "os/exec" "path/filepath" "syscall" ) @@ -80,8 +81,13 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, pipe return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) } } - log.Printf("execing %s goodbye", args[0]) - if err := system.Exec(args[0], args[0:], container.Env); err != nil { + name, err := exec.LookPath(args[0]) + if err != nil { + return err + } + + log.Printf("execing %s goodbye", name) + if err := system.Exec(name, args[0:], container.Env); err != nil { return fmt.Errorf("exec %s", err) } panic("unreachable") From b3d2325c5fc91e3065715ef94aacabd619035e2d Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 16:28:43 -0800 Subject: [PATCH 061/117] Pass pipes into Exec function Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 14 +++++++------- libcontainer/nsinit/nsinit/main.go | 4 +++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 80fe849..98f5209 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -19,7 +19,7 @@ import ( // Exec performes setup outside of a namespace so that a container can be // executed. 
Exec is a high level function for working with container namespaces. -func Exec(container *libcontainer.Container, logFile string, args []string) (int, error) { +func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io.Writer, logFile string, args []string) (int, error) { var ( master *os.File console string @@ -97,23 +97,23 @@ func Exec(container *libcontainer.Container, logFile string, args []string) (int if container.Tty { log.Printf("starting copy for tty") - go io.Copy(os.Stdout, master) - go io.Copy(master, os.Stdin) + go io.Copy(stdout, master) + go io.Copy(master, stdin) state, err := setupWindow(master) if err != nil { command.Process.Kill() return -1, err } - defer term.RestoreTerminal(os.Stdin.Fd(), state) + defer term.RestoreTerminal(uintptr(syscall.Stdin), state) } else { log.Printf("starting copy for std pipes") go func() { defer inPipe.Close() - io.Copy(inPipe, os.Stdin) + io.Copy(inPipe, stdin) }() - go io.Copy(os.Stdout, outPipe) - go io.Copy(os.Stderr, errPipe) + go io.Copy(stdout, outPipe) + go io.Copy(stderr, errPipe) } log.Printf("waiting on process") diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index 0873c09..e6e3827 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -57,7 +57,9 @@ func main() { if nspid > 0 { exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) } else { - exitCode, err = nsinit.Exec(container, logFile, flag.Args()[1:]) + exitCode, err = nsinit.Exec(container, + os.Stdin, os.Stdout, os.Stderr, + logFile, flag.Args()[1:]) } if err != nil { log.Fatal(err) From fa64bff7151c8050d7e6cab5e6d965bc389ae6e6 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 16:40:32 -0800 Subject: [PATCH 062/117] Pass tty master to Exec Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 3 +-- libcontainer/nsinit/nsinit/main.go | 2 +- 2 files changed, 2 insertions(+), 3 
deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 98f5209..3622196 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -19,9 +19,8 @@ import ( // Exec performes setup outside of a namespace so that a container can be // executed. Exec is a high level function for working with container namespaces. -func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io.Writer, logFile string, args []string) (int, error) { +func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io.Writer, master *os.File, logFile string, args []string) (int, error) { var ( - master *os.File console string err error diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index e6e3827..28d42d4 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -58,7 +58,7 @@ func main() { exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) } else { exitCode, err = nsinit.Exec(container, - os.Stdin, os.Stdout, os.Stderr, + os.Stdin, os.Stdout, os.Stderr, nil, logFile, flag.Args()[1:]) } if err != nil { From d2fa488fa2a8cf1d5d1ccc323246fb138e64a1f2 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 17:11:57 -0800 Subject: [PATCH 063/117] Initial commit of libcontainer running docker Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 2 +- libcontainer/nsinit/ns_linux.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 3622196..6671ebe 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -227,7 +227,7 @@ func createCommand(container *libcontainer.Container, console, logFile string, p "init"}, args...)...) 
command.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: uintptr(getNamespaceFlags(container.Namespaces)), + Cloneflags: uintptr(GetNamespaceFlags(container.Namespaces)), } command.Env = container.Env return command diff --git a/libcontainer/nsinit/ns_linux.go b/libcontainer/nsinit/ns_linux.go index e42d4b8..58af247 100644 --- a/libcontainer/nsinit/ns_linux.go +++ b/libcontainer/nsinit/ns_linux.go @@ -28,7 +28,7 @@ var namespaceFileMap = map[libcontainer.Namespace]string{ // getNamespaceFlags parses the container's Namespaces options to set the correct // flags on clone, unshare, and setns -func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { +func GetNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { for _, ns := range namespaces { flag |= namespaceMap[ns] } From cda4f27f57919dcfd0d2cc380456551ea50bdaa2 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 21:14:21 -0800 Subject: [PATCH 064/117] Export functions of nsinit Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 6671ebe..b2eaa0b 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -30,7 +30,7 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. if container.Tty { log.Printf("setting up master and console") - master, console, err = createMasterAndConsole() + master, console, err = CreateMasterAndConsole() if err != nil { return -1, err } @@ -44,7 +44,7 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. 
} system.UsetCloseOnExec(r.Fd()) - command := createCommand(container, console, logFile, r.Fd(), args) + command := CreateCommand(container, console, logFile, r.Fd(), args) if !container.Tty { log.Printf("opening pipes on command") if inPipe, err = command.StdinPipe(); err != nil { @@ -81,12 +81,12 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. if container.Network != nil { log.Printf("creating veth pair") - vethPair, err := initializeContainerVeth(container.Network.Bridge, container.Network.Mtu, command.Process.Pid) + vethPair, err := InitializeContainerVeth(container.Network.Bridge, container.Network.Mtu, command.Process.Pid) if err != nil { return -1, err } log.Printf("sending %s as veth pair name", vethPair) - sendVethName(w, vethPair) + SendVethName(w, vethPair) } // Sync with child @@ -99,7 +99,7 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. go io.Copy(stdout, master) go io.Copy(master, stdin) - state, err := setupWindow(master) + state, err := SetupWindow(master, os.Stdin) if err != nil { command.Process.Kill() return -1, err @@ -125,9 +125,9 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. 
return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } -// sendVethName writes the veth pair name to the child's stdin then closes the +// SendVethName writes the veth pair name to the child's stdin then closes the // pipe so that the child stops waiting for more data -func sendVethName(pipe io.Writer, name string) { +func SendVethName(pipe io.Writer, name string) { fmt.Fprint(pipe, name) } @@ -138,7 +138,7 @@ func sendVethName(pipe io.Writer, name string) { // Then will with set the other side of the veth pair into the container's namespaced // using the pid and returns the veth's interface name to provide to the container to // finish setting up the interface inside the namespace -func initializeContainerVeth(bridge string, mtu, nspid int) (string, error) { +func InitializeContainerVeth(bridge string, mtu, nspid int) (string, error) { name1, name2, err := createVethPair() if err != nil { return "", err @@ -160,20 +160,22 @@ func initializeContainerVeth(bridge string, mtu, nspid int) (string, error) { return name2, nil } -func setupWindow(master *os.File) (*term.State, error) { - ws, err := term.GetWinsize(os.Stdin.Fd()) +// SetupWindow gets the parent window size and sets the master +// pty to the current size and set the parents mode to RAW +func SetupWindow(master, parent *os.File) (*term.State, error) { + ws, err := term.GetWinsize(parent.Fd()) if err != nil { return nil, err } if err := term.SetWinsize(master.Fd(), ws); err != nil { return nil, err } - return term.SetRawTerminal(os.Stdin.Fd()) + return term.SetRawTerminal(parent.Fd()) } -// createMasterAndConsole will open /dev/ptmx on the host and retreive the +// CreateMasterAndConsole will open /dev/ptmx on the host and retreive the // pts name for use as the pty slave inside the container -func createMasterAndConsole() (*os.File, string, error) { +func CreateMasterAndConsole() (*os.File, string, error) { master, err := os.OpenFile("/dev/ptmx", 
syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) if err != nil { return nil, "", err @@ -217,7 +219,7 @@ func deletePidFile() error { // createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces // defined on the container's configuration and use the current binary as the init with the // args provided -func createCommand(container *libcontainer.Container, console, logFile string, pipe uintptr, args []string) *exec.Cmd { +func CreateCommand(container *libcontainer.Container, console, logFile string, pipe uintptr, args []string) *exec.Cmd { // get our binary name so we can always reexec ourself name := os.Args[0] command := exec.Command(name, append([]string{ From 609c298810f3c22176493a425dae22f274bfe20b Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 22:20:15 -0800 Subject: [PATCH 065/117] Refactor network creation and initialization into strategies Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 9 ++- libcontainer/container.go | 13 ++-- libcontainer/container.json | 9 ++- libcontainer/network/strategy.go | 32 ++++++++++ libcontainer/network/veth.go | 103 +++++++++++++++++++++++++++++++ libcontainer/nsinit/exec.go | 97 +++++++++++------------------ libcontainer/nsinit/init.go | 55 ++++++----------- 7 files changed, 211 insertions(+), 107 deletions(-) create mode 100644 libcontainer/network/strategy.go create mode 100644 libcontainer/network/veth.go diff --git a/libcontainer/README.md b/libcontainer/README.md index 89a4ec0..36553af 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -45,12 +45,17 @@ Sample `container.json` file: "AUDIT_WRITE", "AUDIT_CONTROL", "MAC_OVERRIDE", - "MAC_ADMIN" + "MAC_ADMIN", + "NET_ADMIN" ], "network": { + "type": "veth", + "context": { + "bridge": "docker0", + "prefix": "dock" + }, "address": "172.17.0.100/16", "gateway": "172.17.42.1", - "bridge": "docker0", "mtu": 1500 }, "cgroups": { diff --git 
a/libcontainer/container.go b/libcontainer/container.go index 3c1b62b..4a47977 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -4,6 +4,10 @@ import ( "github.com/dotcloud/docker/pkg/cgroups" ) +// Context is a generic key value pair that allows +// arbatrary data to be sent +type Context map[string]string + // Container defines configuration options for how a // container is setup inside a directory and how a process should be executed type Container struct { @@ -24,8 +28,9 @@ type Container struct { // The network configuration can be omited from a container causing the // container to be setup with the host's networking stack type Network struct { - Address string `json:"address,omitempty"` - Gateway string `json:"gateway,omitempty"` - Bridge string `json:"bridge,omitempty"` - Mtu int `json:"mtu,omitempty"` + Type string `json:"type,omitempty"` // type of networking to setup i.e. veth, macvlan, etc + Context Context `json:"context,omitempty"` // generic context for type specific networking options + Address string `json:"address,omitempty"` + Gateway string `json:"gateway,omitempty"` + Mtu int `json:"mtu,omitempty"` } diff --git a/libcontainer/container.json b/libcontainer/container.json index 07e52df..c2b21f8 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -28,12 +28,17 @@ "AUDIT_WRITE", "AUDIT_CONTROL", "MAC_OVERRIDE", - "MAC_ADMIN" + "MAC_ADMIN", + "NET_ADMIN" ], "network": { + "type": "veth", + "context": { + "bridge": "docker0", + "prefix": "dock" + }, "address": "172.17.0.100/16", "gateway": "172.17.42.1", - "bridge": "docker0", "mtu": 1500 }, "cgroups": { diff --git a/libcontainer/network/strategy.go b/libcontainer/network/strategy.go new file mode 100644 index 0000000..8ecc11a --- /dev/null +++ b/libcontainer/network/strategy.go @@ -0,0 +1,32 @@ +package network + +import ( + "errors" + "github.com/dotcloud/docker/pkg/libcontainer" +) + +var ( + ErrNotValidStrategyType = errors.New("not a valid 
network strategy type") +) + +var strategies = map[string]NetworkStrategy{ + "veth": &Veth{}, +} + +// NetworkStrategy represends a specific network configuration for +// a containers networking stack +type NetworkStrategy interface { + Create(*libcontainer.Network, int) (libcontainer.Context, error) + Initialize(*libcontainer.Network, libcontainer.Context) error +} + +// GetStrategy returns the specific network strategy for the +// provided type. If no strategy is registered for the type an +// ErrNotValidStrategyType is returned. +func GetStrategy(tpe string) (NetworkStrategy, error) { + s, exists := strategies[tpe] + if !exists { + return nil, ErrNotValidStrategyType + } + return s, nil +} diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go new file mode 100644 index 0000000..61fec55 --- /dev/null +++ b/libcontainer/network/veth.go @@ -0,0 +1,103 @@ +package network + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/utils" + "log" +) + +type Veth struct { +} + +func (v *Veth) Create(n *libcontainer.Network, nspid int) (libcontainer.Context, error) { + log.Printf("creating veth network") + var ( + bridge string + prefix string + exists bool + ) + if bridge, exists = n.Context["bridge"]; !exists { + return nil, fmt.Errorf("bridge does not exist in network context") + } + if prefix, exists = n.Context["prefix"]; !exists { + return nil, fmt.Errorf("veth prefix does not exist in network context") + } + name1, name2, err := createVethPair(prefix) + if err != nil { + return nil, err + } + context := libcontainer.Context{ + "vethHost": name1, + "vethChild": name2, + } + log.Printf("veth pair created %s <> %s", name1, name2) + if err := SetInterfaceMaster(name1, bridge); err != nil { + return context, err + } + if err := SetMtu(name1, n.Mtu); err != nil { + return context, err + } + if err := InterfaceUp(name1); err != nil { + return context, err + } + log.Printf("setting %s inside %d 
namespace", name2, nspid) + if err := SetInterfaceInNamespacePid(name2, nspid); err != nil { + return context, err + } + return context, nil +} + +func (v *Veth) Initialize(config *libcontainer.Network, context libcontainer.Context) error { + var ( + vethChild string + exists bool + ) + if vethChild, exists = context["vethChild"]; !exists { + return fmt.Errorf("vethChild does not exist in network context") + } + if err := InterfaceDown(vethChild); err != nil { + return fmt.Errorf("interface down %s %s", vethChild, err) + } + if err := ChangeInterfaceName(vethChild, "eth0"); err != nil { + return fmt.Errorf("change %s to eth0 %s", vethChild, err) + } + if err := SetInterfaceIp("eth0", config.Address); err != nil { + return fmt.Errorf("set eth0 ip %s", err) + } + if err := SetMtu("eth0", config.Mtu); err != nil { + return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) + } + if err := InterfaceUp("eth0"); err != nil { + return fmt.Errorf("eth0 up %s", err) + } + if err := SetMtu("lo", config.Mtu); err != nil { + return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) + } + if err := InterfaceUp("lo"); err != nil { + return fmt.Errorf("lo up %s", err) + } + if config.Gateway != "" { + if err := SetDefaultGateway(config.Gateway); err != nil { + return fmt.Errorf("set gateway to %s %s", config.Gateway, err) + } + } + return nil +} + +// createVethPair will automatically generage two random names for +// the veth pair and ensure that they have been created +func createVethPair(prefix string) (name1 string, name2 string, err error) { + name1, err = utils.GenerateRandomName(prefix, 4) + if err != nil { + return + } + name2, err = utils.GenerateRandomName(prefix, 4) + if err != nil { + return + } + if err = CreateVethPair(name1, name2); err != nil { + return + } + return +} diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index b2eaa0b..6c4d766 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -3,10 +3,10 @@ package 
nsinit import ( + "encoding/json" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/network" - "github.com/dotcloud/docker/pkg/libcontainer/utils" "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/term" "io" @@ -19,11 +19,11 @@ import ( // Exec performes setup outside of a namespace so that a container can be // executed. Exec is a high level function for working with container namespaces. -func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io.Writer, master *os.File, logFile string, args []string) (int, error) { +func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io.Writer, + master *os.File, logFile string, args []string) (int, error) { var ( - console string - err error - + console string + err error inPipe io.WriteCloser outPipe, errPipe io.ReadCloser ) @@ -46,7 +46,7 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. command := CreateCommand(container, console, logFile, r.Fd(), args) if !container.Tty { - log.Printf("opening pipes on command") + log.Printf("opening std pipes") if inPipe, err = command.StdinPipe(); err != nil { return -1, err } @@ -78,15 +78,9 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. return -1, err } } - - if container.Network != nil { - log.Printf("creating veth pair") - vethPair, err := InitializeContainerVeth(container.Network.Bridge, container.Network.Mtu, command.Process.Pid) - if err != nil { - return -1, err - } - log.Printf("sending %s as veth pair name", vethPair) - SendVethName(w, vethPair) + if err := InitializeNetworking(container, command.Process.Pid, w); err != nil { + command.Process.Kill() + return -1, err } // Sync with child @@ -104,7 +98,7 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. 
command.Process.Kill() return -1, err } - defer term.RestoreTerminal(uintptr(syscall.Stdin), state) + defer term.RestoreTerminal(os.Stdin.Fd(), state) } else { log.Printf("starting copy for std pipes") go func() { @@ -125,39 +119,34 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } -// SendVethName writes the veth pair name to the child's stdin then closes the -// pipe so that the child stops waiting for more data -func SendVethName(pipe io.Writer, name string) { - fmt.Fprint(pipe, name) +func InitializeNetworking(container *libcontainer.Container, nspid int, pipe io.Writer) error { + if container.Network != nil { + log.Printf("creating host network configuration type %s", container.Network.Type) + strategy, err := network.GetStrategy(container.Network.Type) + if err != nil { + return err + } + networkContext, err := strategy.Create(container.Network, nspid) + if err != nil { + return err + } + log.Printf("sending %v as network context", networkContext) + if err := SendContext(pipe, networkContext); err != nil { + return err + } + } + return nil } -// initializeContainerVeth will create a veth pair and setup the host's -// side of the pair by setting the specified bridge as the master and bringing -// up the interface. 
-// -// Then will with set the other side of the veth pair into the container's namespaced -// using the pid and returns the veth's interface name to provide to the container to -// finish setting up the interface inside the namespace -func InitializeContainerVeth(bridge string, mtu, nspid int) (string, error) { - name1, name2, err := createVethPair() +// SendContext writes the veth pair name to the child's stdin then closes the +// pipe so that the child stops waiting for more data +func SendContext(pipe io.Writer, context libcontainer.Context) error { + data, err := json.Marshal(context) if err != nil { - return "", err + return err } - log.Printf("veth pair created %s <> %s", name1, name2) - if err := network.SetInterfaceMaster(name1, bridge); err != nil { - return "", err - } - if err := network.SetMtu(name1, mtu); err != nil { - return "", err - } - if err := network.InterfaceUp(name1); err != nil { - return "", err - } - log.Printf("setting %s inside %d namespace", name2, nspid) - if err := network.SetInterfaceInNamespacePid(name2, nspid); err != nil { - return "", err - } - return name2, nil + pipe.Write(data) + return nil } // SetupWindow gets the parent window size and sets the master @@ -190,29 +179,13 @@ func CreateMasterAndConsole() (*os.File, string, error) { return master, console, nil } -// createVethPair will automatically generage two random names for -// the veth pair and ensure that they have been created -func createVethPair() (name1 string, name2 string, err error) { - name1, err = utils.GenerateRandomName("dock", 4) - if err != nil { - return - } - name2, err = utils.GenerateRandomName("dock", 4) - if err != nil { - return - } - if err = network.CreateVethPair(name1, name2); err != nil { - return - } - return -} - // writePidFile writes the namespaced processes pid to .nspid in the rootfs for the container func writePidFile(command *exec.Cmd) error { return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(command.Process.Pid)), 0655) } func 
deletePidFile() error { + log.Printf("removing .nspid file") return os.Remove(".nspid") } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 04716ba..f530d4a 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -3,6 +3,7 @@ package nsinit import ( + "encoding/json" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" @@ -27,13 +28,10 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, pipe log.Printf("initializing namespace at %s", rootfs) // We always read this as it is a way to sync with the parent as well - tempVethName, err := getVethName(pipe) + context, err := GetContextFromParent(pipe) if err != nil { return err } - if tempVethName != "" { - log.Printf("received veth name %s", tempVethName) - } if console != "" { log.Printf("setting up console for %s", console) // close pipes so that we can replace it with the pty @@ -62,7 +60,7 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, pipe if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { return fmt.Errorf("setup mount namespace %s", err) } - if err := setupVethNetwork(container.Network, tempVethName); err != nil { + if err := setupNetwork(container.Network, context); err != nil { return fmt.Errorf("setup networking %s", err) } if err := system.Sethostname(container.Hostname); err != nil { @@ -145,46 +143,29 @@ func openTerminal(name string, flag int) (*os.File, error) { // setupVethNetwork uses the Network config if it is not nil to initialize // the new veth interface inside the container for use by changing the name to eth0 // setting the MTU and IP address along with the default gateway -func setupVethNetwork(config *libcontainer.Network, tempVethName string) error { +func setupNetwork(config *libcontainer.Network, context libcontainer.Context) error { if config != nil { - if err := 
network.InterfaceDown(tempVethName); err != nil { - return fmt.Errorf("interface down %s %s", tempVethName, err) - } - if err := network.ChangeInterfaceName(tempVethName, "eth0"); err != nil { - return fmt.Errorf("change %s to eth0 %s", tempVethName, err) - } - if err := network.SetInterfaceIp("eth0", config.Address); err != nil { - return fmt.Errorf("set eth0 ip %s", err) - } - if err := network.SetMtu("eth0", config.Mtu); err != nil { - return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) - } - if err := network.InterfaceUp("eth0"); err != nil { - return fmt.Errorf("eth0 up %s", err) - } - if err := network.SetMtu("lo", config.Mtu); err != nil { - return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) - } - if err := network.InterfaceUp("lo"); err != nil { - return fmt.Errorf("lo up %s", err) - } - if config.Gateway != "" { - if err := network.SetDefaultGateway(config.Gateway); err != nil { - return fmt.Errorf("set gateway to %s %s", config.Gateway, err) - } + strategy, err := network.GetStrategy(config.Type) + if err != nil { + return err } + return strategy.Initialize(config, context) } return nil } -// getVethName reads from Stdin the temp veth name -// sent by the parent processes after the veth pair -// has been created and setup -func getVethName(pipe io.ReadCloser) (string, error) { +func GetContextFromParent(pipe io.ReadCloser) (libcontainer.Context, error) { defer pipe.Close() data, err := ioutil.ReadAll(pipe) if err != nil { - return "", fmt.Errorf("error reading from stdin %s", err) + return nil, fmt.Errorf("error reading from stdin %s", err) } - return string(data), nil + var context libcontainer.Context + if len(data) > 0 { + if err := json.Unmarshal(data, &context); err != nil { + return nil, err + } + log.Printf("received context %v", context) + } + return context, nil } From c71bc032798742df70551a7b6cbbe8ca6dbd03f2 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 22:37:09 -0800 Subject: [PATCH 066/117] Refactor exec 
method Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 138 ++++++++++++++++++++---------------- 1 file changed, 77 insertions(+), 61 deletions(-) diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 6c4d766..3cbe43a 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -22,20 +22,10 @@ import ( func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io.Writer, master *os.File, logFile string, args []string) (int, error) { var ( - console string - err error - inPipe io.WriteCloser - outPipe, errPipe io.ReadCloser + console string + err error ) - if container.Tty { - log.Printf("setting up master and console") - master, console, err = CreateMasterAndConsole() - if err != nil { - return -1, err - } - } - // create a pipe so that we can syncronize with the namespaced process and // pass the veth name to the child r, w, err := os.Pipe() @@ -44,49 +34,15 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. 
} system.UsetCloseOnExec(r.Fd()) + if container.Tty { + log.Printf("setting up master and console") + master, console, err = CreateMasterAndConsole() + if err != nil { + return -1, err + } + } + command := CreateCommand(container, console, logFile, r.Fd(), args) - if !container.Tty { - log.Printf("opening std pipes") - if inPipe, err = command.StdinPipe(); err != nil { - return -1, err - } - if outPipe, err = command.StdoutPipe(); err != nil { - return -1, err - } - if errPipe, err = command.StderrPipe(); err != nil { - return -1, err - } - } - - log.Printf("staring init") - if err := command.Start(); err != nil { - return -1, err - } - log.Printf("writting state file") - if err := writePidFile(command); err != nil { - command.Process.Kill() - return -1, err - } - defer deletePidFile() - - // Do this before syncing with child so that no children - // can escape the cgroup - if container.Cgroups != nil { - log.Printf("setting up cgroups") - if err := container.Cgroups.Apply(command.Process.Pid); err != nil { - command.Process.Kill() - return -1, err - } - } - if err := InitializeNetworking(container, command.Process.Pid, w); err != nil { - command.Process.Kill() - return -1, err - } - - // Sync with child - log.Printf("closing sync pipes") - w.Close() - r.Close() if container.Tty { log.Printf("starting copy for tty") @@ -100,15 +56,39 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. 
} defer term.RestoreTerminal(os.Stdin.Fd(), state) } else { - log.Printf("starting copy for std pipes") - go func() { - defer inPipe.Close() - io.Copy(inPipe, stdin) - }() - go io.Copy(stdout, outPipe) - go io.Copy(stderr, errPipe) + if err := startStdCopy(command, stdin, stdout, stderr); err != nil { + command.Process.Kill() + return -1, err + } } + log.Printf("staring init") + if err := command.Start(); err != nil { + return -1, err + } + log.Printf("writing state file") + if err := writePidFile(command); err != nil { + command.Process.Kill() + return -1, err + } + defer deletePidFile() + + // Do this before syncing with child so that no children + // can escape the cgroup + if err := SetupCgroups(container, command.Process.Pid); err != nil { + command.Process.Kill() + return -1, err + } + if err := InitializeNetworking(container, command.Process.Pid, w); err != nil { + command.Process.Kill() + return -1, err + } + + // Sync with child + log.Printf("closing sync pipes") + w.Close() + r.Close() + log.Printf("waiting on process") if err := command.Wait(); err != nil { if _, ok := err.(*exec.ExitError); !ok { @@ -119,6 +99,16 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. 
return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } +func SetupCgroups(container *libcontainer.Container, nspid int) error { + if container.Cgroups != nil { + log.Printf("setting up cgroups") + if err := container.Cgroups.Apply(nspid); err != nil { + return err + } + } + return nil +} + func InitializeNetworking(container *libcontainer.Container, nspid int, pipe io.Writer) error { if container.Network != nil { log.Printf("creating host network configuration type %s", container.Network.Type) @@ -207,3 +197,29 @@ func CreateCommand(container *libcontainer.Container, console, logFile string, p command.Env = container.Env return command } + +func startStdCopy(command *exec.Cmd, stdin io.Reader, stdout, stderr io.Writer) error { + log.Printf("opening std pipes") + inPipe, err := command.StdinPipe() + if err != nil { + return err + } + outPipe, err := command.StdoutPipe() + if err != nil { + return err + } + errPipe, err := command.StderrPipe() + if err != nil { + return err + } + + log.Printf("starting copy for std pipes") + go func() { + defer inPipe.Close() + io.Copy(inPipe, stdin) + }() + go io.Copy(stdout, outPipe) + go io.Copy(stderr, errPipe) + + return nil +} From 118ca3ae64768042997def8f3ff0d389d2049567 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 21 Feb 2014 22:58:30 -0800 Subject: [PATCH 067/117] Add syncpipe for passing context Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/exec.go | 27 +++-------- libcontainer/nsinit/init.go | 44 ++++++++---------- libcontainer/nsinit/nsinit/main.go | 6 ++- libcontainer/nsinit/sync_pipe.go | 73 ++++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 48 deletions(-) create mode 100644 libcontainer/nsinit/sync_pipe.go diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index 3cbe43a..ec75e9c 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -3,7 +3,6 @@ package nsinit import ( - 
"encoding/json" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/network" @@ -28,11 +27,10 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. // create a pipe so that we can syncronize with the namespaced process and // pass the veth name to the child - r, w, err := os.Pipe() + syncPipe, err := NewSyncPipe() if err != nil { return -1, err } - system.UsetCloseOnExec(r.Fd()) if container.Tty { log.Printf("setting up master and console") @@ -42,8 +40,7 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. } } - command := CreateCommand(container, console, logFile, r.Fd(), args) - + command := CreateCommand(container, console, logFile, syncPipe.child.Fd(), args) if container.Tty { log.Printf("starting copy for tty") go io.Copy(stdout, master) @@ -79,15 +76,14 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. command.Process.Kill() return -1, err } - if err := InitializeNetworking(container, command.Process.Pid, w); err != nil { + if err := InitializeNetworking(container, command.Process.Pid, syncPipe); err != nil { command.Process.Kill() return -1, err } // Sync with child log.Printf("closing sync pipes") - w.Close() - r.Close() + syncPipe.Close() log.Printf("waiting on process") if err := command.Wait(); err != nil { @@ -109,7 +105,7 @@ func SetupCgroups(container *libcontainer.Container, nspid int) error { return nil } -func InitializeNetworking(container *libcontainer.Container, nspid int, pipe io.Writer) error { +func InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error { if container.Network != nil { log.Printf("creating host network configuration type %s", container.Network.Type) strategy, err := network.GetStrategy(container.Network.Type) @@ -121,24 +117,13 @@ func InitializeNetworking(container *libcontainer.Container, nspid int, pipe io. 
return err } log.Printf("sending %v as network context", networkContext) - if err := SendContext(pipe, networkContext); err != nil { + if err := pipe.SendToChild(networkContext); err != nil { return err } } return nil } -// SendContext writes the veth pair name to the child's stdin then closes the -// pipe so that the child stops waiting for more data -func SendContext(pipe io.Writer, context libcontainer.Context) error { - data, err := json.Marshal(context) - if err != nil { - return err - } - pipe.Write(data) - return nil -} - // SetupWindow gets the parent window size and sets the master // pty to the current size and set the parents mode to RAW func SetupWindow(master, parent *os.File) (*term.State, error) { diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index f530d4a..cdedc14 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -3,14 +3,11 @@ package nsinit import ( - "encoding/json" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" - "io" - "io/ioutil" "log" "os" "os/exec" @@ -20,7 +17,7 @@ import ( // Init is the init process that first runs inside a new namespace to setup mounts, users, networking, // and other options required for the new container. 
-func Init(container *libcontainer.Container, uncleanRootfs, console string, pipe io.ReadCloser, args []string) error { +func Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { rootfs, err := resolveRootfs(uncleanRootfs) if err != nil { return err @@ -28,16 +25,18 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, pipe log.Printf("initializing namespace at %s", rootfs) // We always read this as it is a way to sync with the parent as well - context, err := GetContextFromParent(pipe) + context, err := syncPipe.ReadFromParent() if err != nil { + syncPipe.Close() return err } + syncPipe.Close() + log.Printf("received context from parent %v", context) + if console != "" { log.Printf("setting up console for %s", console) // close pipes so that we can replace it with the pty - os.Stdin.Close() - os.Stdout.Close() - os.Stderr.Close() + closeStdPipes() slave, err := openTerminal(console, syscall.O_RDWR) if err != nil { return fmt.Errorf("open terminal %s", err) @@ -79,18 +78,27 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, pipe return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) } } + return execArgs(args, container.Env) +} + +func execArgs(args []string, env []string) error { name, err := exec.LookPath(args[0]) if err != nil { return err } - log.Printf("execing %s goodbye", name) - if err := system.Exec(name, args[0:], container.Env); err != nil { + if err := system.Exec(name, args[0:], env); err != nil { return fmt.Errorf("exec %s", err) } panic("unreachable") } +func closeStdPipes() { + os.Stdin.Close() + os.Stdout.Close() + os.Stderr.Close() +} + // resolveRootfs ensures that the current working directory is // not a symlink and returns the absolute path to the rootfs func resolveRootfs(uncleanRootfs string) (string, error) { @@ -153,19 +161,3 @@ func setupNetwork(config *libcontainer.Network, context libcontainer.Context) er } return 
nil } - -func GetContextFromParent(pipe io.ReadCloser) (libcontainer.Context, error) { - defer pipe.Close() - data, err := ioutil.ReadAll(pipe) - if err != nil { - return nil, fmt.Errorf("error reading from stdin %s", err) - } - var context libcontainer.Context - if len(data) > 0 { - if err := json.Unmarshal(data, &context); err != nil { - return nil, err - } - log.Printf("received context %v", context) - } - return context, nil -} diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index 28d42d4..2400ab6 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -74,7 +74,11 @@ func main() { if flag.NArg() < 2 { log.Fatal(ErrWrongArguments) } - if err := nsinit.Init(container, cwd, console, os.NewFile(uintptr(pipeFd), "pipe"), flag.Args()[1:]); err != nil { + syncPipe, err := nsinit.NewSyncPipeFromFd(0, uintptr(pipeFd)) + if err != nil { + log.Fatal(err) + } + if err := nsinit.Init(container, cwd, console, syncPipe, flag.Args()[1:]); err != nil { log.Fatal(err) } default: diff --git a/libcontainer/nsinit/sync_pipe.go b/libcontainer/nsinit/sync_pipe.go new file mode 100644 index 0000000..7b29e98 --- /dev/null +++ b/libcontainer/nsinit/sync_pipe.go @@ -0,0 +1,73 @@ +package nsinit + +import ( + "encoding/json" + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/system" + "io/ioutil" + "os" +) + +// SyncPipe allows communication to and from the child processes +// to it's parent and allows the two independent processes to +// syncronize their state. 
+type SyncPipe struct { + parent, child *os.File +} + +func NewSyncPipe() (s *SyncPipe, err error) { + s = &SyncPipe{} + s.child, s.parent, err = os.Pipe() + if err != nil { + return nil, err + } + system.UsetCloseOnExec(s.child.Fd()) + return s, nil +} + +func NewSyncPipeFromFd(parendFd, childFd uintptr) (*SyncPipe, error) { + s := &SyncPipe{} + if parendFd > 0 { + s.parent = os.NewFile(parendFd, "parendPipe") + } else if childFd > 0 { + s.child = os.NewFile(childFd, "childPipe") + } else { + return nil, fmt.Errorf("no valid sync pipe fd specified") + } + return s, nil +} + +func (s *SyncPipe) SendToChild(context libcontainer.Context) error { + data, err := json.Marshal(context) + if err != nil { + return err + } + s.parent.Write(data) + return nil +} + +func (s *SyncPipe) ReadFromParent() (libcontainer.Context, error) { + data, err := ioutil.ReadAll(s.child) + if err != nil { + return nil, fmt.Errorf("error reading from sync pipe %s", err) + } + var context libcontainer.Context + if len(data) > 0 { + if err := json.Unmarshal(data, &context); err != nil { + return nil, err + } + } + return context, nil + +} + +func (s *SyncPipe) Close() error { + if s.parent != nil { + s.parent.Close() + } + if s.child != nil { + s.child.Close() + } + return nil +} From 1271ddcd61f90b3c25654bdb5acf130b6e107189 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Sat, 22 Feb 2014 00:29:21 -0800 Subject: [PATCH 068/117] Abstract out diff implementations for importing Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/command.go | 34 +++++++++ libcontainer/nsinit/exec.go | 107 ++++------------------------ libcontainer/nsinit/nsinit/main.go | 4 +- libcontainer/nsinit/state.go | 24 +++++++ libcontainer/nsinit/term.go | 109 +++++++++++++++++++++++++++++ 5 files changed, 184 insertions(+), 94 deletions(-) create mode 100644 libcontainer/nsinit/command.go create mode 100644 libcontainer/nsinit/state.go create mode 100644 libcontainer/nsinit/term.go 
diff --git a/libcontainer/nsinit/command.go b/libcontainer/nsinit/command.go new file mode 100644 index 0000000..b1c5631 --- /dev/null +++ b/libcontainer/nsinit/command.go @@ -0,0 +1,34 @@ +package nsinit + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "os" + "os/exec" + "syscall" +) + +type CommandFactory interface { + Create(container *libcontainer.Container, console, logFile string, syncFd uintptr, args []string) *exec.Cmd +} + +type DefaultCommandFactory struct{} + +// Create will return an exec.Cmd with the Cloneflags set to the proper namespaces +// defined on the container's configuration and use the current binary as the init with the +// args provided +func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console, logFile string, pipe uintptr, args []string) *exec.Cmd { + // get our binary name so we can always reexec ourself + name := os.Args[0] + command := exec.Command(name, append([]string{ + "-console", console, + "-pipe", fmt.Sprint(pipe), + "-log", logFile, + "init"}, args...)...) + + command.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: uintptr(GetNamespaceFlags(container.Namespaces)), + } + command.Env = container.Env + return command +} diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index ec75e9c..ee83f4f 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -3,13 +3,9 @@ package nsinit import ( - "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" - "github.com/dotcloud/docker/pkg/term" - "io" - "io/ioutil" "log" "os" "os/exec" @@ -18,9 +14,11 @@ import ( // Exec performes setup outside of a namespace so that a container can be // executed. Exec is a high level function for working with container namespaces. 
-func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io.Writer, - master *os.File, logFile string, args []string) (int, error) { +func Exec(container *libcontainer.Container, + factory CommandFactory, state StateWriter, term Terminal, + logFile string, args []string) (int, error) { var ( + master *os.File console string err error ) @@ -38,37 +36,28 @@ func Exec(container *libcontainer.Container, stdin io.Reader, stdout, stderr io. if err != nil { return -1, err } + term.SetMaster(master) } - command := CreateCommand(container, console, logFile, syncPipe.child.Fd(), args) - if container.Tty { - log.Printf("starting copy for tty") - go io.Copy(stdout, master) - go io.Copy(master, stdin) - - state, err := SetupWindow(master, os.Stdin) - if err != nil { - command.Process.Kill() - return -1, err - } - defer term.RestoreTerminal(os.Stdin.Fd(), state) - } else { - if err := startStdCopy(command, stdin, stdout, stderr); err != nil { - command.Process.Kill() - return -1, err - } + command := factory.Create(container, console, logFile, syncPipe.child.Fd(), args) + if err := term.Attach(command); err != nil { + return -1, err } + defer term.Close() log.Printf("staring init") if err := command.Start(); err != nil { return -1, err } log.Printf("writing state file") - if err := writePidFile(command); err != nil { + if err := state.WritePid(command.Process.Pid); err != nil { command.Process.Kill() return -1, err } - defer deletePidFile() + defer func() { + log.Printf("removing state file") + state.DeletePid() + }() // Do this before syncing with child so that no children // can escape the cgroup @@ -124,19 +113,6 @@ func InitializeNetworking(container *libcontainer.Container, nspid int, pipe *Sy return nil } -// SetupWindow gets the parent window size and sets the master -// pty to the current size and set the parents mode to RAW -func SetupWindow(master, parent *os.File) (*term.State, error) { - ws, err := term.GetWinsize(parent.Fd()) - if err != nil { - 
return nil, err - } - if err := term.SetWinsize(master.Fd(), ws); err != nil { - return nil, err - } - return term.SetRawTerminal(parent.Fd()) -} - // CreateMasterAndConsole will open /dev/ptmx on the host and retreive the // pts name for use as the pty slave inside the container func CreateMasterAndConsole() (*os.File, string, error) { @@ -153,58 +129,3 @@ func CreateMasterAndConsole() (*os.File, string, error) { } return master, console, nil } - -// writePidFile writes the namespaced processes pid to .nspid in the rootfs for the container -func writePidFile(command *exec.Cmd) error { - return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(command.Process.Pid)), 0655) -} - -func deletePidFile() error { - log.Printf("removing .nspid file") - return os.Remove(".nspid") -} - -// createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces -// defined on the container's configuration and use the current binary as the init with the -// args provided -func CreateCommand(container *libcontainer.Container, console, logFile string, pipe uintptr, args []string) *exec.Cmd { - // get our binary name so we can always reexec ourself - name := os.Args[0] - command := exec.Command(name, append([]string{ - "-console", console, - "-pipe", fmt.Sprint(pipe), - "-log", logFile, - "init"}, args...)...) 
- - command.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: uintptr(GetNamespaceFlags(container.Namespaces)), - } - command.Env = container.Env - return command -} - -func startStdCopy(command *exec.Cmd, stdin io.Reader, stdout, stderr io.Writer) error { - log.Printf("opening std pipes") - inPipe, err := command.StdinPipe() - if err != nil { - return err - } - outPipe, err := command.StdoutPipe() - if err != nil { - return err - } - errPipe, err := command.StderrPipe() - if err != nil { - return err - } - - log.Printf("starting copy for std pipes") - go func() { - defer inPipe.Close() - io.Copy(inPipe, stdin) - }() - go io.Copy(stdout, outPipe) - go io.Copy(stderr, errPipe) - - return nil -} diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index 2400ab6..c299412 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -57,8 +57,10 @@ func main() { if nspid > 0 { exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) } else { + term := nsinit.NewTerminal(os.Stdin, os.Stdout, os.Stderr, container.Tty) exitCode, err = nsinit.Exec(container, - os.Stdin, os.Stdout, os.Stderr, nil, + &nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{}, + term, logFile, flag.Args()[1:]) } if err != nil { diff --git a/libcontainer/nsinit/state.go b/libcontainer/nsinit/state.go new file mode 100644 index 0000000..1f0fedd --- /dev/null +++ b/libcontainer/nsinit/state.go @@ -0,0 +1,24 @@ +package nsinit + +import ( + "fmt" + "io/ioutil" + "os" +) + +type StateWriter interface { + WritePid(pid int) error + DeletePid() error +} + +type DefaultStateWriter struct { +} + +// writePidFile writes the namespaced processes pid to .nspid in the rootfs for the container +func (*DefaultStateWriter) WritePid(pid int) error { + return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(pid)), 0655) +} + +func (*DefaultStateWriter) DeletePid() error { + return os.Remove(".nspid") +} diff --git a/libcontainer/nsinit/term.go 
b/libcontainer/nsinit/term.go new file mode 100644 index 0000000..6492468 --- /dev/null +++ b/libcontainer/nsinit/term.go @@ -0,0 +1,109 @@ +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/term" + "io" + "os" + "os/exec" +) + +type Terminal interface { + io.Closer + SetMaster(*os.File) + Attach(*exec.Cmd) error +} + +func NewTerminal(stdin io.Reader, stdout, stderr io.Writer, tty bool) Terminal { + if tty { + return &TtyTerminal{ + stdin: stdin, + stdout: stdout, + stderr: stderr, + } + } + return &StdTerminal{ + stdin: stdin, + stdout: stdout, + stderr: stderr, + } +} + +type TtyTerminal struct { + stdin io.Reader + stdout, stderr io.Writer + master *os.File + state *term.State +} + +func (t *TtyTerminal) SetMaster(master *os.File) { + t.master = master +} + +func (t *TtyTerminal) Attach(command *exec.Cmd) error { + go io.Copy(t.stdout, t.master) + go io.Copy(t.master, t.stdin) + + state, err := t.setupWindow(t.master, os.Stdin) + if err != nil { + command.Process.Kill() + return err + } + t.state = state + return err +} + +// SetupWindow gets the parent window size and sets the master +// pty to the current size and set the parents mode to RAW +func (t *TtyTerminal) setupWindow(master, parent *os.File) (*term.State, error) { + ws, err := term.GetWinsize(parent.Fd()) + if err != nil { + return nil, err + } + if err := term.SetWinsize(master.Fd(), ws); err != nil { + return nil, err + } + return term.SetRawTerminal(parent.Fd()) +} + +func (t *TtyTerminal) Close() error { + term.RestoreTerminal(os.Stdin.Fd(), t.state) + return t.master.Close() +} + +type StdTerminal struct { + stdin io.Reader + stdout, stderr io.Writer +} + +func (s *StdTerminal) SetMaster(*os.File) { + // no need to set master on non tty +} + +func (s *StdTerminal) Close() error { + return nil +} + +func (s *StdTerminal) Attach(command *exec.Cmd) error { + inPipe, err := command.StdinPipe() + if err != nil { + return err + } + outPipe, err := command.StdoutPipe() + if err != nil { + 
return err + } + errPipe, err := command.StderrPipe() + if err != nil { + return err + } + + go func() { + defer inPipe.Close() + io.Copy(inPipe, s.stdin) + }() + + go io.Copy(s.stdout, outPipe) + go io.Copy(s.stderr, errPipe) + + return nil +} From a42c6fafbedef2718a149c8f2fe364203e9e133f Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Sat, 22 Feb 2014 01:21:26 -0800 Subject: [PATCH 069/117] Refactor driver to use Exec function from nsini Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/state.go | 10 ++++++---- libcontainer/nsinit/term.go | 9 +++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/libcontainer/nsinit/state.go b/libcontainer/nsinit/state.go index 1f0fedd..2dbaaa5 100644 --- a/libcontainer/nsinit/state.go +++ b/libcontainer/nsinit/state.go @@ -4,6 +4,7 @@ import ( "fmt" "io/ioutil" "os" + "path/filepath" ) type StateWriter interface { @@ -12,13 +13,14 @@ type StateWriter interface { } type DefaultStateWriter struct { + Root string } // writePidFile writes the namespaced processes pid to .nspid in the rootfs for the container -func (*DefaultStateWriter) WritePid(pid int) error { - return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(pid)), 0655) +func (d *DefaultStateWriter) WritePid(pid int) error { + return ioutil.WriteFile(filepath.Join(d.Root, ".nspid"), []byte(fmt.Sprint(pid)), 0655) } -func (*DefaultStateWriter) DeletePid() error { - return os.Remove(".nspid") +func (d *DefaultStateWriter) DeletePid() error { + return os.Remove(filepath.Join(d.Root, ".nspid")) } diff --git a/libcontainer/nsinit/term.go b/libcontainer/nsinit/term.go index 6492468..58dccab 100644 --- a/libcontainer/nsinit/term.go +++ b/libcontainer/nsinit/term.go @@ -11,6 +11,7 @@ type Terminal interface { io.Closer SetMaster(*os.File) Attach(*exec.Cmd) error + Resize(h, w int) error } func NewTerminal(stdin io.Reader, stdout, stderr io.Writer, tty bool) Terminal { @@ -35,6 +36,10 @@ type TtyTerminal struct { state 
*term.State } +func (t *TtyTerminal) Resize(h, w int) error { + return term.SetWinsize(t.master.Fd(), &term.Winsize{Height: uint16(h), Width: uint16(w)}) +} + func (t *TtyTerminal) SetMaster(master *os.File) { t.master = master } @@ -83,6 +88,10 @@ func (s *StdTerminal) Close() error { return nil } +func (s *StdTerminal) Resize(h, w int) error { + return nil +} + func (s *StdTerminal) Attach(command *exec.Cmd) error { inPipe, err := command.StdinPipe() if err != nil { From d388db815c75b5f34909167ead0683559c2ea074 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 24 Feb 2014 10:46:20 -0800 Subject: [PATCH 070/117] Look for cpu subsystem instead of memory Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- cgroups/cgroups.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index 96002f0..e260d67 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -132,7 +132,7 @@ func (c *Cgroup) Apply(pid int) error { // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ // // we can pick any subsystem to find the root - cgroupRoot, err := FindCgroupMountpoint("memory") + cgroupRoot, err := FindCgroupMountpoint("cpu") if err != nil { return err } From b899d9bc446dfa6a6bb5f1b62b0b7800bab5fc74 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 24 Feb 2014 13:40:17 -0800 Subject: [PATCH 071/117] Fix tests with dockerinit lookup path Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/nsinit/main.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index c299412..786c9c1 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -24,7 +24,7 @@ var ( ErrWrongArguments = errors.New("Wrong argument count") ) -func init() { +func registerFlags() { flag.StringVar(&console, "console", "", "console (pty slave) path") 
flag.StringVar(&logFile, "log", "none", "log options (none, stderr, or a file path)") flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") @@ -33,6 +33,8 @@ func init() { } func main() { + registerFlags() + if flag.NArg() < 1 { log.Fatal(ErrWrongArguments) } From d50dc3cb7e46ad36201bbe74a802d9a4f4817cab Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 24 Feb 2014 13:52:56 -0800 Subject: [PATCH 072/117] Honor user passed on container in nsinit Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/init.go | 34 +++++++++++++++++++++++++--------- system/calls_linux.go | 8 ++++++++ 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index cdedc14..23303cd 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -8,6 +8,7 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" + "github.com/dotcloud/docker/pkg/user" "log" "os" "os/exec" @@ -110,15 +111,30 @@ func resolveRootfs(uncleanRootfs string) (string, error) { } func setupUser(container *libcontainer.Container) error { - // TODO: honor user passed on container - if err := system.Setgroups(nil); err != nil { - return err - } - if err := system.Setresgid(0, 0, 0); err != nil { - return err - } - if err := system.Setresuid(0, 0, 0); err != nil { - return err + if container.User != "" { + uid, gid, suppGids, err := user.GetUserGroupSupplementary(container.User, syscall.Getuid(), syscall.Getgid()) + if err != nil { + return err + } + if err := system.Setgroups(suppGids); err != nil { + return err + } + if err := system.Setgid(gid); err != nil { + return err + } + if err := system.Setuid(uid); err != nil { + return err + } + } else { + if err := system.Setgroups(nil); err != nil { + return err + } + if err := system.Setresgid(0, 0, 0); err != nil { + return err + } + if err := 
system.Setresuid(0, 0, 0); err != nil { + return err + } } return nil } diff --git a/system/calls_linux.go b/system/calls_linux.go index 42afa34..0bf42e3 100644 --- a/system/calls_linux.go +++ b/system/calls_linux.go @@ -71,6 +71,14 @@ func Setresuid(ruid, euid, suid int) error { return syscall.Setresuid(ruid, euid, suid) } +func Setgid(gid int) error { + return syscall.Setgid(gid) +} + +func Setuid(uid int) error { + return syscall.Setuid(uid) +} + func Sethostname(name string) error { return syscall.Sethostname([]byte(name)) } From 8a05149d919b001d5a7a3193529df0f6f5ebba05 Mon Sep 17 00:00:00 2001 From: Victor Vieux Date: Wed, 19 Feb 2014 00:24:32 +0000 Subject: [PATCH 073/117] enable docker run -it Docker-DCO-1.1-Signed-off-by: Victor Vieux (github: vieux) --- mflag/example/example.go | 1 + mflag/flag.go | 48 ++++++++++++++++++++++++++++++---------- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/mflag/example/example.go b/mflag/example/example.go index fa26c97..b0d25fb 100644 --- a/mflag/example/example.go +++ b/mflag/example/example.go @@ -27,4 +27,5 @@ func main() { fmt.Printf("b: %b\n", b) fmt.Printf("-bool: %b\n", b2) fmt.Printf("s/#hidden/-string(via lookup): %s\n", flag.Lookup("s").Value.String()) + fmt.Printf("ARGS: %v\n", flag.Args()) } diff --git a/mflag/flag.go b/mflag/flag.go index f721e04..7a0e8bf 100644 --- a/mflag/flag.go +++ b/mflag/flag.go @@ -77,6 +77,9 @@ import ( // ErrHelp is the error returned if the flag -help is invoked but no such flag is defined. var ErrHelp = errors.New("flag: help requested") +// ErrRetry is the error returned if you need to try letter by letter +var ErrRetry = errors.New("flag: retry") + // -- bool Value type boolValue bool @@ -733,21 +736,21 @@ func (f *FlagSet) usage() { } // parseOne parses one flag. It reports whether a flag was seen. 
-func (f *FlagSet) parseOne() (bool, error) { +func (f *FlagSet) parseOne() (bool, string, error) { if len(f.args) == 0 { - return false, nil + return false, "", nil } s := f.args[0] if len(s) == 0 || s[0] != '-' || len(s) == 1 { - return false, nil + return false, "", nil } if s[1] == '-' && len(s) == 2 { // "--" terminates the flags f.args = f.args[1:] - return false, nil + return false, "", nil } name := s[1:] if len(name) == 0 || name[0] == '=' { - return false, f.failf("bad flag syntax: %s", s) + return false, "", f.failf("bad flag syntax: %s", s) } // it's a flag. does it have an argument? @@ -767,14 +770,14 @@ func (f *FlagSet) parseOne() (bool, error) { if !alreadythere { if name == "-help" || name == "help" || name == "h" { // special case for nice help message. f.usage() - return false, ErrHelp + return false, "", ErrHelp } - return false, f.failf("flag provided but not defined: -%s", name) + return false, name, ErrRetry } if fv, ok := flag.Value.(boolFlag); ok && fv.IsBoolFlag() { // special case: doesn't need an arg if has_value { if err := fv.Set(value); err != nil { - return false, f.failf("invalid boolean value %q for -%s: %v", value, name, err) + return false, "", f.failf("invalid boolean value %q for -%s: %v", value, name, err) } } else { fv.Set("true") @@ -787,17 +790,17 @@ func (f *FlagSet) parseOne() (bool, error) { value, f.args = f.args[0], f.args[1:] } if !has_value { - return false, f.failf("flag needs an argument: -%s", name) + return false, "", f.failf("flag needs an argument: -%s", name) } if err := flag.Value.Set(value); err != nil { - return false, f.failf("invalid value %q for flag -%s: %v", value, name, err) + return false, "", f.failf("invalid value %q for flag -%s: %v", value, name, err) } } if f.actual == nil { f.actual = make(map[string]*Flag) } f.actual[name] = flag - return true, nil + return true, "", nil } // Parse parses flag definitions from the argument list, which should not @@ -808,13 +811,34 @@ func (f *FlagSet) 
Parse(arguments []string) error { f.parsed = true f.args = arguments for { - seen, err := f.parseOne() + seen, name, err := f.parseOne() if seen { continue } if err == nil { break } + if err == ErrRetry { + if len(name) > 1 { + err = nil + for _, letter := range strings.Split(name, "") { + f.args = append([]string{"-" + letter}, f.args...) + seen2, _, err2 := f.parseOne() + if seen2 { + continue + } + if err2 != nil { + err = f.failf("flag provided but not defined: -%s", name) + break + } + } + if err == nil { + continue + } + } else { + err = f.failf("flag provided but not defined: -%s", name) + } + } switch f.errorHandling { case ContinueOnError: return err From c8ad8184ec75d3abfcb9654e62827a60cc3fd79f Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 24 Feb 2014 15:47:23 -0800 Subject: [PATCH 074/117] Cgroups allow devices for privileged containers Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/init.go | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 23303cd..d6d7dc3 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -9,7 +9,6 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/user" - "log" "os" "os/exec" "path/filepath" @@ -23,7 +22,6 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, sync if err != nil { return err } - log.Printf("initializing namespace at %s", rootfs) // We always read this as it is a way to sync with the parent as well context, err := syncPipe.ReadFromParent() @@ -32,10 +30,8 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, sync return err } syncPipe.Close() - log.Printf("received context from parent %v", context) if console != "" { - log.Printf("setting up console for %s", console) // close pipes so that we can replace it with the 
pty closeStdPipes() slave, err := openTerminal(console, syscall.O_RDWR) @@ -66,11 +62,9 @@ func Init(container *libcontainer.Container, uncleanRootfs, console string, sync if err := system.Sethostname(container.Hostname); err != nil { return fmt.Errorf("sethostname %s", err) } - log.Printf("dropping capabilities") if err := capabilities.DropCapabilities(container); err != nil { return fmt.Errorf("drop capabilities %s", err) } - log.Printf("setting user in namespace") if err := setupUser(container); err != nil { return fmt.Errorf("setup user %s", err) } @@ -87,7 +81,6 @@ func execArgs(args []string, env []string) error { if err != nil { return err } - log.Printf("execing %s goodbye", name) if err := system.Exec(name, args[0:], env); err != nil { return fmt.Errorf("exec %s", err) } @@ -111,7 +104,7 @@ func resolveRootfs(uncleanRootfs string) (string, error) { } func setupUser(container *libcontainer.Container) error { - if container.User != "" { + if container.User != "" && container.User != "root" { uid, gid, suppGids, err := user.GetUserGroupSupplementary(container.User, syscall.Getuid(), syscall.Getgid()) if err != nil { return err From 0e4d946dc4a5ed1c689cf2a57b3da38bf99ba1b1 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 24 Feb 2014 18:38:24 -0800 Subject: [PATCH 075/117] Improve logging for nsinit Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/network/veth.go | 4 --- libcontainer/nsinit/exec.go | 43 +++++++++++++++--------------- libcontainer/nsinit/execin.go | 2 +- libcontainer/nsinit/init.go | 2 +- libcontainer/nsinit/nsinit.go | 29 ++++++++++++++++++++ libcontainer/nsinit/nsinit/main.go | 36 ++++++++++++++----------- 6 files changed, 72 insertions(+), 44 deletions(-) create mode 100644 libcontainer/nsinit/nsinit.go diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go index 61fec55..321c68e 100644 --- a/libcontainer/network/veth.go +++ b/libcontainer/network/veth.go @@ -4,14 +4,12 @@ import 
( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/utils" - "log" ) type Veth struct { } func (v *Veth) Create(n *libcontainer.Network, nspid int) (libcontainer.Context, error) { - log.Printf("creating veth network") var ( bridge string prefix string @@ -31,7 +29,6 @@ func (v *Veth) Create(n *libcontainer.Network, nspid int) (libcontainer.Context, "vethHost": name1, "vethChild": name2, } - log.Printf("veth pair created %s <> %s", name1, name2) if err := SetInterfaceMaster(name1, bridge); err != nil { return context, err } @@ -41,7 +38,6 @@ func (v *Veth) Create(n *libcontainer.Network, nspid int) (libcontainer.Context, if err := InterfaceUp(name1); err != nil { return context, err } - log.Printf("setting %s inside %d namespace", name2, nspid) if err := SetInterfaceInNamespacePid(name2, nspid); err != nil { return context, err } diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index ee83f4f..c407323 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -6,7 +6,6 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/system" - "log" "os" "os/exec" "syscall" @@ -14,9 +13,7 @@ import ( // Exec performes setup outside of a namespace so that a container can be // executed. Exec is a high level function for working with container namespaces. 
-func Exec(container *libcontainer.Container, - factory CommandFactory, state StateWriter, term Terminal, - logFile string, args []string) (int, error) { +func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) { var ( master *os.File console string @@ -31,7 +28,7 @@ func Exec(container *libcontainer.Container, } if container.Tty { - log.Printf("setting up master and console") + ns.logger.Printf("setting up master and console") master, console, err = CreateMasterAndConsole() if err != nil { return -1, err @@ -39,54 +36,56 @@ func Exec(container *libcontainer.Container, term.SetMaster(master) } - command := factory.Create(container, console, logFile, syncPipe.child.Fd(), args) + command := ns.commandFactory.Create(container, console, ns.logFile, syncPipe.child.Fd(), args) if err := term.Attach(command); err != nil { return -1, err } defer term.Close() - log.Printf("staring init") + ns.logger.Printf("staring init") if err := command.Start(); err != nil { return -1, err } - log.Printf("writing state file") - if err := state.WritePid(command.Process.Pid); err != nil { + ns.logger.Printf("writing state file") + if err := ns.stateWriter.WritePid(command.Process.Pid); err != nil { command.Process.Kill() return -1, err } defer func() { - log.Printf("removing state file") - state.DeletePid() + ns.logger.Printf("removing state file") + ns.stateWriter.DeletePid() }() // Do this before syncing with child so that no children // can escape the cgroup - if err := SetupCgroups(container, command.Process.Pid); err != nil { + if err := ns.SetupCgroups(container, command.Process.Pid); err != nil { command.Process.Kill() return -1, err } - if err := InitializeNetworking(container, command.Process.Pid, syncPipe); err != nil { + if err := ns.InitializeNetworking(container, command.Process.Pid, syncPipe); err != nil { command.Process.Kill() return -1, err } // Sync with child - log.Printf("closing sync pipes") + ns.logger.Printf("closing sync 
pipes") syncPipe.Close() - log.Printf("waiting on process") + ns.logger.Printf("waiting on process") if err := command.Wait(); err != nil { if _, ok := err.(*exec.ExitError); !ok { return -1, err } } - log.Printf("process ended") - return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil + + exitCode := command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus() + ns.logger.Printf("process ended with exit code %d", exitCode) + return exitCode, nil } -func SetupCgroups(container *libcontainer.Container, nspid int) error { +func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) error { if container.Cgroups != nil { - log.Printf("setting up cgroups") + ns.logger.Printf("setting up cgroups") if err := container.Cgroups.Apply(nspid); err != nil { return err } @@ -94,9 +93,9 @@ func SetupCgroups(container *libcontainer.Container, nspid int) error { return nil } -func InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error { +func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error { if container.Network != nil { - log.Printf("creating host network configuration type %s", container.Network.Type) + ns.logger.Printf("creating host network configuration type %s", container.Network.Type) strategy, err := network.GetStrategy(container.Network.Type) if err != nil { return err @@ -105,7 +104,7 @@ func InitializeNetworking(container *libcontainer.Container, nspid int, pipe *Sy if err != nil { return err } - log.Printf("sending %v as network context", networkContext) + ns.logger.Printf("sending %v as network context", networkContext) if err := pipe.SendToChild(networkContext); err != nil { return err } diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 85a8990..9c33f69 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -12,7 +12,7 @@ import ( ) // ExecIn uses an existing pid and joins the pid's 
namespaces with the new command. -func ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) { +func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) { for _, ns := range container.Namespaces { if err := system.Unshare(namespaceMap[ns]); err != nil { return -1, err diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index d6d7dc3..5e33169 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -17,7 +17,7 @@ import ( // Init is the init process that first runs inside a new namespace to setup mounts, users, networking, // and other options required for the new container. -func Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { +func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { rootfs, err := resolveRootfs(uncleanRootfs) if err != nil { return err diff --git a/libcontainer/nsinit/nsinit.go b/libcontainer/nsinit/nsinit.go new file mode 100644 index 0000000..599461e --- /dev/null +++ b/libcontainer/nsinit/nsinit.go @@ -0,0 +1,29 @@ +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" + "log" +) + +type NsInit interface { + Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) + ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) + Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error +} + +type linuxNs struct { + root string + logFile string + logger *log.Logger + commandFactory CommandFactory + stateWriter StateWriter +} + +func NewNsInit(logger *log.Logger, logFile string, command CommandFactory, state StateWriter) NsInit { + return &linuxNs{ + logger: logger, + commandFactory: command, + stateWriter: state, + logFile: logFile, + } +} diff --git 
a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index 786c9c1..c25037f 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -42,13 +42,13 @@ func main() { if err != nil { log.Fatal(err) } - if err := setupLogging(); err != nil { + ns, err := newNsInit() + if err != nil { log.Fatal(err) } + switch flag.Arg(0) { case "exec": // this is executed outside of the namespace in the cwd - log.SetPrefix("[nsinit exec] ") - var exitCode int nspid, err := readPid() if err != nil { @@ -57,20 +57,16 @@ func main() { } } if nspid > 0 { - exitCode, err = nsinit.ExecIn(container, nspid, flag.Args()[1:]) + exitCode, err = ns.ExecIn(container, nspid, flag.Args()[1:]) } else { term := nsinit.NewTerminal(os.Stdin, os.Stdout, os.Stderr, container.Tty) - exitCode, err = nsinit.Exec(container, - &nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{}, - term, - logFile, flag.Args()[1:]) + exitCode, err = ns.Exec(container, term, flag.Args()[1:]) } if err != nil { log.Fatal(err) } os.Exit(exitCode) case "init": // this is executed inside of the namespace to setup the container - log.SetPrefix("[nsinit init] ") cwd, err := os.Getwd() if err != nil { log.Fatal(err) @@ -82,7 +78,7 @@ func main() { if err != nil { log.Fatal(err) } - if err := nsinit.Init(container, cwd, console, syncPipe, flag.Args()[1:]); err != nil { + if err := ns.Init(container, cwd, console, syncPipe, flag.Args()[1:]); err != nil { log.Fatal(err) } default: @@ -116,19 +112,27 @@ func readPid() (int, error) { return pid, nil } -func setupLogging() (err error) { +func newNsInit() (nsinit.NsInit, error) { + logger, err := setupLogging() + if err != nil { + return nil, err + } + return nsinit.NewNsInit(logger, logFile, &nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{}), nil +} + +func setupLogging() (logger *log.Logger, err error) { var writer io.Writer + switch logFile { case "stderr": writer = os.Stderr case "none", "": writer = 
ioutil.Discard default: - writer, err = os.OpenFile(logFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0755) - if err != nil { - return err + if writer, err = os.OpenFile(logFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0755); err != nil { + return } } - log.SetOutput(writer) - return nil + logger = log.New(writer, "", log.LstdFlags) + return } From 6daf56799fc9a6c024bcdfe15957d91b02437bd0 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 24 Feb 2014 21:11:52 -0800 Subject: [PATCH 076/117] Refactor and improve libcontainer and driver Remove logging for now because it is complicating things Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/network/veth.go | 3 ++ libcontainer/nsinit/command.go | 8 ++-- libcontainer/nsinit/exec.go | 35 ++-------------- libcontainer/nsinit/execin.go | 6 +-- libcontainer/nsinit/init.go | 64 ++++++++---------------------- libcontainer/nsinit/nsinit.go | 9 ++--- libcontainer/nsinit/nsinit/main.go | 26 +----------- libcontainer/nsinit/state.go | 2 + libcontainer/utils/utils.go | 11 +++++ system/calls_linux.go | 9 +++++ system/pty_linux.go | 27 +++++++++++++ 11 files changed, 84 insertions(+), 116 deletions(-) diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go index 321c68e..49e63f0 100644 --- a/libcontainer/network/veth.go +++ b/libcontainer/network/veth.go @@ -6,6 +6,9 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer/utils" ) +// Veth is a network strategy that uses a bridge and creates +// a veth pair, one that stays outside on the host and the other +// is placed inside the container's namespace type Veth struct { } diff --git a/libcontainer/nsinit/command.go b/libcontainer/nsinit/command.go index b1c5631..5eb378a 100644 --- a/libcontainer/nsinit/command.go +++ b/libcontainer/nsinit/command.go @@ -8,8 +8,11 @@ import ( "syscall" ) +// CommandFactory takes the container's configuration and options passed by the +// parent processes and creates an *exec.Cmd that will be used to 
fork/exec the +// namespaced init process type CommandFactory interface { - Create(container *libcontainer.Container, console, logFile string, syncFd uintptr, args []string) *exec.Cmd + Create(container *libcontainer.Container, console string, syncFd uintptr, args []string) *exec.Cmd } type DefaultCommandFactory struct{} @@ -17,13 +20,12 @@ type DefaultCommandFactory struct{} // Create will return an exec.Cmd with the Cloneflags set to the proper namespaces // defined on the container's configuration and use the current binary as the init with the // args provided -func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console, logFile string, pipe uintptr, args []string) *exec.Cmd { +func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console string, pipe uintptr, args []string) *exec.Cmd { // get our binary name so we can always reexec ourself name := os.Args[0] command := exec.Command(name, append([]string{ "-console", console, "-pipe", fmt.Sprint(pipe), - "-log", logFile, "init"}, args...)...) 
command.SysProcAttr = &syscall.SysProcAttr{ diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index c407323..b13326b 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -28,31 +28,27 @@ func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args [ } if container.Tty { - ns.logger.Printf("setting up master and console") - master, console, err = CreateMasterAndConsole() + master, console, err = system.CreateMasterAndConsole() if err != nil { return -1, err } term.SetMaster(master) } - command := ns.commandFactory.Create(container, console, ns.logFile, syncPipe.child.Fd(), args) + command := ns.commandFactory.Create(container, console, syncPipe.child.Fd(), args) if err := term.Attach(command); err != nil { return -1, err } defer term.Close() - ns.logger.Printf("staring init") if err := command.Start(); err != nil { return -1, err } - ns.logger.Printf("writing state file") if err := ns.stateWriter.WritePid(command.Process.Pid); err != nil { command.Process.Kill() return -1, err } defer func() { - ns.logger.Printf("removing state file") ns.stateWriter.DeletePid() }() @@ -68,24 +64,18 @@ func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args [ } // Sync with child - ns.logger.Printf("closing sync pipes") syncPipe.Close() - ns.logger.Printf("waiting on process") if err := command.Wait(); err != nil { if _, ok := err.(*exec.ExitError); !ok { return -1, err } } - - exitCode := command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus() - ns.logger.Printf("process ended with exit code %d", exitCode) - return exitCode, nil + return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) error { if container.Cgroups != nil { - ns.logger.Printf("setting up cgroups") if err := container.Cgroups.Apply(nspid); err != nil { return err } @@ -95,7 +85,6 @@ func (ns *linuxNs) SetupCgroups(container 
*libcontainer.Container, nspid int) er func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error { if container.Network != nil { - ns.logger.Printf("creating host network configuration type %s", container.Network.Type) strategy, err := network.GetStrategy(container.Network.Type) if err != nil { return err @@ -104,27 +93,9 @@ func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid if err != nil { return err } - ns.logger.Printf("sending %v as network context", networkContext) if err := pipe.SendToChild(networkContext); err != nil { return err } } return nil } - -// CreateMasterAndConsole will open /dev/ptmx on the host and retreive the -// pts name for use as the pty slave inside the container -func CreateMasterAndConsole() (*os.File, string, error) { - master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) - if err != nil { - return nil, "", err - } - console, err := system.Ptsname(master) - if err != nil { - return nil, "", err - } - if err := system.Unlockpt(master); err != nil { - return nil, "", err - } - return master, console, nil -} diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 9c33f69..463196c 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -18,7 +18,7 @@ func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []s return -1, err } } - fds, err := getNsFds(nspid, container) + fds, err := ns.getNsFds(nspid, container) closeFds := func() { for _, f := range fds { system.Closefd(f) @@ -75,13 +75,13 @@ dropAndExec: if err := capabilities.DropCapabilities(container); err != nil { return -1, fmt.Errorf("drop capabilities %s", err) } - if err := system.Exec(args[0], args[0:], container.Env); err != nil { + if err := system.Execv(args[0], args[0:], container.Env); err != nil { return -1, err } panic("unreachable") } -func getNsFds(pid int, container 
*libcontainer.Container) ([]uintptr, error) { +func (ns *linuxNs) getNsFds(pid int, container *libcontainer.Container) ([]uintptr, error) { fds := make([]uintptr, len(container.Namespaces)) for i, ns := range container.Namespaces { f, err := os.OpenFile(filepath.Join("/proc/", strconv.Itoa(pid), "ns", namespaceFileMap[ns]), os.O_RDONLY, 0) diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 5e33169..1229560 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -7,18 +7,17 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/libcontainer/network" + "github.com/dotcloud/docker/pkg/libcontainer/utils" "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/user" "os" - "os/exec" - "path/filepath" "syscall" ) // Init is the init process that first runs inside a new namespace to setup mounts, users, networking, // and other options required for the new container. 
func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { - rootfs, err := resolveRootfs(uncleanRootfs) + rootfs, err := utils.ResolveRootfs(uncleanRootfs) if err != nil { return err } @@ -34,7 +33,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol if console != "" { // close pipes so that we can replace it with the pty closeStdPipes() - slave, err := openTerminal(console, syscall.O_RDWR) + slave, err := system.OpenTerminal(console, syscall.O_RDWR) if err != nil { return fmt.Errorf("open terminal %s", err) } @@ -50,6 +49,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol return fmt.Errorf("setctty %s", err) } } + if err := system.ParentDeathSignal(); err != nil { return fmt.Errorf("parent deth signal %s", err) } @@ -73,18 +73,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) } } - return execArgs(args, container.Env) -} - -func execArgs(args []string, env []string) error { - name, err := exec.LookPath(args[0]) - if err != nil { - return err - } - if err := system.Exec(name, args[0:], env); err != nil { - return fmt.Errorf("exec %s", err) - } - panic("unreachable") + return system.Execv(args[0], args[0:], container.Env) } func closeStdPipes() { @@ -93,18 +82,19 @@ func closeStdPipes() { os.Stderr.Close() } -// resolveRootfs ensures that the current working directory is -// not a symlink and returns the absolute path to the rootfs -func resolveRootfs(uncleanRootfs string) (string, error) { - rootfs, err := filepath.Abs(uncleanRootfs) - if err != nil { - return "", err - } - return filepath.EvalSymlinks(rootfs) -} - func setupUser(container *libcontainer.Container) error { - if container.User != "" && container.User != "root" { + switch container.User { + case "root", "": + if err := system.Setgroups(nil); err != nil { + return 
err + } + if err := system.Setresgid(0, 0, 0); err != nil { + return err + } + if err := system.Setresuid(0, 0, 0); err != nil { + return err + } + default: uid, gid, suppGids, err := user.GetUserGroupSupplementary(container.User, syscall.Getuid(), syscall.Getgid()) if err != nil { return err @@ -118,16 +108,6 @@ func setupUser(container *libcontainer.Container) error { if err := system.Setuid(uid); err != nil { return err } - } else { - if err := system.Setgroups(nil); err != nil { - return err - } - if err := system.Setresgid(0, 0, 0); err != nil { - return err - } - if err := system.Setresuid(0, 0, 0); err != nil { - return err - } } return nil } @@ -147,16 +127,6 @@ func dupSlave(slave *os.File) error { return nil } -// openTerminal is a clone of os.OpenFile without the O_CLOEXEC -// used to open the pty slave inside the container namespace -func openTerminal(name string, flag int) (*os.File, error) { - r, e := syscall.Open(name, flag, 0) - if e != nil { - return nil, &os.PathError{"open", name, e} - } - return os.NewFile(uintptr(r), name), nil -} - // setupVethNetwork uses the Network config if it is not nil to initialize // the new veth interface inside the container for use by changing the name to eth0 // setting the MTU and IP address along with the default gateway diff --git a/libcontainer/nsinit/nsinit.go b/libcontainer/nsinit/nsinit.go index 599461e..f09a130 100644 --- a/libcontainer/nsinit/nsinit.go +++ b/libcontainer/nsinit/nsinit.go @@ -2,9 +2,10 @@ package nsinit import ( "github.com/dotcloud/docker/pkg/libcontainer" - "log" ) +// NsInit is an interface with the public facing methods to provide high level +// exec operations on a container type NsInit interface { Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) @@ -13,17 +14,13 @@ type NsInit interface { type linuxNs struct { root string - logFile string - logger *log.Logger 
commandFactory CommandFactory stateWriter StateWriter } -func NewNsInit(logger *log.Logger, logFile string, command CommandFactory, state StateWriter) NsInit { +func NewNsInit(command CommandFactory, state StateWriter) NsInit { return &linuxNs{ - logger: logger, commandFactory: command, stateWriter: state, - logFile: logFile, } } diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index c25037f..e385e7f 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -6,7 +6,6 @@ import ( "flag" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/nsinit" - "io" "io/ioutil" "log" "os" @@ -16,7 +15,6 @@ import ( var ( console string pipeFd int - logFile string ) var ( @@ -26,7 +24,6 @@ var ( func registerFlags() { flag.StringVar(&console, "console", "", "console (pty slave) path") - flag.StringVar(&logFile, "log", "none", "log options (none, stderr, or a file path)") flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") flag.Parse() @@ -113,26 +110,5 @@ func readPid() (int, error) { } func newNsInit() (nsinit.NsInit, error) { - logger, err := setupLogging() - if err != nil { - return nil, err - } - return nsinit.NewNsInit(logger, logFile, &nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{}), nil -} - -func setupLogging() (logger *log.Logger, err error) { - var writer io.Writer - - switch logFile { - case "stderr": - writer = os.Stderr - case "none", "": - writer = ioutil.Discard - default: - if writer, err = os.OpenFile(logFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0755); err != nil { - return - } - } - logger = log.New(writer, "", log.LstdFlags) - return + return nsinit.NewNsInit(&nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{}), nil } diff --git a/libcontainer/nsinit/state.go b/libcontainer/nsinit/state.go index 2dbaaa5..5c719e1 100644 --- a/libcontainer/nsinit/state.go +++ b/libcontainer/nsinit/state.go @@ -7,6 +7,8 @@ import ( "path/filepath" ) +// 
StateWriter handles writing and deleting the pid file +// on disk type StateWriter interface { WritePid(pid int) error DeletePid() error diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go index 5050997..0d919bc 100644 --- a/libcontainer/utils/utils.go +++ b/libcontainer/utils/utils.go @@ -4,6 +4,7 @@ import ( "crypto/rand" "encoding/hex" "io" + "path/filepath" ) // GenerateRandomName returns a new name joined with a prefix. This size @@ -15,3 +16,13 @@ func GenerateRandomName(prefix string, size int) (string, error) { } return prefix + hex.EncodeToString(id)[:size], nil } + +// ResolveRootfs ensures that the current working directory is +// not a symlink and returns the absolute path to the rootfs +func ResolveRootfs(uncleanRootfs string) (string, error) { + rootfs, err := filepath.Abs(uncleanRootfs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} diff --git a/system/calls_linux.go b/system/calls_linux.go index 0bf42e3..b7a8f14 100644 --- a/system/calls_linux.go +++ b/system/calls_linux.go @@ -1,6 +1,7 @@ package system import ( + "os/exec" "syscall" ) @@ -16,6 +17,14 @@ func Exec(cmd string, args []string, env []string) error { return syscall.Exec(cmd, args, env) } +func Execv(cmd string, args []string, env []string) error { + name, err := exec.LookPath(cmd) + if err != nil { + return err + } + return Exec(name, args, env) +} + func Fork() (int, error) { syscall.ForkLock.Lock() pid, _, err := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) diff --git a/system/pty_linux.go b/system/pty_linux.go index b281b71..ca588d8 100644 --- a/system/pty_linux.go +++ b/system/pty_linux.go @@ -24,8 +24,35 @@ func Ptsname(f *os.File) (string, error) { return fmt.Sprintf("/dev/pts/%d", n), nil } +// CreateMasterAndConsole will open /dev/ptmx on the host and retreive the +// pts name for use as the pty slave inside the container +func CreateMasterAndConsole() (*os.File, string, error) { + master, err := os.OpenFile("/dev/ptmx", 
syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) + if err != nil { + return nil, "", err + } + console, err := Ptsname(master) + if err != nil { + return nil, "", err + } + if err := Unlockpt(master); err != nil { + return nil, "", err + } + return master, console, nil +} + // OpenPtmx opens /dev/ptmx, i.e. the PTY master. func OpenPtmx() (*os.File, error) { // O_NOCTTY and O_CLOEXEC are not present in os package so we use the syscall's one for all. return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) } + +// OpenTerminal is a clone of os.OpenFile without the O_CLOEXEC +// used to open the pty slave inside the container namespace +func OpenTerminal(name string, flag int) (*os.File, error) { + r, e := syscall.Open(name, flag, 0) + if e != nil { + return nil, &os.PathError{"open", name, e} + } + return os.NewFile(uintptr(r), name), nil +} From 357ca32831c94462f68cba3dafa78a6b739f07b7 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Mon, 24 Feb 2014 21:52:29 -0800 Subject: [PATCH 077/117] Better capability/namespace management Docker-DCO-1.1-Signed-off-by: Guillaume J. 
Charmes (github: creack) --- libcontainer/capabilities/capabilities.go | 20 +-- libcontainer/nsinit/execin.go | 7 +- libcontainer/nsinit/ns_linux.go | 24 +--- libcontainer/types.go | 155 ++++++++++++++++------ 4 files changed, 119 insertions(+), 87 deletions(-) diff --git a/libcontainer/capabilities/capabilities.go b/libcontainer/capabilities/capabilities.go index 65fd455..3c6d752 100644 --- a/libcontainer/capabilities/capabilities.go +++ b/libcontainer/capabilities/capabilities.go @@ -6,24 +6,6 @@ import ( "os" ) -var capMap = map[libcontainer.Capability]capability.Cap{ - libcontainer.CAP_SETPCAP: capability.CAP_SETPCAP, - libcontainer.CAP_SYS_MODULE: capability.CAP_SYS_MODULE, - libcontainer.CAP_SYS_RAWIO: capability.CAP_SYS_RAWIO, - libcontainer.CAP_SYS_PACCT: capability.CAP_SYS_PACCT, - libcontainer.CAP_SYS_ADMIN: capability.CAP_SYS_ADMIN, - libcontainer.CAP_SYS_NICE: capability.CAP_SYS_NICE, - libcontainer.CAP_SYS_RESOURCE: capability.CAP_SYS_RESOURCE, - libcontainer.CAP_SYS_TIME: capability.CAP_SYS_TIME, - libcontainer.CAP_SYS_TTY_CONFIG: capability.CAP_SYS_TTY_CONFIG, - libcontainer.CAP_MKNOD: capability.CAP_MKNOD, - libcontainer.CAP_AUDIT_WRITE: capability.CAP_AUDIT_WRITE, - libcontainer.CAP_AUDIT_CONTROL: capability.CAP_AUDIT_CONTROL, - libcontainer.CAP_MAC_OVERRIDE: capability.CAP_MAC_OVERRIDE, - libcontainer.CAP_MAC_ADMIN: capability.CAP_MAC_ADMIN, - libcontainer.CAP_NET_ADMIN: capability.CAP_NET_ADMIN, -} - // DropCapabilities drops capabilities for the current process based // on the container's configuration. 
func DropCapabilities(container *libcontainer.Container) error { @@ -45,7 +27,7 @@ func DropCapabilities(container *libcontainer.Container) error { func getCapabilities(container *libcontainer.Container) []capability.Cap { drop := []capability.Cap{} for _, c := range container.Capabilities { - drop = append(drop, capMap[c]) + drop = append(drop, c.Value) } return drop } diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 463196c..306250c 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -14,7 +14,7 @@ import ( // ExecIn uses an existing pid and joins the pid's namespaces with the new command. func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) { for _, ns := range container.Namespaces { - if err := system.Unshare(namespaceMap[ns]); err != nil { + if err := system.Unshare(ns.Value); err != nil { return -1, err } } @@ -42,8 +42,7 @@ func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []s // if the container has a new pid and mount namespace we need to // remount proc and sys to pick up the changes - if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && - container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { + if container.Namespaces.Contains("CLONE_NEWNS") && container.Namespaces.Contains("CLONE_NEWPID") { pid, err := system.Fork() if err != nil { return -1, err @@ -84,7 +83,7 @@ dropAndExec: func (ns *linuxNs) getNsFds(pid int, container *libcontainer.Container) ([]uintptr, error) { fds := make([]uintptr, len(container.Namespaces)) for i, ns := range container.Namespaces { - f, err := os.OpenFile(filepath.Join("/proc/", strconv.Itoa(pid), "ns", namespaceFileMap[ns]), os.O_RDONLY, 0) + f, err := os.OpenFile(filepath.Join("/proc/", strconv.Itoa(pid), "ns", ns.File), os.O_RDONLY, 0) if err != nil { return fds, err } diff --git a/libcontainer/nsinit/ns_linux.go b/libcontainer/nsinit/ns_linux.go index 58af247..ab6322e 100644 
--- a/libcontainer/nsinit/ns_linux.go +++ b/libcontainer/nsinit/ns_linux.go @@ -2,35 +2,13 @@ package nsinit import ( "github.com/dotcloud/docker/pkg/libcontainer" - "syscall" ) -var namespaceMap = map[libcontainer.Namespace]int{ - libcontainer.CLONE_NEWNS: syscall.CLONE_NEWNS, - libcontainer.CLONE_NEWUTS: syscall.CLONE_NEWUTS, - libcontainer.CLONE_NEWIPC: syscall.CLONE_NEWIPC, - libcontainer.CLONE_NEWUSER: syscall.CLONE_NEWUSER, - libcontainer.CLONE_NEWPID: syscall.CLONE_NEWPID, - libcontainer.CLONE_NEWNET: syscall.CLONE_NEWNET, -} - -// namespaceFileMap is used to convert the libcontainer types -// into the names of the files located in /proc//ns/* for -// each namespace -var namespaceFileMap = map[libcontainer.Namespace]string{ - libcontainer.CLONE_NEWNS: "mnt", - libcontainer.CLONE_NEWUTS: "uts", - libcontainer.CLONE_NEWIPC: "ipc", - libcontainer.CLONE_NEWUSER: "user", - libcontainer.CLONE_NEWPID: "pid", - libcontainer.CLONE_NEWNET: "net", -} - // getNamespaceFlags parses the container's Namespaces options to set the correct // flags on clone, unshare, and setns func GetNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { for _, ns := range namespaces { - flag |= namespaceMap[ns] + flag |= ns.Value } return flag } diff --git a/libcontainer/types.go b/libcontainer/types.go index bb54ff5..cb64db1 100644 --- a/libcontainer/types.go +++ b/libcontainer/types.go @@ -1,58 +1,131 @@ package libcontainer -// These constants are defined as string types so that -// it is clear when adding the configuration in config files -// instead of using ints or other types -const ( - CAP_SETPCAP Capability = "SETPCAP" - CAP_SYS_MODULE Capability = "SYS_MODULE" - CAP_SYS_RAWIO Capability = "SYS_RAWIO" - CAP_SYS_PACCT Capability = "SYS_PACCT" - CAP_SYS_ADMIN Capability = "SYS_ADMIN" - CAP_SYS_NICE Capability = "SYS_NICE" - CAP_SYS_RESOURCE Capability = "SYS_RESOURCE" - CAP_SYS_TIME Capability = "SYS_TIME" - CAP_SYS_TTY_CONFIG Capability = "SYS_TTY_CONFIG" - CAP_MKNOD 
Capability = "MKNOD" - CAP_AUDIT_WRITE Capability = "AUDIT_WRITE" - CAP_AUDIT_CONTROL Capability = "AUDIT_CONTROL" - CAP_MAC_OVERRIDE Capability = "MAC_OVERRIDE" - CAP_MAC_ADMIN Capability = "MAC_ADMIN" - CAP_NET_ADMIN Capability = "NET_ADMIN" +import ( + "encoding/json" + "errors" + "github.com/syndtr/gocapability/capability" + "os" + "syscall" +) - CLONE_NEWNS Namespace = "NEWNS" // mount - CLONE_NEWUTS Namespace = "NEWUTS" // utsname - CLONE_NEWIPC Namespace = "NEWIPC" // ipc - CLONE_NEWUSER Namespace = "NEWUSER" // user - CLONE_NEWPID Namespace = "NEWPID" // pid - CLONE_NEWNET Namespace = "NEWNET" // network +var ( + ErrUnkownNamespace error = errors.New("Unkown namespace") +) + +// namespaceList is used to convert the libcontainer types +// into the names of the files located in /proc//ns/* for +// each namespace +var ( + namespaceList = Namespaces{ + {Key: "NEWNS", Value: syscall.CLONE_NEWNS, File: "mnt"}, + {Key: "NEWUTS", Value: syscall.CLONE_NEWUTS, File: "uts"}, + {Key: "NEWIPC", Value: syscall.CLONE_NEWIPC, File: "ipc"}, + {Key: "NEWUSER", Value: syscall.CLONE_NEWUSER, File: "user"}, + {Key: "NEWPID", Value: syscall.CLONE_NEWPID, File: "pid"}, + {Key: "NEWNET", Value: syscall.CLONE_NEWNET, File: "net"}, + } + capabilityList = Capabilities{ + {Key: "SETPCAP", Value: capability.CAP_SETPCAP}, + {Key: "SYS_MODULE", Value: capability.CAP_SYS_MODULE}, + {Key: "SYS_RAWIO", Value: capability.CAP_SYS_RAWIO}, + {Key: "SYS_PACCT", Value: capability.CAP_SYS_PACCT}, + {Key: "SYS_ADMIN", Value: capability.CAP_SYS_ADMIN}, + {Key: "SYS_NICE", Value: capability.CAP_SYS_NICE}, + {Key: "SYS_RESOURCE", Value: capability.CAP_SYS_RESOURCE}, + {Key: "SYS_TIME", Value: capability.CAP_SYS_TIME}, + {Key: "SYS_TTY_CONFIG", Value: capability.CAP_SYS_TTY_CONFIG}, + {Key: "MKNOD", Value: capability.CAP_MKNOD}, + {Key: "AUDIT_WRITE", Value: capability.CAP_AUDIT_WRITE}, + {Key: "AUDIT_CONTROL", Value: capability.CAP_AUDIT_CONTROL}, + {Key: "MAC_OVERRIDE", Value: 
capability.CAP_MAC_OVERRIDE}, + {Key: "MAC_ADMIN", Value: capability.CAP_MAC_ADMIN}, + {Key: "NET_ADMIN", Value: capability.CAP_NET_ADMIN}, + } ) type ( - Namespace string - Namespaces []Namespace - Capability string - Capabilities []Capability + Namespace struct { + Key string + Value int + File string + } + Namespaces []*Namespace ) +func (ns *Namespace) MarshalJSON() ([]byte, error) { + return json.Marshal(ns.Key) +} + +func (ns *Namespace) UnmarshalJSON(src []byte) error { + var nsName string + if err := json.Unmarshal(src, &nsName); err != nil { + return err + } + ret := GetNamespace(nsName) + if ret == nil { + return ErrUnkownNamespace + } + *ns = *ret + return nil +} + +func GetNamespace(key string) *Namespace { + for _, ns := range namespaceList { + if ns.Key == key { + return ns + } + } + if os.Getenv("DEBUG") != "" { + panic("Unreachable: Namespace not found") + } + return nil +} + // Contains returns true if the specified Namespace is // in the slice -func (n Namespaces) Contains(ns Namespace) bool { - for _, nns := range n { - if nns == ns { - return true +func (n Namespaces) Contains(ns string) bool { + return GetNamespace(ns) != nil +} + +type ( + Capability struct { + Key string + Value capability.Cap + } + Capabilities []*Capability +) + +func (ns *Capability) MarshalJSON() ([]byte, error) { + return json.Marshal(ns.Key) +} + +func (ns *Capability) UnmarshalJSON(src []byte) error { + var capName string + if err := json.Unmarshal(src, &capName); err != nil { + return err + } + ret := GetCapability(capName) + if ret == nil { + return ErrUnkownNamespace + } + *ns = *ret + return nil +} + +func GetCapability(key string) *Capability { + for _, capp := range capabilityList { + if capp.Key == key { + return capp } } - return false + if os.Getenv("DEBUG") != "" { + panic("Unreachable: Namespace not found") + } + return nil } // Contains returns true if the specified Capability is // in the slice -func (c Capabilities) Contains(capp Capability) bool { - for 
_, cc := range c { - if cc == capp { - return true - } - } - return false +func (c Capabilities) Contains(capp string) bool { + return GetCapability(capp) != nil } From 5982af496947d5049b7b20caf6ebfd5071d1ef6d Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 25 Feb 2014 10:54:41 -0800 Subject: [PATCH 078/117] Address initial feedback from pr Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- system/setns_linux.go | 21 ++++++++++++++++++++- system/setns_linux_amd64.go | 8 -------- 2 files changed, 20 insertions(+), 9 deletions(-) delete mode 100644 system/setns_linux_amd64.go diff --git a/system/setns_linux.go b/system/setns_linux.go index be6f3ed..07b1c93 100644 --- a/system/setns_linux.go +++ b/system/setns_linux.go @@ -1,11 +1,30 @@ package system import ( + "errors" + "fmt" + "runtime" "syscall" ) +var ( + ErrNotSupportedPlatform = errors.New("platform and architecture is not supported") +) + +// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 +// +// We need different setns values for the different platforms and arch +// We are declaring the macro here because the SETNS syscall does not exist in th stdlib +var setNsMap = map[string]uintptr{ + "linux/amd64": 308, +} + func Setns(fd uintptr, flags uintptr) error { - _, _, err := syscall.RawSyscall(SYS_SETNS, fd, flags, 0) + ns, exists := setNsMap[fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH)] + if !exists { + return ErrNotSupportedPlatform + } + _, _, err := syscall.RawSyscall(ns, fd, flags, 0) if err != 0 { return err } diff --git a/system/setns_linux_amd64.go b/system/setns_linux_amd64.go deleted file mode 100644 index 4e30625..0000000 --- a/system/setns_linux_amd64.go +++ /dev/null @@ -1,8 +0,0 @@ -// +build linux,amd64 - -package system - -// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 -const ( - SYS_SETNS = 308 -) From 
2acaf7ca82a249db5bcef6f67f702c591b7413d3 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 25 Feb 2014 12:41:31 -0800 Subject: [PATCH 079/117] Move container.json and pid file into a root specific driver dir Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/execin.go | 2 +- libcontainer/nsinit/nsinit/main.go | 12 +++++++----- libcontainer/nsinit/state.go | 6 +++--- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 306250c..253fbdc 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -42,7 +42,7 @@ func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []s // if the container has a new pid and mount namespace we need to // remount proc and sys to pick up the changes - if container.Namespaces.Contains("CLONE_NEWNS") && container.Namespaces.Contains("CLONE_NEWPID") { + if container.Namespaces.Contains("NEWNS") && container.Namespaces.Contains("NEWPID") { pid, err := system.Fork() if err != nil { return -1, err diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index e385e7f..e6b020b 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -9,12 +9,13 @@ import ( "io/ioutil" "log" "os" + "path/filepath" "strconv" ) var ( - console string - pipeFd int + root, console string + pipeFd int ) var ( @@ -25,6 +26,7 @@ var ( func registerFlags() { flag.StringVar(&console, "console", "", "console (pty slave) path") flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") + flag.StringVar(&root, "root", ".", "root for storing configuration data") flag.Parse() } @@ -84,7 +86,7 @@ func main() { } func loadContainer() (*libcontainer.Container, error) { - f, err := os.Open("container.json") + f, err := os.Open(filepath.Join(root, "container.json")) if err != nil { return nil, err } @@ -98,7 +100,7 @@ func loadContainer() 
(*libcontainer.Container, error) { } func readPid() (int, error) { - data, err := ioutil.ReadFile(".nspid") + data, err := ioutil.ReadFile(filepath.Join(root, "pid")) if err != nil { return -1, err } @@ -110,5 +112,5 @@ func readPid() (int, error) { } func newNsInit() (nsinit.NsInit, error) { - return nsinit.NewNsInit(&nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{}), nil + return nsinit.NewNsInit(&nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{root}), nil } diff --git a/libcontainer/nsinit/state.go b/libcontainer/nsinit/state.go index 5c719e1..af38008 100644 --- a/libcontainer/nsinit/state.go +++ b/libcontainer/nsinit/state.go @@ -18,11 +18,11 @@ type DefaultStateWriter struct { Root string } -// writePidFile writes the namespaced processes pid to .nspid in the rootfs for the container +// writePidFile writes the namespaced processes pid to pid in the rootfs for the container func (d *DefaultStateWriter) WritePid(pid int) error { - return ioutil.WriteFile(filepath.Join(d.Root, ".nspid"), []byte(fmt.Sprint(pid)), 0655) + return ioutil.WriteFile(filepath.Join(d.Root, "pid"), []byte(fmt.Sprint(pid)), 0655) } func (d *DefaultStateWriter) DeletePid() error { - return os.Remove(filepath.Join(d.Root, ".nspid")) + return os.Remove(filepath.Join(d.Root, "pid")) } From 98f0a748a7a6ca7b268a5da17b156be918400e3b Mon Sep 17 00:00:00 2001 From: Victor Vieux Date: Fri, 21 Feb 2014 23:15:28 +0000 Subject: [PATCH 080/117] add version pkg Docker-DCO-1.1-Signed-off-by: Victor Vieux (github: vieux) --- version/version.go | 52 +++++++++++++++++++++++++++++++++++++++++ version/version_test.go | 25 ++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 version/version.go create mode 100644 version/version_test.go diff --git a/version/version.go b/version/version.go new file mode 100644 index 0000000..3721d64 --- /dev/null +++ b/version/version.go @@ -0,0 +1,52 @@ +package version + +import ( + "strconv" + "strings" +) + +type Version string + 
+func (me Version) compareTo(other string) int { + var ( + meTab = strings.Split(string(me), ".") + otherTab = strings.Split(other, ".") + ) + for i, s := range meTab { + var meInt, otherInt int + meInt, _ = strconv.Atoi(s) + if len(otherTab) > i { + otherInt, _ = strconv.Atoi(otherTab[i]) + } + if meInt > otherInt { + return 1 + } + if otherInt > meInt { + return -1 + } + } + if len(otherTab) > len(meTab) { + return -1 + } + return 0 +} + +func (me Version) LessThan(other string) bool { + return me.compareTo(other) == -1 +} + +func (me Version) LessThanOrEqualTo(other string) bool { + return me.compareTo(other) <= 0 +} + +func (me Version) GreaterThan(other string) bool { + return me.compareTo(other) == 1 +} + +func (me Version) GreaterThanOrEqualTo(other string) bool { + return me.compareTo(other) >= 0 +} + +func (me Version) Equal(other string) bool { + return me.compareTo(other) == 0 +} diff --git a/version/version_test.go b/version/version_test.go new file mode 100644 index 0000000..4bebd0c --- /dev/null +++ b/version/version_test.go @@ -0,0 +1,25 @@ +package version + +import ( + "testing" +) + +func assertVersion(t *testing.T, a, b string, result int) { + if r := Version(a).compareTo(b); r != result { + t.Fatalf("Unexpected version comparison result. 
Found %d, expected %d", r, result) + } +} + +func TestCompareVersion(t *testing.T) { + assertVersion(t, "1.12", "1.12", 0) + assertVersion(t, "1.05.00.0156", "1.0.221.9289", 1) + assertVersion(t, "1", "1.0.1", -1) + assertVersion(t, "1.0.1", "1", 1) + assertVersion(t, "1.0.1", "1.0.2", -1) + assertVersion(t, "1.0.2", "1.0.3", -1) + assertVersion(t, "1.0.3", "1.1", -1) + assertVersion(t, "1.1", "1.1.1", -1) + assertVersion(t, "1.1.1", "1.1.2", -1) + assertVersion(t, "1.1.2", "1.2", -1) + +} From f85823b53de774a6d690832b8f27bc1bde0f01bc Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 25 Feb 2014 15:19:13 -0800 Subject: [PATCH 081/117] Fix cross compile for make cross Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/command.go | 25 ++++++++++++++------- libcontainer/nsinit/execin.go | 2 ++ libcontainer/nsinit/ns_linux.go | 14 ------------ libcontainer/nsinit/nsinit/main.go | 12 +++------- libcontainer/nsinit/unsupported.go | 19 ++++++++++++++++ libcontainer/types.go | 35 ++++++++++++++++-------------- libcontainer/types_linux.go | 16 ++++++++++++++ system/calls_linux.go | 7 ++++++ system/errors.go | 9 ++++++++ system/setns_linux.go | 5 ----- system/unsupported.go | 15 +++++++++++++ 11 files changed, 107 insertions(+), 52 deletions(-) delete mode 100644 libcontainer/nsinit/ns_linux.go create mode 100644 libcontainer/nsinit/unsupported.go create mode 100644 libcontainer/types_linux.go create mode 100644 system/errors.go create mode 100644 system/unsupported.go diff --git a/libcontainer/nsinit/command.go b/libcontainer/nsinit/command.go index 5eb378a..8ddf1e7 100644 --- a/libcontainer/nsinit/command.go +++ b/libcontainer/nsinit/command.go @@ -3,9 +3,9 @@ package nsinit import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/system" "os" "os/exec" - "syscall" ) // CommandFactory takes the container's configuration and options passed by the @@ -15,22 +15,31 @@ type CommandFactory 
interface { Create(container *libcontainer.Container, console string, syncFd uintptr, args []string) *exec.Cmd } -type DefaultCommandFactory struct{} +type DefaultCommandFactory struct { + Root string +} // Create will return an exec.Cmd with the Cloneflags set to the proper namespaces // defined on the container's configuration and use the current binary as the init with the // args provided func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console string, pipe uintptr, args []string) *exec.Cmd { - // get our binary name so we can always reexec ourself - name := os.Args[0] - command := exec.Command(name, append([]string{ + // get our binary name from arg0 so we can always reexec ourself + command := exec.Command(os.Args[0], append([]string{ "-console", console, "-pipe", fmt.Sprint(pipe), + "-root", c.Root, "init"}, args...)...) - command.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: uintptr(GetNamespaceFlags(container.Namespaces)), - } + system.SetCloneFlags(command, uintptr(GetNamespaceFlags(container.Namespaces))) command.Env = container.Env return command } + +// GetNamespaceFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare, and setns +func GetNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { + for _, ns := range namespaces { + flag |= ns.Value + } + return flag +} diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 253fbdc..55f7b96 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -1,3 +1,5 @@ +// +build linux + package nsinit import ( diff --git a/libcontainer/nsinit/ns_linux.go b/libcontainer/nsinit/ns_linux.go deleted file mode 100644 index ab6322e..0000000 --- a/libcontainer/nsinit/ns_linux.go +++ /dev/null @@ -1,14 +0,0 @@ -package nsinit - -import ( - "github.com/dotcloud/docker/pkg/libcontainer" -) - -// getNamespaceFlags parses the container's Namespaces options to set the correct -// flags on clone, 
unshare, and setns -func GetNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { - for _, ns := range namespaces { - flag |= ns.Value - } - return flag -} diff --git a/libcontainer/nsinit/nsinit/main.go b/libcontainer/nsinit/nsinit/main.go index e6b020b..61921c5 100644 --- a/libcontainer/nsinit/nsinit/main.go +++ b/libcontainer/nsinit/nsinit/main.go @@ -2,7 +2,6 @@ package main import ( "encoding/json" - "errors" "flag" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/nsinit" @@ -18,11 +17,6 @@ var ( pipeFd int ) -var ( - ErrUnsupported = errors.New("Unsupported method") - ErrWrongArguments = errors.New("Wrong argument count") -) - func registerFlags() { flag.StringVar(&console, "console", "", "console (pty slave) path") flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") @@ -35,7 +29,7 @@ func main() { registerFlags() if flag.NArg() < 1 { - log.Fatal(ErrWrongArguments) + log.Fatalf("wrong number of argments %d", flag.NArg()) } container, err := loadContainer() if err != nil { @@ -71,7 +65,7 @@ func main() { log.Fatal(err) } if flag.NArg() < 2 { - log.Fatal(ErrWrongArguments) + log.Fatalf("wrong number of argments %d", flag.NArg()) } syncPipe, err := nsinit.NewSyncPipeFromFd(0, uintptr(pipeFd)) if err != nil { @@ -112,5 +106,5 @@ func readPid() (int, error) { } func newNsInit() (nsinit.NsInit, error) { - return nsinit.NewNsInit(&nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{root}), nil + return nsinit.NewNsInit(&nsinit.DefaultCommandFactory{root}, &nsinit.DefaultStateWriter{root}), nil } diff --git a/libcontainer/nsinit/unsupported.go b/libcontainer/nsinit/unsupported.go new file mode 100644 index 0000000..2412223 --- /dev/null +++ b/libcontainer/nsinit/unsupported.go @@ -0,0 +1,19 @@ +// +build !linux + +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" +) + +func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) { + return -1, 
libcontainer.ErrUnsupported +} + +func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) { + return -1, libcontainer.ErrUnsupported +} + +func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { + return libcontainer.ErrUnsupported +} diff --git a/libcontainer/types.go b/libcontainer/types.go index cb64db1..8c28530 100644 --- a/libcontainer/types.go +++ b/libcontainer/types.go @@ -5,25 +5,20 @@ import ( "errors" "github.com/syndtr/gocapability/capability" "os" - "syscall" ) var ( - ErrUnkownNamespace error = errors.New("Unkown namespace") + ErrUnkownNamespace = errors.New("Unknown namespace") + ErrUnkownCapability = errors.New("Unknown capability") + ErrUnsupported = errors.New("Unsupported method") ) // namespaceList is used to convert the libcontainer types // into the names of the files located in /proc//ns/* for // each namespace var ( - namespaceList = Namespaces{ - {Key: "NEWNS", Value: syscall.CLONE_NEWNS, File: "mnt"}, - {Key: "NEWUTS", Value: syscall.CLONE_NEWUTS, File: "uts"}, - {Key: "NEWIPC", Value: syscall.CLONE_NEWIPC, File: "ipc"}, - {Key: "NEWUSER", Value: syscall.CLONE_NEWUSER, File: "user"}, - {Key: "NEWPID", Value: syscall.CLONE_NEWPID, File: "pid"}, - {Key: "NEWNET", Value: syscall.CLONE_NEWNET, File: "net"}, - } + namespaceList = Namespaces{} + capabilityList = Capabilities{ {Key: "SETPCAP", Value: capability.CAP_SETPCAP}, {Key: "SYS_MODULE", Value: capability.CAP_SYS_MODULE}, @@ -52,6 +47,10 @@ type ( Namespaces []*Namespace ) +func (ns *Namespace) String() string { + return ns.Key +} + func (ns *Namespace) MarshalJSON() ([]byte, error) { return json.Marshal(ns.Key) } @@ -95,20 +94,24 @@ type ( Capabilities []*Capability ) -func (ns *Capability) MarshalJSON() ([]byte, error) { - return json.Marshal(ns.Key) +func (c *Capability) String() string { + return c.Key } -func (ns *Capability) UnmarshalJSON(src []byte) error { +func 
(c *Capability) MarshalJSON() ([]byte, error) { + return json.Marshal(c.Key) +} + +func (c *Capability) UnmarshalJSON(src []byte) error { var capName string if err := json.Unmarshal(src, &capName); err != nil { return err } ret := GetCapability(capName) if ret == nil { - return ErrUnkownNamespace + return ErrUnkownCapability } - *ns = *ret + *c = *ret return nil } @@ -119,7 +122,7 @@ func GetCapability(key string) *Capability { } } if os.Getenv("DEBUG") != "" { - panic("Unreachable: Namespace not found") + panic("Unreachable: Capability not found") } return nil } diff --git a/libcontainer/types_linux.go b/libcontainer/types_linux.go new file mode 100644 index 0000000..c14531d --- /dev/null +++ b/libcontainer/types_linux.go @@ -0,0 +1,16 @@ +package libcontainer + +import ( + "syscall" +) + +func init() { + namespaceList = Namespaces{ + {Key: "NEWNS", Value: syscall.CLONE_NEWNS, File: "mnt"}, + {Key: "NEWUTS", Value: syscall.CLONE_NEWUTS, File: "uts"}, + {Key: "NEWIPC", Value: syscall.CLONE_NEWIPC, File: "ipc"}, + {Key: "NEWUSER", Value: syscall.CLONE_NEWUSER, File: "user"}, + {Key: "NEWPID", Value: syscall.CLONE_NEWPID, File: "pid"}, + {Key: "NEWNET", Value: syscall.CLONE_NEWNET, File: "net"}, + } +} diff --git a/system/calls_linux.go b/system/calls_linux.go index b7a8f14..bf667c5 100644 --- a/system/calls_linux.go +++ b/system/calls_linux.go @@ -136,3 +136,10 @@ func Mkfifo(name string, mode uint32) error { func Umask(mask int) int { return syscall.Umask(mask) } + +func SetCloneFlags(cmd *exec.Cmd, flag uintptr) { + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + cmd.SysProcAttr.Cloneflags = flag +} diff --git a/system/errors.go b/system/errors.go new file mode 100644 index 0000000..6304518 --- /dev/null +++ b/system/errors.go @@ -0,0 +1,9 @@ +package system + +import ( + "errors" +) + +var ( + ErrNotSupportedPlatform = errors.New("platform and architecture is not supported") +) diff --git a/system/setns_linux.go 
b/system/setns_linux.go index 07b1c93..2b6f9e7 100644 --- a/system/setns_linux.go +++ b/system/setns_linux.go @@ -1,16 +1,11 @@ package system import ( - "errors" "fmt" "runtime" "syscall" ) -var ( - ErrNotSupportedPlatform = errors.New("platform and architecture is not supported") -) - // Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 // // We need different setns values for the different platforms and arch diff --git a/system/unsupported.go b/system/unsupported.go new file mode 100644 index 0000000..eb3ec7e --- /dev/null +++ b/system/unsupported.go @@ -0,0 +1,15 @@ +// +build !linux + +package system + +import ( + "os/exec" +) + +func SetCloneFlags(cmd *exec.Cmd, flag uintptr) { + +} + +func UsetCloseOnExec(fd uintptr) error { + return ErrNotSupportedPlatform +} From d8025d106638fba6f18ee177bb15dffaf08483a2 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 25 Feb 2014 19:45:57 -0800 Subject: [PATCH 082/117] Fix cgroups swap issue when it is not supported Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- cgroups/cgroups.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index e260d67..b40e1a3 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -223,8 +223,10 @@ func (c *Cgroup) setupMemory(cgroupRoot string, pid int) (err error) { return err } } - if c.MemorySwap != 0 { - if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.MemorySwap, 10)); err != nil { + // By default, MemorySwap is set to twice the size of RAM. + // If you want to omit MemorySwap, set it to `-1'. 
+ if c.MemorySwap != -1 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil { return err } } From 4f6cdc6f08c4762e7cd377124e734c907cad3f9c Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 26 Feb 2014 14:19:39 -0800 Subject: [PATCH 083/117] Make network a slice to support multiple types Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 21 +++++++++++---------- libcontainer/container.go | 2 +- libcontainer/container.json | 21 +++++++++++---------- libcontainer/network/strategy.go | 2 +- libcontainer/network/veth.go | 26 ++++++++++++-------------- libcontainer/nsinit/exec.go | 13 +++++-------- libcontainer/nsinit/init.go | 6 +++--- 7 files changed, 44 insertions(+), 47 deletions(-) diff --git a/libcontainer/README.md b/libcontainer/README.md index 36553af..4c8da8e 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -48,16 +48,17 @@ Sample `container.json` file: "MAC_ADMIN", "NET_ADMIN" ], - "network": { - "type": "veth", - "context": { - "bridge": "docker0", - "prefix": "dock" - }, - "address": "172.17.0.100/16", - "gateway": "172.17.42.1", - "mtu": 1500 - }, + "networks": [{ + "type": "veth", + "context": { + "bridge": "docker0", + "prefix": "dock" + }, + "address": "172.17.0.100/16", + "gateway": "172.17.42.1", + "mtu": 1500 + } + ], "cgroups": { "name": "docker-koye", "parent": "docker", diff --git a/libcontainer/container.go b/libcontainer/container.go index 4a47977..12a3d7b 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -19,7 +19,7 @@ type Container struct { Tty bool `json:"tty,omitempty"` // setup a proper tty or not Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop - Network *Network `json:"network,omitempty"` // nil for host's network stack + Networks []*Network `json:"networks,omitempty"` // nil for 
host's network stack Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"` } diff --git a/libcontainer/container.json b/libcontainer/container.json index c2b21f8..83e4074 100644 --- a/libcontainer/container.json +++ b/libcontainer/container.json @@ -31,16 +31,17 @@ "MAC_ADMIN", "NET_ADMIN" ], - "network": { - "type": "veth", - "context": { - "bridge": "docker0", - "prefix": "dock" - }, - "address": "172.17.0.100/16", - "gateway": "172.17.42.1", - "mtu": 1500 - }, + "networks": [{ + "type": "veth", + "context": { + "bridge": "docker0", + "prefix": "dock" + }, + "address": "172.17.0.100/16", + "gateway": "172.17.42.1", + "mtu": 1500 + } + ], "cgroups": { "name": "docker-koye", "parent": "docker", diff --git a/libcontainer/network/strategy.go b/libcontainer/network/strategy.go index 8ecc11a..a2f4f8f 100644 --- a/libcontainer/network/strategy.go +++ b/libcontainer/network/strategy.go @@ -16,7 +16,7 @@ var strategies = map[string]NetworkStrategy{ // NetworkStrategy represends a specific network configuration for // a containers networking stack type NetworkStrategy interface { - Create(*libcontainer.Network, int) (libcontainer.Context, error) + Create(*libcontainer.Network, int, libcontainer.Context) error Initialize(*libcontainer.Network, libcontainer.Context) error } diff --git a/libcontainer/network/veth.go b/libcontainer/network/veth.go index 49e63f0..3ab1b23 100644 --- a/libcontainer/network/veth.go +++ b/libcontainer/network/veth.go @@ -12,39 +12,37 @@ import ( type Veth struct { } -func (v *Veth) Create(n *libcontainer.Network, nspid int) (libcontainer.Context, error) { +func (v *Veth) Create(n *libcontainer.Network, nspid int, context libcontainer.Context) error { var ( bridge string prefix string exists bool ) if bridge, exists = n.Context["bridge"]; !exists { - return nil, fmt.Errorf("bridge does not exist in network context") + return fmt.Errorf("bridge does not exist in network context") } if prefix, exists = n.Context["prefix"]; !exists { - return nil, 
fmt.Errorf("veth prefix does not exist in network context") + return fmt.Errorf("veth prefix does not exist in network context") } name1, name2, err := createVethPair(prefix) if err != nil { - return nil, err - } - context := libcontainer.Context{ - "vethHost": name1, - "vethChild": name2, + return err } + context["veth-host"] = name1 + context["veth-child"] = name2 if err := SetInterfaceMaster(name1, bridge); err != nil { - return context, err + return err } if err := SetMtu(name1, n.Mtu); err != nil { - return context, err + return err } if err := InterfaceUp(name1); err != nil { - return context, err + return err } if err := SetInterfaceInNamespacePid(name2, nspid); err != nil { - return context, err + return err } - return context, nil + return nil } func (v *Veth) Initialize(config *libcontainer.Network, context libcontainer.Context) error { @@ -52,7 +50,7 @@ func (v *Veth) Initialize(config *libcontainer.Network, context libcontainer.Con vethChild string exists bool ) - if vethChild, exists = context["vethChild"]; !exists { + if vethChild, exists = context["veth-child"]; !exists { return fmt.Errorf("vethChild does not exist in network context") } if err := InterfaceDown(vethChild); err != nil { diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index b13326b..f7a9c17 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -84,18 +84,15 @@ func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) er } func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error { - if container.Network != nil { - strategy, err := network.GetStrategy(container.Network.Type) + context := libcontainer.Context{} + for _, config := range container.Networks { + strategy, err := network.GetStrategy(config.Type) if err != nil { return err } - networkContext, err := strategy.Create(container.Network, nspid) - if err != nil { - return err - } - if err := 
pipe.SendToChild(networkContext); err != nil { + if err := strategy.Create(config, nspid, context); err != nil { return err } } - return nil + return pipe.SendToChild(context) } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 1229560..cfc5058 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -56,7 +56,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { return fmt.Errorf("setup mount namespace %s", err) } - if err := setupNetwork(container.Network, context); err != nil { + if err := setupNetwork(container, context); err != nil { return fmt.Errorf("setup networking %s", err) } if err := system.Sethostname(container.Hostname); err != nil { @@ -130,8 +130,8 @@ func dupSlave(slave *os.File) error { // setupVethNetwork uses the Network config if it is not nil to initialize // the new veth interface inside the container for use by changing the name to eth0 // setting the MTU and IP address along with the default gateway -func setupNetwork(config *libcontainer.Network, context libcontainer.Context) error { - if config != nil { +func setupNetwork(container *libcontainer.Container, context libcontainer.Context) error { + for _, config := range container.Networks { strategy, err := network.GetStrategy(config.Type) if err != nil { return err From f8262b5748caa42763b2b33ff182da159fb08b8a Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 26 Feb 2014 17:21:09 -0800 Subject: [PATCH 084/117] Ensure that loopback devices are mounted inside the container Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/mount.go | 53 +++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index a73e97e..55c2655 100644 --- a/libcontainer/nsinit/mount.go +++ 
b/libcontainer/nsinit/mount.go @@ -37,6 +37,9 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := copyDevNodes(rootfs); err != nil { return fmt.Errorf("copy dev nodes %s", err) } + if err := setupLoopbackDevices(rootfs); err != nil { + return fmt.Errorf("setup loopback devices %s", err) + } if err := setupDev(rootfs); err != nil { return err } @@ -76,21 +79,57 @@ func copyDevNodes(rootfs string) error { "urandom", "tty", } { - stat, err := os.Stat(filepath.Join("/dev", node)) + if err := copyDevNode(rootfs, node); err != nil { + return err + } + } + return nil +} + +func setupLoopbackDevices(rootfs string) error { + for i := 0; ; i++ { + var ( + device = fmt.Sprintf("loop%d", i) + source = filepath.Join("/dev", device) + dest = filepath.Join(rootfs, "dev", device) + ) + + if _, err := os.Stat(source); err != nil { + if !os.IsNotExist(err) { + return err + } + return nil + } + if _, err := os.Stat(dest); err == nil { + os.Remove(dest) + } + f, err := os.Create(dest) if err != nil { return err } - var ( - dest = filepath.Join(rootfs, "dev", node) - st = stat.Sys().(*syscall.Stat_t) - ) - if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { - return fmt.Errorf("copy %s %s", node, err) + f.Close() + if err := system.Mount(source, dest, "none", syscall.MS_BIND, ""); err != nil { + return err } } return nil } +func copyDevNode(rootfs, node string) error { + stat, err := os.Stat(filepath.Join("/dev", node)) + if err != nil { + return err + } + var ( + dest = filepath.Join(rootfs, "dev", node) + st = stat.Sys().(*syscall.Stat_t) + ) + if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { + return fmt.Errorf("copy %s %s", node, err) + } + return nil +} + // setupDev symlinks the current processes pipes into the // appropriate destination on the containers rootfs func setupDev(rootfs string) error { From 34301be2008095904c586d5fec9a81d165347526 Mon Sep 17 00:00:00 2001 From: 
Michael Crosby Date: Wed, 26 Feb 2014 19:19:14 -0800 Subject: [PATCH 085/117] Code review updates Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 12 +++++++----- libcontainer/network/strategy.go | 4 ++-- libcontainer/nsinit/exec.go | 4 +--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/libcontainer/README.md b/libcontainer/README.md index 4c8da8e..b81401c 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -9,9 +9,9 @@ for using linux namespaces with no external dependencies. libcontainer provides #### container A container is a self contained directory that is able to run one or more processes inside without affecting the host system. The directory is usually a full system tree. Inside the directory -a `container.json` file just be placed with the runtime configuration for how the process -should be contained and run. Environment, networking, and different capabilities for the -process are specified in this file. +a `container.json` file is placed with the runtime configuration for how the processes +should be contained and ran. Environment, networking, and different capabilities for the +process are specified in this file. The configuration is used for each process executed inside the container. Sample `container.json` file: ```json @@ -67,10 +67,12 @@ Sample `container.json` file: } ``` -Using this configuration and the current directory holding the rootfs for a process to live, one can se libcontainer to exec the container. Running the life of the namespace a `.nspid` file -is written to the current directory with the pid of the namespace'd process to the external word. A client can use this pid to wait, kill, or perform other operation with the container. If a user tries to run an new process inside an existing container with a live namespace with namespace will be joined by the new process. 
+Using this configuration and the current directory holding the rootfs for a process to live, one can use libcontainer to exec the container. Running the life of the namespace a `pid` file +is written to the current directory with the pid of the namespace'd process to the external world. A client can use this pid to wait, kill, or perform other operation with the container. If a user tries to run an new process inside an existing container with a live namespace with namespace will be joined by the new process. +You may also specify an alternate root to to place the `container.json` file is read and where the `pid` file will be saved. + #### nsinit `nsinit` is a cli application used as the reference implementation of libcontainer. It is able to diff --git a/libcontainer/network/strategy.go b/libcontainer/network/strategy.go index a2f4f8f..234fcc0 100644 --- a/libcontainer/network/strategy.go +++ b/libcontainer/network/strategy.go @@ -13,8 +13,8 @@ var strategies = map[string]NetworkStrategy{ "veth": &Veth{}, } -// NetworkStrategy represends a specific network configuration for -// a containers networking stack +// NetworkStrategy represents a specific network configuration for +// a container's networking stack type NetworkStrategy interface { Create(*libcontainer.Network, int, libcontainer.Context) error Initialize(*libcontainer.Network, libcontainer.Context) error diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index f7a9c17..f1a4e24 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -48,9 +48,7 @@ func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args [ command.Process.Kill() return -1, err } - defer func() { - ns.stateWriter.DeletePid() - }() + defer ns.stateWriter.DeletePid() // Do this before syncing with child so that no children // can escape the cgroup From 85696fdb679e368030818a0ddc90783e713b46f0 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 27 Feb 2014 09:28:26 -0800 
Subject: [PATCH 086/117] Allow child process to live if daemon dies Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/init.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index cfc5058..cc481e2 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -50,9 +50,11 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol } } - if err := system.ParentDeathSignal(); err != nil { - return fmt.Errorf("parent deth signal %s", err) - } + /* + if err := system.ParentDeathSignal(); err != nil { + return fmt.Errorf("parent death signal %s", err) + } + */ if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { return fmt.Errorf("setup mount namespace %s", err) } From 48cfa0fbdf4d8d198562b7cb3efec46c8085f783 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Tue, 18 Feb 2014 11:41:11 +0100 Subject: [PATCH 087/117] runtime: Fix unique constraint error checks The sqlite3 version in fedora (3.8) returns a different error string in the unique constraints failure case than the one in hack/ (3.7). This updates the check to detect both, fixing one integration check failure on Fedora. 
Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- graphdb/graphdb.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/graphdb/graphdb.go b/graphdb/graphdb.go index 9e2466b..46a23b1 100644 --- a/graphdb/graphdb.go +++ b/graphdb/graphdb.go @@ -4,6 +4,7 @@ import ( "database/sql" "fmt" "path" + "strings" "sync" ) @@ -51,6 +52,21 @@ type Database struct { mux sync.RWMutex } +func IsNonUniqueNameError(err error) bool { + str := err.Error() + // sqlite 3.7.17-1ubuntu1 returns: + // Set failure: Abort due to constraint violation: columns parent_id, name are not unique + if strings.HasSuffix(str, "name are not unique") { + return true + } + // sqlite-3.8.3-1.fc20 returns: + // Set failure: Abort due to constraint violation: UNIQUE constraint failed: edge.parent_id, edge.name + if strings.Contains(str, "UNIQUE constraint failed") && strings.Contains(str, "edge.name") { + return true + } + return false +} + // Create a new graph database initialized with a root entity func NewDatabase(conn *sql.DB, init bool) (*Database, error) { if conn == nil { From d1b77040a74fed417c080cb20060bb56e2c23885 Mon Sep 17 00:00:00 2001 From: Victor Vieux Date: Mon, 3 Mar 2014 19:17:28 +0000 Subject: [PATCH 088/117] prevent flag grouping with -- Docker-DCO-1.1-Signed-off-by: Victor Vieux (github: vieux) --- mflag/flag.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mflag/flag.go b/mflag/flag.go index 7a0e8bf..b5c5fa9 100644 --- a/mflag/flag.go +++ b/mflag/flag.go @@ -772,6 +772,9 @@ func (f *FlagSet) parseOne() (bool, string, error) { f.usage() return false, "", ErrHelp } + if len(name) > 0 && name[0] == '-' { + return false, "", f.failf("flag provided but not defined: -%s", name) + } return false, name, ErrRetry } if fv, ok := flag.Value.(boolFlag); ok && fv.IsBoolFlag() { // special case: doesn't need an arg From ab952e250272af790ae95e134a840d26e51be1bd Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 3 Mar 2014 11:31:37 -0800 Subject: 
[PATCH 089/117] Update readme to remove .nspid Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcontainer/README.md b/libcontainer/README.md index b81401c..1032531 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -87,4 +87,4 @@ nsinit exec /bin/bash If you wish to spawn another process inside the container while your current bash session is running just run the exact same command again to get another bash shell or change the command. If the original process dies, PID 1, all other processes spawned inside the container will also be killed and the namespace will be removed. -You can identify if a process is running in a container by looking to see if `.nspid` is in the root of the directory. +You can identify if a process is running in a container by looking to see if `pid` is in the root of the directory. From ed2e85ad249feb2cf6426665b1601b937f9fcc1a Mon Sep 17 00:00:00 2001 From: Victor Vieux Date: Mon, 3 Mar 2014 19:57:05 +0000 Subject: [PATCH 090/117] add warning for deprecated flags Docker-DCO-1.1-Signed-off-by: Victor Vieux (github: vieux) --- mflag/flag.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mflag/flag.go b/mflag/flag.go index b5c5fa9..6fe3e41 100644 --- a/mflag/flag.go +++ b/mflag/flag.go @@ -803,6 +803,11 @@ func (f *FlagSet) parseOne() (bool, string, error) { f.actual = make(map[string]*Flag) } f.actual[name] = flag + for _, n := range flag.Names { + if n == fmt.Sprintf("#%s", name) { + fmt.Fprintf(f.out(), "Warning: '-%s' is deprecated, it will be removed soon. 
See usage.\n", name) + } + } return true, "", nil } From 7dc071dca54e9c939f8b2376406cc5b2a4d824f8 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 3 Mar 2014 12:15:47 -0800 Subject: [PATCH 091/117] Factor out finalize namespace Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/execin.go | 5 ++--- libcontainer/nsinit/init.go | 29 +++++++++++++++++++---------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/libcontainer/nsinit/execin.go b/libcontainer/nsinit/execin.go index 55f7b96..488fe0e 100644 --- a/libcontainer/nsinit/execin.go +++ b/libcontainer/nsinit/execin.go @@ -5,7 +5,6 @@ package nsinit import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/system" "os" "path/filepath" @@ -73,8 +72,8 @@ func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []s os.Exit(state.Sys().(syscall.WaitStatus).ExitStatus()) } dropAndExec: - if err := capabilities.DropCapabilities(container); err != nil { - return -1, fmt.Errorf("drop capabilities %s", err) + if err := finalizeNamespace(container); err != nil { + return -1, err } if err := system.Execv(args[0], args[0:], container.Env); err != nil { return -1, err diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index cc481e2..565030f 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -64,16 +64,8 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol if err := system.Sethostname(container.Hostname); err != nil { return fmt.Errorf("sethostname %s", err) } - if err := capabilities.DropCapabilities(container); err != nil { - return fmt.Errorf("drop capabilities %s", err) - } - if err := setupUser(container); err != nil { - return fmt.Errorf("setup user %s", err) - } - if container.WorkingDir != "" { - if err := system.Chdir(container.WorkingDir); err != nil { - return 
fmt.Errorf("chdir to %s %s", container.WorkingDir, err) - } + if err := finalizeNamespace(container); err != nil { + return fmt.Errorf("finalize namespace %s", err) } return system.Execv(args[0], args[0:], container.Env) } @@ -142,3 +134,20 @@ func setupNetwork(container *libcontainer.Container, context libcontainer.Contex } return nil } + +// finalizeNamespace drops the caps and sets the correct user +// and working dir before execing the command inside the namespace +func finalizeNamespace(container *libcontainer.Container) error { + if err := capabilities.DropCapabilities(container); err != nil { + return fmt.Errorf("drop capabilities %s", err) + } + if err := setupUser(container); err != nil { + return fmt.Errorf("setup user %s", err) + } + if container.WorkingDir != "" { + if err := system.Chdir(container.WorkingDir); err != nil { + return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) + } + } + return nil +} From 313d6a9e13f800bc963f6ea67fe5666ba55f3a6d Mon Sep 17 00:00:00 2001 From: Sven Dowideit Date: Thu, 27 Feb 2014 23:36:19 -0800 Subject: [PATCH 092/117] very minor spelling Docker-DCO-1.1-Signed-off-by: Sven Dowideit (github: SvenDowideit) --- libcontainer/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libcontainer/README.md b/libcontainer/README.md index 1032531..d6e4ded 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -7,7 +7,7 @@ for using linux namespaces with no external dependencies. libcontainer provides #### container -A container is a self contained directory that is able to run one or more processes inside without +A container is a self contained directory that is able to run one or more processes without affecting the host system. The directory is usually a full system tree. Inside the directory a `container.json` file is placed with the runtime configuration for how the processes should be contained and ran. 
Environment, networking, and different capabilities for the @@ -67,11 +67,11 @@ Sample `container.json` file: } ``` -Using this configuration and the current directory holding the rootfs for a process to live, one can use libcontainer to exec the container. Running the life of the namespace a `pid` file -is written to the current directory with the pid of the namespace'd process to the external world. A client can use this pid to wait, kill, or perform other operation with the container. If a user tries to run an new process inside an existing container with a live namespace with namespace will be joined by the new process. +Using this configuration and the current directory holding the rootfs for a process, one can use libcontainer to exec the container. Running the life of the namespace, a `pid` file +is written to the current directory with the pid of the namespaced process to the external world. A client can use this pid to wait, kill, or perform other operation with the container. If a user tries to run an new process inside an existing container with a live namespace the namespace will be joined by the new process. -You may also specify an alternate root to to place the `container.json` file is read and where the `pid` file will be saved. +You may also specify an alternate root place where the `container.json` file is read and where the `pid` file will be saved. #### nsinit @@ -79,7 +79,7 @@ You may also specify an alternate root to to place the `container.json` file is spawn or join new containers giving the current directory. To use `nsinit` cd into a linux rootfs and copy a `container.json` file into the directory with your specified configuration. 
-To execution `/bin/bash` in the current directory as a container just run: +To execute `/bin/bash` in the current directory as a container just run: ```bash nsinit exec /bin/bash ``` From 83de20deb749b34d5cb06799e965ebf48e87cc77 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Tue, 4 Mar 2014 12:44:08 +0100 Subject: [PATCH 093/117] libcontainer: Use pivot_root instead of chroot Instead of keeping all the old mounts in the container namespace and just using subtree as root we pivot_root so that the actual root in the namespace is the root we want, and then we unmount the previous mounts. This has multiple advantages: * The namespace mount tree is smaller (in the kernel) * If you break out of the chroot you could previously access the host filesystem. Now the host filesystem is fully invisible to the namespace. * We get rid of all unrelated mounts from the parent namespace, which means we don't hog these. This is important if we later switch to MS_PRIVATE instead of MS_SLAVE as otherwise these mounts would be impossible to unmount from the parent namespace. 
Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- libcontainer/nsinit/mount.go | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 55c2655..9ae7ec4 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -5,6 +5,7 @@ package nsinit import ( "fmt" "github.com/dotcloud/docker/pkg/system" + "io/ioutil" "os" "path/filepath" "syscall" @@ -51,16 +52,29 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := system.Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) } - if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { - return fmt.Errorf("mount move %s into / %s", rootfs, err) + + pivotDir, err := ioutil.TempDir(rootfs, ".pivot_root") + if err != nil { + return fmt.Errorf("can't create pivot_root dir %s", pivotDir, err) } - if err := system.Chroot("."); err != nil { - return fmt.Errorf("chroot . 
%s", err) + if err := system.Pivotroot(rootfs, pivotDir); err != nil { + return fmt.Errorf("pivot_root %s", err) } if err := system.Chdir("/"); err != nil { return fmt.Errorf("chdir / %s", err) } + // path to pivot dir now changed, update + pivotDir = filepath.Join("/", filepath.Base(pivotDir)) + + if err := system.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { + return fmt.Errorf("unmount pivot_root dir %s", err) + } + + if err := os.Remove(pivotDir); err != nil { + return fmt.Errorf("remove pivot_root dir %s", err) + } + system.Umask(0022) return nil From bd2d7a377c38b6fc075edd49e27ac3dc0a93ad27 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 4 Mar 2014 08:55:12 -0800 Subject: [PATCH 094/117] Add find tests and remove panic in DEBUG Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/types.go | 21 ++++++++++++--------- libcontainer/types_test.go | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 9 deletions(-) create mode 100644 libcontainer/types_test.go diff --git a/libcontainer/types.go b/libcontainer/types.go index 8c28530..94fe876 100644 --- a/libcontainer/types.go +++ b/libcontainer/types.go @@ -4,7 +4,6 @@ import ( "encoding/json" "errors" "github.com/syndtr/gocapability/capability" - "os" ) var ( @@ -74,16 +73,18 @@ func GetNamespace(key string) *Namespace { return ns } } - if os.Getenv("DEBUG") != "" { - panic("Unreachable: Namespace not found") - } return nil } // Contains returns true if the specified Namespace is // in the slice func (n Namespaces) Contains(ns string) bool { - return GetNamespace(ns) != nil + for _, nsp := range n { + if nsp.Key == ns { + return true + } + } + return false } type ( @@ -121,14 +122,16 @@ func GetCapability(key string) *Capability { return capp } } - if os.Getenv("DEBUG") != "" { - panic("Unreachable: Capability not found") - } return nil } // Contains returns true if the specified Capability is // in the slice func (c Capabilities) Contains(capp 
string) bool { - return GetCapability(capp) != nil + for _, cap := range c { + if cap.Key == capp { + return true + } + } + return false } diff --git a/libcontainer/types_test.go b/libcontainer/types_test.go new file mode 100644 index 0000000..52b85a4 --- /dev/null +++ b/libcontainer/types_test.go @@ -0,0 +1,35 @@ +package libcontainer + +import ( + "testing" +) + +func TestNamespacesContains(t *testing.T) { + ns := Namespaces{ + GetNamespace("NEWPID"), + GetNamespace("NEWNS"), + GetNamespace("NEWUTS"), + } + + if ns.Contains("NEWNET") { + t.Fatal("namespaces should not contain NEWNET") + } + + if !ns.Contains("NEWPID") { + t.Fatal("namespaces should contain NEWPID but does not") + } +} + +func TestCapabilitiesContains(t *testing.T) { + caps := Capabilities{ + GetCapability("MKNOD"), + GetCapability("SETPCAP"), + } + + if caps.Contains("SYS_ADMIN") { + t.Fatal("capabilities should not contain SYS_ADMIN") + } + if !caps.Contains("MKNOD") { + t.Fatal("capabilities should container MKNOD but does not") + } +} From a8b87a47b648b39ebc7ff9b9ac598ed60441756e Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Tue, 4 Mar 2014 12:30:52 -0800 Subject: [PATCH 095/117] Remove loopback mount bind Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/nsinit/mount.go | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 9ae7ec4..7ce90fd 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -38,9 +38,8 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := copyDevNodes(rootfs); err != nil { return fmt.Errorf("copy dev nodes %s", err) } - if err := setupLoopbackDevices(rootfs); err != nil { - return fmt.Errorf("setup loopback devices %s", err) - } + // In non-privileged mode, this fails. Discard the error. 
+ setupLoopbackDevices(rootfs) if err := setupDev(rootfs); err != nil { return err } @@ -102,29 +101,13 @@ func copyDevNodes(rootfs string) error { func setupLoopbackDevices(rootfs string) error { for i := 0; ; i++ { - var ( - device = fmt.Sprintf("loop%d", i) - source = filepath.Join("/dev", device) - dest = filepath.Join(rootfs, "dev", device) - ) - - if _, err := os.Stat(source); err != nil { + if err := copyDevNode(rootfs, fmt.Sprintf("loop%d", i)); err != nil { if !os.IsNotExist(err) { return err } - return nil - } - if _, err := os.Stat(dest); err == nil { - os.Remove(dest) - } - f, err := os.Create(dest) - if err != nil { - return err - } - f.Close() - if err := system.Mount(source, dest, "none", syscall.MS_BIND, ""); err != nil { - return err + break } + } return nil } From 91d13e5fa16c685fff3deec3bcdae476d3123e21 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Tue, 4 Mar 2014 12:32:17 -0800 Subject: [PATCH 096/117] remove /run mountpoint Docker-DCO-1.1-Signed-off-by: Guillaume J. 
Charmes (github: creack) --- libcontainer/nsinit/mount.go | 1 - 1 file changed, 1 deletion(-) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 7ce90fd..6b6929c 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -197,7 +197,6 @@ func mountSystem(rootfs string) error { {source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: "mode=755"}, {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: "mode=1777"}, {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"}, - {source: "tmpfs", path: filepath.Join(rootfs, "run"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_STRICTATIME, data: "mode=755"}, } { if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { return fmt.Errorf("mkdirall %s %s", m.path, err) From 9a5dead3c94dcf0c2da81dc59c892ae862dc210e Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Tue, 4 Mar 2014 13:21:22 -0800 Subject: [PATCH 097/117] Remove /dev tmpfs mountpoint Docker-DCO-1.1-Signed-off-by: Guillaume J. 
Charmes (github: creack) --- libcontainer/nsinit/mount.go | 1 - 1 file changed, 1 deletion(-) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 6b6929c..694254e 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -194,7 +194,6 @@ func mountSystem(rootfs string) error { }{ {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}, - {source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: "mode=755"}, {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: "mode=1777"}, {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"}, } { From 2bddb20b91cf17929309382fc06f681c8f9b6fb1 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Tue, 4 Mar 2014 14:18:40 -0800 Subject: [PATCH 098/117] Add shm size cap to mount Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/mount.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 694254e..61586bc 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -194,7 +194,7 @@ func mountSystem(rootfs string) error { }{ {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}, - {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: "mode=1777"}, + {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: "mode=1777,size=65536k"}, 
{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"}, } { if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { From 88aed3bf92e5d13dd8f860da624a31f2b6fc48a9 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Wed, 5 Mar 2014 09:40:54 +0100 Subject: [PATCH 099/117] libcontainer: Use MS_PRIVATE instead of MS_SLAVE Now that we unmount all the mounts from the global namespace we can use a private namespace rather than a slave one (as we have no need for unmounts of inherited global mounts to propagate into the container). Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- libcontainer/nsinit/mount.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 61586bc..69d85d6 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -21,7 +21,7 @@ const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NOD // is no longer in use, the mounts will be removed automatically func setupNewMountNamespace(rootfs, console string, readonly bool) error { // mount as slave so that the new mounts do not propagate to the host - if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + if err := system.Mount("", "/", "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting / as slave %s", err) } if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { From 7eb32029e96065b05c7d5e05b081cb486b4ec9dd Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Fri, 21 Feb 2014 10:12:25 +0100 Subject: [PATCH 100/117] Create pkg/system and move stuff there from archive This is a package for generic system calls etc that for some reason is not yet supported by "syscall", or where it is different enough for the different 
ports to need portability wrappers. Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- system/stat_linux.go | 13 ++++++++ system/stat_unsupported.go | 13 ++++++++ system/utimes_linux.go | 31 +++++++++++++++++++ system/utimes_unsupported.go | 13 ++++++++ system/xattrs_linux.go | 59 ++++++++++++++++++++++++++++++++++++ system/xattrs_unsupported.go | 11 +++++++ 6 files changed, 140 insertions(+) create mode 100644 system/stat_linux.go create mode 100644 system/stat_unsupported.go create mode 100644 system/utimes_linux.go create mode 100644 system/utimes_unsupported.go create mode 100644 system/xattrs_linux.go create mode 100644 system/xattrs_unsupported.go diff --git a/system/stat_linux.go b/system/stat_linux.go new file mode 100644 index 0000000..e702200 --- /dev/null +++ b/system/stat_linux.go @@ -0,0 +1,13 @@ +package system + +import ( + "syscall" +) + +func GetLastAccess(stat *syscall.Stat_t) syscall.Timespec { + return stat.Atim +} + +func GetLastModification(stat *syscall.Stat_t) syscall.Timespec { + return stat.Mtim +} diff --git a/system/stat_unsupported.go b/system/stat_unsupported.go new file mode 100644 index 0000000..4686a4c --- /dev/null +++ b/system/stat_unsupported.go @@ -0,0 +1,13 @@ +// +build !linux + +package system + +import "syscall" + +func GetLastAccess(stat *syscall.Stat_t) syscall.Timespec { + return stat.Atimespec +} + +func GetLastModification(stat *syscall.Stat_t) syscall.Timespec { + return stat.Mtimespec +} diff --git a/system/utimes_linux.go b/system/utimes_linux.go new file mode 100644 index 0000000..c00f402 --- /dev/null +++ b/system/utimes_linux.go @@ -0,0 +1,31 @@ +package system + +import ( + "syscall" + "unsafe" +) + +func LUtimesNano(path string, ts []syscall.Timespec) error { + // These are not currently available in syscall + AT_FDCWD := -100 + AT_SYMLINK_NOFOLLOW := 0x100 + + var _path *byte + _path, err := syscall.BytePtrFromString(path) + if err != nil { + return err + } + + if _, _, err := 
syscall.Syscall6(syscall.SYS_UTIMENSAT, uintptr(AT_FDCWD), uintptr(unsafe.Pointer(_path)), uintptr(unsafe.Pointer(&ts[0])), uintptr(AT_SYMLINK_NOFOLLOW), 0, 0); err != 0 && err != syscall.ENOSYS { + return err + } + + return nil +} + +func UtimesNano(path string, ts []syscall.Timespec) error { + if err := syscall.UtimesNano(path, ts); err != nil { + return err + } + return nil +} diff --git a/system/utimes_unsupported.go b/system/utimes_unsupported.go new file mode 100644 index 0000000..d247ba2 --- /dev/null +++ b/system/utimes_unsupported.go @@ -0,0 +1,13 @@ +// +build !linux + +package system + +import "syscall" + +func LUtimesNano(path string, ts []syscall.Timespec) error { + return ErrNotSupportedPlatform +} + +func UtimesNano(path string, ts []syscall.Timespec) error { + return ErrNotSupportedPlatform +} diff --git a/system/xattrs_linux.go b/system/xattrs_linux.go new file mode 100644 index 0000000..00edb20 --- /dev/null +++ b/system/xattrs_linux.go @@ -0,0 +1,59 @@ +package system + +import ( + "syscall" + "unsafe" +) + +// Returns a nil slice and nil error if the xattr is not set +func Lgetxattr(path string, attr string) ([]byte, error) { + pathBytes, err := syscall.BytePtrFromString(path) + if err != nil { + return nil, err + } + attrBytes, err := syscall.BytePtrFromString(attr) + if err != nil { + return nil, err + } + + dest := make([]byte, 128) + destBytes := unsafe.Pointer(&dest[0]) + sz, _, errno := syscall.Syscall6(syscall.SYS_LGETXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(unsafe.Pointer(attrBytes)), uintptr(destBytes), uintptr(len(dest)), 0, 0) + if errno == syscall.ENODATA { + return nil, nil + } + if errno == syscall.ERANGE { + dest = make([]byte, sz) + destBytes := unsafe.Pointer(&dest[0]) + sz, _, errno = syscall.Syscall6(syscall.SYS_LGETXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(unsafe.Pointer(attrBytes)), uintptr(destBytes), uintptr(len(dest)), 0, 0) + } + if errno != 0 { + return nil, errno + } + + return dest[:sz], nil +} + 
+var _zero uintptr + +func Lsetxattr(path string, attr string, data []byte, flags int) error { + pathBytes, err := syscall.BytePtrFromString(path) + if err != nil { + return err + } + attrBytes, err := syscall.BytePtrFromString(attr) + if err != nil { + return err + } + var dataBytes unsafe.Pointer + if len(data) > 0 { + dataBytes = unsafe.Pointer(&data[0]) + } else { + dataBytes = unsafe.Pointer(&_zero) + } + _, _, errno := syscall.Syscall6(syscall.SYS_LSETXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(unsafe.Pointer(attrBytes)), uintptr(dataBytes), uintptr(len(data)), uintptr(flags), 0) + if errno != 0 { + return errno + } + return nil +} diff --git a/system/xattrs_unsupported.go b/system/xattrs_unsupported.go new file mode 100644 index 0000000..0060c16 --- /dev/null +++ b/system/xattrs_unsupported.go @@ -0,0 +1,11 @@ +// +build !linux + +package system + +func Lgetxattr(path string, attr string) ([]byte, error) { + return nil, ErrNotSupportedPlatform +} + +func Lsetxattr(path string, attr string, data []byte, flags int) error { + return ErrNotSupportedPlatform +} From a5ba28e1f72fe286b1db15b575a570dda2fe6622 Mon Sep 17 00:00:00 2001 From: Victor Vieux Date: Wed, 5 Mar 2014 19:27:39 +0000 Subject: [PATCH 101/117] fix usage for completly deprecated flag Docker-DCO-1.1-Signed-off-by: Victor Vieux (github: vieux) --- mflag/example/example.go | 1 + mflag/flag.go | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/mflag/example/example.go b/mflag/example/example.go index b0d25fb..352f652 100644 --- a/mflag/example/example.go +++ b/mflag/example/example.go @@ -12,6 +12,7 @@ var ( ) func init() { + flag.Bool([]string{"#hp", "#-halp"}, false, "display the halp") flag.BoolVar(&b, []string{"b"}, false, "a simple bool") flag.BoolVar(&b2, []string{"-bool"}, false, "a simple bool") flag.IntVar(&i, []string{"#integer", "-integer"}, -1, "a simple integer") diff --git a/mflag/flag.go b/mflag/flag.go index 6fe3e41..ff0de23 100644 --- a/mflag/flag.go +++ 
b/mflag/flag.go @@ -404,7 +404,9 @@ func (f *FlagSet) PrintDefaults() { names = append(names, name) } } - fmt.Fprintf(f.out(), format, strings.Join(names, ", -"), flag.DefValue, flag.Usage) + if len(names) > 0 { + fmt.Fprintf(f.out(), format, strings.Join(names, ", -"), flag.DefValue, flag.Usage) + } }) } From 7339ca86997fb822cf5a010f1e3da62d3400bd02 Mon Sep 17 00:00:00 2001 From: Victor Vieux Date: Wed, 5 Mar 2014 19:45:57 +0000 Subject: [PATCH 102/117] fix panic with only long flags or only one deprecatd Docker-DCO-1.1-Signed-off-by: Victor Vieux (github: vieux) --- mflag/example/example.go | 4 ++-- mflag/flag.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mflag/example/example.go b/mflag/example/example.go index 352f652..ed940e8 100644 --- a/mflag/example/example.go +++ b/mflag/example/example.go @@ -14,8 +14,8 @@ var ( func init() { flag.Bool([]string{"#hp", "#-halp"}, false, "display the halp") flag.BoolVar(&b, []string{"b"}, false, "a simple bool") - flag.BoolVar(&b2, []string{"-bool"}, false, "a simple bool") - flag.IntVar(&i, []string{"#integer", "-integer"}, -1, "a simple integer") + flag.BoolVar(&b2, []string{"#-bool"}, false, "a simple bool") + flag.IntVar(&i, []string{"-integer", "-number"}, -1, "a simple integer") flag.StringVar(&str, []string{"s", "#hidden", "-string"}, "", "a simple string") //-s -hidden and --string will work, but -hidden won't be in the usage flag.BoolVar(&h, []string{"h", "#help", "-help"}, false, "display the help") flag.Parse() diff --git a/mflag/flag.go b/mflag/flag.go index ff0de23..7125c03 100644 --- a/mflag/flag.go +++ b/mflag/flag.go @@ -290,13 +290,13 @@ type Flag struct { func sortFlags(flags map[string]*Flag) []*Flag { var list sort.StringSlice for _, f := range flags { + fName := strings.TrimPrefix(f.Names[0], "#") if len(f.Names) == 1 { - list = append(list, f.Names[0]) + list = append(list, fName) continue } found := false - fName := strings.TrimPrefix(strings.TrimPrefix(f.Names[0], "#"), 
"-") for _, name := range list { if name == fName { found = true From 73233223dec282363df6fb34792c30fd4aa3077f Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Wed, 5 Mar 2014 11:59:31 -0800 Subject: [PATCH 103/117] Add AppArmor support to native driver + change pipe/dup logic Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/apparmor/apparmor.go | 42 +++++++++++++++++++++++++++++++ libcontainer/container.go | 3 ++- libcontainer/nsinit/init.go | 22 +++++++++------- 3 files changed, 57 insertions(+), 10 deletions(-) create mode 100644 libcontainer/apparmor/apparmor.go diff --git a/libcontainer/apparmor/apparmor.go b/libcontainer/apparmor/apparmor.go new file mode 100644 index 0000000..044b766 --- /dev/null +++ b/libcontainer/apparmor/apparmor.go @@ -0,0 +1,42 @@ +package apparmor + +import ( + "errors" + "fmt" + "io/ioutil" + "log" + "os" +) + +var AppArmorEnabled bool + +var ( + ErrAppArmorDisabled = errors.New("Error: AppArmor is not enabled on this system") +) + +func init() { + buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled") + AppArmorEnabled = err == nil && len(buf) > 1 && buf[0] == 'Y' +} + +func ApplyProfile(pid int, name string) error { + if !AppArmorEnabled { + return ErrAppArmorDisabled + } + + f, err := os.OpenFile(fmt.Sprintf("/proc/%d/attr/current", pid), os.O_WRONLY, 0) + if err != nil { + log.Printf("error open: %s\n", err) + return err + } + defer f.Close() + + if _, err := fmt.Fprintf(f, "changeprofile %s", name); err != nil { + log.Printf("changeprofile %s", name) + log.Printf("Error write: %s\n", err) + return err + } else { + log.Printf("Write success!") + } + return nil +} diff --git a/libcontainer/container.go b/libcontainer/container.go index 12a3d7b..bd16825 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -20,7 +20,8 @@ type Container struct { Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply Capabilities Capabilities 
`json:"capabilities,omitempty"` // capabilities to drop Networks []*Network `json:"networks,omitempty"` // nil for host's network stack - Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"` + Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"` // cgroups + Context Context `json:"context,omitempty"` // generic context for specific options (apparmor, selinux) } // Network defines configuration for a container's networking stack diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 565030f..48d9213 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -5,6 +5,7 @@ package nsinit import ( "fmt" "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/apparmor" "github.com/dotcloud/docker/pkg/libcontainer/capabilities" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/libcontainer/utils" @@ -32,7 +33,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol if console != "" { // close pipes so that we can replace it with the pty - closeStdPipes() + // closeStdPipes() slave, err := system.OpenTerminal(console, syscall.O_RDWR) if err != nil { return fmt.Errorf("open terminal %s", err) @@ -55,9 +56,17 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol return fmt.Errorf("parent death signal %s", err) } */ + if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { return fmt.Errorf("setup mount namespace %s", err) } + + if err := apparmor.ApplyProfile(os.Getpid(), container.Context["apparmor_profile"]); err != nil { + if err != apparmor.ErrAppArmorDisabled { + return err + } + } + if err := setupNetwork(container, context); err != nil { return fmt.Errorf("setup networking %s", err) } @@ -67,13 +76,8 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol if err := finalizeNamespace(container); err != nil { return fmt.Errorf("finalize namespace 
%s", err) } - return system.Execv(args[0], args[0:], container.Env) -} -func closeStdPipes() { - os.Stdin.Close() - os.Stdout.Close() - os.Stderr.Close() + return system.Execv(args[0], args[0:], container.Env) } func setupUser(container *libcontainer.Container) error { @@ -109,8 +113,8 @@ func setupUser(container *libcontainer.Container) error { // dupSlave dup2 the pty slave's fd into stdout and stdin and ensures that // the slave's fd is 0, or stdin func dupSlave(slave *os.File) error { - if slave.Fd() != 0 { - return fmt.Errorf("slave fd not 0 %d", slave.Fd()) + if err := system.Dup2(slave.Fd(), 0); err != nil { + return err } if err := system.Dup2(slave.Fd(), 1); err != nil { return err From 0eb4ea2f79a2e463d1d7b5ee60c15e43094fd126 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 5 Mar 2014 12:27:31 -0800 Subject: [PATCH 104/117] Some cleanup around logs Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/apparmor/apparmor.go | 21 ++++----------------- libcontainer/nsinit/init.go | 14 +++++--------- 2 files changed, 9 insertions(+), 26 deletions(-) diff --git a/libcontainer/apparmor/apparmor.go b/libcontainer/apparmor/apparmor.go index 044b766..4b1bf57 100644 --- a/libcontainer/apparmor/apparmor.go +++ b/libcontainer/apparmor/apparmor.go @@ -1,42 +1,29 @@ package apparmor import ( - "errors" "fmt" "io/ioutil" - "log" "os" ) -var AppArmorEnabled bool - -var ( - ErrAppArmorDisabled = errors.New("Error: AppArmor is not enabled on this system") -) - -func init() { +func IsEnabled() bool { buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled") - AppArmorEnabled = err == nil && len(buf) > 1 && buf[0] == 'Y' + return err == nil && len(buf) > 1 && buf[0] == 'Y' } func ApplyProfile(pid int, name string) error { - if !AppArmorEnabled { - return ErrAppArmorDisabled + if !IsEnabled() || name == "" { + return nil } f, err := os.OpenFile(fmt.Sprintf("/proc/%d/attr/current", pid), os.O_WRONLY, 0) if err != nil { - 
log.Printf("error open: %s\n", err) return err } defer f.Close() if _, err := fmt.Fprintf(f, "changeprofile %s", name); err != nil { - log.Printf("changeprofile %s", name) - log.Printf("Error write: %s\n", err) return err - } else { - log.Printf("Write success!") } return nil } diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 48d9213..a854f13 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -32,8 +32,6 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol syncPipe.Close() if console != "" { - // close pipes so that we can replace it with the pty - // closeStdPipes() slave, err := system.OpenTerminal(console, syscall.O_RDWR) if err != nil { return fmt.Errorf("open terminal %s", err) @@ -51,10 +49,10 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol } } - /* - if err := system.ParentDeathSignal(); err != nil { - return fmt.Errorf("parent death signal %s", err) - } + /* this is commented out so that we get the current Ghost functionality + if err := system.ParentDeathSignal(); err != nil { + return fmt.Errorf("parent death signal %s", err) + } */ if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { @@ -62,9 +60,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol } if err := apparmor.ApplyProfile(os.Getpid(), container.Context["apparmor_profile"]); err != nil { - if err != apparmor.ErrAppArmorDisabled { - return err - } + return err } if err := setupNetwork(container, context); err != nil { From d3bbd78cc5387770a42ed7ab94dcaacc8a193ae0 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Wed, 5 Mar 2014 14:57:20 -0800 Subject: [PATCH 105/117] Generate and load custom docker profile for apparmor Docker-DCO-1.1-Signed-off-by: Guillaume J. 
Charmes (github: creack) --- libcontainer/apparmor/setup.go | 98 ++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 libcontainer/apparmor/setup.go diff --git a/libcontainer/apparmor/setup.go b/libcontainer/apparmor/setup.go new file mode 100644 index 0000000..fda810b --- /dev/null +++ b/libcontainer/apparmor/setup.go @@ -0,0 +1,98 @@ +package apparmor + +import ( + "fmt" + "io/ioutil" + "os" + "os/exec" +) + +const DefaultProfilePath = "/etc/apparmor.d/docker" +const DefaultProfile = ` +# AppArmor profile from lxc for containers. +@{HOME}=@{HOMEDIRS}/*/ /root/ +@{HOMEDIRS}=/home/ +#@{HOMEDIRS}+= +@{multiarch}=*-linux-gnu* +@{PROC}=/proc/ + +profile docker-default flags=(attach_disconnected,mediate_deleted) { + network, + capability, + file, + umount, + dbus, + + # ignore DENIED message on / remount + deny mount options=(ro, remount) -> /, + + # allow tmpfs mounts everywhere + mount fstype=tmpfs, + + # allow mqueue mounts everywhere + mount fstype=mqueue, + + # allow fuse mounts everywhere + mount fstype=fuse.*, + + # allow bind mount of /lib/init/fstab for lxcguest + mount options=(rw, bind) /lib/init/fstab.lxc/ -> /lib/init/fstab/, + + # deny writes in /proc/sys/fs but allow binfmt_misc to be mounted + mount fstype=binfmt_misc -> /proc/sys/fs/binfmt_misc/, + deny @{PROC}/sys/fs/** wklx, + + # allow efivars to be mounted, writing to it will be blocked though + mount fstype=efivarfs -> /sys/firmware/efi/efivars/, + + # block some other dangerous paths + deny @{PROC}/sysrq-trigger rwklx, + deny @{PROC}/mem rwklx, + deny @{PROC}/kmem rwklx, + deny @{PROC}/sys/kernel/[^s][^h][^m]* wklx, + deny @{PROC}/sys/kernel/*/** wklx, + + # deny writes in /sys except for /sys/fs/cgroup, also allow + # fusectl, securityfs and debugfs to be mounted there (read-only) + mount fstype=fusectl -> /sys/fs/fuse/connections/, + mount fstype=securityfs -> /sys/kernel/security/, + mount fstype=debugfs -> /sys/kernel/debug/, + deny mount fstype=debugfs -> 
/var/lib/ureadahead/debugfs/, + mount fstype=proc -> /proc/, + mount fstype=sysfs -> /sys/, + deny /sys/[^f]*/** wklx, + deny /sys/f[^s]*/** wklx, + deny /sys/fs/[^c]*/** wklx, + deny /sys/fs/c[^g]*/** wklx, + deny /sys/fs/cg[^r]*/** wklx, + deny /sys/firmware/efi/efivars/** rwklx, + deny /sys/kernel/security/** rwklx, + mount options=(move) /sys/fs/cgroup/cgmanager/ -> /sys/fs/cgroup/cgmanager.lower/, + + # the container may never be allowed to mount devpts. If it does, it + # will remount the host's devpts. We could allow it to do it with + # the newinstance option (but, right now, we don't). + deny mount fstype=devpts, +} +` + +func InstallDefaultProfile() error { + if !IsEnabled() { + return nil + } + + // If the profile already exists, let it be. + if _, err := os.Stat(DefaultProfilePath); err == nil { + return nil + } + + if err := ioutil.WriteFile(DefaultProfilePath, []byte(DefaultProfile), 0644); err != nil { + return err + } + + output, err := exec.Command("/lib/init/apparmor-profile-load", "docker").CombinedOutput() + if err != nil { + return fmt.Errorf("Error loading docker profile: %s (%s)", err, output) + } + return nil +} From 0caa2d3992606370fa78eaa87213e14528288e02 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Thu, 6 Mar 2014 14:10:32 +0100 Subject: [PATCH 106/117] libcontainer: Don't use UsetCloseOnExec, it is racy We can't keep file descriptors without close-on-exec except with syscall.ForkLock held, as otherwise they could leak by accident into other children from forks in other threads. Instead we just use Cmd.ExtraFiles which handles all this for us. 
This fixes https://github.com/dotcloud/docker/issues/4493 Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- libcontainer/nsinit/command.go | 8 ++++---- libcontainer/nsinit/exec.go | 2 +- libcontainer/nsinit/sync_pipe.go | 2 -- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/libcontainer/nsinit/command.go b/libcontainer/nsinit/command.go index 8ddf1e7..5546065 100644 --- a/libcontainer/nsinit/command.go +++ b/libcontainer/nsinit/command.go @@ -1,7 +1,6 @@ package nsinit import ( - "fmt" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/system" "os" @@ -12,7 +11,7 @@ import ( // parent processes and creates an *exec.Cmd that will be used to fork/exec the // namespaced init process type CommandFactory interface { - Create(container *libcontainer.Container, console string, syncFd uintptr, args []string) *exec.Cmd + Create(container *libcontainer.Container, console string, syncFd *os.File, args []string) *exec.Cmd } type DefaultCommandFactory struct { @@ -22,16 +21,17 @@ type DefaultCommandFactory struct { // Create will return an exec.Cmd with the Cloneflags set to the proper namespaces // defined on the container's configuration and use the current binary as the init with the // args provided -func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console string, pipe uintptr, args []string) *exec.Cmd { +func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console string, pipe *os.File, args []string) *exec.Cmd { // get our binary name from arg0 so we can always reexec ourself command := exec.Command(os.Args[0], append([]string{ "-console", console, - "-pipe", fmt.Sprint(pipe), + "-pipe", "3", "-root", c.Root, "init"}, args...)...) 
system.SetCloneFlags(command, uintptr(GetNamespaceFlags(container.Namespaces))) command.Env = container.Env + command.ExtraFiles = []*os.File{pipe} return command } diff --git a/libcontainer/nsinit/exec.go b/libcontainer/nsinit/exec.go index f1a4e24..4963f12 100644 --- a/libcontainer/nsinit/exec.go +++ b/libcontainer/nsinit/exec.go @@ -35,7 +35,7 @@ func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args [ term.SetMaster(master) } - command := ns.commandFactory.Create(container, console, syncPipe.child.Fd(), args) + command := ns.commandFactory.Create(container, console, syncPipe.child, args) if err := term.Attach(command); err != nil { return -1, err } diff --git a/libcontainer/nsinit/sync_pipe.go b/libcontainer/nsinit/sync_pipe.go index 7b29e98..f724f52 100644 --- a/libcontainer/nsinit/sync_pipe.go +++ b/libcontainer/nsinit/sync_pipe.go @@ -4,7 +4,6 @@ import ( "encoding/json" "fmt" "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/system" "io/ioutil" "os" ) @@ -22,7 +21,6 @@ func NewSyncPipe() (s *SyncPipe, err error) { if err != nil { return nil, err } - system.UsetCloseOnExec(s.child.Fd()) return s, nil } From b359efd9ec269bf25a7ba7127ef6c6f38bfb6bb5 Mon Sep 17 00:00:00 2001 From: unclejack Date: Thu, 6 Mar 2014 18:01:18 +0200 Subject: [PATCH 107/117] remove dbus from apparmor profile This removes the dbus entry from the apparmor profile Docker creates. 
Docker-DCO-1.1-Signed-off-by: Cristian Staretu (github: unclejack) --- libcontainer/apparmor/setup.go | 1 - 1 file changed, 1 deletion(-) diff --git a/libcontainer/apparmor/setup.go b/libcontainer/apparmor/setup.go index fda810b..e07759c 100644 --- a/libcontainer/apparmor/setup.go +++ b/libcontainer/apparmor/setup.go @@ -21,7 +21,6 @@ profile docker-default flags=(attach_disconnected,mediate_deleted) { capability, file, umount, - dbus, # ignore DENIED message on / remount deny mount options=(ro, remount) -> /, From 0ecd2aa284181f2862acd2dba5675f71a5338d3c Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Thu, 6 Mar 2014 11:10:58 -0800 Subject: [PATCH 108/117] Use CGO for apparmor profile switch Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/apparmor/apparmor.go | 16 ++++++++-------- libcontainer/nsinit/init.go | 7 +++---- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/libcontainer/apparmor/apparmor.go b/libcontainer/apparmor/apparmor.go index 4b1bf57..c2954fd 100644 --- a/libcontainer/apparmor/apparmor.go +++ b/libcontainer/apparmor/apparmor.go @@ -1,9 +1,12 @@ package apparmor +// #cgo LDFLAGS: -lapparmor +// #include +// #include +import "C" import ( - "fmt" "io/ioutil" - "os" + "unsafe" ) func IsEnabled() bool { @@ -16,13 +19,10 @@ func ApplyProfile(pid int, name string) error { return nil } - f, err := os.OpenFile(fmt.Sprintf("/proc/%d/attr/current", pid), os.O_WRONLY, 0) - if err != nil { - return err - } - defer f.Close() + cName := C.CString(name) + defer C.free(unsafe.Pointer(cName)) - if _, err := fmt.Fprintf(f, "changeprofile %s", name); err != nil { + if _, err := C.aa_change_onexec(cName); err != nil { return err } return nil diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index a854f13..45ab881 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -59,10 +59,6 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol 
return fmt.Errorf("setup mount namespace %s", err) } - if err := apparmor.ApplyProfile(os.Getpid(), container.Context["apparmor_profile"]); err != nil { - return err - } - if err := setupNetwork(container, context); err != nil { return fmt.Errorf("setup networking %s", err) } @@ -73,6 +69,9 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol return fmt.Errorf("finalize namespace %s", err) } + if err := apparmor.ApplyProfile(os.Getpid(), container.Context["apparmor_profile"]); err != nil { + return err + } return system.Execv(args[0], args[0:], container.Env) } From 729080d46ae151261c034a8a850064cc6660fee6 Mon Sep 17 00:00:00 2001 From: "Guillaume J. Charmes" Date: Thu, 6 Mar 2014 12:04:51 -0800 Subject: [PATCH 109/117] Add buildflags to allow crosscompilation for apparmor Docker-DCO-1.1-Signed-off-by: Guillaume J. Charmes (github: creack) --- libcontainer/apparmor/apparmor.go | 2 ++ libcontainer/apparmor/apparmor_disabled.go | 13 +++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 libcontainer/apparmor/apparmor_disabled.go diff --git a/libcontainer/apparmor/apparmor.go b/libcontainer/apparmor/apparmor.go index c2954fd..d07c710 100644 --- a/libcontainer/apparmor/apparmor.go +++ b/libcontainer/apparmor/apparmor.go @@ -1,3 +1,5 @@ +// +build apparmor + package apparmor // #cgo LDFLAGS: -lapparmor diff --git a/libcontainer/apparmor/apparmor_disabled.go b/libcontainer/apparmor/apparmor_disabled.go new file mode 100644 index 0000000..489484f --- /dev/null +++ b/libcontainer/apparmor/apparmor_disabled.go @@ -0,0 +1,13 @@ +// +build !apparmor + +package apparmor + +import () + +func IsEnabled() bool { + return false +} + +func ApplyProfile(pid int, name string) error { + return nil +} From 5c13d614252bfa0edaa0aaa18d4446a64dc284c5 Mon Sep 17 00:00:00 2001 From: Tianon Gravi Date: Thu, 6 Mar 2014 13:39:17 -0700 Subject: [PATCH 110/117] Update build tags such that we can properly compile on all platforms (especially for 
packagers), and updated hack/PACKAGERS.md to mention the DOCKER_BUILDTAGS variable that will need to be set for binaries that might be used on AppArmor (such as Debian and especially Ubuntu) Docker-DCO-1.1-Signed-off-by: Andrew Page (github: tianon) --- libcontainer/apparmor/apparmor.go | 2 +- libcontainer/apparmor/apparmor_disabled.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libcontainer/apparmor/apparmor.go b/libcontainer/apparmor/apparmor.go index d07c710..a6d57d4 100644 --- a/libcontainer/apparmor/apparmor.go +++ b/libcontainer/apparmor/apparmor.go @@ -1,4 +1,4 @@ -// +build apparmor +// +build apparmor,linux,amd64 package apparmor diff --git a/libcontainer/apparmor/apparmor_disabled.go b/libcontainer/apparmor/apparmor_disabled.go index 489484f..77543e4 100644 --- a/libcontainer/apparmor/apparmor_disabled.go +++ b/libcontainer/apparmor/apparmor_disabled.go @@ -1,4 +1,4 @@ -// +build !apparmor +// +build !apparmor !linux !amd64 package apparmor From c0d5c529bb0d9371ede031e9c60b7dfe7f13befd Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 6 Mar 2014 14:14:25 -0800 Subject: [PATCH 111/117] Remove the ghosts and kill everything Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/init.go | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 45ab881..1f8ad36 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -48,17 +48,14 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol return fmt.Errorf("setctty %s", err) } } - - /* this is commented out so that we get the current Ghost functionality - if err := system.ParentDeathSignal(); err != nil { - return fmt.Errorf("parent death signal %s", err) - } + /* + if err := system.ParentDeathSignal(); err != nil { + return fmt.Errorf("parent death signal %s", err) + } */ - if err := 
setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { return fmt.Errorf("setup mount namespace %s", err) } - if err := setupNetwork(container, context); err != nil { return fmt.Errorf("setup networking %s", err) } From fd8470acbaabdff3ee0aebd4ea192ade21dcc1d1 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 6 Mar 2014 16:30:56 -0800 Subject: [PATCH 112/117] Ensure that native containers die with the parent Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/init.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 1f8ad36..8d3f908 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -48,11 +48,9 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol return fmt.Errorf("setctty %s", err) } } - /* - if err := system.ParentDeathSignal(); err != nil { - return fmt.Errorf("parent death signal %s", err) - } - */ + if err := system.ParentDeathSignal(); err != nil { + return fmt.Errorf("parent death signal %s", err) + } if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { return fmt.Errorf("setup mount namespace %s", err) } From abd1f8da60e173e192f51b9cee36fb9ed084ffee Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 6 Mar 2014 16:32:06 -0800 Subject: [PATCH 113/117] Revert "libcontainer: Use pivot_root instead of chroot" This reverts commit 5b5c884cc8266d0c2a56da0bc2df14cc9d5d85e8. 
Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/mount.go | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 69d85d6..a97a379 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -5,7 +5,6 @@ package nsinit import ( "fmt" "github.com/dotcloud/docker/pkg/system" - "io/ioutil" "os" "path/filepath" "syscall" @@ -51,29 +50,16 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := system.Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) } - - pivotDir, err := ioutil.TempDir(rootfs, ".pivot_root") - if err != nil { - return fmt.Errorf("can't create pivot_root dir %s", pivotDir, err) + if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { + return fmt.Errorf("mount move %s into / %s", rootfs, err) } - if err := system.Pivotroot(rootfs, pivotDir); err != nil { - return fmt.Errorf("pivot_root %s", err) + if err := system.Chroot("."); err != nil { + return fmt.Errorf("chroot . %s", err) } if err := system.Chdir("/"); err != nil { return fmt.Errorf("chdir / %s", err) } - // path to pivot dir now changed, update - pivotDir = filepath.Join("/", filepath.Base(pivotDir)) - - if err := system.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { - return fmt.Errorf("unmount pivot_root dir %s", err) - } - - if err := os.Remove(pivotDir); err != nil { - return fmt.Errorf("remove pivot_root dir %s", err) - } - system.Umask(0022) return nil From 31c7d134664e7c0df69bc9f3fb47d6c6d3f63f3f Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 6 Mar 2014 16:41:03 -0800 Subject: [PATCH 114/117] Revert "libcontainer: Use MS_PRIVATE instead of MS_SLAVE" This reverts commit 757b5775725fb90262cee1fa6068fa9dcbbff59f. 
Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/mount.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index a97a379..0506b99 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -20,7 +20,7 @@ const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NOD // is no longer in use, the mounts will be removed automatically func setupNewMountNamespace(rootfs, console string, readonly bool) error { // mount as slave so that the new mounts do not propagate to the host - if err := system.Mount("", "/", "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil { + if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting / as slave %s", err) } if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { From 57e7760c26bb0ab0f1bac732a9cbbf8c39f0ec07 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 6 Mar 2014 17:19:47 -0800 Subject: [PATCH 115/117] Revert "Revert "libcontainer: Use MS_PRIVATE instead of MS_SLAVE"" This reverts commit bd263f5b15b51747e3429179fef7fcb425ccbe4a. 
Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/mount.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 0506b99..a97a379 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -20,7 +20,7 @@ const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NOD // is no longer in use, the mounts will be removed automatically func setupNewMountNamespace(rootfs, console string, readonly bool) error { // mount as slave so that the new mounts do not propagate to the host - if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + if err := system.Mount("", "/", "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting / as slave %s", err) } if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { From ca93316795d6887094d5e53e07208cc4ad235cbc Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 6 Mar 2014 17:19:59 -0800 Subject: [PATCH 116/117] Revert "Revert "libcontainer: Use pivot_root instead of chroot"" This reverts commit 82f797f14096430c3edbace1cd30e04a483ec41f. 
Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/nsinit/mount.go | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index a97a379..69d85d6 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -5,6 +5,7 @@ package nsinit import ( "fmt" "github.com/dotcloud/docker/pkg/system" + "io/ioutil" "os" "path/filepath" "syscall" @@ -50,16 +51,29 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := system.Chdir(rootfs); err != nil { return fmt.Errorf("chdir into %s %s", rootfs, err) } - if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { - return fmt.Errorf("mount move %s into / %s", rootfs, err) + + pivotDir, err := ioutil.TempDir(rootfs, ".pivot_root") + if err != nil { + return fmt.Errorf("can't create pivot_root dir %s", pivotDir, err) } - if err := system.Chroot("."); err != nil { - return fmt.Errorf("chroot . %s", err) + if err := system.Pivotroot(rootfs, pivotDir); err != nil { + return fmt.Errorf("pivot_root %s", err) } if err := system.Chdir("/"); err != nil { return fmt.Errorf("chdir / %s", err) } + // path to pivot dir now changed, update + pivotDir = filepath.Join("/", filepath.Base(pivotDir)) + + if err := system.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { + return fmt.Errorf("unmount pivot_root dir %s", err) + } + + if err := os.Remove(pivotDir); err != nil { + return fmt.Errorf("remove pivot_root dir %s", err) + } + system.Umask(0022) return nil From da605a43d62ec827443185a4c5b94c283ba85001 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 6 Mar 2014 19:30:52 -0800 Subject: [PATCH 117/117] Add env var to toggle pivot root or ms_move Use the DOCKER_RAMDISK env var to tell the native driver not to use a pivot root when setting up the rootfs of a container. 
Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- libcontainer/container.go | 23 ++++++++++--------- libcontainer/nsinit/init.go | 2 +- libcontainer/nsinit/mount.go | 44 +++++++++++++++++++++++++++++------- 3 files changed, 49 insertions(+), 20 deletions(-) diff --git a/libcontainer/container.go b/libcontainer/container.go index bd16825..a777da5 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -11,17 +11,18 @@ type Context map[string]string // Container defines configuration options for how a // container is setup inside a directory and how a process should be executed type Container struct { - Hostname string `json:"hostname,omitempty"` // hostname - ReadonlyFs bool `json:"readonly_fs,omitempty"` // set the containers rootfs as readonly - User string `json:"user,omitempty"` // user to execute the process as - WorkingDir string `json:"working_dir,omitempty"` // current working directory - Env []string `json:"environment,omitempty"` // environment to set - Tty bool `json:"tty,omitempty"` // setup a proper tty or not - Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply - Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop - Networks []*Network `json:"networks,omitempty"` // nil for host's network stack - Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"` // cgroups - Context Context `json:"context,omitempty"` // generic context for specific options (apparmor, selinux) + Hostname string `json:"hostname,omitempty"` // hostname + ReadonlyFs bool `json:"readonly_fs,omitempty"` // set the containers rootfs as readonly + NoPivotRoot bool `json:"no_pivot_root,omitempty"` // this can be enabled if you are running in ramdisk + User string `json:"user,omitempty"` // user to execute the process as + WorkingDir string `json:"working_dir,omitempty"` // current working directory + Env []string `json:"environment,omitempty"` // environment to set + Tty bool `json:"tty,omitempty"` 
// setup a proper tty or not + Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply + Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop + Networks []*Network `json:"networks,omitempty"` // nil for host's network stack + Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"` // cgroups + Context Context `json:"context,omitempty"` // generic context for specific options (apparmor, selinux) } // Network defines configuration for a container's networking stack diff --git a/libcontainer/nsinit/init.go b/libcontainer/nsinit/init.go index 8d3f908..336fc1e 100644 --- a/libcontainer/nsinit/init.go +++ b/libcontainer/nsinit/init.go @@ -51,7 +51,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol if err := system.ParentDeathSignal(); err != nil { return fmt.Errorf("parent death signal %s", err) } - if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { + if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs, container.NoPivotRoot); err != nil { return fmt.Errorf("setup mount namespace %s", err) } if err := setupNetwork(container, context); err != nil { diff --git a/libcontainer/nsinit/mount.go b/libcontainer/nsinit/mount.go index 69d85d6..83577cf 100644 --- a/libcontainer/nsinit/mount.go +++ b/libcontainer/nsinit/mount.go @@ -19,9 +19,12 @@ const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NOD // // There is no need to unmount the new mounts because as soon as the mount namespace // is no longer in use, the mounts will be removed automatically -func setupNewMountNamespace(rootfs, console string, readonly bool) error { - // mount as slave so that the new mounts do not propagate to the host - if err := system.Mount("", "/", "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil { +func setupNewMountNamespace(rootfs, console string, readonly, noPivotRoot bool) error { + flag := syscall.MS_PRIVATE + if noPivotRoot { + flag 
= syscall.MS_SLAVE + } + if err := system.Mount("", "/", "", uintptr(flag|syscall.MS_REC), ""); err != nil { return fmt.Errorf("mounting / as slave %s", err) } if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { @@ -52,6 +55,23 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { return fmt.Errorf("chdir into %s %s", rootfs, err) } + if noPivotRoot { + if err := rootMsMove(rootfs); err != nil { + return err + } + } else { + if err := rootPivot(rootfs); err != nil { + return err + } + } + + system.Umask(0022) + + return nil +} + +// use a pivot root to setup the rootfs +func rootPivot(rootfs string) error { pivotDir, err := ioutil.TempDir(rootfs, ".pivot_root") if err != nil { return fmt.Errorf("can't create pivot_root dir %s", pivotDir, err) @@ -62,20 +82,28 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { if err := system.Chdir("/"); err != nil { return fmt.Errorf("chdir / %s", err) } - // path to pivot dir now changed, update pivotDir = filepath.Join("/", filepath.Base(pivotDir)) - if err := system.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { return fmt.Errorf("unmount pivot_root dir %s", err) } - if err := os.Remove(pivotDir); err != nil { return fmt.Errorf("remove pivot_root dir %s", err) } + return nil +} - system.Umask(0022) - +// use MS_MOVE and chroot to setup the rootfs +func rootMsMove(rootfs string) error { + if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { + return fmt.Errorf("mount move %s into / %s", rootfs, err) + } + if err := system.Chroot("."); err != nil { + return fmt.Errorf("chroot . %s", err) + } + if err := system.Chdir("/"); err != nil { + return fmt.Errorf("chdir / %s", err) + } return nil }