Merge pull request #55 from stephen679/refactor_tar

tar: refactoring and resolve race condition
This commit is contained in:
Vincent Batts 2016-08-10 13:31:53 -04:00 committed by GitHub
commit 71abbd944f
22 changed files with 354 additions and 140 deletions

View file

@ -42,7 +42,6 @@ func Check(root string, dh *DirectoryHierarchy, keywords []string) (*Result, err
return nil, err return nil, err
} }
sort.Sort(byPos(creator.DH.Entries)) sort.Sort(byPos(creator.DH.Entries))
var result Result var result Result
for i, e := range creator.DH.Entries { for i, e := range creator.DH.Entries {
switch e.Type { switch e.Type {
@ -118,6 +117,9 @@ func TarCheck(tarDH, dh *DirectoryHierarchy, keywords []string) (*Result, error)
break break
} }
} }
if tarRoot == nil {
return nil, fmt.Errorf("root of archive could not be found")
}
tarRoot.Next = &Entry{ tarRoot.Next = &Entry{
Name: "seen", Name: "seen",
Type: CommentType, Type: CommentType,

244
tar.go
View file

@ -2,6 +2,7 @@ package mtree
import ( import (
"archive/tar" "archive/tar"
"fmt"
"io" "io"
"io/ioutil" "io/ioutil"
"os" "os"
@ -29,11 +30,13 @@ func NewTarStreamer(r io.Reader, keywords []string) Streamer {
tarReader: tar.NewReader(pR), tarReader: tar.NewReader(pR),
keywords: keywords, keywords: keywords,
} }
go ts.readHeaders() // I don't like this
go ts.readHeaders()
return ts return ts
} }
type tarStream struct { type tarStream struct {
root *Entry
creator dhCreator creator dhCreator
pipeReader *io.PipeReader pipeReader *io.PipeReader
pipeWriter *io.PipeWriter pipeWriter *io.PipeWriter
@ -44,20 +47,31 @@ type tarStream struct {
} }
func (ts *tarStream) readHeaders() { func (ts *tarStream) readHeaders() {
// remove "time" keyword
notimekws := []string{}
for _, kw := range ts.keywords {
if !inSlice(kw, notimekws) {
if kw != "time" {
notimekws = append(notimekws, kw)
} else {
if !inSlice("tar_time", ts.keywords) {
notimekws = append(notimekws, "tar_time")
}
}
}
}
ts.keywords = notimekws
// We have to start with the directory we're in, and anything beyond these // We have to start with the directory we're in, and anything beyond these
// items is determined at the time a tar is extracted. // items is determined at the time a tar is extracted.
rootComment := Entry{ ts.root = &Entry{
Raw: "# .",
Type: CommentType,
}
root := Entry{
Name: ".", Name: ".",
Type: RelativeType, Type: RelativeType,
Prev: &rootComment, Prev: &Entry{
Set: &Entry{ Raw: "# .",
Name: "meta-set", Type: CommentType,
Type: SpecialType,
}, },
Set: nil,
Keywords: []string{"type=dir"},
} }
metadataEntries := signatureEntries("<user specified tar archive>") metadataEntries := signatureEntries("<user specified tar archive>")
for _, e := range metadataEntries { for _, e := range metadataEntries {
@ -67,11 +81,9 @@ func (ts *tarStream) readHeaders() {
for { for {
hdr, err := ts.tarReader.Next() hdr, err := ts.tarReader.Next()
if err != nil { if err != nil {
flatten(&root, ts)
ts.pipeReader.CloseWithError(err) ts.pipeReader.CloseWithError(err)
return return
} }
// Because the content of the file may need to be read by several // Because the content of the file may need to be read by several
// KeywordFuncs, it needs to be an io.Seeker as well. So, just reading from // KeywordFuncs, it needs to be an io.Seeker as well. So, just reading from
// ts.tarReader is not enough. // ts.tarReader is not enough.
@ -108,19 +120,13 @@ func (ts *tarStream) readHeaders() {
// now collect keywords on the file // now collect keywords on the file
for _, keyword := range ts.keywords { for _, keyword := range ts.keywords {
if keyword == "time" {
keyword = "tar_time"
}
if keyFunc, ok := KeywordFuncs[keyword]; ok { if keyFunc, ok := KeywordFuncs[keyword]; ok {
// We can't extract directories on to disk, so "size" keyword // We can't extract directories on to disk, so "size" keyword
// is irrelevant for now // is irrelevant for now
if hdr.FileInfo().IsDir() && keyword == "size" { if hdr.FileInfo().IsDir() && keyword == "size" {
continue continue
} }
// TODO: handle hardlinks
if string(hdr.Typeflag) == string('1') {
// TODO: get number of hardlinks for a file
}
val, err := keyFunc(hdr.Name, hdr.FileInfo(), tmpFile) val, err := keyFunc(hdr.Name, hdr.FileInfo(), tmpFile)
if err != nil { if err != nil {
ts.setErr(err) ts.setErr(err)
@ -147,9 +153,6 @@ func (ts *tarStream) readHeaders() {
Type: SpecialType, Type: SpecialType,
} }
for _, setKW := range SetKeywords { for _, setKW := range SetKeywords {
if setKW == "time" {
setKW = "tar_time"
}
if keyFunc, ok := KeywordFuncs[setKW]; ok { if keyFunc, ok := KeywordFuncs[setKW]; ok {
val, err := keyFunc(hdr.Name, hdr.FileInfo(), tmpFile) val, err := keyFunc(hdr.Name, hdr.FileInfo(), tmpFile)
if err != nil { if err != nil {
@ -165,73 +168,73 @@ func (ts *tarStream) readHeaders() {
} }
} }
} }
if filepath.Dir(filepath.Clean(hdr.Name)) == "." {
root.Set = &s
} else {
e.Set = &s e.Set = &s
} }
err = populateTree(ts.root, &e, hdr)
if err != nil {
ts.setErr(err)
} }
populateTree(&root, &e, hdr, ts)
tmpFile.Close() tmpFile.Close()
os.Remove(tmpFile.Name()) os.Remove(tmpFile.Name())
} }
} }
type relationship int // populateTree creates a pseudo file tree hierarchy using an Entry's Parent and
const (
unknownDir relationship = iota
sameDir
childDir
parentDir
)
// populateTree creates a file tree hierarchy using an Entry's Parent and
// Children fields. When examining the Entry e to insert in the tree, we // Children fields. When examining the Entry e to insert in the tree, we
// determine if the path to that Entry exists yet. If it does, insert it in the // determine if the path to that Entry exists yet. If it does, insert it in the
// appropriate position in the tree. If not, create a path with "placeholder" // appropriate position in the tree. If not, create a path up until the Entry's
// directories, and then insert the Entry. populateTree does not consider // directory that it is contained in. Then, insert the Entry.
// symbolic links yet. // root: the "." Entry
func populateTree(root, e *Entry, hdr *tar.Header, ts *tarStream) { // e: the Entry we are looking to insert
// hdr: the tar header struct associated with e
func populateTree(root, e *Entry, hdr *tar.Header) error {
if root == nil || e == nil {
return fmt.Errorf("cannot populate or insert nil Entry's")
} else if root.Prev == nil {
return fmt.Errorf("root needs to be an Entry associated with a directory")
}
isDir := hdr.FileInfo().IsDir() isDir := hdr.FileInfo().IsDir()
wd := filepath.Clean(hdr.Name) wd := filepath.Clean(hdr.Name)
if !isDir { if !isDir {
// If entry is a file, we only want the directory it's in. // directory up until the actual file
wd = filepath.Dir(wd) wd = filepath.Dir(wd)
} if wd == "." {
if filepath.Dir(wd) == "." { // If file in root directory, no need to traverse down, just append
if isDir {
root.Keywords = e.Keywords
} else {
root.Children = append([]*Entry{e}, root.Children...) root.Children = append([]*Entry{e}, root.Children...)
e.Parent = root e.Parent = root
return nil
} }
return
} }
// TODO: what about directory/file names with "/" in it?
dirNames := strings.Split(wd, "/") dirNames := strings.Split(wd, "/")
parent := root parent := root
for _, name := range dirNames[1:] { for _, name := range dirNames[:] {
if node := parent.Descend(name); node == nil { encoded, err := Vis(name)
// Entry for directory doesn't exist in tree relative to root
var newEntry *Entry
if isDir {
newEntry = e
} else {
encodedName, err := Vis(name)
if err != nil { if err != nil {
ts.setErr(err) return err
return
} }
newEntry = &Entry{ if node := parent.Descend(encoded); node == nil {
Name: encodedName, // Entry for directory doesn't exist in tree relative to root.
// We don't know if this directory is an actual tar header (because a
// user could have just specified a path to a deep file), so we must
// specify this placeholder directory as a "type=dir", and Set=nil.
newEntry := Entry{
Name: encoded,
Type: RelativeType, Type: RelativeType,
Parent: parent,
Keywords: []string{"type=dir"}, // temp data
Set: nil, // temp data
} }
pathname, err := newEntry.Path()
if err != nil {
return err
} }
newEntry.Parent = parent newEntry.Prev = &Entry{
parent.Children = append(parent.Children, newEntry) Type: CommentType,
parent = newEntry Raw: "# " + pathname,
}
parent.Children = append(parent.Children, &newEntry)
parent = &newEntry
} else { } else {
// Entry for directory exists in tree, just keep going // Entry for directory exists in tree, just keep going
parent = node parent = node
@ -241,98 +244,107 @@ func populateTree(root, e *Entry, hdr *tar.Header, ts *tarStream) {
parent.Children = append([]*Entry{e}, parent.Children...) parent.Children = append([]*Entry{e}, parent.Children...)
e.Parent = parent e.Parent = parent
} else { } else {
commentpath, err := e.Path() // fill in the actual data from e
if err != nil { parent.Keywords = e.Keywords
ts.setErr(err) parent.Set = e.Set
}
return nil
}
// After constructing a pseudo file hierarchy tree, we want to "flatten" this
// tree by putting the Entries into a slice with appropriate positioning.
// root: the "head" of the sub-tree to flatten
// creator: a dhCreator that helps with the '/set' keyword
// keywords: keywords specified by the user that should be evaluated
func flatten(root *Entry, creator *dhCreator, keywords []string) {
if root == nil || creator == nil {
return return
} }
commentEntry := Entry{
Raw: "# " + commentpath,
Type: CommentType,
}
e.Prev = &commentEntry
}
}
// After constructing the tree from the tar stream, we want to "flatten" this
// tree by appending Entry's into ts.creator.DH.Entries in an appropriate
// manner to simplify writing the output with ts.creator.DH.WriteTo
// root: the "head" of the sub-tree to flatten
// ts : tarStream to keep track of Entry's
func flatten(root *Entry, ts *tarStream) {
if root.Prev != nil { if root.Prev != nil {
// root.Prev != nil implies root is a directory // root.Prev != nil implies root is a directory
ts.creator.DH.Entries = append(ts.creator.DH.Entries, creator.DH.Entries = append(creator.DH.Entries,
Entry{ Entry{
Type: BlankType, Type: BlankType,
Pos: len(ts.creator.DH.Entries), Pos: len(creator.DH.Entries),
}) })
root.Prev.Pos = len(ts.creator.DH.Entries) root.Prev.Pos = len(creator.DH.Entries)
ts.creator.DH.Entries = append(ts.creator.DH.Entries, *root.Prev) creator.DH.Entries = append(creator.DH.Entries, *root.Prev)
if root.Set != nil {
// Check if we need a new set // Check if we need a new set
if ts.creator.curSet == nil { if creator.curSet == nil {
ts.creator.curSet = &Entry{ creator.curSet = &Entry{
Type: SpecialType, Type: SpecialType,
Name: "/set", Name: "/set",
Keywords: keywordSelector(append(tarDefaultSetKeywords, root.Set.Keywords...), ts.keywords), Keywords: keywordSelector(append(tarDefaultSetKeywords, root.Set.Keywords...), keywords),
Pos: len(ts.creator.DH.Entries), Pos: len(creator.DH.Entries),
} }
ts.creator.DH.Entries = append(ts.creator.DH.Entries, *ts.creator.curSet) creator.DH.Entries = append(creator.DH.Entries, *creator.curSet)
} else { } else {
needNewSet := false needNewSet := false
for _, k := range root.Set.Keywords { for _, k := range root.Set.Keywords {
if !inSlice(k, ts.creator.curSet.Keywords) { if !inSlice(k, creator.curSet.Keywords) {
needNewSet = true needNewSet = true
break break
} }
} }
if needNewSet { if needNewSet {
ts.creator.curSet = &Entry{ creator.curSet = &Entry{
Name: "/set", Name: "/set",
Type: SpecialType, Type: SpecialType,
Pos: len(ts.creator.DH.Entries), Pos: len(creator.DH.Entries),
Keywords: keywordSelector(append(tarDefaultSetKeywords, root.Set.Keywords...), ts.keywords), Keywords: keywordSelector(append(tarDefaultSetKeywords, root.Set.Keywords...), keywords),
} }
ts.creator.DH.Entries = append(ts.creator.DH.Entries, *ts.creator.curSet) creator.DH.Entries = append(creator.DH.Entries, *creator.curSet)
} }
} }
} else if creator.curSet != nil {
// Getting into here implies that the Entry's set has not and
// was not supposed to be evaluated, thus, we need to reset curSet
creator.DH.Entries = append(creator.DH.Entries, Entry{
Name: "/unset",
Type: SpecialType,
Pos: len(creator.DH.Entries),
})
creator.curSet = nil
} }
root.Set = ts.creator.curSet }
root.Keywords = setDifference(root.Keywords, ts.creator.curSet.Keywords) root.Set = creator.curSet
root.Pos = len(ts.creator.DH.Entries) if creator.curSet != nil {
ts.creator.DH.Entries = append(ts.creator.DH.Entries, *root) root.Keywords = setDifference(root.Keywords, creator.curSet.Keywords)
}
root.Pos = len(creator.DH.Entries)
creator.DH.Entries = append(creator.DH.Entries, *root)
for _, c := range root.Children { for _, c := range root.Children {
flatten(c, ts) flatten(c, creator, keywords)
} }
if root.Prev != nil { if root.Prev != nil {
// Show a comment when stepping out // Show a comment when stepping out
root.Prev.Pos = len(ts.creator.DH.Entries) root.Prev.Pos = len(creator.DH.Entries)
ts.creator.DH.Entries = append(ts.creator.DH.Entries, *root.Prev) creator.DH.Entries = append(creator.DH.Entries, *root.Prev)
dotEntry := Entry{ dotEntry := Entry{
Type: DotDotType, Type: DotDotType,
Name: "..", Name: "..",
Pos: len(ts.creator.DH.Entries), Pos: len(creator.DH.Entries),
} }
ts.creator.DH.Entries = append(ts.creator.DH.Entries, dotEntry) creator.DH.Entries = append(creator.DH.Entries, dotEntry)
} }
return
} }
// filter takes in a pointer to an Entry, and returns a slice of Entry's that // filter takes in a pointer to an Entry, and returns a slice of Entry's that
// satisfy the predicate p // satisfy the predicate p
func filter(root *Entry, p func(*Entry) bool) []Entry { func filter(root *Entry, p func(*Entry) bool) []Entry {
if root != nil {
var validEntrys []Entry var validEntrys []Entry
if len(root.Children) > 0 || root.Prev != nil { if len(root.Children) > 0 || root.Prev != nil {
for _, c := range root.Children { for _, c := range root.Children {
// if an Entry is a directory, filter the directory // filter the sub-directory
if c.Prev != nil { if c.Prev != nil {
validEntrys = append(validEntrys, filter(c, p)...) validEntrys = append(validEntrys, filter(c, p)...)
} }
if p(c) { if p(c) {
if c.Prev == nil { if c.Prev == nil {
// prepend directories
validEntrys = append([]Entry{*c}, validEntrys...) validEntrys = append([]Entry{*c}, validEntrys...)
} else { } else {
validEntrys = append(validEntrys, *c) validEntrys = append(validEntrys, *c)
@ -341,6 +353,7 @@ func filter(root *Entry, p func(*Entry) bool) []Entry {
} }
return validEntrys return validEntrys
} }
}
return nil return nil
} }
@ -357,6 +370,15 @@ func setDifference(this, that []string) []string {
return diff return diff
} }
type relationship int
const (
unknownDir relationship = iota
sameDir
childDir
parentDir
)
func compareDir(curDir, prevDir string) relationship { func compareDir(curDir, prevDir string) relationship {
curDir = filepath.Clean(curDir) curDir = filepath.Clean(curDir)
prevDir = filepath.Clean(prevDir) prevDir = filepath.Clean(prevDir)
@ -384,9 +406,15 @@ func (ts *tarStream) Close() error {
return ts.pipeReader.Close() return ts.pipeReader.Close()
} }
// Hierarchy returns the DirectoryHierarchy of the archive. It flattens the
// Entry tree before returning the DirectoryHierarchy
func (ts *tarStream) Hierarchy() (*DirectoryHierarchy, error) { func (ts *tarStream) Hierarchy() (*DirectoryHierarchy, error) {
if ts.err != nil && ts.err != io.EOF { if ts.err != nil && ts.err != io.EOF {
return nil, ts.err return nil, ts.err
} }
if ts.root == nil {
return nil, fmt.Errorf("root Entry not found, nothing to flatten")
}
flatten(ts.root, &ts.creator, ts.keywords)
return ts.creator.DH, nil return ts.creator.DH, nil
} }

View file

@ -142,6 +142,184 @@ func TestTar(t *testing.T) {
} }
} }
// This test checks how gomtree handles archives that were created
// with multiple directories, i.e, archives created with something like:
// `tar -cvf some.tar dir1 dir2 dir3 dir4/dir5 dir6` ... etc.
// The testdata of collection.tar resemble such an archive. the `collection` folder
// is the contents of `collection.tar` extracted
func TestArchiveCreation(t *testing.T) {
fh, err := os.Open("./testdata/collection.tar")
if err != nil {
t.Fatal(err)
}
str := NewTarStreamer(fh, []string{"sha1"})
if _, err := io.Copy(ioutil.Discard, str); err != nil && err != io.EOF {
t.Fatal(err)
}
if err := str.Close(); err != nil {
t.Fatal(err)
}
defer fh.Close()
// get DirectoryHierarcy struct from walking the tar archive
tdh, err := str.Hierarchy()
if err != nil {
t.Fatal(err)
}
// Test the tar manifest against the actual directory
res, err := Check("./testdata/collection", tdh, []string{"sha1"})
if err != nil {
t.Fatal(err)
}
if res != nil {
for _, f := range res.Failures {
t.Errorf(f.String())
}
for _, e := range res.Extra {
t.Errorf("%s extra not expected", e.Name)
}
for _, m := range res.Missing {
t.Errorf("%s missing not expected", m.Name)
}
}
// Test the tar manifest against itself
res, err = TarCheck(tdh, tdh, []string{"sha1"})
if err != nil {
t.Fatal(err)
}
if res != nil {
for _, f := range res.Failures {
t.Errorf(f.String())
}
for _, e := range res.Extra {
t.Errorf("%s extra not expected", e.Name)
}
for _, m := range res.Missing {
t.Errorf("%s missing not expected", m.Name)
}
}
// Validate the directory manifest against the archive
dh, err := Walk("./testdata/collection", nil, []string{"sha1"})
if err != nil {
t.Fatal(err)
}
res, err = TarCheck(tdh, dh, []string{"sha1"})
if err != nil {
t.Fatal(err)
}
if res != nil {
for _, f := range res.Failures {
t.Errorf(f.String())
}
for _, e := range res.Extra {
t.Errorf("%s extra not expected", e.Name)
}
for _, m := range res.Missing {
t.Errorf("%s missing not expected", m.Name)
}
}
}
// Now test a tar file that was created with just the path to a file. In this
// test case, the traversal and creation of "placeholder" directories are
// evaluated. Also, The fact that this archive contains a single entry, yet the
// entry is associated with a file that has parent directories, means that the
// "." directory should be the lowest sub-directory under which `file` is contained.
func TestTreeTraversal(t *testing.T) {
fh, err := os.Open("./testdata/traversal.tar")
if err != nil {
t.Fatal(err)
}
str := NewTarStreamer(fh, DefaultTarKeywords)
if _, err = io.Copy(ioutil.Discard, str); err != nil && err != io.EOF {
t.Fatal(err)
}
if err = str.Close(); err != nil {
t.Fatal(err)
}
fh.Close()
tdh, err := str.Hierarchy()
if err != nil {
t.Fatal(err)
}
res, err := TarCheck(tdh, tdh, []string{"sha1"})
if err != nil {
t.Fatal(err)
}
if res != nil {
for _, f := range res.Failures {
t.Errorf(f.String())
}
for _, e := range res.Extra {
t.Errorf("%s extra not expected", e.Name)
}
for _, m := range res.Missing {
t.Errorf("%s missing not expected", m.Name)
}
}
// top-level "." directory will contain contents of traversal.tar
res, err = Check("./testdata/.", tdh, []string{"sha1"})
if err != nil {
t.Fatal(err)
}
if res != nil {
for _, f := range res.Failures {
t.Errorf(f.String())
}
for _, e := range res.Extra {
t.Errorf("%s extra not expected", e.Name)
}
for _, m := range res.Missing {
t.Errorf("%s missing not expected", m.Name)
}
}
// Now test an archive that requires placeholder directories, i.e, there are
// no headers in the archive that are associated with the actual directory name
fh, err = os.Open("./testdata/singlefile.tar")
if err != nil {
t.Fatal(err)
}
str = NewTarStreamer(fh, DefaultTarKeywords)
if _, err = io.Copy(ioutil.Discard, str); err != nil && err != io.EOF {
t.Fatal(err)
}
if err = str.Close(); err != nil {
t.Fatal(err)
}
tdh, err = str.Hierarchy()
if err != nil {
t.Fatal(err)
}
// Implied top-level "." directory will contain the contents of singlefile.tar
res, err = Check("./testdata/.", tdh, []string{"sha1"})
if err != nil {
t.Fatal(err)
}
if res != nil {
for _, f := range res.Failures {
t.Errorf(f.String())
}
for _, e := range res.Extra {
t.Errorf("%s extra not expected", e.Name)
}
for _, m := range res.Missing {
t.Errorf("%s missing not expected", m.Name)
}
}
}
// minimal tar archive stream that mimics what is in ./testdata/test.tar // minimal tar archive stream that mimics what is in ./testdata/test.tar
func makeTarStream() ([]byte, error) { func makeTarStream() ([]byte, error) {
buf := new(bytes.Buffer) buf := new(bytes.Buffer)

BIN
testdata/collection.tar vendored Normal file

Binary file not shown.

0
testdata/collection/dir1/file1 vendored Normal file
View file

0
testdata/collection/dir2/file2 vendored Normal file
View file

0
testdata/collection/dir3/file3 vendored Normal file
View file

0
testdata/collection/dir4/file4 vendored Normal file
View file

View file

1
testdata/collection/file1 vendored Normal file
View file

@ -0,0 +1 @@
im

1
testdata/collection/file2 vendored Normal file
View file

@ -0,0 +1 @@
hello

1
testdata/collection/file3 vendored Normal file
View file

@ -0,0 +1 @@
programming

BIN
testdata/singlefile.tar vendored Normal file

Binary file not shown.

0
testdata/singlefile/dir2/dir3/file vendored Normal file
View file

BIN
testdata/traversal.tar vendored Normal file

Binary file not shown.

View file

View file

View file

@ -0,0 +1 @@
hello

View file

@ -0,0 +1 @@
I'm

View file

@ -0,0 +1 @@
programming

0
testdata/traversal/dir2/dir3/file vendored Normal file
View file

0
testdata/traversal/dir2/dir3/file3 vendored Normal file
View file