diff --git a/check.go b/check.go index 5a40c4b..c7752b0 100644 --- a/check.go +++ b/check.go @@ -42,7 +42,6 @@ func Check(root string, dh *DirectoryHierarchy, keywords []string) (*Result, err return nil, err } sort.Sort(byPos(creator.DH.Entries)) - var result Result for i, e := range creator.DH.Entries { switch e.Type { @@ -118,6 +117,9 @@ func TarCheck(tarDH, dh *DirectoryHierarchy, keywords []string) (*Result, error) break } } + if tarRoot == nil { + return nil, fmt.Errorf("root of archive could not be found") + } tarRoot.Next = &Entry{ Name: "seen", Type: CommentType, diff --git a/tar.go b/tar.go index 9ea759d..bcab9c8 100644 --- a/tar.go +++ b/tar.go @@ -2,6 +2,7 @@ package mtree import ( "archive/tar" + "fmt" "io" "io/ioutil" "os" @@ -29,11 +30,13 @@ func NewTarStreamer(r io.Reader, keywords []string) Streamer { tarReader: tar.NewReader(pR), keywords: keywords, } - go ts.readHeaders() // I don't like this + + go ts.readHeaders() return ts } type tarStream struct { + root *Entry creator dhCreator pipeReader *io.PipeReader pipeWriter *io.PipeWriter @@ -44,20 +47,31 @@ type tarStream struct { } func (ts *tarStream) readHeaders() { + // remove "time" keyword + notimekws := []string{} + for _, kw := range ts.keywords { + if !inSlice(kw, notimekws) { + if kw != "time" { + notimekws = append(notimekws, kw) + } else { + if !inSlice("tar_time", ts.keywords) { + notimekws = append(notimekws, "tar_time") + } + } + } + } + ts.keywords = notimekws // We have to start with the directory we're in, and anything beyond these // items is determined at the time a tar is extracted. - rootComment := Entry{ - Raw: "# .", - Type: CommentType, - } - root := Entry{ + ts.root = &Entry{ Name: ".", Type: RelativeType, - Prev: &rootComment, - Set: &Entry{ - Name: "meta-set", - Type: SpecialType, + Prev: &Entry{ + Raw: "# .", + Type: CommentType, }, + Set: nil, + Keywords: []string{"type=dir"}, } metadataEntries := signatureEntries("") for _, e := range metadataEntries { @@ -67,11 +81,9 @@ func (ts *tarStream) readHeaders() { for { hdr, err := ts.tarReader.Next() if err != nil { - flatten(&root, ts) ts.pipeReader.CloseWithError(err) return } - // Because the content of the file may need to be read by several // KeywordFuncs, it needs to be an io.Seeker as well. So, just reading from // ts.tarReader is not enough. @@ -108,19 +120,13 @@ func (ts *tarStream) readHeaders() { // now collect keywords on the file for _, keyword := range ts.keywords { - if keyword == "time" { - keyword = "tar_time" - } if keyFunc, ok := KeywordFuncs[keyword]; ok { // We can't extract directories on to disk, so "size" keyword // is irrelevant for now if hdr.FileInfo().IsDir() && keyword == "size" { continue } - - if string(hdr.Typeflag) == string('1') { - // TODO: get number of hardlinks for a file - } + // TODO: handle hardlinks val, err := keyFunc(hdr.Name, hdr.FileInfo(), tmpFile) if err != nil { ts.setErr(err) @@ -147,9 +153,6 @@ func (ts *tarStream) readHeaders() { Type: SpecialType, } for _, setKW := range SetKeywords { - if setKW == "time" { - setKW = "tar_time" - } if keyFunc, ok := KeywordFuncs[setKW]; ok { val, err := keyFunc(hdr.Name, hdr.FileInfo(), tmpFile) if err != nil { @@ -165,73 +168,73 @@ func (ts *tarStream) readHeaders() { } } } - if filepath.Dir(filepath.Clean(hdr.Name)) == "." { - root.Set = &s - } else { - e.Set = &s - } + e.Set = &s + } + err = populateTree(ts.root, &e, hdr) + if err != nil { + ts.setErr(err) } - populateTree(&root, &e, hdr, ts) tmpFile.Close() os.Remove(tmpFile.Name()) } } -type relationship int - -const ( - unknownDir relationship = iota - sameDir - childDir - parentDir -) - -// populateTree creates a file tree hierarchy using an Entry's Parent and +// populateTree creates a pseudo file tree hierarchy using an Entry's Parent and // Children fields. When examining the Entry e to insert in the tree, we // determine if the path to that Entry exists yet. If it does, insert it in the -// appropriate position in the tree. If not, create a path with "placeholder" -// directories, and then insert the Entry. populateTree does not consider -// symbolic links yet. -func populateTree(root, e *Entry, hdr *tar.Header, ts *tarStream) { +// appropriate position in the tree. If not, create a path up until the Entry's +// directory that it is contained in. Then, insert the Entry. +// root: the "." Entry +// e: the Entry we are looking to insert +// hdr: the tar header struct associated with e +func populateTree(root, e *Entry, hdr *tar.Header) error { + if root == nil || e == nil { + return fmt.Errorf("cannot populate or insert nil Entry's") + } else if root.Prev == nil { + return fmt.Errorf("root needs to be an Entry associated with a directory") + } isDir := hdr.FileInfo().IsDir() wd := filepath.Clean(hdr.Name) - if !isDir { - // If entry is a file, we only want the directory it's in. + // directory up until the actual file wd = filepath.Dir(wd) - } - if filepath.Dir(wd) == "." { - if isDir { - root.Keywords = e.Keywords - } else { + if wd == "." { + // If file in root directory, no need to traverse down, just append root.Children = append([]*Entry{e}, root.Children...) e.Parent = root + return nil } - return } - + // TODO: what about directory/file names with "/" in it? dirNames := strings.Split(wd, "/") parent := root - for _, name := range dirNames[1:] { - if node := parent.Descend(name); node == nil { - // Entry for directory doesn't exist in tree relative to root - var newEntry *Entry - if isDir { - newEntry = e - } else { - encodedName, err := Vis(name) - if err != nil { - ts.setErr(err) - return - } - newEntry = &Entry{ - Name: encodedName, - Type: RelativeType, - } + for _, name := range dirNames[:] { + encoded, err := Vis(name) + if err != nil { + return err + } + if node := parent.Descend(encoded); node == nil { + // Entry for directory doesn't exist in tree relative to root. + // We don't know if this directory is an actual tar header (because a + // user could have just specified a path to a deep file), so we must + // specify this placeholder directory as a "type=dir", and Set=nil. + newEntry := Entry{ + Name: encoded, + Type: RelativeType, + Parent: parent, + Keywords: []string{"type=dir"}, // temp data + Set: nil, // temp data } - newEntry.Parent = parent - parent.Children = append(parent.Children, newEntry) - parent = newEntry + pathname, err := newEntry.Path() + if err != nil { + return err + } + newEntry.Prev = &Entry{ + Type: CommentType, + Raw: "# " + pathname, + } + parent.Children = append(parent.Children, &newEntry) + parent = &newEntry } else { // Entry for directory exists in tree, just keep going parent = node @@ -241,105 +244,115 @@ func populateTree(root, e *Entry, hdr *tar.Header, ts *tarStream) { parent.Children = append([]*Entry{e}, parent.Children...) e.Parent = parent } else { - commentpath, err := e.Path() - if err != nil { - ts.setErr(err) - return - } - commentEntry := Entry{ - Raw: "# " + commentpath, - Type: CommentType, - } - e.Prev = &commentEntry + // fill in the actual data from e + parent.Keywords = e.Keywords + parent.Set = e.Set } + return nil } -// After constructing the tree from the tar stream, we want to "flatten" this -// tree by appending Entry's into ts.creator.DH.Entries in an appropriate -// manner to simplify writing the output with ts.creator.DH.WriteTo -// root: the "head" of the sub-tree to flatten -// ts : tarStream to keep track of Entry's -func flatten(root *Entry, ts *tarStream) { +// After constructing a pseudo file hierarchy tree, we want to "flatten" this +// tree by putting the Entries into a slice with appropriate positioning. +// root: the "head" of the sub-tree to flatten +// creator: a dhCreator that helps with the '/set' keyword +// keywords: keywords specified by the user that should be evaluated +func flatten(root *Entry, creator *dhCreator, keywords []string) { + if root == nil || creator == nil { + return + } if root.Prev != nil { // root.Prev != nil implies root is a directory - ts.creator.DH.Entries = append(ts.creator.DH.Entries, + creator.DH.Entries = append(creator.DH.Entries, Entry{ Type: BlankType, - Pos: len(ts.creator.DH.Entries), + Pos: len(creator.DH.Entries), }) - root.Prev.Pos = len(ts.creator.DH.Entries) - ts.creator.DH.Entries = append(ts.creator.DH.Entries, *root.Prev) + root.Prev.Pos = len(creator.DH.Entries) + creator.DH.Entries = append(creator.DH.Entries, *root.Prev) - // Check if we need a new set - if ts.creator.curSet == nil { - ts.creator.curSet = &Entry{ - Type: SpecialType, - Name: "/set", - Keywords: keywordSelector(append(tarDefaultSetKeywords, root.Set.Keywords...), ts.keywords), - Pos: len(ts.creator.DH.Entries), - } - ts.creator.DH.Entries = append(ts.creator.DH.Entries, *ts.creator.curSet) - } else { - needNewSet := false - for _, k := range root.Set.Keywords { - if !inSlice(k, ts.creator.curSet.Keywords) { - needNewSet = true - break - } - } - if needNewSet { - ts.creator.curSet = &Entry{ - Name: "/set", + if root.Set != nil { + // Check if we need a new set + if creator.curSet == nil { + creator.curSet = &Entry{ Type: SpecialType, - Pos: len(ts.creator.DH.Entries), - Keywords: keywordSelector(append(tarDefaultSetKeywords, root.Set.Keywords...), ts.keywords), + Name: "/set", + Keywords: keywordSelector(append(tarDefaultSetKeywords, root.Set.Keywords...), keywords), + Pos: len(creator.DH.Entries), + } + creator.DH.Entries = append(creator.DH.Entries, *creator.curSet) + } else { + needNewSet := false + for _, k := range root.Set.Keywords { + if !inSlice(k, creator.curSet.Keywords) { + needNewSet = true + break + } + } + if needNewSet { + creator.curSet = &Entry{ + Name: "/set", + Type: SpecialType, + Pos: len(creator.DH.Entries), + Keywords: keywordSelector(append(tarDefaultSetKeywords, root.Set.Keywords...), keywords), + } + creator.DH.Entries = append(creator.DH.Entries, *creator.curSet) } - ts.creator.DH.Entries = append(ts.creator.DH.Entries, *ts.creator.curSet) } + } else if creator.curSet != nil { + // Getting into here implies that the Entry's set has not and + // was not supposed to be evaluated, thus, we need to reset curSet + creator.DH.Entries = append(creator.DH.Entries, Entry{ + Name: "/unset", + Type: SpecialType, + Pos: len(creator.DH.Entries), + }) + creator.curSet = nil } } - root.Set = ts.creator.curSet - root.Keywords = setDifference(root.Keywords, ts.creator.curSet.Keywords) - root.Pos = len(ts.creator.DH.Entries) - ts.creator.DH.Entries = append(ts.creator.DH.Entries, *root) - - for _, c := range root.Children { - flatten(c, ts) + root.Set = creator.curSet + if creator.curSet != nil { + root.Keywords = setDifference(root.Keywords, creator.curSet.Keywords) + } + root.Pos = len(creator.DH.Entries) + creator.DH.Entries = append(creator.DH.Entries, *root) + for _, c := range root.Children { + flatten(c, creator, keywords) } - if root.Prev != nil { // Show a comment when stepping out - root.Prev.Pos = len(ts.creator.DH.Entries) - ts.creator.DH.Entries = append(ts.creator.DH.Entries, *root.Prev) + root.Prev.Pos = len(creator.DH.Entries) + creator.DH.Entries = append(creator.DH.Entries, *root.Prev) dotEntry := Entry{ Type: DotDotType, Name: "..", - Pos: len(ts.creator.DH.Entries), + Pos: len(creator.DH.Entries), } - ts.creator.DH.Entries = append(ts.creator.DH.Entries, dotEntry) + creator.DH.Entries = append(creator.DH.Entries, dotEntry) } + return } // filter takes in a pointer to an Entry, and returns a slice of Entry's that // satisfy the predicate p func filter(root *Entry, p func(*Entry) bool) []Entry { - var validEntrys []Entry - if len(root.Children) > 0 || root.Prev != nil { - for _, c := range root.Children { - // if an Entry is a directory, filter the directory - if c.Prev != nil { - validEntrys = append(validEntrys, filter(c, p)...) - } - if p(c) { - if c.Prev == nil { - // prepend directories - validEntrys = append([]Entry{*c}, validEntrys...) - } else { - validEntrys = append(validEntrys, *c) + if root != nil { + var validEntrys []Entry + if len(root.Children) > 0 || root.Prev != nil { + for _, c := range root.Children { + // filter the sub-directory + if c.Prev != nil { + validEntrys = append(validEntrys, filter(c, p)...) + } + if p(c) { + if c.Prev == nil { + validEntrys = append([]Entry{*c}, validEntrys...) + } else { + validEntrys = append(validEntrys, *c) + } } } + return validEntrys } - return validEntrys } return nil } @@ -357,6 +370,15 @@ func setDifference(this, that []string) []string { return diff } +type relationship int + +const ( + unknownDir relationship = iota + sameDir + childDir + parentDir +) + func compareDir(curDir, prevDir string) relationship { curDir = filepath.Clean(curDir) prevDir = filepath.Clean(prevDir) @@ -384,9 +406,15 @@ func (ts *tarStream) Close() error { return ts.pipeReader.Close() } +// Hierarchy returns the DirectoryHierarchy of the archive. It flattens the +// Entry tree before returning the DirectoryHierarchy func (ts *tarStream) Hierarchy() (*DirectoryHierarchy, error) { if ts.err != nil && ts.err != io.EOF { return nil, ts.err } + if ts.root == nil { + return nil, fmt.Errorf("root Entry not found, nothing to flatten") + } + flatten(ts.root, &ts.creator, ts.keywords) return ts.creator.DH, nil } diff --git a/tar_test.go b/tar_test.go index 9f5ce87..b9e5171 100644 --- a/tar_test.go +++ b/tar_test.go @@ -142,6 +142,184 @@ func TestTar(t *testing.T) { } } +// This test checks how gomtree handles archives that were created +// with multiple directories, i.e, archives created with something like: +// `tar -cvf some.tar dir1 dir2 dir3 dir4/dir5 dir6` ... etc. +// The testdata of collection.tar resemble such an archive. the `collection` folder +// is the contents of `collection.tar` extracted +func TestArchiveCreation(t *testing.T) { + fh, err := os.Open("./testdata/collection.tar") + if err != nil { + t.Fatal(err) + } + str := NewTarStreamer(fh, []string{"sha1"}) + + if _, err := io.Copy(ioutil.Discard, str); err != nil && err != io.EOF { + t.Fatal(err) + } + if err := str.Close(); err != nil { + t.Fatal(err) + } + defer fh.Close() + + // get DirectoryHierarcy struct from walking the tar archive + tdh, err := str.Hierarchy() + if err != nil { + t.Fatal(err) + } + + // Test the tar manifest against the actual directory + res, err := Check("./testdata/collection", tdh, []string{"sha1"}) + if err != nil { + t.Fatal(err) + } + + if res != nil { + for _, f := range res.Failures { + t.Errorf(f.String()) + } + for _, e := range res.Extra { + t.Errorf("%s extra not expected", e.Name) + } + for _, m := range res.Missing { + t.Errorf("%s missing not expected", m.Name) + } + } + + // Test the tar manifest against itself + res, err = TarCheck(tdh, tdh, []string{"sha1"}) + if err != nil { + t.Fatal(err) + } + if res != nil { + for _, f := range res.Failures { + t.Errorf(f.String()) + } + for _, e := range res.Extra { + t.Errorf("%s extra not expected", e.Name) + } + for _, m := range res.Missing { + t.Errorf("%s missing not expected", m.Name) + } + } + + // Validate the directory manifest against the archive + dh, err := Walk("./testdata/collection", nil, []string{"sha1"}) + if err != nil { + t.Fatal(err) + } + res, err = TarCheck(tdh, dh, []string{"sha1"}) + if err != nil { + t.Fatal(err) + } + if res != nil { + for _, f := range res.Failures { + t.Errorf(f.String()) + } + for _, e := range res.Extra { + t.Errorf("%s extra not expected", e.Name) + } + for _, m := range res.Missing { + t.Errorf("%s missing not expected", m.Name) + } + } +} + +// Now test a tar file that was created with just the path to a file. In this +// test case, the traversal and creation of "placeholder" directories are +// evaluated. Also, The fact that this archive contains a single entry, yet the +// entry is associated with a file that has parent directories, means that the +// "." directory should be the lowest sub-directory under which `file` is contained. +func TestTreeTraversal(t *testing.T) { + fh, err := os.Open("./testdata/traversal.tar") + if err != nil { + t.Fatal(err) + } + str := NewTarStreamer(fh, DefaultTarKeywords) + + if _, err = io.Copy(ioutil.Discard, str); err != nil && err != io.EOF { + t.Fatal(err) + } + if err = str.Close(); err != nil { + t.Fatal(err) + } + + fh.Close() + tdh, err := str.Hierarchy() + + if err != nil { + t.Fatal(err) + } + + res, err := TarCheck(tdh, tdh, []string{"sha1"}) + if err != nil { + t.Fatal(err) + } + if res != nil { + for _, f := range res.Failures { + t.Errorf(f.String()) + } + for _, e := range res.Extra { + t.Errorf("%s extra not expected", e.Name) + } + for _, m := range res.Missing { + t.Errorf("%s missing not expected", m.Name) + } + } + + // top-level "." directory will contain contents of traversal.tar + res, err = Check("./testdata/.", tdh, []string{"sha1"}) + if err != nil { + t.Fatal(err) + } + if res != nil { + for _, f := range res.Failures { + t.Errorf(f.String()) + } + for _, e := range res.Extra { + t.Errorf("%s extra not expected", e.Name) + } + for _, m := range res.Missing { + t.Errorf("%s missing not expected", m.Name) + } + } + + // Now test an archive that requires placeholder directories, i.e, there are + // no headers in the archive that are associated with the actual directory name + fh, err = os.Open("./testdata/singlefile.tar") + if err != nil { + t.Fatal(err) + } + str = NewTarStreamer(fh, DefaultTarKeywords) + if _, err = io.Copy(ioutil.Discard, str); err != nil && err != io.EOF { + t.Fatal(err) + } + if err = str.Close(); err != nil { + t.Fatal(err) + } + tdh, err = str.Hierarchy() + if err != nil { + t.Fatal(err) + } + + // Implied top-level "." directory will contain the contents of singlefile.tar + res, err = Check("./testdata/.", tdh, []string{"sha1"}) + if err != nil { + t.Fatal(err) + } + if res != nil { + for _, f := range res.Failures { + t.Errorf(f.String()) + } + for _, e := range res.Extra { + t.Errorf("%s extra not expected", e.Name) + } + for _, m := range res.Missing { + t.Errorf("%s missing not expected", m.Name) + } + } +} + // minimal tar archive stream that mimics what is in ./testdata/test.tar func makeTarStream() ([]byte, error) { buf := new(bytes.Buffer) diff --git a/testdata/collection.tar b/testdata/collection.tar new file mode 100644 index 0000000..36f916c Binary files /dev/null and b/testdata/collection.tar differ diff --git a/testdata/collection/dir1/file1 b/testdata/collection/dir1/file1 new file mode 100644 index 0000000..e69de29 diff --git a/testdata/collection/dir2/file2 b/testdata/collection/dir2/file2 new file mode 100644 index 0000000..e69de29 diff --git a/testdata/collection/dir3/file3 b/testdata/collection/dir3/file3 new file mode 100644 index 0000000..e69de29 diff --git a/testdata/collection/dir4/file4 b/testdata/collection/dir4/file4 new file mode 100644 index 0000000..e69de29 diff --git a/testdata/collection/dir5/dir6/dir7/lonelyfile b/testdata/collection/dir5/dir6/dir7/lonelyfile new file mode 100644 index 0000000..e69de29 diff --git a/testdata/collection/file1 b/testdata/collection/file1 new file mode 100644 index 0000000..40b43bb --- /dev/null +++ b/testdata/collection/file1 @@ -0,0 +1 @@ +im diff --git a/testdata/collection/file2 b/testdata/collection/file2 new file mode 100644 index 0000000..ce01362 --- /dev/null +++ b/testdata/collection/file2 @@ -0,0 +1 @@ +hello diff --git a/testdata/collection/file3 b/testdata/collection/file3 new file mode 100644 index 0000000..463a353 --- /dev/null +++ b/testdata/collection/file3 @@ -0,0 +1 @@ +programming diff --git a/testdata/singlefile.tar b/testdata/singlefile.tar new file mode 100644 index 0000000..3407d6c Binary files /dev/null and b/testdata/singlefile.tar differ diff --git a/testdata/singlefile/dir2/dir3/file b/testdata/singlefile/dir2/dir3/file new file mode 100644 index 0000000..e69de29 diff --git a/testdata/traversal.tar b/testdata/traversal.tar new file mode 100644 index 0000000..925bfc0 Binary files /dev/null and b/testdata/traversal.tar differ diff --git a/testdata/traversal/dir2/dir3/actualdir1/file1 b/testdata/traversal/dir2/dir3/actualdir1/file1 new file mode 100644 index 0000000..e69de29 diff --git a/testdata/traversal/dir2/dir3/actualdir1/file2 b/testdata/traversal/dir2/dir3/actualdir1/file2 new file mode 100644 index 0000000..e69de29 diff --git a/testdata/traversal/dir2/dir3/actualdir2/a b/testdata/traversal/dir2/dir3/actualdir2/a new file mode 100644 index 0000000..ce01362 --- /dev/null +++ b/testdata/traversal/dir2/dir3/actualdir2/a @@ -0,0 +1 @@ +hello diff --git a/testdata/traversal/dir2/dir3/actualdir2/b b/testdata/traversal/dir2/dir3/actualdir2/b new file mode 100644 index 0000000..f0e6a34 --- /dev/null +++ b/testdata/traversal/dir2/dir3/actualdir2/b @@ -0,0 +1 @@ +I'm diff --git a/testdata/traversal/dir2/dir3/actualdir2/c b/testdata/traversal/dir2/dir3/actualdir2/c new file mode 100644 index 0000000..463a353 --- /dev/null +++ b/testdata/traversal/dir2/dir3/actualdir2/c @@ -0,0 +1 @@ +programming diff --git a/testdata/traversal/dir2/dir3/file b/testdata/traversal/dir2/dir3/file new file mode 100644 index 0000000..e69de29 diff --git a/testdata/traversal/dir2/dir3/file3 b/testdata/traversal/dir2/dir3/file3 new file mode 100644 index 0000000..e69de29