From 3d6b74d6f77cd2f75ad4e26708dd28f6e479bdae Mon Sep 17 00:00:00 2001 From: Stephen Chung Date: Tue, 2 Aug 2016 15:30:45 -0400 Subject: [PATCH] tar: populate Entry tree under a common root Resolves #56. Now, the Entry tree will be populated under a common root (if necessary), so that a manifest can be accurately generate from a tar file that has been created using multiple directories. Signed-off-by: Stephen Chung --- check.go | 4 +- tar.go | 190 +++++++++--------- tar_test.go | 178 ++++++++++++++++ testdata/collection.tar | Bin 0 -> 10240 bytes testdata/collection/dir1/file1 | 0 testdata/collection/dir2/file2 | 0 testdata/collection/dir3/file3 | 0 testdata/collection/dir4/file4 | 0 testdata/collection/dir5/dir6/dir7/lonelyfile | 0 testdata/collection/file1 | 1 + testdata/collection/file2 | 1 + testdata/collection/file3 | 1 + testdata/singlefile.tar | Bin 0 -> 10240 bytes testdata/singlefile/dir2/dir3/file | 0 testdata/traversal.tar | Bin 0 -> 10240 bytes testdata/traversal/dir2/dir3/actualdir1/file1 | 0 testdata/traversal/dir2/dir3/actualdir1/file2 | 0 testdata/traversal/dir2/dir3/actualdir2/a | 1 + testdata/traversal/dir2/dir3/actualdir2/b | 1 + testdata/traversal/dir2/dir3/actualdir2/c | 1 + testdata/traversal/dir2/dir3/file | 0 testdata/traversal/dir2/dir3/file3 | 0 22 files changed, 287 insertions(+), 91 deletions(-) create mode 100644 testdata/collection.tar create mode 100644 testdata/collection/dir1/file1 create mode 100644 testdata/collection/dir2/file2 create mode 100644 testdata/collection/dir3/file3 create mode 100644 testdata/collection/dir4/file4 create mode 100644 testdata/collection/dir5/dir6/dir7/lonelyfile create mode 100644 testdata/collection/file1 create mode 100644 testdata/collection/file2 create mode 100644 testdata/collection/file3 create mode 100644 testdata/singlefile.tar create mode 100644 testdata/singlefile/dir2/dir3/file create mode 100644 testdata/traversal.tar create mode 100644 testdata/traversal/dir2/dir3/actualdir1/file1 create mode 100644 testdata/traversal/dir2/dir3/actualdir1/file2 create mode 100644 testdata/traversal/dir2/dir3/actualdir2/a create mode 100644 testdata/traversal/dir2/dir3/actualdir2/b create mode 100644 testdata/traversal/dir2/dir3/actualdir2/c create mode 100644 testdata/traversal/dir2/dir3/file create mode 100644 testdata/traversal/dir2/dir3/file3 diff --git a/check.go b/check.go index 5a40c4b..c7752b0 100644 --- a/check.go +++ b/check.go @@ -42,7 +42,6 @@ func Check(root string, dh *DirectoryHierarchy, keywords []string) (*Result, err return nil, err } sort.Sort(byPos(creator.DH.Entries)) - var result Result for i, e := range creator.DH.Entries { switch e.Type { @@ -118,6 +117,9 @@ func TarCheck(tarDH, dh *DirectoryHierarchy, keywords []string) (*Result, error) break } } + if tarRoot == nil { + return nil, fmt.Errorf("root of archive could not be found") + } tarRoot.Next = &Entry{ Name: "seen", Type: CommentType, diff --git a/tar.go b/tar.go index 8cc81d6..1fbf6b7 100644 --- a/tar.go +++ b/tar.go @@ -49,18 +49,15 @@ type tarStream struct { func (ts *tarStream) readHeaders() { // We have to start with the directory we're in, and anything beyond these // items is determined at the time a tar is extracted. - rootComment := Entry{ - Raw: "# .", - Type: CommentType, - } ts.root = &Entry{ Name: ".", Type: RelativeType, - Prev: &rootComment, - Set: &Entry{ - Name: "meta-set", - Type: SpecialType, + Prev: &Entry{ + Raw: "# .", + Type: CommentType, }, + Set: nil, + Keywords: []string{"type=dir"}, } metadataEntries := signatureEntries("") for _, e := range metadataEntries { @@ -163,11 +160,7 @@ func (ts *tarStream) readHeaders() { } } } - if filepath.Dir(filepath.Clean(hdr.Name)) == "." { - ts.root.Set = &s - } else { - e.Set = &s - } + e.Set = &s } err = populateTree(ts.root, &e, hdr) if err != nil { @@ -178,15 +171,6 @@ func (ts *tarStream) readHeaders() { } } -type relationship int - -const ( - unknownDir relationship = iota - sameDir - childDir - parentDir -) - // populateTree creates a pseudo file tree hierarchy using an Entry's Parent and // Children fields. When examining the Entry e to insert in the tree, we // determine if the path to that Entry exists yet. If it does, insert it in the @@ -196,35 +180,50 @@ const ( // e: the Entry we are looking to insert // hdr: the tar header struct associated with e func populateTree(root, e *Entry, hdr *tar.Header) error { + if root == nil || e == nil { + return fmt.Errorf("cannot populate or insert nil Entry's") + } else if root.Prev == nil { + return fmt.Errorf("root needs to be an Entry associated with a directory") + } isDir := hdr.FileInfo().IsDir() wd := filepath.Clean(hdr.Name) if !isDir { - // If entry is a file, we only want the directory it's in. + // directory up until the actual file wd = filepath.Dir(wd) - } - if filepath.Dir(wd) == "." { - if isDir { - root.Keywords = e.Keywords - } else { + if wd == "." { + // If file in root directory, no need to traverse down, just append root.Children = append([]*Entry{e}, root.Children...) e.Parent = root + return nil } - return nil } // TODO: what about directory/file names with "/" in it? dirNames := strings.Split(wd, "/") parent := root - for _, name := range dirNames[1:] { + for _, name := range dirNames[:] { encoded, err := Vis(name) if err != nil { return err } if node := parent.Descend(encoded); node == nil { - // Entry for directory doesn't exist in tree relative to root + // Entry for directory doesn't exist in tree relative to root. + // We don't know if this directory is an actual tar header (because a + // user could have just specified a path to a deep file), so we must + // specify this placeholder directory as a "type=dir", and Set=nil. newEntry := Entry{ - Name: encoded, - Type: RelativeType, - Parent: parent, + Name: encoded, + Type: RelativeType, + Parent: parent, + Keywords: []string{"type=dir"}, // temp data + Set: nil, // temp data + } + pathname, err := newEntry.Path() + if err != nil { + return err + } + newEntry.Prev = &Entry{ + Type: CommentType, + Raw: "# " + pathname, } parent.Children = append(parent.Children, &newEntry) parent = &newEntry @@ -237,32 +236,20 @@ func populateTree(root, e *Entry, hdr *tar.Header) error { parent.Children = append([]*Entry{e}, parent.Children...) e.Parent = parent } else { - // the "placeholder" directory already exists in the Entry "parent", - // so now we have to replace it's underlying data with that from e, - // as well as set the Parent field. Note that we don't set parent = e - // because parent is already in the pseudo tree, we just need to - // complete it's data. - e.Parent = parent.Parent - *parent = *e - commentpath, err := parent.Path() - if err != nil { - return err - } - parent.Prev = &Entry{ - Raw: "# " + commentpath, - Type: CommentType, - } + // fill in the actual data from e + parent.Keywords = e.Keywords + parent.Set = e.Set } return nil } // After constructing a pseudo file hierarchy tree, we want to "flatten" this // tree by putting the Entries into a slice with appropriate positioning. -// root: the "head" of the sub-tree to flatten -// creator: a dhCreator that helps with the '/set' keyword +// root: the "head" of the sub-tree to flatten +// creator: a dhCreator that helps with the '/set' keyword // keywords: keywords specified by the user that should be evaluated func flatten(root *Entry, creator *dhCreator, keywords []string) { - if root == nil { + if root == nil || creator == nil { return } if root.Prev != nil { @@ -275,43 +262,54 @@ func flatten(root *Entry, creator *dhCreator, keywords []string) { root.Prev.Pos = len(creator.DH.Entries) creator.DH.Entries = append(creator.DH.Entries, *root.Prev) - // Check if we need a new set - if creator.curSet == nil { - creator.curSet = &Entry{ - Type: SpecialType, - Name: "/set", - Keywords: keywordSelector(append(tarDefaultSetKeywords, root.Set.Keywords...), keywords), - Pos: len(creator.DH.Entries), - } - creator.DH.Entries = append(creator.DH.Entries, *creator.curSet) - } else { - needNewSet := false - for _, k := range root.Set.Keywords { - if !inSlice(k, creator.curSet.Keywords) { - needNewSet = true - break - } - } - if needNewSet { + if root.Set != nil { + // Check if we need a new set + if creator.curSet == nil { creator.curSet = &Entry{ - Name: "/set", Type: SpecialType, - Pos: len(creator.DH.Entries), + Name: "/set", Keywords: keywordSelector(append(tarDefaultSetKeywords, root.Set.Keywords...), keywords), + Pos: len(creator.DH.Entries), } creator.DH.Entries = append(creator.DH.Entries, *creator.curSet) + } else { + needNewSet := false + for _, k := range root.Set.Keywords { + if !inSlice(k, creator.curSet.Keywords) { + needNewSet = true + break + } + } + if needNewSet { + creator.curSet = &Entry{ + Name: "/set", + Type: SpecialType, + Pos: len(creator.DH.Entries), + Keywords: keywordSelector(append(tarDefaultSetKeywords, root.Set.Keywords...), keywords), + } + creator.DH.Entries = append(creator.DH.Entries, *creator.curSet) + } } + } else if creator.curSet != nil { + // Getting into here implies that the Entry's set has not and + // was not supposed to be evaluated, thus, we need to reset curSet + creator.DH.Entries = append(creator.DH.Entries, Entry{ + Name: "/unset", + Type: SpecialType, + Pos: len(creator.DH.Entries), + }) + creator.curSet = nil } } root.Set = creator.curSet - root.Keywords = setDifference(root.Keywords, creator.curSet.Keywords) + if creator.curSet != nil { + root.Keywords = setDifference(root.Keywords, creator.curSet.Keywords) + } root.Pos = len(creator.DH.Entries) creator.DH.Entries = append(creator.DH.Entries, *root) - for _, c := range root.Children { flatten(c, creator, keywords) } - if root.Prev != nil { // Show a comment when stepping out root.Prev.Pos = len(creator.DH.Entries) @@ -329,23 +327,24 @@ func flatten(root *Entry, creator *dhCreator, keywords []string) { // filter takes in a pointer to an Entry, and returns a slice of Entry's that // satisfy the predicate p func filter(root *Entry, p func(*Entry) bool) []Entry { - var validEntrys []Entry - if len(root.Children) > 0 || root.Prev != nil { - for _, c := range root.Children { - // if an Entry is a directory, filter the directory - if c.Prev != nil { - validEntrys = append(validEntrys, filter(c, p)...) - } - if p(c) { - if c.Prev == nil { - // prepend files - validEntrys = append([]Entry{*c}, validEntrys...) - } else { - validEntrys = append(validEntrys, *c) + if root != nil { + var validEntrys []Entry + if len(root.Children) > 0 || root.Prev != nil { + for _, c := range root.Children { + // filter the sub-directory + if c.Prev != nil { + validEntrys = append(validEntrys, filter(c, p)...) + } + if p(c) { + if c.Prev == nil { + validEntrys = append([]Entry{*c}, validEntrys...) + } else { + validEntrys = append(validEntrys, *c) + } } } + return validEntrys } - return validEntrys } return nil } @@ -363,6 +362,15 @@ func setDifference(this, that []string) []string { return diff } +type relationship int + +const ( + unknownDir relationship = iota + sameDir + childDir + parentDir +) + func compareDir(curDir, prevDir string) relationship { curDir = filepath.Clean(curDir) prevDir = filepath.Clean(prevDir) @@ -390,12 +398,14 @@ func (ts *tarStream) Close() error { return ts.pipeReader.Close() } +// Hierarchy returns the DirectoryHierarchy of the archive. It flattens the +// Entry tree before returning the DirectoryHierarchy func (ts *tarStream) Hierarchy() (*DirectoryHierarchy, error) { if ts.err != nil && ts.err != io.EOF { return nil, ts.err } if ts.root == nil { - return nil, fmt.Errorf("root Entry not found. Nothing to flatten") + return nil, fmt.Errorf("root Entry not found, nothing to flatten") } flatten(ts.root, &ts.creator, ts.keywords) return ts.creator.DH, nil diff --git a/tar_test.go b/tar_test.go index 9f5ce87..b9e5171 100644 --- a/tar_test.go +++ b/tar_test.go @@ -142,6 +142,184 @@ func TestTar(t *testing.T) { } } +// This test checks how gomtree handles archives that were created +// with multiple directories, i.e, archives created with something like: +// `tar -cvf some.tar dir1 dir2 dir3 dir4/dir5 dir6` ... etc. +// The testdata of collection.tar resemble such an archive. the `collection` folder +// is the contents of `collection.tar` extracted +func TestArchiveCreation(t *testing.T) { + fh, err := os.Open("./testdata/collection.tar") + if err != nil { + t.Fatal(err) + } + str := NewTarStreamer(fh, []string{"sha1"}) + + if _, err := io.Copy(ioutil.Discard, str); err != nil && err != io.EOF { + t.Fatal(err) + } + if err := str.Close(); err != nil { + t.Fatal(err) + } + defer fh.Close() + + // get DirectoryHierarcy struct from walking the tar archive + tdh, err := str.Hierarchy() + if err != nil { + t.Fatal(err) + } + + // Test the tar manifest against the actual directory + res, err := Check("./testdata/collection", tdh, []string{"sha1"}) + if err != nil { + t.Fatal(err) + } + + if res != nil { + for _, f := range res.Failures { + t.Errorf(f.String()) + } + for _, e := range res.Extra { + t.Errorf("%s extra not expected", e.Name) + } + for _, m := range res.Missing { + t.Errorf("%s missing not expected", m.Name) + } + } + + // Test the tar manifest against itself + res, err = TarCheck(tdh, tdh, []string{"sha1"}) + if err != nil { + t.Fatal(err) + } + if res != nil { + for _, f := range res.Failures { + t.Errorf(f.String()) + } + for _, e := range res.Extra { + t.Errorf("%s extra not expected", e.Name) + } + for _, m := range res.Missing { + t.Errorf("%s missing not expected", m.Name) + } + } + + // Validate the directory manifest against the archive + dh, err := Walk("./testdata/collection", nil, []string{"sha1"}) + if err != nil { + t.Fatal(err) + } + res, err = TarCheck(tdh, dh, []string{"sha1"}) + if err != nil { + t.Fatal(err) + } + if res != nil { + for _, f := range res.Failures { + t.Errorf(f.String()) + } + for _, e := range res.Extra { + t.Errorf("%s extra not expected", e.Name) + } + for _, m := range res.Missing { + t.Errorf("%s missing not expected", m.Name) + } + } +} + +// Now test a tar file that was created with just the path to a file. In this +// test case, the traversal and creation of "placeholder" directories are +// evaluated. Also, The fact that this archive contains a single entry, yet the +// entry is associated with a file that has parent directories, means that the +// "." directory should be the lowest sub-directory under which `file` is contained. +func TestTreeTraversal(t *testing.T) { + fh, err := os.Open("./testdata/traversal.tar") + if err != nil { + t.Fatal(err) + } + str := NewTarStreamer(fh, DefaultTarKeywords) + + if _, err = io.Copy(ioutil.Discard, str); err != nil && err != io.EOF { + t.Fatal(err) + } + if err = str.Close(); err != nil { + t.Fatal(err) + } + + fh.Close() + tdh, err := str.Hierarchy() + + if err != nil { + t.Fatal(err) + } + + res, err := TarCheck(tdh, tdh, []string{"sha1"}) + if err != nil { + t.Fatal(err) + } + if res != nil { + for _, f := range res.Failures { + t.Errorf(f.String()) + } + for _, e := range res.Extra { + t.Errorf("%s extra not expected", e.Name) + } + for _, m := range res.Missing { + t.Errorf("%s missing not expected", m.Name) + } + } + + // top-level "." directory will contain contents of traversal.tar + res, err = Check("./testdata/.", tdh, []string{"sha1"}) + if err != nil { + t.Fatal(err) + } + if res != nil { + for _, f := range res.Failures { + t.Errorf(f.String()) + } + for _, e := range res.Extra { + t.Errorf("%s extra not expected", e.Name) + } + for _, m := range res.Missing { + t.Errorf("%s missing not expected", m.Name) + } + } + + // Now test an archive that requires placeholder directories, i.e, there are + // no headers in the archive that are associated with the actual directory name + fh, err = os.Open("./testdata/singlefile.tar") + if err != nil { + t.Fatal(err) + } + str = NewTarStreamer(fh, DefaultTarKeywords) + if _, err = io.Copy(ioutil.Discard, str); err != nil && err != io.EOF { + t.Fatal(err) + } + if err = str.Close(); err != nil { + t.Fatal(err) + } + tdh, err = str.Hierarchy() + if err != nil { + t.Fatal(err) + } + + // Implied top-level "." directory will contain the contents of singlefile.tar + res, err = Check("./testdata/.", tdh, []string{"sha1"}) + if err != nil { + t.Fatal(err) + } + if res != nil { + for _, f := range res.Failures { + t.Errorf(f.String()) + } + for _, e := range res.Extra { + t.Errorf("%s extra not expected", e.Name) + } + for _, m := range res.Missing { + t.Errorf("%s missing not expected", m.Name) + } + } +} + // minimal tar archive stream that mimics what is in ./testdata/test.tar func makeTarStream() ([]byte, error) { buf := new(bytes.Buffer) diff --git a/testdata/collection.tar b/testdata/collection.tar new file mode 100644 index 0000000000000000000000000000000000000000..36f916c194be0ad3c5292223b8a6091123b9820d GIT binary patch literal 10240 zcmeI1U2cOg49D}lQ*eXD`8-dxQi=q0qU*zMKRaj&s)QI7Mw#S16t_`ru>RaQTU(9#AZI}-`$3~`=;~2juE$?|Gf^#zx4lh6k)*?bL%%nUDw}E{a@#zv)CsF zuuh);78Kj}=eB>_*P(d(R~aDw8C$<87wi0o%)i0+gvh^__dkc!(Y5Znn=Ao`Q?$iOG5Za88&)-&RW%ob)|83)DT6j*XYG)zb~|3y7TotjgdTU_p!Fmz4h&zGzcJo y00IagfB*srAbQ4( literal 0 HcmV?d00001 diff --git a/testdata/singlefile/dir2/dir3/file b/testdata/singlefile/dir2/dir3/file new file mode 100644 index 0000000..e69de29 diff --git a/testdata/traversal.tar b/testdata/traversal.tar new file mode 100644 index 0000000000000000000000000000000000000000..925bfc057c3385db7f22acdcffd1b1931447cdeb GIT binary patch literal 10240 zcmeI1O>V;=5QSOy6y9|i2J>@<9)r4R63I>~$Iki7q-yI0F~~+$n?4qHsT_2vQ-U!!J!jk|r{uGKKpQO*ftg-&d75 zH~aCpi&nWlOvP`v{MC^mqQ7s3(YM#}P^9lHaIXGTtDwK7FifC-5&H8<68b;Sx6X^Q z{?_WL??2}O4uzt>{7)v-BU_UamCQ@OuGT;I|I=glunr(v|78LR|2HMxjU7Jgf13E; zhV=papWy!z>{B`3`qQrWU3WO{Fb||)-C;c8e_!I2lm3_X`~SaoAO6?ye+l-f9Q$qC zp5Xixrn`;L`k%@i1+f2f_kYsx{@;>z*#G^+piAigdT6(_wjb&Fe{%kF)!)gm48;C# zE%=P~8TzY?G_zF}>5ujQEYniKnEq_~Qu#Sgw*HE0X?gvpoeA?l&VNd(5YKf>|2kfM s(bH`GwF+Mdc>iytve17KY%~G^5C8!X009sH0T2KI5C8!X0D%aBcMyFdx&QzG literal 0 HcmV?d00001 diff --git a/testdata/traversal/dir2/dir3/actualdir1/file1 b/testdata/traversal/dir2/dir3/actualdir1/file1 new file mode 100644 index 0000000..e69de29 diff --git a/testdata/traversal/dir2/dir3/actualdir1/file2 b/testdata/traversal/dir2/dir3/actualdir1/file2 new file mode 100644 index 0000000..e69de29 diff --git a/testdata/traversal/dir2/dir3/actualdir2/a b/testdata/traversal/dir2/dir3/actualdir2/a new file mode 100644 index 0000000..ce01362 --- /dev/null +++ b/testdata/traversal/dir2/dir3/actualdir2/a @@ -0,0 +1 @@ +hello diff --git a/testdata/traversal/dir2/dir3/actualdir2/b b/testdata/traversal/dir2/dir3/actualdir2/b new file mode 100644 index 0000000..f0e6a34 --- /dev/null +++ b/testdata/traversal/dir2/dir3/actualdir2/b @@ -0,0 +1 @@ +I'm diff --git a/testdata/traversal/dir2/dir3/actualdir2/c b/testdata/traversal/dir2/dir3/actualdir2/c new file mode 100644 index 0000000..463a353 --- /dev/null +++ b/testdata/traversal/dir2/dir3/actualdir2/c @@ -0,0 +1 @@ +programming diff --git a/testdata/traversal/dir2/dir3/file b/testdata/traversal/dir2/dir3/file new file mode 100644 index 0000000..e69de29 diff --git a/testdata/traversal/dir2/dir3/file3 b/testdata/traversal/dir2/dir3/file3 new file mode 100644 index 0000000..e69de29