package mtree

import (
	"archive/tar"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strings"

	"github.com/sirupsen/logrus"
	"github.com/vbatts/go-mtree/pkg/govis"
)

// Streamer creates a file hierarchy out of a tar stream
type Streamer interface {
	io.ReadCloser
	Hierarchy() (*DirectoryHierarchy, error)
}

var tarDefaultSetKeywords = []KeyVal{
	"type=file",
	"flags=none",
	"mode=0664",
}

// NewTarStreamer streams a tar archive and creates a file hierarchy based off
// of the tar metadata headers
func NewTarStreamer(r io.Reader, excludes []ExcludeFunc, keywords []Keyword) Streamer {
	pR, pW := io.Pipe()
	ts := &tarStream{
		pipeReader: pR,
		pipeWriter: pW,
		creator:    dhCreator{DH: &DirectoryHierarchy{}},
		teeReader:  io.TeeReader(r, pW),
		tarReader:  tar.NewReader(pR),
		keywords:   keywords,
		hardlinks:  map[string][]string{},
		excludes:   excludes,
	}

	go ts.readHeaders()
	return ts
}

type tarStream struct {
	root       *Entry
	hardlinks  map[string][]string
	creator    dhCreator
	pipeReader *io.PipeReader
	pipeWriter *io.PipeWriter
	teeReader  io.Reader
	tarReader  *tar.Reader
	keywords   []Keyword
	excludes   []ExcludeFunc
	err        error
}

func (ts *tarStream) readHeaders() {
	// replace the "time" keyword with "tar_time", deduplicating as we go
	notimekws := []Keyword{}
	for _, kw := range ts.keywords {
		if !InKeywordSlice(kw, notimekws) {
			if kw == "time" {
				if !InKeywordSlice("tar_time", ts.keywords) {
					notimekws = append(notimekws, "tar_time")
				}
			} else {
				notimekws = append(notimekws, kw)
			}
		}
	}
	ts.keywords = notimekws

	// We have to start with the directory we're in, and anything beyond these
	// items is determined at the time a tar is extracted.
	ts.root = &Entry{
		Name: ".",
		Type: RelativeType,
		Prev: &Entry{
			Raw:  "# .",
			Type: CommentType,
		},
		Set:      nil,
		Keywords: []KeyVal{"type=dir"},
	}

	// insert signature and metadata comments first (user, machine, tree, date)
	for _, e := range signatureEntries("") {
		e.Pos = len(ts.creator.DH.Entries)
		ts.creator.DH.Entries = append(ts.creator.DH.Entries, e)
	}
	// insert keyword metadata next
	for _, e := range keywordEntries(ts.keywords) {
		e.Pos = len(ts.creator.DH.Entries)
		ts.creator.DH.Entries = append(ts.creator.DH.Entries, e)
	}

hdrloop:
	for {
		hdr, err := ts.tarReader.Next()
		if err != nil {
			ts.pipeReader.CloseWithError(err)
			return
		}

		for _, ex := range ts.excludes {
			if ex(hdr.Name, hdr.FileInfo()) {
				continue hdrloop
			}
		}

		// Because the content of the file may need to be read by several
		// KeywordFuncs, it needs to be an io.Seeker as well. So, just reading
		// from ts.tarReader is not enough.
		tmpFile, err := os.CreateTemp("", "ts.payload.")
		if err != nil {
			ts.pipeReader.CloseWithError(err)
			return
		}

		// for good measure
		if err := tmpFile.Chmod(0600); err != nil {
			tmpFile.Close()
			os.Remove(tmpFile.Name())
			ts.pipeReader.CloseWithError(err)
			return
		}
		if _, err := io.Copy(tmpFile, ts.tarReader); err != nil {
			tmpFile.Close()
			os.Remove(tmpFile.Name())
			ts.pipeReader.CloseWithError(err)
			return
		}

		// Alright, it's either a file or a directory
		encodedName, err := govis.Vis(filepath.Base(hdr.Name), DefaultVisFlags)
		if err != nil {
			tmpFile.Close()
			os.Remove(tmpFile.Name())
			ts.pipeReader.CloseWithError(err)
			return
		}
		e := Entry{
			Name: encodedName,
			Type: RelativeType,
		}

		// Keep track of which files are hardlinks so we can resolve them later
		if hdr.Typeflag == tar.TypeLink {
			keyFunc := KeywordFuncs["link"]
			kvs, err := keyFunc(hdr.Name, hdr.FileInfo(), nil)
			if err != nil {
				logrus.Warn(err)
				break // XXX is breaking an okay thing to do here?
			}
			linkname, err := govis.Unvis(KeyVal(kvs[0]).Value(), DefaultVisFlags)
			if err != nil {
				logrus.Warn(err)
				break // XXX is breaking an okay thing to do here?
			}
			if _, ok := ts.hardlinks[linkname]; !ok {
				ts.hardlinks[linkname] = []string{hdr.Name}
			} else {
				ts.hardlinks[linkname] = append(ts.hardlinks[linkname], hdr.Name)
			}
		}

		// now collect keywords on the file
		for _, keyword := range ts.keywords {
			if keyFunc, ok := KeywordFuncs[keyword.Prefix()]; ok {
				// We can't extract directories onto disk, so the "size"
				// keyword is irrelevant for now
				if hdr.FileInfo().IsDir() && keyword == "size" {
					continue
				}
				kvs, err := keyFunc(hdr.Name, hdr.FileInfo(), tmpFile)
				if err != nil {
					ts.setErr(err)
				}
				// for good measure, check that we actually got a value for the keyword
				if len(kvs) > 0 && kvs[0] != "" {
					e.Keywords = append(e.Keywords, kvs[0])
				}

				// don't forget to reset the reader
				if _, err := tmpFile.Seek(0, 0); err != nil {
					tmpFile.Close()
					os.Remove(tmpFile.Name())
					ts.pipeReader.CloseWithError(err)
					return
				}
			}
		}

		// collect meta-set keywords for a directory so that we can build the
		// actual sets in `flatten`
		if hdr.FileInfo().IsDir() {
			s := Entry{
				Name: "meta-set",
				Type: SpecialType,
			}
			for _, setKW := range SetKeywords {
				if keyFunc, ok := KeywordFuncs[setKW.Prefix()]; ok {
					kvs, err := keyFunc(hdr.Name, hdr.FileInfo(), tmpFile)
					if err != nil {
						ts.setErr(err)
					}
					for _, kv := range kvs {
						if kv != "" {
							s.Keywords = append(s.Keywords, kv)
						}
					}
					if _, err := tmpFile.Seek(0, 0); err != nil {
						tmpFile.Close()
						os.Remove(tmpFile.Name())
						ts.pipeReader.CloseWithError(err)
						return
					}
				}
			}
			e.Set = &s
		}

		err = populateTree(ts.root, &e, hdr)
		if err != nil {
			ts.setErr(err)
		}
		tmpFile.Close()
		os.Remove(tmpFile.Name())
	}
}

// populateTree creates a pseudo file tree hierarchy using an Entry's Parent and
// Children fields. When examining the Entry e to insert in the tree, we
// determine if the path to that Entry exists yet. If it does, we insert it in
// the appropriate position in the tree. If not, we create the path up to the
// directory that contains the Entry, then insert the Entry.
//
//	root: the "." Entry
//	e:    the Entry we are looking to insert
//	hdr:  the tar header struct associated with e
func populateTree(root, e *Entry, hdr *tar.Header) error {
	if root == nil || e == nil {
		return fmt.Errorf("cannot populate or insert a nil Entry")
	} else if root.Prev == nil {
		return fmt.Errorf("root needs to be an Entry associated with a directory")
	}
	isDir := hdr.FileInfo().IsDir()
	wd := filepath.Clean(hdr.Name)
	if !isDir {
		// directory up until the actual file
		wd = filepath.Dir(wd)
		if wd == "." {
			root.Children = append([]*Entry{e}, root.Children...)
			e.Parent = root
			return nil
		}
	}
	dirNames := strings.Split(wd, "/")

	parent := root
	for _, name := range dirNames {
		encoded, err := govis.Vis(name, DefaultVisFlags)
		if err != nil {
			return err
		}
		if node := parent.Descend(encoded); node == nil {
			// Entry for directory doesn't exist in tree relative to root.
			// We don't know if this directory is an actual tar header (because a
			// user could have just specified a path to a deep file), so we must
			// specify this placeholder directory as a "type=dir", and Set=nil.
			newEntry := Entry{
				Name:     encoded,
				Type:     RelativeType,
				Parent:   parent,
				Keywords: []KeyVal{"type=dir"}, // temp data
				Set:      nil,                  // temp data
			}
			pathname, err := newEntry.Path()
			if err != nil {
				return err
			}
			newEntry.Prev = &Entry{
				Type: CommentType,
				Raw:  "# " + pathname,
			}
			parent.Children = append(parent.Children, &newEntry)
			parent = &newEntry
		} else {
			// Entry for directory exists in tree, just keep going
			parent = node
		}
	}
	if !isDir {
		// prepend the file so it is listed before any subdirectories when flattening
		parent.Children = append([]*Entry{e}, parent.Children...)
		e.Parent = parent
	} else {
		// fill in the actual data from e
		parent.Keywords = e.Keywords
		parent.Set = e.Set
	}
	return nil
}

// After constructing a pseudo file hierarchy tree, we want to "flatten" this
// tree by putting the Entries into a slice with appropriate positioning.
//
//	root:     the "head" of the sub-tree to flatten
//	creator:  a dhCreator that helps with the '/set' keyword
//	keywords: keywords specified by the user that should be evaluated
func flatten(root *Entry, creator *dhCreator, keywords []Keyword) {
	if root == nil || creator == nil {
		return
	}
	if root.Prev != nil {
		// root.Prev != nil implies root is a directory
		creator.DH.Entries = append(creator.DH.Entries, Entry{
			Type: BlankType,
			Pos:  len(creator.DH.Entries),
		})
		root.Prev.Pos = len(creator.DH.Entries)
		creator.DH.Entries = append(creator.DH.Entries, *root.Prev)

		if root.Set != nil {
			// Check if we need a new set
			consolidatedKeys := keyvalSelector(append(tarDefaultSetKeywords, root.Set.Keywords...), keywords)
			if creator.curSet == nil {
				creator.curSet = &Entry{
					Type:     SpecialType,
					Name:     "/set",
					Keywords: consolidatedKeys,
					Pos:      len(creator.DH.Entries),
				}
				creator.DH.Entries = append(creator.DH.Entries, *creator.curSet)
			} else {
				needNewSet := false
				for _, k := range root.Set.Keywords {
					if !inKeyValSlice(k, creator.curSet.Keywords) {
						needNewSet = true
						break
					}
				}
				if needNewSet {
					creator.curSet = &Entry{
						Name:     "/set",
						Type:     SpecialType,
						Pos:      len(creator.DH.Entries),
						Keywords: consolidatedKeys,
					}
					creator.DH.Entries = append(creator.DH.Entries, *creator.curSet)
				}
			}
		} else if creator.curSet != nil {
			// Getting here implies that this Entry's set was not evaluated and
			// was not supposed to be, so we need to reset curSet
			creator.DH.Entries = append(creator.DH.Entries, Entry{
				Name: "/unset",
				Type: SpecialType,
				Pos:  len(creator.DH.Entries),
			})
			creator.curSet = nil
		}
	}
	root.Set = creator.curSet
	if creator.curSet != nil {
		root.Keywords = keyValDifference(root.Keywords, creator.curSet.Keywords)
	}
	root.Pos = len(creator.DH.Entries)
	creator.DH.Entries = append(creator.DH.Entries, *root)
	for _, c := range root.Children {
		flatten(c, creator, keywords)
	}
	if root.Prev != nil {
		// Show a comment when stepping out
		root.Prev.Pos = len(creator.DH.Entries)
		creator.DH.Entries = append(creator.DH.Entries, *root.Prev)
		dotEntry := Entry{
			Type: DotDotType,
			Name: "..",
			Pos:  len(creator.DH.Entries),
		}
		creator.DH.Entries = append(creator.DH.Entries, dotEntry)
	}
}

// resolveHardlinks goes through an Entry tree, finds the Entries associated
// with hardlinks, and fills them in with the actual data from the base file.
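// If countlinks is true, an "nlink=N" keyword is also appended to the base
// file and to each link, where N counts the base file plus its hardlinks.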
func resolveHardlinks(root *Entry, hardlinks map[string][]string, countlinks bool) {
	originals := make(map[string]*Entry)
	for base, links := range hardlinks {
		var basefile *Entry
		if seen, ok := originals[base]; !ok {
			basefile = root.Find(base)
			if basefile == nil {
				logrus.Printf("%s does not exist in this tree\n", base)
				continue
			}
			originals[base] = basefile
		} else {
			basefile = seen
		}
		for _, link := range links {
			linkfile := root.Find(link)
			if linkfile == nil {
				logrus.Printf("%s does not exist in this tree\n", link)
				continue
			}
			linkfile.Keywords = basefile.Keywords
			if countlinks {
				linkfile.Keywords = append(linkfile.Keywords, KeyVal(fmt.Sprintf("nlink=%d", len(links)+1)))
			}
		}
		if countlinks {
			basefile.Keywords = append(basefile.Keywords, KeyVal(fmt.Sprintf("nlink=%d", len(links)+1)))
		}
	}
}

func (ts *tarStream) setErr(err error) {
	ts.err = err
}

func (ts *tarStream) Read(p []byte) (n int, err error) {
	return ts.teeReader.Read(p)
}

func (ts *tarStream) Close() error {
	return ts.pipeReader.Close()
}

// Hierarchy returns the DirectoryHierarchy of the archive. It flattens the
// Entry tree before returning the DirectoryHierarchy.
func (ts *tarStream) Hierarchy() (*DirectoryHierarchy, error) {
	if ts.err != nil && ts.err != io.EOF {
		return nil, ts.err
	}
	if ts.root == nil {
		return nil, fmt.Errorf("root Entry not found, nothing to flatten")
	}
	resolveHardlinks(ts.root, ts.hardlinks, InKeywordSlice(Keyword("nlink"), ts.keywords))
	flatten(ts.root, &ts.creator, ts.keywords)
	return ts.creator.DH, nil
}
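// Usage sketch: the names below ("archive.tar", the nil excludes) are
// illustrative only, and DefaultKeywords is assumed to be the keyword set
// defined elsewhere in this package. The caller must drain the Streamer,
// because readHeaders only sees bytes as Read tees them into the internal
// pipe; Hierarchy should be called only after the stream has been consumed.
//
//	fh, err := os.Open("archive.tar")
//	if err != nil {
//		// handle error
//	}
//	defer fh.Close()
//
//	str := NewTarStreamer(fh, nil, DefaultKeywords)
//	// Draining the stream drives header parsing in the background goroutine.
//	if _, err := io.Copy(io.Discard, str); err != nil && err != io.EOF {
//		// handle error
//	}
//	if err := str.Close(); err != nil {
//		// handle error
//	}
//
//	dh, err := str.Hierarchy()
//	if err != nil {
//		// handle error
//	}
//	_ = dh // dh now holds the DirectoryHierarchy built from the tar metadata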