1
0
Fork 1
mirror of https://github.com/vbatts/tar-split.git synced 2024-11-16 13:28:37 +00:00
tar-split/tar.go
Stephen Chung 6a37331074 tar: flatten the pseudo-tree only after readHeaders() is done
Flattening within the readHeaders() function call was problematic
because readHeaders() is called as a goroutine; thus, as the
main program was calling `tdh.writeTo(os.Stdout)`, readHeaders() was
still in the process of flattening the tree structure. To get around this,
we now call flatten in ts.Hierarchy(), such that only when the main program
is ready to retrieve a "valid" DirectoryHierarchy from the archive, should
we flatten the tree.

Signed-off-by: Stephen Chung <schung@redhat.com>
2016-08-02 11:42:02 -04:00

402 lines
10 KiB
Go

package mtree
import (
"archive/tar"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"strings"
)
// Streamer creates a file hierarchy out of a tar stream
type Streamer interface {
io.ReadCloser
Hierarchy() (*DirectoryHierarchy, error)
}
var tarDefaultSetKeywords = []string{"type=file", "flags=none", "mode=0664"}
// NewTarStreamer streams a tar archive and creates a file hierarchy based off
// of the tar metadata headers
func NewTarStreamer(r io.Reader, keywords []string) Streamer {
pR, pW := io.Pipe()
ts := &tarStream{
pipeReader: pR,
pipeWriter: pW,
creator: dhCreator{DH: &DirectoryHierarchy{}},
teeReader: io.TeeReader(r, pW),
tarReader: tar.NewReader(pR),
keywords: keywords,
}
go ts.readHeaders()
return ts
}
type tarStream struct {
root *Entry
creator dhCreator
pipeReader *io.PipeReader
pipeWriter *io.PipeWriter
teeReader io.Reader
tarReader *tar.Reader
keywords []string
err error
}
func (ts *tarStream) readHeaders() {
// We have to start with the directory we're in, and anything beyond these
// items is determined at the time a tar is extracted.
rootComment := Entry{
Raw: "# .",
Type: CommentType,
}
ts.root = &Entry{
Name: ".",
Type: RelativeType,
Prev: &rootComment,
Set: &Entry{
Name: "meta-set",
Type: SpecialType,
},
}
metadataEntries := signatureEntries("<user specified tar archive>")
for _, e := range metadataEntries {
e.Pos = len(ts.creator.DH.Entries)
ts.creator.DH.Entries = append(ts.creator.DH.Entries, e)
}
for {
hdr, err := ts.tarReader.Next()
if err != nil {
ts.pipeReader.CloseWithError(err)
return
}
// Because the content of the file may need to be read by several
// KeywordFuncs, it needs to be an io.Seeker as well. So, just reading from
// ts.tarReader is not enough.
tmpFile, err := ioutil.TempFile("", "ts.payload.")
if err != nil {
ts.pipeReader.CloseWithError(err)
return
}
// for good measure
if err := tmpFile.Chmod(0600); err != nil {
tmpFile.Close()
os.Remove(tmpFile.Name())
ts.pipeReader.CloseWithError(err)
return
}
if _, err := io.Copy(tmpFile, ts.tarReader); err != nil {
tmpFile.Close()
os.Remove(tmpFile.Name())
ts.pipeReader.CloseWithError(err)
return
}
// Alright, it's either file or directory
encodedName, err := Vis(filepath.Base(hdr.Name))
if err != nil {
tmpFile.Close()
os.Remove(tmpFile.Name())
ts.pipeReader.CloseWithError(err)
return
}
e := Entry{
Name: encodedName,
Type: RelativeType,
}
// now collect keywords on the file
for _, keyword := range ts.keywords {
if keyword == "time" {
keyword = "tar_time"
}
if keyFunc, ok := KeywordFuncs[keyword]; ok {
// We can't extract directories on to disk, so "size" keyword
// is irrelevant for now
if hdr.FileInfo().IsDir() && keyword == "size" {
continue
}
// TODO: handle hardlinks
val, err := keyFunc(hdr.Name, hdr.FileInfo(), tmpFile)
if err != nil {
ts.setErr(err)
}
// for good measure, check that we actually get a value for a keyword
if val != "" {
e.Keywords = append(e.Keywords, val)
}
// don't forget to reset the reader
if _, err := tmpFile.Seek(0, 0); err != nil {
tmpFile.Close()
os.Remove(tmpFile.Name())
ts.pipeReader.CloseWithError(err)
return
}
}
}
// collect meta-set keywords for a directory so that we can build the
// actual sets in `flatten`
if hdr.FileInfo().IsDir() {
s := Entry{
Name: "meta-set",
Type: SpecialType,
}
for _, setKW := range SetKeywords {
if setKW == "time" {
setKW = "tar_time"
}
if keyFunc, ok := KeywordFuncs[setKW]; ok {
val, err := keyFunc(hdr.Name, hdr.FileInfo(), tmpFile)
if err != nil {
ts.setErr(err)
}
if val != "" {
s.Keywords = append(s.Keywords, val)
}
if _, err := tmpFile.Seek(0, 0); err != nil {
tmpFile.Close()
os.Remove(tmpFile.Name())
ts.pipeReader.CloseWithError(err)
}
}
}
if filepath.Dir(filepath.Clean(hdr.Name)) == "." {
ts.root.Set = &s
} else {
e.Set = &s
}
}
err = populateTree(ts.root, &e, hdr)
if err != nil {
ts.setErr(err)
}
tmpFile.Close()
os.Remove(tmpFile.Name())
}
}
type relationship int
const (
unknownDir relationship = iota
sameDir
childDir
parentDir
)
// populateTree creates a pseudo file tree hierarchy using an Entry's Parent and
// Children fields. When examining the Entry e to insert in the tree, we
// determine if the path to that Entry exists yet. If it does, insert it in the
// appropriate position in the tree. If not, create a path up until the Entry's
// directory that it is contained in. Then, insert the Entry.
// root: the "." Entry
// e: the Entry we are looking to insert
// hdr: the tar header struct associated with e
func populateTree(root, e *Entry, hdr *tar.Header) error {
isDir := hdr.FileInfo().IsDir()
wd := filepath.Clean(hdr.Name)
if !isDir {
// If entry is a file, we only want the directory it's in.
wd = filepath.Dir(wd)
}
if filepath.Dir(wd) == "." {
if isDir {
root.Keywords = e.Keywords
} else {
root.Children = append([]*Entry{e}, root.Children...)
e.Parent = root
}
return nil
}
// TODO: what about directory/file names with "/" in it?
dirNames := strings.Split(wd, "/")
parent := root
for _, name := range dirNames[1:] {
encoded, err := Vis(name)
if err != nil {
return err
}
if node := parent.Descend(encoded); node == nil {
// Entry for directory doesn't exist in tree relative to root
newEntry := Entry{
Name: encoded,
Type: RelativeType,
Parent: parent,
}
parent.Children = append(parent.Children, &newEntry)
parent = &newEntry
} else {
// Entry for directory exists in tree, just keep going
parent = node
}
}
if !isDir {
parent.Children = append([]*Entry{e}, parent.Children...)
e.Parent = parent
} else {
// the "placeholder" directory already exists in the Entry "parent",
// so now we have to replace it's underlying data with that from e,
// as well as set the Parent field. Note that we don't set parent = e
// because parent is already in the pseudo tree, we just need to
// complete it's data.
e.Parent = parent.Parent
*parent = *e
commentpath, err := parent.Path()
if err != nil {
return err
}
parent.Prev = &Entry{
Raw: "# " + commentpath,
Type: CommentType,
}
}
return nil
}
// After constructing a pseudo file hierarchy tree, we want to "flatten" this
// tree by putting the Entries into a slice with appropriate positioning.
// root: the "head" of the sub-tree to flatten
// creator: a dhCreator that helps with the '/set' keyword
// keywords: keywords specified by the user that should be evaluated
func flatten(root *Entry, creator *dhCreator, keywords []string) {
if root == nil {
return
}
if root.Prev != nil {
// root.Prev != nil implies root is a directory
creator.DH.Entries = append(creator.DH.Entries,
Entry{
Type: BlankType,
Pos: len(creator.DH.Entries),
})
root.Prev.Pos = len(creator.DH.Entries)
creator.DH.Entries = append(creator.DH.Entries, *root.Prev)
// Check if we need a new set
if creator.curSet == nil {
creator.curSet = &Entry{
Type: SpecialType,
Name: "/set",
Keywords: keywordSelector(append(tarDefaultSetKeywords, root.Set.Keywords...), keywords),
Pos: len(creator.DH.Entries),
}
creator.DH.Entries = append(creator.DH.Entries, *creator.curSet)
} else {
needNewSet := false
for _, k := range root.Set.Keywords {
if !inSlice(k, creator.curSet.Keywords) {
needNewSet = true
break
}
}
if needNewSet {
creator.curSet = &Entry{
Name: "/set",
Type: SpecialType,
Pos: len(creator.DH.Entries),
Keywords: keywordSelector(append(tarDefaultSetKeywords, root.Set.Keywords...), keywords),
}
creator.DH.Entries = append(creator.DH.Entries, *creator.curSet)
}
}
}
root.Set = creator.curSet
root.Keywords = setDifference(root.Keywords, creator.curSet.Keywords)
root.Pos = len(creator.DH.Entries)
creator.DH.Entries = append(creator.DH.Entries, *root)
for _, c := range root.Children {
flatten(c, creator, keywords)
}
if root.Prev != nil {
// Show a comment when stepping out
root.Prev.Pos = len(creator.DH.Entries)
creator.DH.Entries = append(creator.DH.Entries, *root.Prev)
dotEntry := Entry{
Type: DotDotType,
Name: "..",
Pos: len(creator.DH.Entries),
}
creator.DH.Entries = append(creator.DH.Entries, dotEntry)
}
return
}
// filter takes in a pointer to an Entry, and returns a slice of Entry's that
// satisfy the predicate p
func filter(root *Entry, p func(*Entry) bool) []Entry {
var validEntrys []Entry
if len(root.Children) > 0 || root.Prev != nil {
for _, c := range root.Children {
// if an Entry is a directory, filter the directory
if c.Prev != nil {
validEntrys = append(validEntrys, filter(c, p)...)
}
if p(c) {
if c.Prev == nil {
// prepend files
validEntrys = append([]Entry{*c}, validEntrys...)
} else {
validEntrys = append(validEntrys, *c)
}
}
}
return validEntrys
}
return nil
}
func setDifference(this, that []string) []string {
if len(this) == 0 {
return that
}
diff := []string{}
for _, kv := range this {
if !inSlice(kv, that) {
diff = append(diff, kv)
}
}
return diff
}
func compareDir(curDir, prevDir string) relationship {
curDir = filepath.Clean(curDir)
prevDir = filepath.Clean(prevDir)
if curDir == prevDir {
return sameDir
}
if filepath.Dir(curDir) == prevDir {
return childDir
}
if curDir == filepath.Dir(prevDir) {
return parentDir
}
return unknownDir
}
func (ts *tarStream) setErr(err error) {
ts.err = err
}
func (ts *tarStream) Read(p []byte) (n int, err error) {
return ts.teeReader.Read(p)
}
func (ts *tarStream) Close() error {
return ts.pipeReader.Close()
}
func (ts *tarStream) Hierarchy() (*DirectoryHierarchy, error) {
if ts.err != nil && ts.err != io.EOF {
return nil, ts.err
}
if ts.root == nil {
return nil, fmt.Errorf("root Entry not found. Nothing to flatten")
}
flatten(ts.root, &ts.creator, ts.keywords)
return ts.creator.DH, nil
}