1
0
Fork 1
mirror of https://github.com/vbatts/tar-split.git synced 2024-12-25 06:46:31 +00:00
tar-split/tar/asm/disassemble.go

155 lines
4.1 KiB
Go
Raw Normal View History

package asm

import (
	"io"

	"github.com/vbatts/tar-split/archive/tar"
	"github.com/vbatts/tar-split/tar/storage"
)

// NewInputTarStream wraps the Reader stream of a tar archive and provides a
// Reader stream of the same.
//
// In the middle it will pack the segments and file metadata to storage.Packer
// `p`.
//
// The storage.FilePutter is where payload of files in the stream are stashed.
// If this stashing is not needed, you can provide a nil storage.FilePutter.
// Since the checksumming is still needed, then a default of
// NewDiscardFilePutter will be used internally.
func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io.Reader, error) {
	// What to do here... folks will want their own access to the Reader that is
	// their tar archive stream, but we'll need that same stream to use our
	// forked 'archive/tar'.
	// Perhaps do an io.TeeReader that hands back an io.Reader for them to read
	// from, and we'll MITM the stream to store metadata.
	// We'll need a storage.FilePutter too ...

	// Another concern, whether to do any storage.FilePutter operations, such that we
	// don't extract any amount of the archive. But then again, we're not making
	// files/directories, hardlinks, etc. Just writing the io to the storage.FilePutter.
	// Perhaps we have a DiscardFilePutter that is a bit bucket.

	// we'll return the pipe reader, since TeeReader does not buffer and will
	// only read what the outputRdr Read's. Since Tar archives have padding on
	// the end, we want to be the one reading the padding, even if the user's
	// `archive/tar` doesn't care.
	pR, pW := io.Pipe()
	outputRdr := io.TeeReader(r, pW)

	// we need a putter that will generate the crc64 sums of file payloads
	if fp == nil {
		fp = storage.NewDiscardFilePutter()
	}

	go func() {
		tr := tar.NewReader(outputRdr)
		tr.RawAccounting = true
		for {
			hdr, err := tr.Next()
			if err != nil {
				if err != io.EOF {
					pW.CloseWithError(err)
					return
				}
				// even when an EOF is reached, there is often 1024 null bytes on
				// the end of an archive. Collect them too.
				if b := tr.RawBytes(); len(b) > 0 {
					_, err := p.AddEntry(storage.Entry{
						Type:    storage.SegmentType,
						Payload: b,
					})
					if err != nil {
						pW.CloseWithError(err)
						return
					}
				}
				break // not return. We need the end of the reader.
			}
			if hdr == nil {
				break // not return. We need the end of the reader.
			}

			// Raw bytes accumulated so far (the header and any inter-entry
			// padding) become a segment entry.
			if b := tr.RawBytes(); len(b) > 0 {
				_, err := p.AddEntry(storage.Entry{
					Type:    storage.SegmentType,
					Payload: b,
				})
				if err != nil {
					pW.CloseWithError(err)
					return
				}
			}

			var csum []byte
			if hdr.Size > 0 {
				var err error
				_, csum, err = fp.Put(hdr.Name, tr)
				if err != nil {
					pW.CloseWithError(err)
					return
				}
			}

			entry := storage.Entry{
				Type:    storage.FileType,
				Size:    hdr.Size,
				Payload: csum,
			}
			// For proper marshalling of non-utf8 characters
			entry.SetName(hdr.Name)

			// File entries added, regardless of size
			_, err = p.AddEntry(entry)
			if err != nil {
				pW.CloseWithError(err)
				return
			}

			// Collect the padding that followed the file payload.
			if b := tr.RawBytes(); len(b) > 0 {
				_, err = p.AddEntry(storage.Entry{
					Type:    storage.SegmentType,
					Payload: b,
				})
				if err != nil {
					pW.CloseWithError(err)
					return
				}
			}
		}

		// It is allowable, and not uncommon that there is further padding on
		// the end of an archive, apart from the expected 1024 null bytes. We
		// do this in chunks rather than in one go to avoid cases where a
		// maliciously crafted tar file tries to trick us into reading many GBs
		// into memory.
		const paddingChunkSize = 1024 * 1024
		var paddingChunk [paddingChunkSize]byte
		for {
			var isEOF bool
			n, err := outputRdr.Read(paddingChunk[:])
			if err != nil {
				if err != io.EOF {
					pW.CloseWithError(err)
					return
				}
				isEOF = true
			}
			// Only record a segment when something was actually read;
			// a (0, io.EOF) read would otherwise emit a useless empty
			// entry (all other segment writes above are guarded the
			// same way with len(b) > 0).
			if n > 0 {
				_, err = p.AddEntry(storage.Entry{
					Type:    storage.SegmentType,
					Payload: paddingChunk[:n],
				})
				if err != nil {
					pW.CloseWithError(err)
					return
				}
			}
			if isEOF {
				break
			}
		}
		pW.Close()
	}()

	return pR, nil
}