diff --git a/.travis.yml b/.travis.yml index f700909..dcce57a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,13 +1,18 @@ language: go go: - - 1.4.1 - - 1.3.3 + - tip + - 1.x + - 1.8.x + - 1.7.x + - 1.6.x + - 1.5.x # let us have pretty, fast Docker-based Travis workers! sudo: false -# we don't need "go get" here <3 -install: true +install: + - go get -d ./... script: - go test -v ./... + - go vet ./... diff --git a/DESIGN.md b/DESIGN.md deleted file mode 100644 index 1ce3fd4..0000000 --- a/DESIGN.md +++ /dev/null @@ -1,36 +0,0 @@ -Flow of TAR stream -================== - -The underlying use of `github.com/vbatts/tar-split/archive/tar` is most similar -to stdlib. - - -Packer interface ----------------- - -For ease of storage and usage of the raw bytes, there will be a storage -interface, that accepts an io.Writer (This way you could pass it an in memory -buffer or a file handle). - -Having a Packer interface can allow configuration of hash.Hash for file payloads -and providing your own io.Writer. - -Instead of having a state directory to store all the header information for all -Readers, we will leave that up to user of Reader. Because we can not assume an -ID for each Reader, and keeping that information differentiated. - - - -State Directory ---------------- - -Perhaps we could deduplicate the header info, by hashing the rawbytes and -storing them in a directory tree like: - - ./ac/dc/beef - -Then reference the hash of the header info, in the positional records for the -tar stream. Though this could be a future feature, and not required for an -initial implementation. Also, this would imply an owned state directory, rather -than just writing storage info to an io.Writer. - diff --git a/LICENSE b/LICENSE index 8ba5491..ca03685 100644 --- a/LICENSE +++ b/LICENSE @@ -1,19 +1,28 @@ Copyright (c) 2015 Vincent Batts, Raleigh, NC, USA -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: +All rights reserved. -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index c5e9a71..c2e7f48 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,49 @@ -tar-split -======== +# tar-split [![Build Status](https://travis-ci.org/vbatts/tar-split.svg?branch=master)](https://travis-ci.org/vbatts/tar-split) +[![Go Report Card](https://goreportcard.com/badge/github.com/vbatts/tar-split)](https://goreportcard.com/report/github.com/vbatts/tar-split) -Extend the upstream golang stdlib `archive/tar` library, to expose the raw -bytes of the TAR, rather than just the marshalled headers and file stream. +Pristinely disassembling a tar archive, and stashing needed raw bytes and offsets to reassemble a validating original archive. -The goal being that by preserving the raw bytes of each header, padding bytes, -and the raw file payload, one could reassemble the original archive. +## Docs - -Docs ----- +Code API for libraries provided by `tar-split`: * https://godoc.org/github.com/vbatts/tar-split/tar/asm * https://godoc.org/github.com/vbatts/tar-split/tar/storage * https://godoc.org/github.com/vbatts/tar-split/archive/tar +## Install -Caveat ------- +The command line utilitiy is installable via: + +```bash +go get github.com/vbatts/tar-split/cmd/tar-split +``` + +## Usage + +For cli usage, see its [README.md](cmd/tar-split/README.md). +For the library see the [docs](#docs) + +## Demo + +### Basic disassembly and assembly + +This demonstrates the `tar-split` command and how to assemble a tar archive from the `tar-data.json.gz` + + +![basic cmd demo thumbnail](https://i.ytimg.com/vi/vh5wyjIOBtc/2.jpg?time=1445027151805) +[youtube video of basic command demo](https://youtu.be/vh5wyjIOBtc) + +### Docker layer preservation + +This demonstrates the tar-split integration for docker-1.8. Providing consistent tar archives for the image layer content. + +![docker tar-split demo](https://i.ytimg.com/vi_webp/vh5wyjIOBtc/default.webp) +[youtube vide of docker layer checksums](https://youtu.be/tV_Dia8E8xw) + +## Caveat Eventually this should detect TARs that this is not possible with. @@ -37,85 +61,21 @@ same path, we will not support this feature. If there are more than one entries with the same path, expect an err (like `ErrDuplicatePath`) or a resulting tar stream that does not validate your original checksum/signature. +## Contract -Contract --------- +Do not break the API of stdlib `archive/tar` in our fork (ideally find an upstream mergeable solution). -Do not break the API of stdlib `archive/tar` in our fork (ideally find an -upstream mergeable solution) +## Std Version + +The version of golang stdlib `archive/tar` is from go1.6 +It is minimally extended to expose the raw bytes of the TAR, rather than just the marshalled headers and file stream. -Std Version ------------ +## Design -The version of golang stdlib `archive/tar` is from go1.4.1, and their master branch around [a9dddb53f](https://github.com/golang/go/tree/a9dddb53f) +See the [design](concept/DESIGN.md). - -Example -------- - -First we'll get an archive to work with. For repeatability, we'll make an -archive from what you've just cloned: - -``` -git archive --format=tar -o tar-split.tar HEAD . -``` - -Then build the example main.go: - -``` -go build ./main.go -``` - -Now run the example over the archive: - -``` -$ ./main tar-split.tar -2015/02/20 15:00:58 writing "tar-split.tar" to "tar-split.tar.out" -pax_global_header pre: 512 read: 52 -.travis.yml pre: 972 read: 374 -DESIGN.md pre: 650 read: 1131 -LICENSE pre: 917 read: 1075 -README.md pre: 973 read: 4289 -archive/ pre: 831 read: 0 -archive/tar/ pre: 512 read: 0 -archive/tar/common.go pre: 512 read: 7790 -[...] -tar/storage/entry_test.go pre: 667 read: 1137 -tar/storage/getter.go pre: 911 read: 2741 -tar/storage/getter_test.go pre: 843 read: 1491 -tar/storage/packer.go pre: 557 read: 3141 -tar/storage/packer_test.go pre: 955 read: 3096 -EOF padding: 1512 -Remainder: 512 -Size: 215040; Sum: 215040 -``` - -*What are we seeing here?* - -* `pre` is the header of a file entry, and potentially the padding from the - end of the prior file's payload. Also with particular tar extensions and pax - attributes, the header can exceed 512 bytes. -* `read` is the size of the file payload from the entry -* `EOF padding` is the expected 1024 null bytes on the end of a tar archive, - plus potential padding from the end of the prior file entry's payload -* `Remainder` is the remaining bytes of an archive. This is typically deadspace - as most tar implmentations will return after having reached the end of the - 1024 null bytes. Though various implementations will include some amount of - bytes here, which will affect the checksum of the resulting tar archive, - therefore this must be accounted for as well. - -Ideally the input tar and output `*.out`, will match: - -``` -$ sha1sum tar-split.tar* -ca9e19966b892d9ad5960414abac01ef585a1e22 tar-split.tar -ca9e19966b892d9ad5960414abac01ef585a1e22 tar-split.tar.out -``` - - -Stored Metadata ---------------- +## Stored Metadata Since the raw bytes of the headers and padding are stored, you may be wondering what the size implications are. The headers are at least 512 bytes per @@ -123,14 +83,16 @@ file (sometimes more), at least 1024 null bytes on the end, and then various padding. This makes for a constant linear growth in the stored metadata, with a naive storage implementation. -Reusing our prior example's `tar-split.tar`, let's build the checksize.go example: +First we'll get an archive to work with. For repeatability, we'll make an +archive from what you've just cloned: -``` -go build ./checksize.go +```bash +git archive --format=tar -o tar-split.tar HEAD . ``` -``` -$ ./checksize ./tar-split.tar +```bash +$ go get github.com/vbatts/tar-split/cmd/tar-split +$ tar-split checksize ./tar-split.tar inspecting "tar-split.tar" (size 210k) -- number of files: 50 -- size of metadata uncompressed: 53k @@ -143,10 +105,10 @@ implications are as little as 3kb. But let's look at a larger archive, with many files. -``` +```bash $ ls -sh ./d.tar 1.4G ./d.tar -$ ./checksize ~/d.tar +$ tar-split checksize ~/d.tar inspecting "/home/vbatts/d.tar" (size 1420749k) -- number of files: 38718 -- size of metadata uncompressed: 43261k @@ -163,19 +125,14 @@ bytes-per-file rate for the storage implications. | ~ 1kb per/file | 0.06kb per/file | -What's Next? ------------- +## What's Next? * More implementations of storage Packer and Unpacker - - could be a redis or mongo backend * More implementations of FileGetter and FilePutter - - could be a redis or mongo backend -* cli tooling to assemble/disassemble a provided tar archive * would be interesting to have an assembler stream that implements `io.Seeker` -License -------- -See LICENSE +## License +See [LICENSE](LICENSE) diff --git a/archive/tar/common.go b/archive/tar/common.go index e363aa7..36f4e23 100644 --- a/archive/tar/common.go +++ b/archive/tar/common.go @@ -139,8 +139,8 @@ func (fi headerFileInfo) Mode() (mode os.FileMode) { } switch fi.h.Typeflag { - case TypeLink, TypeSymlink: - // hard link, symbolic link + case TypeSymlink: + // symbolic link mode |= os.ModeSymlink case TypeChar: // character device node @@ -249,6 +249,30 @@ func FileInfoHeader(fi os.FileInfo, link string) (*Header, error) { if fm&os.ModeSticky != 0 { h.Mode |= c_ISVTX } + // If possible, populate additional fields from OS-specific + // FileInfo fields. + if sys, ok := fi.Sys().(*Header); ok { + // This FileInfo came from a Header (not the OS). Use the + // original Header to populate all remaining fields. + h.Uid = sys.Uid + h.Gid = sys.Gid + h.Uname = sys.Uname + h.Gname = sys.Gname + h.AccessTime = sys.AccessTime + h.ChangeTime = sys.ChangeTime + if sys.Xattrs != nil { + h.Xattrs = make(map[string]string) + for k, v := range sys.Xattrs { + h.Xattrs[k] = v + } + } + if sys.Typeflag == TypeLink { + // hard link + h.Typeflag = TypeLink + h.Size = 0 + h.Linkname = sys.Linkname + } + } if sysStat != nil { return h, sysStat(fi, h) } @@ -303,3 +327,14 @@ func toASCII(s string) string { } return buf.String() } + +// isHeaderOnlyType checks if the given type flag is of the type that has no +// data section even if a size is specified. +func isHeaderOnlyType(flag byte) bool { + switch flag { + case TypeLink, TypeSymlink, TypeChar, TypeBlock, TypeDir, TypeFifo: + return true + default: + return false + } +} diff --git a/archive/tar/example_test.go b/archive/tar/example_test.go index 2317f44..5f0ce2f 100644 --- a/archive/tar/example_test.go +++ b/archive/tar/example_test.go @@ -26,7 +26,7 @@ func Example() { }{ {"readme.txt", "This archive contains some text files."}, {"gopher.txt", "Gopher names:\nGeorge\nGeoffrey\nGonzo"}, - {"todo.txt", "Get animal handling licence."}, + {"todo.txt", "Get animal handling license."}, } for _, file := range files { hdr := &tar.Header{ @@ -76,5 +76,5 @@ func Example() { // Geoffrey // Gonzo // Contents of todo.txt: - // Get animal handling licence. + // Get animal handling license. } diff --git a/archive/tar/reader.go b/archive/tar/reader.go index a89957e..adf3212 100644 --- a/archive/tar/reader.go +++ b/archive/tar/reader.go @@ -12,6 +12,7 @@ import ( "errors" "io" "io/ioutil" + "math" "os" "strconv" "strings" @@ -39,6 +40,10 @@ type Reader struct { rawBytes *bytes.Buffer // last raw bits } +type parser struct { + err error // Last error seen +} + // RawBytes accesses the raw bytes of the archive, apart from the file payload itself. // This includes the header and padding. // @@ -70,12 +75,36 @@ type regFileReader struct { nb int64 // number of unread bytes for current file entry } -// A sparseFileReader is a numBytesReader for reading sparse file data from a tar archive. +// A sparseFileReader is a numBytesReader for reading sparse file data from a +// tar archive. type sparseFileReader struct { - rfr *regFileReader // reads the sparse-encoded file data - sp []sparseEntry // the sparse map for the file - pos int64 // keeps track of file position - tot int64 // total size of the file + rfr numBytesReader // Reads the sparse-encoded file data + sp []sparseEntry // The sparse map for the file + pos int64 // Keeps track of file position + total int64 // Total size of the file +} + +// A sparseEntry holds a single entry in a sparse file's sparse map. +// +// Sparse files are represented using a series of sparseEntrys. +// Despite the name, a sparseEntry represents an actual data fragment that +// references data found in the underlying archive stream. All regions not +// covered by a sparseEntry are logically filled with zeros. +// +// For example, if the underlying raw file contains the 10-byte data: +// var compactData = "abcdefgh" +// +// And the sparse map has the following entries: +// var sp = []sparseEntry{ +// {offset: 2, numBytes: 5} // Data fragment for [2..7] +// {offset: 18, numBytes: 3} // Data fragment for [18..21] +// } +// +// Then the content of the resulting sparse file with a "real" size of 25 is: +// var sparseData = "\x00"*2 + "abcde" + "\x00"*11 + "fgh" + "\x00"*4 +type sparseEntry struct { + offset int64 // Starting position of the fragment + numBytes int64 // Length of the fragment } // Keywords for GNU sparse files in a PAX extended header @@ -109,7 +138,6 @@ func NewReader(r io.Reader) *Reader { return &Reader{r: r} } // // io.EOF is returned at the end of the input. func (tr *Reader) Next() (*Header, error) { - var hdr *Header if tr.RawAccounting { if tr.rawBytes == nil { tr.rawBytes = bytes.NewBuffer(nil) @@ -117,88 +145,88 @@ func (tr *Reader) Next() (*Header, error) { tr.rawBytes.Reset() } } - if tr.err == nil { - tr.skipUnread() - } - if tr.err != nil { - return hdr, tr.err - } - hdr = tr.readHeader() - if hdr == nil { - return hdr, tr.err - } - // Check for PAX/GNU header. - switch hdr.Typeflag { - case TypeXHeader: - // PAX extended header - headers, err := parsePAX(tr) - if err != nil { - return nil, err - } - // We actually read the whole file, - // but this skips alignment padding - tr.skipUnread() - hdr = tr.readHeader() - mergePAX(hdr, headers) - // Check for a PAX format sparse file - sp, err := tr.checkForGNUSparsePAXHeaders(hdr, headers) - if err != nil { - tr.err = err - return nil, err - } - if sp != nil { - // Current file is a PAX format GNU sparse file. - // Set the current file reader to a sparse file reader. - tr.curr = &sparseFileReader{rfr: tr.curr.(*regFileReader), sp: sp, tot: hdr.Size} - } - return hdr, nil - case TypeGNULongName: - // We have a GNU long name header. Its contents are the real file name. - realname, err := ioutil.ReadAll(tr) - if err != nil { - return nil, err - } - var b []byte - if tr.RawAccounting { - if _, err = tr.rawBytes.Write(realname); err != nil { - return nil, err - } - b = tr.RawBytes() - } - hdr, err := tr.Next() - // since the above call to Next() resets the buffer, we need to throw the bytes over - if tr.RawAccounting { - if _, err = tr.rawBytes.Write(b); err != nil { - return nil, err - } - } - hdr.Name = cString(realname) - return hdr, err - case TypeGNULongLink: - // We have a GNU long link header. - realname, err := ioutil.ReadAll(tr) - if err != nil { - return nil, err - } - var b []byte - if tr.RawAccounting { - if _, err = tr.rawBytes.Write(realname); err != nil { - return nil, err - } - b = tr.RawBytes() - } - hdr, err := tr.Next() - // since the above call to Next() resets the buffer, we need to throw the bytes over - if tr.RawAccounting { - if _, err = tr.rawBytes.Write(b); err != nil { - return nil, err - } - } - hdr.Linkname = cString(realname) - return hdr, err + if tr.err != nil { + return nil, tr.err } - return hdr, tr.err + + var hdr *Header + var extHdrs map[string]string + + // Externally, Next iterates through the tar archive as if it is a series of + // files. Internally, the tar format often uses fake "files" to add meta + // data that describes the next file. These meta data "files" should not + // normally be visible to the outside. As such, this loop iterates through + // one or more "header files" until it finds a "normal file". +loop: + for { + tr.err = tr.skipUnread() + if tr.err != nil { + return nil, tr.err + } + + hdr = tr.readHeader() + if tr.err != nil { + return nil, tr.err + } + // Check for PAX/GNU special headers and files. + switch hdr.Typeflag { + case TypeXHeader: + extHdrs, tr.err = parsePAX(tr) + if tr.err != nil { + return nil, tr.err + } + continue loop // This is a meta header affecting the next header + case TypeGNULongName, TypeGNULongLink: + var realname []byte + realname, tr.err = ioutil.ReadAll(tr) + if tr.err != nil { + return nil, tr.err + } + + if tr.RawAccounting { + if _, tr.err = tr.rawBytes.Write(realname); tr.err != nil { + return nil, tr.err + } + } + + // Convert GNU extensions to use PAX headers. + if extHdrs == nil { + extHdrs = make(map[string]string) + } + var p parser + switch hdr.Typeflag { + case TypeGNULongName: + extHdrs[paxPath] = p.parseString(realname) + case TypeGNULongLink: + extHdrs[paxLinkpath] = p.parseString(realname) + } + if p.err != nil { + tr.err = p.err + return nil, tr.err + } + continue loop // This is a meta header affecting the next header + default: + mergePAX(hdr, extHdrs) + + // Check for a PAX format sparse file + sp, err := tr.checkForGNUSparsePAXHeaders(hdr, extHdrs) + if err != nil { + tr.err = err + return nil, err + } + if sp != nil { + // Current file is a PAX format GNU sparse file. + // Set the current file reader to a sparse file reader. + tr.curr, tr.err = newSparseFileReader(tr.curr, sp, hdr.Size) + if tr.err != nil { + return nil, tr.err + } + } + break loop // This is a file, so stop + } + } + return hdr, nil } // checkForGNUSparsePAXHeaders checks the PAX headers for GNU sparse headers. If they are found, then @@ -375,6 +403,7 @@ func parsePAX(r io.Reader) (map[string]string, error) { return nil, err } } + sbuf := string(buf) // For GNU PAX sparse format 0.0 support. // This function transforms the sparse format 0.0 headers into sparse format 0.1 headers. @@ -383,35 +412,17 @@ func parsePAX(r io.Reader) (map[string]string, error) { headers := make(map[string]string) // Each record is constructed as // "%d %s=%s\n", length, keyword, value - for len(buf) > 0 { - // or the header was empty to start with. - var sp int - // The size field ends at the first space. - sp = bytes.IndexByte(buf, ' ') - if sp == -1 { - return nil, ErrHeader - } - // Parse the first token as a decimal integer. - n, err := strconv.ParseInt(string(buf[:sp]), 10, 0) + for len(sbuf) > 0 { + key, value, residual, err := parsePAXRecord(sbuf) if err != nil { return nil, ErrHeader } - // Extract everything between the decimal and the n -1 on the - // beginning to eat the ' ', -1 on the end to skip the newline. - var record []byte - record, buf = buf[sp+1:n-1], buf[n:] - // The first equals is guaranteed to mark the end of the key. - // Everything else is value. - eq := bytes.IndexByte(record, '=') - if eq == -1 { - return nil, ErrHeader - } - key, value := record[:eq], record[eq+1:] + sbuf = residual keyStr := string(key) if keyStr == paxGNUSparseOffset || keyStr == paxGNUSparseNumBytes { // GNU sparse format 0.0 special key. Write to sparseMap instead of using the headers map. - sparseMap.Write(value) + sparseMap.WriteString(value) sparseMap.Write([]byte{','}) } else { // Normal key. Set the value in the headers map. @@ -426,9 +437,42 @@ func parsePAX(r io.Reader) (map[string]string, error) { return headers, nil } -// cString parses bytes as a NUL-terminated C-style string. +// parsePAXRecord parses the input PAX record string into a key-value pair. +// If parsing is successful, it will slice off the currently read record and +// return the remainder as r. +// +// A PAX record is of the following form: +// "%d %s=%s\n" % (size, key, value) +func parsePAXRecord(s string) (k, v, r string, err error) { + // The size field ends at the first space. + sp := strings.IndexByte(s, ' ') + if sp == -1 { + return "", "", s, ErrHeader + } + + // Parse the first token as a decimal integer. + n, perr := strconv.ParseInt(s[:sp], 10, 0) // Intentionally parse as native int + if perr != nil || n < 5 || int64(len(s)) < n { + return "", "", s, ErrHeader + } + + // Extract everything between the space and the final newline. + rec, nl, rem := s[sp+1:n-1], s[n-1:n], s[n:] + if nl != "\n" { + return "", "", s, ErrHeader + } + + // The first equals separates the key from the value. + eq := strings.IndexByte(rec, '=') + if eq == -1 { + return "", "", s, ErrHeader + } + return rec[:eq], rec[eq+1:], rem, nil +} + +// parseString parses bytes as a NUL-terminated C-style string. // If a NUL byte is not found then the whole slice is returned as a string. -func cString(b []byte) string { +func (*parser) parseString(b []byte) string { n := 0 for n < len(b) && b[n] != 0 { n++ @@ -436,19 +480,51 @@ func cString(b []byte) string { return string(b[0:n]) } -func (tr *Reader) octal(b []byte) int64 { - // Check for binary format first. +// parseNumeric parses the input as being encoded in either base-256 or octal. +// This function may return negative numbers. +// If parsing fails or an integer overflow occurs, err will be set. +func (p *parser) parseNumeric(b []byte) int64 { + // Check for base-256 (binary) format first. + // If the first bit is set, then all following bits constitute a two's + // complement encoded number in big-endian byte order. if len(b) > 0 && b[0]&0x80 != 0 { - var x int64 - for i, c := range b { - if i == 0 { - c &= 0x7f // ignore signal bit in first byte - } - x = x<<8 | int64(c) + // Handling negative numbers relies on the following identity: + // -a-1 == ^a + // + // If the number is negative, we use an inversion mask to invert the + // data bytes and treat the value as an unsigned number. + var inv byte // 0x00 if positive or zero, 0xff if negative + if b[0]&0x40 != 0 { + inv = 0xff } - return x + + var x uint64 + for i, c := range b { + c ^= inv // Inverts c only if inv is 0xff, otherwise does nothing + if i == 0 { + c &= 0x7f // Ignore signal bit in first byte + } + if (x >> 56) > 0 { + p.err = ErrHeader // Integer overflow + return 0 + } + x = x<<8 | uint64(c) + } + if (x >> 63) > 0 { + p.err = ErrHeader // Integer overflow + return 0 + } + if inv == 0xff { + return ^int64(x) + } + return int64(x) } + // Normal case is base-8 (octal) format. + return p.parseOctal(b) +} + +func (p *parser) parseOctal(b []byte) int64 { // Because unused fields are filled with NULs, we need // to skip leading NULs. Fields may also be padded with // spaces or NULs. @@ -459,27 +535,55 @@ func (tr *Reader) octal(b []byte) int64 { if len(b) == 0 { return 0 } - x, err := strconv.ParseUint(cString(b), 8, 64) - if err != nil { - tr.err = err + x, perr := strconv.ParseUint(p.parseString(b), 8, 64) + if perr != nil { + p.err = ErrHeader } return int64(x) } -// skipUnread skips any unread bytes in the existing file entry, as well as any alignment padding. -func (tr *Reader) skipUnread() { - nr := tr.numBytes() + tr.pad // number of bytes to skip +// skipUnread skips any unread bytes in the existing file entry, as well as any +// alignment padding. It returns io.ErrUnexpectedEOF if any io.EOF is +// encountered in the data portion; it is okay to hit io.EOF in the padding. +// +// Note that this function still works properly even when sparse files are being +// used since numBytes returns the bytes remaining in the underlying io.Reader. +func (tr *Reader) skipUnread() error { + dataSkip := tr.numBytes() // Number of data bytes to skip + totalSkip := dataSkip + tr.pad // Total number of bytes to skip tr.curr, tr.pad = nil, 0 if tr.RawAccounting { - _, tr.err = io.CopyN(tr.rawBytes, tr.r, nr) - return + _, tr.err = io.CopyN(tr.rawBytes, tr.r, totalSkip) + return tr.err } - if sr, ok := tr.r.(io.Seeker); ok { - if _, err := sr.Seek(nr, os.SEEK_CUR); err == nil { - return + // If possible, Seek to the last byte before the end of the data section. + // Do this because Seek is often lazy about reporting errors; this will mask + // the fact that the tar stream may be truncated. We can rely on the + // io.CopyN done shortly afterwards to trigger any IO errors. + var seekSkipped int64 // Number of bytes skipped via Seek + if sr, ok := tr.r.(io.Seeker); ok && dataSkip > 1 { + // Not all io.Seeker can actually Seek. For example, os.Stdin implements + // io.Seeker, but calling Seek always returns an error and performs + // no action. Thus, we try an innocent seek to the current position + // to see if Seek is really supported. + pos1, err := sr.Seek(0, os.SEEK_CUR) + if err == nil { + // Seek seems supported, so perform the real Seek. + pos2, err := sr.Seek(dataSkip-1, os.SEEK_CUR) + if err != nil { + tr.err = err + return tr.err + } + seekSkipped = pos2 - pos1 } } - _, tr.err = io.CopyN(ioutil.Discard, tr.r, nr) + + var copySkipped int64 // Number of bytes skipped via CopyN + copySkipped, tr.err = io.CopyN(ioutil.Discard, tr.r, totalSkip-seekSkipped) + if tr.err == io.EOF && seekSkipped+copySkipped < dataSkip { + tr.err = io.ErrUnexpectedEOF + } + return tr.err } func (tr *Reader) verifyChecksum(header []byte) bool { @@ -487,23 +591,32 @@ func (tr *Reader) verifyChecksum(header []byte) bool { return false } - given := tr.octal(header[148:156]) + var p parser + given := p.parseOctal(header[148:156]) unsigned, signed := checksum(header) - return given == unsigned || given == signed + return p.err == nil && (given == unsigned || given == signed) } +// readHeader reads the next block header and assumes that the underlying reader +// is already aligned to a block boundary. +// +// The err will be set to io.EOF only when one of the following occurs: +// * Exactly 0 bytes are read and EOF is hit. +// * Exactly 1 block of zeros is read and EOF is hit. +// * At least 2 blocks of zeros are read. func (tr *Reader) readHeader() *Header { header := tr.hdrBuff[:] copy(header, zeroBlock) - if _, tr.err = io.ReadFull(tr.r, header); tr.err != nil { + if n, err := io.ReadFull(tr.r, header); err != nil { + tr.err = err // because it could read some of the block, but reach EOF first if tr.err == io.EOF && tr.RawAccounting { - if _, tr.err = tr.rawBytes.Write(header); tr.err != nil { - return nil + if _, err := tr.rawBytes.Write(header[:n]); err != nil { + tr.err = err } } - return nil + return nil // io.EOF is okay here } if tr.RawAccounting { if _, tr.err = tr.rawBytes.Write(header); tr.err != nil { @@ -513,14 +626,15 @@ func (tr *Reader) readHeader() *Header { // Two blocks of zero bytes marks the end of the archive. if bytes.Equal(header, zeroBlock[0:blockSize]) { - if _, tr.err = io.ReadFull(tr.r, header); tr.err != nil { + if n, err := io.ReadFull(tr.r, header); err != nil { + tr.err = err // because it could read some of the block, but reach EOF first if tr.err == io.EOF && tr.RawAccounting { - if _, tr.err = tr.rawBytes.Write(header); tr.err != nil { - return nil + if _, err := tr.rawBytes.Write(header[:n]); err != nil { + tr.err = err } } - return nil + return nil // io.EOF is okay here } if tr.RawAccounting { if _, tr.err = tr.rawBytes.Write(header); tr.err != nil { @@ -541,18 +655,19 @@ func (tr *Reader) readHeader() *Header { } // Unpack + var p parser hdr := new(Header) s := slicer(header) - hdr.Name = cString(s.next(100)) - hdr.Mode = tr.octal(s.next(8)) - hdr.Uid = int(tr.octal(s.next(8))) - hdr.Gid = int(tr.octal(s.next(8))) - hdr.Size = tr.octal(s.next(12)) - hdr.ModTime = time.Unix(tr.octal(s.next(12)), 0) + hdr.Name = p.parseString(s.next(100)) + hdr.Mode = p.parseNumeric(s.next(8)) + hdr.Uid = int(p.parseNumeric(s.next(8))) + hdr.Gid = int(p.parseNumeric(s.next(8))) + hdr.Size = p.parseNumeric(s.next(12)) + hdr.ModTime = time.Unix(p.parseNumeric(s.next(12)), 0) s.next(8) // chksum hdr.Typeflag = s.next(1)[0] - hdr.Linkname = cString(s.next(100)) + hdr.Linkname = p.parseString(s.next(100)) // The remainder of the header depends on the value of magic. // The original (v7) version of tar had no explicit magic field, @@ -572,70 +687,76 @@ func (tr *Reader) readHeader() *Header { switch format { case "posix", "gnu", "star": - hdr.Uname = cString(s.next(32)) - hdr.Gname = cString(s.next(32)) + hdr.Uname = p.parseString(s.next(32)) + hdr.Gname = p.parseString(s.next(32)) devmajor := s.next(8) devminor := s.next(8) if hdr.Typeflag == TypeChar || hdr.Typeflag == TypeBlock { - hdr.Devmajor = tr.octal(devmajor) - hdr.Devminor = tr.octal(devminor) + hdr.Devmajor = p.parseNumeric(devmajor) + hdr.Devminor = p.parseNumeric(devminor) } var prefix string switch format { case "posix", "gnu": - prefix = cString(s.next(155)) + prefix = p.parseString(s.next(155)) case "star": - prefix = cString(s.next(131)) - hdr.AccessTime = time.Unix(tr.octal(s.next(12)), 0) - hdr.ChangeTime = time.Unix(tr.octal(s.next(12)), 0) + prefix = p.parseString(s.next(131)) + hdr.AccessTime = time.Unix(p.parseNumeric(s.next(12)), 0) + hdr.ChangeTime = time.Unix(p.parseNumeric(s.next(12)), 0) } if len(prefix) > 0 { hdr.Name = prefix + "/" + hdr.Name } } - if tr.err != nil { + if p.err != nil { + tr.err = p.err + return nil + } + + nb := hdr.Size + if isHeaderOnlyType(hdr.Typeflag) { + nb = 0 + } + if nb < 0 { tr.err = ErrHeader return nil } - // Maximum value of hdr.Size is 64 GB (12 octal digits), - // so there's no risk of int64 overflowing. - nb := int64(hdr.Size) - tr.pad = -nb & (blockSize - 1) // blockSize is a power of two - // Set the current file reader. + tr.pad = -nb & (blockSize - 1) // blockSize is a power of two tr.curr = ®FileReader{r: tr.r, nb: nb} // Check for old GNU sparse format entry. if hdr.Typeflag == TypeGNUSparse { // Get the real size of the file. - hdr.Size = tr.octal(header[483:495]) + hdr.Size = p.parseNumeric(header[483:495]) + if p.err != nil { + tr.err = p.err + return nil + } // Read the sparse map. sp := tr.readOldGNUSparseMap(header) if tr.err != nil { return nil } + // Current file is a GNU sparse file. Update the current file reader. - tr.curr = &sparseFileReader{rfr: tr.curr.(*regFileReader), sp: sp, tot: hdr.Size} + tr.curr, tr.err = newSparseFileReader(tr.curr, sp, hdr.Size) + if tr.err != nil { + return nil + } } return hdr } -// A sparseEntry holds a single entry in a sparse file's sparse map. -// A sparse entry indicates the offset and size in a sparse file of a -// block of data. -type sparseEntry struct { - offset int64 - numBytes int64 -} - // readOldGNUSparseMap reads the sparse map as stored in the old GNU sparse format. // The sparse map is stored in the tar header if it's small enough. If it's larger than four entries, // then one or more extension headers are used to store the rest of the sparse map. func (tr *Reader) readOldGNUSparseMap(header []byte) []sparseEntry { + var p parser isExtended := header[oldGNUSparseMainHeaderIsExtendedOffset] != 0 spCap := oldGNUSparseMainHeaderNumEntries if isExtended { @@ -646,10 +767,10 @@ func (tr *Reader) readOldGNUSparseMap(header []byte) []sparseEntry { // Read the four entries from the main tar header for i := 0; i < oldGNUSparseMainHeaderNumEntries; i++ { - offset := tr.octal(s.next(oldGNUSparseOffsetSize)) - numBytes := tr.octal(s.next(oldGNUSparseNumBytesSize)) - if tr.err != nil { - tr.err = ErrHeader + offset := p.parseNumeric(s.next(oldGNUSparseOffsetSize)) + numBytes := p.parseNumeric(s.next(oldGNUSparseNumBytesSize)) + if p.err != nil { + tr.err = p.err return nil } if offset == 0 && numBytes == 0 { @@ -673,10 +794,10 @@ func (tr *Reader) readOldGNUSparseMap(header []byte) []sparseEntry { isExtended = sparseHeader[oldGNUSparseExtendedHeaderIsExtendedOffset] != 0 s = slicer(sparseHeader) for i := 0; i < oldGNUSparseExtendedHeaderNumEntries; i++ { - offset := tr.octal(s.next(oldGNUSparseOffsetSize)) - numBytes := tr.octal(s.next(oldGNUSparseNumBytesSize)) - if tr.err != nil { - tr.err = ErrHeader + offset := p.parseNumeric(s.next(oldGNUSparseOffsetSize)) + numBytes := p.parseNumeric(s.next(oldGNUSparseNumBytesSize)) + if p.err != nil { + tr.err = p.err return nil } if offset == 0 && numBytes == 0 { @@ -688,134 +809,111 @@ func (tr *Reader) readOldGNUSparseMap(header []byte) []sparseEntry { return sp } -// readGNUSparseMap1x0 reads the sparse map as stored in GNU's PAX sparse format version 1.0. -// The sparse map is stored just before the file data and padded out to the nearest block boundary. +// readGNUSparseMap1x0 reads the sparse map as stored in GNU's PAX sparse format +// version 1.0. The format of the sparse map consists of a series of +// newline-terminated numeric fields. The first field is the number of entries +// and is always present. Following this are the entries, consisting of two +// fields (offset, numBytes). This function must stop reading at the end +// boundary of the block containing the last newline. +// +// Note that the GNU manual says that numeric values should be encoded in octal +// format. However, the GNU tar utility itself outputs these values in decimal. +// As such, this library treats values as being encoded in decimal. func readGNUSparseMap1x0(r io.Reader) ([]sparseEntry, error) { - buf := make([]byte, 2*blockSize) - sparseHeader := buf[:blockSize] + var cntNewline int64 + var buf bytes.Buffer + var blk = make([]byte, blockSize) - // readDecimal is a helper function to read a decimal integer from the sparse map - // while making sure to read from the file in blocks of size blockSize - readDecimal := func() (int64, error) { - // Look for newline - nl := bytes.IndexByte(sparseHeader, '\n') - if nl == -1 { - if len(sparseHeader) >= blockSize { - // This is an error - return 0, ErrHeader + // feedTokens copies data in numBlock chunks from r into buf until there are + // at least cnt newlines in buf. It will not read more blocks than needed. + var feedTokens = func(cnt int64) error { + for cntNewline < cnt { + if _, err := io.ReadFull(r, blk); err != nil { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + return err } - oldLen := len(sparseHeader) - newLen := oldLen + blockSize - if cap(sparseHeader) < newLen { - // There's more header, but we need to make room for the next block - copy(buf, sparseHeader) - sparseHeader = buf[:newLen] - } else { - // There's more header, and we can just reslice - sparseHeader = sparseHeader[:newLen] - } - - // Now that sparseHeader is large enough, read next block - if _, err := io.ReadFull(r, sparseHeader[oldLen:newLen]); err != nil { - return 0, err - } - // leaving this function for io.Reader makes it more testable - if tr, ok := r.(*Reader); ok && tr.RawAccounting { - if _, err := tr.rawBytes.Write(sparseHeader[oldLen:newLen]); err != nil { - return 0, err + buf.Write(blk) + for _, c := range blk { + if c == '\n' { + cntNewline++ } } - - // Look for a newline in the new data - nl = bytes.IndexByte(sparseHeader[oldLen:newLen], '\n') - if nl == -1 { - // This is an error - return 0, ErrHeader - } - nl += oldLen // We want the position from the beginning } - // Now that we've found a newline, read a number - n, err := strconv.ParseInt(string(sparseHeader[:nl]), 10, 0) - if err != nil { - return 0, ErrHeader - } - - // Update sparseHeader to consume this number - sparseHeader = sparseHeader[nl+1:] - return n, nil + return nil } - // Read the first block - if _, err := io.ReadFull(r, sparseHeader); err != nil { + // nextToken gets the next token delimited by a newline. This assumes that + // at least one newline exists in the buffer. + var nextToken = func() string { + cntNewline-- + tok, _ := buf.ReadString('\n') + return tok[:len(tok)-1] // Cut off newline + } + + // Parse for the number of entries. + // Use integer overflow resistant math to check this. + if err := feedTokens(1); err != nil { return nil, err } - // leaving this function for io.Reader makes it more testable - if tr, ok := r.(*Reader); ok && tr.RawAccounting { - if _, err := tr.rawBytes.Write(sparseHeader); err != nil { - return nil, err - } + numEntries, err := strconv.ParseInt(nextToken(), 10, 0) // Intentionally parse as native int + if err != nil || numEntries < 0 || int(2*numEntries) < int(numEntries) { + return nil, ErrHeader } - // The first line contains the number of entries - numEntries, err := readDecimal() - if err != nil { + // Parse for all member entries. + // numEntries is trusted after this since a potential attacker must have + // committed resources proportional to what this library used. + if err := feedTokens(2 * numEntries); err != nil { return nil, err } - - // Read all the entries sp := make([]sparseEntry, 0, numEntries) for i := int64(0); i < numEntries; i++ { - // Read the offset - offset, err := readDecimal() + offset, err := strconv.ParseInt(nextToken(), 10, 64) if err != nil { - return nil, err + return nil, ErrHeader } - // Read numBytes - numBytes, err := readDecimal() + numBytes, err := strconv.ParseInt(nextToken(), 10, 64) if err != nil { - return nil, err + return nil, ErrHeader } - sp = append(sp, sparseEntry{offset: offset, numBytes: numBytes}) } - return sp, nil } -// readGNUSparseMap0x1 reads the sparse map as stored in GNU's PAX sparse format version 0.1. -// The sparse map is stored in the PAX headers. -func readGNUSparseMap0x1(headers map[string]string) ([]sparseEntry, error) { - // Get number of entries - numEntriesStr, ok := headers[paxGNUSparseNumBlocks] - if !ok { - return nil, ErrHeader - } - numEntries, err := strconv.ParseInt(numEntriesStr, 10, 0) - if err != nil { +// readGNUSparseMap0x1 reads the sparse map as stored in GNU's PAX sparse format +// version 0.1. The sparse map is stored in the PAX headers. +func readGNUSparseMap0x1(extHdrs map[string]string) ([]sparseEntry, error) { + // Get number of entries. + // Use integer overflow resistant math to check this. + numEntriesStr := extHdrs[paxGNUSparseNumBlocks] + numEntries, err := strconv.ParseInt(numEntriesStr, 10, 0) // Intentionally parse as native int + if err != nil || numEntries < 0 || int(2*numEntries) < int(numEntries) { return nil, ErrHeader } - sparseMap := strings.Split(headers[paxGNUSparseMap], ",") - - // There should be two numbers in sparseMap for each entry + // There should be two numbers in sparseMap for each entry. + sparseMap := strings.Split(extHdrs[paxGNUSparseMap], ",") if int64(len(sparseMap)) != 2*numEntries { return nil, ErrHeader } - // Loop through the entries in the sparse map + // Loop through the entries in the sparse map. + // numEntries is trusted now. sp := make([]sparseEntry, 0, numEntries) for i := int64(0); i < numEntries; i++ { - offset, err := strconv.ParseInt(sparseMap[2*i], 10, 0) + offset, err := strconv.ParseInt(sparseMap[2*i], 10, 64) if err != nil { return nil, ErrHeader } - numBytes, err := strconv.ParseInt(sparseMap[2*i+1], 10, 0) + numBytes, err := strconv.ParseInt(sparseMap[2*i+1], 10, 64) if err != nil { return nil, ErrHeader } sp = append(sp, sparseEntry{offset: offset, numBytes: numBytes}) } - return sp, nil } @@ -832,10 +930,18 @@ func (tr *Reader) numBytes() int64 { // Read reads from the current entry in the tar archive. // It returns 0, io.EOF when it reaches the end of that entry, // until Next is called to advance to the next entry. +// +// Calling Read on special types like TypeLink, TypeSymLink, TypeChar, +// TypeBlock, TypeDir, and TypeFifo returns 0, io.EOF regardless of what +// the Header.Size claims. func (tr *Reader) Read(b []byte) (n int, err error) { + if tr.err != nil { + return 0, tr.err + } if tr.curr == nil { return 0, io.EOF } + n, err = tr.curr.Read(b) if err != nil && err != io.EOF { tr.err = err @@ -865,9 +971,33 @@ func (rfr *regFileReader) numBytes() int64 { return rfr.nb } -// readHole reads a sparse file hole ending at offset toOffset -func (sfr *sparseFileReader) readHole(b []byte, toOffset int64) int { - n64 := toOffset - sfr.pos +// newSparseFileReader creates a new sparseFileReader, but validates all of the +// sparse entries before doing so. +func newSparseFileReader(rfr numBytesReader, sp []sparseEntry, total int64) (*sparseFileReader, error) { + if total < 0 { + return nil, ErrHeader // Total size cannot be negative + } + + // Validate all sparse entries. These are the same checks as performed by + // the BSD tar utility. + for i, s := range sp { + switch { + case s.offset < 0 || s.numBytes < 0: + return nil, ErrHeader // Negative values are never okay + case s.offset > math.MaxInt64-s.numBytes: + return nil, ErrHeader // Integer overflow with large length + case s.offset+s.numBytes > total: + return nil, ErrHeader // Region extends beyond the "real" size + case i > 0 && sp[i-1].offset+sp[i-1].numBytes > s.offset: + return nil, ErrHeader // Regions can't overlap and must be in order + } + } + return &sparseFileReader{rfr: rfr, sp: sp, total: total}, nil +} + +// readHole reads a sparse hole ending at endOffset. +func (sfr *sparseFileReader) readHole(b []byte, endOffset int64) int { + n64 := endOffset - sfr.pos if n64 > int64(len(b)) { n64 = int64(len(b)) } @@ -881,46 +1011,54 @@ func (sfr *sparseFileReader) readHole(b []byte, toOffset int64) int { // Read reads the sparse file data in expanded form. func (sfr *sparseFileReader) Read(b []byte) (n int, err error) { - if len(sfr.sp) == 0 { - // No more data fragments to read from. - if sfr.pos < sfr.tot { - // We're in the last hole - n = sfr.readHole(b, sfr.tot) - return - } - // Otherwise, we're at the end of the file - return 0, io.EOF - } - if sfr.pos < sfr.sp[0].offset { - // We're in a hole - n = sfr.readHole(b, sfr.sp[0].offset) - return + // Skip past all empty fragments. + for len(sfr.sp) > 0 && sfr.sp[0].numBytes == 0 { + sfr.sp = sfr.sp[1:] } - // We're not in a hole, so we'll read from the next data fragment - posInFragment := sfr.pos - sfr.sp[0].offset - bytesLeft := sfr.sp[0].numBytes - posInFragment + // If there are no more fragments, then it is possible that there + // is one last sparse hole. + if len(sfr.sp) == 0 { + // This behavior matches the BSD tar utility. + // However, GNU tar stops returning data even if sfr.total is unmet. + if sfr.pos < sfr.total { + return sfr.readHole(b, sfr.total), nil + } + return 0, io.EOF + } + + // In front of a data fragment, so read a hole. + if sfr.pos < sfr.sp[0].offset { + return sfr.readHole(b, sfr.sp[0].offset), nil + } + + // In a data fragment, so read from it. + // This math is overflow free since we verify that offset and numBytes can + // be safely added when creating the sparseFileReader. + endPos := sfr.sp[0].offset + sfr.sp[0].numBytes // End offset of fragment + bytesLeft := endPos - sfr.pos // Bytes left in fragment if int64(len(b)) > bytesLeft { - b = b[0:bytesLeft] + b = b[:bytesLeft] } n, err = sfr.rfr.Read(b) sfr.pos += int64(n) - - if int64(n) == bytesLeft { - // We're done with this fragment - sfr.sp = sfr.sp[1:] + if err == io.EOF { + if sfr.pos < endPos { + err = io.ErrUnexpectedEOF // There was supposed to be more data + } else if sfr.pos < sfr.total { + err = nil // There is still an implicit sparse hole at the end + } } - if err == io.EOF && sfr.pos < sfr.tot { - // We reached the end of the last fragment's data, but there's a final hole - err = nil + if sfr.pos == endPos { + sfr.sp = sfr.sp[1:] // We are done with this fragment, so pop it } - return + return n, err } // numBytes returns the number of bytes left to read in the sparse file's // sparse-encoded data in the tar archive. func (sfr *sparseFileReader) numBytes() int64 { - return sfr.rfr.nb + return sfr.rfr.numBytes() } diff --git a/archive/tar/reader_test.go b/archive/tar/reader_test.go index 9601ffe..821b4f0 100644 --- a/archive/tar/reader_test.go +++ b/archive/tar/reader_test.go @@ -10,6 +10,7 @@ import ( "fmt" "io" "io/ioutil" + "math" "os" "reflect" "strings" @@ -18,9 +19,10 @@ import ( ) type untarTest struct { - file string - headers []*Header - cksums []string + file string // Test input file + headers []*Header // Expected output headers + chksums []string // MD5 checksum of files, leave as nil if not checked + err error // Expected error to occur } var gnuTarTest = &untarTest{ @@ -49,7 +51,7 @@ var gnuTarTest = &untarTest{ Gname: "eng", }, }, - cksums: []string{ + chksums: []string{ "e38b27eaccb4391bdec553a7f3ae6b2f", "c65bd2e50a56a2138bf1716f2fd56fe9", }, @@ -129,7 +131,7 @@ var sparseTarTest = &untarTest{ Devminor: 0, }, }, - cksums: []string{ + chksums: []string{ "6f53234398c2449fe67c1812d993012f", "6f53234398c2449fe67c1812d993012f", "6f53234398c2449fe67c1812d993012f", @@ -286,37 +288,118 @@ var untarTests = []*untarTest{ }, }, }, + { + // Matches the behavior of GNU, BSD, and STAR tar utilities. + file: "testdata/gnu-multi-hdrs.tar", + headers: []*Header{ + { + Name: "GNU2/GNU2/long-path-name", + Linkname: "GNU4/GNU4/long-linkpath-name", + ModTime: time.Unix(0, 0), + Typeflag: '2', + }, + }, + }, + { + // Matches the behavior of GNU and BSD tar utilities. + file: "testdata/pax-multi-hdrs.tar", + headers: []*Header{ + { + Name: "bar", + Linkname: "PAX4/PAX4/long-linkpath-name", + ModTime: time.Unix(0, 0), + Typeflag: '2', + }, + }, + }, + { + file: "testdata/neg-size.tar", + err: ErrHeader, + }, + { + file: "testdata/issue10968.tar", + err: ErrHeader, + }, + { + file: "testdata/issue11169.tar", + err: ErrHeader, + }, + { + file: "testdata/issue12435.tar", + err: ErrHeader, + }, } func TestReader(t *testing.T) { -testLoop: - for i, test := range untarTests { - f, err := os.Open(test.file) + for i, v := range untarTests { + f, err := os.Open(v.file) if err != nil { - t.Errorf("test %d: Unexpected error: %v", i, err) + t.Errorf("file %s, test %d: unexpected error: %v", v.file, i, err) continue } defer f.Close() - tr := NewReader(f) - for j, header := range test.headers { - hdr, err := tr.Next() - if err != nil || hdr == nil { - t.Errorf("test %d, entry %d: Didn't get entry: %v", i, j, err) - f.Close() - continue testLoop + + // Capture all headers and checksums. + var ( + tr = NewReader(f) + hdrs []*Header + chksums []string + ) + for { + var hdr *Header + hdr, err = tr.Next() + if err != nil { + if err == io.EOF { + err = nil // Expected error + } + break } - if !reflect.DeepEqual(*hdr, *header) { - t.Errorf("test %d, entry %d: Incorrect header:\nhave %+v\nwant %+v", - i, j, *hdr, *header) + hdrs = append(hdrs, hdr) + + if v.chksums == nil { + continue + } + h := md5.New() + _, err = io.Copy(h, tr) // Effectively an incremental read + if err != nil { + break + } + chksums = append(chksums, fmt.Sprintf("%x", h.Sum(nil))) + } + + for j, hdr := range hdrs { + if j >= len(v.headers) { + t.Errorf("file %s, test %d, entry %d: unexpected header:\ngot %+v", + v.file, i, j, *hdr) + continue + } + if !reflect.DeepEqual(*hdr, *v.headers[j]) { + t.Errorf("file %s, test %d, entry %d: incorrect header:\ngot %+v\nwant %+v", + v.file, i, j, *hdr, *v.headers[j]) } } - hdr, err := tr.Next() - if err == io.EOF { - continue testLoop + if len(hdrs) != len(v.headers) { + t.Errorf("file %s, test %d: got %d headers, want %d headers", + v.file, i, len(hdrs), len(v.headers)) } - if hdr != nil || err != nil { - t.Errorf("test %d: Unexpected entry or error: hdr=%v err=%v", i, hdr, err) + + for j, sum := range chksums { + if j >= len(v.chksums) { + t.Errorf("file %s, test %d, entry %d: unexpected sum: got %s", + v.file, i, j, sum) + continue + } + if sum != v.chksums[j] { + t.Errorf("file %s, test %d, entry %d: incorrect checksum: got %s, want %s", + v.file, i, j, sum, v.chksums[j]) + } } + + if err != v.err { + t.Errorf("file %s, test %d: unexpected error: got %v, want %v", + v.file, i, err, v.err) + } + f.Close() } } @@ -356,89 +439,6 @@ func TestPartialRead(t *testing.T) { } } -func TestIncrementalRead(t *testing.T) { - test := gnuTarTest - f, err := os.Open(test.file) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - defer f.Close() - - tr := NewReader(f) - - headers := test.headers - cksums := test.cksums - nread := 0 - - // loop over all files - for ; ; nread++ { - hdr, err := tr.Next() - if hdr == nil || err == io.EOF { - break - } - - // check the header - if !reflect.DeepEqual(*hdr, *headers[nread]) { - t.Errorf("Incorrect header:\nhave %+v\nwant %+v", - *hdr, headers[nread]) - } - - // read file contents in little chunks EOF, - // checksumming all the way - h := md5.New() - rdbuf := make([]uint8, 8) - for { - nr, err := tr.Read(rdbuf) - if err == io.EOF { - break - } - if err != nil { - t.Errorf("Read: unexpected error %v\n", err) - break - } - h.Write(rdbuf[0:nr]) - } - // verify checksum - have := fmt.Sprintf("%x", h.Sum(nil)) - want := cksums[nread] - if want != have { - t.Errorf("Bad checksum on file %s:\nhave %+v\nwant %+v", hdr.Name, have, want) - } - } - if nread != len(headers) { - t.Errorf("Didn't process all files\nexpected: %d\nprocessed %d\n", len(headers), nread) - } -} - -func TestNonSeekable(t *testing.T) { - test := gnuTarTest - f, err := os.Open(test.file) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - defer f.Close() - - type readerOnly struct { - io.Reader - } - tr := NewReader(readerOnly{f}) - nread := 0 - - for ; ; nread++ { - _, err := tr.Next() - if err == io.EOF { - break - } - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - } - - if nread != len(test.headers) { - t.Errorf("Didn't process all files\nexpected: %d\nprocessed %d\n", len(test.headers), nread) - } -} - func TestParsePAXHeader(t *testing.T) { paxTests := [][3]string{ {"a", "a=name", "10 a=name\n"}, // Test case involving multiple acceptable lengths @@ -462,9 +462,14 @@ func TestParsePAXHeader(t *testing.T) { t.Error("Buffer wasn't consumed") } } - badHeader := bytes.NewReader([]byte("3 somelongkey=")) - if _, err := parsePAX(badHeader); err != ErrHeader { - t.Fatal("Unexpected success when parsing bad header") + badHeaderTests := [][]byte{ + []byte("3 somelongkey=\n"), + []byte("50 tooshort=\n"), + } + for _, test := range badHeaderTests { + if _, err := parsePAX(bytes.NewReader(test)); err != ErrHeader { + t.Fatal("Unexpected success when parsing bad header") + } } } @@ -509,220 +514,312 @@ func TestMergePAX(t *testing.T) { } } -func TestSparseEndToEnd(t *testing.T) { - test := sparseTarTest - f, err := os.Open(test.file) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - defer f.Close() - - tr := NewReader(f) - - headers := test.headers - cksums := test.cksums - nread := 0 - - // loop over all files - for ; ; nread++ { - hdr, err := tr.Next() - if hdr == nil || err == io.EOF { - break - } - - // check the header - if !reflect.DeepEqual(*hdr, *headers[nread]) { - t.Errorf("Incorrect header:\nhave %+v\nwant %+v", - *hdr, headers[nread]) - } - - // read and checksum the file data - h := md5.New() - _, err = io.Copy(h, tr) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - - // verify checksum - have := fmt.Sprintf("%x", h.Sum(nil)) - want := cksums[nread] - if want != have { - t.Errorf("Bad checksum on file %s:\nhave %+v\nwant %+v", hdr.Name, have, want) - } - } - if nread != len(headers) { - t.Errorf("Didn't process all files\nexpected: %d\nprocessed %d\n", len(headers), nread) - } -} - -type sparseFileReadTest struct { - sparseData []byte - sparseMap []sparseEntry - realSize int64 - expected []byte -} - -var sparseFileReadTests = []sparseFileReadTest{ - { - sparseData: []byte("abcde"), - sparseMap: []sparseEntry{ - {offset: 0, numBytes: 2}, - {offset: 5, numBytes: 3}, - }, - realSize: 8, - expected: []byte("ab\x00\x00\x00cde"), - }, - { - sparseData: []byte("abcde"), - sparseMap: []sparseEntry{ - {offset: 0, numBytes: 2}, - {offset: 5, numBytes: 3}, - }, - realSize: 10, - expected: []byte("ab\x00\x00\x00cde\x00\x00"), - }, - { - sparseData: []byte("abcde"), - sparseMap: []sparseEntry{ - {offset: 1, numBytes: 3}, - {offset: 6, numBytes: 2}, - }, - realSize: 8, - expected: []byte("\x00abc\x00\x00de"), - }, - { - sparseData: []byte("abcde"), - sparseMap: []sparseEntry{ - {offset: 1, numBytes: 3}, - {offset: 6, numBytes: 2}, - }, - realSize: 10, - expected: []byte("\x00abc\x00\x00de\x00\x00"), - }, - { - sparseData: []byte(""), - sparseMap: nil, - realSize: 2, - expected: []byte("\x00\x00"), - }, -} - func TestSparseFileReader(t *testing.T) { - for i, test := range sparseFileReadTests { - r := bytes.NewReader(test.sparseData) - nb := int64(r.Len()) - sfr := &sparseFileReader{ - rfr: ®FileReader{r: r, nb: nb}, - sp: test.sparseMap, - pos: 0, - tot: test.realSize, - } - if sfr.numBytes() != nb { - t.Errorf("test %d: Before reading, sfr.numBytes() = %d, want %d", i, sfr.numBytes(), nb) - } - buf, err := ioutil.ReadAll(sfr) + var vectors = []struct { + realSize int64 // Real size of the output file + sparseMap []sparseEntry // Input sparse map + sparseData string // Input compact data + expected string // Expected output data + err error // Expected error outcome + }{{ + realSize: 8, + sparseMap: []sparseEntry{ + {offset: 0, numBytes: 2}, + {offset: 5, numBytes: 3}, + }, + sparseData: "abcde", + expected: "ab\x00\x00\x00cde", + }, { + realSize: 10, + sparseMap: []sparseEntry{ + {offset: 0, numBytes: 2}, + {offset: 5, numBytes: 3}, + }, + sparseData: "abcde", + expected: "ab\x00\x00\x00cde\x00\x00", + }, { + realSize: 8, + sparseMap: []sparseEntry{ + {offset: 1, numBytes: 3}, + {offset: 6, numBytes: 2}, + }, + sparseData: "abcde", + expected: "\x00abc\x00\x00de", + }, { + realSize: 8, + sparseMap: []sparseEntry{ + {offset: 1, numBytes: 3}, + {offset: 6, numBytes: 0}, + {offset: 6, numBytes: 0}, + {offset: 6, numBytes: 2}, + }, + sparseData: "abcde", + expected: "\x00abc\x00\x00de", + }, { + realSize: 10, + sparseMap: []sparseEntry{ + {offset: 1, numBytes: 3}, + {offset: 6, numBytes: 2}, + }, + sparseData: "abcde", + expected: "\x00abc\x00\x00de\x00\x00", + }, { + realSize: 10, + sparseMap: []sparseEntry{ + {offset: 1, numBytes: 3}, + {offset: 6, numBytes: 2}, + {offset: 8, numBytes: 0}, + {offset: 8, numBytes: 0}, + {offset: 8, numBytes: 0}, + {offset: 8, numBytes: 0}, + }, + sparseData: "abcde", + expected: "\x00abc\x00\x00de\x00\x00", + }, { + realSize: 2, + sparseMap: []sparseEntry{}, + sparseData: "", + expected: "\x00\x00", + }, { + realSize: -2, + sparseMap: []sparseEntry{}, + err: ErrHeader, + }, { + realSize: -10, + sparseMap: []sparseEntry{ + {offset: 1, numBytes: 3}, + {offset: 6, numBytes: 2}, + }, + sparseData: "abcde", + err: ErrHeader, + }, { + realSize: 10, + sparseMap: []sparseEntry{ + {offset: 1, numBytes: 3}, + {offset: 6, numBytes: 5}, + }, + sparseData: "abcde", + err: ErrHeader, + }, { + realSize: 35, + sparseMap: []sparseEntry{ + {offset: 1, numBytes: 3}, + {offset: 6, numBytes: 5}, + }, + sparseData: "abcde", + err: io.ErrUnexpectedEOF, + }, { + realSize: 35, + sparseMap: []sparseEntry{ + {offset: 1, numBytes: 3}, + {offset: 6, numBytes: -5}, + }, + sparseData: "abcde", + err: ErrHeader, + }, { + realSize: 35, + sparseMap: []sparseEntry{ + {offset: math.MaxInt64, numBytes: 3}, + {offset: 6, numBytes: -5}, + }, + sparseData: "abcde", + err: ErrHeader, + }, { + realSize: 10, + sparseMap: []sparseEntry{ + {offset: 1, numBytes: 3}, + {offset: 2, numBytes: 2}, + }, + sparseData: "abcde", + err: ErrHeader, + }} + + for i, v := range vectors { + r := bytes.NewReader([]byte(v.sparseData)) + rfr := ®FileReader{r: r, nb: int64(len(v.sparseData))} + + var sfr *sparseFileReader + var err error + var buf []byte + + sfr, err = newSparseFileReader(rfr, v.sparseMap, v.realSize) if err != nil { - t.Errorf("test %d: Unexpected error: %v", i, err) + goto fail } - if e := test.expected; !bytes.Equal(buf, e) { - t.Errorf("test %d: Contents = %v, want %v", i, buf, e) + if sfr.numBytes() != int64(len(v.sparseData)) { + t.Errorf("test %d, numBytes() before reading: got %d, want %d", i, sfr.numBytes(), len(v.sparseData)) + } + buf, err = ioutil.ReadAll(sfr) + if err != nil { + goto fail + } + if string(buf) != v.expected { + t.Errorf("test %d, ReadAll(): got %q, want %q", i, string(buf), v.expected) } if sfr.numBytes() != 0 { - t.Errorf("test %d: After draining the reader, numBytes() was nonzero", i) + t.Errorf("test %d, numBytes() after reading: got %d, want %d", i, sfr.numBytes(), 0) } - } -} -func TestSparseIncrementalRead(t *testing.T) { - sparseMap := []sparseEntry{{10, 2}} - sparseData := []byte("Go") - expected := "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00Go\x00\x00\x00\x00\x00\x00\x00\x00" - - r := bytes.NewReader(sparseData) - nb := int64(r.Len()) - sfr := &sparseFileReader{ - rfr: ®FileReader{r: r, nb: nb}, - sp: sparseMap, - pos: 0, - tot: int64(len(expected)), - } - - // We'll read the data 6 bytes at a time, with a hole of size 10 at - // the beginning and one of size 8 at the end. - var outputBuf bytes.Buffer - buf := make([]byte, 6) - for { - n, err := sfr.Read(buf) - if err == io.EOF { - break + fail: + if err != v.err { + t.Errorf("test %d, unexpected error: got %v, want %v", i, err, v.err) } - if err != nil { - t.Errorf("Read: unexpected error %v\n", err) - } - if n > 0 { - _, err := outputBuf.Write(buf[:n]) - if err != nil { - t.Errorf("Write: unexpected error %v\n", err) - } - } - } - got := outputBuf.String() - if got != expected { - t.Errorf("Contents = %v, want %v", got, expected) } } func TestReadGNUSparseMap0x1(t *testing.T) { - headers := map[string]string{ - paxGNUSparseNumBlocks: "4", - paxGNUSparseMap: "0,5,10,5,20,5,30,5", - } - expected := []sparseEntry{ - {offset: 0, numBytes: 5}, - {offset: 10, numBytes: 5}, - {offset: 20, numBytes: 5}, - {offset: 30, numBytes: 5}, - } + const ( + maxUint = ^uint(0) + maxInt = int(maxUint >> 1) + ) + var ( + big1 = fmt.Sprintf("%d", int64(maxInt)) + big2 = fmt.Sprintf("%d", (int64(maxInt)/2)+1) + big3 = fmt.Sprintf("%d", (int64(maxInt) / 3)) + ) - sp, err := readGNUSparseMap0x1(headers) - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if !reflect.DeepEqual(sp, expected) { - t.Errorf("Incorrect sparse map: got %v, wanted %v", sp, expected) + var vectors = []struct { + extHdrs map[string]string // Input data + sparseMap []sparseEntry // Expected sparse entries to be outputted + err error // Expected errors that may be raised + }{{ + extHdrs: map[string]string{paxGNUSparseNumBlocks: "-4"}, + err: ErrHeader, + }, { + extHdrs: map[string]string{paxGNUSparseNumBlocks: "fee "}, + err: ErrHeader, + }, { + extHdrs: map[string]string{ + paxGNUSparseNumBlocks: big1, + paxGNUSparseMap: "0,5,10,5,20,5,30,5", + }, + err: ErrHeader, + }, { + extHdrs: map[string]string{ + paxGNUSparseNumBlocks: big2, + paxGNUSparseMap: "0,5,10,5,20,5,30,5", + }, + err: ErrHeader, + }, { + extHdrs: map[string]string{ + paxGNUSparseNumBlocks: big3, + paxGNUSparseMap: "0,5,10,5,20,5,30,5", + }, + err: ErrHeader, + }, { + extHdrs: map[string]string{ + paxGNUSparseNumBlocks: "4", + paxGNUSparseMap: "0.5,5,10,5,20,5,30,5", + }, + err: ErrHeader, + }, { + extHdrs: map[string]string{ + paxGNUSparseNumBlocks: "4", + paxGNUSparseMap: "0,5.5,10,5,20,5,30,5", + }, + err: ErrHeader, + }, { + extHdrs: map[string]string{ + paxGNUSparseNumBlocks: "4", + paxGNUSparseMap: "0,fewafewa.5,fewafw,5,20,5,30,5", + }, + err: ErrHeader, + }, { + extHdrs: map[string]string{ + paxGNUSparseNumBlocks: "4", + paxGNUSparseMap: "0,5,10,5,20,5,30,5", + }, + sparseMap: []sparseEntry{{0, 5}, {10, 5}, {20, 5}, {30, 5}}, + }} + + for i, v := range vectors { + sp, err := readGNUSparseMap0x1(v.extHdrs) + if !reflect.DeepEqual(sp, v.sparseMap) && !(len(sp) == 0 && len(v.sparseMap) == 0) { + t.Errorf("test %d, readGNUSparseMap0x1(...): got %v, want %v", i, sp, v.sparseMap) + } + if err != v.err { + t.Errorf("test %d, unexpected error: got %v, want %v", i, err, v.err) + } } } func TestReadGNUSparseMap1x0(t *testing.T) { - // This test uses lots of holes so the sparse header takes up more than two blocks - numEntries := 100 - expected := make([]sparseEntry, 0, numEntries) - sparseMap := new(bytes.Buffer) - - fmt.Fprintf(sparseMap, "%d\n", numEntries) - for i := 0; i < numEntries; i++ { - offset := int64(2048 * i) - numBytes := int64(1024) - expected = append(expected, sparseEntry{offset: offset, numBytes: numBytes}) - fmt.Fprintf(sparseMap, "%d\n%d\n", offset, numBytes) + var sp = []sparseEntry{{1, 2}, {3, 4}} + for i := 0; i < 98; i++ { + sp = append(sp, sparseEntry{54321, 12345}) } - // Make the header the smallest multiple of blockSize that fits the sparseMap - headerBlocks := (sparseMap.Len() + blockSize - 1) / blockSize - bufLen := blockSize * headerBlocks - buf := make([]byte, bufLen) - copy(buf, sparseMap.Bytes()) + var vectors = []struct { + input string // Input data + sparseMap []sparseEntry // Expected sparse entries to be outputted + cnt int // Expected number of bytes read + err error // Expected errors that may be raised + }{{ + input: "", + cnt: 0, + err: io.ErrUnexpectedEOF, + }, { + input: "ab", + cnt: 2, + err: io.ErrUnexpectedEOF, + }, { + input: strings.Repeat("\x00", 512), + cnt: 512, + err: io.ErrUnexpectedEOF, + }, { + input: strings.Repeat("\x00", 511) + "\n", + cnt: 512, + err: ErrHeader, + }, { + input: strings.Repeat("\n", 512), + cnt: 512, + err: ErrHeader, + }, { + input: "0\n" + strings.Repeat("\x00", 510) + strings.Repeat("a", 512), + sparseMap: []sparseEntry{}, + cnt: 512, + }, { + input: strings.Repeat("0", 512) + "0\n" + strings.Repeat("\x00", 510), + sparseMap: []sparseEntry{}, + cnt: 1024, + }, { + input: strings.Repeat("0", 1024) + "1\n2\n3\n" + strings.Repeat("\x00", 506), + sparseMap: []sparseEntry{{2, 3}}, + cnt: 1536, + }, { + input: strings.Repeat("0", 1024) + "1\n2\n\n" + strings.Repeat("\x00", 509), + cnt: 1536, + err: ErrHeader, + }, { + input: strings.Repeat("0", 1024) + "1\n2\n" + strings.Repeat("\x00", 508), + cnt: 1536, + err: io.ErrUnexpectedEOF, + }, { + input: "-1\n2\n\n" + strings.Repeat("\x00", 506), + cnt: 512, + err: ErrHeader, + }, { + input: "1\nk\n2\n" + strings.Repeat("\x00", 506), + cnt: 512, + err: ErrHeader, + }, { + input: "100\n1\n2\n3\n4\n" + strings.Repeat("54321\n0000000000000012345\n", 98) + strings.Repeat("\x00", 512), + cnt: 2560, + sparseMap: sp, + }} - // Get an reader to read the sparse map - r := bytes.NewReader(buf) - - // Read the sparse map - sp, err := readGNUSparseMap1x0(r) - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if !reflect.DeepEqual(sp, expected) { - t.Errorf("Incorrect sparse map: got %v, wanted %v", sp, expected) + for i, v := range vectors { + r := strings.NewReader(v.input) + sp, err := readGNUSparseMap1x0(r) + if !reflect.DeepEqual(sp, v.sparseMap) && !(len(sp) == 0 && len(v.sparseMap) == 0) { + t.Errorf("test %d, readGNUSparseMap1x0(...): got %v, want %v", i, sp, v.sparseMap) + } + if numBytes := len(v.input) - r.Len(); numBytes != v.cnt { + t.Errorf("test %d, bytes read: got %v, want %v", i, numBytes, v.cnt) + } + if err != v.err { + t.Errorf("test %d, unexpected error: got %v, want %v", i, err, v.err) + } } } @@ -741,3 +838,287 @@ func TestUninitializedRead(t *testing.T) { } } + +type reader struct{ io.Reader } +type readSeeker struct{ io.ReadSeeker } +type readBadSeeker struct{ io.ReadSeeker } + +func (rbs *readBadSeeker) Seek(int64, int) (int64, error) { return 0, fmt.Errorf("illegal seek") } + +// TestReadTruncation test the ending condition on various truncated files and +// that truncated files are still detected even if the underlying io.Reader +// satisfies io.Seeker. +func TestReadTruncation(t *testing.T) { + var ss []string + for _, p := range []string{ + "testdata/gnu.tar", + "testdata/ustar-file-reg.tar", + "testdata/pax-path-hdr.tar", + "testdata/sparse-formats.tar", + } { + buf, err := ioutil.ReadFile(p) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + ss = append(ss, string(buf)) + } + + data1, data2, pax, sparse := ss[0], ss[1], ss[2], ss[3] + data2 += strings.Repeat("\x00", 10*512) + trash := strings.Repeat("garbage ", 64) // Exactly 512 bytes + + var vectors = []struct { + input string // Input stream + cnt int // Expected number of headers read + err error // Expected error outcome + }{ + {"", 0, io.EOF}, // Empty file is a "valid" tar file + {data1[:511], 0, io.ErrUnexpectedEOF}, + {data1[:512], 1, io.ErrUnexpectedEOF}, + {data1[:1024], 1, io.EOF}, + {data1[:1536], 2, io.ErrUnexpectedEOF}, + {data1[:2048], 2, io.EOF}, + {data1, 2, io.EOF}, + {data1[:2048] + data2[:1536], 3, io.EOF}, + {data2[:511], 0, io.ErrUnexpectedEOF}, + {data2[:512], 1, io.ErrUnexpectedEOF}, + {data2[:1195], 1, io.ErrUnexpectedEOF}, + {data2[:1196], 1, io.EOF}, // Exact end of data and start of padding + {data2[:1200], 1, io.EOF}, + {data2[:1535], 1, io.EOF}, + {data2[:1536], 1, io.EOF}, // Exact end of padding + {data2[:1536] + trash[:1], 1, io.ErrUnexpectedEOF}, + {data2[:1536] + trash[:511], 1, io.ErrUnexpectedEOF}, + {data2[:1536] + trash, 1, ErrHeader}, + {data2[:2048], 1, io.EOF}, // Exactly 1 empty block + {data2[:2048] + trash[:1], 1, io.ErrUnexpectedEOF}, + {data2[:2048] + trash[:511], 1, io.ErrUnexpectedEOF}, + {data2[:2048] + trash, 1, ErrHeader}, + {data2[:2560], 1, io.EOF}, // Exactly 2 empty blocks (normal end-of-stream) + {data2[:2560] + trash[:1], 1, io.EOF}, + {data2[:2560] + trash[:511], 1, io.EOF}, + {data2[:2560] + trash, 1, io.EOF}, + {data2[:3072], 1, io.EOF}, + {pax, 0, io.EOF}, // PAX header without data is a "valid" tar file + {pax + trash[:1], 0, io.ErrUnexpectedEOF}, + {pax + trash[:511], 0, io.ErrUnexpectedEOF}, + {sparse[:511], 0, io.ErrUnexpectedEOF}, + // TODO(dsnet): This should pass, but currently fails. + // {sparse[:512], 0, io.ErrUnexpectedEOF}, + {sparse[:3584], 1, io.EOF}, + {sparse[:9200], 1, io.EOF}, // Terminate in padding of sparse header + {sparse[:9216], 1, io.EOF}, + {sparse[:9728], 2, io.ErrUnexpectedEOF}, + {sparse[:10240], 2, io.EOF}, + {sparse[:11264], 2, io.ErrUnexpectedEOF}, + {sparse, 5, io.EOF}, + {sparse + trash, 5, io.EOF}, + } + + for i, v := range vectors { + for j := 0; j < 6; j++ { + var tr *Reader + var s1, s2 string + + switch j { + case 0: + tr = NewReader(&reader{strings.NewReader(v.input)}) + s1, s2 = "io.Reader", "auto" + case 1: + tr = NewReader(&reader{strings.NewReader(v.input)}) + s1, s2 = "io.Reader", "manual" + case 2: + tr = NewReader(&readSeeker{strings.NewReader(v.input)}) + s1, s2 = "io.ReadSeeker", "auto" + case 3: + tr = NewReader(&readSeeker{strings.NewReader(v.input)}) + s1, s2 = "io.ReadSeeker", "manual" + case 4: + tr = NewReader(&readBadSeeker{strings.NewReader(v.input)}) + s1, s2 = "ReadBadSeeker", "auto" + case 5: + tr = NewReader(&readBadSeeker{strings.NewReader(v.input)}) + s1, s2 = "ReadBadSeeker", "manual" + } + + var cnt int + var err error + for { + if _, err = tr.Next(); err != nil { + break + } + cnt++ + if s2 == "manual" { + if _, err = io.Copy(ioutil.Discard, tr); err != nil { + break + } + } + } + if err != v.err { + t.Errorf("test %d, NewReader(%s(...)) with %s discard: got %v, want %v", + i, s1, s2, err, v.err) + } + if cnt != v.cnt { + t.Errorf("test %d, NewReader(%s(...)) with %s discard: got %d headers, want %d headers", + i, s1, s2, cnt, v.cnt) + } + } + } +} + +// TestReadHeaderOnly tests that Reader does not attempt to read special +// header-only files. +func TestReadHeaderOnly(t *testing.T) { + f, err := os.Open("testdata/hdr-only.tar") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + defer f.Close() + + var hdrs []*Header + tr := NewReader(f) + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + t.Errorf("Next(): got %v, want %v", err, nil) + continue + } + hdrs = append(hdrs, hdr) + + // If a special flag, we should read nothing. + cnt, _ := io.ReadFull(tr, []byte{0}) + if cnt > 0 && hdr.Typeflag != TypeReg { + t.Errorf("ReadFull(...): got %d bytes, want 0 bytes", cnt) + } + } + + // File is crafted with 16 entries. The later 8 are identical to the first + // 8 except that the size is set. + if len(hdrs) != 16 { + t.Fatalf("len(hdrs): got %d, want %d", len(hdrs), 16) + } + for i := 0; i < 8; i++ { + var hdr1, hdr2 = hdrs[i+0], hdrs[i+8] + hdr1.Size, hdr2.Size = 0, 0 + if !reflect.DeepEqual(*hdr1, *hdr2) { + t.Errorf("incorrect header:\ngot %+v\nwant %+v", *hdr1, *hdr2) + } + } +} + +func TestParsePAXRecord(t *testing.T) { + var medName = strings.Repeat("CD", 50) + var longName = strings.Repeat("AB", 100) + + var vectors = []struct { + input string + residual string + outputKey string + outputVal string + ok bool + }{ + {"6 k=v\n\n", "\n", "k", "v", true}, + {"19 path=/etc/hosts\n", "", "path", "/etc/hosts", true}, + {"210 path=" + longName + "\nabc", "abc", "path", longName, true}, + {"110 path=" + medName + "\n", "", "path", medName, true}, + {"9 foo=ba\n", "", "foo", "ba", true}, + {"11 foo=bar\n\x00", "\x00", "foo", "bar", true}, + {"18 foo=b=\nar=\n==\x00\n", "", "foo", "b=\nar=\n==\x00", true}, + {"27 foo=hello9 foo=ba\nworld\n", "", "foo", "hello9 foo=ba\nworld", true}, + {"27 ☺☻☹=日a本b語ç\nmeow mix", "meow mix", "☺☻☹", "日a本b語ç", true}, + {"17 \x00hello=\x00world\n", "", "\x00hello", "\x00world", true}, + {"1 k=1\n", "1 k=1\n", "", "", false}, + {"6 k~1\n", "6 k~1\n", "", "", false}, + {"6_k=1\n", "6_k=1\n", "", "", false}, + {"6 k=1 ", "6 k=1 ", "", "", false}, + {"632 k=1\n", "632 k=1\n", "", "", false}, + {"16 longkeyname=hahaha\n", "16 longkeyname=hahaha\n", "", "", false}, + {"3 somelongkey=\n", "3 somelongkey=\n", "", "", false}, + {"50 tooshort=\n", "50 tooshort=\n", "", "", false}, + } + + for _, v := range vectors { + key, val, res, err := parsePAXRecord(v.input) + ok := (err == nil) + if v.ok != ok { + if v.ok { + t.Errorf("parsePAXRecord(%q): got parsing failure, want success", v.input) + } else { + t.Errorf("parsePAXRecord(%q): got parsing success, want failure", v.input) + } + } + if ok && (key != v.outputKey || val != v.outputVal) { + t.Errorf("parsePAXRecord(%q): got (%q: %q), want (%q: %q)", + v.input, key, val, v.outputKey, v.outputVal) + } + if res != v.residual { + t.Errorf("parsePAXRecord(%q): got residual %q, want residual %q", + v.input, res, v.residual) + } + } +} + +func TestParseNumeric(t *testing.T) { + var vectors = []struct { + input string + output int64 + ok bool + }{ + // Test base-256 (binary) encoded values. + {"", 0, true}, + {"\x80", 0, true}, + {"\x80\x00", 0, true}, + {"\x80\x00\x00", 0, true}, + {"\xbf", (1 << 6) - 1, true}, + {"\xbf\xff", (1 << 14) - 1, true}, + {"\xbf\xff\xff", (1 << 22) - 1, true}, + {"\xff", -1, true}, + {"\xff\xff", -1, true}, + {"\xff\xff\xff", -1, true}, + {"\xc0", -1 * (1 << 6), true}, + {"\xc0\x00", -1 * (1 << 14), true}, + {"\xc0\x00\x00", -1 * (1 << 22), true}, + {"\x87\x76\xa2\x22\xeb\x8a\x72\x61", 537795476381659745, true}, + {"\x80\x00\x00\x00\x07\x76\xa2\x22\xeb\x8a\x72\x61", 537795476381659745, true}, + {"\xf7\x76\xa2\x22\xeb\x8a\x72\x61", -615126028225187231, true}, + {"\xff\xff\xff\xff\xf7\x76\xa2\x22\xeb\x8a\x72\x61", -615126028225187231, true}, + {"\x80\x7f\xff\xff\xff\xff\xff\xff\xff", math.MaxInt64, true}, + {"\x80\x80\x00\x00\x00\x00\x00\x00\x00", 0, false}, + {"\xff\x80\x00\x00\x00\x00\x00\x00\x00", math.MinInt64, true}, + {"\xff\x7f\xff\xff\xff\xff\xff\xff\xff", 0, false}, + {"\xf5\xec\xd1\xc7\x7e\x5f\x26\x48\x81\x9f\x8f\x9b", 0, false}, + + // Test base-8 (octal) encoded values. + {"0000000\x00", 0, true}, + {" \x0000000\x00", 0, true}, + {" \x0000003\x00", 3, true}, + {"00000000227\x00", 0227, true}, + {"032033\x00 ", 032033, true}, + {"320330\x00 ", 0320330, true}, + {"0000660\x00 ", 0660, true}, + {"\x00 0000660\x00 ", 0660, true}, + {"0123456789abcdef", 0, false}, + {"0123456789\x00abcdef", 0, false}, + {"01234567\x0089abcdef", 342391, true}, + {"0123\x7e\x5f\x264123", 0, false}, + } + + for _, v := range vectors { + var p parser + num := p.parseNumeric([]byte(v.input)) + ok := (p.err == nil) + if v.ok != ok { + if v.ok { + t.Errorf("parseNumeric(%q): got parsing failure, want success", v.input) + } else { + t.Errorf("parseNumeric(%q): got parsing success, want failure", v.input) + } + } + if ok && num != v.output { + t.Errorf("parseNumeric(%q): got %d, want %d", v.input, num, v.output) + } + } +} diff --git a/archive/tar/tar_test.go b/archive/tar/tar_test.go index ed333f3..9ef319a 100644 --- a/archive/tar/tar_test.go +++ b/archive/tar/tar_test.go @@ -94,13 +94,12 @@ func TestRoundTrip(t *testing.T) { var b bytes.Buffer tw := NewWriter(&b) hdr := &Header{ - Name: "file.txt", - Uid: 1 << 21, // too big for 8 octal digits - Size: int64(len(data)), - ModTime: time.Now(), + Name: "file.txt", + Uid: 1 << 21, // too big for 8 octal digits + Size: int64(len(data)), + // https://github.com/golang/go/commit/0e3355903d2ebcf5ee9e76096f51ac9a116a9dbb#diff-d7bf2a98d7b57b6ff754ca406f1b7581R105 + ModTime: time.Now().AddDate(0, 0, 0).Round(1 * time.Second), } - // tar only supports second precision. - hdr.ModTime = hdr.ModTime.Add(-time.Duration(hdr.ModTime.Nanosecond()) * time.Nanosecond) if err := tw.WriteHeader(hdr); err != nil { t.Fatalf("tw.WriteHeader: %v", err) } @@ -147,17 +146,6 @@ func TestHeaderRoundTrip(t *testing.T) { }, fm: 0644, }, - // hard link. - { - h: &Header{ - Name: "hard.txt", - Mode: 0644 | c_ISLNK, - Size: 0, - ModTime: time.Unix(1360600916, 0), - Typeflag: TypeLink, - }, - fm: 0644 | os.ModeSymlink, - }, // symbolic link. { h: &Header{ @@ -246,6 +234,33 @@ func TestHeaderRoundTrip(t *testing.T) { }, fm: 0600 | os.ModeSticky, }, + // hard link. + { + h: &Header{ + Name: "hard.txt", + Mode: 0644 | c_ISREG, + Size: 0, + Linkname: "file.txt", + ModTime: time.Unix(1360600916, 0), + Typeflag: TypeLink, + }, + fm: 0644, + }, + // More information. + { + h: &Header{ + Name: "info.txt", + Mode: 0600 | c_ISREG, + Size: 0, + Uid: 1000, + Gid: 1000, + ModTime: time.Unix(1360602540, 0), + Uname: "slartibartfast", + Gname: "users", + Typeflag: TypeReg, + }, + fm: 0600, + }, } for i, g := range golden { @@ -268,12 +283,37 @@ func TestHeaderRoundTrip(t *testing.T) { if got, want := h2.Size, g.h.Size; got != want { t.Errorf("i=%d: Size: got %v, want %v", i, got, want) } + if got, want := h2.Uid, g.h.Uid; got != want { + t.Errorf("i=%d: Uid: got %d, want %d", i, got, want) + } + if got, want := h2.Gid, g.h.Gid; got != want { + t.Errorf("i=%d: Gid: got %d, want %d", i, got, want) + } + if got, want := h2.Uname, g.h.Uname; got != want { + t.Errorf("i=%d: Uname: got %q, want %q", i, got, want) + } + if got, want := h2.Gname, g.h.Gname; got != want { + t.Errorf("i=%d: Gname: got %q, want %q", i, got, want) + } + if got, want := h2.Linkname, g.h.Linkname; got != want { + t.Errorf("i=%d: Linkname: got %v, want %v", i, got, want) + } + if got, want := h2.Typeflag, g.h.Typeflag; got != want { + t.Logf("%#v %#v", g.h, fi.Sys()) + t.Errorf("i=%d: Typeflag: got %q, want %q", i, got, want) + } if got, want := h2.Mode, g.h.Mode; got != want { t.Errorf("i=%d: Mode: got %o, want %o", i, got, want) } if got, want := fi.Mode(), g.fm; got != want { t.Errorf("i=%d: fi.Mode: got %o, want %o", i, got, want) } + if got, want := h2.AccessTime, g.h.AccessTime; got != want { + t.Errorf("i=%d: AccessTime: got %v, want %v", i, got, want) + } + if got, want := h2.ChangeTime, g.h.ChangeTime; got != want { + t.Errorf("i=%d: ChangeTime: got %v, want %v", i, got, want) + } if got, want := h2.ModTime, g.h.ModTime; got != want { t.Errorf("i=%d: ModTime: got %v, want %v", i, got, want) } diff --git a/archive/tar/testdata/gnu-multi-hdrs.tar b/archive/tar/testdata/gnu-multi-hdrs.tar new file mode 100644 index 0000000..8bcad55 Binary files /dev/null and b/archive/tar/testdata/gnu-multi-hdrs.tar differ diff --git a/archive/tar/testdata/hardlink.tar b/archive/tar/testdata/hardlink.tar new file mode 100644 index 0000000..9cd1a26 Binary files /dev/null and b/archive/tar/testdata/hardlink.tar differ diff --git a/archive/tar/testdata/hdr-only.tar b/archive/tar/testdata/hdr-only.tar new file mode 100644 index 0000000..f250340 Binary files /dev/null and b/archive/tar/testdata/hdr-only.tar differ diff --git a/archive/tar/testdata/issue10968.tar b/archive/tar/testdata/issue10968.tar new file mode 100644 index 0000000..1cc837b Binary files /dev/null and b/archive/tar/testdata/issue10968.tar differ diff --git a/archive/tar/testdata/issue11169.tar b/archive/tar/testdata/issue11169.tar new file mode 100644 index 0000000..4d71fa1 Binary files /dev/null and b/archive/tar/testdata/issue11169.tar differ diff --git a/archive/tar/testdata/issue12435.tar b/archive/tar/testdata/issue12435.tar new file mode 100644 index 0000000..3542dd8 Binary files /dev/null and b/archive/tar/testdata/issue12435.tar differ diff --git a/archive/tar/testdata/neg-size.tar b/archive/tar/testdata/neg-size.tar new file mode 100644 index 0000000..21edf38 Binary files /dev/null and b/archive/tar/testdata/neg-size.tar differ diff --git a/archive/tar/testdata/pax-multi-hdrs.tar b/archive/tar/testdata/pax-multi-hdrs.tar new file mode 100644 index 0000000..14bc759 Binary files /dev/null and b/archive/tar/testdata/pax-multi-hdrs.tar differ diff --git a/archive/tar/testdata/pax-path-hdr.tar b/archive/tar/testdata/pax-path-hdr.tar new file mode 100644 index 0000000..ab8fc32 Binary files /dev/null and b/archive/tar/testdata/pax-path-hdr.tar differ diff --git a/archive/tar/testdata/ustar-file-reg.tar b/archive/tar/testdata/ustar-file-reg.tar new file mode 100644 index 0000000..c84fa27 Binary files /dev/null and b/archive/tar/testdata/ustar-file-reg.tar differ diff --git a/archive/tar/writer.go b/archive/tar/writer.go index dafb2ca..0426381 100644 --- a/archive/tar/writer.go +++ b/archive/tar/writer.go @@ -12,8 +12,8 @@ import ( "errors" "fmt" "io" - "os" "path" + "sort" "strconv" "strings" "time" @@ -23,7 +23,6 @@ var ( ErrWriteTooLong = errors.New("archive/tar: write too long") ErrFieldTooLong = errors.New("archive/tar: header field too long") ErrWriteAfterClose = errors.New("archive/tar: write after close") - errNameTooLong = errors.New("archive/tar: name too long") errInvalidHeader = errors.New("archive/tar: header field too long or contains invalid values") ) @@ -43,6 +42,10 @@ type Writer struct { paxHdrBuff [blockSize]byte // buffer to use in writeHeader when writing a pax header } +type formatter struct { + err error // Last error seen +} + // NewWriter creates a new Writer writing to w. func NewWriter(w io.Writer) *Writer { return &Writer{w: w} } @@ -69,17 +72,9 @@ func (tw *Writer) Flush() error { } // Write s into b, terminating it with a NUL if there is room. -// If the value is too long for the field and allowPax is true add a paxheader record instead -func (tw *Writer) cString(b []byte, s string, allowPax bool, paxKeyword string, paxHeaders map[string]string) { - needsPaxHeader := allowPax && len(s) > len(b) || !isASCII(s) - if needsPaxHeader { - paxHeaders[paxKeyword] = s - return - } +func (f *formatter) formatString(b []byte, s string) { if len(s) > len(b) { - if tw.err == nil { - tw.err = ErrFieldTooLong - } + f.err = ErrFieldTooLong return } ascii := toASCII(s) @@ -90,40 +85,40 @@ func (tw *Writer) cString(b []byte, s string, allowPax bool, paxKeyword string, } // Encode x as an octal ASCII string and write it into b with leading zeros. -func (tw *Writer) octal(b []byte, x int64) { +func (f *formatter) formatOctal(b []byte, x int64) { s := strconv.FormatInt(x, 8) // leading zeros, but leave room for a NUL. for len(s)+1 < len(b) { s = "0" + s } - tw.cString(b, s, false, paxNone, nil) + f.formatString(b, s) } -// Write x into b, either as octal or as binary (GNUtar/star extension). -// If the value is too long for the field and writingPax is enabled both for the field and the add a paxheader record instead -func (tw *Writer) numeric(b []byte, x int64, allowPax bool, paxKeyword string, paxHeaders map[string]string) { - // Try octal first. - s := strconv.FormatInt(x, 8) - if len(s) < len(b) { - tw.octal(b, x) +// fitsInBase256 reports whether x can be encoded into n bytes using base-256 +// encoding. Unlike octal encoding, base-256 encoding does not require that the +// string ends with a NUL character. Thus, all n bytes are available for output. +// +// If operating in binary mode, this assumes strict GNU binary mode; which means +// that the first byte can only be either 0x80 or 0xff. Thus, the first byte is +// equivalent to the sign bit in two's complement form. +func fitsInBase256(n int, x int64) bool { + var binBits = uint(n-1) * 8 + return n >= 9 || (x >= -1<= 0; i-- { + b[i] = byte(x) + x >>= 8 + } + b[0] |= 0x80 // Highest bit indicates binary format return } - // If it is too long for octal, and pax is preferred, use a pax header - if allowPax && tw.preferPax { - tw.octal(b, 0) - s := strconv.FormatInt(x, 10) - paxHeaders[paxKeyword] = s - return - } - - // Too big: use binary (big-endian). - tw.usedBinary = true - for i := len(b) - 1; x > 0 && i >= 0; i-- { - b[i] = byte(x) - x >>= 8 - } - b[0] |= 0x80 // highest bit indicates binary format + f.formatOctal(b, 0) // Last resort, just write zero + f.err = ErrFieldTooLong } var ( @@ -162,6 +157,7 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error { // subsecond time resolution, but for now let's just capture // too long fields or non ascii characters + var f formatter var header []byte // We need to select which scratch buffer to use carefully, @@ -176,10 +172,40 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error { copy(header, zeroBlock) s := slicer(header) + // Wrappers around formatter that automatically sets paxHeaders if the + // argument extends beyond the capacity of the input byte slice. + var formatString = func(b []byte, s string, paxKeyword string) { + needsPaxHeader := paxKeyword != paxNone && len(s) > len(b) || !isASCII(s) + if needsPaxHeader { + paxHeaders[paxKeyword] = s + return + } + f.formatString(b, s) + } + var formatNumeric = func(b []byte, x int64, paxKeyword string) { + // Try octal first. + s := strconv.FormatInt(x, 8) + if len(s) < len(b) { + f.formatOctal(b, x) + return + } + + // If it is too long for octal, and PAX is preferred, use a PAX header. + if paxKeyword != paxNone && tw.preferPax { + f.formatOctal(b, 0) + s := strconv.FormatInt(x, 10) + paxHeaders[paxKeyword] = s + return + } + + tw.usedBinary = true + f.formatNumeric(b, x) + } + // keep a reference to the filename to allow to overwrite it later if we detect that we can use ustar longnames instead of pax pathHeaderBytes := s.next(fileNameSize) - tw.cString(pathHeaderBytes, hdr.Name, true, paxPath, paxHeaders) + formatString(pathHeaderBytes, hdr.Name, paxPath) // Handle out of range ModTime carefully. var modTime int64 @@ -187,25 +213,25 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error { modTime = hdr.ModTime.Unix() } - tw.octal(s.next(8), hdr.Mode) // 100:108 - tw.numeric(s.next(8), int64(hdr.Uid), true, paxUid, paxHeaders) // 108:116 - tw.numeric(s.next(8), int64(hdr.Gid), true, paxGid, paxHeaders) // 116:124 - tw.numeric(s.next(12), hdr.Size, true, paxSize, paxHeaders) // 124:136 - tw.numeric(s.next(12), modTime, false, paxNone, nil) // 136:148 --- consider using pax for finer granularity - s.next(8) // chksum (148:156) - s.next(1)[0] = hdr.Typeflag // 156:157 + f.formatOctal(s.next(8), hdr.Mode) // 100:108 + formatNumeric(s.next(8), int64(hdr.Uid), paxUid) // 108:116 + formatNumeric(s.next(8), int64(hdr.Gid), paxGid) // 116:124 + formatNumeric(s.next(12), hdr.Size, paxSize) // 124:136 + formatNumeric(s.next(12), modTime, paxNone) // 136:148 --- consider using pax for finer granularity + s.next(8) // chksum (148:156) + s.next(1)[0] = hdr.Typeflag // 156:157 - tw.cString(s.next(100), hdr.Linkname, true, paxLinkpath, paxHeaders) + formatString(s.next(100), hdr.Linkname, paxLinkpath) - copy(s.next(8), []byte("ustar\x0000")) // 257:265 - tw.cString(s.next(32), hdr.Uname, true, paxUname, paxHeaders) // 265:297 - tw.cString(s.next(32), hdr.Gname, true, paxGname, paxHeaders) // 297:329 - tw.numeric(s.next(8), hdr.Devmajor, false, paxNone, nil) // 329:337 - tw.numeric(s.next(8), hdr.Devminor, false, paxNone, nil) // 337:345 + copy(s.next(8), []byte("ustar\x0000")) // 257:265 + formatString(s.next(32), hdr.Uname, paxUname) // 265:297 + formatString(s.next(32), hdr.Gname, paxGname) // 297:329 + formatNumeric(s.next(8), hdr.Devmajor, paxNone) // 329:337 + formatNumeric(s.next(8), hdr.Devminor, paxNone) // 337:345 // keep a reference to the prefix to allow to overwrite it later if we detect that we can use ustar longnames instead of pax prefixHeaderBytes := s.next(155) - tw.cString(prefixHeaderBytes, "", false, paxNone, nil) // 345:500 prefix + formatString(prefixHeaderBytes, "", paxNone) // 345:500 prefix // Use the GNU magic instead of POSIX magic if we used any GNU extensions. if tw.usedBinary { @@ -215,37 +241,26 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error { _, paxPathUsed := paxHeaders[paxPath] // try to use a ustar header when only the name is too long if !tw.preferPax && len(paxHeaders) == 1 && paxPathUsed { - suffix := hdr.Name - prefix := "" - if len(hdr.Name) > fileNameSize && isASCII(hdr.Name) { - var err error - prefix, suffix, err = tw.splitUSTARLongName(hdr.Name) - if err == nil { - // ok we can use a ustar long name instead of pax, now correct the fields + prefix, suffix, ok := splitUSTARPath(hdr.Name) + if ok { + // Since we can encode in USTAR format, disable PAX header. + delete(paxHeaders, paxPath) - // remove the path field from the pax header. this will suppress the pax header - delete(paxHeaders, paxPath) - - // update the path fields - tw.cString(pathHeaderBytes, suffix, false, paxNone, nil) - tw.cString(prefixHeaderBytes, prefix, false, paxNone, nil) - - // Use the ustar magic if we used ustar long names. - if len(prefix) > 0 && !tw.usedBinary { - copy(header[257:265], []byte("ustar\x00")) - } - } + // Update the path fields + formatString(pathHeaderBytes, suffix, paxNone) + formatString(prefixHeaderBytes, prefix, paxNone) } } // The chksum field is terminated by a NUL and a space. // This is different from the other octal fields. chksum, _ := checksum(header) - tw.octal(header[148:155], chksum) + f.formatOctal(header[148:155], chksum) // Never fails header[155] = ' ' - if tw.err != nil { - // problem with header; probably integer too big for a field. + // Check if there were any formatting errors. + if f.err != nil { + tw.err = f.err return tw.err } @@ -270,28 +285,25 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error { return tw.err } -// writeUSTARLongName splits a USTAR long name hdr.Name. -// name must be < 256 characters. errNameTooLong is returned -// if hdr.Name can't be split. The splitting heuristic -// is compatible with gnu tar. -func (tw *Writer) splitUSTARLongName(name string) (prefix, suffix string, err error) { +// splitUSTARPath splits a path according to USTAR prefix and suffix rules. +// If the path is not splittable, then it will return ("", "", false). +func splitUSTARPath(name string) (prefix, suffix string, ok bool) { length := len(name) - if length > fileNamePrefixSize+1 { + if length <= fileNameSize || !isASCII(name) { + return "", "", false + } else if length > fileNamePrefixSize+1 { length = fileNamePrefixSize + 1 } else if name[length-1] == '/' { length-- } + i := strings.LastIndex(name[:length], "/") - // nlen contains the resulting length in the name field. - // plen contains the resulting length in the prefix field. - nlen := len(name) - i - 1 - plen := i + nlen := len(name) - i - 1 // nlen is length of suffix + plen := i // plen is length of prefix if i <= 0 || nlen > fileNameSize || nlen == 0 || plen > fileNamePrefixSize { - err = errNameTooLong - return + return "", "", false } - prefix, suffix = name[:i], name[i+1:] - return + return name[:i], name[i+1:], true } // writePaxHeader writes an extended pax header to the @@ -304,11 +316,11 @@ func (tw *Writer) writePAXHeader(hdr *Header, paxHeaders map[string]string) erro // succeed, and seems harmless enough. ext.ModTime = hdr.ModTime // The spec asks that we namespace our pseudo files - // with the current pid. - pid := os.Getpid() + // with the current pid. However, this results in differing outputs + // for identical inputs. As such, the constant 0 is now used instead. + // golang.org/issue/12358 dir, file := path.Split(hdr.Name) - fullName := path.Join(dir, - fmt.Sprintf("PaxHeaders.%d", pid), file) + fullName := path.Join(dir, "PaxHeaders.0", file) ascii := toASCII(fullName) if len(ascii) > 100 { @@ -318,8 +330,15 @@ func (tw *Writer) writePAXHeader(hdr *Header, paxHeaders map[string]string) erro // Construct the body var buf bytes.Buffer - for k, v := range paxHeaders { - fmt.Fprint(&buf, paxHeader(k+"="+v)) + // Keys are sorted before writing to body to allow deterministic output. + var keys []string + for k := range paxHeaders { + keys = append(keys, k) + } + sort.Strings(keys) + + for _, k := range keys { + fmt.Fprint(&buf, formatPAXRecord(k, paxHeaders[k])) } ext.Size = int64(len(buf.Bytes())) @@ -335,17 +354,18 @@ func (tw *Writer) writePAXHeader(hdr *Header, paxHeaders map[string]string) erro return nil } -// paxHeader formats a single pax record, prefixing it with the appropriate length -func paxHeader(msg string) string { - const padding = 2 // Extra padding for space and newline - size := len(msg) + padding +// formatPAXRecord formats a single PAX record, prefixing it with the +// appropriate length. +func formatPAXRecord(k, v string) string { + const padding = 3 // Extra padding for ' ', '=', and '\n' + size := len(k) + len(v) + padding size += len(strconv.Itoa(size)) - record := fmt.Sprintf("%d %s\n", size, msg) + record := fmt.Sprintf("%d %s=%s\n", size, k, v) + + // Final adjustment if adding size field increased the record size. if len(record) != size { - // Final adjustment if adding size increased - // the number of digits in size size = len(record) - record = fmt.Sprintf("%d %s\n", size, msg) + record = fmt.Sprintf("%d %s=%s\n", size, k, v) } return record } @@ -355,7 +375,7 @@ func paxHeader(msg string) string { // hdr.Size bytes are written after WriteHeader. func (tw *Writer) Write(b []byte) (n int, err error) { if tw.closed { - err = ErrWriteTooLong + err = ErrWriteAfterClose return } overwrite := false diff --git a/archive/tar/writer_test.go b/archive/tar/writer_test.go index 5e42e32..6e91d90 100644 --- a/archive/tar/writer_test.go +++ b/archive/tar/writer_test.go @@ -9,8 +9,10 @@ import ( "fmt" "io" "io/ioutil" + "math" "os" "reflect" + "sort" "strings" "testing" "testing/iotest" @@ -147,6 +149,44 @@ var writerTests = []*writerTest{ }, }, }, + // This file was produced using gnu tar 1.26 + // echo "Slartibartfast" > file.txt + // ln file.txt hard.txt + // tar -b 1 --format=ustar -c -f hardlink.tar file.txt hard.txt + { + file: "testdata/hardlink.tar", + entries: []*writerTestEntry{ + { + header: &Header{ + Name: "file.txt", + Mode: 0644, + Uid: 1000, + Gid: 100, + Size: 15, + ModTime: time.Unix(1425484303, 0), + Typeflag: '0', + Uname: "vbatts", + Gname: "users", + }, + contents: "Slartibartfast\n", + }, + { + header: &Header{ + Name: "hard.txt", + Mode: 0644, + Uid: 1000, + Gid: 100, + Size: 0, + ModTime: time.Unix(1425484303, 0), + Typeflag: '1', + Linkname: "file.txt", + Uname: "vbatts", + Gname: "users", + }, + // no contents + }, + }, + }, } // Render byte array in a two-character hexadecimal string, spaced for easy visual inspection. @@ -253,7 +293,7 @@ func TestPax(t *testing.T) { t.Fatal(err) } // Simple test to make sure PAX extensions are in effect - if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.")) { + if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.0")) { t.Fatal("Expected at least one PAX header to be written.") } // Test that we can get a long name back out of the archive. @@ -292,7 +332,7 @@ func TestPaxSymlink(t *testing.T) { t.Fatal(err) } // Simple test to make sure PAX extensions are in effect - if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.")) { + if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.0")) { t.Fatal("Expected at least one PAX header to be written.") } // Test that we can get a long name back out of the archive. @@ -342,7 +382,7 @@ func TestPaxNonAscii(t *testing.T) { t.Fatal(err) } // Simple test to make sure PAX extensions are in effect - if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.")) { + if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.0")) { t.Fatal("Expected at least one PAX header to be written.") } // Test that we can get a long name back out of the archive. @@ -401,21 +441,49 @@ func TestPaxXattrs(t *testing.T) { } } -func TestPAXHeader(t *testing.T) { - medName := strings.Repeat("CD", 50) - longName := strings.Repeat("AB", 100) - paxTests := [][2]string{ - {paxPath + "=/etc/hosts", "19 path=/etc/hosts\n"}, - {"a=b", "6 a=b\n"}, // Single digit length - {"a=names", "11 a=names\n"}, // Test case involving carries - {paxPath + "=" + longName, fmt.Sprintf("210 path=%s\n", longName)}, - {paxPath + "=" + medName, fmt.Sprintf("110 path=%s\n", medName)}} +func TestPaxHeadersSorted(t *testing.T) { + fileinfo, err := os.Stat("testdata/small.txt") + if err != nil { + t.Fatal(err) + } + hdr, err := FileInfoHeader(fileinfo, "") + if err != nil { + t.Fatalf("os.Stat: %v", err) + } + contents := strings.Repeat(" ", int(hdr.Size)) - for _, test := range paxTests { - key, expected := test[0], test[1] - if result := paxHeader(key); result != expected { - t.Fatalf("paxHeader: got %s, expected %s", result, expected) - } + hdr.Xattrs = map[string]string{ + "foo": "foo", + "bar": "bar", + "baz": "baz", + "qux": "qux", + } + + var buf bytes.Buffer + writer := NewWriter(&buf) + if err := writer.WriteHeader(hdr); err != nil { + t.Fatal(err) + } + if _, err = writer.Write([]byte(contents)); err != nil { + t.Fatal(err) + } + if err := writer.Close(); err != nil { + t.Fatal(err) + } + // Simple test to make sure PAX extensions are in effect + if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.0")) { + t.Fatal("Expected at least one PAX header to be written.") + } + + // xattr bar should always appear before others + indices := []int{ + bytes.Index(buf.Bytes(), []byte("bar=bar")), + bytes.Index(buf.Bytes(), []byte("baz=baz")), + bytes.Index(buf.Bytes(), []byte("foo=foo")), + bytes.Index(buf.Bytes(), []byte("qux=qux")), + } + if !sort.IntsAreSorted(indices) { + t.Fatal("PAX headers are not sorted") } } @@ -489,3 +557,166 @@ func TestValidTypeflagWithPAXHeader(t *testing.T) { } } } + +func TestWriteAfterClose(t *testing.T) { + var buffer bytes.Buffer + tw := NewWriter(&buffer) + + hdr := &Header{ + Name: "small.txt", + Size: 5, + } + if err := tw.WriteHeader(hdr); err != nil { + t.Fatalf("Failed to write header: %s", err) + } + tw.Close() + if _, err := tw.Write([]byte("Kilts")); err != ErrWriteAfterClose { + t.Fatalf("Write: got %v; want ErrWriteAfterClose", err) + } +} + +func TestSplitUSTARPath(t *testing.T) { + var sr = strings.Repeat + + var vectors = []struct { + input string // Input path + prefix string // Expected output prefix + suffix string // Expected output suffix + ok bool // Split success? + }{ + {"", "", "", false}, + {"abc", "", "", false}, + {"用戶名", "", "", false}, + {sr("a", fileNameSize), "", "", false}, + {sr("a", fileNameSize) + "/", "", "", false}, + {sr("a", fileNameSize) + "/a", sr("a", fileNameSize), "a", true}, + {sr("a", fileNamePrefixSize) + "/", "", "", false}, + {sr("a", fileNamePrefixSize) + "/a", sr("a", fileNamePrefixSize), "a", true}, + {sr("a", fileNameSize+1), "", "", false}, + {sr("/", fileNameSize+1), sr("/", fileNameSize-1), "/", true}, + {sr("a", fileNamePrefixSize) + "/" + sr("b", fileNameSize), + sr("a", fileNamePrefixSize), sr("b", fileNameSize), true}, + {sr("a", fileNamePrefixSize) + "//" + sr("b", fileNameSize), "", "", false}, + {sr("a/", fileNameSize), sr("a/", 77) + "a", sr("a/", 22), true}, + } + + for _, v := range vectors { + prefix, suffix, ok := splitUSTARPath(v.input) + if prefix != v.prefix || suffix != v.suffix || ok != v.ok { + t.Errorf("splitUSTARPath(%q):\ngot (%q, %q, %v)\nwant (%q, %q, %v)", + v.input, prefix, suffix, ok, v.prefix, v.suffix, v.ok) + } + } +} + +func TestFormatPAXRecord(t *testing.T) { + var medName = strings.Repeat("CD", 50) + var longName = strings.Repeat("AB", 100) + + var vectors = []struct { + inputKey string + inputVal string + output string + }{ + {"k", "v", "6 k=v\n"}, + {"path", "/etc/hosts", "19 path=/etc/hosts\n"}, + {"path", longName, "210 path=" + longName + "\n"}, + {"path", medName, "110 path=" + medName + "\n"}, + {"foo", "ba", "9 foo=ba\n"}, + {"foo", "bar", "11 foo=bar\n"}, + {"foo", "b=\nar=\n==\x00", "18 foo=b=\nar=\n==\x00\n"}, + {"foo", "hello9 foo=ba\nworld", "27 foo=hello9 foo=ba\nworld\n"}, + {"☺☻☹", "日a本b語ç", "27 ☺☻☹=日a本b語ç\n"}, + {"\x00hello", "\x00world", "17 \x00hello=\x00world\n"}, + } + + for _, v := range vectors { + output := formatPAXRecord(v.inputKey, v.inputVal) + if output != v.output { + t.Errorf("formatPAXRecord(%q, %q): got %q, want %q", + v.inputKey, v.inputVal, output, v.output) + } + } +} + +func TestFitsInBase256(t *testing.T) { + var vectors = []struct { + input int64 + width int + ok bool + }{ + {+1, 8, true}, + {0, 8, true}, + {-1, 8, true}, + {1 << 56, 8, false}, + {(1 << 56) - 1, 8, true}, + {-1 << 56, 8, true}, + {(-1 << 56) - 1, 8, false}, + {121654, 8, true}, + {-9849849, 8, true}, + {math.MaxInt64, 9, true}, + {0, 9, true}, + {math.MinInt64, 9, true}, + {math.MaxInt64, 12, true}, + {0, 12, true}, + {math.MinInt64, 12, true}, + } + + for _, v := range vectors { + ok := fitsInBase256(v.width, v.input) + if ok != v.ok { + t.Errorf("checkNumeric(%d, %d): got %v, want %v", v.input, v.width, ok, v.ok) + } + } +} + +func TestFormatNumeric(t *testing.T) { + var vectors = []struct { + input int64 + output string + ok bool + }{ + // Test base-256 (binary) encoded values. + {-1, "\xff", true}, + {-1, "\xff\xff", true}, + {-1, "\xff\xff\xff", true}, + {(1 << 0), "0", false}, + {(1 << 8) - 1, "\x80\xff", true}, + {(1 << 8), "0\x00", false}, + {(1 << 16) - 1, "\x80\xff\xff", true}, + {(1 << 16), "00\x00", false}, + {-1 * (1 << 0), "\xff", true}, + {-1*(1<<0) - 1, "0", false}, + {-1 * (1 << 8), "\xff\x00", true}, + {-1*(1<<8) - 1, "0\x00", false}, + {-1 * (1 << 16), "\xff\x00\x00", true}, + {-1*(1<<16) - 1, "00\x00", false}, + {537795476381659745, "0000000\x00", false}, + {537795476381659745, "\x80\x00\x00\x00\x07\x76\xa2\x22\xeb\x8a\x72\x61", true}, + {-615126028225187231, "0000000\x00", false}, + {-615126028225187231, "\xff\xff\xff\xff\xf7\x76\xa2\x22\xeb\x8a\x72\x61", true}, + {math.MaxInt64, "0000000\x00", false}, + {math.MaxInt64, "\x80\x00\x00\x00\x7f\xff\xff\xff\xff\xff\xff\xff", true}, + {math.MinInt64, "0000000\x00", false}, + {math.MinInt64, "\xff\xff\xff\xff\x80\x00\x00\x00\x00\x00\x00\x00", true}, + {math.MaxInt64, "\x80\x7f\xff\xff\xff\xff\xff\xff\xff", true}, + {math.MinInt64, "\xff\x80\x00\x00\x00\x00\x00\x00\x00", true}, + } + + for _, v := range vectors { + var f formatter + output := make([]byte, len(v.output)) + f.formatNumeric(output, v.input) + ok := (f.err == nil) + if ok != v.ok { + if v.ok { + t.Errorf("formatNumeric(%d): got formatting failure, want success", v.input) + } else { + t.Errorf("formatNumeric(%d): got formatting success, want failure", v.input) + } + } + if string(output) != v.output { + t.Errorf("formatNumeric(%d): got %q, want %q", v.input, output, v.output) + } + } +} diff --git a/cmd/tar-split/README.md b/cmd/tar-split/README.md new file mode 100644 index 0000000..02a2218 --- /dev/null +++ b/cmd/tar-split/README.md @@ -0,0 +1,39 @@ +# tar-split utility + +## Installation + + go get -u github.com/vbatts/tar-split/cmd/tar-split + +## Usage + +### Disassembly + +```bash +$ sha256sum archive.tar +d734a748db93ec873392470510b8a1c88929abd8fae2540dc43d5b26f7537868 archive.tar +$ mkdir ./x +$ tar-split disasm --output tar-data.json.gz ./archive.tar | tar -C ./x -x +time="2015-07-20T15:45:04-04:00" level=info msg="created tar-data.json.gz from ./archive.tar (read 204800 bytes)" +``` + +### Assembly + +```bash +$ tar-split asm --output new.tar --input ./tar-data.json.gz --path ./x/ +INFO[0000] created new.tar from ./x/ and ./tar-data.json.gz (wrote 204800 bytes) +$ sha256sum new.tar +d734a748db93ec873392470510b8a1c88929abd8fae2540dc43d5b26f7537868 new.tar +``` + +### Estimating metadata size + +```bash +$ tar-split checksize ./archive.tar +inspecting "./archive.tar" (size 200k) + -- number of files: 28 + -- size of metadata uncompressed: 28k + -- size of gzip compressed metadata: 1k +``` + + + diff --git a/cmd/tar-split/asm.go b/cmd/tar-split/asm.go new file mode 100644 index 0000000..e188ce1 --- /dev/null +++ b/cmd/tar-split/asm.go @@ -0,0 +1,64 @@ +package main + +import ( + "compress/gzip" + "io" + "os" + + "github.com/Sirupsen/logrus" + "github.com/urfave/cli" + "github.com/vbatts/tar-split/tar/asm" + "github.com/vbatts/tar-split/tar/storage" +) + +func CommandAsm(c *cli.Context) { + if len(c.Args()) > 0 { + logrus.Warnf("%d additional arguments passed are ignored", len(c.Args())) + } + if len(c.String("input")) == 0 { + logrus.Fatalf("--input filename must be set") + } + if len(c.String("output")) == 0 { + logrus.Fatalf("--output filename must be set ([FILENAME|-])") + } + if len(c.String("path")) == 0 { + logrus.Fatalf("--path must be set") + } + + var outputStream io.Writer + if c.String("output") == "-" { + outputStream = os.Stdout + } else { + fh, err := os.Create(c.String("output")) + if err != nil { + logrus.Fatal(err) + } + defer fh.Close() + outputStream = fh + } + + // Get the tar metadata reader + mf, err := os.Open(c.String("input")) + if err != nil { + logrus.Fatal(err) + } + defer mf.Close() + mfz, err := gzip.NewReader(mf) + if err != nil { + logrus.Fatal(err) + } + defer mfz.Close() + + metaUnpacker := storage.NewJSONUnpacker(mfz) + // XXX maybe get the absolute path here + fileGetter := storage.NewPathFileGetter(c.String("path")) + + ots := asm.NewOutputTarStream(fileGetter, metaUnpacker) + defer ots.Close() + i, err := io.Copy(outputStream, ots) + if err != nil { + logrus.Fatal(err) + } + + logrus.Infof("created %s from %s and %s (wrote %d bytes)", c.String("output"), c.String("path"), c.String("input"), i) +} diff --git a/checksize.go b/cmd/tar-split/checksize.go similarity index 83% rename from checksize.go rename to cmd/tar-split/checksize.go index a6d3c08..1e5eed7 100644 --- a/checksize.go +++ b/cmd/tar-split/checksize.go @@ -1,29 +1,25 @@ -// +build ignore - package main import ( "archive/tar" "compress/gzip" - "flag" "fmt" "io" "io/ioutil" "log" "os" + "github.com/Sirupsen/logrus" + "github.com/urfave/cli" "github.com/vbatts/tar-split/tar/asm" "github.com/vbatts/tar-split/tar/storage" ) -var ( - flCleanup = flag.Bool("cleanup", true, "cleanup tempfiles") -) - -func main() { - flag.Parse() - - for _, arg := range flag.Args() { +func CommandChecksize(c *cli.Context) { + if len(c.Args()) == 0 { + logrus.Fatalf("please specify tar archives to check ('-' will check stdin)") + } + for _, arg := range c.Args() { fh, err := os.Open(arg) if err != nil { log.Fatal(err) @@ -40,8 +36,10 @@ func main() { log.Fatal(err) } defer packFh.Close() - if *flCleanup { + if !c.Bool("work") { defer os.Remove(packFh.Name()) + } else { + fmt.Printf(" -- working file preserved: %s\n", packFh.Name()) } sp := storage.NewJSONPacker(packFh) @@ -83,7 +81,7 @@ func main() { log.Fatal(err) } defer gzPackFh.Close() - if *flCleanup { + if !c.Bool("work") { defer os.Remove(gzPackFh.Name()) } diff --git a/cmd/tar-split/disasm.go b/cmd/tar-split/disasm.go new file mode 100644 index 0000000..5472894 --- /dev/null +++ b/cmd/tar-split/disasm.go @@ -0,0 +1,63 @@ +package main + +import ( + "compress/gzip" + "io" + "io/ioutil" + "os" + + "github.com/Sirupsen/logrus" + "github.com/urfave/cli" + "github.com/vbatts/tar-split/tar/asm" + "github.com/vbatts/tar-split/tar/storage" +) + +func CommandDisasm(c *cli.Context) { + if len(c.Args()) != 1 { + logrus.Fatalf("please specify tar to be disabled ") + } + if len(c.String("output")) == 0 { + logrus.Fatalf("--output filename must be set") + } + + // Set up the tar input stream + var inputStream io.Reader + if c.Args()[0] == "-" { + inputStream = os.Stdin + } else { + fh, err := os.Open(c.Args()[0]) + if err != nil { + logrus.Fatal(err) + } + defer fh.Close() + inputStream = fh + } + + // Set up the metadata storage + mf, err := os.OpenFile(c.String("output"), os.O_CREATE|os.O_WRONLY|os.O_TRUNC, os.FileMode(0600)) + if err != nil { + logrus.Fatal(err) + } + defer mf.Close() + mfz := gzip.NewWriter(mf) + defer mfz.Close() + metaPacker := storage.NewJSONPacker(mfz) + + // we're passing nil here for the file putter, because the ApplyDiff will + // handle the extraction of the archive + its, err := asm.NewInputTarStream(inputStream, metaPacker, nil) + if err != nil { + logrus.Fatal(err) + } + var out io.Writer + if c.Bool("no-stdout") { + out = ioutil.Discard + } else { + out = os.Stdout + } + i, err := io.Copy(out, its) + if err != nil { + logrus.Fatal(err) + } + logrus.Infof("created %s from %s (read %d bytes)", c.String("output"), c.Args()[0], i) +} diff --git a/cmd/tar-split/main.go b/cmd/tar-split/main.go new file mode 100644 index 0000000..8b4035f --- /dev/null +++ b/cmd/tar-split/main.go @@ -0,0 +1,91 @@ +package main + +import ( + "os" + + "github.com/Sirupsen/logrus" + "github.com/urfave/cli" + "github.com/vbatts/tar-split/version" +) + +func main() { + app := cli.NewApp() + app.Name = "tar-split" + app.Usage = "tar assembly and disassembly utility" + app.Version = version.VERSION + app.Author = "Vincent Batts" + app.Email = "vbatts@hashbangbash.com" + app.Action = cli.ShowAppHelp + app.Before = func(c *cli.Context) error { + logrus.SetOutput(os.Stderr) + if c.Bool("debug") { + logrus.SetLevel(logrus.DebugLevel) + } + return nil + } + app.Flags = []cli.Flag{ + cli.BoolFlag{ + Name: "debug, D", + Usage: "debug output", + // defaults to false + }, + } + app.Commands = []cli.Command{ + { + Name: "disasm", + Aliases: []string{"d"}, + Usage: "disassemble the input tar stream", + Action: CommandDisasm, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "output", + Value: "tar-data.json.gz", + Usage: "output of disassembled tar stream", + }, + cli.BoolFlag{ + Name: "no-stdout", + Usage: "do not throughput the stream to STDOUT", + }, + }, + }, + { + Name: "asm", + Aliases: []string{"a"}, + Usage: "assemble tar stream", + Action: CommandAsm, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "input", + Value: "tar-data.json.gz", + Usage: "input of disassembled tar stream", + }, + cli.StringFlag{ + Name: "output", + Value: "-", + Usage: "reassembled tar archive", + }, + cli.StringFlag{ + Name: "path", + Value: "", + Usage: "relative path of extracted tar", + }, + }, + }, + { + Name: "checksize", + Usage: "displays size estimates for metadata storage of a Tar archive", + Action: CommandChecksize, + Flags: []cli.Flag{ + cli.BoolFlag{ + Name: "work", + Usage: "do not delete the working directory", + // defaults to false + }, + }, + }, + } + + if err := app.Run(os.Args); err != nil { + logrus.Fatal(err) + } +} diff --git a/concept/DESIGN.md b/concept/DESIGN.md new file mode 100644 index 0000000..4bfa82c --- /dev/null +++ b/concept/DESIGN.md @@ -0,0 +1,94 @@ +# Flow of TAR stream + +## `./archive/tar` + +The import path `github.com/vbatts/tar-split/archive/tar` is fork of upstream golang stdlib [`archive/tar`](http://golang.org/pkg/archive/tar/). +It adds plumbing to access raw bytes of the tar stream as the headers and payload are read. + +## Packer interface + +For ease of storage and usage of the raw bytes, there will be a storage +interface, that accepts an io.Writer (This way you could pass it an in memory +buffer or a file handle). + +Having a Packer interface can allow configuration of hash.Hash for file payloads +and providing your own io.Writer. + +Instead of having a state directory to store all the header information for all +Readers, we will leave that up to user of Reader. Because we can not assume an +ID for each Reader, and keeping that information differentiated. + +## State Directory + +Perhaps we could deduplicate the header info, by hashing the rawbytes and +storing them in a directory tree like: + + ./ac/dc/beef + +Then reference the hash of the header info, in the positional records for the +tar stream. Though this could be a future feature, and not required for an +initial implementation. Also, this would imply an owned state directory, rather +than just writing storage info to an io.Writer. + +## Concept Example + +First we'll get an archive to work with. For repeatability, we'll make an +archive from what you've just cloned: + +``` +git archive --format=tar -o tar-split.tar HEAD . +``` + +Then build the example main.go: + +``` +go build ./main.go +``` + +Now run the example over the archive: + +``` +$ ./main tar-split.tar +2015/02/20 15:00:58 writing "tar-split.tar" to "tar-split.tar.out" +pax_global_header pre: 512 read: 52 +.travis.yml pre: 972 read: 374 +DESIGN.md pre: 650 read: 1131 +LICENSE pre: 917 read: 1075 +README.md pre: 973 read: 4289 +archive/ pre: 831 read: 0 +archive/tar/ pre: 512 read: 0 +archive/tar/common.go pre: 512 read: 7790 +[...] +tar/storage/entry_test.go pre: 667 read: 1137 +tar/storage/getter.go pre: 911 read: 2741 +tar/storage/getter_test.go pre: 843 read: 1491 +tar/storage/packer.go pre: 557 read: 3141 +tar/storage/packer_test.go pre: 955 read: 3096 +EOF padding: 1512 +Remainder: 512 +Size: 215040; Sum: 215040 +``` + +*What are we seeing here?* + +* `pre` is the header of a file entry, and potentially the padding from the + end of the prior file's payload. Also with particular tar extensions and pax + attributes, the header can exceed 512 bytes. +* `read` is the size of the file payload from the entry +* `EOF padding` is the expected 1024 null bytes on the end of a tar archive, + plus potential padding from the end of the prior file entry's payload +* `Remainder` is the remaining bytes of an archive. This is typically deadspace + as most tar implmentations will return after having reached the end of the + 1024 null bytes. Though various implementations will include some amount of + bytes here, which will affect the checksum of the resulting tar archive, + therefore this must be accounted for as well. + +Ideally the input tar and output `*.out`, will match: + +``` +$ sha1sum tar-split.tar* +ca9e19966b892d9ad5960414abac01ef585a1e22 tar-split.tar +ca9e19966b892d9ad5960414abac01ef585a1e22 tar-split.tar.out +``` + + diff --git a/main.go b/concept/main.go similarity index 100% rename from main.go rename to concept/main.go diff --git a/tar/asm/assemble.go b/tar/asm/assemble.go index ec15612..d624450 100644 --- a/tar/asm/assemble.go +++ b/tar/asm/assemble.go @@ -3,13 +3,15 @@ package asm import ( "bytes" "fmt" + "hash" "hash/crc64" "io" + "sync" "github.com/vbatts/tar-split/tar/storage" ) -// NewOutputTarStream returns an io.ReadCloser that is an assemble tar archive +// NewOutputTarStream returns an io.ReadCloser that is an assembled tar archive // stream. // // It takes a storage.FileGetter, for mapping the file payloads that are to be read in, @@ -23,44 +25,106 @@ func NewOutputTarStream(fg storage.FileGetter, up storage.Unpacker) io.ReadClose } pr, pw := io.Pipe() go func() { - for { - entry, err := up.Next() - if err != nil { - pw.CloseWithError(err) - break - } - switch entry.Type { - case storage.SegmentType: - if _, err := pw.Write(entry.Payload); err != nil { - pw.CloseWithError(err) - break - } - case storage.FileType: - if entry.Size == 0 { - continue - } - fh, err := fg.Get(entry.Name) - if err != nil { - pw.CloseWithError(err) - break - } - defer fh.Close() - c := crc64.New(storage.CRCTable) - tRdr := io.TeeReader(fh, c) - if _, err := io.Copy(pw, tRdr); err != nil { - pw.CloseWithError(err) - break - } - if !bytes.Equal(c.Sum(nil), entry.Payload) { - // I would rather this be a comparable ErrInvalidChecksum or such, - // but since it's coming through the PipeReader, the context of - // _which_ file would be lost... - pw.CloseWithError(fmt.Errorf("file integrity checksum failed for %q", entry.Name)) - break - } - } + err := WriteOutputTarStream(fg, up, pw) + if err != nil { + pw.CloseWithError(err) + } else { + pw.Close() } - pw.Close() }() return pr } + +// WriteOutputTarStream writes assembled tar archive to a writer. +func WriteOutputTarStream(fg storage.FileGetter, up storage.Unpacker, w io.Writer) error { + // ... Since these are interfaces, this is possible, so let's not have a nil pointer + if fg == nil || up == nil { + return nil + } + var copyBuffer []byte + var crcHash hash.Hash + var crcSum []byte + var multiWriter io.Writer + for { + entry, err := up.Next() + if err != nil { + if err == io.EOF { + return nil + } + return err + } + switch entry.Type { + case storage.SegmentType: + if _, err := w.Write(entry.Payload); err != nil { + return err + } + case storage.FileType: + if entry.Size == 0 { + continue + } + fh, err := fg.Get(entry.GetName()) + if err != nil { + return err + } + if crcHash == nil { + crcHash = crc64.New(storage.CRCTable) + crcSum = make([]byte, 8) + multiWriter = io.MultiWriter(w, crcHash) + copyBuffer = byteBufferPool.Get().([]byte) + defer byteBufferPool.Put(copyBuffer) + } else { + crcHash.Reset() + } + + if _, err := copyWithBuffer(multiWriter, fh, copyBuffer); err != nil { + fh.Close() + return err + } + + if !bytes.Equal(crcHash.Sum(crcSum[:0]), entry.Payload) { + // I would rather this be a comparable ErrInvalidChecksum or such, + // but since it's coming through the PipeReader, the context of + // _which_ file would be lost... + fh.Close() + return fmt.Errorf("file integrity checksum failed for %q", entry.GetName()) + } + fh.Close() + } + } +} + +var byteBufferPool = &sync.Pool{ + New: func() interface{} { + return make([]byte, 32*1024) + }, +} + +// copyWithBuffer is taken from stdlib io.Copy implementation +// https://github.com/golang/go/blob/go1.5.1/src/io/io.go#L367 +func copyWithBuffer(dst io.Writer, src io.Reader, buf []byte) (written int64, err error) { + for { + nr, er := src.Read(buf) + if nr > 0 { + nw, ew := dst.Write(buf[0:nr]) + if nw > 0 { + written += int64(nw) + } + if ew != nil { + err = ew + break + } + if nr != nw { + err = io.ErrShortWrite + break + } + } + if er == io.EOF { + break + } + if er != nil { + err = er + break + } + } + return written, err +} diff --git a/tar/asm/assemble_test.go b/tar/asm/assemble_test.go index 203e716..afdce9d 100644 --- a/tar/asm/assemble_test.go +++ b/tar/asm/assemble_test.go @@ -5,6 +5,7 @@ import ( "compress/gzip" "crypto/sha1" "fmt" + "hash/crc64" "io" "io/ioutil" "os" @@ -33,108 +34,223 @@ var entries = []struct { Payload: []byte{126, 72, 89, 239, 230, 252, 160, 187}, Size: 26, }, - + Body: []byte("café con leche, por favor"), + }, + { + Entry: storage.Entry{ + Type: storage.FileType, + NameRaw: []byte{0x66, 0x69, 0x6c, 0x65, 0x2d, 0xe4}, // this is invalid UTF-8. Just checking the round trip. + Payload: []byte{126, 72, 89, 239, 230, 252, 160, 187}, + Size: 26, + }, + Body: []byte("café con leche, por favor"), + }, +} +var entriesMangled = []struct { + Entry storage.Entry + Body []byte +}{ + { + Entry: storage.Entry{ + Type: storage.FileType, + Name: "./hurr.txt", + Payload: []byte{3, 116, 164, 177, 171, 236, 107, 78}, + Size: 20, + }, + // switch + Body: []byte("imma derp til I hurr"), + }, + { + Entry: storage.Entry{ + Type: storage.FileType, + Name: "./ermahgerd.txt", + Payload: []byte{127, 72, 89, 239, 230, 252, 160, 187}, + Size: 26, + }, + // san not con + Body: []byte("café sans leche, por favor"), + }, + { + Entry: storage.Entry{ + Type: storage.FileType, + NameRaw: []byte{0x66, 0x69, 0x6c, 0x65, 0x2d, 0xe4}, + Payload: []byte{127, 72, 89, 239, 230, 252, 160, 187}, + Size: 26, + }, Body: []byte("café con leche, por favor"), }, } -func TestTarStreamOld(t *testing.T) { +func TestTarStreamMangledGetterPutter(t *testing.T) { fgp := storage.NewBufferFileGetPutter() // first lets prep a GetPutter and Packer for i := range entries { if entries[i].Entry.Type == storage.FileType { - j, csum, err := fgp.Put(entries[i].Entry.Name, bytes.NewBuffer(entries[i].Body)) + j, csum, err := fgp.Put(entries[i].Entry.GetName(), bytes.NewBuffer(entries[i].Body)) if err != nil { t.Error(err) } if j != entries[i].Entry.Size { t.Errorf("size %q: expected %d; got %d", - entries[i].Entry.Name, + entries[i].Entry.GetName(), entries[i].Entry.Size, j) } if !bytes.Equal(csum, entries[i].Entry.Payload) { t.Errorf("checksum %q: expected %v; got %v", - entries[i].Entry.Name, + entries[i].Entry.GetName(), entries[i].Entry.Payload, csum) } } } - // next we'll use these to produce a tar stream. - _ = NewOutputTarStream(fgp, nil) - // TODO finish this + for _, e := range entriesMangled { + if e.Entry.Type == storage.FileType { + rdr, err := fgp.Get(e.Entry.GetName()) + if err != nil { + t.Error(err) + } + c := crc64.New(storage.CRCTable) + i, err := io.Copy(c, rdr) + if err != nil { + t.Fatal(err) + } + rdr.Close() + + csum := c.Sum(nil) + if bytes.Equal(csum, e.Entry.Payload) { + t.Errorf("wrote %d bytes. checksum for %q should not have matched! %v", + i, + e.Entry.GetName(), + csum) + } + } + } +} + +var testCases = []struct { + path string + expectedSHA1Sum string + expectedSize int64 +}{ + {"./testdata/t.tar.gz", "1eb237ff69bca6e22789ecb05b45d35ca307adbd", 10240}, + {"./testdata/longlink.tar.gz", "d9f6babe107b7247953dff6b5b5ae31a3a880add", 20480}, + {"./testdata/fatlonglink.tar.gz", "8537f03f89aeef537382f8b0bb065d93e03b0be8", 26234880}, + {"./testdata/iso-8859.tar.gz", "ddafa51cb03c74ec117ab366ee2240d13bba1ec3", 10240}, + {"./testdata/extranils.tar.gz", "e187b4b3e739deaccc257342f4940f34403dc588", 10648}, + {"./testdata/notenoughnils.tar.gz", "72f93f41efd95290baa5c174c234f5d4c22ce601", 512}, } func TestTarStream(t *testing.T) { - var ( - expectedSum = "1eb237ff69bca6e22789ecb05b45d35ca307adbd" - expectedSize int64 = 10240 - ) - fh, err := os.Open("./testdata/t.tar.gz") - if err != nil { - t.Fatal(err) - } - defer fh.Close() - gzRdr, err := gzip.NewReader(fh) - if err != nil { - t.Fatal(err) - } - defer gzRdr.Close() + for _, tc := range testCases { + fh, err := os.Open(tc.path) + if err != nil { + t.Fatal(err) + } + defer fh.Close() + gzRdr, err := gzip.NewReader(fh) + if err != nil { + t.Fatal(err) + } + defer gzRdr.Close() - // Setup where we'll store the metadata - w := bytes.NewBuffer([]byte{}) - sp := storage.NewJSONPacker(w) - fgp := storage.NewBufferFileGetPutter() + // Setup where we'll store the metadata + w := bytes.NewBuffer([]byte{}) + sp := storage.NewJSONPacker(w) + fgp := storage.NewBufferFileGetPutter() - // wrap the disassembly stream - tarStream, err := NewInputTarStream(gzRdr, sp, fgp) - if err != nil { - t.Fatal(err) - } + // wrap the disassembly stream + tarStream, err := NewInputTarStream(gzRdr, sp, fgp) + if err != nil { + t.Fatal(err) + } - // get a sum of the stream after it has passed through to ensure it's the same. - h0 := sha1.New() - tRdr0 := io.TeeReader(tarStream, h0) + // get a sum of the stream after it has passed through to ensure it's the same. + h0 := sha1.New() + i, err := io.Copy(h0, tarStream) + if err != nil { + t.Fatal(err) + } - // read it all to the bit bucket - i, err := io.Copy(ioutil.Discard, tRdr0) - if err != nil { - t.Fatal(err) - } + if i != tc.expectedSize { + t.Errorf("size of tar: expected %d; got %d", tc.expectedSize, i) + } + if fmt.Sprintf("%x", h0.Sum(nil)) != tc.expectedSHA1Sum { + t.Fatalf("checksum of tar: expected %s; got %x", tc.expectedSHA1Sum, h0.Sum(nil)) + } - if i != expectedSize { - t.Errorf("size of tar: expected %d; got %d", expectedSize, i) - } - if fmt.Sprintf("%x", h0.Sum(nil)) != expectedSum { - t.Fatalf("checksum of tar: expected %s; got %x", expectedSum, h0.Sum(nil)) - } + //t.Logf("%s", w.String()) // if we fail, then show the packed info - t.Logf("%s", w.String()) // if we fail, then show the packed info + // If we've made it this far, then we'll turn it around and create a tar + // stream from the packed metadata and buffered file contents. + r := bytes.NewBuffer(w.Bytes()) + sup := storage.NewJSONUnpacker(r) + // and reuse the fgp that we Put the payloads to. - // If we've made it this far, then we'll turn it around and create a tar - // stream from the packed metadata and buffered file contents. - r := bytes.NewBuffer(w.Bytes()) - sup := storage.NewJSONUnpacker(r) - // and reuse the fgp that we Put the payloads to. + rc := NewOutputTarStream(fgp, sup) + h1 := sha1.New() + i, err = io.Copy(h1, rc) + if err != nil { + t.Fatal(err) + } - rc := NewOutputTarStream(fgp, sup) - h1 := sha1.New() - tRdr1 := io.TeeReader(rc, h1) - - // read it all to the bit bucket - i, err = io.Copy(ioutil.Discard, tRdr1) - if err != nil { - t.Fatal(err) - } - - if i != expectedSize { - t.Errorf("size of output tar: expected %d; got %d", expectedSize, i) - } - if fmt.Sprintf("%x", h1.Sum(nil)) != expectedSum { - t.Fatalf("checksum of output tar: expected %s; got %x", expectedSum, h1.Sum(nil)) + if i != tc.expectedSize { + t.Errorf("size of output tar: expected %d; got %d", tc.expectedSize, i) + } + if fmt.Sprintf("%x", h1.Sum(nil)) != tc.expectedSHA1Sum { + t.Fatalf("checksum of output tar: expected %s; got %x", tc.expectedSHA1Sum, h1.Sum(nil)) + } + } +} + +func BenchmarkAsm(b *testing.B) { + for i := 0; i < b.N; i++ { + for _, tc := range testCases { + func() { + fh, err := os.Open(tc.path) + if err != nil { + b.Fatal(err) + } + defer fh.Close() + gzRdr, err := gzip.NewReader(fh) + if err != nil { + b.Fatal(err) + } + defer gzRdr.Close() + + // Setup where we'll store the metadata + w := bytes.NewBuffer([]byte{}) + sp := storage.NewJSONPacker(w) + fgp := storage.NewBufferFileGetPutter() + + // wrap the disassembly stream + tarStream, err := NewInputTarStream(gzRdr, sp, fgp) + if err != nil { + b.Fatal(err) + } + // read it all to the bit bucket + i1, err := io.Copy(ioutil.Discard, tarStream) + if err != nil { + b.Fatal(err) + } + + r := bytes.NewBuffer(w.Bytes()) + sup := storage.NewJSONUnpacker(r) + // and reuse the fgp that we Put the payloads to. + + rc := NewOutputTarStream(fgp, sup) + + i2, err := io.Copy(ioutil.Discard, rc) + if err != nil { + b.Fatal(err) + } + if i1 != i2 { + b.Errorf("%s: input(%d) and ouput(%d) byte count didn't match", tc.path, i1, i2) + } + }() + } } } diff --git a/tar/asm/disassemble.go b/tar/asm/disassemble.go index de25db0..54ef23a 100644 --- a/tar/asm/disassemble.go +++ b/tar/asm/disassemble.go @@ -22,8 +22,8 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io // What to do here... folks will want their own access to the Reader that is // their tar archive stream, but we'll need that same stream to use our // forked 'archive/tar'. - // Perhaps do an io.TeeReader that hand back an io.Reader for them to read - // from, and we'll mitm the stream to store metadata. + // Perhaps do an io.TeeReader that hands back an io.Reader for them to read + // from, and we'll MITM the stream to store metadata. // We'll need a storage.FilePutter too ... // Another concern, whether to do any storage.FilePutter operations, such that we @@ -32,7 +32,7 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io // Perhaps we have a DiscardFilePutter that is a bit bucket. // we'll return the pipe reader, since TeeReader does not buffer and will - // only read what the outputRdr Read's. Since Tar archive's have padding on + // only read what the outputRdr Read's. Since Tar archives have padding on // the end, we want to be the one reading the padding, even if the user's // `archive/tar` doesn't care. pR, pW := io.Pipe() @@ -55,12 +55,15 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io } // even when an EOF is reached, there is often 1024 null bytes on // the end of an archive. Collect them too. - _, err := p.AddEntry(storage.Entry{ - Type: storage.SegmentType, - Payload: tr.RawBytes(), - }) - if err != nil { - pW.CloseWithError(err) + if b := tr.RawBytes(); len(b) > 0 { + _, err := p.AddEntry(storage.Entry{ + Type: storage.SegmentType, + Payload: b, + }) + if err != nil { + pW.CloseWithError(err) + return + } } break // not return. We need the end of the reader. } @@ -68,11 +71,15 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io break // not return. We need the end of the reader. } - if _, err := p.AddEntry(storage.Entry{ - Type: storage.SegmentType, - Payload: tr.RawBytes(), - }); err != nil { - pW.CloseWithError(err) + if b := tr.RawBytes(); len(b) > 0 { + _, err := p.AddEntry(storage.Entry{ + Type: storage.SegmentType, + Payload: b, + }) + if err != nil { + pW.CloseWithError(err) + return + } } var csum []byte @@ -81,18 +88,23 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io _, csum, err = fp.Put(hdr.Name, tr) if err != nil { pW.CloseWithError(err) + return } } - // File entries added, regardless of size - _, err = p.AddEntry(storage.Entry{ + entry := storage.Entry{ Type: storage.FileType, - Name: hdr.Name, Size: hdr.Size, Payload: csum, - }) + } + // For proper marshalling of non-utf8 characters + entry.SetName(hdr.Name) + + // File entries added, regardless of size + _, err = p.AddEntry(entry) if err != nil { pW.CloseWithError(err) + return } if b := tr.RawBytes(); len(b) > 0 { @@ -102,6 +114,7 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io }) if err != nil { pW.CloseWithError(err) + return } } } @@ -111,6 +124,7 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io remainder, err := ioutil.ReadAll(outputRdr) if err != nil && err != io.EOF { pW.CloseWithError(err) + return } _, err = p.AddEntry(storage.Entry{ Type: storage.SegmentType, @@ -118,9 +132,9 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io }) if err != nil { pW.CloseWithError(err) - } else { - pW.Close() + return } + pW.Close() }() return pR, nil diff --git a/tar/asm/testdata/extranils.tar.gz b/tar/asm/testdata/extranils.tar.gz new file mode 100644 index 0000000..70caf6e Binary files /dev/null and b/tar/asm/testdata/extranils.tar.gz differ diff --git a/tar/asm/testdata/fatlonglink.tar.gz b/tar/asm/testdata/fatlonglink.tar.gz new file mode 100644 index 0000000..0d8ed14 Binary files /dev/null and b/tar/asm/testdata/fatlonglink.tar.gz differ diff --git a/tar/asm/testdata/iso-8859.tar.gz b/tar/asm/testdata/iso-8859.tar.gz new file mode 100644 index 0000000..3e87f30 Binary files /dev/null and b/tar/asm/testdata/iso-8859.tar.gz differ diff --git a/tar/asm/testdata/longlink.tar.gz b/tar/asm/testdata/longlink.tar.gz new file mode 100644 index 0000000..cb21db5 Binary files /dev/null and b/tar/asm/testdata/longlink.tar.gz differ diff --git a/tar/asm/testdata/notenoughnils.tar.gz b/tar/asm/testdata/notenoughnils.tar.gz new file mode 100644 index 0000000..146bb00 Binary files /dev/null and b/tar/asm/testdata/notenoughnils.tar.gz differ diff --git a/tar/storage/doc.go b/tar/storage/doc.go index 57b61bc..83f7089 100644 --- a/tar/storage/doc.go +++ b/tar/storage/doc.go @@ -5,7 +5,7 @@ Packing and unpacking the Entries of the stream. The types of streams are either segments of raw bytes (for the raw headers and various padding) and for an entry marking a file payload. -The raw bytes are stored precisely in the packed (marshalled) Entry. Where as +The raw bytes are stored precisely in the packed (marshalled) Entry, whereas the file payload marker include the name of the file, size, and crc64 checksum (for basic file integrity). */ diff --git a/tar/storage/entry.go b/tar/storage/entry.go index 961af49..c91e7ea 100644 --- a/tar/storage/entry.go +++ b/tar/storage/entry.go @@ -1,5 +1,7 @@ package storage +import "unicode/utf8" + // Entries is for sorting by Position type Entries []Entry @@ -19,11 +21,11 @@ const ( // SegmentType represents a raw bytes segment from the archive stream. These raw // byte segments consist of the raw headers and various padding. // - // It's payload is to be marshalled base64 encoded. + // Its payload is to be marshalled base64 encoded. SegmentType ) -// Entry is a the structure for packing and unpacking the information read from +// Entry is the structure for packing and unpacking the information read from // the Tar archive. // // FileType Payload checksum is using `hash/crc64` for basic file integrity, @@ -32,8 +34,45 @@ const ( // collisions in a sample of 18.2 million, CRC64 had none. type Entry struct { Type Type `json:"type"` - Name string `json:"name",omitempty` - Size int64 `json:"size",omitempty` - Payload []byte `json:"payload"` // SegmentType store payload here; FileType store crc64 checksum here; + Name string `json:"name,omitempty"` + NameRaw []byte `json:"name_raw,omitempty"` + Size int64 `json:"size,omitempty"` + Payload []byte `json:"payload"` // SegmentType stores payload here; FileType stores crc64 checksum here; Position int `json:"position"` } + +// SetName will check name for valid UTF-8 string, and set the appropriate +// field. See https://github.com/vbatts/tar-split/issues/17 +func (e *Entry) SetName(name string) { + if utf8.ValidString(name) { + e.Name = name + } else { + e.NameRaw = []byte(name) + } +} + +// SetNameBytes will check name for valid UTF-8 string, and set the appropriate +// field +func (e *Entry) SetNameBytes(name []byte) { + if utf8.Valid(name) { + e.Name = string(name) + } else { + e.NameRaw = name + } +} + +// GetName returns the string for the entry's name, regardless of the field stored in +func (e *Entry) GetName() string { + if len(e.NameRaw) > 0 { + return string(e.NameRaw) + } + return e.Name +} + +// GetNameBytes returns the bytes for the entry's name, regardless of the field stored in +func (e *Entry) GetNameBytes() []byte { + if len(e.NameRaw) > 0 { + return e.NameRaw + } + return []byte(e.Name) +} diff --git a/tar/storage/entry_test.go b/tar/storage/entry_test.go index c797bca..90d103e 100644 --- a/tar/storage/entry_test.go +++ b/tar/storage/entry_test.go @@ -39,10 +39,10 @@ func TestEntries(t *testing.T) { func TestFile(t *testing.T) { f := Entry{ Type: FileType, - Name: "./hello.txt", Size: 100, Position: 2, } + f.SetName("./hello.txt") buf, err := json.Marshal(f) if err != nil { @@ -54,8 +54,37 @@ func TestFile(t *testing.T) { t.Fatal(err) } - if f.Name != f1.Name { - t.Errorf("expected Name %q, got %q", f.Name, f1.Name) + if f.GetName() != f1.GetName() { + t.Errorf("expected Name %q, got %q", f.GetName(), f1.GetName()) + } + if f.Size != f1.Size { + t.Errorf("expected Size %q, got %q", f.Size, f1.Size) + } + if f.Position != f1.Position { + t.Errorf("expected Position %q, got %q", f.Position, f1.Position) + } +} + +func TestFileRaw(t *testing.T) { + f := Entry{ + Type: FileType, + Size: 100, + Position: 2, + } + f.SetNameBytes([]byte{0x2E, 0x2F, 0x68, 0x65, 0x6C, 0x6C, 0x6F, 0xE4, 0x2E, 0x74, 0x78, 0x74}) + + buf, err := json.Marshal(f) + if err != nil { + t.Fatal(err) + } + + f1 := Entry{} + if err = json.Unmarshal(buf, &f1); err != nil { + t.Fatal(err) + } + + if f.GetName() != f1.GetName() { + t.Errorf("expected Name %q, got %q", f.GetName(), f1.GetName()) } if f.Size != f1.Size { t.Errorf("expected Size %q, got %q", f.Size, f1.Size) diff --git a/tar/storage/getter.go b/tar/storage/getter.go index 5d46e6a..ae11f8f 100644 --- a/tar/storage/getter.go +++ b/tar/storage/getter.go @@ -5,25 +5,24 @@ import ( "errors" "hash/crc64" "io" - "io/ioutil" "os" - "path" + "path/filepath" ) -// FileGetter is the interface for getting a stream of a file payload, address -// by name/filepath. Presumably, the names will be scoped to relative file -// paths. +// FileGetter is the interface for getting a stream of a file payload, +// addressed by name/filename. Presumably, the names will be scoped to relative +// file paths. type FileGetter interface { // Get returns a stream for the provided file path - Get(filepath string) (output io.ReadCloser, err error) + Get(filename string) (output io.ReadCloser, err error) } // FilePutter is the interface for storing a stream of a file payload, -// addressed by name/filepath. +// addressed by name/filename. type FilePutter interface { // Put returns the size of the stream received, and the crc64 checksum for // the provided stream - Put(filepath string, input io.Reader) (size int64, checksum []byte, err error) + Put(filename string, input io.Reader) (size int64, checksum []byte, err error) } // FileGetPutter is the interface that groups both Getting and Putting file @@ -44,8 +43,7 @@ type pathFileGetter struct { } func (pfg pathFileGetter) Get(filename string) (io.ReadCloser, error) { - // FIXME might should have a check for '../../../../etc/passwd' attempts? - return os.Open(path.Join(pfg.root, filename)) + return os.Open(filepath.Join(pfg.root, filename)) } type bufferFileGetPutter struct { @@ -61,15 +59,15 @@ func (bfgp bufferFileGetPutter) Get(name string) (io.ReadCloser, error) { } func (bfgp *bufferFileGetPutter) Put(name string, r io.Reader) (int64, []byte, error) { - c := crc64.New(CRCTable) - tRdr := io.TeeReader(r, c) - b := bytes.NewBuffer([]byte{}) - i, err := io.Copy(b, tRdr) + crc := crc64.New(CRCTable) + buf := bytes.NewBuffer(nil) + cw := io.MultiWriter(crc, buf) + i, err := io.Copy(cw, r) if err != nil { return 0, nil, err } - bfgp.files[name] = b.Bytes() - return i, c.Sum(nil), nil + bfgp.files[name] = buf.Bytes() + return i, crc.Sum(nil), nil } type readCloserWrapper struct { @@ -78,7 +76,7 @@ type readCloserWrapper struct { func (w *readCloserWrapper) Close() error { return nil } -// NewBufferFileGetPutter is simple in memory FileGetPutter +// NewBufferFileGetPutter is a simple in-memory FileGetPutter // // Implication is this is memory intensive... // Probably best for testing or light weight cases. @@ -98,8 +96,7 @@ type bitBucketFilePutter struct { func (bbfp *bitBucketFilePutter) Put(name string, r io.Reader) (int64, []byte, error) { c := crc64.New(CRCTable) - tRdr := io.TeeReader(r, c) - i, err := io.Copy(ioutil.Discard, tRdr) + i, err := io.Copy(c, r) return i, c.Sum(nil), err } diff --git a/tar/storage/getter_test.go b/tar/storage/getter_test.go index 5a6fcc7..c06cff0 100644 --- a/tar/storage/getter_test.go +++ b/tar/storage/getter_test.go @@ -2,7 +2,9 @@ package storage import ( "bytes" + "fmt" "io/ioutil" + "strings" "testing" ) @@ -39,6 +41,7 @@ func TestGetter(t *testing.T) { } } } + func TestPutter(t *testing.T) { fp := NewDiscardFilePutter() // map[filename]map[body]crc64sum @@ -60,3 +63,22 @@ func TestPutter(t *testing.T) { } } } + +func BenchmarkPutter(b *testing.B) { + files := []string{ + strings.Repeat("foo", 1000), + strings.Repeat("bar", 1000), + strings.Repeat("baz", 1000), + strings.Repeat("fooz", 1000), + strings.Repeat("vbatts", 1000), + strings.Repeat("systemd", 1000), + } + for i := 0; i < b.N; i++ { + fgp := NewBufferFileGetPutter() + for n, body := range files { + if _, _, err := fgp.Put(fmt.Sprintf("%d", n), bytes.NewBufferString(body)); err != nil { + b.Fatal(err) + } + } + } +} diff --git a/tar/storage/packer.go b/tar/storage/packer.go index 6c4364b..aba6948 100644 --- a/tar/storage/packer.go +++ b/tar/storage/packer.go @@ -1,15 +1,15 @@ package storage import ( - "bufio" "encoding/json" "errors" "io" - "path" + "path/filepath" + "unicode/utf8" ) -// ErrDuplicatePath is occured when a tar archive has more than one entry for -// the same file path +// ErrDuplicatePath occurs when a tar archive has more than one entry for the +// same file path var ErrDuplicatePath = errors.New("duplicates of file paths not supported") // Packer describes the methods to pack Entries to a storage destination @@ -32,40 +32,24 @@ type PackUnpacker interface { */ type jsonUnpacker struct { - r io.Reader - b *bufio.Reader - isEOF bool - seen seenNames + seen seenNames + dec *json.Decoder } func (jup *jsonUnpacker) Next() (*Entry, error) { var e Entry - if jup.isEOF { - // since ReadBytes() will return read bytes AND an EOF, we handle it this - // round-a-bout way so we can Unmarshal the tail with relevant errors, but - // still get an io.EOF when the stream is ended. - return nil, io.EOF - } - line, err := jup.b.ReadBytes('\n') - if err != nil && err != io.EOF { + err := jup.dec.Decode(&e) + if err != nil { return nil, err - } else if err == io.EOF { - jup.isEOF = true - } - - err = json.Unmarshal(line, &e) - if err != nil && jup.isEOF { - // if the remainder actually _wasn't_ a remaining json structure, then just EOF - return nil, io.EOF } // check for dup name if e.Type == FileType { - cName := path.Clean(e.Name) + cName := filepath.Clean(e.GetName()) if _, ok := jup.seen[cName]; ok { return nil, ErrDuplicatePath } - jup.seen[cName] = emptyByte + jup.seen[cName] = struct{}{} } return &e, err @@ -77,8 +61,7 @@ func (jup *jsonUnpacker) Next() (*Entry, error) { // Each Entry read are expected to be delimited by new line. func NewJSONUnpacker(r io.Reader) Unpacker { return &jsonUnpacker{ - r: r, - b: bufio.NewReader(r), + dec: json.NewDecoder(r), seen: seenNames{}, } } @@ -90,20 +73,24 @@ type jsonPacker struct { seen seenNames } -type seenNames map[string]byte - -// used in the seenNames map. byte is a uint8, and we'll re-use the same one -// for minimalism. -const emptyByte byte = 0 +type seenNames map[string]struct{} func (jp *jsonPacker) AddEntry(e Entry) (int, error) { + // if Name is not valid utf8, switch it to raw first. + if e.Name != "" { + if !utf8.ValidString(e.Name) { + e.NameRaw = []byte(e.Name) + e.Name = "" + } + } + // check early for dup name if e.Type == FileType { - cName := path.Clean(e.Name) + cName := filepath.Clean(e.GetName()) if _, ok := jp.seen[cName]; ok { return -1, ErrDuplicatePath } - jp.seen[cName] = emptyByte + jp.seen[cName] = struct{}{} } e.Position = jp.pos @@ -117,7 +104,7 @@ func (jp *jsonPacker) AddEntry(e Entry) (int, error) { return e.Position, nil } -// NewJSONPacker provides an Packer that writes each Entry (SegmentType and +// NewJSONPacker provides a Packer that writes each Entry (SegmentType and // FileType) as a json document. // // The Entries are delimited by new line. diff --git a/tar/storage/packer_test.go b/tar/storage/packer_test.go index 1c6101f..7d93371 100644 --- a/tar/storage/packer_test.go +++ b/tar/storage/packer_test.go @@ -4,6 +4,8 @@ import ( "bytes" "compress/gzip" "io" + "io/ioutil" + "os" "testing" ) @@ -159,5 +161,58 @@ func TestGzip(t *testing.T) { if len(entries) != len(e) { t.Errorf("expected %d entries, got %d", len(e), len(entries)) } - +} + +func BenchmarkGetPut(b *testing.B) { + e := []Entry{ + Entry{ + Type: SegmentType, + Payload: []byte("how"), + }, + Entry{ + Type: SegmentType, + Payload: []byte("y'all"), + }, + Entry{ + Type: FileType, + Name: "./hurr.txt", + Payload: []byte("deadbeef"), + }, + Entry{ + Type: SegmentType, + Payload: []byte("doin"), + }, + } + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + func() { + fh, err := ioutil.TempFile("", "tar-split.") + if err != nil { + b.Fatal(err) + } + defer os.Remove(fh.Name()) + defer fh.Close() + + jp := NewJSONPacker(fh) + for i := range e { + if _, err := jp.AddEntry(e[i]); err != nil { + b.Fatal(err) + } + } + fh.Sync() + + up := NewJSONUnpacker(fh) + for { + _, err := up.Next() + if err != nil { + if err == io.EOF { + break + } + b.Fatal(err) + } + } + + }() + } + }) } diff --git a/tar_benchmark_test.go b/tar_benchmark_test.go new file mode 100644 index 0000000..d946f2a --- /dev/null +++ b/tar_benchmark_test.go @@ -0,0 +1,84 @@ +package tartest + +import ( + "io" + "io/ioutil" + "os" + "testing" + + upTar "archive/tar" + + ourTar "github.com/vbatts/tar-split/archive/tar" +) + +var testfile = "./archive/tar/testdata/sparse-formats.tar" + +func BenchmarkUpstreamTar(b *testing.B) { + for n := 0; n < b.N; n++ { + fh, err := os.Open(testfile) + if err != nil { + b.Fatal(err) + } + tr := upTar.NewReader(fh) + for { + _, err := tr.Next() + if err != nil { + if err == io.EOF { + break + } + fh.Close() + b.Fatal(err) + } + io.Copy(ioutil.Discard, tr) + } + fh.Close() + } +} + +func BenchmarkOurTarNoAccounting(b *testing.B) { + for n := 0; n < b.N; n++ { + fh, err := os.Open(testfile) + if err != nil { + b.Fatal(err) + } + tr := ourTar.NewReader(fh) + tr.RawAccounting = false // this is default, but explicit here + for { + _, err := tr.Next() + if err != nil { + if err == io.EOF { + break + } + fh.Close() + b.Fatal(err) + } + io.Copy(ioutil.Discard, tr) + } + fh.Close() + } +} +func BenchmarkOurTarYesAccounting(b *testing.B) { + for n := 0; n < b.N; n++ { + fh, err := os.Open(testfile) + if err != nil { + b.Fatal(err) + } + tr := ourTar.NewReader(fh) + tr.RawAccounting = true // This enables mechanics for collecting raw bytes + for { + _ = tr.RawBytes() + _, err := tr.Next() + _ = tr.RawBytes() + if err != nil { + if err == io.EOF { + break + } + fh.Close() + b.Fatal(err) + } + io.Copy(ioutil.Discard, tr) + _ = tr.RawBytes() + } + fh.Close() + } +} diff --git a/version/gen.go b/version/gen.go new file mode 100644 index 0000000..d290d83 --- /dev/null +++ b/version/gen.go @@ -0,0 +1,4 @@ +package version + +// from `go get github.com/vbatts/go-get-version` +//go:generate go-get-version -package version -variable VERSION -output version.go diff --git a/version/version.go b/version/version.go new file mode 100644 index 0000000..f317010 --- /dev/null +++ b/version/version.go @@ -0,0 +1,7 @@ +package version + +// AUTO-GENEREATED. DO NOT EDIT +// 2016-09-26 19:53:30.825879 -0400 EDT + +// VERSION is the generated version from /home/vbatts/src/vb/tar-split/version +var VERSION = "v0.10.1-4-gf280282"