Merge pull request #38 from vbatts/travis

travis: test more go versions
archive/tar: monotonic clock adjustment
2017-03-14 11:24:38 -04:00 · 2017-03-14 11:04:10 -04:00 · 2017-03-14 08:38:13 -04:00 · 2017-03-13 18:28:54 -04:00 · 2016-09-27 02:54:18 +00:00 · 2016-09-26 19:53:52 -04:00
46 changed files with 2643 additions and 1096 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -1,13 +1,18 @@
 language: go
 go:
-  - 1.4.1
-  - 1.3.3
+  - tip
+  - 1.x
+  - 1.8.x
+  - 1.7.x
+  - 1.6.x
+  - 1.5.x

 # let us have pretty, fast Docker-based Travis workers!
 sudo: false

-# we don't need "go get" here <3
-install: true
+install:
+  - go get -d ./...

 script:
  - go test -v ./...
+  - go vet ./...
--- a/DESIGN.md
+++ b/DESIGN.md
@ -1,36 +0,0 @@
-Flow of TAR stream
-==================
-
-The underlying use of `github.com/vbatts/tar-split/archive/tar` is most similar
-to stdlib.
-
-
-Packer interface
----------------
-
-For ease of storage and usage of the raw bytes, there will be a storage
-interface, that accepts an io.Writer (This way you could pass it an in memory
-buffer or a file handle).
-
-Having a Packer interface can allow configuration of hash.Hash for file payloads
-and providing your own io.Writer.
-
-Instead of having a state directory to store all the header information for all
-Readers, we will leave that up to user of Reader. Because we can not assume an
-ID for each Reader, and keeping that information differentiated.
-
-
-
-State Directory
---------------
-
-Perhaps we could deduplicate the header info, by hashing the rawbytes and
-storing them in a directory tree like:
-
-	./ac/dc/beef
-
-Then reference the hash of the header info, in the positional records for the
-tar stream. Though this could be a future feature, and not required for an
-initial implementation. Also, this would imply an owned state directory, rather
-than just writing storage info to an io.Writer.
-
--- a/LICENSE
+++ b/LICENSE
@ -1,19 +1,28 @@
 Copyright (c) 2015 Vincent Batts, Raleigh, NC, USA

-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
+All rights reserved.

-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:

-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/README.md
+++ b/README.md
@ -1,25 +1,49 @@
-tar-split
-========
+# tar-split

 [![Build Status](https://travis-ci.org/vbatts/tar-split.svg?branch=master)](https://travis-ci.org/vbatts/tar-split)
+[![Go Report Card](https://goreportcard.com/badge/github.com/vbatts/tar-split)](https://goreportcard.com/report/github.com/vbatts/tar-split)

-Extend the upstream golang stdlib `archive/tar` library, to expose the raw
-bytes of the TAR, rather than just the marshalled headers and file stream.
+Pristinely disassembling a tar archive, and stashing needed raw bytes and offsets to reassemble a validating original archive.

-The goal being that by preserving the raw bytes of each header, padding bytes,
-and the raw file payload, one could reassemble the original archive.
+## Docs

-
-Docs
----
+Code API for libraries provided by `tar-split`:

 * https://godoc.org/github.com/vbatts/tar-split/tar/asm
 * https://godoc.org/github.com/vbatts/tar-split/tar/storage
 * https://godoc.org/github.com/vbatts/tar-split/archive/tar

+## Install

-Caveat
------
+The command line utility is installable via:
+
+```bash
+go get github.com/vbatts/tar-split/cmd/tar-split
+```
+
+## Usage
+
+For cli usage, see its [README.md](cmd/tar-split/README.md).
+For the library, see the [docs](#docs).
+
+## Demo
+
+### Basic disassembly and assembly
+
+This demonstrates the `tar-split` command and how to assemble a tar archive from the `tar-data.json.gz`.
+
+
+![basic cmd demo thumbnail](https://i.ytimg.com/vi/vh5wyjIOBtc/2.jpg?time=1445027151805)
+[youtube video of basic command demo](https://youtu.be/vh5wyjIOBtc)
+
+### Docker layer preservation
+
+This demonstrates the tar-split integration for docker-1.8, which provides consistent tar archives for the image layer content.
+
+![docker tar-split demo](https://i.ytimg.com/vi_webp/vh5wyjIOBtc/default.webp)
+[youtube video of docker layer checksums](https://youtu.be/tV_Dia8E8xw)
+
+## Caveat

 Eventually this should detect TARs that this is not possible with.

@ -37,85 +61,21 @@ same path, we will not support this feature. If there are more than one entries
 with the same path, expect an err (like `ErrDuplicatePath`) or a resulting tar
 stream that does not validate your original checksum/signature.

+## Contract

-Contract
--------
+Do not break the API of stdlib `archive/tar` in our fork (ideally find an upstream mergeable solution).

-Do not break the API of stdlib `archive/tar` in our fork (ideally find an
-upstream mergeable solution)
+## Std Version
+
+The version of golang stdlib `archive/tar` is from go1.6.
+It is minimally extended to expose the raw bytes of the TAR, rather than just the marshalled headers and file stream.


-Std Version
-----------
+## Design

-The version of golang stdlib `archive/tar` is from go1.4.1, and their master branch around [a9dddb53f](https://github.com/golang/go/tree/a9dddb53f)
+See the [design](concept/DESIGN.md).

-
-Example
-------
-
-First we'll get an archive to work with. For repeatability, we'll make an
-archive from what you've just cloned:
-
-```
-git archive --format=tar -o tar-split.tar HEAD .
-```
-
-Then build the example main.go:
-
-```
-go build ./main.go
-```
-
-Now run the example over the archive:
-
-```
-$ ./main tar-split.tar
-2015/02/20 15:00:58 writing "tar-split.tar" to "tar-split.tar.out"
-pax_global_header pre: 512 read: 52
-.travis.yml pre: 972 read: 374
-DESIGN.md pre: 650 read: 1131
-LICENSE pre: 917 read: 1075
-README.md pre: 973 read: 4289
-archive/ pre: 831 read: 0
-archive/tar/ pre: 512 read: 0
-archive/tar/common.go pre: 512 read: 7790
-[...]
-tar/storage/entry_test.go pre: 667 read: 1137
-tar/storage/getter.go pre: 911 read: 2741
-tar/storage/getter_test.go pre: 843 read: 1491
-tar/storage/packer.go pre: 557 read: 3141
-tar/storage/packer_test.go pre: 955 read: 3096
-EOF padding: 1512
-Remainder: 512
-Size: 215040; Sum: 215040
-```
-
-*What are we seeing here?* 
-
-* `pre` is the header of a file entry, and potentially the padding from the
-  end of the prior file's payload. Also with particular tar extensions and pax
-  attributes, the header can exceed 512 bytes.
-* `read` is the size of the file payload from the entry
-* `EOF padding` is the expected 1024 null bytes on the end of a tar archive,
-  plus potential padding from the end of the prior file entry's payload
-* `Remainder` is the remaining bytes of an archive. This is typically deadspace
-  as most tar implmentations will return after having reached the end of the
-  1024 null bytes. Though various implementations will include some amount of
-  bytes here, which will affect the checksum of the resulting tar archive,
-  therefore this must be accounted for as well.
-
-Ideally the input tar and output `*.out`, will match:
-
-```
-$ sha1sum tar-split.tar*
-ca9e19966b892d9ad5960414abac01ef585a1e22  tar-split.tar
-ca9e19966b892d9ad5960414abac01ef585a1e22  tar-split.tar.out
-```
-
-
-Stored Metadata
---------------
+## Stored Metadata

 Since the raw bytes of the headers and padding are stored, you may be wondering
 what the size implications are. The headers are at least 512 bytes per
@ -123,14 +83,16 @@ file (sometimes more), at least 1024 null bytes on the end, and then various
 padding. This makes for a constant linear growth in the stored metadata, with a
 naive storage implementation.

-Reusing our prior example's `tar-split.tar`, let's build the checksize.go example:
+First we'll get an archive to work with. For repeatability, we'll make an
+archive from what you've just cloned:

-```
-go build ./checksize.go
+```bash
+git archive --format=tar -o tar-split.tar HEAD .
 ```

-```
-$ ./checksize ./tar-split.tar
+```bash
+$ go get github.com/vbatts/tar-split/cmd/tar-split
+$ tar-split checksize ./tar-split.tar
 inspecting "tar-split.tar" (size 210k)
 -- number of files: 50
 -- size of metadata uncompressed: 53k
@ -143,10 +105,10 @@ implications are as little as 3kb.

 But let's look at a larger archive, with many files.

-```
+```bash
 $ ls -sh ./d.tar
 1.4G ./d.tar
-$ ./checksize ~/d.tar 
+$ tar-split checksize ~/d.tar 
 inspecting "/home/vbatts/d.tar" (size 1420749k)
 -- number of files: 38718
 -- size of metadata uncompressed: 43261k
@ -163,19 +125,14 @@ bytes-per-file rate for the storage implications.
 | ~ 1kb per/file | 0.06kb per/file |


-What's Next?
------------
+## What's Next?

 * More implementations of storage Packer and Unpacker
- - could be a redis or mongo backend
 * More implementations of FileGetter and FilePutter
- - could be a redis or mongo backend
-* cli tooling to assemble/disassemble a provided tar archive
 * would be interesting to have an assembler stream that implements `io.Seeker`

-License
-------

-See LICENSE
+## License

+See [LICENSE](LICENSE)

--- a/archive/tar/common.go
+++ b/archive/tar/common.go
@ -139,8 +139,8 @@ func (fi headerFileInfo) Mode() (mode os.FileMode) {
 	}

 	switch fi.h.Typeflag {
-	case TypeLink, TypeSymlink:
-		// hard link, symbolic link
+	case TypeSymlink:
+		// symbolic link
 		mode |= os.ModeSymlink
 	case TypeChar:
 		// character device node
@ -249,6 +249,30 @@ func FileInfoHeader(fi os.FileInfo, link string) (*Header, error) {
 	if fm&os.ModeSticky != 0 {
 		h.Mode |= c_ISVTX
 	}
+	// If possible, populate additional fields from OS-specific
+	// FileInfo fields.
+	if sys, ok := fi.Sys().(*Header); ok {
+		// This FileInfo came from a Header (not the OS). Use the
+		// original Header to populate all remaining fields.
+		h.Uid = sys.Uid
+		h.Gid = sys.Gid
+		h.Uname = sys.Uname
+		h.Gname = sys.Gname
+		h.AccessTime = sys.AccessTime
+		h.ChangeTime = sys.ChangeTime
+		if sys.Xattrs != nil {
+			h.Xattrs = make(map[string]string)
+			for k, v := range sys.Xattrs {
+				h.Xattrs[k] = v
+			}
+		}
+		if sys.Typeflag == TypeLink {
+			// hard link
+			h.Typeflag = TypeLink
+			h.Size = 0
+			h.Linkname = sys.Linkname
+		}
+	}
 	if sysStat != nil {
 		return h, sysStat(fi, h)
 	}
@ -303,3 +327,14 @@ func toASCII(s string) string {
 	}
 	return buf.String()
 }
+
+// isHeaderOnlyType checks if the given type flag is of the type that has no
+// data section even if a size is specified.
+func isHeaderOnlyType(flag byte) bool {
+	switch flag {
+	case TypeLink, TypeSymlink, TypeChar, TypeBlock, TypeDir, TypeFifo:
+		return true
+	default:
+		return false
+	}
+}
--- a/archive/tar/example_test.go
+++ b/archive/tar/example_test.go
@ -26,7 +26,7 @@ func Example() {
 	}{
 		{"readme.txt", "This archive contains some text files."},
 		{"gopher.txt", "Gopher names:\nGeorge\nGeoffrey\nGonzo"},
-		{"todo.txt", "Get animal handling licence."},
+		{"todo.txt", "Get animal handling license."},
 	}
 	for _, file := range files {
 		hdr := &tar.Header{
@ -76,5 +76,5 @@ func Example() {
 	// Geoffrey
 	// Gonzo
 	// Contents of todo.txt:
-	// Get animal handling licence.
+	// Get animal handling license.
 }
--- a/archive/tar/reader.go
+++ b/archive/tar/reader.go
@ -12,6 +12,7 @@ import (
 	"errors"
 	"io"
 	"io/ioutil"
+	"math"
 	"os"
 	"strconv"
 	"strings"
@ -39,6 +40,10 @@ type Reader struct {
 	rawBytes      *bytes.Buffer // last raw bits
 }

+type parser struct {
+	err error // Last error seen
+}
+
 // RawBytes accesses the raw bytes of the archive, apart from the file payload itself.
 // This includes the header and padding.
 //
@ -70,12 +75,36 @@ type regFileReader struct {
 	nb int64     // number of unread bytes for current file entry
 }

-// A sparseFileReader is a numBytesReader for reading sparse file data from a tar archive.
+// A sparseFileReader is a numBytesReader for reading sparse file data from a
+// tar archive.
 type sparseFileReader struct {
-	rfr *regFileReader // reads the sparse-encoded file data
-	sp  []sparseEntry  // the sparse map for the file
-	pos int64          // keeps track of file position
-	tot int64          // total size of the file
+	rfr   numBytesReader // Reads the sparse-encoded file data
+	sp    []sparseEntry  // The sparse map for the file
+	pos   int64          // Keeps track of file position
+	total int64          // Total size of the file
+}
+
+// A sparseEntry holds a single entry in a sparse file's sparse map.
+//
+// Sparse files are represented using a series of sparseEntrys.
+// Despite the name, a sparseEntry represents an actual data fragment that
+// references data found in the underlying archive stream. All regions not
+// covered by a sparseEntry are logically filled with zeros.
+//
+// For example, if the underlying raw file contains the 8-byte data:
+//	var compactData = "abcdefgh"
+//
+// And the sparse map has the following entries:
+//	var sp = []sparseEntry{
+//		{offset: 2,  numBytes: 5} // Data fragment for [2..6]
+//		{offset: 18, numBytes: 3} // Data fragment for [18..20]
+//	}
+//
+// Then the content of the resulting sparse file with a "real" size of 25 is:
+//	var sparseData = "\x00"*2 + "abcde" + "\x00"*11 + "fgh" + "\x00"*4
+type sparseEntry struct {
+	offset   int64 // Starting position of the fragment
+	numBytes int64 // Length of the fragment
 }

 // Keywords for GNU sparse files in a PAX extended header
@ -109,7 +138,6 @@ func NewReader(r io.Reader) *Reader { return &Reader{r: r} }
 //
 // io.EOF is returned at the end of the input.
 func (tr *Reader) Next() (*Header, error) {
-	var hdr *Header
 	if tr.RawAccounting {
 		if tr.rawBytes == nil {
 			tr.rawBytes = bytes.NewBuffer(nil)
@ -117,32 +145,72 @@ func (tr *Reader) Next() (*Header, error) {
 			tr.rawBytes.Reset()
 		}
 	}
-	if tr.err == nil {
-		tr.skipUnread()
-	}
+
 	if tr.err != nil {
-		return hdr, tr.err
+		return nil, tr.err
 	}
+
+	var hdr *Header
+	var extHdrs map[string]string
+
+	// Externally, Next iterates through the tar archive as if it is a series of
+	// files. Internally, the tar format often uses fake "files" to add meta
+	// data that describes the next file. These meta data "files" should not
+	// normally be visible to the outside. As such, this loop iterates through
+	// one or more "header files" until it finds a "normal file".
+loop:
+	for {
+		tr.err = tr.skipUnread()
+		if tr.err != nil {
+			return nil, tr.err
+		}
+
 		hdr = tr.readHeader()
-	if hdr == nil {
-		return hdr, tr.err
+		if tr.err != nil {
+			return nil, tr.err
 		}
-	// Check for PAX/GNU header.
+		// Check for PAX/GNU special headers and files.
 		switch hdr.Typeflag {
 		case TypeXHeader:
-		//  PAX extended header
-		headers, err := parsePAX(tr)
-		if err != nil {
-			return nil, err
+			extHdrs, tr.err = parsePAX(tr)
+			if tr.err != nil {
+				return nil, tr.err
 			}
-		// We actually read the whole file,
-		// but this skips alignment padding
-		tr.skipUnread()
-		hdr = tr.readHeader()
-		mergePAX(hdr, headers)
+			continue loop // This is a meta header affecting the next header
+		case TypeGNULongName, TypeGNULongLink:
+			var realname []byte
+			realname, tr.err = ioutil.ReadAll(tr)
+			if tr.err != nil {
+				return nil, tr.err
+			}
+
+			if tr.RawAccounting {
+				if _, tr.err = tr.rawBytes.Write(realname); tr.err != nil {
+					return nil, tr.err
+				}
+			}
+
+			// Convert GNU extensions to use PAX headers.
+			if extHdrs == nil {
+				extHdrs = make(map[string]string)
+			}
+			var p parser
+			switch hdr.Typeflag {
+			case TypeGNULongName:
+				extHdrs[paxPath] = p.parseString(realname)
+			case TypeGNULongLink:
+				extHdrs[paxLinkpath] = p.parseString(realname)
+			}
+			if p.err != nil {
+				tr.err = p.err
+				return nil, tr.err
+			}
+			continue loop // This is a meta header affecting the next header
+		default:
+			mergePAX(hdr, extHdrs)

 			// Check for a PAX format sparse file
-		sp, err := tr.checkForGNUSparsePAXHeaders(hdr, headers)
+			sp, err := tr.checkForGNUSparsePAXHeaders(hdr, extHdrs)
 			if err != nil {
 				tr.err = err
 				return nil, err
@ -150,55 +218,15 @@ func (tr *Reader) Next() (*Header, error) {
 			if sp != nil {
 				// Current file is a PAX format GNU sparse file.
 				// Set the current file reader to a sparse file reader.
-			tr.curr = &sparseFileReader{rfr: tr.curr.(*regFileReader), sp: sp, tot: hdr.Size}
+				tr.curr, tr.err = newSparseFileReader(tr.curr, sp, hdr.Size)
+				if tr.err != nil {
+					return nil, tr.err
+				}
+			}
+			break loop // This is a file, so stop
+		}
 	}
 	return hdr, nil
-	case TypeGNULongName:
-		// We have a GNU long name header. Its contents are the real file name.
-		realname, err := ioutil.ReadAll(tr)
-		if err != nil {
-			return nil, err
-		}
-		var b []byte
-		if tr.RawAccounting {
-			if _, err = tr.rawBytes.Write(realname); err != nil {
-				return nil, err
-			}
-			b = tr.RawBytes()
-		}
-		hdr, err := tr.Next()
-		// since the above call to Next() resets the buffer, we need to throw the bytes over
-		if tr.RawAccounting {
-			if _, err = tr.rawBytes.Write(b); err != nil {
-				return nil, err
-			}
-		}
-		hdr.Name = cString(realname)
-		return hdr, err
-	case TypeGNULongLink:
-		// We have a GNU long link header.
-		realname, err := ioutil.ReadAll(tr)
-		if err != nil {
-			return nil, err
-		}
-		var b []byte
-		if tr.RawAccounting {
-			if _, err = tr.rawBytes.Write(realname); err != nil {
-				return nil, err
-			}
-			b = tr.RawBytes()
-		}
-		hdr, err := tr.Next()
-		// since the above call to Next() resets the buffer, we need to throw the bytes over
-		if tr.RawAccounting {
-			if _, err = tr.rawBytes.Write(b); err != nil {
-				return nil, err
-			}
-		}
-		hdr.Linkname = cString(realname)
-		return hdr, err
-	}
-	return hdr, tr.err
 }

 // checkForGNUSparsePAXHeaders checks the PAX headers for GNU sparse headers. If they are found, then
@ -375,6 +403,7 @@ func parsePAX(r io.Reader) (map[string]string, error) {
 			return nil, err
 		}
 	}
+	sbuf := string(buf)

 	// For GNU PAX sparse format 0.0 support.
 	// This function transforms the sparse format 0.0 headers into sparse format 0.1 headers.
@ -383,35 +412,17 @@ func parsePAX(r io.Reader) (map[string]string, error) {
 	headers := make(map[string]string)
 	// Each record is constructed as
 	//     "%d %s=%s\n", length, keyword, value
-	for len(buf) > 0 {
-		// or the header was empty to start with.
-		var sp int
-		// The size field ends at the first space.
-		sp = bytes.IndexByte(buf, ' ')
-		if sp == -1 {
-			return nil, ErrHeader
-		}
-		// Parse the first token as a decimal integer.
-		n, err := strconv.ParseInt(string(buf[:sp]), 10, 0)
+	for len(sbuf) > 0 {
+		key, value, residual, err := parsePAXRecord(sbuf)
 		if err != nil {
 			return nil, ErrHeader
 		}
-		// Extract everything between the decimal and the n -1 on the
-		// beginning to eat the ' ', -1 on the end to skip the newline.
-		var record []byte
-		record, buf = buf[sp+1:n-1], buf[n:]
-		// The first equals is guaranteed to mark the end of the key.
-		// Everything else is value.
-		eq := bytes.IndexByte(record, '=')
-		if eq == -1 {
-			return nil, ErrHeader
-		}
-		key, value := record[:eq], record[eq+1:]
+		sbuf = residual

 		keyStr := string(key)
 		if keyStr == paxGNUSparseOffset || keyStr == paxGNUSparseNumBytes {
 			// GNU sparse format 0.0 special key. Write to sparseMap instead of using the headers map.
-			sparseMap.Write(value)
+			sparseMap.WriteString(value)
 			sparseMap.Write([]byte{','})
 		} else {
 			// Normal key. Set the value in the headers map.
@ -426,9 +437,42 @@ func parsePAX(r io.Reader) (map[string]string, error) {
 	return headers, nil
 }

-// cString parses bytes as a NUL-terminated C-style string.
+// parsePAXRecord parses the input PAX record string into a key-value pair.
+// If parsing is successful, it will slice off the currently read record and
+// return the remainder as r.
+//
+// A PAX record is of the following form:
+//	"%d %s=%s\n" % (size, key, value)
+func parsePAXRecord(s string) (k, v, r string, err error) {
+	// The size field ends at the first space.
+	sp := strings.IndexByte(s, ' ')
+	if sp == -1 {
+		return "", "", s, ErrHeader
+	}
+
+	// Parse the first token as a decimal integer.
+	n, perr := strconv.ParseInt(s[:sp], 10, 0) // Intentionally parse as native int
+	if perr != nil || n < 5 || int64(len(s)) < n {
+		return "", "", s, ErrHeader
+	}
+
+	// Extract everything between the space and the final newline.
+	rec, nl, rem := s[sp+1:n-1], s[n-1:n], s[n:]
+	if nl != "\n" {
+		return "", "", s, ErrHeader
+	}
+
+	// The first equals separates the key from the value.
+	eq := strings.IndexByte(rec, '=')
+	if eq == -1 {
+		return "", "", s, ErrHeader
+	}
+	return rec[:eq], rec[eq+1:], rem, nil
+}
+
+// parseString parses bytes as a NUL-terminated C-style string.
 // If a NUL byte is not found then the whole slice is returned as a string.
-func cString(b []byte) string {
+func (*parser) parseString(b []byte) string {
 	n := 0
 	for n < len(b) && b[n] != 0 {
 		n++
@ -436,19 +480,51 @@ func cString(b []byte) string {
 	return string(b[0:n])
 }

-func (tr *Reader) octal(b []byte) int64 {
-	// Check for binary format first.
+// parseNumeric parses the input as being encoded in either base-256 or octal.
+// This function may return negative numbers.
+// If parsing fails or an integer overflow occurs, err will be set.
+func (p *parser) parseNumeric(b []byte) int64 {
+	// Check for base-256 (binary) format first.
+	// If the first bit is set, then all following bits constitute a two's
+	// complement encoded number in big-endian byte order.
 	if len(b) > 0 && b[0]&0x80 != 0 {
-		var x int64
-		for i, c := range b {
-			if i == 0 {
-				c &= 0x7f // ignore signal bit in first byte
-			}
-			x = x<<8 | int64(c)
-		}
-		return x
+		// Handling negative numbers relies on the following identity:
+		//	-a-1 == ^a
+		//
+		// If the number is negative, we use an inversion mask to invert the
+		// data bytes and treat the value as an unsigned number.
+		var inv byte // 0x00 if positive or zero, 0xff if negative
+		if b[0]&0x40 != 0 {
+			inv = 0xff
 		}

+		var x uint64
+		for i, c := range b {
+			c ^= inv // Inverts c only if inv is 0xff, otherwise does nothing
+			if i == 0 {
+				c &= 0x7f // Ignore signal bit in first byte
+			}
+			if (x >> 56) > 0 {
+				p.err = ErrHeader // Integer overflow
+				return 0
+			}
+			x = x<<8 | uint64(c)
+		}
+		if (x >> 63) > 0 {
+			p.err = ErrHeader // Integer overflow
+			return 0
+		}
+		if inv == 0xff {
+			return ^int64(x)
+		}
+		return int64(x)
+	}
+
+	// Normal case is base-8 (octal) format.
+	return p.parseOctal(b)
+}
+
+func (p *parser) parseOctal(b []byte) int64 {
 	// Because unused fields are filled with NULs, we need
 	// to skip leading NULs. Fields may also be padded with
 	// spaces or NULs.
@ -459,27 +535,55 @@ func (tr *Reader) octal(b []byte) int64 {
 	if len(b) == 0 {
 		return 0
 	}
-	x, err := strconv.ParseUint(cString(b), 8, 64)
-	if err != nil {
-		tr.err = err
+	x, perr := strconv.ParseUint(p.parseString(b), 8, 64)
+	if perr != nil {
+		p.err = ErrHeader
 	}
 	return int64(x)
 }

-// skipUnread skips any unread bytes in the existing file entry, as well as any alignment padding.
-func (tr *Reader) skipUnread() {
-	nr := tr.numBytes() + tr.pad // number of bytes to skip
+// skipUnread skips any unread bytes in the existing file entry, as well as any
+// alignment padding. It returns io.ErrUnexpectedEOF if any io.EOF is
+// encountered in the data portion; it is okay to hit io.EOF in the padding.
+//
+// Note that this function still works properly even when sparse files are being
+// used since numBytes returns the bytes remaining in the underlying io.Reader.
+func (tr *Reader) skipUnread() error {
+	dataSkip := tr.numBytes()      // Number of data bytes to skip
+	totalSkip := dataSkip + tr.pad // Total number of bytes to skip
 	tr.curr, tr.pad = nil, 0
 	if tr.RawAccounting {
-		_, tr.err = io.CopyN(tr.rawBytes, tr.r, nr)
-		return
+		_, tr.err = io.CopyN(tr.rawBytes, tr.r, totalSkip)
+		return tr.err
 	}
-	if sr, ok := tr.r.(io.Seeker); ok {
-		if _, err := sr.Seek(nr, os.SEEK_CUR); err == nil {
-			return
+	// If possible, Seek to the last byte before the end of the data section.
+	// Do this because Seek is often lazy about reporting errors; this will mask
+	// the fact that the tar stream may be truncated. We can rely on the
+	// io.CopyN done shortly afterwards to trigger any IO errors.
+	var seekSkipped int64 // Number of bytes skipped via Seek
+	if sr, ok := tr.r.(io.Seeker); ok && dataSkip > 1 {
+		// Not all io.Seeker can actually Seek. For example, os.Stdin implements
+		// io.Seeker, but calling Seek always returns an error and performs
+		// no action. Thus, we try an innocent seek to the current position
+		// to see if Seek is really supported.
+		pos1, err := sr.Seek(0, os.SEEK_CUR)
+		if err == nil {
+			// Seek seems supported, so perform the real Seek.
+			pos2, err := sr.Seek(dataSkip-1, os.SEEK_CUR)
+			if err != nil {
+				tr.err = err
+				return tr.err
+			}
+			seekSkipped = pos2 - pos1
 		}
 	}
-	_, tr.err = io.CopyN(ioutil.Discard, tr.r, nr)
+
+	var copySkipped int64 // Number of bytes skipped via CopyN
+	copySkipped, tr.err = io.CopyN(ioutil.Discard, tr.r, totalSkip-seekSkipped)
+	if tr.err == io.EOF && seekSkipped+copySkipped < dataSkip {
+		tr.err = io.ErrUnexpectedEOF
+	}
+	return tr.err
 }

 func (tr *Reader) verifyChecksum(header []byte) bool {
@ -487,23 +591,32 @@ func (tr *Reader) verifyChecksum(header []byte) bool {
 		return false
 	}

-	given := tr.octal(header[148:156])
+	var p parser
+	given := p.parseOctal(header[148:156])
 	unsigned, signed := checksum(header)
-	return given == unsigned || given == signed
+	return p.err == nil && (given == unsigned || given == signed)
 }

+// readHeader reads the next block header and assumes that the underlying reader
+// is already aligned to a block boundary.
+//
+// The err will be set to io.EOF only when one of the following occurs:
+//	* Exactly 0 bytes are read and EOF is hit.
+//	* Exactly 1 block of zeros is read and EOF is hit.
+//	* At least 2 blocks of zeros are read.
 func (tr *Reader) readHeader() *Header {
 	header := tr.hdrBuff[:]
 	copy(header, zeroBlock)

-	if _, tr.err = io.ReadFull(tr.r, header); tr.err != nil {
+	if n, err := io.ReadFull(tr.r, header); err != nil {
+		tr.err = err
 		// because it could read some of the block, but reach EOF first
 		if tr.err == io.EOF && tr.RawAccounting {
-			if _, tr.err = tr.rawBytes.Write(header); tr.err != nil {
-				return nil
+			if _, err := tr.rawBytes.Write(header[:n]); err != nil {
+				tr.err = err
 			}
 		}
-		return nil
+		return nil // io.EOF is okay here
 	}
 	if tr.RawAccounting {
 		if _, tr.err = tr.rawBytes.Write(header); tr.err != nil {
@ -513,14 +626,15 @@ func (tr *Reader) readHeader() *Header {

 	// Two blocks of zero bytes marks the end of the archive.
 	if bytes.Equal(header, zeroBlock[0:blockSize]) {
-		if _, tr.err = io.ReadFull(tr.r, header); tr.err != nil {
+		if n, err := io.ReadFull(tr.r, header); err != nil {
+			tr.err = err
 			// because it could read some of the block, but reach EOF first
 			if tr.err == io.EOF && tr.RawAccounting {
-				if _, tr.err = tr.rawBytes.Write(header); tr.err != nil {
-					return nil
+				if _, err := tr.rawBytes.Write(header[:n]); err != nil {
+					tr.err = err
 				}
 			}
-			return nil
+			return nil // io.EOF is okay here
 		}
 		if tr.RawAccounting {
 			if _, tr.err = tr.rawBytes.Write(header); tr.err != nil {
@ -541,18 +655,19 @@ func (tr *Reader) readHeader() *Header {
 	}

 	// Unpack
+	var p parser
 	hdr := new(Header)
 	s := slicer(header)

-	hdr.Name = cString(s.next(100))
-	hdr.Mode = tr.octal(s.next(8))
-	hdr.Uid = int(tr.octal(s.next(8)))
-	hdr.Gid = int(tr.octal(s.next(8)))
-	hdr.Size = tr.octal(s.next(12))
-	hdr.ModTime = time.Unix(tr.octal(s.next(12)), 0)
+	hdr.Name = p.parseString(s.next(100))
+	hdr.Mode = p.parseNumeric(s.next(8))
+	hdr.Uid = int(p.parseNumeric(s.next(8)))
+	hdr.Gid = int(p.parseNumeric(s.next(8)))
+	hdr.Size = p.parseNumeric(s.next(12))
+	hdr.ModTime = time.Unix(p.parseNumeric(s.next(12)), 0)
 	s.next(8) // chksum
 	hdr.Typeflag = s.next(1)[0]
-	hdr.Linkname = cString(s.next(100))
+	hdr.Linkname = p.parseString(s.next(100))

 	// The remainder of the header depends on the value of magic.
 	// The original (v7) version of tar had no explicit magic field,
@ -572,70 +687,76 @@ func (tr *Reader) readHeader() *Header {

 	switch format {
 	case "posix", "gnu", "star":
-		hdr.Uname = cString(s.next(32))
-		hdr.Gname = cString(s.next(32))
+		hdr.Uname = p.parseString(s.next(32))
+		hdr.Gname = p.parseString(s.next(32))
 		devmajor := s.next(8)
 		devminor := s.next(8)
 		if hdr.Typeflag == TypeChar || hdr.Typeflag == TypeBlock {
-			hdr.Devmajor = tr.octal(devmajor)
-			hdr.Devminor = tr.octal(devminor)
+			hdr.Devmajor = p.parseNumeric(devmajor)
+			hdr.Devminor = p.parseNumeric(devminor)
 		}
 		var prefix string
 		switch format {
 		case "posix", "gnu":
-			prefix = cString(s.next(155))
+			prefix = p.parseString(s.next(155))
 		case "star":
-			prefix = cString(s.next(131))
-			hdr.AccessTime = time.Unix(tr.octal(s.next(12)), 0)
-			hdr.ChangeTime = time.Unix(tr.octal(s.next(12)), 0)
+			prefix = p.parseString(s.next(131))
+			hdr.AccessTime = time.Unix(p.parseNumeric(s.next(12)), 0)
+			hdr.ChangeTime = time.Unix(p.parseNumeric(s.next(12)), 0)
 		}
 		if len(prefix) > 0 {
 			hdr.Name = prefix + "/" + hdr.Name
 		}
 	}

-	if tr.err != nil {
+	if p.err != nil {
+		tr.err = p.err
+		return nil
+	}
+
+	nb := hdr.Size
+	if isHeaderOnlyType(hdr.Typeflag) {
+		nb = 0
+	}
+	if nb < 0 {
 		tr.err = ErrHeader
 		return nil
 	}

-	// Maximum value of hdr.Size is 64 GB (12 octal digits),
-	// so there's no risk of int64 overflowing.
-	nb := int64(hdr.Size)
-	tr.pad = -nb & (blockSize - 1) // blockSize is a power of two
-
 	// Set the current file reader.
+	tr.pad = -nb & (blockSize - 1) // blockSize is a power of two
 	tr.curr = &regFileReader{r: tr.r, nb: nb}

 	// Check for old GNU sparse format entry.
 	if hdr.Typeflag == TypeGNUSparse {
 		// Get the real size of the file.
-		hdr.Size = tr.octal(header[483:495])
+		hdr.Size = p.parseNumeric(header[483:495])
+		if p.err != nil {
+			tr.err = p.err
+			return nil
+		}

 		// Read the sparse map.
 		sp := tr.readOldGNUSparseMap(header)
 		if tr.err != nil {
 			return nil
 		}
+
 		// Current file is a GNU sparse file. Update the current file reader.
-		tr.curr = &sparseFileReader{rfr: tr.curr.(*regFileReader), sp: sp, tot: hdr.Size}
+		tr.curr, tr.err = newSparseFileReader(tr.curr, sp, hdr.Size)
+		if tr.err != nil {
+			return nil
+		}
 	}

 	return hdr
 }

-// A sparseEntry holds a single entry in a sparse file's sparse map.
-// A sparse entry indicates the offset and size in a sparse file of a
-// block of data.
-type sparseEntry struct {
-	offset   int64
-	numBytes int64
-}
-
 // readOldGNUSparseMap reads the sparse map as stored in the old GNU sparse format.
 // The sparse map is stored in the tar header if it's small enough. If it's larger than four entries,
 // then one or more extension headers are used to store the rest of the sparse map.
 func (tr *Reader) readOldGNUSparseMap(header []byte) []sparseEntry {
+	var p parser
 	isExtended := header[oldGNUSparseMainHeaderIsExtendedOffset] != 0
 	spCap := oldGNUSparseMainHeaderNumEntries
 	if isExtended {
@ -646,10 +767,10 @@ func (tr *Reader) readOldGNUSparseMap(header []byte) []sparseEntry {

 	// Read the four entries from the main tar header
 	for i := 0; i < oldGNUSparseMainHeaderNumEntries; i++ {
-		offset := tr.octal(s.next(oldGNUSparseOffsetSize))
-		numBytes := tr.octal(s.next(oldGNUSparseNumBytesSize))
-		if tr.err != nil {
-			tr.err = ErrHeader
+		offset := p.parseNumeric(s.next(oldGNUSparseOffsetSize))
+		numBytes := p.parseNumeric(s.next(oldGNUSparseNumBytesSize))
+		if p.err != nil {
+			tr.err = p.err
 			return nil
 		}
 		if offset == 0 && numBytes == 0 {
@ -673,10 +794,10 @@ func (tr *Reader) readOldGNUSparseMap(header []byte) []sparseEntry {
 		isExtended = sparseHeader[oldGNUSparseExtendedHeaderIsExtendedOffset] != 0
 		s = slicer(sparseHeader)
 		for i := 0; i < oldGNUSparseExtendedHeaderNumEntries; i++ {
-			offset := tr.octal(s.next(oldGNUSparseOffsetSize))
-			numBytes := tr.octal(s.next(oldGNUSparseNumBytesSize))
-			if tr.err != nil {
-				tr.err = ErrHeader
+			offset := p.parseNumeric(s.next(oldGNUSparseOffsetSize))
+			numBytes := p.parseNumeric(s.next(oldGNUSparseNumBytesSize))
+			if p.err != nil {
+				tr.err = p.err
 				return nil
 			}
 			if offset == 0 && numBytes == 0 {
@ -688,134 +809,111 @@ func (tr *Reader) readOldGNUSparseMap(header []byte) []sparseEntry {
 	return sp
 }

-// readGNUSparseMap1x0 reads the sparse map as stored in GNU's PAX sparse format version 1.0.
-// The sparse map is stored just before the file data and padded out to the nearest block boundary.
+// readGNUSparseMap1x0 reads the sparse map as stored in GNU's PAX sparse format
+// version 1.0. The format of the sparse map consists of a series of
+// newline-terminated numeric fields. The first field is the number of entries
+// and is always present. Following this are the entries, consisting of two
+// fields (offset, numBytes). This function must stop reading at the end
+// boundary of the block containing the last newline.
+//
+// Note that the GNU manual says that numeric values should be encoded in octal
+// format. However, the GNU tar utility itself outputs these values in decimal.
+// As such, this library treats values as being encoded in decimal.
 func readGNUSparseMap1x0(r io.Reader) ([]sparseEntry, error) {
-	buf := make([]byte, 2*blockSize)
-	sparseHeader := buf[:blockSize]
+	var cntNewline int64
+	var buf bytes.Buffer
+	var blk = make([]byte, blockSize)

-	// readDecimal is a helper function to read a decimal integer from the sparse map
-	// while making sure to read from the file in blocks of size blockSize
-	readDecimal := func() (int64, error) {
-		// Look for newline
-		nl := bytes.IndexByte(sparseHeader, '\n')
-		if nl == -1 {
-			if len(sparseHeader) >= blockSize {
-				// This is an error
-				return 0, ErrHeader
+	// feedTokens copies data in blockSize chunks from r into buf until there are
+	// at least cnt newlines in buf. It will not read more blocks than needed.
+	var feedTokens = func(cnt int64) error {
+		for cntNewline < cnt {
+			if _, err := io.ReadFull(r, blk); err != nil {
+				if err == io.EOF {
+					err = io.ErrUnexpectedEOF
 				}
-			oldLen := len(sparseHeader)
-			newLen := oldLen + blockSize
-			if cap(sparseHeader) < newLen {
-				// There's more header, but we need to make room for the next block
-				copy(buf, sparseHeader)
-				sparseHeader = buf[:newLen]
-			} else {
-				// There's more header, and we can just reslice
-				sparseHeader = sparseHeader[:newLen]
+				return err
+			}
+			buf.Write(blk)
+			for _, c := range blk {
+				if c == '\n' {
+					cntNewline++
+				}
+			}
+		}
+		return nil
 	}

-			// Now that sparseHeader is large enough, read next block
-			if _, err := io.ReadFull(r, sparseHeader[oldLen:newLen]); err != nil {
-				return 0, err
-			}
-			// leaving this function for io.Reader makes it more testable
-			if tr, ok := r.(*Reader); ok && tr.RawAccounting {
-				if _, err := tr.rawBytes.Write(sparseHeader[oldLen:newLen]); err != nil {
-					return 0, err
-				}
+	// nextToken gets the next token delimited by a newline. This assumes that
+	// at least one newline exists in the buffer.
+	var nextToken = func() string {
+		cntNewline--
+		tok, _ := buf.ReadString('\n')
+		return tok[:len(tok)-1] // Cut off newline
 	}

-			// Look for a newline in the new data
-			nl = bytes.IndexByte(sparseHeader[oldLen:newLen], '\n')
-			if nl == -1 {
-				// This is an error
-				return 0, ErrHeader
-			}
-			nl += oldLen // We want the position from the beginning
-		}
-		// Now that we've found a newline, read a number
-		n, err := strconv.ParseInt(string(sparseHeader[:nl]), 10, 0)
-		if err != nil {
-			return 0, ErrHeader
-		}
-
-		// Update sparseHeader to consume this number
-		sparseHeader = sparseHeader[nl+1:]
-		return n, nil
-	}
-
-	// Read the first block
-	if _, err := io.ReadFull(r, sparseHeader); err != nil {
-		return nil, err
-	}
-	// leaving this function for io.Reader makes it more testable
-	if tr, ok := r.(*Reader); ok && tr.RawAccounting {
-		if _, err := tr.rawBytes.Write(sparseHeader); err != nil {
+	// Parse for the number of entries.
+	// Use integer overflow resistant math to check this.
+	if err := feedTokens(1); err != nil {
 		return nil, err
 	}
+	numEntries, err := strconv.ParseInt(nextToken(), 10, 0) // Intentionally parse as native int
+	if err != nil || numEntries < 0 || int(2*numEntries) < int(numEntries) {
+		return nil, ErrHeader
 	}

-	// The first line contains the number of entries
-	numEntries, err := readDecimal()
-	if err != nil {
+	// Parse for all member entries.
+	// numEntries is trusted after this since a potential attacker must have
+	// committed resources proportional to what this library used.
+	if err := feedTokens(2 * numEntries); err != nil {
 		return nil, err
 	}
-
-	// Read all the entries
 	sp := make([]sparseEntry, 0, numEntries)
 	for i := int64(0); i < numEntries; i++ {
-		// Read the offset
-		offset, err := readDecimal()
+		offset, err := strconv.ParseInt(nextToken(), 10, 64)
 		if err != nil {
-			return nil, err
+			return nil, ErrHeader
 		}
-		// Read numBytes
-		numBytes, err := readDecimal()
+		numBytes, err := strconv.ParseInt(nextToken(), 10, 64)
 		if err != nil {
-			return nil, err
+			return nil, ErrHeader
 		}
-
 		sp = append(sp, sparseEntry{offset: offset, numBytes: numBytes})
 	}
-
 	return sp, nil
 }

-// readGNUSparseMap0x1 reads the sparse map as stored in GNU's PAX sparse format version 0.1.
-// The sparse map is stored in the PAX headers.
-func readGNUSparseMap0x1(headers map[string]string) ([]sparseEntry, error) {
-	// Get number of entries
-	numEntriesStr, ok := headers[paxGNUSparseNumBlocks]
-	if !ok {
-		return nil, ErrHeader
-	}
-	numEntries, err := strconv.ParseInt(numEntriesStr, 10, 0)
-	if err != nil {
+// readGNUSparseMap0x1 reads the sparse map as stored in GNU's PAX sparse format
+// version 0.1. The sparse map is stored in the PAX headers.
+func readGNUSparseMap0x1(extHdrs map[string]string) ([]sparseEntry, error) {
+	// Get number of entries.
+	// Use integer overflow resistant math to check this.
+	numEntriesStr := extHdrs[paxGNUSparseNumBlocks]
+	numEntries, err := strconv.ParseInt(numEntriesStr, 10, 0) // Intentionally parse as native int
+	if err != nil || numEntries < 0 || int(2*numEntries) < int(numEntries) {
 		return nil, ErrHeader
 	}

-	sparseMap := strings.Split(headers[paxGNUSparseMap], ",")
-
-	// There should be two numbers in sparseMap for each entry
+	// There should be two numbers in sparseMap for each entry.
+	sparseMap := strings.Split(extHdrs[paxGNUSparseMap], ",")
 	if int64(len(sparseMap)) != 2*numEntries {
 		return nil, ErrHeader
 	}

-	// Loop through the entries in the sparse map
+	// Loop through the entries in the sparse map.
+	// numEntries is trusted now.
 	sp := make([]sparseEntry, 0, numEntries)
 	for i := int64(0); i < numEntries; i++ {
-		offset, err := strconv.ParseInt(sparseMap[2*i], 10, 0)
+		offset, err := strconv.ParseInt(sparseMap[2*i], 10, 64)
 		if err != nil {
 			return nil, ErrHeader
 		}
-		numBytes, err := strconv.ParseInt(sparseMap[2*i+1], 10, 0)
+		numBytes, err := strconv.ParseInt(sparseMap[2*i+1], 10, 64)
 		if err != nil {
 			return nil, ErrHeader
 		}
 		sp = append(sp, sparseEntry{offset: offset, numBytes: numBytes})
 	}
-
 	return sp, nil
 }

@ -832,10 +930,18 @@ func (tr *Reader) numBytes() int64 {
 // Read reads from the current entry in the tar archive.
 // It returns 0, io.EOF when it reaches the end of that entry,
 // until Next is called to advance to the next entry.
+//
+// Calling Read on special types like TypeLink, TypeSymlink, TypeChar,
+// TypeBlock, TypeDir, and TypeFifo returns 0, io.EOF regardless of what
+// the Header.Size claims.
 func (tr *Reader) Read(b []byte) (n int, err error) {
+	if tr.err != nil {
+		return 0, tr.err
+	}
 	if tr.curr == nil {
 		return 0, io.EOF
 	}
+
 	n, err = tr.curr.Read(b)
 	if err != nil && err != io.EOF {
 		tr.err = err
@ -865,9 +971,33 @@ func (rfr *regFileReader) numBytes() int64 {
 	return rfr.nb
 }

-// readHole reads a sparse file hole ending at offset toOffset
-func (sfr *sparseFileReader) readHole(b []byte, toOffset int64) int {
-	n64 := toOffset - sfr.pos
+// newSparseFileReader creates a new sparseFileReader, but validates all of the
+// sparse entries before doing so.
+func newSparseFileReader(rfr numBytesReader, sp []sparseEntry, total int64) (*sparseFileReader, error) {
+	if total < 0 {
+		return nil, ErrHeader // Total size cannot be negative
+	}
+
+	// Validate all sparse entries. These are the same checks as performed by
+	// the BSD tar utility.
+	for i, s := range sp {
+		switch {
+		case s.offset < 0 || s.numBytes < 0:
+			return nil, ErrHeader // Negative values are never okay
+		case s.offset > math.MaxInt64-s.numBytes:
+			return nil, ErrHeader // Integer overflow with large length
+		case s.offset+s.numBytes > total:
+			return nil, ErrHeader // Region extends beyond the "real" size
+		case i > 0 && sp[i-1].offset+sp[i-1].numBytes > s.offset:
+			return nil, ErrHeader // Regions can't overlap and must be in order
+		}
+	}
+	return &sparseFileReader{rfr: rfr, sp: sp, total: total}, nil
+}
+
+// readHole reads a sparse hole ending at endOffset.
+func (sfr *sparseFileReader) readHole(b []byte, endOffset int64) int {
+	n64 := endOffset - sfr.pos
 	if n64 > int64(len(b)) {
 		n64 = int64(len(b))
 	}
@ -881,46 +1011,54 @@ func (sfr *sparseFileReader) readHole(b []byte, toOffset int64) int {

 // Read reads the sparse file data in expanded form.
 func (sfr *sparseFileReader) Read(b []byte) (n int, err error) {
-	if len(sfr.sp) == 0 {
-		// No more data fragments to read from.
-		if sfr.pos < sfr.tot {
-			// We're in the last hole
-			n = sfr.readHole(b, sfr.tot)
-			return
-		}
-		// Otherwise, we're at the end of the file
-		return 0, io.EOF
-	}
-	if sfr.pos < sfr.sp[0].offset {
-		// We're in a hole
-		n = sfr.readHole(b, sfr.sp[0].offset)
-		return
+	// Skip past all empty fragments.
+	for len(sfr.sp) > 0 && sfr.sp[0].numBytes == 0 {
+		sfr.sp = sfr.sp[1:]
 	}

-	// We're not in a hole, so we'll read from the next data fragment
-	posInFragment := sfr.pos - sfr.sp[0].offset
-	bytesLeft := sfr.sp[0].numBytes - posInFragment
+	// If there are no more fragments, then it is possible that there
+	// is one last sparse hole.
+	if len(sfr.sp) == 0 {
+		// This behavior matches the BSD tar utility.
+		// However, GNU tar stops returning data even if sfr.total is unmet.
+		if sfr.pos < sfr.total {
+			return sfr.readHole(b, sfr.total), nil
+		}
+		return 0, io.EOF
+	}
+
+	// In front of a data fragment, so read a hole.
+	if sfr.pos < sfr.sp[0].offset {
+		return sfr.readHole(b, sfr.sp[0].offset), nil
+	}
+
+	// In a data fragment, so read from it.
+	// This math is overflow free since we verify that offset and numBytes can
+	// be safely added when creating the sparseFileReader.
+	endPos := sfr.sp[0].offset + sfr.sp[0].numBytes // End offset of fragment
+	bytesLeft := endPos - sfr.pos                   // Bytes left in fragment
 	if int64(len(b)) > bytesLeft {
-		b = b[0:bytesLeft]
+		b = b[:bytesLeft]
 	}

 	n, err = sfr.rfr.Read(b)
 	sfr.pos += int64(n)
-
-	if int64(n) == bytesLeft {
-		// We're done with this fragment
-		sfr.sp = sfr.sp[1:]
+	if err == io.EOF {
+		if sfr.pos < endPos {
+			err = io.ErrUnexpectedEOF // There was supposed to be more data
+		} else if sfr.pos < sfr.total {
+			err = nil // There is still an implicit sparse hole at the end
+		}
 	}

-	if err == io.EOF && sfr.pos < sfr.tot {
-		// We reached the end of the last fragment's data, but there's a final hole
-		err = nil
+	if sfr.pos == endPos {
+		sfr.sp = sfr.sp[1:] // We are done with this fragment, so pop it
 	}
-	return
+	return n, err
 }

 // numBytes returns the number of bytes left to read in the sparse file's
 // sparse-encoded data in the tar archive.
 func (sfr *sparseFileReader) numBytes() int64 {
-	return sfr.rfr.nb
+	return sfr.rfr.numBytes()
 }
--- a/archive/tar/reader_test.go
+++ b/archive/tar/reader_test.go
--- a/archive/tar/tar_test.go
+++ b/archive/tar/tar_test.go
@ -97,10 +97,9 @@ func TestRoundTrip(t *testing.T) {
 		Name: "file.txt",
 		Uid:  1 << 21, // too big for 8 octal digits
 		Size: int64(len(data)),
-		ModTime: time.Now(),
+		// https://github.com/golang/go/commit/0e3355903d2ebcf5ee9e76096f51ac9a116a9dbb#diff-d7bf2a98d7b57b6ff754ca406f1b7581R105
+		ModTime: time.Now().AddDate(0, 0, 0).Round(1 * time.Second),
 	}
-	// tar only supports second precision.
-	hdr.ModTime = hdr.ModTime.Add(-time.Duration(hdr.ModTime.Nanosecond()) * time.Nanosecond)
 	if err := tw.WriteHeader(hdr); err != nil {
 		t.Fatalf("tw.WriteHeader: %v", err)
 	}
@ -147,17 +146,6 @@ func TestHeaderRoundTrip(t *testing.T) {
 			},
 			fm: 0644,
 		},
-		// hard link.
-		{
-			h: &Header{
-				Name:     "hard.txt",
-				Mode:     0644 | c_ISLNK,
-				Size:     0,
-				ModTime:  time.Unix(1360600916, 0),
-				Typeflag: TypeLink,
-			},
-			fm: 0644 | os.ModeSymlink,
-		},
 		// symbolic link.
 		{
 			h: &Header{
@ -246,6 +234,33 @@ func TestHeaderRoundTrip(t *testing.T) {
 			},
 			fm: 0600 | os.ModeSticky,
 		},
+		// hard link.
+		{
+			h: &Header{
+				Name:     "hard.txt",
+				Mode:     0644 | c_ISREG,
+				Size:     0,
+				Linkname: "file.txt",
+				ModTime:  time.Unix(1360600916, 0),
+				Typeflag: TypeLink,
+			},
+			fm: 0644,
+		},
+		// More information.
+		{
+			h: &Header{
+				Name:     "info.txt",
+				Mode:     0600 | c_ISREG,
+				Size:     0,
+				Uid:      1000,
+				Gid:      1000,
+				ModTime:  time.Unix(1360602540, 0),
+				Uname:    "slartibartfast",
+				Gname:    "users",
+				Typeflag: TypeReg,
+			},
+			fm: 0600,
+		},
 	}

 	for i, g := range golden {
@ -268,12 +283,37 @@ func TestHeaderRoundTrip(t *testing.T) {
 		if got, want := h2.Size, g.h.Size; got != want {
 			t.Errorf("i=%d: Size: got %v, want %v", i, got, want)
 		}
+		if got, want := h2.Uid, g.h.Uid; got != want {
+			t.Errorf("i=%d: Uid: got %d, want %d", i, got, want)
+		}
+		if got, want := h2.Gid, g.h.Gid; got != want {
+			t.Errorf("i=%d: Gid: got %d, want %d", i, got, want)
+		}
+		if got, want := h2.Uname, g.h.Uname; got != want {
+			t.Errorf("i=%d: Uname: got %q, want %q", i, got, want)
+		}
+		if got, want := h2.Gname, g.h.Gname; got != want {
+			t.Errorf("i=%d: Gname: got %q, want %q", i, got, want)
+		}
+		if got, want := h2.Linkname, g.h.Linkname; got != want {
+			t.Errorf("i=%d: Linkname: got %v, want %v", i, got, want)
+		}
+		if got, want := h2.Typeflag, g.h.Typeflag; got != want {
+			t.Logf("%#v %#v", g.h, fi.Sys())
+			t.Errorf("i=%d: Typeflag: got %q, want %q", i, got, want)
+		}
 		if got, want := h2.Mode, g.h.Mode; got != want {
 			t.Errorf("i=%d: Mode: got %o, want %o", i, got, want)
 		}
 		if got, want := fi.Mode(), g.fm; got != want {
 			t.Errorf("i=%d: fi.Mode: got %o, want %o", i, got, want)
 		}
+		if got, want := h2.AccessTime, g.h.AccessTime; got != want {
+			t.Errorf("i=%d: AccessTime: got %v, want %v", i, got, want)
+		}
+		if got, want := h2.ChangeTime, g.h.ChangeTime; got != want {
+			t.Errorf("i=%d: ChangeTime: got %v, want %v", i, got, want)
+		}
 		if got, want := h2.ModTime, g.h.ModTime; got != want {
 			t.Errorf("i=%d: ModTime: got %v, want %v", i, got, want)
 		}
--- a/archive/tar/testdata/gnu-multi-hdrs.tar
+++ b/archive/tar/testdata/gnu-multi-hdrs.tar
--- a/archive/tar/testdata/hardlink.tar
+++ b/archive/tar/testdata/hardlink.tar
--- a/archive/tar/testdata/hdr-only.tar
+++ b/archive/tar/testdata/hdr-only.tar
--- a/archive/tar/testdata/issue10968.tar
+++ b/archive/tar/testdata/issue10968.tar
--- a/archive/tar/testdata/issue11169.tar
+++ b/archive/tar/testdata/issue11169.tar
--- a/archive/tar/testdata/issue12435.tar
+++ b/archive/tar/testdata/issue12435.tar
--- a/archive/tar/testdata/neg-size.tar
+++ b/archive/tar/testdata/neg-size.tar
--- a/archive/tar/testdata/pax-multi-hdrs.tar
+++ b/archive/tar/testdata/pax-multi-hdrs.tar
--- a/archive/tar/testdata/pax-path-hdr.tar
+++ b/archive/tar/testdata/pax-path-hdr.tar
--- a/archive/tar/testdata/ustar-file-reg.tar
+++ b/archive/tar/testdata/ustar-file-reg.tar
--- a/archive/tar/writer.go
+++ b/archive/tar/writer.go
@ -12,8 +12,8 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"os"
 	"path"
+	"sort"
 	"strconv"
 	"strings"
 	"time"
@ -23,7 +23,6 @@ var (
 	ErrWriteTooLong    = errors.New("archive/tar: write too long")
 	ErrFieldTooLong    = errors.New("archive/tar: header field too long")
 	ErrWriteAfterClose = errors.New("archive/tar: write after close")
-	errNameTooLong     = errors.New("archive/tar: name too long")
 	errInvalidHeader   = errors.New("archive/tar: header field too long or contains invalid values")
 )

@ -43,6 +42,10 @@ type Writer struct {
 	paxHdrBuff [blockSize]byte // buffer to use in writeHeader when writing a pax header
 }

+type formatter struct {
+	err error // Last error seen
+}
+
 // NewWriter creates a new Writer writing to w.
 func NewWriter(w io.Writer) *Writer { return &Writer{w: w} }

@ -69,17 +72,9 @@ func (tw *Writer) Flush() error {
 }

 // Write s into b, terminating it with a NUL if there is room.
-// If the value is too long for the field and allowPax is true add a paxheader record instead
-func (tw *Writer) cString(b []byte, s string, allowPax bool, paxKeyword string, paxHeaders map[string]string) {
-	needsPaxHeader := allowPax && len(s) > len(b) || !isASCII(s)
-	if needsPaxHeader {
-		paxHeaders[paxKeyword] = s
-		return
-	}
+func (f *formatter) formatString(b []byte, s string) {
 	if len(s) > len(b) {
-		if tw.err == nil {
-			tw.err = ErrFieldTooLong
-		}
+		f.err = ErrFieldTooLong
 		return
 	}
 	ascii := toASCII(s)
@ -90,40 +85,40 @@ func (tw *Writer) cString(b []byte, s string, allowPax bool, paxKeyword string,
 }

 // Encode x as an octal ASCII string and write it into b with leading zeros.
-func (tw *Writer) octal(b []byte, x int64) {
+func (f *formatter) formatOctal(b []byte, x int64) {
 	s := strconv.FormatInt(x, 8)
 	// leading zeros, but leave room for a NUL.
 	for len(s)+1 < len(b) {
 		s = "0" + s
 	}
-	tw.cString(b, s, false, paxNone, nil)
+	f.formatString(b, s)
 }

-// Write x into b, either as octal or as binary (GNUtar/star extension).
-// If the value is too long for the field and writingPax is enabled both for the field and the add a paxheader record instead
-func (tw *Writer) numeric(b []byte, x int64, allowPax bool, paxKeyword string, paxHeaders map[string]string) {
-	// Try octal first.
-	s := strconv.FormatInt(x, 8)
-	if len(s) < len(b) {
-		tw.octal(b, x)
-		return
+// fitsInBase256 reports whether x can be encoded into n bytes using base-256
+// encoding. Unlike octal encoding, base-256 encoding does not require that the
+// string ends with a NUL character. Thus, all n bytes are available for output.
+//
+// If operating in binary mode, this assumes strict GNU binary mode, which means
+// that the first byte can only be either 0x80 or 0xff. Thus, the first byte is
+// equivalent to the sign bit in two's complement form.
+func fitsInBase256(n int, x int64) bool {
+	var binBits = uint(n-1) * 8
+	return n >= 9 || (x >= -1<<binBits && x < 1<<binBits)
 }

-	// If it is too long for octal, and pax is preferred, use a pax header
-	if allowPax && tw.preferPax {
-		tw.octal(b, 0)
-		s := strconv.FormatInt(x, 10)
-		paxHeaders[paxKeyword] = s
-		return
-	}
-
-	// Too big: use binary (big-endian).
-	tw.usedBinary = true
-	for i := len(b) - 1; x > 0 && i >= 0; i-- {
+// Write x into b in base-256 binary form (GNUtar/star extension); if x does not fit, write zero in octal and record ErrFieldTooLong.
+func (f *formatter) formatNumeric(b []byte, x int64) {
+	if fitsInBase256(len(b), x) {
+		for i := len(b) - 1; i >= 0; i-- {
 			b[i] = byte(x)
 			x >>= 8
 		}
-	b[0] |= 0x80 // highest bit indicates binary format
+		b[0] |= 0x80 // Highest bit indicates binary format
+		return
+	}
+
+	f.formatOctal(b, 0) // Last resort, just write zero
+	f.err = ErrFieldTooLong
 }

 var (
@ -162,6 +157,7 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error {
 	// subsecond time resolution, but for now let's just capture
 	// too long fields or non ascii characters

+	var f formatter
 	var header []byte

 	// We need to select which scratch buffer to use carefully,
@ -176,10 +172,40 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error {
 	copy(header, zeroBlock)
 	s := slicer(header)

+	// Wrappers around formatter that automatically set paxHeaders if the
+	// argument extends beyond the capacity of the input byte slice.
+	var formatString = func(b []byte, s string, paxKeyword string) {
+		needsPaxHeader := paxKeyword != paxNone && len(s) > len(b) || !isASCII(s)
+		if needsPaxHeader {
+			paxHeaders[paxKeyword] = s
+			return
+		}
+		f.formatString(b, s)
+	}
+	var formatNumeric = func(b []byte, x int64, paxKeyword string) {
+		// Try octal first.
+		s := strconv.FormatInt(x, 8)
+		if len(s) < len(b) {
+			f.formatOctal(b, x)
+			return
+		}
+
+		// If it is too long for octal, and PAX is preferred, use a PAX header.
+		if paxKeyword != paxNone && tw.preferPax {
+			f.formatOctal(b, 0)
+			s := strconv.FormatInt(x, 10)
+			paxHeaders[paxKeyword] = s
+			return
+		}
+
+		tw.usedBinary = true
+		f.formatNumeric(b, x)
+	}
+
 	// keep a reference to the filename to allow to overwrite it later if we detect that we can use ustar longnames instead of pax
 	pathHeaderBytes := s.next(fileNameSize)

-	tw.cString(pathHeaderBytes, hdr.Name, true, paxPath, paxHeaders)
+	formatString(pathHeaderBytes, hdr.Name, paxPath)

 	// Handle out of range ModTime carefully.
 	var modTime int64
@ -187,25 +213,25 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error {
 		modTime = hdr.ModTime.Unix()
 	}

-	tw.octal(s.next(8), hdr.Mode)                                   // 100:108
-	tw.numeric(s.next(8), int64(hdr.Uid), true, paxUid, paxHeaders) // 108:116
-	tw.numeric(s.next(8), int64(hdr.Gid), true, paxGid, paxHeaders) // 116:124
-	tw.numeric(s.next(12), hdr.Size, true, paxSize, paxHeaders)     // 124:136
-	tw.numeric(s.next(12), modTime, false, paxNone, nil)            // 136:148 --- consider using pax for finer granularity
+	f.formatOctal(s.next(8), hdr.Mode)               // 100:108
+	formatNumeric(s.next(8), int64(hdr.Uid), paxUid) // 108:116
+	formatNumeric(s.next(8), int64(hdr.Gid), paxGid) // 116:124
+	formatNumeric(s.next(12), hdr.Size, paxSize)     // 124:136
+	formatNumeric(s.next(12), modTime, paxNone)      // 136:148 --- consider using pax for finer granularity
 	s.next(8)                                        // chksum (148:156)
 	s.next(1)[0] = hdr.Typeflag                      // 156:157

-	tw.cString(s.next(100), hdr.Linkname, true, paxLinkpath, paxHeaders)
+	formatString(s.next(100), hdr.Linkname, paxLinkpath)

 	copy(s.next(8), []byte("ustar\x0000"))          // 257:265
-	tw.cString(s.next(32), hdr.Uname, true, paxUname, paxHeaders) // 265:297
-	tw.cString(s.next(32), hdr.Gname, true, paxGname, paxHeaders) // 297:329
-	tw.numeric(s.next(8), hdr.Devmajor, false, paxNone, nil)      // 329:337
-	tw.numeric(s.next(8), hdr.Devminor, false, paxNone, nil)      // 337:345
+	formatString(s.next(32), hdr.Uname, paxUname)   // 265:297
+	formatString(s.next(32), hdr.Gname, paxGname)   // 297:329
+	formatNumeric(s.next(8), hdr.Devmajor, paxNone) // 329:337
+	formatNumeric(s.next(8), hdr.Devminor, paxNone) // 337:345

 	// keep a reference to the prefix to allow to overwrite it later if we detect that we can use ustar longnames instead of pax
 	prefixHeaderBytes := s.next(155)
-	tw.cString(prefixHeaderBytes, "", false, paxNone, nil) // 345:500  prefix
+	formatString(prefixHeaderBytes, "", paxNone) // 345:500  prefix

 	// Use the GNU magic instead of POSIX magic if we used any GNU extensions.
 	if tw.usedBinary {
@ -215,37 +241,26 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error {
 	_, paxPathUsed := paxHeaders[paxPath]
 	// try to use a ustar header when only the name is too long
 	if !tw.preferPax && len(paxHeaders) == 1 && paxPathUsed {
-		suffix := hdr.Name
-		prefix := ""
-		if len(hdr.Name) > fileNameSize && isASCII(hdr.Name) {
-			var err error
-			prefix, suffix, err = tw.splitUSTARLongName(hdr.Name)
-			if err == nil {
-				// ok we can use a ustar long name instead of pax, now correct the fields
-
-				// remove the path field from the pax header. this will suppress the pax header
+		prefix, suffix, ok := splitUSTARPath(hdr.Name)
+		if ok {
+			// Since we can encode in USTAR format, disable PAX header.
 			delete(paxHeaders, paxPath)

-				// update the path fields
-				tw.cString(pathHeaderBytes, suffix, false, paxNone, nil)
-				tw.cString(prefixHeaderBytes, prefix, false, paxNone, nil)
-
-				// Use the ustar magic if we used ustar long names.
-				if len(prefix) > 0 && !tw.usedBinary {
-					copy(header[257:265], []byte("ustar\x00"))
-				}
-			}
+			// Update the path fields
+			formatString(pathHeaderBytes, suffix, paxNone)
+			formatString(prefixHeaderBytes, prefix, paxNone)
 		}
 	}

 	// The chksum field is terminated by a NUL and a space.
 	// This is different from the other octal fields.
 	chksum, _ := checksum(header)
-	tw.octal(header[148:155], chksum)
+	f.formatOctal(header[148:155], chksum) // Never fails
 	header[155] = ' '

-	if tw.err != nil {
-		// problem with header; probably integer too big for a field.
+	// Check if there were any formatting errors.
+	if f.err != nil {
+		tw.err = f.err
 		return tw.err
 	}

@ -270,28 +285,25 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error {
 	return tw.err
 }

-// writeUSTARLongName splits a USTAR long name hdr.Name.
-// name must be < 256 characters. errNameTooLong is returned
-// if hdr.Name can't be split. The splitting heuristic
-// is compatible with gnu tar.
-func (tw *Writer) splitUSTARLongName(name string) (prefix, suffix string, err error) {
+// splitUSTARPath splits a path according to USTAR prefix and suffix rules.
+// If the path is not splittable, then it will return ("", "", false).
+func splitUSTARPath(name string) (prefix, suffix string, ok bool) {
 	length := len(name)
-	if length > fileNamePrefixSize+1 {
+	if length <= fileNameSize || !isASCII(name) {
+		return "", "", false
+	} else if length > fileNamePrefixSize+1 {
 		length = fileNamePrefixSize + 1
 	} else if name[length-1] == '/' {
 		length--
 	}
+
 	i := strings.LastIndex(name[:length], "/")
-	// nlen contains the resulting length in the name field.
-	// plen contains the resulting length in the prefix field.
-	nlen := len(name) - i - 1
-	plen := i
+	nlen := len(name) - i - 1 // nlen is length of suffix
+	plen := i                 // plen is length of prefix
 	if i <= 0 || nlen > fileNameSize || nlen == 0 || plen > fileNamePrefixSize {
-		err = errNameTooLong
-		return
+		return "", "", false
 	}
-	prefix, suffix = name[:i], name[i+1:]
-	return
+	return name[:i], name[i+1:], true
 }

 // writePaxHeader writes an extended pax header to the
@ -304,11 +316,11 @@ func (tw *Writer) writePAXHeader(hdr *Header, paxHeaders map[string]string) erro
 	// succeed, and seems harmless enough.
 	ext.ModTime = hdr.ModTime
 	// The spec asks that we namespace our pseudo files
-	// with the current pid.
-	pid := os.Getpid()
+	// with the current pid.  However, this results in differing outputs
+	// for identical inputs.  As such, the constant 0 is now used instead.
+	// golang.org/issue/12358
 	dir, file := path.Split(hdr.Name)
-	fullName := path.Join(dir,
-		fmt.Sprintf("PaxHeaders.%d", pid), file)
+	fullName := path.Join(dir, "PaxHeaders.0", file)

 	ascii := toASCII(fullName)
 	if len(ascii) > 100 {
@ -318,8 +330,15 @@ func (tw *Writer) writePAXHeader(hdr *Header, paxHeaders map[string]string) erro
 	// Construct the body
 	var buf bytes.Buffer

-	for k, v := range paxHeaders {
-		fmt.Fprint(&buf, paxHeader(k+"="+v))
+	// Keys are sorted before writing to body to allow deterministic output.
+	var keys []string
+	for k := range paxHeaders {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+
+	for _, k := range keys {
+		fmt.Fprint(&buf, formatPAXRecord(k, paxHeaders[k]))
 	}

 	ext.Size = int64(len(buf.Bytes()))
@ -335,17 +354,18 @@ func (tw *Writer) writePAXHeader(hdr *Header, paxHeaders map[string]string) erro
 	return nil
 }

-// paxHeader formats a single pax record, prefixing it with the appropriate length
-func paxHeader(msg string) string {
-	const padding = 2 // Extra padding for space and newline
-	size := len(msg) + padding
+// formatPAXRecord formats a single PAX record, prefixing it with the
+// appropriate length.
+func formatPAXRecord(k, v string) string {
+	const padding = 3 // Extra padding for ' ', '=', and '\n'
+	size := len(k) + len(v) + padding
 	size += len(strconv.Itoa(size))
-	record := fmt.Sprintf("%d %s\n", size, msg)
+	record := fmt.Sprintf("%d %s=%s\n", size, k, v)
+
+	// Final adjustment if adding size field increased the record size.
 	if len(record) != size {
-		// Final adjustment if adding size increased
-		// the number of digits in size
 		size = len(record)
-		record = fmt.Sprintf("%d %s\n", size, msg)
+		record = fmt.Sprintf("%d %s=%s\n", size, k, v)
 	}
 	return record
 }
@ -355,7 +375,7 @@ func paxHeader(msg string) string {
 // hdr.Size bytes are written after WriteHeader.
 func (tw *Writer) Write(b []byte) (n int, err error) {
 	if tw.closed {
-		err = ErrWriteTooLong
+		err = ErrWriteAfterClose
 		return
 	}
 	overwrite := false
--- a/archive/tar/writer_test.go
+++ b/archive/tar/writer_test.go
@ -9,8 +9,10 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
+	"math"
 	"os"
 	"reflect"
+	"sort"
 	"strings"
 	"testing"
 	"testing/iotest"
@ -147,6 +149,44 @@ var writerTests = []*writerTest{
 			},
 		},
 	},
+	// This file was produced using gnu tar 1.26
+	// echo "Slartibartfast" > file.txt
+	// ln file.txt hard.txt
+	// tar -b 1 --format=ustar -c -f hardlink.tar file.txt hard.txt
+	{
+		file: "testdata/hardlink.tar",
+		entries: []*writerTestEntry{
+			{
+				header: &Header{
+					Name:     "file.txt",
+					Mode:     0644,
+					Uid:      1000,
+					Gid:      100,
+					Size:     15,
+					ModTime:  time.Unix(1425484303, 0),
+					Typeflag: '0',
+					Uname:    "vbatts",
+					Gname:    "users",
+				},
+				contents: "Slartibartfast\n",
+			},
+			{
+				header: &Header{
+					Name:     "hard.txt",
+					Mode:     0644,
+					Uid:      1000,
+					Gid:      100,
+					Size:     0,
+					ModTime:  time.Unix(1425484303, 0),
+					Typeflag: '1',
+					Linkname: "file.txt",
+					Uname:    "vbatts",
+					Gname:    "users",
+				},
+				// no contents
+			},
+		},
+	},
 }

 // Render byte array in a two-character hexadecimal string, spaced for easy visual inspection.
@ -253,7 +293,7 @@ func TestPax(t *testing.T) {
 		t.Fatal(err)
 	}
 	// Simple test to make sure PAX extensions are in effect
-	if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.")) {
+	if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.0")) {
 		t.Fatal("Expected at least one PAX header to be written.")
 	}
 	// Test that we can get a long name back out of the archive.
@ -292,7 +332,7 @@ func TestPaxSymlink(t *testing.T) {
 		t.Fatal(err)
 	}
 	// Simple test to make sure PAX extensions are in effect
-	if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.")) {
+	if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.0")) {
 		t.Fatal("Expected at least one PAX header to be written.")
 	}
 	// Test that we can get a long name back out of the archive.
@ -342,7 +382,7 @@ func TestPaxNonAscii(t *testing.T) {
 		t.Fatal(err)
 	}
 	// Simple test to make sure PAX extensions are in effect
-	if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.")) {
+	if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.0")) {
 		t.Fatal("Expected at least one PAX header to be written.")
 	}
 	// Test that we can get a long name back out of the archive.
@ -401,21 +441,49 @@ func TestPaxXattrs(t *testing.T) {
 	}
 }

-func TestPAXHeader(t *testing.T) {
-	medName := strings.Repeat("CD", 50)
-	longName := strings.Repeat("AB", 100)
-	paxTests := [][2]string{
-		{paxPath + "=/etc/hosts", "19 path=/etc/hosts\n"},
-		{"a=b", "6 a=b\n"},          // Single digit length
-		{"a=names", "11 a=names\n"}, // Test case involving carries
-		{paxPath + "=" + longName, fmt.Sprintf("210 path=%s\n", longName)},
-		{paxPath + "=" + medName, fmt.Sprintf("110 path=%s\n", medName)}}
-
-	for _, test := range paxTests {
-		key, expected := test[0], test[1]
-		if result := paxHeader(key); result != expected {
-			t.Fatalf("paxHeader: got %s, expected %s", result, expected)
+func TestPaxHeadersSorted(t *testing.T) {
+	fileinfo, err := os.Stat("testdata/small.txt")
+	if err != nil {
+		t.Fatal(err)
 	}
+	hdr, err := FileInfoHeader(fileinfo, "")
+	if err != nil {
+		t.Fatalf("os.Stat: %v", err)
+	}
+	contents := strings.Repeat(" ", int(hdr.Size))
+
+	hdr.Xattrs = map[string]string{
+		"foo": "foo",
+		"bar": "bar",
+		"baz": "baz",
+		"qux": "qux",
+	}
+
+	var buf bytes.Buffer
+	writer := NewWriter(&buf)
+	if err := writer.WriteHeader(hdr); err != nil {
+		t.Fatal(err)
+	}
+	if _, err = writer.Write([]byte(contents)); err != nil {
+		t.Fatal(err)
+	}
+	if err := writer.Close(); err != nil {
+		t.Fatal(err)
+	}
+	// Simple test to make sure PAX extensions are in effect
+	if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.0")) {
+		t.Fatal("Expected at least one PAX header to be written.")
+	}
+
+	// xattr bar should always appear before others
+	indices := []int{
+		bytes.Index(buf.Bytes(), []byte("bar=bar")),
+		bytes.Index(buf.Bytes(), []byte("baz=baz")),
+		bytes.Index(buf.Bytes(), []byte("foo=foo")),
+		bytes.Index(buf.Bytes(), []byte("qux=qux")),
+	}
+	if !sort.IntsAreSorted(indices) {
+		t.Fatal("PAX headers are not sorted")
 	}
 }

@ -489,3 +557,166 @@ func TestValidTypeflagWithPAXHeader(t *testing.T) {
 		}
 	}
 }
+
+func TestWriteAfterClose(t *testing.T) {
+	var buffer bytes.Buffer
+	tw := NewWriter(&buffer)
+
+	hdr := &Header{
+		Name: "small.txt",
+		Size: 5,
+	}
+	if err := tw.WriteHeader(hdr); err != nil {
+		t.Fatalf("Failed to write header: %s", err)
+	}
+	tw.Close()
+	if _, err := tw.Write([]byte("Kilts")); err != ErrWriteAfterClose {
+		t.Fatalf("Write: got %v; want ErrWriteAfterClose", err)
+	}
+}
+
+func TestSplitUSTARPath(t *testing.T) {
+	var sr = strings.Repeat
+
+	var vectors = []struct {
+		input  string // Input path
+		prefix string // Expected output prefix
+		suffix string // Expected output suffix
+		ok     bool   // Split success?
+	}{
+		{"", "", "", false},
+		{"abc", "", "", false},
+		{"用戶名", "", "", false},
+		{sr("a", fileNameSize), "", "", false},
+		{sr("a", fileNameSize) + "/", "", "", false},
+		{sr("a", fileNameSize) + "/a", sr("a", fileNameSize), "a", true},
+		{sr("a", fileNamePrefixSize) + "/", "", "", false},
+		{sr("a", fileNamePrefixSize) + "/a", sr("a", fileNamePrefixSize), "a", true},
+		{sr("a", fileNameSize+1), "", "", false},
+		{sr("/", fileNameSize+1), sr("/", fileNameSize-1), "/", true},
+		{sr("a", fileNamePrefixSize) + "/" + sr("b", fileNameSize),
+			sr("a", fileNamePrefixSize), sr("b", fileNameSize), true},
+		{sr("a", fileNamePrefixSize) + "//" + sr("b", fileNameSize), "", "", false},
+		{sr("a/", fileNameSize), sr("a/", 77) + "a", sr("a/", 22), true},
+	}
+
+	for _, v := range vectors {
+		prefix, suffix, ok := splitUSTARPath(v.input)
+		if prefix != v.prefix || suffix != v.suffix || ok != v.ok {
+			t.Errorf("splitUSTARPath(%q):\ngot  (%q, %q, %v)\nwant (%q, %q, %v)",
+				v.input, prefix, suffix, ok, v.prefix, v.suffix, v.ok)
+		}
+	}
+}
+
+func TestFormatPAXRecord(t *testing.T) {
+	var medName = strings.Repeat("CD", 50)
+	var longName = strings.Repeat("AB", 100)
+
+	var vectors = []struct {
+		inputKey string
+		inputVal string
+		output   string
+	}{
+		{"k", "v", "6 k=v\n"},
+		{"path", "/etc/hosts", "19 path=/etc/hosts\n"},
+		{"path", longName, "210 path=" + longName + "\n"},
+		{"path", medName, "110 path=" + medName + "\n"},
+		{"foo", "ba", "9 foo=ba\n"},
+		{"foo", "bar", "11 foo=bar\n"},
+		{"foo", "b=\nar=\n==\x00", "18 foo=b=\nar=\n==\x00\n"},
+		{"foo", "hello9 foo=ba\nworld", "27 foo=hello9 foo=ba\nworld\n"},
+		{"☺☻☹", "日a本b語ç", "27 ☺☻☹=日a本b語ç\n"},
+		{"\x00hello", "\x00world", "17 \x00hello=\x00world\n"},
+	}
+
+	for _, v := range vectors {
+		output := formatPAXRecord(v.inputKey, v.inputVal)
+		if output != v.output {
+			t.Errorf("formatPAXRecord(%q, %q): got %q, want %q",
+				v.inputKey, v.inputVal, output, v.output)
+		}
+	}
+}
+
+func TestFitsInBase256(t *testing.T) {
+	var vectors = []struct {
+		input int64
+		width int
+		ok    bool
+	}{
+		{+1, 8, true},
+		{0, 8, true},
+		{-1, 8, true},
+		{1 << 56, 8, false},
+		{(1 << 56) - 1, 8, true},
+		{-1 << 56, 8, true},
+		{(-1 << 56) - 1, 8, false},
+		{121654, 8, true},
+		{-9849849, 8, true},
+		{math.MaxInt64, 9, true},
+		{0, 9, true},
+		{math.MinInt64, 9, true},
+		{math.MaxInt64, 12, true},
+		{0, 12, true},
+		{math.MinInt64, 12, true},
+	}
+
+	for _, v := range vectors {
+		ok := fitsInBase256(v.width, v.input)
+		if ok != v.ok {
+			t.Errorf("checkNumeric(%d, %d): got %v, want %v", v.input, v.width, ok, v.ok)
+		}
+	}
+}
+
+func TestFormatNumeric(t *testing.T) {
+	var vectors = []struct {
+		input  int64
+		output string
+		ok     bool
+	}{
+		// Test base-256 (binary) encoded values.
+		{-1, "\xff", true},
+		{-1, "\xff\xff", true},
+		{-1, "\xff\xff\xff", true},
+		{(1 << 0), "0", false},
+		{(1 << 8) - 1, "\x80\xff", true},
+		{(1 << 8), "0\x00", false},
+		{(1 << 16) - 1, "\x80\xff\xff", true},
+		{(1 << 16), "00\x00", false},
+		{-1 * (1 << 0), "\xff", true},
+		{-1*(1<<0) - 1, "0", false},
+		{-1 * (1 << 8), "\xff\x00", true},
+		{-1*(1<<8) - 1, "0\x00", false},
+		{-1 * (1 << 16), "\xff\x00\x00", true},
+		{-1*(1<<16) - 1, "00\x00", false},
+		{537795476381659745, "0000000\x00", false},
+		{537795476381659745, "\x80\x00\x00\x00\x07\x76\xa2\x22\xeb\x8a\x72\x61", true},
+		{-615126028225187231, "0000000\x00", false},
+		{-615126028225187231, "\xff\xff\xff\xff\xf7\x76\xa2\x22\xeb\x8a\x72\x61", true},
+		{math.MaxInt64, "0000000\x00", false},
+		{math.MaxInt64, "\x80\x00\x00\x00\x7f\xff\xff\xff\xff\xff\xff\xff", true},
+		{math.MinInt64, "0000000\x00", false},
+		{math.MinInt64, "\xff\xff\xff\xff\x80\x00\x00\x00\x00\x00\x00\x00", true},
+		{math.MaxInt64, "\x80\x7f\xff\xff\xff\xff\xff\xff\xff", true},
+		{math.MinInt64, "\xff\x80\x00\x00\x00\x00\x00\x00\x00", true},
+	}
+
+	for _, v := range vectors {
+		var f formatter
+		output := make([]byte, len(v.output))
+		f.formatNumeric(output, v.input)
+		ok := (f.err == nil)
+		if ok != v.ok {
+			if v.ok {
+				t.Errorf("formatNumeric(%d): got formatting failure, want success", v.input)
+			} else {
+				t.Errorf("formatNumeric(%d): got formatting success, want failure", v.input)
+			}
+		}
+		if string(output) != v.output {
+			t.Errorf("formatNumeric(%d): got %q, want %q", v.input, output, v.output)
+		}
+	}
+}
--- a/cmd/tar-split/README.md
+++ b/cmd/tar-split/README.md
@ -0,0 +1,39 @@
+# tar-split utility
+
+## Installation
+
+	go get -u github.com/vbatts/tar-split/cmd/tar-split
+
+## Usage
+
+### Disassembly
+
+```bash
+$ sha256sum archive.tar 
+d734a748db93ec873392470510b8a1c88929abd8fae2540dc43d5b26f7537868  archive.tar
+$ mkdir ./x
+$ tar-split disasm --output tar-data.json.gz ./archive.tar | tar -C ./x -x
+time="2015-07-20T15:45:04-04:00" level=info msg="created tar-data.json.gz from ./archive.tar (read 204800 bytes)"
+```
+
+### Assembly
+
+```bash
+$ tar-split asm --output new.tar --input ./tar-data.json.gz  --path ./x/
+INFO[0000] created new.tar from ./x/ and ./tar-data.json.gz (wrote 204800 bytes)
+$ sha256sum new.tar 
+d734a748db93ec873392470510b8a1c88929abd8fae2540dc43d5b26f7537868  new.tar
+```
+
+### Estimating metadata size
+
+```bash
+$ tar-split checksize ./archive.tar
+inspecting "./archive.tar" (size 200k)
+ -- number of files: 28
+ -- size of metadata uncompressed: 28k
+ -- size of gzip compressed metadata: 1k
+```
+
+
+
--- a/cmd/tar-split/asm.go
+++ b/cmd/tar-split/asm.go
@ -0,0 +1,64 @@
+package main
+
+import (
+	"compress/gzip"
+	"io"
+	"os"
+
+	"github.com/Sirupsen/logrus"
+	"github.com/urfave/cli"
+	"github.com/vbatts/tar-split/tar/asm"
+	"github.com/vbatts/tar-split/tar/storage"
+)
+
+func CommandAsm(c *cli.Context) {
+	if len(c.Args()) > 0 {
+		logrus.Warnf("%d additional arguments passed are ignored", len(c.Args()))
+	}
+	if len(c.String("input")) == 0 {
+		logrus.Fatalf("--input filename must be set")
+	}
+	if len(c.String("output")) == 0 {
+		logrus.Fatalf("--output filename must be set ([FILENAME|-])")
+	}
+	if len(c.String("path")) == 0 {
+		logrus.Fatalf("--path must be set")
+	}
+
+	var outputStream io.Writer
+	if c.String("output") == "-" {
+		outputStream = os.Stdout
+	} else {
+		fh, err := os.Create(c.String("output"))
+		if err != nil {
+			logrus.Fatal(err)
+		}
+		defer fh.Close()
+		outputStream = fh
+	}
+
+	// Get the tar metadata reader
+	mf, err := os.Open(c.String("input"))
+	if err != nil {
+		logrus.Fatal(err)
+	}
+	defer mf.Close()
+	mfz, err := gzip.NewReader(mf)
+	if err != nil {
+		logrus.Fatal(err)
+	}
+	defer mfz.Close()
+
+	metaUnpacker := storage.NewJSONUnpacker(mfz)
+	// XXX maybe get the absolute path here
+	fileGetter := storage.NewPathFileGetter(c.String("path"))
+
+	ots := asm.NewOutputTarStream(fileGetter, metaUnpacker)
+	defer ots.Close()
+	i, err := io.Copy(outputStream, ots)
+	if err != nil {
+		logrus.Fatal(err)
+	}
+
+	logrus.Infof("created %s from %s and %s (wrote %d bytes)", c.String("output"), c.String("path"), c.String("input"), i)
+}
--- a/cmd/tar-split/checksize.go
+++ b/cmd/tar-split/checksize.go
@ -1,29 +1,25 @@
-// +build ignore
-
 package main

 import (
 	"archive/tar"
 	"compress/gzip"
-	"flag"
 	"fmt"
 	"io"
 	"io/ioutil"
 	"log"
 	"os"

+	"github.com/Sirupsen/logrus"
+	"github.com/urfave/cli"
 	"github.com/vbatts/tar-split/tar/asm"
 	"github.com/vbatts/tar-split/tar/storage"
 )

-var (
-	flCleanup = flag.Bool("cleanup", true, "cleanup tempfiles")
-)
-
-func main() {
-	flag.Parse()
-
-	for _, arg := range flag.Args() {
+func CommandChecksize(c *cli.Context) {
+	if len(c.Args()) == 0 {
+		logrus.Fatalf("please specify tar archives to check ('-' will check stdin)")
+	}
+	for _, arg := range c.Args() {
 		fh, err := os.Open(arg)
 		if err != nil {
 			log.Fatal(err)
@ -40,8 +36,10 @@ func main() {
 			log.Fatal(err)
 		}
 		defer packFh.Close()
-		if *flCleanup {
+		if !c.Bool("work") {
 			defer os.Remove(packFh.Name())
+		} else {
+			fmt.Printf(" -- working file preserved: %s\n", packFh.Name())
 		}

 		sp := storage.NewJSONPacker(packFh)
@ -83,7 +81,7 @@ func main() {
 			log.Fatal(err)
 		}
 		defer gzPackFh.Close()
-		if *flCleanup {
+		if !c.Bool("work") {
 			defer os.Remove(gzPackFh.Name())
 		}

--- a/cmd/tar-split/disasm.go
+++ b/cmd/tar-split/disasm.go
@ -0,0 +1,63 @@
+package main
+
+import (
+	"compress/gzip"
+	"io"
+	"io/ioutil"
+	"os"
+
+	"github.com/Sirupsen/logrus"
+	"github.com/urfave/cli"
+	"github.com/vbatts/tar-split/tar/asm"
+	"github.com/vbatts/tar-split/tar/storage"
+)
+
+func CommandDisasm(c *cli.Context) {
+	if len(c.Args()) != 1 {
+		logrus.Fatalf("please specify tar to be disassembled <NAME|->")
+	}
+	if len(c.String("output")) == 0 {
+		logrus.Fatalf("--output filename must be set")
+	}
+
+	// Set up the tar input stream
+	var inputStream io.Reader
+	if c.Args()[0] == "-" {
+		inputStream = os.Stdin
+	} else {
+		fh, err := os.Open(c.Args()[0])
+		if err != nil {
+			logrus.Fatal(err)
+		}
+		defer fh.Close()
+		inputStream = fh
+	}
+
+	// Set up the metadata storage
+	mf, err := os.OpenFile(c.String("output"), os.O_CREATE|os.O_WRONLY|os.O_TRUNC, os.FileMode(0600))
+	if err != nil {
+		logrus.Fatal(err)
+	}
+	defer mf.Close()
+	mfz := gzip.NewWriter(mf)
+	defer mfz.Close()
+	metaPacker := storage.NewJSONPacker(mfz)
+
+	// we're passing nil here for the file putter, because the ApplyDiff will
+	// handle the extraction of the archive
+	its, err := asm.NewInputTarStream(inputStream, metaPacker, nil)
+	if err != nil {
+		logrus.Fatal(err)
+	}
+	var out io.Writer
+	if c.Bool("no-stdout") {
+		out = ioutil.Discard
+	} else {
+		out = os.Stdout
+	}
+	i, err := io.Copy(out, its)
+	if err != nil {
+		logrus.Fatal(err)
+	}
+	logrus.Infof("created %s from %s (read %d bytes)", c.String("output"), c.Args()[0], i)
+}
--- a/cmd/tar-split/main.go
+++ b/cmd/tar-split/main.go
@ -0,0 +1,91 @@
+package main
+
+import (
+	"os"
+
+	"github.com/Sirupsen/logrus"
+	"github.com/urfave/cli"
+	"github.com/vbatts/tar-split/version"
+)
+
+func main() {
+	app := cli.NewApp()
+	app.Name = "tar-split"
+	app.Usage = "tar assembly and disassembly utility"
+	app.Version = version.VERSION
+	app.Author = "Vincent Batts"
+	app.Email = "vbatts@hashbangbash.com"
+	app.Action = cli.ShowAppHelp
+	app.Before = func(c *cli.Context) error {
+		logrus.SetOutput(os.Stderr)
+		if c.Bool("debug") {
+			logrus.SetLevel(logrus.DebugLevel)
+		}
+		return nil
+	}
+	app.Flags = []cli.Flag{
+		cli.BoolFlag{
+			Name:  "debug, D",
+			Usage: "debug output",
+			// defaults to false
+		},
+	}
+	app.Commands = []cli.Command{
+		{
+			Name:    "disasm",
+			Aliases: []string{"d"},
+			Usage:   "disassemble the input tar stream",
+			Action:  CommandDisasm,
+			Flags: []cli.Flag{
+				cli.StringFlag{
+					Name:  "output",
+					Value: "tar-data.json.gz",
+					Usage: "output of disassembled tar stream",
+				},
+				cli.BoolFlag{
+					Name:  "no-stdout",
+					Usage: "do not throughput the stream to STDOUT",
+				},
+			},
+		},
+		{
+			Name:    "asm",
+			Aliases: []string{"a"},
+			Usage:   "assemble tar stream",
+			Action:  CommandAsm,
+			Flags: []cli.Flag{
+				cli.StringFlag{
+					Name:  "input",
+					Value: "tar-data.json.gz",
+					Usage: "input of disassembled tar stream",
+				},
+				cli.StringFlag{
+					Name:  "output",
+					Value: "-",
+					Usage: "reassembled tar archive",
+				},
+				cli.StringFlag{
+					Name:  "path",
+					Value: "",
+					Usage: "relative path of extracted tar",
+				},
+			},
+		},
+		{
+			Name:   "checksize",
+			Usage:  "displays size estimates for metadata storage of a Tar archive",
+			Action: CommandChecksize,
+			Flags: []cli.Flag{
+				cli.BoolFlag{
+					Name:  "work",
+					Usage: "do not delete the working directory",
+					// defaults to false
+				},
+			},
+		},
+	}
+
+	if err := app.Run(os.Args); err != nil {
+		logrus.Fatal(err)
+	}
+}
--- a/concept/DESIGN.md
+++ b/concept/DESIGN.md
@ -0,0 +1,94 @@
+# Flow of TAR stream
+
+## `./archive/tar`
+
+The import path `github.com/vbatts/tar-split/archive/tar` is a fork of upstream golang stdlib [`archive/tar`](http://golang.org/pkg/archive/tar/).
+It adds plumbing to access raw bytes of the tar stream as the headers and payload are read.
+
+## Packer interface
+
+For ease of storage and usage of the raw bytes, there will be a storage
+interface, that accepts an io.Writer (This way you could pass it an in memory
+buffer or a file handle).
+
+Having a Packer interface can allow configuration of hash.Hash for file payloads
+and providing your own io.Writer.
+
+Instead of having a state directory to store all the header information for all
+Readers, we will leave that up to the user of the Reader, because we cannot
+assume an ID for each Reader to keep that information differentiated.
+
+## State Directory
+
+Perhaps we could deduplicate the header info, by hashing the rawbytes and
+storing them in a directory tree like:
+
+	./ac/dc/beef
+
+Then reference the hash of the header info, in the positional records for the
+tar stream. Though this could be a future feature, and not required for an
+initial implementation. Also, this would imply an owned state directory, rather
+than just writing storage info to an io.Writer.
+
+## Concept Example
+
+First we'll get an archive to work with. For repeatability, we'll make an
+archive from what you've just cloned:
+
+```
+git archive --format=tar -o tar-split.tar HEAD .
+```
+
+Then build the example main.go:
+
+```
+go build ./main.go
+```
+
+Now run the example over the archive:
+
+```
+$ ./main tar-split.tar
+2015/02/20 15:00:58 writing "tar-split.tar" to "tar-split.tar.out"
+pax_global_header pre: 512 read: 52
+.travis.yml pre: 972 read: 374
+DESIGN.md pre: 650 read: 1131
+LICENSE pre: 917 read: 1075
+README.md pre: 973 read: 4289
+archive/ pre: 831 read: 0
+archive/tar/ pre: 512 read: 0
+archive/tar/common.go pre: 512 read: 7790
+[...]
+tar/storage/entry_test.go pre: 667 read: 1137
+tar/storage/getter.go pre: 911 read: 2741
+tar/storage/getter_test.go pre: 843 read: 1491
+tar/storage/packer.go pre: 557 read: 3141
+tar/storage/packer_test.go pre: 955 read: 3096
+EOF padding: 1512
+Remainder: 512
+Size: 215040; Sum: 215040
+```
+
+*What are we seeing here?* 
+
+* `pre` is the header of a file entry, and potentially the padding from the
+  end of the prior file's payload. Also with particular tar extensions and pax
+  attributes, the header can exceed 512 bytes.
+* `read` is the size of the file payload from the entry
+* `EOF padding` is the expected 1024 null bytes on the end of a tar archive,
+  plus potential padding from the end of the prior file entry's payload
+* `Remainder` is the remaining bytes of an archive. This is typically dead space
+  as most tar implementations will return after having reached the end of the
+  1024 null bytes. Though various implementations will include some amount of
+  bytes here, which will affect the checksum of the resulting tar archive,
+  therefore this must be accounted for as well.
+
+Ideally the input tar and output `*.out` will match:
+
+```
+$ sha1sum tar-split.tar*
+ca9e19966b892d9ad5960414abac01ef585a1e22  tar-split.tar
+ca9e19966b892d9ad5960414abac01ef585a1e22  tar-split.tar.out
+```
+
+
--- a/concept/main.go
+++ b/concept/main.go
--- a/tar/asm/assemble.go
+++ b/tar/asm/assemble.go
@ -3,13 +3,15 @@ package asm
 import (
 	"bytes"
 	"fmt"
+	"hash"
 	"hash/crc64"
 	"io"
+	"sync"

 	"github.com/vbatts/tar-split/tar/storage"
 )

-// NewOutputTarStream returns an io.ReadCloser that is an assemble tar archive
+// NewOutputTarStream returns an io.ReadCloser that is an assembled tar archive
 // stream.
 //
 // It takes a storage.FileGetter, for mapping the file payloads that are to be read in,
@ -23,44 +25,106 @@ func NewOutputTarStream(fg storage.FileGetter, up storage.Unpacker) io.ReadClose
 	}
 	pr, pw := io.Pipe()
 	go func() {
+		err := WriteOutputTarStream(fg, up, pw)
+		if err != nil {
+			pw.CloseWithError(err)
+		} else {
+			pw.Close()
+		}
+	}()
+	return pr
+}
+
+// WriteOutputTarStream writes assembled tar archive to a writer.
+func WriteOutputTarStream(fg storage.FileGetter, up storage.Unpacker, w io.Writer) error {
+	// fg and up are interfaces, so either may be nil; return early rather than hit a nil pointer
+	if fg == nil || up == nil {
+		return nil
+	}
+	var copyBuffer []byte
+	var crcHash hash.Hash
+	var crcSum []byte
+	var multiWriter io.Writer
 	for {
 		entry, err := up.Next()
 		if err != nil {
-				pw.CloseWithError(err)
-				break
+			if err == io.EOF {
+				return nil
+			}
+			return err
 		}
 		switch entry.Type {
 		case storage.SegmentType:
-				if _, err := pw.Write(entry.Payload); err != nil {
-					pw.CloseWithError(err)
-					break
+			if _, err := w.Write(entry.Payload); err != nil {
+				return err
 			}
 		case storage.FileType:
 			if entry.Size == 0 {
 				continue
 			}
-				fh, err := fg.Get(entry.Name)
+			fh, err := fg.Get(entry.GetName())
 			if err != nil {
-					pw.CloseWithError(err)
-					break
+				return err
 			}
-				defer fh.Close()
-				c := crc64.New(storage.CRCTable)
-				tRdr := io.TeeReader(fh, c)
-				if _, err := io.Copy(pw, tRdr); err != nil {
-					pw.CloseWithError(err)
-					break
+			if crcHash == nil {
+				crcHash = crc64.New(storage.CRCTable)
+				crcSum = make([]byte, 8)
+				multiWriter = io.MultiWriter(w, crcHash)
+				copyBuffer = byteBufferPool.Get().([]byte)
+				defer byteBufferPool.Put(copyBuffer)
+			} else {
+				crcHash.Reset()
 			}
-				if !bytes.Equal(c.Sum(nil), entry.Payload) {
+
+			if _, err := copyWithBuffer(multiWriter, fh, copyBuffer); err != nil {
+				fh.Close()
+				return err
+			}
+
+			if !bytes.Equal(crcHash.Sum(crcSum[:0]), entry.Payload) {
 				// I would rather this be a comparable ErrInvalidChecksum or such,
 				// but since it's coming through the PipeReader, the context of
 				// _which_ file would be lost...
-					pw.CloseWithError(fmt.Errorf("file integrity checksum failed for %q", entry.Name))
+				fh.Close()
+				return fmt.Errorf("file integrity checksum failed for %q", entry.GetName())
+			}
+			fh.Close()
+		}
+	}
+}
+
+var byteBufferPool = &sync.Pool{
+	New: func() interface{} {
+		return make([]byte, 32*1024)
+	},
+}
+
+// copyWithBuffer is taken from stdlib io.Copy implementation
+// https://github.com/golang/go/blob/go1.5.1/src/io/io.go#L367
+func copyWithBuffer(dst io.Writer, src io.Reader, buf []byte) (written int64, err error) {
+	for {
+		nr, er := src.Read(buf)
+		if nr > 0 {
+			nw, ew := dst.Write(buf[0:nr])
+			if nw > 0 {
+				written += int64(nw)
+			}
+			if ew != nil {
+				err = ew
+				break
+			}
+			if nr != nw {
+				err = io.ErrShortWrite
 				break
 			}
 		}
+		if er == io.EOF {
+			break
 		}
-		pw.Close()
-	}()
-	return pr
+		if er != nil {
+			err = er
+			break
+		}
+	}
+	return written, err
 }
--- a/tar/asm/assemble_test.go
+++ b/tar/asm/assemble_test.go
@ -5,6 +5,7 @@ import (
 	"compress/gzip"
 	"crypto/sha1"
 	"fmt"
+	"hash/crc64"
 	"io"
 	"io/ioutil"
 	"os"
@ -33,48 +34,119 @@ var entries = []struct {
 			Payload: []byte{126, 72, 89, 239, 230, 252, 160, 187},
 			Size:    26,
 		},
-
+		Body: []byte("café con leche, por favor"),
+	},
+	{
+		Entry: storage.Entry{
+			Type:    storage.FileType,
+			NameRaw: []byte{0x66, 0x69, 0x6c, 0x65, 0x2d, 0xe4}, // this is invalid UTF-8. Just checking the round trip.
+			Payload: []byte{126, 72, 89, 239, 230, 252, 160, 187},
+			Size:    26,
+		},
+		Body: []byte("café con leche, por favor"),
+	},
+}
+var entriesMangled = []struct {
+	Entry storage.Entry
+	Body  []byte
+}{
+	{
+		Entry: storage.Entry{
+			Type:    storage.FileType,
+			Name:    "./hurr.txt",
+			Payload: []byte{3, 116, 164, 177, 171, 236, 107, 78},
+			Size:    20,
+		},
+		// switch
+		Body: []byte("imma derp til I hurr"),
+	},
+	{
+		Entry: storage.Entry{
+			Type:    storage.FileType,
+			Name:    "./ermahgerd.txt",
+			Payload: []byte{127, 72, 89, 239, 230, 252, 160, 187},
+			Size:    26,
+		},
+		// "sans", not "con"
+		Body: []byte("café sans leche, por favor"),
+	},
+	{
+		Entry: storage.Entry{
+			Type:    storage.FileType,
+			NameRaw: []byte{0x66, 0x69, 0x6c, 0x65, 0x2d, 0xe4},
+			Payload: []byte{127, 72, 89, 239, 230, 252, 160, 187},
+			Size:    26,
+		},
 		Body: []byte("café con leche, por favor"),
 	},
 }

-func TestTarStreamOld(t *testing.T) {
+func TestTarStreamMangledGetterPutter(t *testing.T) {
 	fgp := storage.NewBufferFileGetPutter()

 	// first lets prep a GetPutter and Packer
 	for i := range entries {
 		if entries[i].Entry.Type == storage.FileType {
-			j, csum, err := fgp.Put(entries[i].Entry.Name, bytes.NewBuffer(entries[i].Body))
+			j, csum, err := fgp.Put(entries[i].Entry.GetName(), bytes.NewBuffer(entries[i].Body))
 			if err != nil {
 				t.Error(err)
 			}
 			if j != entries[i].Entry.Size {
 				t.Errorf("size %q: expected %d; got %d",
-					entries[i].Entry.Name,
+					entries[i].Entry.GetName(),
 					entries[i].Entry.Size,
 					j)
 			}
 			if !bytes.Equal(csum, entries[i].Entry.Payload) {
 				t.Errorf("checksum %q: expected %v; got %v",
-					entries[i].Entry.Name,
+					entries[i].Entry.GetName(),
 					entries[i].Entry.Payload,
 					csum)
 			}
 		}
 	}

-	// next we'll use these to produce a tar stream.
-	_ = NewOutputTarStream(fgp, nil)
-	// TODO finish this
+	for _, e := range entriesMangled {
+		if e.Entry.Type == storage.FileType {
+			rdr, err := fgp.Get(e.Entry.GetName())
+			if err != nil {
+				t.Error(err)
+			}
+			c := crc64.New(storage.CRCTable)
+			i, err := io.Copy(c, rdr)
+			if err != nil {
+				t.Fatal(err)
+			}
+			rdr.Close()
+
+			csum := c.Sum(nil)
+			if bytes.Equal(csum, e.Entry.Payload) {
+				t.Errorf("wrote %d bytes. checksum for %q should not have matched! %v",
+					i,
+					e.Entry.GetName(),
+					csum)
+			}
+		}
+	}
+}
+
+var testCases = []struct {
+	path            string
+	expectedSHA1Sum string
+	expectedSize    int64
+}{
+	{"./testdata/t.tar.gz", "1eb237ff69bca6e22789ecb05b45d35ca307adbd", 10240},
+	{"./testdata/longlink.tar.gz", "d9f6babe107b7247953dff6b5b5ae31a3a880add", 20480},
+	{"./testdata/fatlonglink.tar.gz", "8537f03f89aeef537382f8b0bb065d93e03b0be8", 26234880},
+	{"./testdata/iso-8859.tar.gz", "ddafa51cb03c74ec117ab366ee2240d13bba1ec3", 10240},
+	{"./testdata/extranils.tar.gz", "e187b4b3e739deaccc257342f4940f34403dc588", 10648},
+	{"./testdata/notenoughnils.tar.gz", "72f93f41efd95290baa5c174c234f5d4c22ce601", 512},
 }

 func TestTarStream(t *testing.T) {
-	var (
-		expectedSum        = "1eb237ff69bca6e22789ecb05b45d35ca307adbd"
-		expectedSize int64 = 10240
-	)

-	fh, err := os.Open("./testdata/t.tar.gz")
+	for _, tc := range testCases {
+		fh, err := os.Open(tc.path)
 		if err != nil {
 			t.Fatal(err)
 		}
@ -98,22 +170,19 @@ func TestTarStream(t *testing.T) {

 		// get a sum of the stream after it has passed through to ensure it's the same.
 		h0 := sha1.New()
-	tRdr0 := io.TeeReader(tarStream, h0)
-
-	// read it all to the bit bucket
-	i, err := io.Copy(ioutil.Discard, tRdr0)
+		i, err := io.Copy(h0, tarStream)
 		if err != nil {
 			t.Fatal(err)
 		}

-	if i != expectedSize {
-		t.Errorf("size of tar: expected %d; got %d", expectedSize, i)
+		if i != tc.expectedSize {
+			t.Errorf("size of tar: expected %d; got %d", tc.expectedSize, i)
 		}
-	if fmt.Sprintf("%x", h0.Sum(nil)) != expectedSum {
-		t.Fatalf("checksum of tar: expected %s; got %x", expectedSum, h0.Sum(nil))
+		if fmt.Sprintf("%x", h0.Sum(nil)) != tc.expectedSHA1Sum {
+			t.Fatalf("checksum of tar: expected %s; got %x", tc.expectedSHA1Sum, h0.Sum(nil))
 		}

-	t.Logf("%s", w.String()) // if we fail, then show the packed info
+		//t.Logf("%s", w.String()) // if we fail, then show the packed info

 		// If we've made it this far, then we'll turn it around and create a tar
 		// stream from the packed metadata and buffered file contents.
@ -123,18 +192,65 @@ func TestTarStream(t *testing.T) {

 		rc := NewOutputTarStream(fgp, sup)
 		h1 := sha1.New()
-	tRdr1 := io.TeeReader(rc, h1)
-
-	// read it all to the bit bucket
-	i, err = io.Copy(ioutil.Discard, tRdr1)
+		i, err = io.Copy(h1, rc)
 		if err != nil {
 			t.Fatal(err)
 		}

-	if i != expectedSize {
-		t.Errorf("size of output tar: expected %d; got %d", expectedSize, i)
+		if i != tc.expectedSize {
+			t.Errorf("size of output tar: expected %d; got %d", tc.expectedSize, i)
+		}
+		if fmt.Sprintf("%x", h1.Sum(nil)) != tc.expectedSHA1Sum {
+			t.Fatalf("checksum of output tar: expected %s; got %x", tc.expectedSHA1Sum, h1.Sum(nil))
+		}
+	}
+}
+
+func BenchmarkAsm(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		for _, tc := range testCases {
+			func() {
+				fh, err := os.Open(tc.path)
+				if err != nil {
+					b.Fatal(err)
+				}
+				defer fh.Close()
+				gzRdr, err := gzip.NewReader(fh)
+				if err != nil {
+					b.Fatal(err)
+				}
+				defer gzRdr.Close()
+
+				// Setup where we'll store the metadata
+				w := bytes.NewBuffer([]byte{})
+				sp := storage.NewJSONPacker(w)
+				fgp := storage.NewBufferFileGetPutter()
+
+				// wrap the disassembly stream
+				tarStream, err := NewInputTarStream(gzRdr, sp, fgp)
+				if err != nil {
+					b.Fatal(err)
+				}
+				// read it all to the bit bucket
+				i1, err := io.Copy(ioutil.Discard, tarStream)
+				if err != nil {
+					b.Fatal(err)
+				}
+
+				r := bytes.NewBuffer(w.Bytes())
+				sup := storage.NewJSONUnpacker(r)
+				// and reuse the fgp that we Put the payloads to.
+
+				rc := NewOutputTarStream(fgp, sup)
+
+				i2, err := io.Copy(ioutil.Discard, rc)
+				if err != nil {
+					b.Fatal(err)
+				}
+				if i1 != i2 {
+					b.Errorf("%s: input(%d) and output(%d) byte count didn't match", tc.path, i1, i2)
+				}
+			}()
 		}
-	if fmt.Sprintf("%x", h1.Sum(nil)) != expectedSum {
-		t.Fatalf("checksum of output tar: expected %s; got %x", expectedSum, h1.Sum(nil))
 	}
 }
--- a/tar/asm/disassemble.go
+++ b/tar/asm/disassemble.go
@ -22,8 +22,8 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
 	// What to do here... folks will want their own access to the Reader that is
 	// their tar archive stream, but we'll need that same stream to use our
 	// forked 'archive/tar'.
-	// Perhaps do an io.TeeReader that hand back an io.Reader for them to read
-	// from, and we'll mitm the stream to store metadata.
+	// Perhaps do an io.TeeReader that hands back an io.Reader for them to read
+	// from, and we'll MITM the stream to store metadata.
 	// We'll need a storage.FilePutter too ...

 	// Another concern, whether to do any storage.FilePutter operations, such that we
@ -32,7 +32,7 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
 	// Perhaps we have a DiscardFilePutter that is a bit bucket.

 	// we'll return the pipe reader, since TeeReader does not buffer and will
-	// only read what the outputRdr Read's. Since Tar archive's have padding on
+	// only read what the outputRdr Read's. Since Tar archives have padding on
 	// the end, we want to be the one reading the padding, even if the user's
 	// `archive/tar` doesn't care.
 	pR, pW := io.Pipe()
@ -55,12 +55,15 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
 				}
 				// even when an EOF is reached, there is often 1024 null bytes on
 				// the end of an archive. Collect them too.
+				if b := tr.RawBytes(); len(b) > 0 {
 					_, err := p.AddEntry(storage.Entry{
 						Type:    storage.SegmentType,
-					Payload: tr.RawBytes(),
+						Payload: b,
 					})
 					if err != nil {
 						pW.CloseWithError(err)
+						return
+					}
 				}
 				break // not return. We need the end of the reader.
 			}
@ -68,11 +71,15 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
 				break // not return. We need the end of the reader.
 			}

-			if _, err := p.AddEntry(storage.Entry{
+			if b := tr.RawBytes(); len(b) > 0 {
+				_, err := p.AddEntry(storage.Entry{
 					Type:    storage.SegmentType,
-				Payload: tr.RawBytes(),
-			}); err != nil {
+					Payload: b,
+				})
+				if err != nil {
 					pW.CloseWithError(err)
+					return
+				}
 			}

 			var csum []byte
@ -81,18 +88,23 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
 				_, csum, err = fp.Put(hdr.Name, tr)
 				if err != nil {
 					pW.CloseWithError(err)
+					return
 				}
 			}

-			// File entries added, regardless of size
-			_, err = p.AddEntry(storage.Entry{
+			entry := storage.Entry{
 				Type:    storage.FileType,
-				Name:    hdr.Name,
 				Size:    hdr.Size,
 				Payload: csum,
-			})
+			}
+			// For proper marshalling of non-utf8 characters
+			entry.SetName(hdr.Name)
+
+			// File entries added, regardless of size
+			_, err = p.AddEntry(entry)
 			if err != nil {
 				pW.CloseWithError(err)
+				return
 			}

 			if b := tr.RawBytes(); len(b) > 0 {
@ -102,6 +114,7 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
 				})
 				if err != nil {
 					pW.CloseWithError(err)
+					return
 				}
 			}
 		}
@ -111,6 +124,7 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
 		remainder, err := ioutil.ReadAll(outputRdr)
 		if err != nil && err != io.EOF {
 			pW.CloseWithError(err)
+			return
 		}
 		_, err = p.AddEntry(storage.Entry{
 			Type:    storage.SegmentType,
@ -118,9 +132,9 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
 		})
 		if err != nil {
 			pW.CloseWithError(err)
-		} else {
-			pW.Close()
+			return
 		}
+		pW.Close()
 	}()

 	return pR, nil
--- a/tar/asm/testdata/extranils.tar.gz
+++ b/tar/asm/testdata/extranils.tar.gz
--- a/tar/asm/testdata/fatlonglink.tar.gz
+++ b/tar/asm/testdata/fatlonglink.tar.gz
--- a/tar/asm/testdata/iso-8859.tar.gz
+++ b/tar/asm/testdata/iso-8859.tar.gz
--- a/tar/asm/testdata/longlink.tar.gz
+++ b/tar/asm/testdata/longlink.tar.gz
--- a/tar/asm/testdata/notenoughnils.tar.gz
+++ b/tar/asm/testdata/notenoughnils.tar.gz
--- a/tar/storage/doc.go
+++ b/tar/storage/doc.go
@ -5,7 +5,7 @@ Packing and unpacking the Entries of the stream. The types of streams are
 either segments of raw bytes (for the raw headers and various padding) and for
 an entry marking a file payload.

-The raw bytes are stored precisely in the packed (marshalled) Entry. Where as
+The raw bytes are stored precisely in the packed (marshalled) Entry, whereas
 the file payload marker include the name of the file, size, and crc64 checksum
 (for basic file integrity).
 */
--- a/tar/storage/entry.go
+++ b/tar/storage/entry.go
@ -1,5 +1,7 @@
 package storage

+import "unicode/utf8"
+
 // Entries is for sorting by Position
 type Entries []Entry

@ -19,11 +21,11 @@ const (
 	// SegmentType represents a raw bytes segment from the archive stream. These raw
 	// byte segments consist of the raw headers and various padding.
 	//
-	// It's payload is to be marshalled base64 encoded.
+	// Its payload is to be marshalled base64 encoded.
 	SegmentType
 )

-// Entry is a the structure for packing and unpacking the information read from
+// Entry is the structure for packing and unpacking the information read from
 // the Tar archive.
 //
 // FileType Payload checksum is using `hash/crc64` for basic file integrity,
@ -32,8 +34,45 @@ const (
 // collisions in a sample of 18.2 million, CRC64 had none.
 type Entry struct {
 	Type     Type   `json:"type"`
-	Name     string `json:"name",omitempty`
-	Size     int64  `json:"size",omitempty`
-	Payload  []byte `json:"payload"` // SegmentType store payload here; FileType store crc64 checksum here;
+	Name     string `json:"name,omitempty"`
+	NameRaw  []byte `json:"name_raw,omitempty"`
+	Size     int64  `json:"size,omitempty"`
+	Payload  []byte `json:"payload"` // SegmentType stores payload here; FileType stores crc64 checksum here;
 	Position int    `json:"position"`
 }
+
+// SetName stores name in Name when it is valid UTF-8 and in NameRaw otherwise,
+// clearing the other field. See https://github.com/vbatts/tar-split/issues/17
+func (e *Entry) SetName(name string) {
+	if utf8.ValidString(name) {
+		e.Name, e.NameRaw = name, nil
+	} else {
+		e.Name, e.NameRaw = "", []byte(name)
+	}
+}
+
+// SetNameBytes stores name in Name when it is valid UTF-8 and in NameRaw (kept
+// as-is, not copied) otherwise, clearing the other field.
+func (e *Entry) SetNameBytes(name []byte) {
+	if utf8.Valid(name) {
+		e.Name, e.NameRaw = string(name), nil
+	} else {
+		e.Name, e.NameRaw = "", name
+	}
+}
+
+// GetName returns the entry's name: NameRaw (as a string) if non-empty, else Name.
+func (e *Entry) GetName() string {
+	if len(e.NameRaw) == 0 {
+		return e.Name
+	}
+	return string(e.NameRaw)
+}
+
+// GetNameBytes returns the entry's name as bytes: NameRaw if non-empty, else Name.
+func (e *Entry) GetNameBytes() []byte {
+	if len(e.NameRaw) == 0 {
+		return []byte(e.Name)
+	}
+	return e.NameRaw
+}
--- a/tar/storage/entry_test.go
+++ b/tar/storage/entry_test.go
@ -39,10 +39,10 @@ func TestEntries(t *testing.T) {
 func TestFile(t *testing.T) {
 	f := Entry{
 		Type:     FileType,
-		Name:     "./hello.txt",
 		Size:     100,
 		Position: 2,
 	}
+	f.SetName("./hello.txt")

 	buf, err := json.Marshal(f)
 	if err != nil {
@ -54,8 +54,37 @@ func TestFile(t *testing.T) {
 		t.Fatal(err)
 	}

-	if f.Name != f1.Name {
-		t.Errorf("expected Name %q, got %q", f.Name, f1.Name)
+	if f.GetName() != f1.GetName() {
+		t.Errorf("expected Name %q, got %q", f.GetName(), f1.GetName())
+	}
+	if f.Size != f1.Size {
+		t.Errorf("expected Size %q, got %q", f.Size, f1.Size)
+	}
+	if f.Position != f1.Position {
+		t.Errorf("expected Position %q, got %q", f.Position, f1.Position)
+	}
+}
+
+func TestFileRaw(t *testing.T) {
+	f := Entry{
+		Type:     FileType,
+		Size:     100,
+		Position: 2,
+	}
+	f.SetNameBytes([]byte{0x2E, 0x2F, 0x68, 0x65, 0x6C, 0x6C, 0x6F, 0xE4, 0x2E, 0x74, 0x78, 0x74})
+
+	buf, err := json.Marshal(f)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	f1 := Entry{}
+	if err = json.Unmarshal(buf, &f1); err != nil {
+		t.Fatal(err)
+	}
+
+	if f.GetName() != f1.GetName() {
+		t.Errorf("expected Name %q, got %q", f.GetName(), f1.GetName())
 	}
 	if f.Size != f1.Size {
 		t.Errorf("expected Size %q, got %q", f.Size, f1.Size)
--- a/tar/storage/getter.go
+++ b/tar/storage/getter.go
@ -5,25 +5,24 @@ import (
 	"errors"
 	"hash/crc64"
 	"io"
-	"io/ioutil"
 	"os"
-	"path"
+	"path/filepath"
 )

-// FileGetter is the interface for getting a stream of a file payload, address
-// by name/filepath. Presumably, the names will be scoped to relative file
-// paths.
+// FileGetter is the interface for getting a stream of a file payload,
+// addressed by name/filename. Presumably, the names will be scoped to relative
+// file paths.
 type FileGetter interface {
 	// Get returns a stream for the provided file path
-	Get(filepath string) (output io.ReadCloser, err error)
+	Get(filename string) (output io.ReadCloser, err error)
 }

 // FilePutter is the interface for storing a stream of a file payload,
-// addressed by name/filepath.
+// addressed by name/filename.
 type FilePutter interface {
 	// Put returns the size of the stream received, and the crc64 checksum for
 	// the provided stream
-	Put(filepath string, input io.Reader) (size int64, checksum []byte, err error)
+	Put(filename string, input io.Reader) (size int64, checksum []byte, err error)
 }

 // FileGetPutter is the interface that groups both Getting and Putting file
@ -44,8 +43,7 @@ type pathFileGetter struct {
 }

 func (pfg pathFileGetter) Get(filename string) (io.ReadCloser, error) {
-	// FIXME might should have a check for '../../../../etc/passwd' attempts?
-	return os.Open(path.Join(pfg.root, filename))
+	return os.Open(filepath.Join(pfg.root, filename))
 }

 type bufferFileGetPutter struct {
@ -61,15 +59,15 @@ func (bfgp bufferFileGetPutter) Get(name string) (io.ReadCloser, error) {
 }

 func (bfgp *bufferFileGetPutter) Put(name string, r io.Reader) (int64, []byte, error) {
-	c := crc64.New(CRCTable)
-	tRdr := io.TeeReader(r, c)
-	b := bytes.NewBuffer([]byte{})
-	i, err := io.Copy(b, tRdr)
+	crc := crc64.New(CRCTable)
+	buf := bytes.NewBuffer(nil)
+	cw := io.MultiWriter(crc, buf)
+	i, err := io.Copy(cw, r)
 	if err != nil {
 		return 0, nil, err
 	}
-	bfgp.files[name] = b.Bytes()
-	return i, c.Sum(nil), nil
+	bfgp.files[name] = buf.Bytes()
+	return i, crc.Sum(nil), nil
 }

 type readCloserWrapper struct {
@ -78,7 +76,7 @@ type readCloserWrapper struct {

 func (w *readCloserWrapper) Close() error { return nil }

-// NewBufferFileGetPutter is simple in memory FileGetPutter
+// NewBufferFileGetPutter is a simple in-memory FileGetPutter
 //
 // Implication is this is memory intensive...
 // Probably best for testing or light weight cases.
@ -98,8 +96,7 @@ type bitBucketFilePutter struct {

 func (bbfp *bitBucketFilePutter) Put(name string, r io.Reader) (int64, []byte, error) {
 	c := crc64.New(CRCTable)
-	tRdr := io.TeeReader(r, c)
-	i, err := io.Copy(ioutil.Discard, tRdr)
+	i, err := io.Copy(c, r)
 	return i, c.Sum(nil), err
 }

--- a/tar/storage/getter_test.go
+++ b/tar/storage/getter_test.go
@ -2,7 +2,9 @@ package storage

 import (
 	"bytes"
+	"fmt"
 	"io/ioutil"
+	"strings"
 	"testing"
 )

@ -39,6 +41,7 @@ func TestGetter(t *testing.T) {
 		}
 	}
 }
+
 func TestPutter(t *testing.T) {
 	fp := NewDiscardFilePutter()
 	// map[filename]map[body]crc64sum
@ -60,3 +63,22 @@ func TestPutter(t *testing.T) {
 		}
 	}
 }
+
+// BenchmarkPutter stores six repeated-word bodies, keyed by their index, in a
+// new value from NewBufferFileGetPutter on every iteration.
+func BenchmarkPutter(b *testing.B) {
+	words := []string{"foo", "bar", "baz", "fooz", "vbatts", "systemd"}
+	bodies := make([]string, len(words))
+	for idx, w := range words {
+		bodies[idx] = strings.Repeat(w, 1000)
+	}
+	for iter := 0; iter < b.N; iter++ {
+		store := NewBufferFileGetPutter()
+		for idx := range bodies {
+			_, _, err := store.Put(fmt.Sprintf("%d", idx), bytes.NewBufferString(bodies[idx]))
+			if err != nil {
+				b.Fatal(err)
+			}
+		}
+	}
+}
--- a/tar/storage/packer.go
+++ b/tar/storage/packer.go
@ -1,15 +1,15 @@
 package storage

 import (
-	"bufio"
 	"encoding/json"
 	"errors"
 	"io"
-	"path"
+	"path/filepath"
+	"unicode/utf8"
 )

-// ErrDuplicatePath is occured when a tar archive has more than one entry for
-// the same file path
+// ErrDuplicatePath occurs when a tar archive has more than one entry for the
+// same file path
 var ErrDuplicatePath = errors.New("duplicates of file paths not supported")

 // Packer describes the methods to pack Entries to a storage destination
@ -32,40 +32,24 @@ type PackUnpacker interface {
 */

 type jsonUnpacker struct {
-	r     io.Reader
-	b     *bufio.Reader
-	isEOF bool
 	seen seenNames
+	dec  *json.Decoder
 }

 func (jup *jsonUnpacker) Next() (*Entry, error) {
 	var e Entry
-	if jup.isEOF {
-		// since ReadBytes() will return read bytes AND an EOF, we handle it this
-		// round-a-bout way so we can Unmarshal the tail with relevant errors, but
-		// still get an io.EOF when the stream is ended.
-		return nil, io.EOF
-	}
-	line, err := jup.b.ReadBytes('\n')
-	if err != nil && err != io.EOF {
+	err := jup.dec.Decode(&e)
+	if err != nil {
 		return nil, err
-	} else if err == io.EOF {
-		jup.isEOF = true
-	}
-
-	err = json.Unmarshal(line, &e)
-	if err != nil && jup.isEOF {
-		// if the remainder actually _wasn't_ a remaining json structure, then just EOF
-		return nil, io.EOF
 	}

 	// check for dup name
 	if e.Type == FileType {
-		cName := path.Clean(e.Name)
+		cName := filepath.Clean(e.GetName())
 		if _, ok := jup.seen[cName]; ok {
 			return nil, ErrDuplicatePath
 		}
-		jup.seen[cName] = emptyByte
+		jup.seen[cName] = struct{}{}
 	}

 	return &e, err
@ -77,8 +61,7 @@ func (jup *jsonUnpacker) Next() (*Entry, error) {
 // Each Entry read are expected to be delimited by new line.
 func NewJSONUnpacker(r io.Reader) Unpacker {
 	return &jsonUnpacker{
-		r:    r,
-		b:    bufio.NewReader(r),
+		dec:  json.NewDecoder(r),
 		seen: seenNames{},
 	}
 }
@ -90,20 +73,24 @@ type jsonPacker struct {
 	seen seenNames
 }

-type seenNames map[string]byte
-
-// used in the seenNames map. byte is a uint8, and we'll re-use the same one
-// for minimalism.
-const emptyByte byte = 0
+type seenNames map[string]struct{}

 func (jp *jsonPacker) AddEntry(e Entry) (int, error) {
+	// if Name is not valid utf8, switch it to raw first.
+	if e.Name != "" {
+		if !utf8.ValidString(e.Name) {
+			e.NameRaw = []byte(e.Name)
+			e.Name = ""
+		}
+	}
+
 	// check early for dup name
 	if e.Type == FileType {
-		cName := path.Clean(e.Name)
+		cName := filepath.Clean(e.GetName())
 		if _, ok := jp.seen[cName]; ok {
 			return -1, ErrDuplicatePath
 		}
-		jp.seen[cName] = emptyByte
+		jp.seen[cName] = struct{}{}
 	}

 	e.Position = jp.pos
@ -117,7 +104,7 @@ func (jp *jsonPacker) AddEntry(e Entry) (int, error) {
 	return e.Position, nil
 }

-// NewJSONPacker provides an Packer that writes each Entry (SegmentType and
+// NewJSONPacker provides a Packer that writes each Entry (SegmentType and
 // FileType) as a json document.
 //
 // The Entries are delimited by new line.
--- a/tar/storage/packer_test.go
+++ b/tar/storage/packer_test.go
@ -4,6 +4,8 @@ import (
 	"bytes"
 	"compress/gzip"
 	"io"
+	"io/ioutil"
+	"os"
 	"testing"
 )

@ -159,5 +161,58 @@ func TestGzip(t *testing.T) {
 	if len(entries) != len(e) {
 		t.Errorf("expected %d entries, got %d", len(e), len(entries))
 	}
-
+}
+
+// BenchmarkGetPut packs a fixed set of entries into a temporary file with a
+// JSON packer and reads them back with a JSON unpacker, in parallel.
+func BenchmarkGetPut(b *testing.B) {
+	e := []Entry{
+		{Type: SegmentType, Payload: []byte("how")},
+		{Type: SegmentType, Payload: []byte("y'all")},
+		{Type: FileType, Name: "./hurr.txt", Payload: []byte("deadbeef")},
+		{Type: SegmentType, Payload: []byte("doin")},
+	}
+	b.RunParallel(func(pb *testing.PB) {
+		// This closure runs on goroutines started by RunParallel, where
+		// b.Fatal (FailNow) must not be called: report with b.Error and
+		// abandon the current round instead.
+		for pb.Next() {
+			func() {
+				fh, err := ioutil.TempFile("", "tar-split.")
+				if err != nil {
+					b.Error(err)
+					return
+				}
+				defer os.Remove(fh.Name())
+				defer fh.Close()
+				jp := NewJSONPacker(fh)
+				for i := range e {
+					if _, err := jp.AddEntry(e[i]); err != nil {
+						b.Error(err)
+						return
+					}
+				}
+				if err := fh.Sync(); err != nil {
+					b.Error(err)
+					return
+				}
+				// Rewind to the start of the file; otherwise the unpacker
+				// begins at the end of what was just written and sees only EOF.
+				if _, err := fh.Seek(0, 0); err != nil {
+					b.Error(err)
+					return
+				}
+
+				up := NewJSONUnpacker(fh)
+				for {
+					if _, err := up.Next(); err != nil {
+						if err != io.EOF {
+							b.Error(err)
+						}
+						return
+					}
+				}
+			}()
+		}
+	})
 }
--- a/tar_benchmark_test.go
+++ b/tar_benchmark_test.go
@ -0,0 +1,84 @@
+package tartest
+
+import (
+	"io"
+	"io/ioutil"
+	"os"
+	"testing"
+
+	upTar "archive/tar"
+
+	ourTar "github.com/vbatts/tar-split/archive/tar"
+)
+
+var testfile = "./archive/tar/testdata/sparse-formats.tar"
+
+// BenchmarkUpstreamTar walks every entry of testfile using the standard
+// library archive/tar reader and discards each entry's contents.
+func BenchmarkUpstreamTar(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		f, openErr := os.Open(testfile)
+		if openErr != nil {
+			b.Fatal(openErr)
+		}
+		rdr := upTar.NewReader(f)
+		_, err := rdr.Next()
+		for ; err == nil; _, err = rdr.Next() {
+			io.Copy(ioutil.Discard, rdr)
+		}
+		// The file is closed before the outcome is judged, so it is released
+		// on the failure path as well.
+		f.Close()
+		if err != io.EOF {
+			b.Fatal(err)
+		}
+	}
+}
+
+// BenchmarkOurTarNoAccounting walks every entry of testfile using the
+// tar-split archive/tar reader (ourTar) with RawAccounting off, discarding
+// each entry's contents.
+func BenchmarkOurTarNoAccounting(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		f, openErr := os.Open(testfile)
+		if openErr != nil {
+			b.Fatal(openErr)
+		}
+		rdr := ourTar.NewReader(f)
+		rdr.RawAccounting = false // this is default, but explicit here
+		_, err := rdr.Next()
+		for ; err == nil; _, err = rdr.Next() {
+			io.Copy(ioutil.Discard, rdr)
+		}
+		// Close first so the file is released on the failure path as well.
+		f.Close()
+		if err != io.EOF {
+			b.Fatal(err)
+		}
+	}
+}
+// BenchmarkOurTarYesAccounting reads testfile with RawAccounting on, calling RawBytes at each step.
+func BenchmarkOurTarYesAccounting(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		f, openErr := os.Open(testfile)
+		if openErr != nil {
+			b.Fatal(openErr)
+		}
+		rdr := ourTar.NewReader(f)
+		rdr.RawAccounting = true // This enables mechanics for collecting raw bytes
+		var err error
+		for err == nil {
+			_ = rdr.RawBytes()
+			_, err = rdr.Next()
+			_ = rdr.RawBytes()
+			if err == nil {
+				io.Copy(ioutil.Discard, rdr)
+				_ = rdr.RawBytes()
+			}
+		}
+		f.Close()
+		if err != io.EOF {
+			b.Fatal(err)
+		}
+	}
+}
--- a/version/gen.go
+++ b/version/gen.go
@ -0,0 +1,4 @@
+package version
+
+// from `go get github.com/vbatts/go-get-version`
+//go:generate go-get-version -package version -variable VERSION -output version.go
--- a/version/version.go
+++ b/version/version.go
@ -0,0 +1,7 @@
+package version
+
+// AUTO-GENEREATED. DO NOT EDIT
+// 2016-09-26 19:53:30.825879 -0400 EDT
+
+// VERSION is the generated version from /home/vbatts/src/vb/tar-split/version
+var VERSION = "v0.10.1-4-gf280282"