1
0
Fork 0
forked from mirrors/tar-split

Compare commits

...

120 commits

Author SHA1 Message Date
b9127a1393 Merge pull request #38 from vbatts/travis
travis: test more go versions
2017-03-14 11:24:38 -04:00
c6dd42815a
archive/tar: monotonic clock adjustment
commit 0e3355903d2ebcf5ee9e76096f51ac9a116a9dbb upstream

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2017-03-14 11:04:10 -04:00
245403c324
travis: test more go versions
Thanks to @tianon, for pointing to
5e3ef60b0d/lib/travis/build/config.rb (L54-L70)

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2017-03-14 08:38:13 -04:00
7560005f21
README: adding a golang report card
Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2017-03-13 18:28:54 -04:00
bd4c5d64c3
main: switch import paths to urfave
Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2016-09-27 02:54:18 +00:00
d3f1b54304
version: bump to v0.10.1
Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2016-09-26 19:53:52 -04:00
f28028292a Merge branch 'master' of github.com:vbatts/tar-split 2016-09-26 19:52:55 -04:00
416fa5dcfe Merge pull request #36 from dmcgowan/fix-extra-nil-accounting
archive/tar: fix writing too many raw bytes
2016-09-26 18:31:47 -04:00
Derek McGowan
6b59e6942e archive/tar: fix writing too many raw bytes
When an EOF is read, only the part of the header buffer which
was read should be accounted for.

Signed-off-by: Derek McGowan <derek@mcgstyle.net>
2016-09-26 14:01:48 -07:00
7410961e75 tar/asm: failing test for lack of EOF nils
Reported-by: Derek McGowan <derek@mcgstyle.net>
Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2016-09-26 13:39:03 -07:00
eb3808673d
version: bump to v0.10.0
Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2016-09-23 11:01:58 -04:00
ae8540dc47 Merge pull request #34 from dmcgowan/fix-panic-issue-33
Fix panic in Next
2016-09-23 09:41:12 -04:00
Derek McGowan
e527e70d25 Fix panic in Next
readHeader should never return nil with a tr.err also nil.
To correct this, ensure tr.err never gets reset to nil followed
by a nil return.
2016-09-22 17:38:18 -07:00
6810cedb21 benchmark: add a comparison of 'archive/tar'
Since this project has forked logic of upstream 'archive/tar', this does
a brief comparison including the RawBytes usage.

```bash
$ go test -run="XXX" -bench=.
testing: warning: no tests to run
BenchmarkUpstreamTar-4                      2000            700809 ns/op
BenchmarkOurTarNoAccounting-4               2000            692055 ns/op
BenchmarkOurTarYesAccounting-4              2000            723184 ns/op
PASS
ok      vb/tar-split    4.461s
```

From this, the difference is negligible.

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2016-07-26 09:50:08 -04:00
28bc4c32f9 Merge pull request #32 from vbatts/fix-travis
travis: update golang versions
2016-06-26 15:00:37 -04:00
beaeceb06f travis: update golang versions
This is not saying that tar-split no longer works on go1.3 or go1.4, but
rather that the headache of `go vet` having a version dependent ability
to install it, makes it a headache in travis.

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2016-06-26 14:56:04 -04:00
54e3a92a60 Merge branch 'master' of github.com:vbatts/tar-split 2016-06-26 14:43:38 -04:00
354fd6cf34 cmd: add a disasm --no-stdout flag
Since sometimes you just need to > /dev/null

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2016-06-26 10:15:12 -04:00
226f7c7490 README: update archive/tar version reference
Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2016-03-30 16:38:51 -04:00
e2a62d6b0d README.md: fix thumbnail
Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2016-02-29 11:40:38 -05:00
24fe0a94fe version: bump to v0.9.13
Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2016-02-15 09:44:28 -05:00
862ccd05bc Merge pull request #31 from vbatts/tar-go1.6
Tar go1.6
2016-02-15 09:41:56 -05:00
c32966b9e8 archive/tar: go1.3 and go1.4 compatibility
Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2016-02-15 09:38:46 -05:00
Joe Tsai
10db8408f6 archive/tar: document how Reader.Read handles header-only files
Commit dd5e14a7511465d20c6e95bf54c9b8f999abbbf6 ensured that no data
could be read for header-only files regardless of what the Header.Size
said. We should document this fact in Reader.Read.

Updates #13647

Change-Id: I4df9a2892bc66b49e0279693d08454bf696cfa31
Reviewed-on: https://go-review.googlesource.com/17913
Reviewed-by: Russ Cox <rsc@golang.org>
2016-02-03 07:01:09 -05:00
Joe Tsai
962540fec3 archive/tar: spell license correctly in example
Change-Id: Ice85d161f026a991953bd63ecc6ec80f8d06dfbd
Reviewed-on: https://go-review.googlesource.com/17901
Run-TryBot: Joe Tsai <joetsai@digital-static.net>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
2016-02-03 07:01:09 -05:00
Joe Tsai
a04b4ddba4 archive/tar: properly parse GNU base-256 encoding
Motivation:
* Previous implementation did not detect integer overflow when
parsing a base-256 encoded field.
* Previous implementation did not treat the integer as a two's
complement value as specified by GNU.

The relevant GNU specification says:
<<<
GNU format uses two's-complement base-256 notation to store values
that do not fit into standard ustar range.
>>>

Fixes #12435

Change-Id: I4639bcffac8d12e1cb040b76bd05c9d7bc6c23a8
Reviewed-on: https://go-review.googlesource.com/17424
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-02-03 07:01:09 -05:00
Joe Tsai
ce5aac17f9 archive/tar: properly format GNU base-256 encoding
Motivation:
* Previous implementation silently failed when an integer overflow
occurred. Now, we report an ErrFieldTooLong.
* Previous implementation did not encode in two's complement format and was
unable to encode negative numbers.

The relevant GNU specification says:
<<<
GNU format uses two's-complement base-256 notation to store values
that do not fit into standard ustar range.
>>>

Fixes #12436

Change-Id: I09c20602eabf8ae3a7e0db35b79440a64bfaf807
Reviewed-on: https://go-review.googlesource.com/17425
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-02-03 06:58:30 -05:00
Joe Tsai
be9ac88117 archive/tar: convert Reader.Next to be loop based
Motivation for change:
* Recursive logic is hard to follow, since it tends to apply
things in reverse. On the other hand, the tar formats tend to
describe meta headers as affecting the next entry.
* Recursion also applies changes in the wrong order. Two test
files are attached that use multiple headers. The previous Go
behavior differs from what GNU and BSD tar do.

Change-Id: Ic1557256fc1363c5cb26570e5d0b9f65a9e57341
Reviewed-on: https://go-review.googlesource.com/14624
Run-TryBot: Joe Tsai <joetsai@digital-static.net>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
2016-02-03 06:58:30 -05:00
Joe Tsai
64935a5f0f archive/tar: move parse/format methods to standalone receiver
Motivations for this change:
* It allows these functions to be used outside of Reader/Writer.
* It allows these functions to be more easily unit tested.

Change-Id: Iebe2b70bdb8744371c9ffa87c24316cbbf025b59
Reviewed-on: https://go-review.googlesource.com/15113
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Joe Tsai <joetsai@digital-static.net>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
2016-02-02 14:32:27 -05:00
Joe Tsai
b598ba3ee7 archive/tar: fix issues with readGNUSparseMap1x0
Motivations:
* Use of strconv.ParseInt does not properly treat integers as 64bit,
preventing this function from working properly on 32bit machines.
* Use of io.ReadFull does not properly detect truncated streams
when the file suddenly ends on a block boundary.
* The function blindly trusts user input for numEntries and allocates
memory accordingly.
* The function does not validate that numEntries is not negative,
allowing a malicious sparse file to cause a panic during make.

In general, this function was overly complicated for what it was
accomplishing and it was hard to reason that it was free from
bounds errors. Instead, it has been rewritten and relies on
bytes.Buffer.ReadString to do the main work. So long as invariants
about the number of '\n' in the buffer are maintained, it is much
easier to see why this approach is correct.

Change-Id: Ibb12c4126c26e0ea460ea063cd17af68e3cf609e
Reviewed-on: https://go-review.googlesource.com/15174
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-02-02 14:17:35 -05:00
Joe Tsai
7500c932c7 archive/tar: properly handle header-only "files" in Reader
Certain special type-flags, specifically 1, 2, 3, 4, 5, 6,
do not have a data section. Thus, regardless of what the size field
says, we should not attempt to read any data for these special types.

The relevant PAX and USTAR specification says:
<<<
If the typeflag field is set to specify a file to be of type 1 (a link)
or 2 (a symbolic link), the size field shall be specified as zero.
If the typeflag field is set to specify a file of type 5 (directory),
the size field shall be interpreted as described under the definition
of that record type. No data logical records are stored for types 1, 2, or 5.
If the typeflag field is set to 3 (character special file),
4 (block special file), or 6 (FIFO), the meaning of the size field is
unspecified by this volume of POSIX.1-2008, and no data logical records shall
be stored on the medium.
Additionally, for type 6, the size field shall be ignored when reading.
If the typeflag field is set to any other value, the number of logical
records written following the header shall be (size+511)/512, ignoring
any fraction in the result of the division.
>>>

Contrary to the specification, we do not assert that the size field
is zero for type 1 and 2 since we liberally accept non-conforming formats.

Change-Id: I666b601597cb9d7a50caa081813d90ca9cfc52ed
Reviewed-on: https://go-review.googlesource.com/16614
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-02-02 14:10:38 -05:00
Matt Layher
2424f4e367 archive/tar: make output deterministic
Replaces PID in PaxHeaders with 0.  Sorts PAX header keys before writing
them to the archive.

Fixes #12358

Change-Id: If239f89c85f1c9d9895a253fb06a47ad44960124
Reviewed-on: https://go-review.googlesource.com/13975
Reviewed-by: Russ Cox <rsc@golang.org>
Reviewed-by: Joe Tsai <joetsai@digital-static.net>
2016-02-02 14:10:11 -05:00
Joe Tsai
bffda594f7 archive/tar: detect truncated files
Motivation:
* Reader.skipUnread never reports io.ErrUnexpectedEOF. This is strange
given that io.ErrUnexpectedEOF is given through Reader.Read if the
user manually reads the file.
* Reader.skipUnread fails to detect truncated files since io.Seeker
is lazy about reporting errors. Thus, the behavior of Reader differs
whether the input io.Reader also satisfies io.Seeker or not.

To solve this, we seek to one before the end of the data section and
always rely on at least one call to io.CopyN. If the tr.r satisfies
io.Seeker, this is guarunteed to never read more than blockSize.

Fixes #12557

Change-Id: I0ddddfc6bed0d74465cb7e7a02b26f1de7a7a279
Reviewed-on: https://go-review.googlesource.com/15175
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-02-02 14:09:30 -05:00
Joe Tsai
cf83c95de8 archive/tar: fix numeric overflow issues in readGNUSparseMap0x1
Motivation:
* The logic to verify the numEntries can overflow and incorrectly
pass, allowing a malicious file to allocate arbitrary memory.
* The use of strconv.ParseInt does not set the integer precision
to 64bit, causing this code to work incorrectly on 32bit machines.

Change-Id: I1b1571a750a84f2dde97cc329ed04fe2342aaa60
Reviewed-on: https://go-review.googlesource.com/15173
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-02-02 14:09:04 -05:00
Joe Tsai
cb423795eb archive/tar: add missing error checks to Reader.Next
A recursive call to Reader.Next did not check the error before
trying to use the result, leading to a nil pointer panic.
This specific CL addresses the immediate issue, which is the panic,
but does not solve the root issue, which is due to an integer
overflow in the base-256 parser.

Updates #12435

Change-Id: Ia908671f0f411a409a35e24f2ebf740d46734072
Reviewed-on: https://go-review.googlesource.com/15437
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-02-02 14:08:38 -05:00
Joe Tsai
4ad443d166 archive/tar: expand abilities of TestReader
Motivation:
* There are an increasing number of "one-off" corrupt files added
to make sure that package does not succeed or crash on them.
Instead, allow for the test to specify the error that is expected
to occur (if any).
* Also, fold in the logic to check the MD5 checksum into this
function.

The following tests are being removed:
* TestIncrementalRead: Done by TestReader by using io.CopyBuffer
with a buffer of 8. This achieves the same behavior as this test.
* TestSparseEndToEnd: Since TestReader checks the MD5 checksums
if the input corpus provides them, then this is redundant.
* TestSparseIncrementalRead: Redundant for the same reasons that
TestIncrementalRead is now redundant
* TestNegativeHdrSize: Added to TestReader corpus
* TestIssue10968: Added to TestReader corpus
* TestIssue11169: Added to TestReader corpus

With this change, code coverage did not change: 85.3%

Change-Id: I8550d48657d4dbb8f47dfc3dc280758ef73b47ec
Reviewed-on: https://go-review.googlesource.com/15176
Reviewed-by: Andrew Gerrand <adg@golang.org>
2016-02-02 14:06:30 -05:00
Joe Tsai
f0fc67b3a8 archive/tar: make Reader.Read errors persistent
If the stream is in an inconsistent state, it does not make sense
that Reader.Read can be called and possibly succeed.

Change-Id: I9d1c5a1300b2c2b45232188aa7999e350809dcf2
Reviewed-on: https://go-review.googlesource.com/15177
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
2016-02-02 14:06:30 -05:00
Joe Tsai
af15385a0d archive/tar: fix bugs with sparseFileReader
The sparseFileReader is prone to two different forms of
denial-of-service attacks:
* A malicious tar file can cause an infinite loop
* A malicious tar file can cause arbitrary panics

This results because of poor error checking/handling, which this
CL fixes. While we are at it, add a plethora of unit tests to
test for possible malicious inputs.

Change-Id: I2f9446539d189f3c1738a1608b0ad4859c1be929
Reviewed-on: https://go-review.googlesource.com/15115
Reviewed-by: Andrew Gerrand <adg@golang.org>
Run-TryBot: Andrew Gerrand <adg@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-02-02 14:06:30 -05:00
Joe Tsai
440ba9e519 archive/tar: remove dead code with USTAR path splitting
Convert splitUSTARPath to return a bool rather than an error since
the caller never ever uses the error other than to check if it is
nil. Thus, we can remove errNameTooLong as well.

Also, fold the checking of the length <= fileNameSize and whether
the string is ASCII into the split function itself.

Lastly, remove logic to set the MAGIC since that's already done on
L200. Thus, setting the magic is redundant.

There is no overall logic change.

Updates #12638

Change-Id: I26b6992578199abad723c2a2af7f4fc078af9c17
Reviewed-on: https://go-review.googlesource.com/14723
Reviewed-by: David Symonds <dsymonds@golang.org>
Run-TryBot: David Symonds <dsymonds@golang.org>
2016-02-02 14:06:30 -05:00
b87f81631a version: mark 0.9.12 2016-01-31 01:39:10 -05:00
d50e5c9283 LICENSE: update LICENSE to BSD 3-clause
Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-12-03 15:45:57 -05:00
0de4e9db0c Merge pull request #27 from vbatts/bench_asm
tar/asm: basic benchmark on disasm/asm of testdata
2015-12-02 14:09:21 -06:00
1501fe6002 Merge pull request #22 from tonistiigi/stream-opt
Optimize tar stream generation
2015-12-02 14:09:08 -06:00
19b7e22058 tar/asm: basic benchmark on disasm/asm of testdata
```
PASS
BenchmarkAsm-4         5         238968475 ns/op        66841059 B/op       2449 allocs/op
ok      _/home/vbatts/src/vb/tar-split/tar/asm  2.267s
```

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-12-02 14:36:02 -05:00
026e78012b Merge pull request #26 from vbatts/better_discard_in_test
tar/asm: remove unneeded Tee
2015-12-02 12:00:26 -06:00
2efe34695a tar/asm: remove unneeded Tee
Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-12-02 12:56:52 -05:00
Tonis Tiigi
23b6435e6b Optimize tar stream generation
- New writeTo method allows to avoid creating extra pipe.
- Copy with a pooled buffer instead of allocating new buffer for each file.
- Avoid extra object allocations inside the loop.

Signed-off-by: Tonis Tiigi <tonistiigi@gmail.com>
2015-12-01 14:08:53 -08:00
93666d5824 Merge pull request #25 from vbatts/bench
tar/storage: adding Getter Putter benchmark
2015-12-01 14:37:10 -06:00
11281e8c09 tar/storage: adding Getter Putter benchmark
Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-12-01 15:31:48 -05:00
fc1e47e71d Merge pull request #24 from vbatts/drop_go1.2
travis: drop go1.2
2015-12-01 14:31:13 -06:00
d80c6b3bb1 travis: drop go1.2
seems overly reasonable to support go1.3 and greater. :-)

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-12-01 15:26:30 -05:00
Tonis Tiigi
8b20f9161d Optimize JSON decoding
This allows to avoid extra allocations on `ReadBytes` and
decoding buffers.

Signed-off-by: Tonis Tiigi <tonistiigi@gmail.com>
2015-11-30 09:52:44 -08:00
bece0c7009 demo: docker layer checksums
Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-10-16 17:05:18 -04:00
7ea74e1c31 demo: basic command
Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-10-16 16:41:09 -04:00
c955161e57 Merge pull request #20 from vbatts/unicode-utf8
remove common in favor of stdlib `unicode/utf8`
2015-09-25 14:38:49 -04:00
10250c25e0 tar/asm: remove useless test
The iso-8859-1 archive is already tested round trip, and this test did
not do anything really.
2015-09-25 14:35:12 -04:00
7e38cefd4b common: remove in favor of stdlib unicode/utf8 2015-09-25 14:33:24 -04:00
7ef16e6f67 Merge pull request #19 from LK4D4/go_143
Update travis to go1.4.3
2015-09-24 16:02:39 -04:00
Alexander Morozov
27876e49c2 Update travis to go1.4.3
Signed-off-by: Alexander Morozov <lk4d4@docker.com>
2015-09-24 12:24:31 -07:00
8a361ef0d8 tar/storage: Sprintf is unnecessary
fmt.Sprintf() vs string() for this []byte conversion is too much and
does not provide any further safety.

https://gist.github.com/vbatts/ab17181086aed558dd3a
2015-09-24 09:51:58 -04:00
7f56c08c48 Merge pull request #18 from vbatts/iso-8859-1
Iso 8859 1
2015-09-23 15:47:23 -04:00
cde639172f tar/asm: work with non-utf8 entry names 2015-09-23 15:27:33 -04:00
032efafc29 tar/storage: work with raw (invalid utf8) names
When the entry name is not UTF-8, for example ISO-8859-1, then store the
raw bytes.
To accommodate this, we will have getters and setters for the entry's
name now. Since this most heavily affects the json marshalling, we'll
double check the sanity of the name before storing it in the JSONPacker.
2015-09-23 15:27:33 -04:00
39d06b9dc4 tar/common: get index of first invalid utf-8 char 2015-09-23 15:27:15 -04:00
2865353200 common: add a UTF-8 check helper 2015-09-23 15:27:13 -04:00
7384cf1827 Merge pull request #16 from LK4D4/go_15
Add go 1.5.1 to CI
2015-09-11 12:50:40 -04:00
Alexander Morozov
1148e7ee3b Add go 1.5.1 to CI
Signed-off-by: Alexander Morozov <lk4d4@docker.com>
2015-09-11 08:49:52 -07:00
414a687f83 README: usage 2015-09-03 15:01:25 -04:00
b4d27b5426 Merge pull request #15 from vbatts/go1.5
Go1.5 updates for archive/tar
2015-08-20 21:25:41 -07:00
4d4b53c78b archive/tar: don't treat multiple file system links as a tar hardlink
Do not assume that if stat shows multiple links that we should mark the
file as a hardlink in the tar format.  If the hardlink link was not
referenced, this caused a link to "/".  On an overlay file system, all
files have multiple links.

The caller must keep the inode references and set TypeLink, Size = 0,
and LinkName themselves.

Change-Id: I873b8a235bc8f8fbb271db74ee54232da36ca013
Reviewed-on: https://go-review.googlesource.com/13045
Reviewed-by: Ian Lance Taylor <iant@golang.org>

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-08-21 00:15:22 -04:00
Alex Brainman
3b34dbd368 archive/tar: move round-trip reading into common os file
Fixes #11426

Change-Id: I77368b0e852149ed4533e139cc43887508ac7f78
Reviewed-on: https://go-review.googlesource.com/11662
Reviewed-by: Austin Clements <austin@google.com>
Reviewed-by: Russ Cox <rsc@golang.org>

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-08-21 00:15:22 -04:00
Brad Fitzpatrick
27e18409b9 archive/tar: also skip header roundtrip test on nacl
Update #11426

Change-Id: I7abc4ed2241a7a3af6d57c934786f36de4f97b77
Reviewed-on: https://go-review.googlesource.com/11592
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-08-21 00:15:22 -04:00
Brad Fitzpatrick
8eee43d0df archive/tar: disable new failing test on windows and plan9
Update #11426

Change-Id: If406d2efcc81965825a63c76f5448d544ba2a740
Reviewed-on: https://go-review.googlesource.com/11590
Reviewed-by: Austin Clements <austin@google.com>

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-08-21 00:15:22 -04:00
b48c28014e archive/tar: fix round-trip attributes
The issue was identified while
working with round trip FileInfo of the headers of hardlinks. Also,
additional test cases for hard link handling.
(review carried over from http://golang.org/cl/165860043)

Fixes #9027

Change-Id: I9e3a724c8de72eb1b0fbe0751a7b488894911b76
Reviewed-on: https://go-review.googlesource.com/6790
Reviewed-by: Russ Cox <rsc@golang.org>

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-08-21 00:15:22 -04:00
Michael Gehring
2e5698249c archive/tar: add missing error checks
Check for errors when reading the headers following the pax headers.

Fixes #11169.

Change-Id: Ifec4a949ec8df8b49fa7cb7a67eb826fe2282ad8
Reviewed-on: https://go-review.googlesource.com/11031
Reviewed-by: Russ Cox <rsc@golang.org>

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-08-21 00:15:22 -04:00
Michael Gehring
69de764807 archive/tar: fix slice bounds out of range
Sanity check the pax-header size field before using it.

Fixes #11167.

Change-Id: I9d5d0210c3990e6fb9434c3fe333be0d507d5962
Reviewed-on: https://go-review.googlesource.com/10954
Reviewed-by: David Symonds <dsymonds@golang.org>

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-08-21 00:15:22 -04:00
Håvard Haugen
55dceefe42 archive/tar: terminate when reading malformed sparse files
Fixes #10968.

Change-Id: I027bc571a71629ac49c2a0ff101b2950af6e7531
Reviewed-on: https://go-review.googlesource.com/10482
Reviewed-by: David Symonds <dsymonds@golang.org>
Run-TryBot: David Symonds <dsymonds@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-08-21 00:15:22 -04:00
Håvard Haugen
576b273762 archive/tar: don't panic on negative file size
Fixes #10959.
Fixes #10960.

Change-Id: I9a81a0e2b8275338d0d1c3f7f7265e0fd91f3de2
Reviewed-on: https://go-review.googlesource.com/10402
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: David Symonds <dsymonds@golang.org>

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-08-21 00:15:22 -04:00
David du Colombier
6e38573de2 archive/tar: fix error message
Write should return ErrWriteAfterClose instead
of ErrWriteTooLong when called after Close.

Change-Id: If5ec4ef924e4c56489e0d426976f7e5fad79be9b
Reviewed-on: https://go-review.googlesource.com/9259
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2015-08-21 00:15:22 -04:00
bf82db1f0d README: updates 2015-08-18 14:54:32 -04:00
ea4426eee9 Merge pull request #13 from vbatts/travis-go-versions
travis: adding older and newer golang
2015-08-14 10:18:38 -04:00
3a88af2866 travis: adding older and newer golang 2015-08-14 10:15:26 -04:00
4f81319c22 *: adding some version magic 2015-08-14 10:02:46 -04:00
c76e42010e tar/asm: additional GNU LongLink testcase
Adding a minimal test case for GNU @LongLink.
Tested that it fails on v0.9.5, but now passes on v0.9.6 and master.
2015-08-14 07:55:18 -04:00
44d93178df Merge pull request #11 from LK4D4/add_vet
Add vet check to travis
2015-08-13 15:37:30 -04:00
8f81a50860 Merge pull request #10 from LK4D4/fix_pipe_close
asm: Remove unreachable code
2015-08-13 15:36:42 -04:00
e72b4959f9 Merge pull request #9 from LK4D4/fix_json_tags
storage: Fix syntax of json tags
2015-08-13 15:35:20 -04:00
4d66163297 archive/tar: a []byte copy needed for GNU LongLink 2015-08-13 15:32:17 -04:00
9b9df04f1f Merge pull request #12 from LK4D4/multi_for_tee
tar/storage: Replace TeeReader with MultiWriter
2015-08-13 15:12:22 -04:00
Alexander Morozov
45399711c2 tar/storage: Replace TeeReader with MultiWriter
It uses slightly less memory and more understandable.
Benchmar results:

benchmark             old ns/op     new ns/op     delta
BenchmarkPutter-4     57272         52375         -8.55%

benchmark             old allocs     new allocs     delta
BenchmarkPutter-4     21             19             -9.52%
benchmark             old bytes     new bytes     delta
BenchmarkPutter-4     19416         13336         -31.31%

Signed-off-by: Alexander Morozov <lk4d4@docker.com>
2015-08-13 11:43:31 -07:00
Alexander Morozov
ea73dc6f6f tar/storage: Benchmark for bufferFileGetPutter.Put
Signed-off-by: Alexander Morozov <lk4d4@docker.com>
2015-08-13 11:42:14 -07:00
Alexander Morozov
fa881b2347 Add vet check to travis
Signed-off-by: Alexander Morozov <lk4d4@docker.com>
2015-08-12 22:49:38 -07:00
Alexander Morozov
93c0a320a8 asm: Remove unreachable code
Signed-off-by: Alexander Morozov <lk4d4@docker.com>
2015-08-12 22:45:39 -07:00
Alexander Morozov
b1783bc86d storage: Fix syntax of json tags
Signed-off-by: Alexander Morozov <lk4d4@docker.com>
2015-08-12 22:41:28 -07:00
505d53c95c Merge pull request #8 from LK4D4/remove_tee
Remove redundant TeeReader
2015-08-12 22:49:28 -04:00
Alexander Morozov
e6df23162e Remove redundant TeeReader
Signed-off-by: Alexander Morozov <lk4d4@docker.com>
2015-08-12 16:46:04 -07:00
b5c23068bb Merge branch 'longlink' 2015-08-11 15:58:45 -04:00
e46a815cbc archive/tar: fix carry-over of bytes for GNU types
Archives produced with GNU tar can have types of TypeGNULongName and
TypeGNULongLink.
These fields effectively appear like two file entries in the tar
archive. While golang's `archive/tar` transparently provide the file
name and headers and file payload, the access to the raw bytes is still
needed.

This fixes the access to the longlink header, it's payload (of the long
file path name), and the following file header and actual file payload.
2015-08-11 15:57:20 -04:00
df8572a1eb tar/asm: check length before adding an entry 2015-08-11 15:57:20 -04:00
51b0481d4a tar/asm: adding a failing test due to GNU LongLink 2015-08-11 15:57:20 -04:00
0a79a3807c README: missed a checksize reference 2015-08-10 16:26:09 -04:00
c6be94f8a3 cmd/tar-split: README usage for checksize 2015-08-10 16:22:36 -04:00
6c671d7267 cmd/tar-split: make checksize a sub-command
Moving it from top-level to the `tar-split` command
2015-08-10 16:20:22 -04:00
5d0b967302 README: cleanup 2015-08-10 15:36:38 -04:00
779e824d76 README: formatting and cleanup 2015-08-10 15:36:30 -04:00
f465e4720e cmd/tar-split: adding to the README 2015-07-28 17:16:04 -04:00
de37d1755a travis: incorrect comment 2015-07-28 15:45:24 -04:00
a80fb82091 Merge pull request #6 from vbatts/jonboulle-fixes-rebased
rebased #5
2015-07-22 16:12:42 -04:00
Jonathan Boulle
caf6a872c9 tar/storage: switch to map[string]struct{} for set
Using an empty struct is more idiomatic/efficient for representing a
set-like container.
2015-07-22 15:32:49 -04:00
Jonathan Boulle
002d19f0b0 *: clean up assorted spelling/grammar issues
Various minor fixes noticed on walking through
2015-07-22 15:32:49 -04:00
e0e9886972 tar/asm: return instead of break
5ddec2ae4a (commitcomment-12290378)

Reported-by: Tibor Vass <tibor@docker.com>
2015-07-22 11:32:18 -04:00
c2c2dde4cb tar/storage: use filepath instead of path 2015-07-22 10:27:53 -04:00
6d59e7bc76 tar/asm: clean up return on errors
This closure on error message needs returns so that the error message is
bubbled up to the reader.
2015-07-21 12:10:09 -04:00
d3556a0551 travis: go1.4.1 -> go1.4.2 2015-07-20 20:16:42 -04:00
c74af0bae7 tar/asm: test was flipped 2015-07-20 17:26:16 -04:00
97acaa9e83 travis: needing to fetch the cmd dependencies 2015-07-20 17:22:10 -04:00
04172717de tar/asm: test for failure when mangling 2015-07-20 16:46:22 -04:00
fd84b2fdfd cmd/tar-split: adding a cli tool for asm/disasm 2015-07-20 15:51:20 -04:00
6094dcaeca concept: move the PoC out of the root directory 2015-07-20 15:47:10 -04:00
e33913bf75 tar/asm: don't defer file closing
this `for {}` can read many files. defering the file handle close can
cause an EMFILE (too many open files).
2015-07-15 13:43:48 -04:00
46 changed files with 2643 additions and 1096 deletions

View file

@ -1,13 +1,18 @@
language: go
go:
- 1.4.1
- 1.3.3
- tip
- 1.x
- 1.8.x
- 1.7.x
- 1.6.x
- 1.5.x
# let us have pretty, fast Docker-based Travis workers!
sudo: false
# we don't need "go get" here <3
install: true
install:
- go get -d ./...
script:
- go test -v ./...
- go vet ./...

View file

@ -1,36 +0,0 @@
Flow of TAR stream
==================
The underlying use of `github.com/vbatts/tar-split/archive/tar` is most similar
to stdlib.
Packer interface
----------------
For ease of storage and usage of the raw bytes, there will be a storage
interface, that accepts an io.Writer (This way you could pass it an in memory
buffer or a file handle).
Having a Packer interface can allow configuration of hash.Hash for file payloads
and providing your own io.Writer.
Instead of having a state directory to store all the header information for all
Readers, we will leave that up to user of Reader. Because we can not assume an
ID for each Reader, and keeping that information differentiated.
State Directory
---------------
Perhaps we could deduplicate the header info, by hashing the rawbytes and
storing them in a directory tree like:
./ac/dc/beef
Then reference the hash of the header info, in the positional records for the
tar stream. Though this could be a future feature, and not required for an
initial implementation. Also, this would imply an owned state directory, rather
than just writing storage info to an io.Writer.

39
LICENSE
View file

@ -1,19 +1,28 @@
Copyright (c) 2015 Vincent Batts, Raleigh, NC, USA
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
All rights reserved.
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

155
README.md
View file

@ -1,25 +1,49 @@
tar-split
========
# tar-split
[![Build Status](https://travis-ci.org/vbatts/tar-split.svg?branch=master)](https://travis-ci.org/vbatts/tar-split)
[![Go Report Card](https://goreportcard.com/badge/github.com/vbatts/tar-split)](https://goreportcard.com/report/github.com/vbatts/tar-split)
Extend the upstream golang stdlib `archive/tar` library, to expose the raw
bytes of the TAR, rather than just the marshalled headers and file stream.
Pristinely disassembling a tar archive, and stashing needed raw bytes and offsets to reassemble a validating original archive.
The goal being that by preserving the raw bytes of each header, padding bytes,
and the raw file payload, one could reassemble the original archive.
## Docs
Docs
----
Code API for libraries provided by `tar-split`:
* https://godoc.org/github.com/vbatts/tar-split/tar/asm
* https://godoc.org/github.com/vbatts/tar-split/tar/storage
* https://godoc.org/github.com/vbatts/tar-split/archive/tar
## Install
Caveat
------
The command line utilitiy is installable via:
```bash
go get github.com/vbatts/tar-split/cmd/tar-split
```
## Usage
For cli usage, see its [README.md](cmd/tar-split/README.md).
For the library see the [docs](#docs)
## Demo
### Basic disassembly and assembly
This demonstrates the `tar-split` command and how to assemble a tar archive from the `tar-data.json.gz`
![basic cmd demo thumbnail](https://i.ytimg.com/vi/vh5wyjIOBtc/2.jpg?time=1445027151805)
[youtube video of basic command demo](https://youtu.be/vh5wyjIOBtc)
### Docker layer preservation
This demonstrates the tar-split integration for docker-1.8. Providing consistent tar archives for the image layer content.
![docker tar-split demo](https://i.ytimg.com/vi_webp/vh5wyjIOBtc/default.webp)
[youtube vide of docker layer checksums](https://youtu.be/tV_Dia8E8xw)
## Caveat
Eventually this should detect TARs that this is not possible with.
@ -37,85 +61,21 @@ same path, we will not support this feature. If there are more than one entries
with the same path, expect an err (like `ErrDuplicatePath`) or a resulting tar
stream that does not validate your original checksum/signature.
## Contract
Contract
--------
Do not break the API of stdlib `archive/tar` in our fork (ideally find an upstream mergeable solution).
Do not break the API of stdlib `archive/tar` in our fork (ideally find an
upstream mergeable solution)
## Std Version
The version of golang stdlib `archive/tar` is from go1.6
It is minimally extended to expose the raw bytes of the TAR, rather than just the marshalled headers and file stream.
Std Version
-----------
## Design
The version of golang stdlib `archive/tar` is from go1.4.1, and their master branch around [a9dddb53f](https://github.com/golang/go/tree/a9dddb53f)
See the [design](concept/DESIGN.md).
Example
-------
First we'll get an archive to work with. For repeatability, we'll make an
archive from what you've just cloned:
```
git archive --format=tar -o tar-split.tar HEAD .
```
Then build the example main.go:
```
go build ./main.go
```
Now run the example over the archive:
```
$ ./main tar-split.tar
2015/02/20 15:00:58 writing "tar-split.tar" to "tar-split.tar.out"
pax_global_header pre: 512 read: 52
.travis.yml pre: 972 read: 374
DESIGN.md pre: 650 read: 1131
LICENSE pre: 917 read: 1075
README.md pre: 973 read: 4289
archive/ pre: 831 read: 0
archive/tar/ pre: 512 read: 0
archive/tar/common.go pre: 512 read: 7790
[...]
tar/storage/entry_test.go pre: 667 read: 1137
tar/storage/getter.go pre: 911 read: 2741
tar/storage/getter_test.go pre: 843 read: 1491
tar/storage/packer.go pre: 557 read: 3141
tar/storage/packer_test.go pre: 955 read: 3096
EOF padding: 1512
Remainder: 512
Size: 215040; Sum: 215040
```
*What are we seeing here?*
* `pre` is the header of a file entry, and potentially the padding from the
end of the prior file's payload. Also with particular tar extensions and pax
attributes, the header can exceed 512 bytes.
* `read` is the size of the file payload from the entry
* `EOF padding` is the expected 1024 null bytes on the end of a tar archive,
plus potential padding from the end of the prior file entry's payload
* `Remainder` is the remaining bytes of an archive. This is typically deadspace
as most tar implmentations will return after having reached the end of the
1024 null bytes. Though various implementations will include some amount of
bytes here, which will affect the checksum of the resulting tar archive,
therefore this must be accounted for as well.
Ideally the input tar and output `*.out`, will match:
```
$ sha1sum tar-split.tar*
ca9e19966b892d9ad5960414abac01ef585a1e22 tar-split.tar
ca9e19966b892d9ad5960414abac01ef585a1e22 tar-split.tar.out
```
Stored Metadata
---------------
## Stored Metadata
Since the raw bytes of the headers and padding are stored, you may be wondering
what the size implications are. The headers are at least 512 bytes per
@ -123,14 +83,16 @@ file (sometimes more), at least 1024 null bytes on the end, and then various
padding. This makes for a constant linear growth in the stored metadata, with a
naive storage implementation.
Reusing our prior example's `tar-split.tar`, let's build the checksize.go example:
First we'll get an archive to work with. For repeatability, we'll make an
archive from what you've just cloned:
```
go build ./checksize.go
```bash
git archive --format=tar -o tar-split.tar HEAD .
```
```
$ ./checksize ./tar-split.tar
```bash
$ go get github.com/vbatts/tar-split/cmd/tar-split
$ tar-split checksize ./tar-split.tar
inspecting "tar-split.tar" (size 210k)
-- number of files: 50
-- size of metadata uncompressed: 53k
@ -143,10 +105,10 @@ implications are as little as 3kb.
But let's look at a larger archive, with many files.
```
```bash
$ ls -sh ./d.tar
1.4G ./d.tar
$ ./checksize ~/d.tar
$ tar-split checksize ~/d.tar
inspecting "/home/vbatts/d.tar" (size 1420749k)
-- number of files: 38718
-- size of metadata uncompressed: 43261k
@ -163,19 +125,14 @@ bytes-per-file rate for the storage implications.
| ~ 1kb per/file | 0.06kb per/file |
What's Next?
------------
## What's Next?
* More implementations of storage Packer and Unpacker
- could be a redis or mongo backend
* More implementations of FileGetter and FilePutter
- could be a redis or mongo backend
* cli tooling to assemble/disassemble a provided tar archive
* would be interesting to have an assembler stream that implements `io.Seeker`
License
-------
See LICENSE
## License
See [LICENSE](LICENSE)

View file

@ -139,8 +139,8 @@ func (fi headerFileInfo) Mode() (mode os.FileMode) {
}
switch fi.h.Typeflag {
case TypeLink, TypeSymlink:
// hard link, symbolic link
case TypeSymlink:
// symbolic link
mode |= os.ModeSymlink
case TypeChar:
// character device node
@ -249,6 +249,30 @@ func FileInfoHeader(fi os.FileInfo, link string) (*Header, error) {
if fm&os.ModeSticky != 0 {
h.Mode |= c_ISVTX
}
// If possible, populate additional fields from OS-specific
// FileInfo fields.
if sys, ok := fi.Sys().(*Header); ok {
// This FileInfo came from a Header (not the OS). Use the
// original Header to populate all remaining fields.
h.Uid = sys.Uid
h.Gid = sys.Gid
h.Uname = sys.Uname
h.Gname = sys.Gname
h.AccessTime = sys.AccessTime
h.ChangeTime = sys.ChangeTime
if sys.Xattrs != nil {
h.Xattrs = make(map[string]string)
for k, v := range sys.Xattrs {
h.Xattrs[k] = v
}
}
if sys.Typeflag == TypeLink {
// hard link
h.Typeflag = TypeLink
h.Size = 0
h.Linkname = sys.Linkname
}
}
if sysStat != nil {
return h, sysStat(fi, h)
}
@ -303,3 +327,14 @@ func toASCII(s string) string {
}
return buf.String()
}
// isHeaderOnlyType checks if the given type flag is of the type that has no
// data section even if a size is specified.
func isHeaderOnlyType(flag byte) bool {
switch flag {
case TypeLink, TypeSymlink, TypeChar, TypeBlock, TypeDir, TypeFifo:
return true
default:
return false
}
}

View file

@ -26,7 +26,7 @@ func Example() {
}{
{"readme.txt", "This archive contains some text files."},
{"gopher.txt", "Gopher names:\nGeorge\nGeoffrey\nGonzo"},
{"todo.txt", "Get animal handling licence."},
{"todo.txt", "Get animal handling license."},
}
for _, file := range files {
hdr := &tar.Header{
@ -76,5 +76,5 @@ func Example() {
// Geoffrey
// Gonzo
// Contents of todo.txt:
// Get animal handling licence.
// Get animal handling license.
}

View file

@ -12,6 +12,7 @@ import (
"errors"
"io"
"io/ioutil"
"math"
"os"
"strconv"
"strings"
@ -39,6 +40,10 @@ type Reader struct {
rawBytes *bytes.Buffer // last raw bits
}
type parser struct {
err error // Last error seen
}
// RawBytes accesses the raw bytes of the archive, apart from the file payload itself.
// This includes the header and padding.
//
@ -70,12 +75,36 @@ type regFileReader struct {
nb int64 // number of unread bytes for current file entry
}
// A sparseFileReader is a numBytesReader for reading sparse file data from a tar archive.
// A sparseFileReader is a numBytesReader for reading sparse file data from a
// tar archive.
type sparseFileReader struct {
rfr *regFileReader // reads the sparse-encoded file data
sp []sparseEntry // the sparse map for the file
pos int64 // keeps track of file position
tot int64 // total size of the file
rfr numBytesReader // Reads the sparse-encoded file data
sp []sparseEntry // The sparse map for the file
pos int64 // Keeps track of file position
total int64 // Total size of the file
}
// A sparseEntry holds a single entry in a sparse file's sparse map.
//
// Sparse files are represented using a series of sparseEntrys.
// Despite the name, a sparseEntry represents an actual data fragment that
// references data found in the underlying archive stream. All regions not
// covered by a sparseEntry are logically filled with zeros.
//
// For example, if the underlying raw file contains the 10-byte data:
// var compactData = "abcdefgh"
//
// And the sparse map has the following entries:
// var sp = []sparseEntry{
// {offset: 2, numBytes: 5} // Data fragment for [2..7]
// {offset: 18, numBytes: 3} // Data fragment for [18..21]
// }
//
// Then the content of the resulting sparse file with a "real" size of 25 is:
// var sparseData = "\x00"*2 + "abcde" + "\x00"*11 + "fgh" + "\x00"*4
type sparseEntry struct {
offset int64 // Starting position of the fragment
numBytes int64 // Length of the fragment
}
// Keywords for GNU sparse files in a PAX extended header
@ -109,7 +138,6 @@ func NewReader(r io.Reader) *Reader { return &Reader{r: r} }
//
// io.EOF is returned at the end of the input.
func (tr *Reader) Next() (*Header, error) {
var hdr *Header
if tr.RawAccounting {
if tr.rawBytes == nil {
tr.rawBytes = bytes.NewBuffer(nil)
@ -117,88 +145,88 @@ func (tr *Reader) Next() (*Header, error) {
tr.rawBytes.Reset()
}
}
if tr.err == nil {
tr.skipUnread()
}
if tr.err != nil {
return hdr, tr.err
}
hdr = tr.readHeader()
if hdr == nil {
return hdr, tr.err
}
// Check for PAX/GNU header.
switch hdr.Typeflag {
case TypeXHeader:
// PAX extended header
headers, err := parsePAX(tr)
if err != nil {
return nil, err
}
// We actually read the whole file,
// but this skips alignment padding
tr.skipUnread()
hdr = tr.readHeader()
mergePAX(hdr, headers)
// Check for a PAX format sparse file
sp, err := tr.checkForGNUSparsePAXHeaders(hdr, headers)
if err != nil {
tr.err = err
return nil, err
}
if sp != nil {
// Current file is a PAX format GNU sparse file.
// Set the current file reader to a sparse file reader.
tr.curr = &sparseFileReader{rfr: tr.curr.(*regFileReader), sp: sp, tot: hdr.Size}
}
return hdr, nil
case TypeGNULongName:
// We have a GNU long name header. Its contents are the real file name.
realname, err := ioutil.ReadAll(tr)
if err != nil {
return nil, err
}
var b []byte
if tr.RawAccounting {
if _, err = tr.rawBytes.Write(realname); err != nil {
return nil, err
}
b = tr.RawBytes()
}
hdr, err := tr.Next()
// since the above call to Next() resets the buffer, we need to throw the bytes over
if tr.RawAccounting {
if _, err = tr.rawBytes.Write(b); err != nil {
return nil, err
}
}
hdr.Name = cString(realname)
return hdr, err
case TypeGNULongLink:
// We have a GNU long link header.
realname, err := ioutil.ReadAll(tr)
if err != nil {
return nil, err
}
var b []byte
if tr.RawAccounting {
if _, err = tr.rawBytes.Write(realname); err != nil {
return nil, err
}
b = tr.RawBytes()
}
hdr, err := tr.Next()
// since the above call to Next() resets the buffer, we need to throw the bytes over
if tr.RawAccounting {
if _, err = tr.rawBytes.Write(b); err != nil {
return nil, err
}
}
hdr.Linkname = cString(realname)
return hdr, err
if tr.err != nil {
return nil, tr.err
}
return hdr, tr.err
var hdr *Header
var extHdrs map[string]string
// Externally, Next iterates through the tar archive as if it is a series of
// files. Internally, the tar format often uses fake "files" to add meta
// data that describes the next file. These meta data "files" should not
// normally be visible to the outside. As such, this loop iterates through
// one or more "header files" until it finds a "normal file".
loop:
for {
tr.err = tr.skipUnread()
if tr.err != nil {
return nil, tr.err
}
hdr = tr.readHeader()
if tr.err != nil {
return nil, tr.err
}
// Check for PAX/GNU special headers and files.
switch hdr.Typeflag {
case TypeXHeader:
extHdrs, tr.err = parsePAX(tr)
if tr.err != nil {
return nil, tr.err
}
continue loop // This is a meta header affecting the next header
case TypeGNULongName, TypeGNULongLink:
var realname []byte
realname, tr.err = ioutil.ReadAll(tr)
if tr.err != nil {
return nil, tr.err
}
if tr.RawAccounting {
if _, tr.err = tr.rawBytes.Write(realname); tr.err != nil {
return nil, tr.err
}
}
// Convert GNU extensions to use PAX headers.
if extHdrs == nil {
extHdrs = make(map[string]string)
}
var p parser
switch hdr.Typeflag {
case TypeGNULongName:
extHdrs[paxPath] = p.parseString(realname)
case TypeGNULongLink:
extHdrs[paxLinkpath] = p.parseString(realname)
}
if p.err != nil {
tr.err = p.err
return nil, tr.err
}
continue loop // This is a meta header affecting the next header
default:
mergePAX(hdr, extHdrs)
// Check for a PAX format sparse file
sp, err := tr.checkForGNUSparsePAXHeaders(hdr, extHdrs)
if err != nil {
tr.err = err
return nil, err
}
if sp != nil {
// Current file is a PAX format GNU sparse file.
// Set the current file reader to a sparse file reader.
tr.curr, tr.err = newSparseFileReader(tr.curr, sp, hdr.Size)
if tr.err != nil {
return nil, tr.err
}
}
break loop // This is a file, so stop
}
}
return hdr, nil
}
// checkForGNUSparsePAXHeaders checks the PAX headers for GNU sparse headers. If they are found, then
@ -375,6 +403,7 @@ func parsePAX(r io.Reader) (map[string]string, error) {
return nil, err
}
}
sbuf := string(buf)
// For GNU PAX sparse format 0.0 support.
// This function transforms the sparse format 0.0 headers into sparse format 0.1 headers.
@ -383,35 +412,17 @@ func parsePAX(r io.Reader) (map[string]string, error) {
headers := make(map[string]string)
// Each record is constructed as
// "%d %s=%s\n", length, keyword, value
for len(buf) > 0 {
// or the header was empty to start with.
var sp int
// The size field ends at the first space.
sp = bytes.IndexByte(buf, ' ')
if sp == -1 {
return nil, ErrHeader
}
// Parse the first token as a decimal integer.
n, err := strconv.ParseInt(string(buf[:sp]), 10, 0)
for len(sbuf) > 0 {
key, value, residual, err := parsePAXRecord(sbuf)
if err != nil {
return nil, ErrHeader
}
// Extract everything between the decimal and the n -1 on the
// beginning to eat the ' ', -1 on the end to skip the newline.
var record []byte
record, buf = buf[sp+1:n-1], buf[n:]
// The first equals is guaranteed to mark the end of the key.
// Everything else is value.
eq := bytes.IndexByte(record, '=')
if eq == -1 {
return nil, ErrHeader
}
key, value := record[:eq], record[eq+1:]
sbuf = residual
keyStr := string(key)
if keyStr == paxGNUSparseOffset || keyStr == paxGNUSparseNumBytes {
// GNU sparse format 0.0 special key. Write to sparseMap instead of using the headers map.
sparseMap.Write(value)
sparseMap.WriteString(value)
sparseMap.Write([]byte{','})
} else {
// Normal key. Set the value in the headers map.
@ -426,9 +437,42 @@ func parsePAX(r io.Reader) (map[string]string, error) {
return headers, nil
}
// cString parses bytes as a NUL-terminated C-style string.
// parsePAXRecord parses the input PAX record string into a key-value pair.
// If parsing is successful, it will slice off the currently read record and
// return the remainder as r.
//
// A PAX record is of the following form:
// "%d %s=%s\n" % (size, key, value)
func parsePAXRecord(s string) (k, v, r string, err error) {
// The size field ends at the first space.
sp := strings.IndexByte(s, ' ')
if sp == -1 {
return "", "", s, ErrHeader
}
// Parse the first token as a decimal integer.
n, perr := strconv.ParseInt(s[:sp], 10, 0) // Intentionally parse as native int
if perr != nil || n < 5 || int64(len(s)) < n {
return "", "", s, ErrHeader
}
// Extract everything between the space and the final newline.
rec, nl, rem := s[sp+1:n-1], s[n-1:n], s[n:]
if nl != "\n" {
return "", "", s, ErrHeader
}
// The first equals separates the key from the value.
eq := strings.IndexByte(rec, '=')
if eq == -1 {
return "", "", s, ErrHeader
}
return rec[:eq], rec[eq+1:], rem, nil
}
// parseString parses bytes as a NUL-terminated C-style string.
// If a NUL byte is not found then the whole slice is returned as a string.
func cString(b []byte) string {
func (*parser) parseString(b []byte) string {
n := 0
for n < len(b) && b[n] != 0 {
n++
@ -436,19 +480,51 @@ func cString(b []byte) string {
return string(b[0:n])
}
func (tr *Reader) octal(b []byte) int64 {
// Check for binary format first.
// parseNumeric parses the input as being encoded in either base-256 or octal.
// This function may return negative numbers.
// If parsing fails or an integer overflow occurs, err will be set.
func (p *parser) parseNumeric(b []byte) int64 {
// Check for base-256 (binary) format first.
// If the first bit is set, then all following bits constitute a two's
// complement encoded number in big-endian byte order.
if len(b) > 0 && b[0]&0x80 != 0 {
var x int64
for i, c := range b {
if i == 0 {
c &= 0x7f // ignore signal bit in first byte
}
x = x<<8 | int64(c)
// Handling negative numbers relies on the following identity:
// -a-1 == ^a
//
// If the number is negative, we use an inversion mask to invert the
// data bytes and treat the value as an unsigned number.
var inv byte // 0x00 if positive or zero, 0xff if negative
if b[0]&0x40 != 0 {
inv = 0xff
}
return x
var x uint64
for i, c := range b {
c ^= inv // Inverts c only if inv is 0xff, otherwise does nothing
if i == 0 {
c &= 0x7f // Ignore signal bit in first byte
}
if (x >> 56) > 0 {
p.err = ErrHeader // Integer overflow
return 0
}
x = x<<8 | uint64(c)
}
if (x >> 63) > 0 {
p.err = ErrHeader // Integer overflow
return 0
}
if inv == 0xff {
return ^int64(x)
}
return int64(x)
}
// Normal case is base-8 (octal) format.
return p.parseOctal(b)
}
func (p *parser) parseOctal(b []byte) int64 {
// Because unused fields are filled with NULs, we need
// to skip leading NULs. Fields may also be padded with
// spaces or NULs.
@ -459,27 +535,55 @@ func (tr *Reader) octal(b []byte) int64 {
if len(b) == 0 {
return 0
}
x, err := strconv.ParseUint(cString(b), 8, 64)
if err != nil {
tr.err = err
x, perr := strconv.ParseUint(p.parseString(b), 8, 64)
if perr != nil {
p.err = ErrHeader
}
return int64(x)
}
// skipUnread skips any unread bytes in the existing file entry, as well as any alignment padding.
func (tr *Reader) skipUnread() {
nr := tr.numBytes() + tr.pad // number of bytes to skip
// skipUnread skips any unread bytes in the existing file entry, as well as any
// alignment padding. It returns io.ErrUnexpectedEOF if any io.EOF is
// encountered in the data portion; it is okay to hit io.EOF in the padding.
//
// Note that this function still works properly even when sparse files are being
// used since numBytes returns the bytes remaining in the underlying io.Reader.
func (tr *Reader) skipUnread() error {
dataSkip := tr.numBytes() // Number of data bytes to skip
totalSkip := dataSkip + tr.pad // Total number of bytes to skip
tr.curr, tr.pad = nil, 0
if tr.RawAccounting {
_, tr.err = io.CopyN(tr.rawBytes, tr.r, nr)
return
_, tr.err = io.CopyN(tr.rawBytes, tr.r, totalSkip)
return tr.err
}
if sr, ok := tr.r.(io.Seeker); ok {
if _, err := sr.Seek(nr, os.SEEK_CUR); err == nil {
return
// If possible, Seek to the last byte before the end of the data section.
// Do this because Seek is often lazy about reporting errors; this will mask
// the fact that the tar stream may be truncated. We can rely on the
// io.CopyN done shortly afterwards to trigger any IO errors.
var seekSkipped int64 // Number of bytes skipped via Seek
if sr, ok := tr.r.(io.Seeker); ok && dataSkip > 1 {
// Not all io.Seeker can actually Seek. For example, os.Stdin implements
// io.Seeker, but calling Seek always returns an error and performs
// no action. Thus, we try an innocent seek to the current position
// to see if Seek is really supported.
pos1, err := sr.Seek(0, os.SEEK_CUR)
if err == nil {
// Seek seems supported, so perform the real Seek.
pos2, err := sr.Seek(dataSkip-1, os.SEEK_CUR)
if err != nil {
tr.err = err
return tr.err
}
seekSkipped = pos2 - pos1
}
}
_, tr.err = io.CopyN(ioutil.Discard, tr.r, nr)
var copySkipped int64 // Number of bytes skipped via CopyN
copySkipped, tr.err = io.CopyN(ioutil.Discard, tr.r, totalSkip-seekSkipped)
if tr.err == io.EOF && seekSkipped+copySkipped < dataSkip {
tr.err = io.ErrUnexpectedEOF
}
return tr.err
}
func (tr *Reader) verifyChecksum(header []byte) bool {
@ -487,23 +591,32 @@ func (tr *Reader) verifyChecksum(header []byte) bool {
return false
}
given := tr.octal(header[148:156])
var p parser
given := p.parseOctal(header[148:156])
unsigned, signed := checksum(header)
return given == unsigned || given == signed
return p.err == nil && (given == unsigned || given == signed)
}
// readHeader reads the next block header and assumes that the underlying reader
// is already aligned to a block boundary.
//
// The err will be set to io.EOF only when one of the following occurs:
// * Exactly 0 bytes are read and EOF is hit.
// * Exactly 1 block of zeros is read and EOF is hit.
// * At least 2 blocks of zeros are read.
func (tr *Reader) readHeader() *Header {
header := tr.hdrBuff[:]
copy(header, zeroBlock)
if _, tr.err = io.ReadFull(tr.r, header); tr.err != nil {
if n, err := io.ReadFull(tr.r, header); err != nil {
tr.err = err
// because it could read some of the block, but reach EOF first
if tr.err == io.EOF && tr.RawAccounting {
if _, tr.err = tr.rawBytes.Write(header); tr.err != nil {
return nil
if _, err := tr.rawBytes.Write(header[:n]); err != nil {
tr.err = err
}
}
return nil
return nil // io.EOF is okay here
}
if tr.RawAccounting {
if _, tr.err = tr.rawBytes.Write(header); tr.err != nil {
@ -513,14 +626,15 @@ func (tr *Reader) readHeader() *Header {
// Two blocks of zero bytes marks the end of the archive.
if bytes.Equal(header, zeroBlock[0:blockSize]) {
if _, tr.err = io.ReadFull(tr.r, header); tr.err != nil {
if n, err := io.ReadFull(tr.r, header); err != nil {
tr.err = err
// because it could read some of the block, but reach EOF first
if tr.err == io.EOF && tr.RawAccounting {
if _, tr.err = tr.rawBytes.Write(header); tr.err != nil {
return nil
if _, err := tr.rawBytes.Write(header[:n]); err != nil {
tr.err = err
}
}
return nil
return nil // io.EOF is okay here
}
if tr.RawAccounting {
if _, tr.err = tr.rawBytes.Write(header); tr.err != nil {
@ -541,18 +655,19 @@ func (tr *Reader) readHeader() *Header {
}
// Unpack
var p parser
hdr := new(Header)
s := slicer(header)
hdr.Name = cString(s.next(100))
hdr.Mode = tr.octal(s.next(8))
hdr.Uid = int(tr.octal(s.next(8)))
hdr.Gid = int(tr.octal(s.next(8)))
hdr.Size = tr.octal(s.next(12))
hdr.ModTime = time.Unix(tr.octal(s.next(12)), 0)
hdr.Name = p.parseString(s.next(100))
hdr.Mode = p.parseNumeric(s.next(8))
hdr.Uid = int(p.parseNumeric(s.next(8)))
hdr.Gid = int(p.parseNumeric(s.next(8)))
hdr.Size = p.parseNumeric(s.next(12))
hdr.ModTime = time.Unix(p.parseNumeric(s.next(12)), 0)
s.next(8) // chksum
hdr.Typeflag = s.next(1)[0]
hdr.Linkname = cString(s.next(100))
hdr.Linkname = p.parseString(s.next(100))
// The remainder of the header depends on the value of magic.
// The original (v7) version of tar had no explicit magic field,
@ -572,70 +687,76 @@ func (tr *Reader) readHeader() *Header {
switch format {
case "posix", "gnu", "star":
hdr.Uname = cString(s.next(32))
hdr.Gname = cString(s.next(32))
hdr.Uname = p.parseString(s.next(32))
hdr.Gname = p.parseString(s.next(32))
devmajor := s.next(8)
devminor := s.next(8)
if hdr.Typeflag == TypeChar || hdr.Typeflag == TypeBlock {
hdr.Devmajor = tr.octal(devmajor)
hdr.Devminor = tr.octal(devminor)
hdr.Devmajor = p.parseNumeric(devmajor)
hdr.Devminor = p.parseNumeric(devminor)
}
var prefix string
switch format {
case "posix", "gnu":
prefix = cString(s.next(155))
prefix = p.parseString(s.next(155))
case "star":
prefix = cString(s.next(131))
hdr.AccessTime = time.Unix(tr.octal(s.next(12)), 0)
hdr.ChangeTime = time.Unix(tr.octal(s.next(12)), 0)
prefix = p.parseString(s.next(131))
hdr.AccessTime = time.Unix(p.parseNumeric(s.next(12)), 0)
hdr.ChangeTime = time.Unix(p.parseNumeric(s.next(12)), 0)
}
if len(prefix) > 0 {
hdr.Name = prefix + "/" + hdr.Name
}
}
if tr.err != nil {
if p.err != nil {
tr.err = p.err
return nil
}
nb := hdr.Size
if isHeaderOnlyType(hdr.Typeflag) {
nb = 0
}
if nb < 0 {
tr.err = ErrHeader
return nil
}
// Maximum value of hdr.Size is 64 GB (12 octal digits),
// so there's no risk of int64 overflowing.
nb := int64(hdr.Size)
tr.pad = -nb & (blockSize - 1) // blockSize is a power of two
// Set the current file reader.
tr.pad = -nb & (blockSize - 1) // blockSize is a power of two
tr.curr = &regFileReader{r: tr.r, nb: nb}
// Check for old GNU sparse format entry.
if hdr.Typeflag == TypeGNUSparse {
// Get the real size of the file.
hdr.Size = tr.octal(header[483:495])
hdr.Size = p.parseNumeric(header[483:495])
if p.err != nil {
tr.err = p.err
return nil
}
// Read the sparse map.
sp := tr.readOldGNUSparseMap(header)
if tr.err != nil {
return nil
}
// Current file is a GNU sparse file. Update the current file reader.
tr.curr = &sparseFileReader{rfr: tr.curr.(*regFileReader), sp: sp, tot: hdr.Size}
tr.curr, tr.err = newSparseFileReader(tr.curr, sp, hdr.Size)
if tr.err != nil {
return nil
}
}
return hdr
}
// A sparseEntry holds a single entry in a sparse file's sparse map.
// A sparse entry indicates the offset and size in a sparse file of a
// block of data.
type sparseEntry struct {
offset int64
numBytes int64
}
// readOldGNUSparseMap reads the sparse map as stored in the old GNU sparse format.
// The sparse map is stored in the tar header if it's small enough. If it's larger than four entries,
// then one or more extension headers are used to store the rest of the sparse map.
func (tr *Reader) readOldGNUSparseMap(header []byte) []sparseEntry {
var p parser
isExtended := header[oldGNUSparseMainHeaderIsExtendedOffset] != 0
spCap := oldGNUSparseMainHeaderNumEntries
if isExtended {
@ -646,10 +767,10 @@ func (tr *Reader) readOldGNUSparseMap(header []byte) []sparseEntry {
// Read the four entries from the main tar header
for i := 0; i < oldGNUSparseMainHeaderNumEntries; i++ {
offset := tr.octal(s.next(oldGNUSparseOffsetSize))
numBytes := tr.octal(s.next(oldGNUSparseNumBytesSize))
if tr.err != nil {
tr.err = ErrHeader
offset := p.parseNumeric(s.next(oldGNUSparseOffsetSize))
numBytes := p.parseNumeric(s.next(oldGNUSparseNumBytesSize))
if p.err != nil {
tr.err = p.err
return nil
}
if offset == 0 && numBytes == 0 {
@ -673,10 +794,10 @@ func (tr *Reader) readOldGNUSparseMap(header []byte) []sparseEntry {
isExtended = sparseHeader[oldGNUSparseExtendedHeaderIsExtendedOffset] != 0
s = slicer(sparseHeader)
for i := 0; i < oldGNUSparseExtendedHeaderNumEntries; i++ {
offset := tr.octal(s.next(oldGNUSparseOffsetSize))
numBytes := tr.octal(s.next(oldGNUSparseNumBytesSize))
if tr.err != nil {
tr.err = ErrHeader
offset := p.parseNumeric(s.next(oldGNUSparseOffsetSize))
numBytes := p.parseNumeric(s.next(oldGNUSparseNumBytesSize))
if p.err != nil {
tr.err = p.err
return nil
}
if offset == 0 && numBytes == 0 {
@ -688,134 +809,111 @@ func (tr *Reader) readOldGNUSparseMap(header []byte) []sparseEntry {
return sp
}
// readGNUSparseMap1x0 reads the sparse map as stored in GNU's PAX sparse format version 1.0.
// The sparse map is stored just before the file data and padded out to the nearest block boundary.
// readGNUSparseMap1x0 reads the sparse map as stored in GNU's PAX sparse format
// version 1.0. The format of the sparse map consists of a series of
// newline-terminated numeric fields. The first field is the number of entries
// and is always present. Following this are the entries, consisting of two
// fields (offset, numBytes). This function must stop reading at the end
// boundary of the block containing the last newline.
//
// Note that the GNU manual says that numeric values should be encoded in octal
// format. However, the GNU tar utility itself outputs these values in decimal.
// As such, this library treats values as being encoded in decimal.
func readGNUSparseMap1x0(r io.Reader) ([]sparseEntry, error) {
buf := make([]byte, 2*blockSize)
sparseHeader := buf[:blockSize]
var cntNewline int64
var buf bytes.Buffer
var blk = make([]byte, blockSize)
// readDecimal is a helper function to read a decimal integer from the sparse map
// while making sure to read from the file in blocks of size blockSize
readDecimal := func() (int64, error) {
// Look for newline
nl := bytes.IndexByte(sparseHeader, '\n')
if nl == -1 {
if len(sparseHeader) >= blockSize {
// This is an error
return 0, ErrHeader
// feedTokens copies data in numBlock chunks from r into buf until there are
// at least cnt newlines in buf. It will not read more blocks than needed.
var feedTokens = func(cnt int64) error {
for cntNewline < cnt {
if _, err := io.ReadFull(r, blk); err != nil {
if err == io.EOF {
err = io.ErrUnexpectedEOF
}
return err
}
oldLen := len(sparseHeader)
newLen := oldLen + blockSize
if cap(sparseHeader) < newLen {
// There's more header, but we need to make room for the next block
copy(buf, sparseHeader)
sparseHeader = buf[:newLen]
} else {
// There's more header, and we can just reslice
sparseHeader = sparseHeader[:newLen]
}
// Now that sparseHeader is large enough, read next block
if _, err := io.ReadFull(r, sparseHeader[oldLen:newLen]); err != nil {
return 0, err
}
// leaving this function for io.Reader makes it more testable
if tr, ok := r.(*Reader); ok && tr.RawAccounting {
if _, err := tr.rawBytes.Write(sparseHeader[oldLen:newLen]); err != nil {
return 0, err
buf.Write(blk)
for _, c := range blk {
if c == '\n' {
cntNewline++
}
}
// Look for a newline in the new data
nl = bytes.IndexByte(sparseHeader[oldLen:newLen], '\n')
if nl == -1 {
// This is an error
return 0, ErrHeader
}
nl += oldLen // We want the position from the beginning
}
// Now that we've found a newline, read a number
n, err := strconv.ParseInt(string(sparseHeader[:nl]), 10, 0)
if err != nil {
return 0, ErrHeader
}
// Update sparseHeader to consume this number
sparseHeader = sparseHeader[nl+1:]
return n, nil
return nil
}
// Read the first block
if _, err := io.ReadFull(r, sparseHeader); err != nil {
// nextToken gets the next token delimited by a newline. This assumes that
// at least one newline exists in the buffer.
var nextToken = func() string {
cntNewline--
tok, _ := buf.ReadString('\n')
return tok[:len(tok)-1] // Cut off newline
}
// Parse for the number of entries.
// Use integer overflow resistant math to check this.
if err := feedTokens(1); err != nil {
return nil, err
}
// leaving this function for io.Reader makes it more testable
if tr, ok := r.(*Reader); ok && tr.RawAccounting {
if _, err := tr.rawBytes.Write(sparseHeader); err != nil {
return nil, err
}
numEntries, err := strconv.ParseInt(nextToken(), 10, 0) // Intentionally parse as native int
if err != nil || numEntries < 0 || int(2*numEntries) < int(numEntries) {
return nil, ErrHeader
}
// The first line contains the number of entries
numEntries, err := readDecimal()
if err != nil {
// Parse for all member entries.
// numEntries is trusted after this since a potential attacker must have
// committed resources proportional to what this library used.
if err := feedTokens(2 * numEntries); err != nil {
return nil, err
}
// Read all the entries
sp := make([]sparseEntry, 0, numEntries)
for i := int64(0); i < numEntries; i++ {
// Read the offset
offset, err := readDecimal()
offset, err := strconv.ParseInt(nextToken(), 10, 64)
if err != nil {
return nil, err
return nil, ErrHeader
}
// Read numBytes
numBytes, err := readDecimal()
numBytes, err := strconv.ParseInt(nextToken(), 10, 64)
if err != nil {
return nil, err
return nil, ErrHeader
}
sp = append(sp, sparseEntry{offset: offset, numBytes: numBytes})
}
return sp, nil
}
// readGNUSparseMap0x1 reads the sparse map as stored in GNU's PAX sparse format version 0.1.
// The sparse map is stored in the PAX headers.
func readGNUSparseMap0x1(headers map[string]string) ([]sparseEntry, error) {
// Get number of entries
numEntriesStr, ok := headers[paxGNUSparseNumBlocks]
if !ok {
return nil, ErrHeader
}
numEntries, err := strconv.ParseInt(numEntriesStr, 10, 0)
if err != nil {
// readGNUSparseMap0x1 reads the sparse map as stored in GNU's PAX sparse format
// version 0.1. The sparse map is stored in the PAX headers.
func readGNUSparseMap0x1(extHdrs map[string]string) ([]sparseEntry, error) {
// Get number of entries.
// Use integer overflow resistant math to check this.
numEntriesStr := extHdrs[paxGNUSparseNumBlocks]
numEntries, err := strconv.ParseInt(numEntriesStr, 10, 0) // Intentionally parse as native int
if err != nil || numEntries < 0 || int(2*numEntries) < int(numEntries) {
return nil, ErrHeader
}
sparseMap := strings.Split(headers[paxGNUSparseMap], ",")
// There should be two numbers in sparseMap for each entry
// There should be two numbers in sparseMap for each entry.
sparseMap := strings.Split(extHdrs[paxGNUSparseMap], ",")
if int64(len(sparseMap)) != 2*numEntries {
return nil, ErrHeader
}
// Loop through the entries in the sparse map
// Loop through the entries in the sparse map.
// numEntries is trusted now.
sp := make([]sparseEntry, 0, numEntries)
for i := int64(0); i < numEntries; i++ {
offset, err := strconv.ParseInt(sparseMap[2*i], 10, 0)
offset, err := strconv.ParseInt(sparseMap[2*i], 10, 64)
if err != nil {
return nil, ErrHeader
}
numBytes, err := strconv.ParseInt(sparseMap[2*i+1], 10, 0)
numBytes, err := strconv.ParseInt(sparseMap[2*i+1], 10, 64)
if err != nil {
return nil, ErrHeader
}
sp = append(sp, sparseEntry{offset: offset, numBytes: numBytes})
}
return sp, nil
}
@ -832,10 +930,18 @@ func (tr *Reader) numBytes() int64 {
// Read reads from the current entry in the tar archive.
// It returns 0, io.EOF when it reaches the end of that entry,
// until Next is called to advance to the next entry.
//
// Calling Read on special types like TypeLink, TypeSymLink, TypeChar,
// TypeBlock, TypeDir, and TypeFifo returns 0, io.EOF regardless of what
// the Header.Size claims.
func (tr *Reader) Read(b []byte) (n int, err error) {
if tr.err != nil {
return 0, tr.err
}
if tr.curr == nil {
return 0, io.EOF
}
n, err = tr.curr.Read(b)
if err != nil && err != io.EOF {
tr.err = err
@ -865,9 +971,33 @@ func (rfr *regFileReader) numBytes() int64 {
return rfr.nb
}
// readHole reads a sparse file hole ending at offset toOffset
func (sfr *sparseFileReader) readHole(b []byte, toOffset int64) int {
n64 := toOffset - sfr.pos
// newSparseFileReader creates a new sparseFileReader, but validates all of the
// sparse entries before doing so.
func newSparseFileReader(rfr numBytesReader, sp []sparseEntry, total int64) (*sparseFileReader, error) {
if total < 0 {
return nil, ErrHeader // Total size cannot be negative
}
// Validate all sparse entries. These are the same checks as performed by
// the BSD tar utility.
for i, s := range sp {
switch {
case s.offset < 0 || s.numBytes < 0:
return nil, ErrHeader // Negative values are never okay
case s.offset > math.MaxInt64-s.numBytes:
return nil, ErrHeader // Integer overflow with large length
case s.offset+s.numBytes > total:
return nil, ErrHeader // Region extends beyond the "real" size
case i > 0 && sp[i-1].offset+sp[i-1].numBytes > s.offset:
return nil, ErrHeader // Regions can't overlap and must be in order
}
}
return &sparseFileReader{rfr: rfr, sp: sp, total: total}, nil
}
// readHole reads a sparse hole ending at endOffset.
func (sfr *sparseFileReader) readHole(b []byte, endOffset int64) int {
n64 := endOffset - sfr.pos
if n64 > int64(len(b)) {
n64 = int64(len(b))
}
@ -881,46 +1011,54 @@ func (sfr *sparseFileReader) readHole(b []byte, toOffset int64) int {
// Read reads the sparse file data in expanded form.
func (sfr *sparseFileReader) Read(b []byte) (n int, err error) {
if len(sfr.sp) == 0 {
// No more data fragments to read from.
if sfr.pos < sfr.tot {
// We're in the last hole
n = sfr.readHole(b, sfr.tot)
return
}
// Otherwise, we're at the end of the file
return 0, io.EOF
}
if sfr.pos < sfr.sp[0].offset {
// We're in a hole
n = sfr.readHole(b, sfr.sp[0].offset)
return
// Skip past all empty fragments.
for len(sfr.sp) > 0 && sfr.sp[0].numBytes == 0 {
sfr.sp = sfr.sp[1:]
}
// We're not in a hole, so we'll read from the next data fragment
posInFragment := sfr.pos - sfr.sp[0].offset
bytesLeft := sfr.sp[0].numBytes - posInFragment
// If there are no more fragments, then it is possible that there
// is one last sparse hole.
if len(sfr.sp) == 0 {
// This behavior matches the BSD tar utility.
// However, GNU tar stops returning data even if sfr.total is unmet.
if sfr.pos < sfr.total {
return sfr.readHole(b, sfr.total), nil
}
return 0, io.EOF
}
// In front of a data fragment, so read a hole.
if sfr.pos < sfr.sp[0].offset {
return sfr.readHole(b, sfr.sp[0].offset), nil
}
// In a data fragment, so read from it.
// This math is overflow free since we verify that offset and numBytes can
// be safely added when creating the sparseFileReader.
endPos := sfr.sp[0].offset + sfr.sp[0].numBytes // End offset of fragment
bytesLeft := endPos - sfr.pos // Bytes left in fragment
if int64(len(b)) > bytesLeft {
b = b[0:bytesLeft]
b = b[:bytesLeft]
}
n, err = sfr.rfr.Read(b)
sfr.pos += int64(n)
if int64(n) == bytesLeft {
// We're done with this fragment
sfr.sp = sfr.sp[1:]
if err == io.EOF {
if sfr.pos < endPos {
err = io.ErrUnexpectedEOF // There was supposed to be more data
} else if sfr.pos < sfr.total {
err = nil // There is still an implicit sparse hole at the end
}
}
if err == io.EOF && sfr.pos < sfr.tot {
// We reached the end of the last fragment's data, but there's a final hole
err = nil
if sfr.pos == endPos {
sfr.sp = sfr.sp[1:] // We are done with this fragment, so pop it
}
return
return n, err
}
// numBytes returns the number of bytes left to read in the sparse file's
// sparse-encoded data in the tar archive.
func (sfr *sparseFileReader) numBytes() int64 {
return sfr.rfr.nb
return sfr.rfr.numBytes()
}

File diff suppressed because it is too large Load diff

View file

@ -94,13 +94,12 @@ func TestRoundTrip(t *testing.T) {
var b bytes.Buffer
tw := NewWriter(&b)
hdr := &Header{
Name: "file.txt",
Uid: 1 << 21, // too big for 8 octal digits
Size: int64(len(data)),
ModTime: time.Now(),
Name: "file.txt",
Uid: 1 << 21, // too big for 8 octal digits
Size: int64(len(data)),
// https://github.com/golang/go/commit/0e3355903d2ebcf5ee9e76096f51ac9a116a9dbb#diff-d7bf2a98d7b57b6ff754ca406f1b7581R105
ModTime: time.Now().AddDate(0, 0, 0).Round(1 * time.Second),
}
// tar only supports second precision.
hdr.ModTime = hdr.ModTime.Add(-time.Duration(hdr.ModTime.Nanosecond()) * time.Nanosecond)
if err := tw.WriteHeader(hdr); err != nil {
t.Fatalf("tw.WriteHeader: %v", err)
}
@ -147,17 +146,6 @@ func TestHeaderRoundTrip(t *testing.T) {
},
fm: 0644,
},
// hard link.
{
h: &Header{
Name: "hard.txt",
Mode: 0644 | c_ISLNK,
Size: 0,
ModTime: time.Unix(1360600916, 0),
Typeflag: TypeLink,
},
fm: 0644 | os.ModeSymlink,
},
// symbolic link.
{
h: &Header{
@ -246,6 +234,33 @@ func TestHeaderRoundTrip(t *testing.T) {
},
fm: 0600 | os.ModeSticky,
},
// hard link.
{
h: &Header{
Name: "hard.txt",
Mode: 0644 | c_ISREG,
Size: 0,
Linkname: "file.txt",
ModTime: time.Unix(1360600916, 0),
Typeflag: TypeLink,
},
fm: 0644,
},
// More information.
{
h: &Header{
Name: "info.txt",
Mode: 0600 | c_ISREG,
Size: 0,
Uid: 1000,
Gid: 1000,
ModTime: time.Unix(1360602540, 0),
Uname: "slartibartfast",
Gname: "users",
Typeflag: TypeReg,
},
fm: 0600,
},
}
for i, g := range golden {
@ -268,12 +283,37 @@ func TestHeaderRoundTrip(t *testing.T) {
if got, want := h2.Size, g.h.Size; got != want {
t.Errorf("i=%d: Size: got %v, want %v", i, got, want)
}
if got, want := h2.Uid, g.h.Uid; got != want {
t.Errorf("i=%d: Uid: got %d, want %d", i, got, want)
}
if got, want := h2.Gid, g.h.Gid; got != want {
t.Errorf("i=%d: Gid: got %d, want %d", i, got, want)
}
if got, want := h2.Uname, g.h.Uname; got != want {
t.Errorf("i=%d: Uname: got %q, want %q", i, got, want)
}
if got, want := h2.Gname, g.h.Gname; got != want {
t.Errorf("i=%d: Gname: got %q, want %q", i, got, want)
}
if got, want := h2.Linkname, g.h.Linkname; got != want {
t.Errorf("i=%d: Linkname: got %v, want %v", i, got, want)
}
if got, want := h2.Typeflag, g.h.Typeflag; got != want {
t.Logf("%#v %#v", g.h, fi.Sys())
t.Errorf("i=%d: Typeflag: got %q, want %q", i, got, want)
}
if got, want := h2.Mode, g.h.Mode; got != want {
t.Errorf("i=%d: Mode: got %o, want %o", i, got, want)
}
if got, want := fi.Mode(), g.fm; got != want {
t.Errorf("i=%d: fi.Mode: got %o, want %o", i, got, want)
}
if got, want := h2.AccessTime, g.h.AccessTime; got != want {
t.Errorf("i=%d: AccessTime: got %v, want %v", i, got, want)
}
if got, want := h2.ChangeTime, g.h.ChangeTime; got != want {
t.Errorf("i=%d: ChangeTime: got %v, want %v", i, got, want)
}
if got, want := h2.ModTime, g.h.ModTime; got != want {
t.Errorf("i=%d: ModTime: got %v, want %v", i, got, want)
}

BIN
archive/tar/testdata/gnu-multi-hdrs.tar vendored Normal file

Binary file not shown.

BIN
archive/tar/testdata/hardlink.tar vendored Normal file

Binary file not shown.

BIN
archive/tar/testdata/hdr-only.tar vendored Normal file

Binary file not shown.

BIN
archive/tar/testdata/issue10968.tar vendored Normal file

Binary file not shown.

BIN
archive/tar/testdata/issue11169.tar vendored Normal file

Binary file not shown.

BIN
archive/tar/testdata/issue12435.tar vendored Normal file

Binary file not shown.

BIN
archive/tar/testdata/neg-size.tar vendored Normal file

Binary file not shown.

BIN
archive/tar/testdata/pax-multi-hdrs.tar vendored Normal file

Binary file not shown.

BIN
archive/tar/testdata/pax-path-hdr.tar vendored Normal file

Binary file not shown.

BIN
archive/tar/testdata/ustar-file-reg.tar vendored Normal file

Binary file not shown.

View file

@ -12,8 +12,8 @@ import (
"errors"
"fmt"
"io"
"os"
"path"
"sort"
"strconv"
"strings"
"time"
@ -23,7 +23,6 @@ var (
ErrWriteTooLong = errors.New("archive/tar: write too long")
ErrFieldTooLong = errors.New("archive/tar: header field too long")
ErrWriteAfterClose = errors.New("archive/tar: write after close")
errNameTooLong = errors.New("archive/tar: name too long")
errInvalidHeader = errors.New("archive/tar: header field too long or contains invalid values")
)
@ -43,6 +42,10 @@ type Writer struct {
paxHdrBuff [blockSize]byte // buffer to use in writeHeader when writing a pax header
}
type formatter struct {
err error // Last error seen
}
// NewWriter creates a new Writer writing to w.
func NewWriter(w io.Writer) *Writer { return &Writer{w: w} }
@ -69,17 +72,9 @@ func (tw *Writer) Flush() error {
}
// Write s into b, terminating it with a NUL if there is room.
// If the value is too long for the field and allowPax is true add a paxheader record instead
func (tw *Writer) cString(b []byte, s string, allowPax bool, paxKeyword string, paxHeaders map[string]string) {
needsPaxHeader := allowPax && len(s) > len(b) || !isASCII(s)
if needsPaxHeader {
paxHeaders[paxKeyword] = s
return
}
func (f *formatter) formatString(b []byte, s string) {
if len(s) > len(b) {
if tw.err == nil {
tw.err = ErrFieldTooLong
}
f.err = ErrFieldTooLong
return
}
ascii := toASCII(s)
@ -90,40 +85,40 @@ func (tw *Writer) cString(b []byte, s string, allowPax bool, paxKeyword string,
}
// Encode x as an octal ASCII string and write it into b with leading zeros.
func (tw *Writer) octal(b []byte, x int64) {
func (f *formatter) formatOctal(b []byte, x int64) {
s := strconv.FormatInt(x, 8)
// leading zeros, but leave room for a NUL.
for len(s)+1 < len(b) {
s = "0" + s
}
tw.cString(b, s, false, paxNone, nil)
f.formatString(b, s)
}
// Write x into b, either as octal or as binary (GNUtar/star extension).
// If the value is too long for the field and writingPax is enabled both for the field and the add a paxheader record instead
func (tw *Writer) numeric(b []byte, x int64, allowPax bool, paxKeyword string, paxHeaders map[string]string) {
// Try octal first.
s := strconv.FormatInt(x, 8)
if len(s) < len(b) {
tw.octal(b, x)
// fitsInBase256 reports whether x can be encoded into n bytes using base-256
// encoding. Unlike octal encoding, base-256 encoding does not require that the
// string ends with a NUL character. Thus, all n bytes are available for output.
//
// If operating in binary mode, this assumes strict GNU binary mode; which means
// that the first byte can only be either 0x80 or 0xff. Thus, the first byte is
// equivalent to the sign bit in two's complement form.
func fitsInBase256(n int, x int64) bool {
var binBits = uint(n-1) * 8
return n >= 9 || (x >= -1<<binBits && x < 1<<binBits)
}
// Write x into b, as binary (GNUtar/star extension).
func (f *formatter) formatNumeric(b []byte, x int64) {
if fitsInBase256(len(b), x) {
for i := len(b) - 1; i >= 0; i-- {
b[i] = byte(x)
x >>= 8
}
b[0] |= 0x80 // Highest bit indicates binary format
return
}
// If it is too long for octal, and pax is preferred, use a pax header
if allowPax && tw.preferPax {
tw.octal(b, 0)
s := strconv.FormatInt(x, 10)
paxHeaders[paxKeyword] = s
return
}
// Too big: use binary (big-endian).
tw.usedBinary = true
for i := len(b) - 1; x > 0 && i >= 0; i-- {
b[i] = byte(x)
x >>= 8
}
b[0] |= 0x80 // highest bit indicates binary format
f.formatOctal(b, 0) // Last resort, just write zero
f.err = ErrFieldTooLong
}
var (
@ -162,6 +157,7 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error {
// subsecond time resolution, but for now let's just capture
// too long fields or non ascii characters
var f formatter
var header []byte
// We need to select which scratch buffer to use carefully,
@ -176,10 +172,40 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error {
copy(header, zeroBlock)
s := slicer(header)
// Wrappers around formatter that automatically sets paxHeaders if the
// argument extends beyond the capacity of the input byte slice.
var formatString = func(b []byte, s string, paxKeyword string) {
needsPaxHeader := paxKeyword != paxNone && len(s) > len(b) || !isASCII(s)
if needsPaxHeader {
paxHeaders[paxKeyword] = s
return
}
f.formatString(b, s)
}
var formatNumeric = func(b []byte, x int64, paxKeyword string) {
// Try octal first.
s := strconv.FormatInt(x, 8)
if len(s) < len(b) {
f.formatOctal(b, x)
return
}
// If it is too long for octal, and PAX is preferred, use a PAX header.
if paxKeyword != paxNone && tw.preferPax {
f.formatOctal(b, 0)
s := strconv.FormatInt(x, 10)
paxHeaders[paxKeyword] = s
return
}
tw.usedBinary = true
f.formatNumeric(b, x)
}
// keep a reference to the filename to allow to overwrite it later if we detect that we can use ustar longnames instead of pax
pathHeaderBytes := s.next(fileNameSize)
tw.cString(pathHeaderBytes, hdr.Name, true, paxPath, paxHeaders)
formatString(pathHeaderBytes, hdr.Name, paxPath)
// Handle out of range ModTime carefully.
var modTime int64
@ -187,25 +213,25 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error {
modTime = hdr.ModTime.Unix()
}
tw.octal(s.next(8), hdr.Mode) // 100:108
tw.numeric(s.next(8), int64(hdr.Uid), true, paxUid, paxHeaders) // 108:116
tw.numeric(s.next(8), int64(hdr.Gid), true, paxGid, paxHeaders) // 116:124
tw.numeric(s.next(12), hdr.Size, true, paxSize, paxHeaders) // 124:136
tw.numeric(s.next(12), modTime, false, paxNone, nil) // 136:148 --- consider using pax for finer granularity
s.next(8) // chksum (148:156)
s.next(1)[0] = hdr.Typeflag // 156:157
f.formatOctal(s.next(8), hdr.Mode) // 100:108
formatNumeric(s.next(8), int64(hdr.Uid), paxUid) // 108:116
formatNumeric(s.next(8), int64(hdr.Gid), paxGid) // 116:124
formatNumeric(s.next(12), hdr.Size, paxSize) // 124:136
formatNumeric(s.next(12), modTime, paxNone) // 136:148 --- consider using pax for finer granularity
s.next(8) // chksum (148:156)
s.next(1)[0] = hdr.Typeflag // 156:157
tw.cString(s.next(100), hdr.Linkname, true, paxLinkpath, paxHeaders)
formatString(s.next(100), hdr.Linkname, paxLinkpath)
copy(s.next(8), []byte("ustar\x0000")) // 257:265
tw.cString(s.next(32), hdr.Uname, true, paxUname, paxHeaders) // 265:297
tw.cString(s.next(32), hdr.Gname, true, paxGname, paxHeaders) // 297:329
tw.numeric(s.next(8), hdr.Devmajor, false, paxNone, nil) // 329:337
tw.numeric(s.next(8), hdr.Devminor, false, paxNone, nil) // 337:345
copy(s.next(8), []byte("ustar\x0000")) // 257:265
formatString(s.next(32), hdr.Uname, paxUname) // 265:297
formatString(s.next(32), hdr.Gname, paxGname) // 297:329
formatNumeric(s.next(8), hdr.Devmajor, paxNone) // 329:337
formatNumeric(s.next(8), hdr.Devminor, paxNone) // 337:345
// keep a reference to the prefix to allow to overwrite it later if we detect that we can use ustar longnames instead of pax
prefixHeaderBytes := s.next(155)
tw.cString(prefixHeaderBytes, "", false, paxNone, nil) // 345:500 prefix
formatString(prefixHeaderBytes, "", paxNone) // 345:500 prefix
// Use the GNU magic instead of POSIX magic if we used any GNU extensions.
if tw.usedBinary {
@ -215,37 +241,26 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error {
_, paxPathUsed := paxHeaders[paxPath]
// try to use a ustar header when only the name is too long
if !tw.preferPax && len(paxHeaders) == 1 && paxPathUsed {
suffix := hdr.Name
prefix := ""
if len(hdr.Name) > fileNameSize && isASCII(hdr.Name) {
var err error
prefix, suffix, err = tw.splitUSTARLongName(hdr.Name)
if err == nil {
// ok we can use a ustar long name instead of pax, now correct the fields
prefix, suffix, ok := splitUSTARPath(hdr.Name)
if ok {
// Since we can encode in USTAR format, disable PAX header.
delete(paxHeaders, paxPath)
// remove the path field from the pax header. this will suppress the pax header
delete(paxHeaders, paxPath)
// update the path fields
tw.cString(pathHeaderBytes, suffix, false, paxNone, nil)
tw.cString(prefixHeaderBytes, prefix, false, paxNone, nil)
// Use the ustar magic if we used ustar long names.
if len(prefix) > 0 && !tw.usedBinary {
copy(header[257:265], []byte("ustar\x00"))
}
}
// Update the path fields
formatString(pathHeaderBytes, suffix, paxNone)
formatString(prefixHeaderBytes, prefix, paxNone)
}
}
// The chksum field is terminated by a NUL and a space.
// This is different from the other octal fields.
chksum, _ := checksum(header)
tw.octal(header[148:155], chksum)
f.formatOctal(header[148:155], chksum) // Never fails
header[155] = ' '
if tw.err != nil {
// problem with header; probably integer too big for a field.
// Check if there were any formatting errors.
if f.err != nil {
tw.err = f.err
return tw.err
}
@ -270,28 +285,25 @@ func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error {
return tw.err
}
// writeUSTARLongName splits a USTAR long name hdr.Name.
// name must be < 256 characters. errNameTooLong is returned
// if hdr.Name can't be split. The splitting heuristic
// is compatible with gnu tar.
func (tw *Writer) splitUSTARLongName(name string) (prefix, suffix string, err error) {
// splitUSTARPath splits a path according to USTAR prefix and suffix rules.
// If the path is not splittable, then it will return ("", "", false).
func splitUSTARPath(name string) (prefix, suffix string, ok bool) {
length := len(name)
if length > fileNamePrefixSize+1 {
if length <= fileNameSize || !isASCII(name) {
return "", "", false
} else if length > fileNamePrefixSize+1 {
length = fileNamePrefixSize + 1
} else if name[length-1] == '/' {
length--
}
i := strings.LastIndex(name[:length], "/")
// nlen contains the resulting length in the name field.
// plen contains the resulting length in the prefix field.
nlen := len(name) - i - 1
plen := i
nlen := len(name) - i - 1 // nlen is length of suffix
plen := i // plen is length of prefix
if i <= 0 || nlen > fileNameSize || nlen == 0 || plen > fileNamePrefixSize {
err = errNameTooLong
return
return "", "", false
}
prefix, suffix = name[:i], name[i+1:]
return
return name[:i], name[i+1:], true
}
// writePaxHeader writes an extended pax header to the
@ -304,11 +316,11 @@ func (tw *Writer) writePAXHeader(hdr *Header, paxHeaders map[string]string) erro
// succeed, and seems harmless enough.
ext.ModTime = hdr.ModTime
// The spec asks that we namespace our pseudo files
// with the current pid.
pid := os.Getpid()
// with the current pid. However, this results in differing outputs
// for identical inputs. As such, the constant 0 is now used instead.
// golang.org/issue/12358
dir, file := path.Split(hdr.Name)
fullName := path.Join(dir,
fmt.Sprintf("PaxHeaders.%d", pid), file)
fullName := path.Join(dir, "PaxHeaders.0", file)
ascii := toASCII(fullName)
if len(ascii) > 100 {
@ -318,8 +330,15 @@ func (tw *Writer) writePAXHeader(hdr *Header, paxHeaders map[string]string) erro
// Construct the body
var buf bytes.Buffer
for k, v := range paxHeaders {
fmt.Fprint(&buf, paxHeader(k+"="+v))
// Keys are sorted before writing to body to allow deterministic output.
var keys []string
for k := range paxHeaders {
keys = append(keys, k)
}
sort.Strings(keys)
for _, k := range keys {
fmt.Fprint(&buf, formatPAXRecord(k, paxHeaders[k]))
}
ext.Size = int64(len(buf.Bytes()))
@ -335,17 +354,18 @@ func (tw *Writer) writePAXHeader(hdr *Header, paxHeaders map[string]string) erro
return nil
}
// paxHeader formats a single pax record, prefixing it with the appropriate length
func paxHeader(msg string) string {
const padding = 2 // Extra padding for space and newline
size := len(msg) + padding
// formatPAXRecord formats a single PAX record, prefixing it with the
// appropriate length.
func formatPAXRecord(k, v string) string {
const padding = 3 // Extra padding for ' ', '=', and '\n'
size := len(k) + len(v) + padding
size += len(strconv.Itoa(size))
record := fmt.Sprintf("%d %s\n", size, msg)
record := fmt.Sprintf("%d %s=%s\n", size, k, v)
// Final adjustment if adding size field increased the record size.
if len(record) != size {
// Final adjustment if adding size increased
// the number of digits in size
size = len(record)
record = fmt.Sprintf("%d %s\n", size, msg)
record = fmt.Sprintf("%d %s=%s\n", size, k, v)
}
return record
}
@ -355,7 +375,7 @@ func paxHeader(msg string) string {
// hdr.Size bytes are written after WriteHeader.
func (tw *Writer) Write(b []byte) (n int, err error) {
if tw.closed {
err = ErrWriteTooLong
err = ErrWriteAfterClose
return
}
overwrite := false

View file

@ -9,8 +9,10 @@ import (
"fmt"
"io"
"io/ioutil"
"math"
"os"
"reflect"
"sort"
"strings"
"testing"
"testing/iotest"
@ -147,6 +149,44 @@ var writerTests = []*writerTest{
},
},
},
// This file was produced using gnu tar 1.26
// echo "Slartibartfast" > file.txt
// ln file.txt hard.txt
// tar -b 1 --format=ustar -c -f hardlink.tar file.txt hard.txt
{
file: "testdata/hardlink.tar",
entries: []*writerTestEntry{
{
header: &Header{
Name: "file.txt",
Mode: 0644,
Uid: 1000,
Gid: 100,
Size: 15,
ModTime: time.Unix(1425484303, 0),
Typeflag: '0',
Uname: "vbatts",
Gname: "users",
},
contents: "Slartibartfast\n",
},
{
header: &Header{
Name: "hard.txt",
Mode: 0644,
Uid: 1000,
Gid: 100,
Size: 0,
ModTime: time.Unix(1425484303, 0),
Typeflag: '1',
Linkname: "file.txt",
Uname: "vbatts",
Gname: "users",
},
// no contents
},
},
},
}
// Render byte array in a two-character hexadecimal string, spaced for easy visual inspection.
@ -253,7 +293,7 @@ func TestPax(t *testing.T) {
t.Fatal(err)
}
// Simple test to make sure PAX extensions are in effect
if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.")) {
if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.0")) {
t.Fatal("Expected at least one PAX header to be written.")
}
// Test that we can get a long name back out of the archive.
@ -292,7 +332,7 @@ func TestPaxSymlink(t *testing.T) {
t.Fatal(err)
}
// Simple test to make sure PAX extensions are in effect
if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.")) {
if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.0")) {
t.Fatal("Expected at least one PAX header to be written.")
}
// Test that we can get a long name back out of the archive.
@ -342,7 +382,7 @@ func TestPaxNonAscii(t *testing.T) {
t.Fatal(err)
}
// Simple test to make sure PAX extensions are in effect
if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.")) {
if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.0")) {
t.Fatal("Expected at least one PAX header to be written.")
}
// Test that we can get a long name back out of the archive.
@ -401,21 +441,49 @@ func TestPaxXattrs(t *testing.T) {
}
}
func TestPAXHeader(t *testing.T) {
medName := strings.Repeat("CD", 50)
longName := strings.Repeat("AB", 100)
paxTests := [][2]string{
{paxPath + "=/etc/hosts", "19 path=/etc/hosts\n"},
{"a=b", "6 a=b\n"}, // Single digit length
{"a=names", "11 a=names\n"}, // Test case involving carries
{paxPath + "=" + longName, fmt.Sprintf("210 path=%s\n", longName)},
{paxPath + "=" + medName, fmt.Sprintf("110 path=%s\n", medName)}}
func TestPaxHeadersSorted(t *testing.T) {
fileinfo, err := os.Stat("testdata/small.txt")
if err != nil {
t.Fatal(err)
}
hdr, err := FileInfoHeader(fileinfo, "")
if err != nil {
t.Fatalf("os.Stat: %v", err)
}
contents := strings.Repeat(" ", int(hdr.Size))
for _, test := range paxTests {
key, expected := test[0], test[1]
if result := paxHeader(key); result != expected {
t.Fatalf("paxHeader: got %s, expected %s", result, expected)
}
hdr.Xattrs = map[string]string{
"foo": "foo",
"bar": "bar",
"baz": "baz",
"qux": "qux",
}
var buf bytes.Buffer
writer := NewWriter(&buf)
if err := writer.WriteHeader(hdr); err != nil {
t.Fatal(err)
}
if _, err = writer.Write([]byte(contents)); err != nil {
t.Fatal(err)
}
if err := writer.Close(); err != nil {
t.Fatal(err)
}
// Simple test to make sure PAX extensions are in effect
if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.0")) {
t.Fatal("Expected at least one PAX header to be written.")
}
// xattr bar should always appear before others
indices := []int{
bytes.Index(buf.Bytes(), []byte("bar=bar")),
bytes.Index(buf.Bytes(), []byte("baz=baz")),
bytes.Index(buf.Bytes(), []byte("foo=foo")),
bytes.Index(buf.Bytes(), []byte("qux=qux")),
}
if !sort.IntsAreSorted(indices) {
t.Fatal("PAX headers are not sorted")
}
}
@ -489,3 +557,166 @@ func TestValidTypeflagWithPAXHeader(t *testing.T) {
}
}
}
func TestWriteAfterClose(t *testing.T) {
var buffer bytes.Buffer
tw := NewWriter(&buffer)
hdr := &Header{
Name: "small.txt",
Size: 5,
}
if err := tw.WriteHeader(hdr); err != nil {
t.Fatalf("Failed to write header: %s", err)
}
tw.Close()
if _, err := tw.Write([]byte("Kilts")); err != ErrWriteAfterClose {
t.Fatalf("Write: got %v; want ErrWriteAfterClose", err)
}
}
func TestSplitUSTARPath(t *testing.T) {
var sr = strings.Repeat
var vectors = []struct {
input string // Input path
prefix string // Expected output prefix
suffix string // Expected output suffix
ok bool // Split success?
}{
{"", "", "", false},
{"abc", "", "", false},
{"用戶名", "", "", false},
{sr("a", fileNameSize), "", "", false},
{sr("a", fileNameSize) + "/", "", "", false},
{sr("a", fileNameSize) + "/a", sr("a", fileNameSize), "a", true},
{sr("a", fileNamePrefixSize) + "/", "", "", false},
{sr("a", fileNamePrefixSize) + "/a", sr("a", fileNamePrefixSize), "a", true},
{sr("a", fileNameSize+1), "", "", false},
{sr("/", fileNameSize+1), sr("/", fileNameSize-1), "/", true},
{sr("a", fileNamePrefixSize) + "/" + sr("b", fileNameSize),
sr("a", fileNamePrefixSize), sr("b", fileNameSize), true},
{sr("a", fileNamePrefixSize) + "//" + sr("b", fileNameSize), "", "", false},
{sr("a/", fileNameSize), sr("a/", 77) + "a", sr("a/", 22), true},
}
for _, v := range vectors {
prefix, suffix, ok := splitUSTARPath(v.input)
if prefix != v.prefix || suffix != v.suffix || ok != v.ok {
t.Errorf("splitUSTARPath(%q):\ngot (%q, %q, %v)\nwant (%q, %q, %v)",
v.input, prefix, suffix, ok, v.prefix, v.suffix, v.ok)
}
}
}
func TestFormatPAXRecord(t *testing.T) {
var medName = strings.Repeat("CD", 50)
var longName = strings.Repeat("AB", 100)
var vectors = []struct {
inputKey string
inputVal string
output string
}{
{"k", "v", "6 k=v\n"},
{"path", "/etc/hosts", "19 path=/etc/hosts\n"},
{"path", longName, "210 path=" + longName + "\n"},
{"path", medName, "110 path=" + medName + "\n"},
{"foo", "ba", "9 foo=ba\n"},
{"foo", "bar", "11 foo=bar\n"},
{"foo", "b=\nar=\n==\x00", "18 foo=b=\nar=\n==\x00\n"},
{"foo", "hello9 foo=ba\nworld", "27 foo=hello9 foo=ba\nworld\n"},
{"☺☻☹", "日a本b語ç", "27 ☺☻☹=日a本b語ç\n"},
{"\x00hello", "\x00world", "17 \x00hello=\x00world\n"},
}
for _, v := range vectors {
output := formatPAXRecord(v.inputKey, v.inputVal)
if output != v.output {
t.Errorf("formatPAXRecord(%q, %q): got %q, want %q",
v.inputKey, v.inputVal, output, v.output)
}
}
}
func TestFitsInBase256(t *testing.T) {
var vectors = []struct {
input int64
width int
ok bool
}{
{+1, 8, true},
{0, 8, true},
{-1, 8, true},
{1 << 56, 8, false},
{(1 << 56) - 1, 8, true},
{-1 << 56, 8, true},
{(-1 << 56) - 1, 8, false},
{121654, 8, true},
{-9849849, 8, true},
{math.MaxInt64, 9, true},
{0, 9, true},
{math.MinInt64, 9, true},
{math.MaxInt64, 12, true},
{0, 12, true},
{math.MinInt64, 12, true},
}
for _, v := range vectors {
ok := fitsInBase256(v.width, v.input)
if ok != v.ok {
t.Errorf("checkNumeric(%d, %d): got %v, want %v", v.input, v.width, ok, v.ok)
}
}
}
func TestFormatNumeric(t *testing.T) {
var vectors = []struct {
input int64
output string
ok bool
}{
// Test base-256 (binary) encoded values.
{-1, "\xff", true},
{-1, "\xff\xff", true},
{-1, "\xff\xff\xff", true},
{(1 << 0), "0", false},
{(1 << 8) - 1, "\x80\xff", true},
{(1 << 8), "0\x00", false},
{(1 << 16) - 1, "\x80\xff\xff", true},
{(1 << 16), "00\x00", false},
{-1 * (1 << 0), "\xff", true},
{-1*(1<<0) - 1, "0", false},
{-1 * (1 << 8), "\xff\x00", true},
{-1*(1<<8) - 1, "0\x00", false},
{-1 * (1 << 16), "\xff\x00\x00", true},
{-1*(1<<16) - 1, "00\x00", false},
{537795476381659745, "0000000\x00", false},
{537795476381659745, "\x80\x00\x00\x00\x07\x76\xa2\x22\xeb\x8a\x72\x61", true},
{-615126028225187231, "0000000\x00", false},
{-615126028225187231, "\xff\xff\xff\xff\xf7\x76\xa2\x22\xeb\x8a\x72\x61", true},
{math.MaxInt64, "0000000\x00", false},
{math.MaxInt64, "\x80\x00\x00\x00\x7f\xff\xff\xff\xff\xff\xff\xff", true},
{math.MinInt64, "0000000\x00", false},
{math.MinInt64, "\xff\xff\xff\xff\x80\x00\x00\x00\x00\x00\x00\x00", true},
{math.MaxInt64, "\x80\x7f\xff\xff\xff\xff\xff\xff\xff", true},
{math.MinInt64, "\xff\x80\x00\x00\x00\x00\x00\x00\x00", true},
}
for _, v := range vectors {
var f formatter
output := make([]byte, len(v.output))
f.formatNumeric(output, v.input)
ok := (f.err == nil)
if ok != v.ok {
if v.ok {
t.Errorf("formatNumeric(%d): got formatting failure, want success", v.input)
} else {
t.Errorf("formatNumeric(%d): got formatting success, want failure", v.input)
}
}
if string(output) != v.output {
t.Errorf("formatNumeric(%d): got %q, want %q", v.input, output, v.output)
}
}
}

39
cmd/tar-split/README.md Normal file
View file

@ -0,0 +1,39 @@
# tar-split utility
## Installation
go get -u github.com/vbatts/tar-split/cmd/tar-split
## Usage
### Disassembly
```bash
$ sha256sum archive.tar
d734a748db93ec873392470510b8a1c88929abd8fae2540dc43d5b26f7537868 archive.tar
$ mkdir ./x
$ tar-split disasm --output tar-data.json.gz ./archive.tar | tar -C ./x -x
time="2015-07-20T15:45:04-04:00" level=info msg="created tar-data.json.gz from ./archive.tar (read 204800 bytes)"
```
### Assembly
```bash
$ tar-split asm --output new.tar --input ./tar-data.json.gz --path ./x/
INFO[0000] created new.tar from ./x/ and ./tar-data.json.gz (wrote 204800 bytes)
$ sha256sum new.tar
d734a748db93ec873392470510b8a1c88929abd8fae2540dc43d5b26f7537868 new.tar
```
### Estimating metadata size
```bash
$ tar-split checksize ./archive.tar
inspecting "./archive.tar" (size 200k)
-- number of files: 28
-- size of metadata uncompressed: 28k
-- size of gzip compressed metadata: 1k
```

64
cmd/tar-split/asm.go Normal file
View file

@ -0,0 +1,64 @@
package main
import (
"compress/gzip"
"io"
"os"
"github.com/Sirupsen/logrus"
"github.com/urfave/cli"
"github.com/vbatts/tar-split/tar/asm"
"github.com/vbatts/tar-split/tar/storage"
)
func CommandAsm(c *cli.Context) {
if len(c.Args()) > 0 {
logrus.Warnf("%d additional arguments passed are ignored", len(c.Args()))
}
if len(c.String("input")) == 0 {
logrus.Fatalf("--input filename must be set")
}
if len(c.String("output")) == 0 {
logrus.Fatalf("--output filename must be set ([FILENAME|-])")
}
if len(c.String("path")) == 0 {
logrus.Fatalf("--path must be set")
}
var outputStream io.Writer
if c.String("output") == "-" {
outputStream = os.Stdout
} else {
fh, err := os.Create(c.String("output"))
if err != nil {
logrus.Fatal(err)
}
defer fh.Close()
outputStream = fh
}
// Get the tar metadata reader
mf, err := os.Open(c.String("input"))
if err != nil {
logrus.Fatal(err)
}
defer mf.Close()
mfz, err := gzip.NewReader(mf)
if err != nil {
logrus.Fatal(err)
}
defer mfz.Close()
metaUnpacker := storage.NewJSONUnpacker(mfz)
// XXX maybe get the absolute path here
fileGetter := storage.NewPathFileGetter(c.String("path"))
ots := asm.NewOutputTarStream(fileGetter, metaUnpacker)
defer ots.Close()
i, err := io.Copy(outputStream, ots)
if err != nil {
logrus.Fatal(err)
}
logrus.Infof("created %s from %s and %s (wrote %d bytes)", c.String("output"), c.String("path"), c.String("input"), i)
}

View file

@ -1,29 +1,25 @@
// +build ignore
package main
import (
"archive/tar"
"compress/gzip"
"flag"
"fmt"
"io"
"io/ioutil"
"log"
"os"
"github.com/Sirupsen/logrus"
"github.com/urfave/cli"
"github.com/vbatts/tar-split/tar/asm"
"github.com/vbatts/tar-split/tar/storage"
)
var (
flCleanup = flag.Bool("cleanup", true, "cleanup tempfiles")
)
func main() {
flag.Parse()
for _, arg := range flag.Args() {
func CommandChecksize(c *cli.Context) {
if len(c.Args()) == 0 {
logrus.Fatalf("please specify tar archives to check ('-' will check stdin)")
}
for _, arg := range c.Args() {
fh, err := os.Open(arg)
if err != nil {
log.Fatal(err)
@ -40,8 +36,10 @@ func main() {
log.Fatal(err)
}
defer packFh.Close()
if *flCleanup {
if !c.Bool("work") {
defer os.Remove(packFh.Name())
} else {
fmt.Printf(" -- working file preserved: %s\n", packFh.Name())
}
sp := storage.NewJSONPacker(packFh)
@ -83,7 +81,7 @@ func main() {
log.Fatal(err)
}
defer gzPackFh.Close()
if *flCleanup {
if !c.Bool("work") {
defer os.Remove(gzPackFh.Name())
}

63
cmd/tar-split/disasm.go Normal file
View file

@ -0,0 +1,63 @@
package main
import (
"compress/gzip"
"io"
"io/ioutil"
"os"
"github.com/Sirupsen/logrus"
"github.com/urfave/cli"
"github.com/vbatts/tar-split/tar/asm"
"github.com/vbatts/tar-split/tar/storage"
)
func CommandDisasm(c *cli.Context) {
if len(c.Args()) != 1 {
logrus.Fatalf("please specify tar to be disabled <NAME|->")
}
if len(c.String("output")) == 0 {
logrus.Fatalf("--output filename must be set")
}
// Set up the tar input stream
var inputStream io.Reader
if c.Args()[0] == "-" {
inputStream = os.Stdin
} else {
fh, err := os.Open(c.Args()[0])
if err != nil {
logrus.Fatal(err)
}
defer fh.Close()
inputStream = fh
}
// Set up the metadata storage
mf, err := os.OpenFile(c.String("output"), os.O_CREATE|os.O_WRONLY|os.O_TRUNC, os.FileMode(0600))
if err != nil {
logrus.Fatal(err)
}
defer mf.Close()
mfz := gzip.NewWriter(mf)
defer mfz.Close()
metaPacker := storage.NewJSONPacker(mfz)
// we're passing nil here for the file putter, because the ApplyDiff will
// handle the extraction of the archive
its, err := asm.NewInputTarStream(inputStream, metaPacker, nil)
if err != nil {
logrus.Fatal(err)
}
var out io.Writer
if c.Bool("no-stdout") {
out = ioutil.Discard
} else {
out = os.Stdout
}
i, err := io.Copy(out, its)
if err != nil {
logrus.Fatal(err)
}
logrus.Infof("created %s from %s (read %d bytes)", c.String("output"), c.Args()[0], i)
}

91
cmd/tar-split/main.go Normal file
View file

@ -0,0 +1,91 @@
package main
import (
"os"
"github.com/Sirupsen/logrus"
"github.com/urfave/cli"
"github.com/vbatts/tar-split/version"
)
func main() {
app := cli.NewApp()
app.Name = "tar-split"
app.Usage = "tar assembly and disassembly utility"
app.Version = version.VERSION
app.Author = "Vincent Batts"
app.Email = "vbatts@hashbangbash.com"
app.Action = cli.ShowAppHelp
app.Before = func(c *cli.Context) error {
logrus.SetOutput(os.Stderr)
if c.Bool("debug") {
logrus.SetLevel(logrus.DebugLevel)
}
return nil
}
app.Flags = []cli.Flag{
cli.BoolFlag{
Name: "debug, D",
Usage: "debug output",
// defaults to false
},
}
app.Commands = []cli.Command{
{
Name: "disasm",
Aliases: []string{"d"},
Usage: "disassemble the input tar stream",
Action: CommandDisasm,
Flags: []cli.Flag{
cli.StringFlag{
Name: "output",
Value: "tar-data.json.gz",
Usage: "output of disassembled tar stream",
},
cli.BoolFlag{
Name: "no-stdout",
Usage: "do not throughput the stream to STDOUT",
},
},
},
{
Name: "asm",
Aliases: []string{"a"},
Usage: "assemble tar stream",
Action: CommandAsm,
Flags: []cli.Flag{
cli.StringFlag{
Name: "input",
Value: "tar-data.json.gz",
Usage: "input of disassembled tar stream",
},
cli.StringFlag{
Name: "output",
Value: "-",
Usage: "reassembled tar archive",
},
cli.StringFlag{
Name: "path",
Value: "",
Usage: "relative path of extracted tar",
},
},
},
{
Name: "checksize",
Usage: "displays size estimates for metadata storage of a Tar archive",
Action: CommandChecksize,
Flags: []cli.Flag{
cli.BoolFlag{
Name: "work",
Usage: "do not delete the working directory",
// defaults to false
},
},
},
}
if err := app.Run(os.Args); err != nil {
logrus.Fatal(err)
}
}

94
concept/DESIGN.md Normal file
View file

@ -0,0 +1,94 @@
# Flow of TAR stream
## `./archive/tar`
The import path `github.com/vbatts/tar-split/archive/tar` is fork of upstream golang stdlib [`archive/tar`](http://golang.org/pkg/archive/tar/).
It adds plumbing to access raw bytes of the tar stream as the headers and payload are read.
## Packer interface
For ease of storage and usage of the raw bytes, there will be a storage
interface, that accepts an io.Writer (This way you could pass it an in memory
buffer or a file handle).
Having a Packer interface can allow configuration of hash.Hash for file payloads
and providing your own io.Writer.
Instead of having a state directory to store all the header information for all
Readers, we will leave that up to user of Reader. Because we can not assume an
ID for each Reader, and keeping that information differentiated.
## State Directory
Perhaps we could deduplicate the header info, by hashing the rawbytes and
storing them in a directory tree like:
./ac/dc/beef
Then reference the hash of the header info, in the positional records for the
tar stream. Though this could be a future feature, and not required for an
initial implementation. Also, this would imply an owned state directory, rather
than just writing storage info to an io.Writer.
## Concept Example
First we'll get an archive to work with. For repeatability, we'll make an
archive from what you've just cloned:
```
git archive --format=tar -o tar-split.tar HEAD .
```
Then build the example main.go:
```
go build ./main.go
```
Now run the example over the archive:
```
$ ./main tar-split.tar
2015/02/20 15:00:58 writing "tar-split.tar" to "tar-split.tar.out"
pax_global_header pre: 512 read: 52
.travis.yml pre: 972 read: 374
DESIGN.md pre: 650 read: 1131
LICENSE pre: 917 read: 1075
README.md pre: 973 read: 4289
archive/ pre: 831 read: 0
archive/tar/ pre: 512 read: 0
archive/tar/common.go pre: 512 read: 7790
[...]
tar/storage/entry_test.go pre: 667 read: 1137
tar/storage/getter.go pre: 911 read: 2741
tar/storage/getter_test.go pre: 843 read: 1491
tar/storage/packer.go pre: 557 read: 3141
tar/storage/packer_test.go pre: 955 read: 3096
EOF padding: 1512
Remainder: 512
Size: 215040; Sum: 215040
```
*What are we seeing here?*
* `pre` is the header of a file entry, and potentially the padding from the
end of the prior file's payload. Also with particular tar extensions and pax
attributes, the header can exceed 512 bytes.
* `read` is the size of the file payload from the entry
* `EOF padding` is the expected 1024 null bytes on the end of a tar archive,
plus potential padding from the end of the prior file entry's payload
* `Remainder` is the remaining bytes of an archive. This is typically deadspace
as most tar implmentations will return after having reached the end of the
1024 null bytes. Though various implementations will include some amount of
bytes here, which will affect the checksum of the resulting tar archive,
therefore this must be accounted for as well.
Ideally the input tar and output `*.out`, will match:
```
$ sha1sum tar-split.tar*
ca9e19966b892d9ad5960414abac01ef585a1e22 tar-split.tar
ca9e19966b892d9ad5960414abac01ef585a1e22 tar-split.tar.out
```

View file

@ -3,13 +3,15 @@ package asm
import (
"bytes"
"fmt"
"hash"
"hash/crc64"
"io"
"sync"
"github.com/vbatts/tar-split/tar/storage"
)
// NewOutputTarStream returns an io.ReadCloser that is an assemble tar archive
// NewOutputTarStream returns an io.ReadCloser that is an assembled tar archive
// stream.
//
// It takes a storage.FileGetter, for mapping the file payloads that are to be read in,
@ -23,44 +25,106 @@ func NewOutputTarStream(fg storage.FileGetter, up storage.Unpacker) io.ReadClose
}
pr, pw := io.Pipe()
go func() {
for {
entry, err := up.Next()
if err != nil {
pw.CloseWithError(err)
break
}
switch entry.Type {
case storage.SegmentType:
if _, err := pw.Write(entry.Payload); err != nil {
pw.CloseWithError(err)
break
}
case storage.FileType:
if entry.Size == 0 {
continue
}
fh, err := fg.Get(entry.Name)
if err != nil {
pw.CloseWithError(err)
break
}
defer fh.Close()
c := crc64.New(storage.CRCTable)
tRdr := io.TeeReader(fh, c)
if _, err := io.Copy(pw, tRdr); err != nil {
pw.CloseWithError(err)
break
}
if !bytes.Equal(c.Sum(nil), entry.Payload) {
// I would rather this be a comparable ErrInvalidChecksum or such,
// but since it's coming through the PipeReader, the context of
// _which_ file would be lost...
pw.CloseWithError(fmt.Errorf("file integrity checksum failed for %q", entry.Name))
break
}
}
err := WriteOutputTarStream(fg, up, pw)
if err != nil {
pw.CloseWithError(err)
} else {
pw.Close()
}
pw.Close()
}()
return pr
}
// WriteOutputTarStream writes assembled tar archive to a writer.
func WriteOutputTarStream(fg storage.FileGetter, up storage.Unpacker, w io.Writer) error {
// ... Since these are interfaces, this is possible, so let's not have a nil pointer
if fg == nil || up == nil {
return nil
}
var copyBuffer []byte
var crcHash hash.Hash
var crcSum []byte
var multiWriter io.Writer
for {
entry, err := up.Next()
if err != nil {
if err == io.EOF {
return nil
}
return err
}
switch entry.Type {
case storage.SegmentType:
if _, err := w.Write(entry.Payload); err != nil {
return err
}
case storage.FileType:
if entry.Size == 0 {
continue
}
fh, err := fg.Get(entry.GetName())
if err != nil {
return err
}
if crcHash == nil {
crcHash = crc64.New(storage.CRCTable)
crcSum = make([]byte, 8)
multiWriter = io.MultiWriter(w, crcHash)
copyBuffer = byteBufferPool.Get().([]byte)
defer byteBufferPool.Put(copyBuffer)
} else {
crcHash.Reset()
}
if _, err := copyWithBuffer(multiWriter, fh, copyBuffer); err != nil {
fh.Close()
return err
}
if !bytes.Equal(crcHash.Sum(crcSum[:0]), entry.Payload) {
// I would rather this be a comparable ErrInvalidChecksum or such,
// but since it's coming through the PipeReader, the context of
// _which_ file would be lost...
fh.Close()
return fmt.Errorf("file integrity checksum failed for %q", entry.GetName())
}
fh.Close()
}
}
}
var byteBufferPool = &sync.Pool{
New: func() interface{} {
return make([]byte, 32*1024)
},
}
// copyWithBuffer is taken from stdlib io.Copy implementation
// https://github.com/golang/go/blob/go1.5.1/src/io/io.go#L367
func copyWithBuffer(dst io.Writer, src io.Reader, buf []byte) (written int64, err error) {
for {
nr, er := src.Read(buf)
if nr > 0 {
nw, ew := dst.Write(buf[0:nr])
if nw > 0 {
written += int64(nw)
}
if ew != nil {
err = ew
break
}
if nr != nw {
err = io.ErrShortWrite
break
}
}
if er == io.EOF {
break
}
if er != nil {
err = er
break
}
}
return written, err
}

View file

@ -5,6 +5,7 @@ import (
"compress/gzip"
"crypto/sha1"
"fmt"
"hash/crc64"
"io"
"io/ioutil"
"os"
@ -33,108 +34,223 @@ var entries = []struct {
Payload: []byte{126, 72, 89, 239, 230, 252, 160, 187},
Size: 26,
},
Body: []byte("café con leche, por favor"),
},
{
Entry: storage.Entry{
Type: storage.FileType,
NameRaw: []byte{0x66, 0x69, 0x6c, 0x65, 0x2d, 0xe4}, // this is invalid UTF-8. Just checking the round trip.
Payload: []byte{126, 72, 89, 239, 230, 252, 160, 187},
Size: 26,
},
Body: []byte("café con leche, por favor"),
},
}
var entriesMangled = []struct {
Entry storage.Entry
Body []byte
}{
{
Entry: storage.Entry{
Type: storage.FileType,
Name: "./hurr.txt",
Payload: []byte{3, 116, 164, 177, 171, 236, 107, 78},
Size: 20,
},
// switch
Body: []byte("imma derp til I hurr"),
},
{
Entry: storage.Entry{
Type: storage.FileType,
Name: "./ermahgerd.txt",
Payload: []byte{127, 72, 89, 239, 230, 252, 160, 187},
Size: 26,
},
// san not con
Body: []byte("café sans leche, por favor"),
},
{
Entry: storage.Entry{
Type: storage.FileType,
NameRaw: []byte{0x66, 0x69, 0x6c, 0x65, 0x2d, 0xe4},
Payload: []byte{127, 72, 89, 239, 230, 252, 160, 187},
Size: 26,
},
Body: []byte("café con leche, por favor"),
},
}
func TestTarStreamOld(t *testing.T) {
func TestTarStreamMangledGetterPutter(t *testing.T) {
fgp := storage.NewBufferFileGetPutter()
// first lets prep a GetPutter and Packer
for i := range entries {
if entries[i].Entry.Type == storage.FileType {
j, csum, err := fgp.Put(entries[i].Entry.Name, bytes.NewBuffer(entries[i].Body))
j, csum, err := fgp.Put(entries[i].Entry.GetName(), bytes.NewBuffer(entries[i].Body))
if err != nil {
t.Error(err)
}
if j != entries[i].Entry.Size {
t.Errorf("size %q: expected %d; got %d",
entries[i].Entry.Name,
entries[i].Entry.GetName(),
entries[i].Entry.Size,
j)
}
if !bytes.Equal(csum, entries[i].Entry.Payload) {
t.Errorf("checksum %q: expected %v; got %v",
entries[i].Entry.Name,
entries[i].Entry.GetName(),
entries[i].Entry.Payload,
csum)
}
}
}
// next we'll use these to produce a tar stream.
_ = NewOutputTarStream(fgp, nil)
// TODO finish this
for _, e := range entriesMangled {
if e.Entry.Type == storage.FileType {
rdr, err := fgp.Get(e.Entry.GetName())
if err != nil {
t.Error(err)
}
c := crc64.New(storage.CRCTable)
i, err := io.Copy(c, rdr)
if err != nil {
t.Fatal(err)
}
rdr.Close()
csum := c.Sum(nil)
if bytes.Equal(csum, e.Entry.Payload) {
t.Errorf("wrote %d bytes. checksum for %q should not have matched! %v",
i,
e.Entry.GetName(),
csum)
}
}
}
}
var testCases = []struct {
path string
expectedSHA1Sum string
expectedSize int64
}{
{"./testdata/t.tar.gz", "1eb237ff69bca6e22789ecb05b45d35ca307adbd", 10240},
{"./testdata/longlink.tar.gz", "d9f6babe107b7247953dff6b5b5ae31a3a880add", 20480},
{"./testdata/fatlonglink.tar.gz", "8537f03f89aeef537382f8b0bb065d93e03b0be8", 26234880},
{"./testdata/iso-8859.tar.gz", "ddafa51cb03c74ec117ab366ee2240d13bba1ec3", 10240},
{"./testdata/extranils.tar.gz", "e187b4b3e739deaccc257342f4940f34403dc588", 10648},
{"./testdata/notenoughnils.tar.gz", "72f93f41efd95290baa5c174c234f5d4c22ce601", 512},
}
func TestTarStream(t *testing.T) {
var (
expectedSum = "1eb237ff69bca6e22789ecb05b45d35ca307adbd"
expectedSize int64 = 10240
)
fh, err := os.Open("./testdata/t.tar.gz")
if err != nil {
t.Fatal(err)
}
defer fh.Close()
gzRdr, err := gzip.NewReader(fh)
if err != nil {
t.Fatal(err)
}
defer gzRdr.Close()
for _, tc := range testCases {
fh, err := os.Open(tc.path)
if err != nil {
t.Fatal(err)
}
defer fh.Close()
gzRdr, err := gzip.NewReader(fh)
if err != nil {
t.Fatal(err)
}
defer gzRdr.Close()
// Setup where we'll store the metadata
w := bytes.NewBuffer([]byte{})
sp := storage.NewJSONPacker(w)
fgp := storage.NewBufferFileGetPutter()
// Setup where we'll store the metadata
w := bytes.NewBuffer([]byte{})
sp := storage.NewJSONPacker(w)
fgp := storage.NewBufferFileGetPutter()
// wrap the disassembly stream
tarStream, err := NewInputTarStream(gzRdr, sp, fgp)
if err != nil {
t.Fatal(err)
}
// wrap the disassembly stream
tarStream, err := NewInputTarStream(gzRdr, sp, fgp)
if err != nil {
t.Fatal(err)
}
// get a sum of the stream after it has passed through to ensure it's the same.
h0 := sha1.New()
tRdr0 := io.TeeReader(tarStream, h0)
// get a sum of the stream after it has passed through to ensure it's the same.
h0 := sha1.New()
i, err := io.Copy(h0, tarStream)
if err != nil {
t.Fatal(err)
}
// read it all to the bit bucket
i, err := io.Copy(ioutil.Discard, tRdr0)
if err != nil {
t.Fatal(err)
}
if i != tc.expectedSize {
t.Errorf("size of tar: expected %d; got %d", tc.expectedSize, i)
}
if fmt.Sprintf("%x", h0.Sum(nil)) != tc.expectedSHA1Sum {
t.Fatalf("checksum of tar: expected %s; got %x", tc.expectedSHA1Sum, h0.Sum(nil))
}
if i != expectedSize {
t.Errorf("size of tar: expected %d; got %d", expectedSize, i)
}
if fmt.Sprintf("%x", h0.Sum(nil)) != expectedSum {
t.Fatalf("checksum of tar: expected %s; got %x", expectedSum, h0.Sum(nil))
}
//t.Logf("%s", w.String()) // if we fail, then show the packed info
t.Logf("%s", w.String()) // if we fail, then show the packed info
// If we've made it this far, then we'll turn it around and create a tar
// stream from the packed metadata and buffered file contents.
r := bytes.NewBuffer(w.Bytes())
sup := storage.NewJSONUnpacker(r)
// and reuse the fgp that we Put the payloads to.
// If we've made it this far, then we'll turn it around and create a tar
// stream from the packed metadata and buffered file contents.
r := bytes.NewBuffer(w.Bytes())
sup := storage.NewJSONUnpacker(r)
// and reuse the fgp that we Put the payloads to.
rc := NewOutputTarStream(fgp, sup)
h1 := sha1.New()
i, err = io.Copy(h1, rc)
if err != nil {
t.Fatal(err)
}
rc := NewOutputTarStream(fgp, sup)
h1 := sha1.New()
tRdr1 := io.TeeReader(rc, h1)
// read it all to the bit bucket
i, err = io.Copy(ioutil.Discard, tRdr1)
if err != nil {
t.Fatal(err)
}
if i != expectedSize {
t.Errorf("size of output tar: expected %d; got %d", expectedSize, i)
}
if fmt.Sprintf("%x", h1.Sum(nil)) != expectedSum {
t.Fatalf("checksum of output tar: expected %s; got %x", expectedSum, h1.Sum(nil))
if i != tc.expectedSize {
t.Errorf("size of output tar: expected %d; got %d", tc.expectedSize, i)
}
if fmt.Sprintf("%x", h1.Sum(nil)) != tc.expectedSHA1Sum {
t.Fatalf("checksum of output tar: expected %s; got %x", tc.expectedSHA1Sum, h1.Sum(nil))
}
}
}
func BenchmarkAsm(b *testing.B) {
for i := 0; i < b.N; i++ {
for _, tc := range testCases {
func() {
fh, err := os.Open(tc.path)
if err != nil {
b.Fatal(err)
}
defer fh.Close()
gzRdr, err := gzip.NewReader(fh)
if err != nil {
b.Fatal(err)
}
defer gzRdr.Close()
// Setup where we'll store the metadata
w := bytes.NewBuffer([]byte{})
sp := storage.NewJSONPacker(w)
fgp := storage.NewBufferFileGetPutter()
// wrap the disassembly stream
tarStream, err := NewInputTarStream(gzRdr, sp, fgp)
if err != nil {
b.Fatal(err)
}
// read it all to the bit bucket
i1, err := io.Copy(ioutil.Discard, tarStream)
if err != nil {
b.Fatal(err)
}
r := bytes.NewBuffer(w.Bytes())
sup := storage.NewJSONUnpacker(r)
// and reuse the fgp that we Put the payloads to.
rc := NewOutputTarStream(fgp, sup)
i2, err := io.Copy(ioutil.Discard, rc)
if err != nil {
b.Fatal(err)
}
if i1 != i2 {
b.Errorf("%s: input(%d) and ouput(%d) byte count didn't match", tc.path, i1, i2)
}
}()
}
}
}

View file

@ -22,8 +22,8 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
// What to do here... folks will want their own access to the Reader that is
// their tar archive stream, but we'll need that same stream to use our
// forked 'archive/tar'.
// Perhaps do an io.TeeReader that hand back an io.Reader for them to read
// from, and we'll mitm the stream to store metadata.
// Perhaps do an io.TeeReader that hands back an io.Reader for them to read
// from, and we'll MITM the stream to store metadata.
// We'll need a storage.FilePutter too ...
// Another concern, whether to do any storage.FilePutter operations, such that we
@ -32,7 +32,7 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
// Perhaps we have a DiscardFilePutter that is a bit bucket.
// we'll return the pipe reader, since TeeReader does not buffer and will
// only read what the outputRdr Read's. Since Tar archive's have padding on
// only read what the outputRdr Read's. Since Tar archives have padding on
// the end, we want to be the one reading the padding, even if the user's
// `archive/tar` doesn't care.
pR, pW := io.Pipe()
@ -55,12 +55,15 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
}
// even when an EOF is reached, there is often 1024 null bytes on
// the end of an archive. Collect them too.
_, err := p.AddEntry(storage.Entry{
Type: storage.SegmentType,
Payload: tr.RawBytes(),
})
if err != nil {
pW.CloseWithError(err)
if b := tr.RawBytes(); len(b) > 0 {
_, err := p.AddEntry(storage.Entry{
Type: storage.SegmentType,
Payload: b,
})
if err != nil {
pW.CloseWithError(err)
return
}
}
break // not return. We need the end of the reader.
}
@ -68,11 +71,15 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
break // not return. We need the end of the reader.
}
if _, err := p.AddEntry(storage.Entry{
Type: storage.SegmentType,
Payload: tr.RawBytes(),
}); err != nil {
pW.CloseWithError(err)
if b := tr.RawBytes(); len(b) > 0 {
_, err := p.AddEntry(storage.Entry{
Type: storage.SegmentType,
Payload: b,
})
if err != nil {
pW.CloseWithError(err)
return
}
}
var csum []byte
@ -81,18 +88,23 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
_, csum, err = fp.Put(hdr.Name, tr)
if err != nil {
pW.CloseWithError(err)
return
}
}
// File entries added, regardless of size
_, err = p.AddEntry(storage.Entry{
entry := storage.Entry{
Type: storage.FileType,
Name: hdr.Name,
Size: hdr.Size,
Payload: csum,
})
}
// For proper marshalling of non-utf8 characters
entry.SetName(hdr.Name)
// File entries added, regardless of size
_, err = p.AddEntry(entry)
if err != nil {
pW.CloseWithError(err)
return
}
if b := tr.RawBytes(); len(b) > 0 {
@ -102,6 +114,7 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
})
if err != nil {
pW.CloseWithError(err)
return
}
}
}
@ -111,6 +124,7 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
remainder, err := ioutil.ReadAll(outputRdr)
if err != nil && err != io.EOF {
pW.CloseWithError(err)
return
}
_, err = p.AddEntry(storage.Entry{
Type: storage.SegmentType,
@ -118,9 +132,9 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
})
if err != nil {
pW.CloseWithError(err)
} else {
pW.Close()
return
}
pW.Close()
}()
return pR, nil

BIN
tar/asm/testdata/extranils.tar.gz vendored Normal file

Binary file not shown.

BIN
tar/asm/testdata/fatlonglink.tar.gz vendored Normal file

Binary file not shown.

BIN
tar/asm/testdata/iso-8859.tar.gz vendored Normal file

Binary file not shown.

BIN
tar/asm/testdata/longlink.tar.gz vendored Normal file

Binary file not shown.

BIN
tar/asm/testdata/notenoughnils.tar.gz vendored Normal file

Binary file not shown.

View file

@ -5,7 +5,7 @@ Packing and unpacking the Entries of the stream. The types of streams are
either segments of raw bytes (for the raw headers and various padding) and for
an entry marking a file payload.
The raw bytes are stored precisely in the packed (marshalled) Entry. Where as
The raw bytes are stored precisely in the packed (marshalled) Entry, whereas
the file payload marker include the name of the file, size, and crc64 checksum
(for basic file integrity).
*/

View file

@ -1,5 +1,7 @@
package storage
import "unicode/utf8"
// Entries is for sorting by Position
type Entries []Entry
@ -19,11 +21,11 @@ const (
// SegmentType represents a raw bytes segment from the archive stream. These raw
// byte segments consist of the raw headers and various padding.
//
// It's payload is to be marshalled base64 encoded.
// Its payload is to be marshalled base64 encoded.
SegmentType
)
// Entry is a the structure for packing and unpacking the information read from
// Entry is the structure for packing and unpacking the information read from
// the Tar archive.
//
// FileType Payload checksum is using `hash/crc64` for basic file integrity,
@ -32,8 +34,45 @@ const (
// collisions in a sample of 18.2 million, CRC64 had none.
type Entry struct {
Type Type `json:"type"`
Name string `json:"name",omitempty`
Size int64 `json:"size",omitempty`
Payload []byte `json:"payload"` // SegmentType store payload here; FileType store crc64 checksum here;
Name string `json:"name,omitempty"`
NameRaw []byte `json:"name_raw,omitempty"`
Size int64 `json:"size,omitempty"`
Payload []byte `json:"payload"` // SegmentType stores payload here; FileType stores crc64 checksum here;
Position int `json:"position"`
}
// SetName will check name for valid UTF-8 string, and set the appropriate
// field. See https://github.com/vbatts/tar-split/issues/17
func (e *Entry) SetName(name string) {
if utf8.ValidString(name) {
e.Name = name
} else {
e.NameRaw = []byte(name)
}
}
// SetNameBytes will check name for valid UTF-8 string, and set the appropriate
// field
func (e *Entry) SetNameBytes(name []byte) {
if utf8.Valid(name) {
e.Name = string(name)
} else {
e.NameRaw = name
}
}
// GetName returns the string for the entry's name, regardless of the field stored in
func (e *Entry) GetName() string {
if len(e.NameRaw) > 0 {
return string(e.NameRaw)
}
return e.Name
}
// GetNameBytes returns the bytes for the entry's name, regardless of the field stored in
func (e *Entry) GetNameBytes() []byte {
if len(e.NameRaw) > 0 {
return e.NameRaw
}
return []byte(e.Name)
}

View file

@ -39,10 +39,10 @@ func TestEntries(t *testing.T) {
func TestFile(t *testing.T) {
f := Entry{
Type: FileType,
Name: "./hello.txt",
Size: 100,
Position: 2,
}
f.SetName("./hello.txt")
buf, err := json.Marshal(f)
if err != nil {
@ -54,8 +54,37 @@ func TestFile(t *testing.T) {
t.Fatal(err)
}
if f.Name != f1.Name {
t.Errorf("expected Name %q, got %q", f.Name, f1.Name)
if f.GetName() != f1.GetName() {
t.Errorf("expected Name %q, got %q", f.GetName(), f1.GetName())
}
if f.Size != f1.Size {
t.Errorf("expected Size %q, got %q", f.Size, f1.Size)
}
if f.Position != f1.Position {
t.Errorf("expected Position %q, got %q", f.Position, f1.Position)
}
}
func TestFileRaw(t *testing.T) {
f := Entry{
Type: FileType,
Size: 100,
Position: 2,
}
f.SetNameBytes([]byte{0x2E, 0x2F, 0x68, 0x65, 0x6C, 0x6C, 0x6F, 0xE4, 0x2E, 0x74, 0x78, 0x74})
buf, err := json.Marshal(f)
if err != nil {
t.Fatal(err)
}
f1 := Entry{}
if err = json.Unmarshal(buf, &f1); err != nil {
t.Fatal(err)
}
if f.GetName() != f1.GetName() {
t.Errorf("expected Name %q, got %q", f.GetName(), f1.GetName())
}
if f.Size != f1.Size {
t.Errorf("expected Size %q, got %q", f.Size, f1.Size)

View file

@ -5,25 +5,24 @@ import (
"errors"
"hash/crc64"
"io"
"io/ioutil"
"os"
"path"
"path/filepath"
)
// FileGetter is the interface for getting a stream of a file payload, address
// by name/filepath. Presumably, the names will be scoped to relative file
// paths.
// FileGetter is the interface for getting a stream of a file payload,
// addressed by name/filename. Presumably, the names will be scoped to relative
// file paths.
type FileGetter interface {
// Get returns a stream for the provided file path
Get(filepath string) (output io.ReadCloser, err error)
Get(filename string) (output io.ReadCloser, err error)
}
// FilePutter is the interface for storing a stream of a file payload,
// addressed by name/filepath.
// addressed by name/filename.
type FilePutter interface {
// Put returns the size of the stream received, and the crc64 checksum for
// the provided stream
Put(filepath string, input io.Reader) (size int64, checksum []byte, err error)
Put(filename string, input io.Reader) (size int64, checksum []byte, err error)
}
// FileGetPutter is the interface that groups both Getting and Putting file
@ -44,8 +43,7 @@ type pathFileGetter struct {
}
func (pfg pathFileGetter) Get(filename string) (io.ReadCloser, error) {
// FIXME might should have a check for '../../../../etc/passwd' attempts?
return os.Open(path.Join(pfg.root, filename))
return os.Open(filepath.Join(pfg.root, filename))
}
type bufferFileGetPutter struct {
@ -61,15 +59,15 @@ func (bfgp bufferFileGetPutter) Get(name string) (io.ReadCloser, error) {
}
func (bfgp *bufferFileGetPutter) Put(name string, r io.Reader) (int64, []byte, error) {
c := crc64.New(CRCTable)
tRdr := io.TeeReader(r, c)
b := bytes.NewBuffer([]byte{})
i, err := io.Copy(b, tRdr)
crc := crc64.New(CRCTable)
buf := bytes.NewBuffer(nil)
cw := io.MultiWriter(crc, buf)
i, err := io.Copy(cw, r)
if err != nil {
return 0, nil, err
}
bfgp.files[name] = b.Bytes()
return i, c.Sum(nil), nil
bfgp.files[name] = buf.Bytes()
return i, crc.Sum(nil), nil
}
type readCloserWrapper struct {
@ -78,7 +76,7 @@ type readCloserWrapper struct {
func (w *readCloserWrapper) Close() error { return nil }
// NewBufferFileGetPutter is simple in memory FileGetPutter
// NewBufferFileGetPutter is a simple in-memory FileGetPutter
//
// Implication is this is memory intensive...
// Probably best for testing or light weight cases.
@ -98,8 +96,7 @@ type bitBucketFilePutter struct {
func (bbfp *bitBucketFilePutter) Put(name string, r io.Reader) (int64, []byte, error) {
c := crc64.New(CRCTable)
tRdr := io.TeeReader(r, c)
i, err := io.Copy(ioutil.Discard, tRdr)
i, err := io.Copy(c, r)
return i, c.Sum(nil), err
}

View file

@ -2,7 +2,9 @@ package storage
import (
"bytes"
"fmt"
"io/ioutil"
"strings"
"testing"
)
@ -39,6 +41,7 @@ func TestGetter(t *testing.T) {
}
}
}
func TestPutter(t *testing.T) {
fp := NewDiscardFilePutter()
// map[filename]map[body]crc64sum
@ -60,3 +63,22 @@ func TestPutter(t *testing.T) {
}
}
}
func BenchmarkPutter(b *testing.B) {
files := []string{
strings.Repeat("foo", 1000),
strings.Repeat("bar", 1000),
strings.Repeat("baz", 1000),
strings.Repeat("fooz", 1000),
strings.Repeat("vbatts", 1000),
strings.Repeat("systemd", 1000),
}
for i := 0; i < b.N; i++ {
fgp := NewBufferFileGetPutter()
for n, body := range files {
if _, _, err := fgp.Put(fmt.Sprintf("%d", n), bytes.NewBufferString(body)); err != nil {
b.Fatal(err)
}
}
}
}

View file

@ -1,15 +1,15 @@
package storage
import (
"bufio"
"encoding/json"
"errors"
"io"
"path"
"path/filepath"
"unicode/utf8"
)
// ErrDuplicatePath is occured when a tar archive has more than one entry for
// the same file path
// ErrDuplicatePath occurs when a tar archive has more than one entry for the
// same file path
var ErrDuplicatePath = errors.New("duplicates of file paths not supported")
// Packer describes the methods to pack Entries to a storage destination
@ -32,40 +32,24 @@ type PackUnpacker interface {
*/
type jsonUnpacker struct {
r io.Reader
b *bufio.Reader
isEOF bool
seen seenNames
seen seenNames
dec *json.Decoder
}
func (jup *jsonUnpacker) Next() (*Entry, error) {
var e Entry
if jup.isEOF {
// since ReadBytes() will return read bytes AND an EOF, we handle it this
// round-a-bout way so we can Unmarshal the tail with relevant errors, but
// still get an io.EOF when the stream is ended.
return nil, io.EOF
}
line, err := jup.b.ReadBytes('\n')
if err != nil && err != io.EOF {
err := jup.dec.Decode(&e)
if err != nil {
return nil, err
} else if err == io.EOF {
jup.isEOF = true
}
err = json.Unmarshal(line, &e)
if err != nil && jup.isEOF {
// if the remainder actually _wasn't_ a remaining json structure, then just EOF
return nil, io.EOF
}
// check for dup name
if e.Type == FileType {
cName := path.Clean(e.Name)
cName := filepath.Clean(e.GetName())
if _, ok := jup.seen[cName]; ok {
return nil, ErrDuplicatePath
}
jup.seen[cName] = emptyByte
jup.seen[cName] = struct{}{}
}
return &e, err
@ -77,8 +61,7 @@ func (jup *jsonUnpacker) Next() (*Entry, error) {
// Each Entry read are expected to be delimited by new line.
func NewJSONUnpacker(r io.Reader) Unpacker {
return &jsonUnpacker{
r: r,
b: bufio.NewReader(r),
dec: json.NewDecoder(r),
seen: seenNames{},
}
}
@ -90,20 +73,24 @@ type jsonPacker struct {
seen seenNames
}
type seenNames map[string]byte
// used in the seenNames map. byte is a uint8, and we'll re-use the same one
// for minimalism.
const emptyByte byte = 0
type seenNames map[string]struct{}
func (jp *jsonPacker) AddEntry(e Entry) (int, error) {
// if Name is not valid utf8, switch it to raw first.
if e.Name != "" {
if !utf8.ValidString(e.Name) {
e.NameRaw = []byte(e.Name)
e.Name = ""
}
}
// check early for dup name
if e.Type == FileType {
cName := path.Clean(e.Name)
cName := filepath.Clean(e.GetName())
if _, ok := jp.seen[cName]; ok {
return -1, ErrDuplicatePath
}
jp.seen[cName] = emptyByte
jp.seen[cName] = struct{}{}
}
e.Position = jp.pos
@ -117,7 +104,7 @@ func (jp *jsonPacker) AddEntry(e Entry) (int, error) {
return e.Position, nil
}
// NewJSONPacker provides an Packer that writes each Entry (SegmentType and
// NewJSONPacker provides a Packer that writes each Entry (SegmentType and
// FileType) as a json document.
//
// The Entries are delimited by new line.

View file

@ -4,6 +4,8 @@ import (
"bytes"
"compress/gzip"
"io"
"io/ioutil"
"os"
"testing"
)
@ -159,5 +161,58 @@ func TestGzip(t *testing.T) {
if len(entries) != len(e) {
t.Errorf("expected %d entries, got %d", len(e), len(entries))
}
}
func BenchmarkGetPut(b *testing.B) {
e := []Entry{
Entry{
Type: SegmentType,
Payload: []byte("how"),
},
Entry{
Type: SegmentType,
Payload: []byte("y'all"),
},
Entry{
Type: FileType,
Name: "./hurr.txt",
Payload: []byte("deadbeef"),
},
Entry{
Type: SegmentType,
Payload: []byte("doin"),
},
}
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
func() {
fh, err := ioutil.TempFile("", "tar-split.")
if err != nil {
b.Fatal(err)
}
defer os.Remove(fh.Name())
defer fh.Close()
jp := NewJSONPacker(fh)
for i := range e {
if _, err := jp.AddEntry(e[i]); err != nil {
b.Fatal(err)
}
}
fh.Sync()
up := NewJSONUnpacker(fh)
for {
_, err := up.Next()
if err != nil {
if err == io.EOF {
break
}
b.Fatal(err)
}
}
}()
}
})
}

84
tar_benchmark_test.go Normal file
View file

@ -0,0 +1,84 @@
package tartest
import (
"io"
"io/ioutil"
"os"
"testing"
upTar "archive/tar"
ourTar "github.com/vbatts/tar-split/archive/tar"
)
var testfile = "./archive/tar/testdata/sparse-formats.tar"
func BenchmarkUpstreamTar(b *testing.B) {
for n := 0; n < b.N; n++ {
fh, err := os.Open(testfile)
if err != nil {
b.Fatal(err)
}
tr := upTar.NewReader(fh)
for {
_, err := tr.Next()
if err != nil {
if err == io.EOF {
break
}
fh.Close()
b.Fatal(err)
}
io.Copy(ioutil.Discard, tr)
}
fh.Close()
}
}
func BenchmarkOurTarNoAccounting(b *testing.B) {
for n := 0; n < b.N; n++ {
fh, err := os.Open(testfile)
if err != nil {
b.Fatal(err)
}
tr := ourTar.NewReader(fh)
tr.RawAccounting = false // this is default, but explicit here
for {
_, err := tr.Next()
if err != nil {
if err == io.EOF {
break
}
fh.Close()
b.Fatal(err)
}
io.Copy(ioutil.Discard, tr)
}
fh.Close()
}
}
func BenchmarkOurTarYesAccounting(b *testing.B) {
for n := 0; n < b.N; n++ {
fh, err := os.Open(testfile)
if err != nil {
b.Fatal(err)
}
tr := ourTar.NewReader(fh)
tr.RawAccounting = true // This enables mechanics for collecting raw bytes
for {
_ = tr.RawBytes()
_, err := tr.Next()
_ = tr.RawBytes()
if err != nil {
if err == io.EOF {
break
}
fh.Close()
b.Fatal(err)
}
io.Copy(ioutil.Discard, tr)
_ = tr.RawBytes()
}
fh.Close()
}
}

4
version/gen.go Normal file
View file

@ -0,0 +1,4 @@
package version
// from `go get github.com/vbatts/go-get-version`
//go:generate go-get-version -package version -variable VERSION -output version.go

7
version/version.go Normal file
View file

@ -0,0 +1,7 @@
package version
// AUTO-GENEREATED. DO NOT EDIT
// 2016-09-26 19:53:30.825879 -0400 EDT
// VERSION is the generated version from /home/vbatts/src/vb/tar-split/version
var VERSION = "v0.10.1-4-gf280282"