
Add tar/asm.IterateHeaders

This allows reading the metadata contained in tar-split
without expensively recreating the whole tar stream
including full contents.

We have two use cases for this:
- In a situation where tar-split is distributed along with
  a separate metadata stream, ensuring that the two are
  exactly consistent
- Reading just the tar headers allows a ~cheap consistency
  check of on-disk layers, verifying that the files exist
  with the expected sizes, without reading the full
  contents (a usage sketch follows below).
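
As a rough illustration of the second use case, here is a minimal sketch of how a
consumer might drive IterateHeaders to verify on-disk file sizes. The package name,
the checkLayerSizes helper, and the tarSplitPath/layerDir parameters are illustrative
assumptions, not part of this commit; the sketch also assumes the tar-split stream is
stored uncompressed.

// Hypothetical consumer sketch (not part of this commit): verify that the
// regular files recorded in a tar-split stream exist on disk with the
// expected sizes, without reading their contents.
package layercheck

import (
    "fmt"
    "os"
    "path/filepath"

    "github.com/vbatts/tar-split/archive/tar"
    "github.com/vbatts/tar-split/tar/asm"
    "github.com/vbatts/tar-split/tar/storage"
)

func checkLayerSizes(tarSplitPath, layerDir string) error {
    f, err := os.Open(tarSplitPath) // assumed to contain uncompressed tar-split JSON
    if err != nil {
        return err
    }
    defer f.Close()

    unpacker := storage.NewJSONUnpacker(f)
    return asm.IterateHeaders(unpacker, func(hdr *tar.Header) error {
        if hdr.Typeflag != tar.TypeReg {
            return nil // only regular files carry payload worth size-checking here
        }
        st, err := os.Stat(filepath.Join(layerDir, hdr.Name))
        if err != nil {
            return err
        }
        if st.Size() != hdr.Size {
            return fmt.Errorf("%s: expected %d bytes, found %d", hdr.Name, hdr.Size, st.Size())
        }
        return nil
    })
}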

This can be implemented outside of this repo, but it's
not ideal:
- The function necessarily hard-codes some assumptions
  about how tar-split determines the boundaries of
  SegmentType/FileType entries (or, indeed, whether it
  uses FileType entries at all). That's best maintained
  directly beside the code that creates this.
- The ExpectedPadding() value is not currently exported,
  so an external consumer would have to heuristically guess
  where the padding ends (the arithmetic is sketched below).
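
For context, the guess an external consumer would have to make is the standard tar
block rounding: file payload is padded up to a multiple of 512 bytes. A minimal
sketch of that arithmetic (illustrative, not taken from this commit):

// Standard tar framing: payload is padded to a 512-byte block boundary.
// Without ExpectedPadding(), a consumer would have to recompute this itself.
func expectedPadding(size int64) int64 {
    return (512 - size%512) % 512
}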

Signed-off-by: Miloslav Trmač <mitr@redhat.com>
Author: Miloslav Trmač, 2024-09-11 19:54:20 +02:00
parent fe4605ae8b
commit 99c8914877
5 changed files with 190 additions and 1 deletion

tar/asm/iterate.go (new file, 57 lines)

@@ -0,0 +1,57 @@
package asm

import (
    "bytes"
    "fmt"
    "io"

    "github.com/vbatts/tar-split/archive/tar"
    "github.com/vbatts/tar-split/tar/storage"
)

// IterateHeaders calls handler for each tar header provided by Unpacker
func IterateHeaders(unpacker storage.Unpacker, handler func(hdr *tar.Header) error) error {
    // We assume about NewInputTarStream:
    // - There is a separate SegmentType entry for every tar header, but only one SegmentType entry for the full header incl. any extensions
    // - (There is a FileType entry for every tar header, we ignore it)
    // - Trailing padding of a file, if any, is included in the next SegmentType entry
    // - At the end, there may be SegmentType entries just for the terminating zero blocks.
    var pendingPadding int64 = 0
    for {
        tsEntry, err := unpacker.Next()
        if err != nil {
            if err == io.EOF {
                return nil
            }
            return fmt.Errorf("reading tar-split entries: %w", err)
        }
        switch tsEntry.Type {
        case storage.SegmentType:
            payload := tsEntry.Payload
            if int64(len(payload)) < pendingPadding {
                return fmt.Errorf("expected %d bytes of padding after previous file, but next SegmentType only has %d bytes", pendingPadding, len(payload))
            }
            payload = payload[pendingPadding:]
            pendingPadding = 0

            tr := tar.NewReader(bytes.NewReader(payload))
            hdr, err := tr.Next()
            if err != nil {
                if err == io.EOF { // Probably the last entry, but let's let the unpacker drive that.
                    break
                }
                return fmt.Errorf("decoding a tar header from a tar-split entry: %w", err)
            }
            if err := handler(hdr); err != nil {
                return err
            }
            pendingPadding = tr.ExpectedPadding()
        case storage.FileType:
            // Nothing
        default:
            return fmt.Errorf("unexpected tar-split entry type %q", tsEntry.Type)
        }
    }
}

tar/asm/iterate_test.go (new file, 119 lines)

@@ -0,0 +1,119 @@
package asm

import (
    "bytes"
    "fmt"
    "io"
    "testing"
    "time"

    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/require"

    "github.com/vbatts/tar-split/archive/tar"
    "github.com/vbatts/tar-split/tar/storage"
)

func createTestTarheader(index int, typeFlag byte, size int64) tar.Header {
    n := (index + 1) * 100 // Use predictable, but distinct, values for all headers
    res := tar.Header{
        Typeflag:   typeFlag,
        Name:       fmt.Sprintf("name%d", n),
        Size:       size,
        Mode:       int64(n + 1),
        Uid:        n + 2,
        Gid:        n + 3,
        Uname:      fmt.Sprintf("user%d", n),
        Gname:      fmt.Sprintf("group%d", n),
        ModTime:    time.Unix(int64(n+4), 0),
        AccessTime: time.Unix(int64(n+5), 0),
        ChangeTime: time.Unix(int64(n+6), 0),
        PAXRecords: map[string]string{fmt.Sprintf("key%d", n): fmt.Sprintf("value%d", n)},
        Format:     tar.FormatPAX, // We must set a format, in the default one AccessTime and ChangeTime are discarded.
    }
    switch res.Typeflag {
    case tar.TypeLink, tar.TypeSymlink:
        res.Linkname = fmt.Sprintf("link%d", n)
    case tar.TypeChar, tar.TypeBlock:
        res.Devmajor = int64(n + 7)
        res.Devminor = int64(n + 8)
    }
    return res
}

func TestIterateHeaders(t *testing.T) {
    entries := []struct {
        typeFlag byte
        size     int64
    }{
        {tar.TypeReg, 0},
        {tar.TypeReg, 1},
        {tar.TypeReg, 511},
        {tar.TypeReg, 512},
        {tar.TypeReg, 513},
        {tar.TypeLink, 0},
        {tar.TypeSymlink, 0},
        {tar.TypeChar, 0},
        {tar.TypeBlock, 0},
        {tar.TypeDir, 0},
        {tar.TypeFifo, 0},
    }

    var tarball bytes.Buffer
    var expected []tar.Header
    w := tar.NewWriter(&tarball)
    for i, e := range entries {
        hdr := createTestTarheader(i, e.typeFlag, e.size)
        err := w.WriteHeader(&hdr)
        require.NoError(t, err)
        data := make([]byte, e.size)
        _, err = w.Write(data)
        require.NoError(t, err)
        expected = append(expected, hdr)
    }
    err := w.Close()
    require.NoError(t, err)

    var tarSplit bytes.Buffer
    tsReader, err := NewInputTarStream(&tarball, storage.NewJSONPacker(&tarSplit), storage.NewDiscardFilePutter())
    require.NoError(t, err)
    _, err = io.Copy(io.Discard, tsReader)
    require.NoError(t, err)

    unpacker := storage.NewJSONUnpacker(&tarSplit)
    var actual []tar.Header
    err = IterateHeaders(unpacker, func(hdr *tar.Header) error {
        actual = append(actual, *hdr)
        return nil
    })
    require.NoError(t, err)

    assert.Equal(t, len(expected), len(actual))
    for i := range expected {
        expected := &expected[i]
        actual := &actual[i]
        assert.Equal(t, expected.Typeflag, actual.Typeflag)
        assert.Equal(t, expected.Name, actual.Name)
        assert.Equal(t, expected.Linkname, actual.Linkname)
        assert.Equal(t, expected.Size, actual.Size)
        assert.Equal(t, expected.Mode, actual.Mode)
        assert.Equal(t, expected.Uid, actual.Uid)
        assert.Equal(t, expected.Gid, actual.Gid)
        assert.Equal(t, expected.Uname, actual.Uname)
        assert.Equal(t, expected.Gname, actual.Gname)
        assert.True(t, actual.ModTime.Equal(expected.ModTime))
        assert.True(t, actual.AccessTime.Equal(expected.AccessTime))
        assert.True(t, actual.ChangeTime.Equal(expected.ChangeTime))
        assert.Equal(t, expected.Devmajor, actual.Devmajor)
        assert.Equal(t, expected.Devminor, actual.Devminor)
        assert.Equal(t, expected.Xattrs, actual.Xattrs) //nolint:staticcheck // We do want a comprehensive coverage in this test.
        // We can't compare PAXRecords for complete equality, because tar.Writer adds atime and ctime entries. So ensure all expected records are present.
        for k, v := range expected.PAXRecords {
            v2, ok := actual.PAXRecords[k]
            assert.True(t, ok, k)
            assert.Equal(t, v, v2)
        }
        assert.Equal(t, expected.Format, actual.Format)
    }
}