tarsum: keep one checksum entry per tar member, even when paths collide.

The name->sum map held by tarSum is replaced with an ordered FileInfoSums
list, so two members that share a path no longer overwrite each other's sum.
Each entry also records its position in the archive.

diff --git a/tarsum/fileinfosums.go b/tarsum/fileinfosums.go
new file mode 100644
index 0000000..f9f4680
--- /dev/null
+++ b/tarsum/fileinfosums.go
@@ -0,0 +1,125 @@
+package tarsum
+
+import "sort"
+
+// FileInfoSumInterface gives read-only access to a file's checksum info, so the actual name and sum cannot be meddled with
+type FileInfoSumInterface interface {
+	// Name returns the file name
+	Name() string
+	// Sum returns the checksum of this particular file and its headers
+	Sum() string
+	// Pos returns the position of the file in the tar
+	Pos() int64
+}
+
+type fileInfoSum struct {
+	name string
+	sum  string
+	pos  int64
+}
+
+func (fis fileInfoSum) Name() string {
+	return fis.name
+}
+func (fis fileInfoSum) Sum() string {
+	return fis.sum
+}
+func (fis fileInfoSum) Pos() int64 {
+	return fis.pos
+}
+
+type FileInfoSums []FileInfoSumInterface
+
+// GetFile returns the first FileInfoSumInterface with a matching name, or nil if there is none
+func (fis FileInfoSums) GetFile(name string) FileInfoSumInterface {
+	for i := range fis {
+		if fis[i].Name() == name {
+			return fis[i]
+		}
+	}
+	return nil
+}
+
+// GetAllFile returns a FileInfoSums with all entries of a matching name (empty, not nil, if there are none)
+func (fis FileInfoSums) GetAllFile(name string) FileInfoSums {
+	f := FileInfoSums{}
+	for i := range fis {
+		if fis[i].Name() == name {
+			f = append(f, fis[i])
+		}
+	}
+	return f
+}
+
+func contains(s []string, e string) bool {
+	for _, a := range s {
+		if a == e {
+			return true
+		}
+	}
+	return false
+}
+
+func (fis FileInfoSums) GetDuplicatePaths() (dups FileInfoSums) {
+	seen := make(map[string]int, len(fis)) // allocate early; no need to grow this map.
+	for i := range fis {
+		f := fis[i]
+		if _, ok := seen[f.Name()]; ok {
+			dups = append(dups, f)
+		} else {
+			seen[f.Name()] = 0 // the value is never read; seen is used as a set
+		}
+	}
+	return dups
+}
+
+func (fis FileInfoSums) Len() int      { return len(fis) }
+func (fis FileInfoSums) Swap(i, j int) { fis[i], fis[j] = fis[j], fis[i] }
+
+func (fis FileInfoSums) SortByPos() {
+	sort.Sort(byPos{fis})
+}
+
+func (fis FileInfoSums) SortByNames() {
+	sort.Sort(byName{fis})
+}
+
+func (fis FileInfoSums) SortBySums() {
+	dups := fis.GetDuplicatePaths()
+	if len(dups) > 0 { // NOTE(review): dups is already nil when no name repeats, so both branches pass the same value
+		sort.Sort(bySum{fis, dups})
+	} else {
+		sort.Sort(bySum{fis, nil})
+	}
+}
+
+// byName is a sort.Sort helper for sorting by file names.
+// If names are the same, order them by their appearance in the tar archive.
+type byName struct{ FileInfoSums }
+
+func (bn byName) Less(i, j int) bool {
+	if bn.FileInfoSums[i].Name() == bn.FileInfoSums[j].Name() {
+		return bn.FileInfoSums[i].Pos() < bn.FileInfoSums[j].Pos()
+	}
+	return bn.FileInfoSums[i].Name() < bn.FileInfoSums[j].Name()
+}
+
+// bySum is a sort.Sort helper for sorting by sum; when dups is non-nil, entries sharing a name are ordered by position instead
+type bySum struct {
+	FileInfoSums
+	dups FileInfoSums
+}
+
+func (bs bySum) Less(i, j int) bool {
+	if bs.dups != nil && bs.FileInfoSums[i].Name() == bs.FileInfoSums[j].Name() {
+		return bs.FileInfoSums[i].Pos() < bs.FileInfoSums[j].Pos()
+	}
+	return bs.FileInfoSums[i].Sum() < bs.FileInfoSums[j].Sum() // NOTE(review): mixed with the pos rule above this may not be a transitive order - confirm
+}
+
+// byPos is a sort.Sort helper for sorting the fileinfos by their original position in the tar archive
+type byPos struct{ FileInfoSums }
+
+func (bp byPos) Less(i, j int) bool {
+	return bp.FileInfoSums[i].Pos() < bp.FileInfoSums[j].Pos()
+}
diff --git a/tarsum/fileinfosums_test.go b/tarsum/fileinfosums_test.go
new file mode 100644
index 0000000..e1c6cc1
--- /dev/null
+++ b/tarsum/fileinfosums_test.go
@@ -0,0 +1,45 @@
+package tarsum
+
+import "testing"
+
+func newFileInfoSums() FileInfoSums {
+	return FileInfoSums{
+		fileInfoSum{name: "file3", sum: "2abcdef1234567890", pos: 2},
+		fileInfoSum{name: "dup1", sum: "deadbeef1", pos: 5},
+		fileInfoSum{name: "file1", sum: "0abcdef1234567890", pos: 0},
+		fileInfoSum{name: "file4", sum: "3abcdef1234567890", pos: 3},
+		fileInfoSum{name: "dup1", sum: "deadbeef0", pos: 4},
+		fileInfoSum{name: "file2", sum: "1abcdef1234567890", pos: 1},
+	}
+}
+
+func TestSortFileInfoSums(t *testing.T) {
+	dups := newFileInfoSums().GetAllFile("dup1")
+	if len(dups) != 2 { // NOTE(review): Errorf does not stop the test; dups[0] below would panic on an empty result
+		t.Errorf("expected length 2, got %d", len(dups))
+	}
+	dups.SortByNames()
+	if dups[0].Pos() != 4 {
+		t.Errorf("sorted dups should be ordered by position. Expected 4, got %d", dups[0].Pos())
+	}
+
+	fis := newFileInfoSums()
+	expected := "0abcdef1234567890"
+	fis.SortBySums()
+	got := fis[0].Sum()
+	if got != expected {
+		t.Errorf("Expected %q, got %q", expected, got)
+	}
+
+	fis = newFileInfoSums()
+	expected = "dup1"
+	fis.SortByNames()
+	gotFis := fis[0]
+	if gotFis.Name() != expected {
+		t.Errorf("Expected %q, got %q", expected, gotFis.Name())
+	}
+	// since a duplicate is first, ensure it is ordered first by position too
+	if gotFis.Pos() != 4 {
+		t.Errorf("Expected %d, got %d", 4, gotFis.Pos())
+	}
+}
diff --git a/tarsum/tarsum.go b/tarsum/tarsum.go
index 69775fa..4ae71f0 100644
--- a/tarsum/tarsum.go
+++ b/tarsum/tarsum.go
@@ -39,7 +39,7 @@ func NewTarSum(r io.Reader, dc bool, v Version) (TarSum, error) {
 // checksums of a tar archive
 type TarSum interface {
 	io.Reader
-	GetSums() map[string]string
+	GetSums() FileInfoSums
 	Sum([]byte) string
 	Version() Version
 }
@@ -54,7 +54,8 @@ type tarSum struct {
 	bufGz       *bytes.Buffer
 	bufData     []byte
 	h           hash.Hash
-	sums        map[string]string
+	sums        FileInfoSums
+	fileCounter int64 // number of entries appended to sums so far; becomes each entry's pos
 	currentFile string
 	finished    bool
 	first       bool
@@ -126,7 +127,7 @@ func (ts *tarSum) Read(buf []byte) (int, error) {
 		ts.h = sha256.New()
 		ts.h.Reset()
 		ts.first = true
-		ts.sums = make(map[string]string)
+		ts.sums = FileInfoSums{}
 	}
 
 	if ts.finished {
@@ -153,7 +154,8 @@ func (ts *tarSum) Read(buf []byte) (int, error) {
 				return 0, err
 			}
 			if !ts.first {
-				ts.sums[ts.currentFile] = hex.EncodeToString(ts.h.Sum(nil))
+				ts.sums = append(ts.sums, fileInfoSum{name: ts.currentFile, sum: hex.EncodeToString(ts.h.Sum(nil)), pos: ts.fileCounter})
+				ts.fileCounter++
 				ts.h.Reset()
 			} else {
 				ts.first = false
@@ -218,25 +220,20 @@ func (ts *tarSum) Read(buf []byte) (int, error) {
 }
 
 func (ts *tarSum) Sum(extra []byte) string {
-	var sums []string
-
-	for _, sum := range ts.sums {
-		sums = append(sums, sum)
-	}
-	sort.Strings(sums)
+	ts.sums.SortBySums() // sorts ts.sums in place, so GetSums returns this order afterwards
 	h := sha256.New()
 	if extra != nil {
 		h.Write(extra)
 	}
-	for _, sum := range sums {
-		log.Debugf("-->%s<--", sum)
-		h.Write([]byte(sum))
+	for _, fis := range ts.sums {
+		log.Debugf("-->%s<--", fis.Sum())
+		h.Write([]byte(fis.Sum()))
 	}
 	checksum := ts.Version().String() + "+sha256:" + hex.EncodeToString(h.Sum(nil))
 	log.Debugf("checksum processed: %s", checksum)
 	return checksum
 }
 
-func (ts *tarSum) GetSums() map[string]string {
+func (ts *tarSum) GetSums() FileInfoSums {
 	return ts.sums
 }
diff --git a/tarsum/tarsum_test.go b/tarsum/tarsum_test.go
index 6616cba..d0b4c94 100644
--- a/tarsum/tarsum_test.go
+++ b/tarsum/tarsum_test.go
@@ -59,6 +59,22 @@ var testLayers = []testLayer{
 	{
 		options: &sizedOptions{1, 1024 * 1024, false, false}, // a 1mb file (in memory)
 		tarsum:  "tarsum+sha256:8bf12d7e67c51ee2e8306cba569398b1b9f419969521a12ffb9d8875e8836738"},
+	{
+		// this tar has two files with the same path
+		filename: "testdata/collision/collision-0.tar",
+		tarsum:   "tarsum+sha256:08653904a68d3ab5c59e65ef58c49c1581caa3c34744f8d354b3f575ea04424a"},
+	{
+		// this tar has the same two files (with the same path), but in reversed order, ensuring it has a different hash than above
+		filename: "testdata/collision/collision-1.tar",
+		tarsum:   "tarsum+sha256:b51c13fbefe158b5ce420d2b930eef54c5cd55c50a2ee4abdddea8fa9f081e0d"},
+	{
+		// this tar is a newer variant of collision-0.tar, ensuring it has a different hash
+		filename: "testdata/collision/collision-2.tar",
+		tarsum:   "tarsum+sha256:381547080919bb82691e995508ae20ed33ce0f6948d41cafbeb70ce20c73ee8e"},
+	{
+		// this tar is a newer variant of collision-1.tar, ensuring it has a different hash
+		filename: "testdata/collision/collision-3.tar",
+		tarsum:   "tarsum+sha256:f886e431c08143164a676805205979cd8fa535dfcef714db5515650eea5a7c0f"},
 }
 
 type sizedOptions struct {
diff --git a/tarsum/testdata/collision/collision-0.tar b/tarsum/testdata/collision/collision-0.tar
new file mode 100644
index 0000000..1c636b3
Binary files /dev/null and b/tarsum/testdata/collision/collision-0.tar differ
diff --git a/tarsum/testdata/collision/collision-1.tar b/tarsum/testdata/collision/collision-1.tar
new file mode 100644
index 0000000..b411be9
Binary files /dev/null and b/tarsum/testdata/collision/collision-1.tar differ
diff --git a/tarsum/testdata/collision/collision-2.tar b/tarsum/testdata/collision/collision-2.tar
new file mode 100644
index 0000000..7b5c04a
Binary files /dev/null and b/tarsum/testdata/collision/collision-2.tar differ
diff --git a/tarsum/testdata/collision/collision-3.tar b/tarsum/testdata/collision/collision-3.tar
new file mode 100644
index 0000000..f8c6458
Binary files /dev/null and b/tarsum/testdata/collision/collision-3.tar differ