1
0
Fork 1
mirror of https://github.com/vbatts/tar-split.git synced 2024-11-15 12:58:38 +00:00
tar-split/tar/storage/packer.go
Vincent Batts 032efafc29 tar/storage: work with raw (invalid utf8) names
When the entry name is not UTF-8, for example ISO-8859-1, then store the
raw bytes.
To accommodate this, we will have getters and setters for the entry's
name now. Since this most heavily affects the json marshalling, we'll
double check the sanity of the name before storing it in the JSONPacker.
2015-09-23 15:27:33 -04:00

146 lines
3.2 KiB
Go

package storage
import (
"bufio"
"encoding/json"
"errors"
"io"
"path/filepath"
"github.com/vbatts/tar-split/tar/common"
)
// ErrDuplicatePath occurs when a tar archive has more than one entry for the
// same file path
var ErrDuplicatePath = errors.New("duplicates of file paths not supported")
// Packer describes the methods to pack Entries to a storage destination
type Packer interface {
// AddEntry packs the Entry and returns its position
AddEntry(e Entry) (int, error)
}
// Unpacker describes the methods to read Entries from a source
type Unpacker interface {
// Next returns the next Entry being unpacked, or error, until io.EOF
Next() (*Entry, error)
}
/* TODO(vbatts) figure out a good model for this
type PackUnpacker interface {
Packer
Unpacker
}
*/
type jsonUnpacker struct {
r io.Reader
b *bufio.Reader
isEOF bool
seen seenNames
}
func (jup *jsonUnpacker) Next() (*Entry, error) {
var e Entry
if jup.isEOF {
// since ReadBytes() will return read bytes AND an EOF, we handle it this
// round-a-bout way so we can Unmarshal the tail with relevant errors, but
// still get an io.EOF when the stream is ended.
return nil, io.EOF
}
line, err := jup.b.ReadBytes('\n')
if err != nil && err != io.EOF {
return nil, err
} else if err == io.EOF {
jup.isEOF = true
}
err = json.Unmarshal(line, &e)
if err != nil && jup.isEOF {
// if the remainder actually _wasn't_ a remaining json structure, then just EOF
return nil, io.EOF
}
// check for dup name
if e.Type == FileType {
cName := filepath.Clean(e.GetName())
if _, ok := jup.seen[cName]; ok {
return nil, ErrDuplicatePath
}
jup.seen[cName] = struct{}{}
}
return &e, err
}
// NewJSONUnpacker provides an Unpacker that reads Entries (SegmentType and
// FileType) as a json document.
//
// Each Entry read are expected to be delimited by new line.
func NewJSONUnpacker(r io.Reader) Unpacker {
return &jsonUnpacker{
r: r,
b: bufio.NewReader(r),
seen: seenNames{},
}
}
type jsonPacker struct {
w io.Writer
e *json.Encoder
pos int
seen seenNames
}
type seenNames map[string]struct{}
func (jp *jsonPacker) AddEntry(e Entry) (int, error) {
// if Name is not valid utf8, switch it to raw first.
if e.Name != "" {
if !common.IsValidUtf8String(e.Name) {
e.NameRaw = []byte(e.Name)
e.Name = ""
}
}
// check early for dup name
if e.Type == FileType {
cName := filepath.Clean(e.GetName())
if _, ok := jp.seen[cName]; ok {
return -1, ErrDuplicatePath
}
jp.seen[cName] = struct{}{}
}
e.Position = jp.pos
err := jp.e.Encode(e)
if err != nil {
return -1, err
}
// made it this far, increment now
jp.pos++
return e.Position, nil
}
// NewJSONPacker provides a Packer that writes each Entry (SegmentType and
// FileType) as a json document.
//
// The Entries are delimited by new line.
func NewJSONPacker(w io.Writer) Packer {
return &jsonPacker{
w: w,
e: json.NewEncoder(w),
seen: seenNames{},
}
}
/*
TODO(vbatts) perhaps have a more compact packer/unpacker, maybe using msgapck
(https://github.com/ugorji/go)
Even though, since our jsonUnpacker and jsonPacker just take
io.Reader/io.Writer, then we can get away with passing them a
gzip.Reader/gzip.Writer
*/