Merge pull request #2 from vbatts/findbase

base: much needed love
This commit is contained in:
Vincent Batts 2016-09-13 11:43:52 -04:00 committed by GitHub
commit 7271153d2a
9 changed files with 300 additions and 119 deletions

View File

@ -14,22 +14,24 @@ import (
"github.com/vbatts/dedupe-linker/file"
)
// NewBase populates the directories needed in a dedupe-base directory
func NewBase(path string, hashName string) (*Base, error) {
root := filepath.Join(path, "dedup")
for _, p := range []string{"blobs/" + hashName, "state", "tmp"} {
if err := os.MkdirAll(filepath.Join(root, p), 0755); err != nil && !os.IsExist(err) {
if err := os.MkdirAll(filepath.Join(path, p), 0755); err != nil && !os.IsExist(err) {
return nil, err
}
}
return &Base{Path: root, HashName: hashName, Hash: cryptomap.DetermineHash(hashName)}, nil
return &Base{Path: path, HashName: hashName, Hash: cryptomap.DetermineHash(hashName)}, nil
}
// Base is the destination for all hardlinks, where stored objects are kept in a content-addressable tree.
type Base struct {
Path string
HashName string
Hash crypto.Hash
}
// Stat provides the os.FileInfo for the object of `sum` address
func (b Base) Stat(sum string) (os.FileInfo, error) {
return os.Stat(b.blobPath(sum))
}
@ -41,12 +43,15 @@ func (b Base) blobPath(sum string) string {
return filepath.Join(b.Path, "blobs", b.HashName, sum[0:2], sum)
}
type ReaderSeekerCloser interface {
// ReadSeekCloser is like an io.ReadCloser, but can Seek too
type ReadSeekCloser interface {
io.Reader
io.Seeker
io.Closer
}
// SameFile checks whether the object of `sum` address, and `path` file path are the same file.
// This checks by inode and device.
func (b Base) SameFile(sum, path string) bool {
var (
bInode, dInode uint64
@ -66,7 +71,7 @@ func (b Base) SameFile(sum, path string) bool {
}
// GetBlob opens the blob of address `sum` for reading
func (b Base) GetBlob(sum string) (ReaderSeekerCloser, error) {
func (b Base) GetBlob(sum string) (ReadSeekCloser, error) {
return os.Open(b.blobPath(sum))
}
@ -118,7 +123,8 @@ func (b Base) tmpFile() (*os.File, error) {
return ioutil.TempFile(filepath.Join(b.Path, "tmp"), "put")
}
// Hard link the file from src to the blob for sum
// LinkFrom makes a hard link from the file at `src` to the blob of address `sum`.
// TODO this function is going away, because it makes no assessment of the checksum of `src`
func (b Base) LinkFrom(src, sum string) error {
if err := os.MkdirAll(filepath.Dir(b.blobPath(sum)), 0756); err != nil && !os.IsExist(err) {
return err
@ -135,7 +141,8 @@ func randomString() (string, error) {
return fmt.Sprintf("%x", buf), nil
}
// SafeLink overrides newname if it already exists. If there is an error in creating the link, the transaction is rolled back
// SafeLink overrides newname if it already exists. If there is an error in
// creating the link, the transaction is rolled back
func SafeLink(oldname, newname string) error {
var backupName string
// check if newname exists
@ -169,7 +176,7 @@ func SafeLink(oldname, newname string) error {
return nil
}
// Hard link the file for sum to the path at dest
// LinkTo makes a hard link from the blob of address `sum` to the path at `dest`
func (b Base) LinkTo(dest, sum string) error {
if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil && !os.IsExist(err) {
return err

View File

@ -2,7 +2,6 @@ package base
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"path"
@ -50,7 +49,7 @@ func TestGetPut(t *testing.T) {
t.Fatal(err)
}
rHash := "deadbeaf"
rHash := "8f074e76e82ae6156c451019840a6f857bbe5157"
rMsg := "this is the dead beef"
r := bytes.NewReader([]byte(rMsg))
@ -58,12 +57,18 @@ func TestGetPut(t *testing.T) {
if err != nil {
t.Error(err)
}
if sum != rHash {
t.Errorf("expected %q; got %q", rHash, sum)
}
fi, err := b.Stat(rHash)
if err != nil {
t.Error(err)
}
fmt.Printf("%#v\n", fi.Sys())
if fi == nil {
t.Fatal("did not find the blob " + rHash)
}
//fmt.Printf("%#v\n", fi.Sys())
if err = b.LinkTo(path.Join(srcDir, "beef1.txt"), rHash); err != nil {
t.Error(err)
@ -72,7 +77,10 @@ func TestGetPut(t *testing.T) {
if err != nil {
t.Error(err)
}
fmt.Printf("%#v\n", fi2.Sys())
if fi2 == nil {
t.Fatal("did not find the linked file " + path.Join(srcDir, "beef1.txt"))
}
//fmt.Printf("%#v\n", fi2.Sys())
if err = b.LinkTo(path.Join(srcDir, "beef1.txt"), rHash); err != nil && !os.IsExist(err) {
t.Error(err)

61
base/findbase.go Normal file
View File

@ -0,0 +1,61 @@
package base
import (
"io/ioutil"
"os"
"path/filepath"
"syscall"
"github.com/vbatts/dedupe-linker/file"
)
// FindBase steps up the directory tree to find the top-level directory that
// is still on the same device as the path provided.
//
// If path is not a directory (symlinks are not followed), the search starts
// from its parent directory. The returned path is absolute. An error is
// returned if path, or any ancestor that has to be inspected, cannot be
// lstat'ed.
func FindBase(path string) (string, error) {
	stat, err := os.Lstat(path)
	if err != nil {
		return "", err
	}
	if !stat.IsDir() {
		return FindBase(filepath.Dir(path))
	}

	// os.FileInfo only carries the base name, so the walk has to be driven
	// by the full path rather than by stat.Name().
	abs, err := filepath.Abs(path)
	if err != nil {
		return "", err
	}
	return findBaseFrom(abs, stat)
}

// findBaseInfo is kept for callers that only hold an os.FileInfo. Because
// stat.Name() is just a base name, it is resolved relative to the current
// working directory; prefer FindBase, which knows the real path.
func findBaseInfo(stat os.FileInfo) (string, error) {
	abs, err := filepath.Abs(stat.Name())
	if err != nil {
		return "", err
	}
	return findBaseFrom(abs, stat)
}

// findBaseFrom climbs from dir (whose file info is stat) towards the
// filesystem root and returns the last directory that sameDevice reports to
// be on the same device as its child.
func findBaseFrom(dir string, stat os.FileInfo) (string, error) {
	for {
		parent := filepath.Dir(dir)
		if parent == dir {
			// filepath.Dir is a fixed point at the root: nowhere left to climb.
			return dir, nil
		}
		parentStat, err := os.Lstat(parent)
		if err != nil {
			return "", err
		}
		if !sameDevice(stat, parentStat) {
			// The parent lives on another device, so dir is the top-level.
			return dir, nil
		}
		dir, stat = parent, parentStat
	}
}
// hasPermission reports whether a file can be created in the directory at
// path (or, when path is not a directory, in its parent directory).
//
// The check is done by actually creating a temporary file named
// "perm.test.*" and removing it again. Any failure to lstat path or to create
// the file is reported as false.
func hasPermission(path string) bool {
	stat, err := os.Lstat(path)
	if err != nil {
		return false
	}
	if !stat.IsDir() {
		path = filepath.Dir(path)
	}
	fh, err := ioutil.TempFile(path, "perm.test.")
	if err != nil {
		return false
	}
	// Close the handle before removing; otherwise every probe leaks a file
	// descriptor. Cleanup is best-effort: the probe itself already succeeded.
	fh.Close()
	os.Remove(fh.Name())
	return true
}
// sameDevice reports whether file1 and file2 live on the same device, by
// comparing the major and minor numbers of their stat_t.dev values.
//
// If either os.FileInfo does not carry a *syscall.Stat_t (Sys() may return
// nil or another type), the answer is false rather than a panic, so callers
// conservatively treat the two files as being on different devices.
func sameDevice(file1, file2 os.FileInfo) bool {
	sys1, ok := file1.Sys().(*syscall.Stat_t)
	if !ok {
		return false
	}
	sys2, ok := file2.Sys().(*syscall.Stat_t)
	if !ok {
		return false
	}
	return file.MajorDev(sys1.Dev) == file.MajorDev(sys2.Dev) &&
		file.MinorDev(sys1.Dev) == file.MinorDev(sys2.Dev)
}

76
base/findbase_test.go Normal file
View File

@ -0,0 +1,76 @@
package base
import (
"io/ioutil"
"os"
"testing"
)
// TestHasPermission checks that hasPermission accepts the system temporary
// directory and rejects the filesystem root.
func TestHasPermission(t *testing.T) {
	// Use the platform's temp dir instead of assuming "/tmp" exists.
	tmp := os.TempDir()
	if !hasPermission(tmp) {
		t.Errorf("expected to have permission to %s, but did not", tmp)
	}

	// root can write anywhere, so the negative check is meaningless for it.
	if os.Geteuid() == 0 {
		t.Skip("running as root; cannot test for a directory without write permission")
	}
	if hasPermission("/") {
		t.Error("expected to not have permission to /, but did")
	}
}
// TestSameDev creates two temporary files in the default temp directory and
// expects sameDevice to report them as being on the same device.
func TestSameDev(t *testing.T) {
	file1, err := ioutil.TempFile("", "test")
	if err != nil {
		t.Fatal(err)
	}
	// Remove by the full path (file1.Name()); os.FileInfo.Name() is only the
	// base name. Defers run last-in-first-out, so Close happens before Remove.
	defer os.Remove(file1.Name())
	defer file1.Close()

	file2, err := ioutil.TempFile("", "test")
	if err != nil {
		t.Fatal(err)
	}
	defer os.Remove(file2.Name())
	defer file2.Close()

	stat1, err := file1.Stat()
	if err != nil {
		t.Fatal(err)
	}
	stat2, err := file2.Stat()
	if err != nil {
		t.Fatal(err)
	}

	if !sameDevice(stat1, stat2) {
		t.Errorf("expected the two files to be on same device. But %q and %q are not", file1.Name(), file2.Name())
	}
}
// testNotSameDev expects a file under /tmp and a file under $HOME to be on
// different devices.
//
// It is deliberately not exported as a Test function: perhaps this is naive,
// but Travis' /tmp is on the same device as $HOME and it is not clear how to
// request that it be tmpfs without needing sudo.
// NOTE(review): if $HOME is unset, ioutil.TempFile falls back to the default
// temp directory, which makes the comparison meaningless.
func testNotSameDev(t *testing.T) {
	file1, err := ioutil.TempFile("/tmp", "test")
	if err != nil {
		t.Fatal(err)
	}
	// Remove by the full path (file1.Name()); os.FileInfo.Name() is only the
	// base name. Defers run last-in-first-out, so Close happens before Remove.
	defer os.Remove(file1.Name())
	defer file1.Close()

	file2, err := ioutil.TempFile(os.Getenv("HOME"), "test")
	if err != nil {
		t.Fatal(err)
	}
	defer os.Remove(file2.Name())
	defer file2.Close()

	stat1, err := file1.Stat()
	if err != nil {
		t.Fatal(err)
	}
	stat2, err := file2.Stat()
	if err != nil {
		t.Fatal(err)
	}

	if sameDevice(stat1, stat2) {
		t.Errorf("expected the two files _not_ to be on same device. But %q and %q are", file1.Name(), file2.Name())
	}
}

View File

@ -2,36 +2,37 @@ package cryptomap
import (
"crypto"
"log"
"strings"
// Importing all the currently supported hashes
_ "crypto/md5"
_ "crypto/sha1"
_ "crypto/sha256"
_ "crypto/sha512"
"log"
"strings"
)
var knownCiphers = map[string]crypto.Hash{
"md5": crypto.MD5,
// DefaultCipher is the crypto cipher default used if none is specified or
// specified is unknown.
var DefaultCipher = "sha1"
// Ciphers is the known set of mappings for string to crypto.Hash
// use an init() to add custom hash ciphers
var Ciphers = map[string]crypto.Hash{
"md5": crypto.MD5,
"sha1": crypto.SHA1,
"sha224": crypto.SHA224,
"sha256": crypto.SHA256,
"sha384": crypto.SHA384,
"sha512": crypto.SHA512,
}
// DetermineHash takes a generic string, like "sha1" and returns the
// corresponding crypto.Hash
func DetermineHash(str string) (h crypto.Hash) {
switch strings.ToLower(str) {
case "md5":
h = crypto.MD5
case "sha1":
h = crypto.SHA1
case "sha224":
h = crypto.SHA224
case "sha256":
h = crypto.SHA256
case "sha384":
h = crypto.SHA384
case "sha512":
h = crypto.SHA512
default:
log.Printf("WARNING: unknown cipher %q. using 'sha1'", str)
h = crypto.SHA1
if h, ok := Ciphers[strings.ToLower(str)]; ok {
return h
}
return h
log.Printf("WARNING: unknown cipher %q. using %q", str, DefaultCipher)
return Ciphers[DefaultCipher]
}

94
file/dev.go Normal file
View File

@ -0,0 +1,94 @@
package file
import (
"fmt"
"os"
"syscall"
)
// SameInodePaths reports whether path1 and path2 have the same inode number,
// as returned by GetInode. Only the inode numbers are compared; the device is
// not taken into account. The first lookup error, if any, is returned with
// match set to false.
func SameInodePaths(path1, path2 string) (match bool, err error) {
	first, err := GetInode(path1)
	if err != nil {
		return false, err
	}
	second, err := GetInode(path2)
	if err != nil {
		return false, err
	}
	match = first == second
	return match, nil
}
// SameDevPaths reports whether path1 and path2 have the same device number,
// as returned by GetDev. The first lookup error, if any, is returned with
// match set to false.
func SameDevPaths(path1, path2 string) (match bool, err error) {
	first, err := GetDev(path1)
	if err != nil {
		return false, err
	}
	second, err := GetDev(path2)
	if err != nil {
		return false, err
	}
	match = first == second
	return match, nil
}
// FormatDev renders the device number held in stat as the decimal string
// "major:minor", using MajorDev and MinorDev to split stat.Dev. Despite the
// scary name, nothing is formatted on disk.
func FormatDev(stat *syscall.Stat_t) string {
	major := MajorDev(stat.Dev)
	minor := MinorDev(stat.Dev)
	return fmt.Sprintf("%d:%d", major, minor)
}
// MajorDev provides the major device number from a stat_t.dev value.
// Bits 8-19 of dev become bits 0-11 of the result, and bits 44-63 of dev
// become bits 12-31.
func MajorDev(dev uint64) uint64 {
	low := (dev & 0x00000000000fff00) >> 8
	high := (dev & 0xfffff00000000000) >> 32
	return low | high
}
// MinorDev provides the minor device number from a stat_t.dev value.
// Bits 0-7 of dev become bits 0-7 of the result, and bits 20-43 of dev become
// bits 8-31.
//
// Bits 44-63 of dev belong to the major number (see MajorDev) and are masked
// out here, so the result always fits in 32 bits.
func MinorDev(dev uint64) uint64 {
	low := dev & 0x00000000000000ff
	high := (dev & 0x00000ffffff00000) >> 12
	return low | high
}
// GetLstat returns the system stat_t for the file at path.
// (symlinks are not dereferenced)
//
// An error is returned if path cannot be lstat'ed, or if the platform's
// os.FileInfo does not carry a *syscall.Stat_t.
func GetLstat(path string) (*syscall.Stat_t, error) {
	fi, err := os.Lstat(path)
	if err != nil {
		return nil, err
	}
	// Sys() may return nil or another type; report that instead of panicking.
	stat, ok := fi.Sys().(*syscall.Stat_t)
	if !ok {
		return nil, fmt.Errorf("lstat %q: unsupported file info type %T", path, fi.Sys())
	}
	return stat, nil
}
// GetStat returns the system stat_t for the file at path.
// (symlinks are dereferenced)
//
// An error is returned if path cannot be stat'ed, or if the platform's
// os.FileInfo does not carry a *syscall.Stat_t.
func GetStat(path string) (*syscall.Stat_t, error) {
	fi, err := os.Stat(path)
	if err != nil {
		return nil, err
	}
	// Sys() may return nil or another type; report that instead of panicking.
	stat, ok := fi.Sys().(*syscall.Stat_t)
	if !ok {
		return nil, fmt.Errorf("stat %q: unsupported file info type %T", path, fi.Sys())
	}
	return stat, nil
}
// GetInode returns the inode number (stat_t.Ino) of the file at path, as
// reported by GetStat. On failure the inode is 0 and the GetStat error is
// passed through.
func GetInode(path string) (uint64, error) {
	st, err := GetStat(path)
	if err == nil {
		return st.Ino, nil
	}
	return 0, err
}
// GetDev returns the device number (stat_t.Dev) of the file at path, as
// reported by GetStat. On failure the device is 0 and the GetStat error is
// passed through.
func GetDev(path string) (uint64, error) {
	stat, err := GetStat(path)
	if err != nil {
		return 0, err
	}
	// The width of Stat_t.Dev differs between GOOS/GOARCH combinations;
	// convert explicitly so this compiles everywhere.
	return uint64(stat.Dev), nil
}
// GetNlink returns the link count (stat_t.Nlink) of the file at path, as
// reported by GetStat. For regular files, that is the number of hardlinks.
// NOTE(review): for directories the meaning is filesystem dependent (often
// the number of subdirectories plus two), not necessarily the number of
// entries — confirm before relying on it.
//
// On failure the count is 0 and the GetStat error is passed through.
func GetNlink(path string) (uint64, error) {
	stat, err := GetStat(path)
	if err != nil {
		return 0, err
	}
	// The width of Stat_t.Nlink differs between GOOS/GOARCH combinations
	// (e.g. it is narrower than uint64 on linux/arm64); convert explicitly so
	// this compiles everywhere.
	return uint64(stat.Nlink), nil
}

View File

@ -6,11 +6,13 @@ import (
"io"
"os"
"path/filepath"
"syscall"
"time"
)
type FileHashInfo struct {
// HashInfo tracks the information regarding a file, its checksum
// and status.
// If Err is set then the caller must take an appropriate action.
type HashInfo struct {
HashType crypto.Hash
Hash string
Path string
@ -19,8 +21,10 @@ type FileHashInfo struct {
Err error
}
func HashFileGetter(path string, hash crypto.Hash, workers int, done <-chan struct{}) <-chan FileHashInfo {
out := make(chan FileHashInfo, workers)
// HashFileGetter walks the provided `path` with `workers` number of threads.
// The channel of HashInfo are for each regular file encountered.
func HashFileGetter(path string, hash crypto.Hash, workers int, done <-chan struct{}) <-chan HashInfo {
out := make(chan HashInfo, workers)
go func() {
err := filepath.Walk(path, func(path string, info os.FileInfo, err error) error {
if err != nil {
@ -39,15 +43,15 @@ func HashFileGetter(path string, hash crypto.Hash, workers int, done <-chan stru
}
})
if err != nil {
out <- FileHashInfo{Err: err}
out <- HashInfo{Err: err}
}
close(out)
}()
return out
}
func hashFile(path string, hash crypto.Hash, info os.FileInfo) *FileHashInfo {
fhi := FileHashInfo{HashType: hash, Path: path, ModTime: info.ModTime(), Size: info.Size()}
func hashFile(path string, hash crypto.Hash, info os.FileInfo) *HashInfo {
fhi := HashInfo{HashType: hash, Path: path, ModTime: info.ModTime(), Size: info.Size()}
h := hash.New()
fh, err := os.Open(path)
if err != nil {
@ -63,74 +67,3 @@ func hashFile(path string, hash crypto.Hash, info os.FileInfo) *FileHashInfo {
fhi.Hash = fmt.Sprintf("%x", h.Sum(nil))
return &fhi
}
// SameInodePaths checks whether path1 and path2 are the same inode
func SameInodePaths(path1, path2 string) (match bool, err error) {
var inode1, inode2 uint64
if inode1, err = GetInode(path1); err != nil {
return false, err
}
if inode2, err = GetInode(path2); err != nil {
return false, err
}
return inode1 == inode2, nil
}
// SameInodePaths checks whether path1 and path2 are on the same device
func SameDevPaths(path1, path2 string) (match bool, err error) {
var dev1, dev2 uint64
if dev1, err = GetDev(path1); err != nil {
return false, err
}
if dev2, err = GetDev(path2); err != nil {
return false, err
}
return dev1 == dev2, nil
}
func FormatDev(stat *syscall.Stat_t) string {
return fmt.Sprintf("%d:%d", MajorDev(stat.Dev), MinorDev(stat.Dev))
}
func MajorDev(dev uint64) uint64 {
return (((dev >> 8) & 0xfff) | ((dev >> 32) & ^uint64(0xfff)))
}
func MinorDev(dev uint64) uint64 {
return ((dev & 0xff) | ((dev >> 12) & ^uint64(0xff)))
}
func GetStat(path string) (*syscall.Stat_t, error) {
fi, err := os.Stat(path)
if err != nil {
return nil, err
}
return fi.Sys().(*syscall.Stat_t), nil
}
// GetInode returns the inode for path
func GetInode(path string) (uint64, error) {
stat, err := GetStat(path)
if err != nil {
return 0, err
}
return stat.Ino, nil
}
// GetDev returns the device for path
func GetDev(path string) (uint64, error) {
stat, err := GetStat(path)
if err != nil {
return 0, err
}
return stat.Dev, nil
}
// GetNlink returns the number of links for path
func GetNlink(path string) (uint64, error) {
stat, err := GetStat(path)
if err != nil {
return 0, err
}
return stat.Nlink, nil
}

10
main.go
View File

@ -14,10 +14,10 @@ import (
)
var (
varBaseDir = filepath.Join(os.Getenv("HOME"), ".local/dedupe-linker/var")
varBaseDir = filepath.Join(os.Getenv("HOME"), ".dedupe-linker/")
flVarBase = flag.String("b", varBaseDir, "base directory where files are duplicated")
flCipher = flag.String("c", "sha1", "block cipher to use (sha1, or sha256)")
flCipher = flag.String("c", cryptomap.DefaultCipher, "block cipher to use (sha1, or sha256)")
flWorkers = flag.Int("w", 2, "workers to do summing")
flNoop = flag.Bool("noop", false, "don't do any moving or linking")
flDebug = flag.Bool("debug", false, "enable debug output")
@ -53,8 +53,8 @@ func main() {
var (
hash = cryptomap.DetermineHash(*flCipher)
//infos = []*file.FileHashInfo{}
//results := make(chan file.FileHashInfo, 2)
//infos = []*file.HashInfo{}
//results := make(chan file.HashInfo, 2)
)
for _, arg := range flag.Args() {
@ -77,7 +77,7 @@ func main() {
fmt.Printf("%s [%d] %s\n", fi.Hash, fi.Size, fi.Path)
} else {
if os.Getenv("DEBUG") != "" {
fmt.Printf("%q: %q\n", fi.Path, ourbase.HasBlob(fi.Hash))
fmt.Printf("%q: %t\n", fi.Path, ourbase.HasBlob(fi.Hash))
}
if ourbase.HasBlob(fi.Hash) && !ourbase.SameFile(fi.Hash, fi.Path) {
if err := ourbase.LinkTo(fi.Path, fi.Hash); err != nil {

View File

@ -1,14 +1,15 @@
// Package walker is a work-in-progress
package walker
import (
"github.com/vbatts/dedupe-linker/base"
)
type Walker struct {
type walker struct {
Base *base.Base
}
func (w Walker) Walk(path string, quit chan int) error {
func (w walker) Walk(path string, quit chan int) error {
// XXX what is going on here?
select {
case <-quit: