From 842fa03b61f8e7a760905d3aae5aa38824f2788c Mon Sep 17 00:00:00 2001
From: Vincent Batts
Date: Fri, 12 Sep 2014 16:10:10 -0400
Subject: [PATCH] reorg

---
 base/base.go |  31 ++++++++
 crypto.go    |  36 ++++++++++
 file/hash.go | 168 +++++++++++++++++++++++++++++++++++++++++++
 main.go      | 194 +++++---------------------------------------------
 4 files changed, 252 insertions(+), 177 deletions(-)
 create mode 100644 base/base.go
 create mode 100644 crypto.go
 create mode 100644 file/hash.go

diff --git a/base/base.go b/base/base.go
new file mode 100644
index 0000000..a22d27c
--- /dev/null
+++ b/base/base.go
@@ -0,0 +1,31 @@
+// Package base sets up and queries the on-disk base directory used for
+// deduplication.
+package base
+
+import (
+	"crypto"
+	"os"
+	"path/filepath"
+)
+
+// InitVarBase creates the dedup/blobs and dedup/state directories under base
+// with mode 0755, returning the first error from os.MkdirAll.
+func InitVarBase(base string) error {
+	for _, path := range []string{"dedup/blobs", "dedup/state"} {
+		if err := os.MkdirAll(filepath.Join(base, path), 0755); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Base refers to a base directory by its Path.
+type Base struct {
+	Path string
+}
+
+// HasBlob presumably reports whether a blob with the given hash type and sum
+// exists under the base; for now it is a stub that always returns true.
+func (b Base) HasBlob(hashType crypto.Hash, sum string) bool {
+	return true
+}
diff --git a/crypto.go b/crypto.go
new file mode 100644
index 0000000..feaf008
--- /dev/null
+++ b/crypto.go
@@ -0,0 +1,36 @@
+package main
+
+import (
+	"crypto"
+	_ "crypto/md5"
+	_ "crypto/sha1"
+	_ "crypto/sha256"
+	_ "crypto/sha512"
+	"log"
+	"strings"
+)
+
+// DetermineHash maps a hash name (md5, sha1, sha224, sha256, sha384 or sha512,
+// matched case-insensitively) to its crypto.Hash. An unknown name is logged as
+// a warning and SHA1 is returned instead.
+func DetermineHash(str string) (h crypto.Hash) {
+	switch strings.ToLower(str) {
+	case "md5":
+		h = crypto.MD5
+	case "sha1":
+		h = crypto.SHA1
+	case "sha224":
+		h = crypto.SHA224
+	case "sha256":
+		h = crypto.SHA256
+	case "sha384":
+		h = crypto.SHA384
+	case "sha512":
+		h = crypto.SHA512
+	default:
+		log.Printf("WARNING: unknown cipher %q. using 'sha1'", str)
+		h = crypto.SHA1
+	}
+
+	return h
+}
diff --git a/file/hash.go b/file/hash.go
new file mode 100644
index 0000000..3972b53
--- /dev/null
+++ b/file/hash.go
@@ -0,0 +1,168 @@
+// Package file hashes files and inspects their inode and device metadata.
+package file
+
+import (
+	"crypto"
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"sync"
+	"syscall"
+	"time"
+)
+
+// FileHashInfo is the result of hashing one file: the hash type, the
+// hex-encoded sum, the file's path and modification time, or Err on failure.
+type FileHashInfo struct {
+	HashType crypto.Hash
+	Hash     string
+	Path     string
+	ModTime  time.Time
+	Err      error
+}
+
+// HashFileGetter walks path and hashes every regular file with hash, one
+// goroutine per file, sending each result on the returned channel. The channel
+// is buffered to workers entries and closed once every hashing goroutine has
+// finished. A walk error is sent as a FileHashInfo with only Err set. Once
+// done is readable the walk stops with a "walk canceled" error and goroutines
+// blocked on sending a result give up.
+// NOTE(review): workers only sizes the buffer; goroutines are not bounded.
+func HashFileGetter(path string, hash crypto.Hash, workers int, done <-chan struct{}) <-chan FileHashInfo {
+	out := make(chan FileHashInfo, workers)
+	go func() {
+		var wg sync.WaitGroup
+		err := filepath.Walk(path, func(path string, info os.FileInfo, err error) error {
+			if err != nil {
+				return err
+			}
+			if !info.Mode().IsRegular() {
+				return nil
+			}
+			wg.Add(1)
+			go func() {
+				fhi := hashFile(path, hash, info)
+				select {
+				case out <- *fhi:
+				case <-done:
+				}
+				wg.Done()
+			}()
+			select {
+			case <-done:
+				return fmt.Errorf("walk canceled")
+			default:
+				return nil
+			}
+		})
+		if err != nil {
+			out <- FileHashInfo{Err: err}
+		}
+		go func() {
+			wg.Wait()
+			close(out)
+		}()
+	}()
+	return out
+}
+
+// hashFile computes the hash of the file at path and returns it hex-encoded
+// in a FileHashInfo. Open or read failures are reported through the Err field.
+func hashFile(path string, hash crypto.Hash, info os.FileInfo) *FileHashInfo {
+	fhi := FileHashInfo{HashType: hash, Path: path, ModTime: info.ModTime()}
+	h := hash.New()
+	fh, err := os.Open(path)
+	if err != nil {
+		fhi.Err = err
+		return &fhi
+	}
+	// Close on every return path, including a failed io.Copy.
+	defer fh.Close()
+	if _, err = io.Copy(h, fh); err != nil {
+		fhi.Err = err
+		return &fhi
+	}
+	fhi.Hash = fmt.Sprintf("%x", h.Sum(nil))
+	return &fhi
+}
+
+// SameInodePaths checks whether path1 and path2 are the same inode
+// NOTE(review): only inode numbers are compared, not devices; combine with
+// SameDevPaths when the paths may live on different filesystems.
+func SameInodePaths(path1, path2 string) (match bool, err error) {
+	var inode1, inode2 uint64
+	if inode1, err = GetInode(path1); err != nil {
+		return false, err
+	}
+	if inode2, err = GetInode(path2); err != nil {
+		return false, err
+	}
+	return inode1 == inode2, nil
+}
+
+// SameDevPaths checks whether path1 and path2 are on the same device
+func SameDevPaths(path1, path2 string) (match bool, err error) {
+	var dev1, dev2 uint64
+	if dev1, err = GetDev(path1); err != nil {
+		return false, err
+	}
+	if dev2, err = GetDev(path2); err != nil {
+		return false, err
+	}
+	return dev1 == dev2, nil
+}
+
+// FormatDev renders stat.Dev as "major:minor".
+func FormatDev(stat *syscall.Stat_t) string {
+	return fmt.Sprintf("%d:%d", MajorDev(stat.Dev), MinorDev(stat.Dev))
+}
+
+// MajorDev extracts the major device number from dev (the bit layout appears
+// to match glibc's gnu_dev_major).
+func MajorDev(dev uint64) uint64 {
+	return (((dev >> 8) & 0xfff) | ((dev >> 32) & ^uint64(0xfff)))
+}
+
+// MinorDev extracts the minor device number from dev (the bit layout appears
+// to match glibc's gnu_dev_minor).
+func MinorDev(dev uint64) uint64 {
+	return ((dev & 0xff) | ((dev >> 12) & ^uint64(0xff)))
+}
+
+// GetStat stats path and returns the underlying *syscall.Stat_t. It panics if
+// the FileInfo's Sys() value is not a *syscall.Stat_t.
+func GetStat(path string) (*syscall.Stat_t, error) {
+	fi, err := os.Stat(path)
+	if err != nil {
+		return nil, err
+	}
+	return fi.Sys().(*syscall.Stat_t), nil
+}
+
+// GetInode returns the inode for path
+func GetInode(path string) (uint64, error) {
+	stat, err := GetStat(path)
+	if err != nil {
+		return 0, err
+	}
+	return stat.Ino, nil
+}
+
+// GetDev returns the device for path
+func GetDev(path string) (uint64, error) {
+	stat, err := GetStat(path)
+	if err != nil {
+		return 0, err
+	}
+	return stat.Dev, nil
+}
+
+// GetNlink returns the number of links for path
+func GetNlink(path string) (uint64, error) {
+	stat, err := GetStat(path)
+	if err != nil {
+		return 0, err
+	}
+	return stat.Nlink, nil
+}
diff --git a/main.go b/main.go
index 85eac3e..910dfcf 100644
--- a/main.go
+++ b/main.go
@@ -1,22 +1,15 @@
 package main
 
 import (
-	"crypto"
-	_ "crypto/md5"
-	_ "crypto/sha1"
-	_ "crypto/sha256"
-	_ "crypto/sha512"
 	"flag"
 	"fmt"
-	"io"
 	"log"
 	"os"
 	"path/filepath"
 	"runtime"
-	"strings"
-	"sync"
-	"syscall"
-	"time"
+
+	"./base"
+	"./file"
 )
 
 var (
@@ -34,196 +27,43 @@ func init() {
 func main() {
 	flag.Parse()
 
-	if err := InitVarBase(*flVarBase); err != nil {
+	if err := base.InitVarBase(*flVarBase); err != nil {
 		log.Fatal(err)
 	}
 
 	var (
-		hash crypto.Hash
-		//infos = []*FileHashInfo{}
+		hash    = DetermineHash(*flCipher)
+		ourbase = base.Base{Path: *flVarBase}
+		//infos = []*file.FileHashInfo{}
 		//mu = sync.Mutex{}
-		//results := make(chan FileHashInfo, 2)
+		//results := make(chan file.FileHashInfo, 2)
 		//wg := sync.WaitGroup{}
 	)
 
-	switch strings.ToLower(*flCipher) {
-	case "md5":
-		hash = crypto.MD5
-	case "sha1":
-		hash = crypto.SHA1
-	case "sha224":
-		hash = crypto.SHA224
-	case "sha256":
-		hash = crypto.SHA256
-	case "sha384":
-		hash = crypto.SHA384
-	case "sha512":
-		hash = crypto.SHA512
-	default:
-		log.Fatalf("ERROR: unknown cipher %q", *flCipher)
-	}
-
 	for _, arg := range flag.Args() {
-		if m, err := SameDevPaths(*flVarBase, arg); err != nil {
+		if m, err := file.SameDevPaths(*flVarBase, arg); err != nil {
 			log.Fatal(err)
 		} else if !m {
 			log.Printf("SKIPPING: %q is not on the same device as %q", arg, *flVarBase)
 			continue
 		}
 		done := make(chan struct{})
-		infos := HashFileGetter(arg, hash, *flWorkers, done)
+		infos := file.HashFileGetter(arg, hash, *flWorkers, done)
 		for fi := range infos {
 			if fi.Err != nil {
 				log.Println(fi.Err)
 				done <- struct{}{}
 			}
-			fmt.Printf("%x %s\n", fi.Hash, fi.Path)
+			fmt.Printf("%s %s\n", fi.Hash, fi.Path)
+			if ourbase.HasBlob(fi.HashType, fi.Hash) {
+				// TODO check if they have the same Inode
+				// if not, then clobber
+			} else {
+				// TODO hard link to blobs
+			}
 		}
 	}
 	//if len(infos) > 0 {
 	//fmt.Println("collected", len(infos), "sums")
 	//}
 }
-
-func InitVarBase(base string) error {
-	for _, path := range []string{"dedup/blobs", "dedup/state"} {
-		if err := os.MkdirAll(filepath.Join(base, path), 0755); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-type FileHashInfo struct {
-	HashType crypto.Hash
-	Hash     []byte
-	Path     string
-	ModTime  time.Time
-	Err      error
-}
-
-func HashFileGetter(path string, hash crypto.Hash, workers int, done <-chan struct{}) <-chan FileHashInfo {
-	out := make(chan FileHashInfo, workers)
-	go func() {
-		var wg sync.WaitGroup
-		err := filepath.Walk(path, func(path string, info os.FileInfo, err error) error {
-			if err != nil {
-				return err
-			}
-			if !info.Mode().IsRegular() {
-				return nil
-			}
-			wg.Add(1)
-			go func() {
-				fhi := hashFile(path, hash, info)
-				select {
-				case out <- *fhi:
-				case <-done:
-				}
-				wg.Done()
-			}()
-			select {
-			case <-done:
-				return fmt.Errorf("walk canceled")
-			default:
-				return nil
-			}
-		})
-		if err != nil {
-			out <- FileHashInfo{Err: err}
-		}
-		go func() {
-			wg.Wait()
-			close(out)
-		}()
-	}()
-	return out
-}
-
-func hashFile(path string, hash crypto.Hash, info os.FileInfo) *FileHashInfo {
-	fhi := FileHashInfo{HashType: hash, Path: path, ModTime: info.ModTime()}
-	h := hash.New()
-	fh, err := os.Open(path)
-	if err != nil {
-		fhi.Err = err
-		return &fhi
-	}
-	if _, err = io.Copy(h, fh); err != nil {
-		fhi.Err = err
-		return &fhi
-	}
-	fh.Close()
-	fhi.Hash = h.Sum(nil)
-	return &fhi
-}
-
-// SameInodePaths checks whether path1 and path2 are the same inode
-func SameInodePaths(path1, path2 string) (match bool, err error) {
-	var inode1, inode2 uint64
-	if inode1, err = GetInode(path1); err != nil {
-		return false, err
-	}
-	if inode2, err = GetInode(path2); err != nil {
-		return false, err
-	}
-	return inode1 == inode2, nil
-}
-
-// SameInodePaths checks whether path1 and path2 are on the same device
-func SameDevPaths(path1, path2 string) (match bool, err error) {
-	var dev1, dev2 uint64
-	if dev1, err = GetDev(path1); err != nil {
-		return false, err
-	}
-	if dev2, err = GetDev(path2); err != nil {
-		return false, err
-	}
-	return dev1 == dev2, nil
-}
-
-func FormatDev(stat *syscall.Stat_t) string {
-	return fmt.Sprintf("%d:%d", MajorDev(stat.Dev), MinorDev(stat.Dev))
-}
-
-func MajorDev(dev uint64) uint64 {
-	return (((dev >> 8) & 0xfff) | ((dev >> 32) & ^uint64(0xfff)))
-}
-
-func MinorDev(dev uint64) uint64 {
-	return ((dev & 0xff) | ((dev >> 12) & ^uint64(0xff)))
-}
-
-func GetStat(path string) (*syscall.Stat_t, error) {
-	fi, err := os.Stat(path)
-	if err != nil {
-		return nil, err
-	}
-	return fi.Sys().(*syscall.Stat_t), nil
-}
-
-// GetInode returns the inode for path
-func GetInode(path string) (uint64, error) {
-	stat, err := GetStat(path)
-	if err != nil {
-		return 0, err
-	}
-	return stat.Ino, nil
-}
-
-// GetDev returns the device for path
-func GetDev(path string) (uint64, error) {
-	stat, err := GetStat(path)
-	if err != nil {
-		return 0, err
-	}
-	return stat.Dev, nil
-}
-
-// GetNlink returns the number of links for path
-func GetNlink(path string) (uint64, error) {
-	stat, err := GetStat(path)
-	if err != nil {
-		return 0, err
-	}
-	return stat.Nlink, nil
-}