1
0
Fork 0
mirror of https://github.com/vbatts/dedupe-linker.git synced 2025-01-16 02:30:07 +00:00
This commit is contained in:
Vincent Batts 2014-09-12 16:10:10 -04:00
parent 0a054d3410
commit 842fa03b61
4 changed files with 220 additions and 177 deletions

24
base/base.go Normal file
View file

@ -0,0 +1,24 @@
package base
import (
"crypto"
"os"
"path/filepath"
)
// InitVarBase makes sure the dedup working directories exist beneath base.
//
// It creates "dedup/blobs" and "dedup/state" (including any missing parents)
// with mode 0755 and returns the first error reported by os.MkdirAll, or nil
// when both directories are in place.
func InitVarBase(base string) error {
	subdirs := [...]string{"dedup/blobs", "dedup/state"}
	for i := range subdirs {
		target := filepath.Join(base, subdirs[i])
		if mkErr := os.MkdirAll(target, 0755); mkErr != nil {
			return mkErr
		}
	}
	return nil
}
// Base describes a dedup base location by its filesystem path.
type Base struct {
// Path is the directory of the base.
// NOTE(review): presumably the same directory handed to InitVarBase — confirm.
Path string
}
// HasBlob is intended to report whether a blob for the given hash type and
// checksum is present in the base.
//
// NOTE(review): this is currently a stub. It ignores b.Path, hashType and sum
// and unconditionally returns true.
func (b Base) HasBlob(hashType crypto.Hash, sum string) bool {
return true
}

33
crypto.go Normal file
View file

@ -0,0 +1,33 @@
package main
import (
"crypto"
_ "crypto/md5"
_ "crypto/sha1"
_ "crypto/sha256"
_ "crypto/sha512"
"log"
"strings"
)
// DetermineHash maps a hash algorithm name to its crypto.Hash identifier.
//
// The name is matched case-insensitively against "md5", "sha1", "sha224",
// "sha256", "sha384" and "sha512". Any other value logs a warning and falls
// back to crypto.SHA1.
func DetermineHash(str string) (h crypto.Hash) {
	switch strings.ToLower(str) {
	case "md5":
		return crypto.MD5
	case "sha1":
		return crypto.SHA1
	case "sha224":
		return crypto.SHA224
	case "sha256":
		return crypto.SHA256
	case "sha384":
		return crypto.SHA384
	case "sha512":
		return crypto.SHA512
	default:
		// Printf, not Println: the message carries a %q verb that has to be
		// expanded with the offending name.
		log.Printf("WARNING: unknown cipher %q. using 'sha1'", str)
		return crypto.SHA1
	}
}

146
file/hash.go Normal file
View file

@ -0,0 +1,146 @@
package file
import (
"crypto"
"fmt"
"io"
"os"
"path/filepath"
"sync"
"syscall"
"time"
)
// FileHashInfo is the outcome of hashing one file.
type FileHashInfo struct {
	HashType crypto.Hash // algorithm used to produce Hash
	Hash     string      // lowercase hex encoding of the digest; empty when Err is set
	Path     string      // file path as reported by filepath.Walk
	ModTime  time.Time   // modification time taken from the walk's os.FileInfo
	Err      error       // non-nil when hashing the file, or the walk itself, failed
}

// HashFileGetter walks the tree rooted at path and hashes every regular file
// with hash, delivering one FileHashInfo per file on the returned channel.
// Results arrive in no particular order.
//
// workers bounds both the number of files hashed at the same time and the
// channel's buffer; values below 1 are treated as 1. A receive from done
// cancels the walk and lets pending sends give up. If the walk itself fails, a
// FileHashInfo carrying only Err is sent. The channel is closed once the walk
// has ended and every in-flight hash has finished.
func HashFileGetter(path string, hash crypto.Hash, workers int, done <-chan struct{}) <-chan FileHashInfo {
	// A negative size would panic in make, and zero would leave no worker slot.
	if workers < 1 {
		workers = 1
	}
	out := make(chan FileHashInfo, workers)
	go func() {
		defer close(out)

		var wg sync.WaitGroup
		// slots caps the number of concurrently open files at workers.
		slots := make(chan struct{}, workers)

		walkErr := filepath.Walk(path, func(p string, info os.FileInfo, err error) error {
			if err != nil {
				return err
			}
			if !info.Mode().IsRegular() {
				return nil
			}
			// Wait for a free slot, but stay responsive to cancellation.
			select {
			case slots <- struct{}{}:
			case <-done:
				return fmt.Errorf("walk canceled")
			}
			wg.Add(1)
			go func() {
				defer wg.Done()
				defer func() { <-slots }()
				fhi := hashFile(p, hash, info)
				select {
				case out <- *fhi:
				case <-done:
				}
			}()
			select {
			case <-done:
				return fmt.Errorf("walk canceled")
			default:
				return nil
			}
		})
		if walkErr != nil {
			// Honor done here as well so an abandoned consumer cannot pin
			// this goroutine forever.
			select {
			case out <- FileHashInfo{Err: walkErr}:
			case <-done:
			}
		}
		wg.Wait()
	}()
	return out
}

// hashFile computes the digest of the file at path using hash and returns it
// hex-encoded in a FileHashInfo. Failures are reported through the Err field
// of the returned value.
func hashFile(path string, hash crypto.Hash, info os.FileInfo) *FileHashInfo {
	fhi := &FileHashInfo{HashType: hash, Path: path, ModTime: info.ModTime()}
	// hash.New panics when the implementation is not linked into the binary;
	// report that as an ordinary error instead.
	if !hash.Available() {
		fhi.Err = fmt.Errorf("hash function #%d is unavailable", uint(hash))
		return fhi
	}
	fh, err := os.Open(path)
	if err != nil {
		fhi.Err = err
		return fhi
	}
	// Deferred so the descriptor is released on the read-error path too.
	defer fh.Close()

	h := hash.New()
	if _, err := io.Copy(h, fh); err != nil {
		fhi.Err = err
		return fhi
	}
	fhi.Hash = fmt.Sprintf("%x", h.Sum(nil))
	return fhi
}
// SameInodePaths reports whether path1 and path2 refer to the same file, that
// is, the same inode on the same device.
//
// Inode numbers are only unique within one filesystem, so the comparison is
// delegated to os.SameFile, which takes the device into account as well as
// the inode. Both paths are resolved with os.Stat, so symlinks are followed.
// The first stat failure is returned with match set to false.
func SameInodePaths(path1, path2 string) (match bool, err error) {
	fi1, err := os.Stat(path1)
	if err != nil {
		return false, err
	}
	fi2, err := os.Stat(path2)
	if err != nil {
		return false, err
	}
	return os.SameFile(fi1, fi2), nil
}
// SameDevPaths reports whether path1 and path2 live on the same device.
//
// The device number of each path is obtained through GetDev; the first lookup
// failure is returned with match set to false.
func SameDevPaths(path1, path2 string) (match bool, err error) {
	first, err := GetDev(path1)
	if err != nil {
		return false, err
	}
	second, err := GetDev(path2)
	if err != nil {
		return false, err
	}
	match = first == second
	return match, nil
}
// FormatDev renders the device number held in stat as "major:minor" in
// decimal, using MajorDev and MinorDev to split it. stat must not be nil.
func FormatDev(stat *syscall.Stat_t) string {
	// Stat_t.Dev is not a uint64 on every GOOS/GOARCH; convert explicitly so
	// the call compiles regardless of the field's native width.
	dev := uint64(stat.Dev)
	return fmt.Sprintf("%d:%d", MajorDev(dev), MinorDev(dev))
}
// MajorDev extracts the major number from a device number.
//
// The low 12 bits of the result come from dev bits 8-19 and the remaining
// bits from dev bits 44-63 (the layout used by glibc's gnu_dev_major).
func MajorDev(dev uint64) uint64 {
	low := (dev >> 8) & 0xfff
	high := (dev >> 32) &^ 0xfff
	return low | high
}
// MinorDev extracts the minor number from a device number.
//
// The low 8 bits of the result come from dev bits 0-7 and the remaining 24
// bits from dev bits 20-43 (the layout used by glibc's gnu_dev_minor). The
// high part is masked to 32 bits so that the upper major bits stored in dev
// bits 44-63 cannot leak into the minor.
func MinorDev(dev uint64) uint64 {
	low := dev & 0xff
	high := (dev >> 12) & 0xffffff00
	return low | high
}
// GetStat returns the raw syscall.Stat_t for path.
//
// The path is resolved with os.Stat, so symlinks are followed. An error is
// returned when the stat call fails or when the platform's file info does not
// carry a *syscall.Stat_t; in both cases the returned pointer is nil.
func GetStat(path string) (*syscall.Stat_t, error) {
	fi, err := os.Stat(path)
	if err != nil {
		return nil, err
	}
	// Checked assertion: an unchecked one would panic wherever Sys() yields a
	// different underlying type.
	stat, ok := fi.Sys().(*syscall.Stat_t)
	if !ok {
		return nil, fmt.Errorf("stat %s: unsupported file info type %T", path, fi.Sys())
	}
	return stat, nil
}
// GetInode returns the inode number of path as reported by GetStat.
// On failure it returns 0 together with the GetStat error.
func GetInode(path string) (uint64, error) {
	st, statErr := GetStat(path)
	if statErr == nil {
		return st.Ino, nil
	}
	return 0, statErr
}
// GetDev returns the device number of path as reported by GetStat.
// On failure it returns 0 together with the GetStat error.
func GetDev(path string) (uint64, error) {
	stat, err := GetStat(path)
	if err != nil {
		return 0, err
	}
	// Stat_t.Dev is narrower or signed on some GOOS/GOARCH combinations; the
	// explicit conversion keeps this compiling on all of them.
	return uint64(stat.Dev), nil
}
// GetNlink returns the hard-link count of path as reported by GetStat.
// On failure it returns 0 together with the GetStat error.
func GetNlink(path string) (uint64, error) {
	stat, err := GetStat(path)
	if err != nil {
		return 0, err
	}
	// Stat_t.Nlink is not uint64 everywhere (e.g. uint32 on linux/arm64); the
	// explicit conversion keeps this compiling on those targets.
	return uint64(stat.Nlink), nil
}

194
main.go
View file

@ -1,22 +1,15 @@
package main package main
import ( import (
"crypto"
_ "crypto/md5"
_ "crypto/sha1"
_ "crypto/sha256"
_ "crypto/sha512"
"flag" "flag"
"fmt" "fmt"
"io"
"log" "log"
"os" "os"
"path/filepath" "path/filepath"
"runtime" "runtime"
"strings"
"sync" "./base"
"syscall" "./file"
"time"
) )
var ( var (
@ -34,196 +27,43 @@ func init() {
func main() { func main() {
flag.Parse() flag.Parse()
if err := InitVarBase(*flVarBase); err != nil { if err := base.InitVarBase(*flVarBase); err != nil {
log.Fatal(err) log.Fatal(err)
} }
var ( var (
hash crypto.Hash hash = DetermineHash(*flCipher)
//infos = []*FileHashInfo{} ourbase = base.Base{Path: *flVarBase}
//infos = []*file.FileHashInfo{}
//mu = sync.Mutex{} //mu = sync.Mutex{}
//results := make(chan FileHashInfo, 2) //results := make(chan file.FileHashInfo, 2)
//wg := sync.WaitGroup{} //wg := sync.WaitGroup{}
) )
switch strings.ToLower(*flCipher) {
case "md5":
hash = crypto.MD5
case "sha1":
hash = crypto.SHA1
case "sha224":
hash = crypto.SHA224
case "sha256":
hash = crypto.SHA256
case "sha384":
hash = crypto.SHA384
case "sha512":
hash = crypto.SHA512
default:
log.Fatalf("ERROR: unknown cipher %q", *flCipher)
}
for _, arg := range flag.Args() { for _, arg := range flag.Args() {
if m, err := SameDevPaths(*flVarBase, arg); err != nil { if m, err := file.SameDevPaths(*flVarBase, arg); err != nil {
log.Fatal(err) log.Fatal(err)
} else if !m { } else if !m {
log.Printf("SKIPPING: %q is not on the same device as %q", arg, *flVarBase) log.Printf("SKIPPING: %q is not on the same device as %q", arg, *flVarBase)
continue continue
} }
done := make(chan struct{}) done := make(chan struct{})
infos := HashFileGetter(arg, hash, *flWorkers, done) infos := file.HashFileGetter(arg, hash, *flWorkers, done)
for fi := range infos { for fi := range infos {
if fi.Err != nil { if fi.Err != nil {
log.Println(fi.Err) log.Println(fi.Err)
done <- struct{}{} done <- struct{}{}
} }
fmt.Printf("%x %s\n", fi.Hash, fi.Path) fmt.Printf("%s %s\n", fi.Hash, fi.Path)
if ourbase.HasBlob(fi.HashType, fi.Hash) {
// TODO check if they have the same Inode
// if not, then clobber
} else {
// TODO hard link to blobs
}
} }
} }
//if len(infos) > 0 { //if len(infos) > 0 {
//fmt.Println("collected", len(infos), "sums") //fmt.Println("collected", len(infos), "sums")
//} //}
} }
// InitVarBase makes sure the dedup working directories exist beneath base.
//
// It creates "dedup/blobs" and "dedup/state" (including any missing parents)
// with mode 0755 and returns the first error reported by os.MkdirAll, or nil
// when both directories are in place.
func InitVarBase(base string) error {
	subdirs := [...]string{"dedup/blobs", "dedup/state"}
	for i := range subdirs {
		target := filepath.Join(base, subdirs[i])
		if mkErr := os.MkdirAll(target, 0755); mkErr != nil {
			return mkErr
		}
	}
	return nil
}
// FileHashInfo is the outcome of hashing one file.
type FileHashInfo struct {
	HashType crypto.Hash // algorithm used to produce Hash
	Hash     []byte      // raw digest bytes; nil when Err is set
	Path     string      // file path as reported by filepath.Walk
	ModTime  time.Time   // modification time taken from the walk's os.FileInfo
	Err      error       // non-nil when hashing the file, or the walk itself, failed
}

// HashFileGetter walks the tree rooted at path and hashes every regular file
// with hash, delivering one FileHashInfo per file on the returned channel.
// Results arrive in no particular order.
//
// workers bounds both the number of files hashed at the same time and the
// channel's buffer; values below 1 are treated as 1. A receive from done
// cancels the walk and lets pending sends give up. If the walk itself fails, a
// FileHashInfo carrying only Err is sent. The channel is closed once the walk
// has ended and every in-flight hash has finished.
func HashFileGetter(path string, hash crypto.Hash, workers int, done <-chan struct{}) <-chan FileHashInfo {
	// A negative size would panic in make, and zero would leave no worker slot.
	if workers < 1 {
		workers = 1
	}
	out := make(chan FileHashInfo, workers)
	go func() {
		defer close(out)

		var wg sync.WaitGroup
		// slots caps the number of concurrently open files at workers.
		slots := make(chan struct{}, workers)

		walkErr := filepath.Walk(path, func(p string, info os.FileInfo, err error) error {
			if err != nil {
				return err
			}
			if !info.Mode().IsRegular() {
				return nil
			}
			// Wait for a free slot, but stay responsive to cancellation.
			select {
			case slots <- struct{}{}:
			case <-done:
				return fmt.Errorf("walk canceled")
			}
			wg.Add(1)
			go func() {
				defer wg.Done()
				defer func() { <-slots }()
				fhi := hashFile(p, hash, info)
				select {
				case out <- *fhi:
				case <-done:
				}
			}()
			select {
			case <-done:
				return fmt.Errorf("walk canceled")
			default:
				return nil
			}
		})
		if walkErr != nil {
			// Honor done here as well so an abandoned consumer cannot pin
			// this goroutine forever.
			select {
			case out <- FileHashInfo{Err: walkErr}:
			case <-done:
			}
		}
		wg.Wait()
	}()
	return out
}

// hashFile computes the digest of the file at path using hash and returns the
// raw digest bytes in a FileHashInfo. Failures are reported through the Err
// field of the returned value.
func hashFile(path string, hash crypto.Hash, info os.FileInfo) *FileHashInfo {
	fhi := &FileHashInfo{HashType: hash, Path: path, ModTime: info.ModTime()}
	// hash.New panics when the implementation is not linked into the binary;
	// report that as an ordinary error instead.
	if !hash.Available() {
		fhi.Err = fmt.Errorf("hash function #%d is unavailable", uint(hash))
		return fhi
	}
	fh, err := os.Open(path)
	if err != nil {
		fhi.Err = err
		return fhi
	}
	// Deferred so the descriptor is released on the read-error path too.
	defer fh.Close()

	h := hash.New()
	if _, err := io.Copy(h, fh); err != nil {
		fhi.Err = err
		return fhi
	}
	fhi.Hash = h.Sum(nil)
	return fhi
}
// SameInodePaths reports whether path1 and path2 refer to the same file, that
// is, the same inode on the same device.
//
// Inode numbers are only unique within one filesystem, so the comparison is
// delegated to os.SameFile, which takes the device into account as well as
// the inode. Both paths are resolved with os.Stat, so symlinks are followed.
// The first stat failure is returned with match set to false.
func SameInodePaths(path1, path2 string) (match bool, err error) {
	fi1, err := os.Stat(path1)
	if err != nil {
		return false, err
	}
	fi2, err := os.Stat(path2)
	if err != nil {
		return false, err
	}
	return os.SameFile(fi1, fi2), nil
}
// SameDevPaths reports whether path1 and path2 live on the same device.
//
// The device number of each path is obtained through GetDev; the first lookup
// failure is returned with match set to false.
func SameDevPaths(path1, path2 string) (match bool, err error) {
	first, err := GetDev(path1)
	if err != nil {
		return false, err
	}
	second, err := GetDev(path2)
	if err != nil {
		return false, err
	}
	match = first == second
	return match, nil
}
// FormatDev renders the device number held in stat as "major:minor" in
// decimal, using MajorDev and MinorDev to split it. stat must not be nil.
func FormatDev(stat *syscall.Stat_t) string {
	// Stat_t.Dev is not a uint64 on every GOOS/GOARCH; convert explicitly so
	// the call compiles regardless of the field's native width.
	dev := uint64(stat.Dev)
	return fmt.Sprintf("%d:%d", MajorDev(dev), MinorDev(dev))
}
// MajorDev extracts the major number from a device number.
//
// The low 12 bits of the result come from dev bits 8-19 and the remaining
// bits from dev bits 44-63 (the layout used by glibc's gnu_dev_major).
func MajorDev(dev uint64) uint64 {
	low := (dev >> 8) & 0xfff
	high := (dev >> 32) &^ 0xfff
	return low | high
}
// MinorDev extracts the minor number from a device number.
//
// The low 8 bits of the result come from dev bits 0-7 and the remaining 24
// bits from dev bits 20-43 (the layout used by glibc's gnu_dev_minor). The
// high part is masked to 32 bits so that the upper major bits stored in dev
// bits 44-63 cannot leak into the minor.
func MinorDev(dev uint64) uint64 {
	low := dev & 0xff
	high := (dev >> 12) & 0xffffff00
	return low | high
}
// GetStat returns the raw syscall.Stat_t for path.
//
// The path is resolved with os.Stat, so symlinks are followed. An error is
// returned when the stat call fails or when the platform's file info does not
// carry a *syscall.Stat_t; in both cases the returned pointer is nil.
func GetStat(path string) (*syscall.Stat_t, error) {
	fi, err := os.Stat(path)
	if err != nil {
		return nil, err
	}
	// Checked assertion: an unchecked one would panic wherever Sys() yields a
	// different underlying type.
	stat, ok := fi.Sys().(*syscall.Stat_t)
	if !ok {
		return nil, fmt.Errorf("stat %s: unsupported file info type %T", path, fi.Sys())
	}
	return stat, nil
}
// GetInode returns the inode number of path as reported by GetStat.
// On failure it returns 0 together with the GetStat error.
func GetInode(path string) (uint64, error) {
	st, statErr := GetStat(path)
	if statErr == nil {
		return st.Ino, nil
	}
	return 0, statErr
}
// GetDev returns the device number of path as reported by GetStat.
// On failure it returns 0 together with the GetStat error.
func GetDev(path string) (uint64, error) {
	stat, err := GetStat(path)
	if err != nil {
		return 0, err
	}
	// Stat_t.Dev is narrower or signed on some GOOS/GOARCH combinations; the
	// explicit conversion keeps this compiling on all of them.
	return uint64(stat.Dev), nil
}
// GetNlink returns the hard-link count of path as reported by GetStat.
// On failure it returns 0 together with the GetStat error.
func GetNlink(path string) (uint64, error) {
	stat, err := GetStat(path)
	if err != nil {
		return 0, err
	}
	// Stat_t.Nlink is not uint64 everywhere (e.g. uint32 on linux/arm64); the
	// explicit conversion keeps this compiling on those targets.
	return uint64(stat.Nlink), nil
}