From 5cbe83545d67b2a2d2c051db4d73ec97ee57ced7 Mon Sep 17 00:00:00 2001 From: Vincent Batts Date: Wed, 20 May 2015 10:19:44 -0400 Subject: [PATCH] dups: utility to hunt for (and dedup) files using hardlinking for dedup, otherwise just building a document of file checksum info for future reuse --- cmd/dups/main.go | 169 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 cmd/dups/main.go diff --git a/cmd/dups/main.go b/cmd/dups/main.go new file mode 100644 index 0000000..7a6ca05 --- /dev/null +++ b/cmd/dups/main.go @@ -0,0 +1,169 @@ +package main + +import ( + "crypto/rand" + "crypto/sha1" + "encoding/json" + "flag" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "runtime" + "sync" + "time" +) + +var ( + flLoadMap = flag.String("l", "", "load existing map from file") + flSaveMap = flag.String("s", "hash-map.json", "file to save map of file hashes to") + flHardlink = flag.Bool("H", false, "hardlink the duplicate files") + flQuiet = flag.Bool("q", false, "less output") + nprocs = 1 +) + +func init() { + nprocs = runtime.NumCPU() + runtime.GOMAXPROCS(nprocs) +} + +func main() { + flag.Parse() + for _, arg := range flag.Args() { + savings := int64(0) + found := map[string]string{} + if len(*flLoadMap) > 0 { + fh, err := os.Open(*flLoadMap) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + buf, err := ioutil.ReadAll(fh) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + if err = json.Unmarshal(buf, &found); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + } + workers := make(chan int, nprocs) + mu := sync.Mutex{} + err := filepath.Walk(arg, func(path string, info os.FileInfo, err error) error { + /* + if err != nil { + return err + } + */ + if !info.Mode().IsRegular() { + return nil + } + workers <- 1 + go func() { + defer func() { <-workers }() + + fh, err := os.Open(path) + if err != nil { + fmt.Fprintln(os.Stderr, err, path) + return + } + defer fh.Close() + + h := 
sha1.New() + if _, err = io.Copy(h, fh); err != nil { + fmt.Fprintln(os.Stderr, err, path) + return + } + sum := fmt.Sprintf("%x", h.Sum(nil)) + + // get the absolute filename + p, err := filepath.Abs(path) + if err != nil { + fmt.Fprintln(os.Stderr, err, path) + return + } + path = p + + mu.Lock() + defer mu.Unlock() + if fpath, ok := found[sum]; ok && fpath != path { + if !(*flQuiet) { + fmt.Printf("%q is the same content as %q\n", path, fpath) + } + if *flHardlink { + if err = SafeLink(fpath, path); err != nil { + fmt.Fprintln(os.Stderr, err, path) + return + } + fmt.Printf("linked %q to %q\n", path, fpath) + } + savings += info.Size() + } else { + found[sum] = path + } + }() + return nil + }) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + for len(workers) > 0 { + time.Sleep(5 * time.Microsecond) + } + fmt.Printf("Savings of %fmb\n", float64(savings)/1024.0/1024.0) + fh, err := os.Create(*flSaveMap) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + buf, err := json.Marshal(found) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + _, err = fh.Write(buf) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + fh.Close() + fmt.Fprintf(os.Stderr, "wrote %q\n", fh.Name()) + } +} + +// SafeLink overrides newname if it already exists. 
// If there is an error in creating the link, the transaction is rolled back:
// whatever previously lived at newname is moved back into place and the link
// error is returned to the caller.
//
// An existing newname is first renamed to "<newname>.<10 random hex digits>",
// then oldname is hardlinked to newname, and finally the renamed backup is
// removed. An error is returned if the random suffix cannot be generated, if
// the backup rename fails, or if the hardlink fails; when the rollback rename
// fails as well, both failures are included in the returned error.
func SafeLink(oldname, newname string) error {
	var backupName string

	// Lstat rather than Stat: a dangling symlink at newname still occupies
	// the name and has to be moved aside before linking.
	if _, err := os.Lstat(newname); err == nil {
		// make a random name
		suffix := make([]byte, 5)
		if _, err := rand.Read(suffix); err != nil {
			return err
		}
		backupName = fmt.Sprintf("%s.%x", newname, suffix)
		// move newname to the random name backupName
		if err := os.Rename(newname, backupName); err != nil {
			return err
		}
	}

	// hardlink oldname to newname
	if err := os.Link(oldname, newname); err != nil {
		if backupName != "" {
			// Use a separate variable for the rollback result so that a
			// successful rollback cannot overwrite the link error with nil.
			if rerr := os.Rename(backupName, newname); rerr != nil {
				return fmt.Errorf("%v (restoring %q from %q also failed: %v)", err, newname, backupName, rerr)
			}
		}
		return err
	}

	if backupName != "" {
		// Best effort: the link is already in place, so a leftover backup
		// does not invalidate the result.
		_ = os.Remove(backupName)
	}
	return nil
}