dups: utility to hunt for (and dedup) files

using hardlinking for dedup, otherwise just building a document of file
checksum info for future reuse
This commit is contained in:
Vincent Batts 2015-05-20 10:19:44 -04:00
parent 8ca2170b0a
commit 5cbe83545d
1 changed files with 169 additions and 0 deletions

169
cmd/dups/main.go Normal file
View File

@ -0,0 +1,169 @@
package main
import (
"crypto/rand"
"crypto/sha1"
"encoding/json"
"flag"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"runtime"
"sync"
"time"
)
// Command-line options.
var (
	// flLoadMap optionally names a JSON file holding a previously saved
	// checksum-to-path map to start from.
	flLoadMap = flag.String("l", "", "load existing map from file")

	// flSaveMap names the JSON file the checksum-to-path map is written to.
	flSaveMap = flag.String("s", "hash-map.json", "file to save map of file hashes to")

	// flHardlink turns reporting into action: duplicates get hardlinked.
	flHardlink = flag.Bool("H", false, "hardlink the duplicate files")

	// flQuiet suppresses the per-duplicate report line.
	flQuiet = flag.Bool("q", false, "less output")
)

// nprocs bounds the number of files hashed concurrently. It starts at 1 and
// is replaced with the CPU count by init.
var nprocs = 1
// init sizes the worker pool to the number of logical CPUs and lets the Go
// scheduler use all of them.
func init() {
	cpus := runtime.NumCPU()
	runtime.GOMAXPROCS(cpus)
	nprocs = cpus
}
// main processes every path given on the command line independently.
//
// For each argument it optionally preloads a checksum-to-path map (-l), walks
// the tree, hashes every regular file with SHA-1 using at most nprocs
// concurrent workers, and records the first absolute path seen for each
// checksum. A file whose checksum is already recorded under a different path
// is reported as a duplicate and, with -H, replaced by a hardlink to the
// recorded path. Finally the accumulated size of the duplicates is printed
// and the map is written as JSON to the -s file (once per argument, so a
// later argument overwrites the file written for an earlier one).
//
// Errors on individual files are reported on stderr and the file is skipped;
// failures to load or save the map terminate the program with status 1.
func main() {
	flag.Parse()

	for _, arg := range flag.Args() {
		savings := int64(0)
		found := map[string]string{}

		if len(*flLoadMap) > 0 {
			// ReadFile opens, reads and closes the file, so no handle leaks.
			buf, err := ioutil.ReadFile(*flLoadMap)
			if err != nil {
				fmt.Fprintln(os.Stderr, err)
				os.Exit(1)
			}
			if err = json.Unmarshal(buf, &found); err != nil {
				fmt.Fprintln(os.Stderr, err)
				os.Exit(1)
			}
			// A JSON "null" document decodes to a nil map; writing to it
			// below would panic.
			if found == nil {
				found = map[string]string{}
			}
		}

		// workers is a counting semaphore: a slot is taken before a
		// goroutine is started and released when it finishes.
		workers := make(chan int, nprocs)
		// mu guards found and savings.
		var mu sync.Mutex

		err := filepath.Walk(arg, func(path string, info os.FileInfo, err error) error {
			if err != nil {
				// info may be nil when the entry could not be stat'ed, so it
				// must not be touched. Report the problem and keep walking.
				fmt.Fprintln(os.Stderr, err, path)
				return nil
			}
			if !info.Mode().IsRegular() {
				return nil
			}

			workers <- 1
			go func() {
				// Registered first so it runs last: the slot is released
				// only after mu has been unlocked.
				defer func() { <-workers }()

				// The file is opened and closed inside sumFile, so no handle
				// on it is held while waiting for mu or while SafeLink
				// renames it.
				sum, err := sumFile(path)
				if err != nil {
					fmt.Fprintln(os.Stderr, err, path)
					return
				}

				// get the absolute filename
				abs, err := filepath.Abs(path)
				if err != nil {
					fmt.Fprintln(os.Stderr, err, path)
					return
				}

				mu.Lock()
				defer mu.Unlock()

				fpath, ok := found[sum]
				if !ok || fpath == abs {
					found[sum] = abs
					return
				}

				if !(*flQuiet) {
					fmt.Printf("%q is the same content as %q\n", abs, fpath)
				}
				if *flHardlink {
					if err = SafeLink(fpath, abs); err != nil {
						fmt.Fprintln(os.Stderr, err, abs)
						return
					}
					fmt.Printf("linked %q to %q\n", abs, fpath)
				}
				savings += info.Size()
			}()
			return nil
		})
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}

		// Wait until every worker has released its slot. The walk is over,
		// so no new slots can be taken at this point.
		for len(workers) > 0 {
			time.Sleep(time.Millisecond)
		}

		// Each worker unlocks mu before releasing its slot, so acquiring mu
		// here orders all of their writes to savings and found before the
		// reads below.
		mu.Lock()
		total := savings
		mu.Unlock()

		fmt.Printf("Savings of %fmb\n", float64(total)/1024.0/1024.0)

		// Marshal before touching the output file so an encoding failure
		// cannot leave a truncated map behind.
		buf, err := json.Marshal(found)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		// WriteFile creates/truncates the file, writes buf and reports any
		// write or close error.
		if err = ioutil.WriteFile(*flSaveMap, buf, 0666); err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		fmt.Fprintf(os.Stderr, "wrote %q\n", *flSaveMap)
	}
}

// sumFile returns the lowercase hex SHA-1 digest of the contents of path.
func sumFile(path string) (string, error) {
	fh, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer fh.Close()

	h := sha1.New()
	if _, err = io.Copy(h, fh); err != nil {
		return "", err
	}
	return fmt.Sprintf("%x", h.Sum(nil)), nil
}
// SafeLink hardlinks oldname to newname, replacing newname if it already
// exists.
//
// An existing newname is first renamed to "<newname>.<10 random hex digits>".
// If creating the link then fails, that backup is renamed back to newname and
// the link error is returned; if the restore fails as well, the returned
// error describes both failures. After a successful link the backup is
// removed, and a failure to remove it is reported as an error even though the
// link itself is in place.
func SafeLink(oldname, newname string) error {
	var backupName string

	// Lstat rather than Stat: whatever occupies newname has to be moved out
	// of the way, including a symlink whose target is missing.
	if _, err := os.Lstat(newname); err == nil {
		// make a random name
		buf := make([]byte, 5)
		if _, err = rand.Read(buf); err != nil {
			return err
		}
		backupName = fmt.Sprintf("%s.%x", newname, buf)

		// move newname to the random name backupName
		if err = os.Rename(newname, backupName); err != nil {
			return err
		}
	}

	// hardlink oldname to newname
	if linkErr := os.Link(oldname, newname); linkErr != nil {
		// Roll back. The restore result is kept in its own variable so that
		// a successful restore cannot overwrite the link error with nil.
		if len(backupName) > 0 {
			if err := os.Rename(backupName, newname); err != nil {
				return fmt.Errorf("%v; restoring %q from %q also failed: %v", linkErr, newname, backupName, err)
			}
		}
		return linkErr
	}

	// remove the backupName
	if len(backupName) > 0 {
		if err := os.Remove(backupName); err != nil {
			return fmt.Errorf("linked %q to %q but could not remove backup %q: %v", newname, oldname, backupName, err)
		}
	}
	return nil
}