main.go: rule out colliding metadata for a single PDF

Prompt claude.ai using 3.7 Sonnet:
```
one additional issue that the metadata json file is also named as the
hash of the PDF, but if there are two different URLs that download a
matching PDF, then while the hash will be the same the metadata is
actually different for the two, so it will collide there.  Lets also get
a sha1 hash of the URL string itself, and the metadata json file is
named with hash of the PDF, then a hyphen, then hash of the URL.
```

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
This commit is contained in:
Vincent Batts 2025-04-03 22:26:40 -04:00
parent 4400ea5bca
commit 67e835eea9
Signed by: vbatts
GPG key ID: E30EFAA812C6E5ED

80
main.go
View file

@ -22,6 +22,7 @@ import (
// FileMetadata stores information about a downloaded file // FileMetadata stores information about a downloaded file
type FileMetadata struct { type FileMetadata struct {
URL string `json:"url"` URL string `json:"url"`
URLHash string `json:"url_hash"`
ContentDisposition string `json:"content_disposition,omitempty"` ContentDisposition string `json:"content_disposition,omitempty"`
SuggestedFilename string `json:"suggested_filename,omitempty"` SuggestedFilename string `json:"suggested_filename,omitempty"`
ActualFilename string `json:"actual_filename"` ActualFilename string `json:"actual_filename"`
@ -45,6 +46,13 @@ type Config struct {
Verbose bool Verbose bool
} }
// hashString returns the SHA-1 digest of s, encoded as a lowercase
// hexadecimal string (40 characters).
func hashString(s string) string {
	digest := sha1.Sum([]byte(s))
	return fmt.Sprintf("%x", digest[:])
}
func main() { func main() {
// Parse command line flags // Parse command line flags
concurrency := flag.Int("concurrency", 10, "Number of concurrent downloads") concurrency := flag.Int("concurrency", 10, "Number of concurrent downloads")
@ -204,6 +212,9 @@ func downloadWithRetry(ctx context.Context, client *http.Client, url string, con
} }
func downloadURL(ctx context.Context, client *http.Client, url string, config Config, workerID int) error { func downloadURL(ctx context.Context, client *http.Client, url string, config Config, workerID int) error {
// Calculate URL hash upfront
urlHash := hashString(url)
// Create HTTP request with context for cancellation // Create HTTP request with context for cancellation
req, err := http.NewRequestWithContext(ctx, "GET", url, nil) req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil { if err != nil {
@ -275,33 +286,49 @@ func downloadURL(ctx context.Context, client *http.Client, url string, config Co
} }
// Get the SHA-1 checksum // Get the SHA-1 checksum
sha1sum := fmt.Sprintf("%x", hash.Sum(nil)) contentHash := fmt.Sprintf("%x", hash.Sum(nil))
// Check if we already have this file // Check if this exact URL and content combination has already been downloaded
metadataFilename := fmt.Sprintf("%s-%s.json", contentHash, urlHash)
metadataPath := filepath.Join(config.MetadataDir, metadataFilename)
if _, err := os.Stat(metadataPath); err == nil && config.SkipExisting {
if config.Verbose {
log.Printf("This URL and content combination already exists, skipping: %s", url)
}
return nil
}
// Check if we already have this file content from a different URL
existingFile := ""
if config.SkipExisting { if config.SkipExisting {
existingPath, exists := findExistingFile(config.MetadataDir, sha1sum) // Look for any file with this content hash
if exists { contentFiles, err := filepath.Glob(filepath.Join(config.OutputDir, contentHash+"*"))
if err == nil && len(contentFiles) > 0 {
existingFile = contentFiles[0]
if config.Verbose { if config.Verbose {
log.Printf("File with SHA-1 %s already exists at %s, skipping", sha1sum, existingPath) log.Printf("File with content hash %s already exists at %s, reusing", contentHash, existingFile)
} }
return nil
} }
} }
// Create the target filename based on SHA-1 // Create the target filename based on SHA-1
targetFilename := sha1sum targetFilename := contentHash
if filepath.Ext(suggestedFilename) != "" { if filepath.Ext(suggestedFilename) != "" {
// Append original extension if available // Append original extension if available
targetFilename = fmt.Sprintf("%s%s", sha1sum, filepath.Ext(suggestedFilename)) targetFilename = fmt.Sprintf("%s%s", contentHash, filepath.Ext(suggestedFilename))
} }
targetPath := filepath.Join(config.OutputDir, targetFilename) targetPath := filepath.Join(config.OutputDir, targetFilename)
// Close the temp file before copying its contents // Close the temp file before copying its contents
tempFile.Close() tempFile.Close()
// Fix: Instead of using os.Rename, copy the file contents to handle cross-partition moves // Copy the file if it doesn't already exist
if err := copyFileAcrossPartitions(tempFilePath, targetPath); err != nil { if existingFile == "" {
return fmt.Errorf("failed to copy file: %w", err) // Fix: Instead of using os.Rename, copy the file contents to handle cross-partition moves
if err := copyFileAcrossPartitions(tempFilePath, targetPath); err != nil {
return fmt.Errorf("failed to copy file: %w", err)
}
} }
// Parse Last-Modified header // Parse Last-Modified header
@ -314,10 +341,11 @@ func downloadURL(ctx context.Context, client *http.Client, url string, config Co
// Create metadata // Create metadata
metadata := FileMetadata{ metadata := FileMetadata{
URL: url, URL: url,
URLHash: urlHash,
ContentDisposition: contentDisposition, ContentDisposition: contentDisposition,
SuggestedFilename: suggestedFilename, SuggestedFilename: suggestedFilename,
ActualFilename: targetFilename, ActualFilename: targetFilename,
SHA1Checksum: sha1sum, SHA1Checksum: contentHash,
ETag: resp.Header.Get("ETag"), ETag: resp.Header.Get("ETag"),
LastModified: lastModified, LastModified: lastModified,
DownloadedAt: time.Now(), DownloadedAt: time.Now(),
@ -325,8 +353,7 @@ func downloadURL(ctx context.Context, client *http.Client, url string, config Co
ContentLength: n, ContentLength: n,
} }
// Write metadata to file // Write metadata to file with combined hash name
metadataPath := filepath.Join(config.MetadataDir, sha1sum+".json")
metadataJSON, err := json.MarshalIndent(metadata, "", " ") metadataJSON, err := json.MarshalIndent(metadata, "", " ")
if err != nil { if err != nil {
return fmt.Errorf("failed to marshal metadata: %w", err) return fmt.Errorf("failed to marshal metadata: %w", err)
@ -337,7 +364,8 @@ func downloadURL(ctx context.Context, client *http.Client, url string, config Co
} }
if config.Verbose { if config.Verbose {
log.Printf("Successfully downloaded %s (%d bytes) to %s", url, n, targetPath) log.Printf("Successfully processed %s (%d bytes), content stored at %s, metadata at %s",
url, n, targetPath, metadataPath)
} }
return nil return nil
@ -382,25 +410,3 @@ func copyFileAcrossPartitions(srcPath, dstPath string) error {
return nil return nil
} }
// findExistingFile reports whether a metadata record for the given SHA-1
// checksum exists in metadataDir.
//
// It reads <metadataDir>/<sha1sum>.json and decodes it as FileMetadata. On
// success it returns the record's ActualFilename (the file name stored in the
// metadata, not a joined path) and true. If the file cannot be read or does
// not decode as FileMetadata, it returns "" and false.
func findExistingFile(metadataDir, sha1sum string) (string, bool) {
	metadataPath := filepath.Join(metadataDir, sha1sum+".json")

	// os.ReadFile already fails when the file is missing, so a separate
	// os.Stat beforehand would only add a syscall and a check-then-use gap.
	data, err := os.ReadFile(metadataPath)
	if err != nil {
		return "", false
	}

	var metadata FileMetadata
	if err := json.Unmarshal(data, &metadata); err != nil {
		return "", false
	}
	return metadata.ActualFilename, true
}