main.go: avoid metadata filename collisions when multiple URLs serve the same PDF
Prompt given to claude.ai (3.7 Sonnet): ``` One additional issue is that the metadata JSON file is also named after the hash of the PDF, but if two different URLs download a matching PDF, the hash will be the same while the metadata is actually different for the two, so they will collide. Let's also get a SHA-1 hash of the URL string itself, and name the metadata JSON file with the hash of the PDF, then a hyphen, then the hash of the URL. ``` Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
This commit is contained in:
parent
4400ea5bca
commit
67e835eea9
1 changed files with 43 additions and 37 deletions
80
main.go
80
main.go
|
@ -22,6 +22,7 @@ import (
|
||||||
// FileMetadata stores information about a downloaded file
|
// FileMetadata stores information about a downloaded file
|
||||||
type FileMetadata struct {
|
type FileMetadata struct {
|
||||||
URL string `json:"url"`
|
URL string `json:"url"`
|
||||||
|
URLHash string `json:"url_hash"`
|
||||||
ContentDisposition string `json:"content_disposition,omitempty"`
|
ContentDisposition string `json:"content_disposition,omitempty"`
|
||||||
SuggestedFilename string `json:"suggested_filename,omitempty"`
|
SuggestedFilename string `json:"suggested_filename,omitempty"`
|
||||||
ActualFilename string `json:"actual_filename"`
|
ActualFilename string `json:"actual_filename"`
|
||||||
|
@ -45,6 +46,13 @@ type Config struct {
|
||||||
Verbose bool
|
Verbose bool
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Calculate SHA-1 hash of a string
|
||||||
|
func hashString(s string) string {
|
||||||
|
h := sha1.New()
|
||||||
|
h.Write([]byte(s))
|
||||||
|
return fmt.Sprintf("%x", h.Sum(nil))
|
||||||
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
// Parse command line flags
|
// Parse command line flags
|
||||||
concurrency := flag.Int("concurrency", 10, "Number of concurrent downloads")
|
concurrency := flag.Int("concurrency", 10, "Number of concurrent downloads")
|
||||||
|
@ -204,6 +212,9 @@ func downloadWithRetry(ctx context.Context, client *http.Client, url string, con
|
||||||
}
|
}
|
||||||
|
|
||||||
func downloadURL(ctx context.Context, client *http.Client, url string, config Config, workerID int) error {
|
func downloadURL(ctx context.Context, client *http.Client, url string, config Config, workerID int) error {
|
||||||
|
// Calculate URL hash upfront
|
||||||
|
urlHash := hashString(url)
|
||||||
|
|
||||||
// Create HTTP request with context for cancellation
|
// Create HTTP request with context for cancellation
|
||||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -275,33 +286,49 @@ func downloadURL(ctx context.Context, client *http.Client, url string, config Co
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get the SHA-1 checksum
|
// Get the SHA-1 checksum
|
||||||
sha1sum := fmt.Sprintf("%x", hash.Sum(nil))
|
contentHash := fmt.Sprintf("%x", hash.Sum(nil))
|
||||||
|
|
||||||
// Check if we already have this file
|
// Check if this exact URL and content combination has already been downloaded
|
||||||
|
metadataFilename := fmt.Sprintf("%s-%s.json", contentHash, urlHash)
|
||||||
|
metadataPath := filepath.Join(config.MetadataDir, metadataFilename)
|
||||||
|
|
||||||
|
if _, err := os.Stat(metadataPath); err == nil && config.SkipExisting {
|
||||||
|
if config.Verbose {
|
||||||
|
log.Printf("This URL and content combination already exists, skipping: %s", url)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if we already have this file content from a different URL
|
||||||
|
existingFile := ""
|
||||||
if config.SkipExisting {
|
if config.SkipExisting {
|
||||||
existingPath, exists := findExistingFile(config.MetadataDir, sha1sum)
|
// Look for any file with this content hash
|
||||||
if exists {
|
contentFiles, err := filepath.Glob(filepath.Join(config.OutputDir, contentHash+"*"))
|
||||||
|
if err == nil && len(contentFiles) > 0 {
|
||||||
|
existingFile = contentFiles[0]
|
||||||
if config.Verbose {
|
if config.Verbose {
|
||||||
log.Printf("File with SHA-1 %s already exists at %s, skipping", sha1sum, existingPath)
|
log.Printf("File with content hash %s already exists at %s, reusing", contentHash, existingFile)
|
||||||
}
|
}
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create the target filename based on SHA-1
|
// Create the target filename based on SHA-1
|
||||||
targetFilename := sha1sum
|
targetFilename := contentHash
|
||||||
if filepath.Ext(suggestedFilename) != "" {
|
if filepath.Ext(suggestedFilename) != "" {
|
||||||
// Append original extension if available
|
// Append original extension if available
|
||||||
targetFilename = fmt.Sprintf("%s%s", sha1sum, filepath.Ext(suggestedFilename))
|
targetFilename = fmt.Sprintf("%s%s", contentHash, filepath.Ext(suggestedFilename))
|
||||||
}
|
}
|
||||||
targetPath := filepath.Join(config.OutputDir, targetFilename)
|
targetPath := filepath.Join(config.OutputDir, targetFilename)
|
||||||
|
|
||||||
// Close the temp file before copying its contents
|
// Close the temp file before copying its contents
|
||||||
tempFile.Close()
|
tempFile.Close()
|
||||||
|
|
||||||
// Fix: Instead of using os.Rename, copy the file contents to handle cross-partition moves
|
// Copy the file if it doesn't already exist
|
||||||
if err := copyFileAcrossPartitions(tempFilePath, targetPath); err != nil {
|
if existingFile == "" {
|
||||||
return fmt.Errorf("failed to copy file: %w", err)
|
// Fix: Instead of using os.Rename, copy the file contents to handle cross-partition moves
|
||||||
|
if err := copyFileAcrossPartitions(tempFilePath, targetPath); err != nil {
|
||||||
|
return fmt.Errorf("failed to copy file: %w", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse Last-Modified header
|
// Parse Last-Modified header
|
||||||
|
@ -314,10 +341,11 @@ func downloadURL(ctx context.Context, client *http.Client, url string, config Co
|
||||||
// Create metadata
|
// Create metadata
|
||||||
metadata := FileMetadata{
|
metadata := FileMetadata{
|
||||||
URL: url,
|
URL: url,
|
||||||
|
URLHash: urlHash,
|
||||||
ContentDisposition: contentDisposition,
|
ContentDisposition: contentDisposition,
|
||||||
SuggestedFilename: suggestedFilename,
|
SuggestedFilename: suggestedFilename,
|
||||||
ActualFilename: targetFilename,
|
ActualFilename: targetFilename,
|
||||||
SHA1Checksum: sha1sum,
|
SHA1Checksum: contentHash,
|
||||||
ETag: resp.Header.Get("ETag"),
|
ETag: resp.Header.Get("ETag"),
|
||||||
LastModified: lastModified,
|
LastModified: lastModified,
|
||||||
DownloadedAt: time.Now(),
|
DownloadedAt: time.Now(),
|
||||||
|
@ -325,8 +353,7 @@ func downloadURL(ctx context.Context, client *http.Client, url string, config Co
|
||||||
ContentLength: n,
|
ContentLength: n,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write metadata to file
|
// Write metadata to file with combined hash name
|
||||||
metadataPath := filepath.Join(config.MetadataDir, sha1sum+".json")
|
|
||||||
metadataJSON, err := json.MarshalIndent(metadata, "", " ")
|
metadataJSON, err := json.MarshalIndent(metadata, "", " ")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to marshal metadata: %w", err)
|
return fmt.Errorf("failed to marshal metadata: %w", err)
|
||||||
|
@ -337,7 +364,8 @@ func downloadURL(ctx context.Context, client *http.Client, url string, config Co
|
||||||
}
|
}
|
||||||
|
|
||||||
if config.Verbose {
|
if config.Verbose {
|
||||||
log.Printf("Successfully downloaded %s (%d bytes) to %s", url, n, targetPath)
|
log.Printf("Successfully processed %s (%d bytes), content stored at %s, metadata at %s",
|
||||||
|
url, n, targetPath, metadataPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
@ -382,25 +410,3 @@ func copyFileAcrossPartitions(srcPath, dstPath string) error {
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// findExistingFile checks if a file with the given SHA-1 checksum already exists
|
|
||||||
func findExistingFile(metadataDir, sha1sum string) (string, bool) {
|
|
||||||
metadataPath := filepath.Join(metadataDir, sha1sum+".json")
|
|
||||||
_, err := os.Stat(metadataPath)
|
|
||||||
if err != nil {
|
|
||||||
return "", false
|
|
||||||
}
|
|
||||||
|
|
||||||
// Read the metadata to get the actual file path
|
|
||||||
data, err := os.ReadFile(metadataPath)
|
|
||||||
if err != nil {
|
|
||||||
return "", false
|
|
||||||
}
|
|
||||||
|
|
||||||
var metadata FileMetadata
|
|
||||||
if err := json.Unmarshal(data, &metadata); err != nil {
|
|
||||||
return "", false
|
|
||||||
}
|
|
||||||
|
|
||||||
return metadata.ActualFilename, true
|
|
||||||
}
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue