diff --git a/main.go b/main.go index e29dff2..5d6eb9e 100644 --- a/main.go +++ b/main.go @@ -22,6 +22,7 @@ import ( // FileMetadata stores information about a downloaded file type FileMetadata struct { URL string `json:"url"` + URLHash string `json:"url_hash"` ContentDisposition string `json:"content_disposition,omitempty"` SuggestedFilename string `json:"suggested_filename,omitempty"` ActualFilename string `json:"actual_filename"` @@ -45,6 +46,13 @@ type Config struct { Verbose bool } +// Calculate SHA-1 hash of a string +func hashString(s string) string { + h := sha1.New() + h.Write([]byte(s)) + return fmt.Sprintf("%x", h.Sum(nil)) +} + func main() { // Parse command line flags concurrency := flag.Int("concurrency", 10, "Number of concurrent downloads") @@ -204,6 +212,9 @@ func downloadWithRetry(ctx context.Context, client *http.Client, url string, con } func downloadURL(ctx context.Context, client *http.Client, url string, config Config, workerID int) error { + // Calculate URL hash upfront + urlHash := hashString(url) + // Create HTTP request with context for cancellation req, err := http.NewRequestWithContext(ctx, "GET", url, nil) if err != nil { @@ -275,33 +286,49 @@ func downloadURL(ctx context.Context, client *http.Client, url string, config Co } // Get the SHA-1 checksum - sha1sum := fmt.Sprintf("%x", hash.Sum(nil)) + contentHash := fmt.Sprintf("%x", hash.Sum(nil)) - // Check if we already have this file + // Check if this exact URL and content combination has already been downloaded + metadataFilename := fmt.Sprintf("%s-%s.json", contentHash, urlHash) + metadataPath := filepath.Join(config.MetadataDir, metadataFilename) + + if _, err := os.Stat(metadataPath); err == nil && config.SkipExisting { + if config.Verbose { + log.Printf("This URL and content combination already exists, skipping: %s", url) + } + return nil + } + + // Check if we already have this file content from a different URL + existingFile := "" if config.SkipExisting { - existingPath, exists := findExistingFile(config.MetadataDir, sha1sum) - if exists { + // Look for any file with this content hash + contentFiles, err := filepath.Glob(filepath.Join(config.OutputDir, contentHash+"*")) + if err == nil && len(contentFiles) > 0 { + existingFile = contentFiles[0] if config.Verbose { - log.Printf("File with SHA-1 %s already exists at %s, skipping", sha1sum, existingPath) + log.Printf("File with content hash %s already exists at %s, reusing", contentHash, existingFile) } - return nil } } // Create the target filename based on SHA-1 - targetFilename := sha1sum + targetFilename := contentHash if filepath.Ext(suggestedFilename) != "" { // Append original extension if available - targetFilename = fmt.Sprintf("%s%s", sha1sum, filepath.Ext(suggestedFilename)) + targetFilename = fmt.Sprintf("%s%s", contentHash, filepath.Ext(suggestedFilename)) } targetPath := filepath.Join(config.OutputDir, targetFilename) // Close the temp file before copying its contents tempFile.Close() - // Fix: Instead of using os.Rename, copy the file contents to handle cross-partition moves - if err := copyFileAcrossPartitions(tempFilePath, targetPath); err != nil { - return fmt.Errorf("failed to copy file: %w", err) + // Copy the file if it doesn't already exist + if existingFile == "" { + // Fix: Instead of using os.Rename, copy the file contents to handle cross-partition moves + if err := copyFileAcrossPartitions(tempFilePath, targetPath); err != nil { + return fmt.Errorf("failed to copy file: %w", err) + } } // Parse Last-Modified header @@ -314,10 +341,11 @@ func downloadURL(ctx context.Context, client *http.Client, url string, config Co // Create metadata metadata := FileMetadata{ URL: url, + URLHash: urlHash, ContentDisposition: contentDisposition, SuggestedFilename: suggestedFilename, ActualFilename: targetFilename, - SHA1Checksum: sha1sum, + SHA1Checksum: contentHash, ETag: resp.Header.Get("ETag"), LastModified: lastModified, DownloadedAt: time.Now(), @@ -325,8 +353,7 @@ func downloadURL(ctx context.Context, client *http.Client, url string, config Co ContentLength: n, } - // Write metadata to file - metadataPath := filepath.Join(config.MetadataDir, sha1sum+".json") + // Write metadata to file with combined hash name metadataJSON, err := json.MarshalIndent(metadata, "", " ") if err != nil { return fmt.Errorf("failed to marshal metadata: %w", err) @@ -337,7 +364,8 @@ func downloadURL(ctx context.Context, client *http.Client, url string, config Co } if config.Verbose { - log.Printf("Successfully downloaded %s (%d bytes) to %s", url, n, targetPath) + log.Printf("Successfully processed %s (%d bytes), content stored at %s, metadata at %s", + url, n, targetPath, metadataPath) } return nil @@ -382,25 +410,3 @@ func copyFileAcrossPartitions(srcPath, dstPath string) error { return nil } - -// findExistingFile checks if a file with the given SHA-1 checksum already exists -func findExistingFile(metadataDir, sha1sum string) (string, bool) { - metadataPath := filepath.Join(metadataDir, sha1sum+".json") - _, err := os.Stat(metadataPath) - if err != nil { - return "", false - } - - // Read the metadata to get the actual file path - data, err := os.ReadFile(metadataPath) - if err != nil { - return "", false - } - - var metadata FileMetadata - if err := json.Unmarshal(data, &metadata); err != nil { - return "", false - } - - return metadata.ActualFilename, true -}