// Package main implements a concurrent URL downloader: files are stored
// under their SHA-1 content hash and each (URL, content) pair gets a JSON
// metadata record, so re-downloads of identical content can be skipped.
package main

import (
	"context"
	"crypto/sha1"
	"encoding/json"
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"os/signal"
	"path/filepath"
	"regexp"
	"strings"
	"sync"
	"syscall"
	"time"
)

// FileMetadata stores information about a downloaded file.
type FileMetadata struct {
	URL                string    `json:"url"`
	URLHash            string    `json:"url_hash"`
	ContentDisposition string    `json:"content_disposition,omitempty"`
	SuggestedFilename  string    `json:"suggested_filename,omitempty"`
	ActualFilename     string    `json:"actual_filename"`
	SHA1Checksum       string    `json:"sha1_checksum"`
	ETag               string    `json:"etag,omitempty"`
	LastModified       time.Time `json:"last_modified,omitempty"`
	DownloadedAt       time.Time `json:"downloaded_at"`
	ContentType        string    `json:"content_type,omitempty"`
	ContentLength      int64     `json:"content_length,omitempty"`
}

// Config holds application configuration.
type Config struct {
	Concurrency  int
	OutputDir    string
	MetadataDir  string
	Timeout      time.Duration
	RetryCount   int
	RetryDelay   time.Duration
	SkipExisting bool
	Verbose      bool
}

// filenameRe extracts the filename parameter from a Content-Disposition
// header. Compiled once at package scope (idiomatic Go) instead of per
// request as the original did inside downloadURL.
var filenameRe = regexp.MustCompile(`filename=["']?([^"']+)["']?`)

// hashString returns the hex-encoded SHA-1 digest of s.
// SHA-1 is used here only as a content/URL fingerprint, not for security.
func hashString(s string) string {
	h := sha1.New()
	h.Write([]byte(s))
	return fmt.Sprintf("%x", h.Sum(nil))
}

func main() {
	// Parse command line flags.
	concurrency := flag.Int("concurrency", 10, "Number of concurrent downloads")
	outputDir := flag.String("output", "downloads", "Directory to store downloaded files")
	metadataDir := flag.String("metadata", "metadata", "Directory to store metadata files")
	timeout := flag.Duration("timeout", 5*time.Minute, "Download timeout")
	retryCount := flag.Int("retries", 3, "Number of retries for failed downloads")
	retryDelay := flag.Duration("retry-delay", 5*time.Second, "Delay between retries")
	skipExisting := flag.Bool("skip-existing", true, "Skip download if file with same checksum exists")
	verbose := flag.Bool("verbose", false, "Enable verbose logging")
	flag.Parse()

	config := Config{
		Concurrency:  *concurrency,
		OutputDir:    *outputDir,
		MetadataDir:  *metadataDir,
		Timeout:      *timeout,
		RetryCount:   *retryCount,
		RetryDelay:   *retryDelay,
		SkipExisting: *skipExisting,
		Verbose:      *verbose,
	}

	// Ensure output directories exist.
	for _, dir := range []string{config.OutputDir, config.MetadataDir} {
		if err := os.MkdirAll(dir, 0755); err != nil {
			log.Fatalf("Failed to create directory %s: %v", dir, err)
		}
	}

	// URLs come from a file argument if given, otherwise from stdin.
	var urls []string
	if flag.NArg() > 0 {
		content, err := os.ReadFile(flag.Arg(0))
		if err != nil {
			log.Fatalf("Failed to read URL file: %v", err)
		}
		urls = strings.Split(string(content), "\n")
	} else {
		content, err := io.ReadAll(os.Stdin)
		if err != nil {
			log.Fatalf("Failed to read URLs from stdin: %v", err)
		}
		urls = strings.Split(string(content), "\n")
	}

	// Drop blank lines and #-comments; deduplicate while preserving order.
	uniqueURLs := make(map[string]struct{})
	var filteredURLs []string
	for _, url := range urls {
		url = strings.TrimSpace(url)
		if url == "" || strings.HasPrefix(url, "#") {
			continue
		}
		if _, exists := uniqueURLs[url]; !exists {
			uniqueURLs[url] = struct{}{}
			filteredURLs = append(filteredURLs, url)
		}
	}
	if len(filteredURLs) == 0 {
		log.Fatal("No valid URLs to download")
	}
	if config.Verbose {
		log.Printf("Found %d unique URLs to download", len(filteredURLs))
	}

	// One shared HTTP client; compression is disabled so the bytes we hash
	// and store are exactly the bytes the server sent.
	httpClient := &http.Client{
		Timeout: config.Timeout,
		Transport: &http.Transport{
			MaxIdleConnsPerHost: config.Concurrency,
			IdleConnTimeout:     90 * time.Second,
			DisableCompression:  true, // To ensure we get the exact file
		},
	}

	// Cancel the context on SIGINT/SIGTERM for graceful shutdown.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
	go func() {
		sig := <-sigChan
		log.Printf("Received signal %v, initiating shutdown...", sig)
		cancel()
	}()

	// Start a fixed pool of worker goroutines fed from a channel.
	var wg sync.WaitGroup
	urlChan := make(chan string, len(filteredURLs))
	for i := 0; i < config.Concurrency; i++ {
		wg.Add(1)
		go func(workerID int) {
			defer wg.Done()
			for url := range urlChan {
				select {
				case <-ctx.Done():
					return // Context cancelled, stop processing
				default:
					if config.Verbose {
						log.Printf("Worker %d processing URL: %s", workerID, url)
					}
					downloadWithRetry(ctx, httpClient, url, config, workerID)
				}
			}
		}(i)
	}

	// Feed URLs to the workers.
	// BUG FIX: the original used a bare "break" inside the select, which
	// only exits the select statement, not the for loop — on cancellation
	// it kept queueing URLs. The labeled break exits the loop as intended.
feed:
	for _, url := range filteredURLs {
		select {
		case <-ctx.Done():
			break feed
		case urlChan <- url:
			// URL sent to worker
		}
	}
	close(urlChan)

	// Wait for all downloads to complete.
	wg.Wait()
	log.Println("All downloads completed or cancelled")
}

// downloadWithRetry runs downloadURL up to config.RetryCount+1 times,
// waiting config.RetryDelay between attempts. It returns early (without
// logging a failure) on success or on context cancellation.
func downloadWithRetry(ctx context.Context, client *http.Client, url string, config Config, workerID int) {
	var err error
	for attempt := 0; attempt <= config.RetryCount; attempt++ {
		if attempt > 0 {
			log.Printf("Retry %d/%d for URL: %s", attempt, config.RetryCount, url)
			select {
			case <-ctx.Done():
				return
			case <-time.After(config.RetryDelay):
				// Continue with retry
			}
		}
		err = downloadURL(ctx, client, url, config, workerID)
		// BUG FIX: the original compared err == context.Canceled, which never
		// matches errors wrapped by net/http (*url.Error) or by the %w verbs
		// in downloadURL. errors.Is unwraps the chain, so a cancelled download
		// is not pointlessly retried.
		if err == nil || errors.Is(err, context.Canceled) {
			return
		}
		log.Printf("Download error (attempt %d/%d): %v", attempt+1, config.RetryCount+1, err)
	}
	log.Printf("Failed to download after %d attempts: %s - %v", config.RetryCount+1, url, err)
}

// downloadURL fetches one URL, hashes the body with SHA-1 while streaming it
// to a temp file, stores the content as <sha1><ext> in config.OutputDir
// (deduplicating by content hash when config.SkipExisting is set), and
// writes a <sha1>-<urlhash>.json metadata record to config.MetadataDir.
func downloadURL(ctx context.Context, client *http.Client, url string, config Config, workerID int) error {
	// Calculate URL hash upfront.
	urlHash := hashString(url)

	// Create HTTP request with context for cancellation.
	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
	if err != nil {
		return fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("User-Agent", "URL-Downloader/1.0")

	resp, err := client.Do(req)
	if err != nil {
		return fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("HTTP error: %s", resp.Status)
	}

	// Pick a suggested filename: Content-Disposition header first, then the
	// last URL path segment, then a generated fallback. Only its extension
	// is ultimately used for the stored name.
	suggestedFilename := ""
	contentDisposition := resp.Header.Get("Content-Disposition")
	if contentDisposition != "" {
		if matches := filenameRe.FindStringSubmatch(contentDisposition); len(matches) > 1 {
			suggestedFilename = matches[1]
		}
	}
	if suggestedFilename == "" {
		urlPath := strings.Split(url, "/")
		if len(urlPath) > 0 {
			suggestedFilename = urlPath[len(urlPath)-1]
			// Remove query parameters if present.
			suggestedFilename = strings.Split(suggestedFilename, "?")[0]
		}
	}
	if suggestedFilename == "" {
		suggestedFilename = fmt.Sprintf("download-%d-%d", workerID, time.Now().Unix())
	}

	// Download into a temp file; the final name needs the content hash,
	// which is only known after the body has been read.
	tempFile, err := os.CreateTemp("", "download-*")
	if err != nil {
		return fmt.Errorf("failed to create temp file: %w", err)
	}
	tempFilePath := tempFile.Name()
	defer func() {
		tempFile.Close()
		// Only remove the temp file if we didn't successfully move it.
		if _, err := os.Stat(tempFilePath); err == nil {
			os.Remove(tempFilePath)
		}
	}()

	// Calculate SHA-1 while downloading.
	hash := sha1.New()
	writer := io.MultiWriter(tempFile, hash)
	n, err := io.Copy(writer, resp.Body)
	if err != nil {
		return fmt.Errorf("download failed: %w", err)
	}
	contentHash := fmt.Sprintf("%x", hash.Sum(nil))

	// Skip if this exact URL+content combination was already recorded.
	metadataFilename := fmt.Sprintf("%s-%s.json", contentHash, urlHash)
	metadataPath := filepath.Join(config.MetadataDir, metadataFilename)
	if _, err := os.Stat(metadataPath); err == nil && config.SkipExisting {
		if config.Verbose {
			log.Printf("This URL and content combination already exists, skipping: %s", url)
		}
		return nil
	}

	// Check if we already have this content from a different URL.
	existingFile := ""
	if config.SkipExisting {
		contentFiles, err := filepath.Glob(filepath.Join(config.OutputDir, contentHash+"*"))
		if err == nil && len(contentFiles) > 0 {
			existingFile = contentFiles[0]
			if config.Verbose {
				log.Printf("File with content hash %s already exists at %s, reusing", contentHash, existingFile)
			}
		}
	}

	// Target name is the content hash, keeping the original extension if any.
	targetFilename := contentHash
	if filepath.Ext(suggestedFilename) != "" {
		targetFilename = fmt.Sprintf("%s%s", contentHash, filepath.Ext(suggestedFilename))
	}
	targetPath := filepath.Join(config.OutputDir, targetFilename)

	// Close the temp file before copying its contents.
	tempFile.Close()

	if existingFile == "" {
		// Copy rather than os.Rename: the temp dir may be on a different
		// partition, where rename fails with EXDEV.
		if err := copyFileAcrossPartitions(tempFilePath, targetPath); err != nil {
			return fmt.Errorf("failed to copy file: %w", err)
		}
	}

	// Parse Last-Modified header; zero time if absent or malformed.
	var lastModified time.Time
	if lastModifiedStr := resp.Header.Get("Last-Modified"); lastModifiedStr != "" {
		lastModified, _ = time.Parse(time.RFC1123, lastModifiedStr)
	}

	metadata := FileMetadata{
		URL:                url,
		URLHash:            urlHash,
		ContentDisposition: contentDisposition,
		SuggestedFilename:  suggestedFilename,
		ActualFilename:     targetFilename,
		SHA1Checksum:       contentHash,
		ETag:               resp.Header.Get("ETag"),
		LastModified:       lastModified,
		DownloadedAt:       time.Now(),
		ContentType:        resp.Header.Get("Content-Type"),
		ContentLength:      n,
	}

	metadataJSON, err := json.MarshalIndent(metadata, "", "  ")
	if err != nil {
		return fmt.Errorf("failed to marshal metadata: %w", err)
	}
	if err := os.WriteFile(metadataPath, metadataJSON, 0644); err != nil {
		return fmt.Errorf("failed to write metadata: %w", err)
	}

	if config.Verbose {
		log.Printf("Successfully processed %s (%d bytes), content stored at %s, metadata at %s",
			url, n, targetPath, metadataPath)
	}
	return nil
}

// copyFileAcrossPartitions copies srcPath to dstPath with a read/write copy
// (safe when the two paths live on different filesystems, where os.Rename
// fails), fsyncs the destination, then removes the source. Failure to remove
// the source is logged but not fatal.
func copyFileAcrossPartitions(srcPath, dstPath string) error {
	srcFile, err := os.Open(srcPath)
	if err != nil {
		return fmt.Errorf("failed to open source file: %w", err)
	}
	defer srcFile.Close()

	dstFile, err := os.Create(dstPath)
	if err != nil {
		return fmt.Errorf("failed to create destination file: %w", err)
	}

	if _, err := io.Copy(dstFile, srcFile); err != nil {
		dstFile.Close()
		return fmt.Errorf("failed to copy file contents: %w", err)
	}

	// Sync file to ensure all data is written to disk.
	if err := dstFile.Sync(); err != nil {
		dstFile.Close()
		return fmt.Errorf("failed to sync file: %w", err)
	}

	// BUG FIX: the original ignored the destination Close error (it closed
	// via defer and again explicitly, discarding both results). A failed
	// Close after writes can mean lost data, so it must be checked.
	if err := dstFile.Close(); err != nil {
		return fmt.Errorf("failed to close destination file: %w", err)
	}

	// Remove the temporary file; continue even if removal fails.
	if err := os.Remove(srcPath); err != nil {
		log.Printf("Warning: Failed to remove temporary file %s: %v", srcPath, err)
	}
	return nil
}