I'm honestly shocked that it even compiled the first time and did anything remotely as expected, much less _exactly_ as expected.

Prompt to Claude.ai using 3.7 Sonnet:

```
this is awesome. Though on line 303, the os.Rename fails because /tmp is on a different device partition than the downloads/ directory. This should instead Create and write a new file and remove the old one.
```

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
406 lines
11 KiB
Go
406 lines
11 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha1"
|
|
"encoding/json"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"os/signal"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
"sync"
|
|
"syscall"
|
|
"time"
|
|
)
|
|
|
|
// FileMetadata is the JSON record written next to each downloaded file. One
// record is stored per unique SHA-1 checksum, in a file named
// "<sha1>.json" inside the metadata directory.
type FileMetadata struct {
	// URL is the address the file was requested from.
	URL string `json:"url"`

	// ContentDisposition is the raw Content-Disposition response header.
	ContentDisposition string `json:"content_disposition,omitempty"`

	// SuggestedFilename is the name derived from the response header or the
	// URL; only its extension influences the name used on disk.
	SuggestedFilename string `json:"suggested_filename,omitempty"`

	// ActualFilename is the name of the stored file: the SHA-1 checksum plus
	// the extension of SuggestedFilename, if it has one.
	ActualFilename string `json:"actual_filename"`

	// SHA1Checksum is the lowercase hex SHA-1 digest of the file contents.
	SHA1Checksum string `json:"sha1_checksum"`

	// ETag is the raw ETag response header.
	ETag string `json:"etag,omitempty"`

	// LastModified is the parsed Last-Modified response header, or the zero
	// time when the header is absent or unparseable.
	//
	// NOTE: encoding/json never treats a struct as empty, so omitempty has no
	// effect here and a zero time is still written out.
	LastModified time.Time `json:"last_modified,omitempty"`

	// DownloadedAt is the local time at which the record was created.
	DownloadedAt time.Time `json:"downloaded_at"`

	// ContentType is the raw Content-Type response header.
	ContentType string `json:"content_type,omitempty"`

	// ContentLength is the number of body bytes actually received, not the
	// value of the Content-Length header.
	ContentLength int64 `json:"content_length,omitempty"`
}
|
|
|
|
// Config carries the settings chosen on the command line. It is built once
// in main and passed by value to the download functions.
type Config struct {
	// Concurrency is the number of worker goroutines downloading in parallel.
	Concurrency int

	// OutputDir is the directory that receives the downloaded files.
	OutputDir string

	// MetadataDir is the directory that receives the per-file JSON metadata.
	MetadataDir string

	// Timeout is the overall time limit applied to each HTTP request.
	Timeout time.Duration

	// RetryCount is how many additional attempts follow a failed download.
	RetryCount int

	// RetryDelay is the pause before each retry.
	RetryDelay time.Duration

	// SkipExisting skips storing a file whose checksum already has metadata.
	SkipExisting bool

	// Verbose enables progress logging.
	Verbose bool
}
|
|
|
|
func main() {
|
|
// Parse command line flags
|
|
concurrency := flag.Int("concurrency", 10, "Number of concurrent downloads")
|
|
outputDir := flag.String("output", "downloads", "Directory to store downloaded files")
|
|
metadataDir := flag.String("metadata", "metadata", "Directory to store metadata files")
|
|
timeout := flag.Duration("timeout", 5*time.Minute, "Download timeout")
|
|
retryCount := flag.Int("retries", 3, "Number of retries for failed downloads")
|
|
retryDelay := flag.Duration("retry-delay", 5*time.Second, "Delay between retries")
|
|
skipExisting := flag.Bool("skip-existing", true, "Skip download if file with same checksum exists")
|
|
verbose := flag.Bool("verbose", false, "Enable verbose logging")
|
|
flag.Parse()
|
|
|
|
// Create configuration
|
|
config := Config{
|
|
Concurrency: *concurrency,
|
|
OutputDir: *outputDir,
|
|
MetadataDir: *metadataDir,
|
|
Timeout: *timeout,
|
|
RetryCount: *retryCount,
|
|
RetryDelay: *retryDelay,
|
|
SkipExisting: *skipExisting,
|
|
Verbose: *verbose,
|
|
}
|
|
|
|
// Ensure output directories exist
|
|
for _, dir := range []string{config.OutputDir, config.MetadataDir} {
|
|
if err := os.MkdirAll(dir, 0755); err != nil {
|
|
log.Fatalf("Failed to create directory %s: %v", dir, err)
|
|
}
|
|
}
|
|
|
|
// Read URLs from stdin or file
|
|
var urls []string
|
|
if flag.NArg() > 0 {
|
|
// Read from file
|
|
content, err := os.ReadFile(flag.Arg(0))
|
|
if err != nil {
|
|
log.Fatalf("Failed to read URL file: %v", err)
|
|
}
|
|
urls = strings.Split(string(content), "\n")
|
|
} else {
|
|
// Read from stdin
|
|
content, err := io.ReadAll(os.Stdin)
|
|
if err != nil {
|
|
log.Fatalf("Failed to read URLs from stdin: %v", err)
|
|
}
|
|
urls = strings.Split(string(content), "\n")
|
|
}
|
|
|
|
// Filter empty lines and deduplicate URLs
|
|
uniqueURLs := make(map[string]struct{})
|
|
var filteredURLs []string
|
|
for _, url := range urls {
|
|
url = strings.TrimSpace(url)
|
|
if url == "" || strings.HasPrefix(url, "#") {
|
|
continue
|
|
}
|
|
if _, exists := uniqueURLs[url]; !exists {
|
|
uniqueURLs[url] = struct{}{}
|
|
filteredURLs = append(filteredURLs, url)
|
|
}
|
|
}
|
|
|
|
if len(filteredURLs) == 0 {
|
|
log.Fatal("No valid URLs to download")
|
|
}
|
|
|
|
if config.Verbose {
|
|
log.Printf("Found %d unique URLs to download", len(filteredURLs))
|
|
}
|
|
|
|
// Setup HTTP client with reasonable defaults
|
|
httpClient := &http.Client{
|
|
Timeout: config.Timeout,
|
|
Transport: &http.Transport{
|
|
MaxIdleConnsPerHost: config.Concurrency,
|
|
IdleConnTimeout: 90 * time.Second,
|
|
DisableCompression: true, // To ensure we get the exact file
|
|
},
|
|
}
|
|
|
|
// Set up signal handling for graceful shutdown
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
defer cancel()
|
|
|
|
// Create a channel to listen for OS signals
|
|
sigChan := make(chan os.Signal, 1)
|
|
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
|
|
|
// Start a goroutine to handle the signal
|
|
go func() {
|
|
sig := <-sigChan
|
|
log.Printf("Received signal %v, initiating shutdown...", sig)
|
|
cancel()
|
|
}()
|
|
|
|
// Process URLs concurrently
|
|
var wg sync.WaitGroup
|
|
urlChan := make(chan string, len(filteredURLs))
|
|
|
|
// Start worker goroutines
|
|
for i := 0; i < config.Concurrency; i++ {
|
|
wg.Add(1)
|
|
go func(workerID int) {
|
|
defer wg.Done()
|
|
for url := range urlChan {
|
|
select {
|
|
case <-ctx.Done():
|
|
return // Context cancelled, stop processing
|
|
default:
|
|
if config.Verbose {
|
|
log.Printf("Worker %d processing URL: %s", workerID, url)
|
|
}
|
|
downloadWithRetry(ctx, httpClient, url, config, workerID)
|
|
}
|
|
}
|
|
}(i)
|
|
}
|
|
|
|
// Send URLs to workers
|
|
for _, url := range filteredURLs {
|
|
select {
|
|
case <-ctx.Done():
|
|
break
|
|
case urlChan <- url:
|
|
// URL sent to worker
|
|
}
|
|
}
|
|
close(urlChan)
|
|
|
|
// Wait for all downloads to complete
|
|
wg.Wait()
|
|
log.Println("All downloads completed or cancelled")
|
|
}
|
|
|
|
func downloadWithRetry(ctx context.Context, client *http.Client, url string, config Config, workerID int) {
|
|
var err error
|
|
for attempt := 0; attempt <= config.RetryCount; attempt++ {
|
|
if attempt > 0 {
|
|
log.Printf("Retry %d/%d for URL: %s", attempt, config.RetryCount, url)
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-time.After(config.RetryDelay):
|
|
// Continue with retry
|
|
}
|
|
}
|
|
|
|
err = downloadURL(ctx, client, url, config, workerID)
|
|
if err == nil || err == context.Canceled {
|
|
return
|
|
}
|
|
|
|
log.Printf("Download error (attempt %d/%d): %v", attempt+1, config.RetryCount+1, err)
|
|
}
|
|
log.Printf("Failed to download after %d attempts: %s - %v", config.RetryCount+1, url, err)
|
|
}
|
|
|
|
func downloadURL(ctx context.Context, client *http.Client, url string, config Config, workerID int) error {
|
|
// Create HTTP request with context for cancellation
|
|
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create request: %w", err)
|
|
}
|
|
|
|
// Set appropriate headers
|
|
req.Header.Set("User-Agent", "URL-Downloader/1.0")
|
|
|
|
// Perform the request
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return fmt.Errorf("request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
|
return fmt.Errorf("HTTP error: %s", resp.Status)
|
|
}
|
|
|
|
// Extract filename from Content-Disposition header or URL
|
|
suggestedFilename := ""
|
|
contentDisposition := resp.Header.Get("Content-Disposition")
|
|
if contentDisposition != "" {
|
|
re := regexp.MustCompile(`filename=["']?([^"']+)["']?`)
|
|
matches := re.FindStringSubmatch(contentDisposition)
|
|
if len(matches) > 1 {
|
|
suggestedFilename = matches[1]
|
|
}
|
|
}
|
|
|
|
// If no filename from header, extract from URL
|
|
if suggestedFilename == "" {
|
|
urlPath := strings.Split(url, "/")
|
|
if len(urlPath) > 0 {
|
|
suggestedFilename = urlPath[len(urlPath)-1]
|
|
// Remove query parameters if present
|
|
suggestedFilename = strings.Split(suggestedFilename, "?")[0]
|
|
}
|
|
}
|
|
|
|
// If still no filename, use a generic one with timestamp
|
|
if suggestedFilename == "" {
|
|
suggestedFilename = fmt.Sprintf("download-%d-%d", workerID, time.Now().Unix())
|
|
}
|
|
|
|
// Create temporary file for download
|
|
tempFile, err := os.CreateTemp("", "download-*")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create temp file: %w", err)
|
|
}
|
|
tempFilePath := tempFile.Name()
|
|
defer func() {
|
|
tempFile.Close()
|
|
// Only remove the temp file if we didn't successfully move it
|
|
if _, err := os.Stat(tempFilePath); err == nil {
|
|
os.Remove(tempFilePath)
|
|
}
|
|
}()
|
|
|
|
// Calculate SHA-1 while downloading
|
|
hash := sha1.New()
|
|
writer := io.MultiWriter(tempFile, hash)
|
|
|
|
// Download the file
|
|
n, err := io.Copy(writer, resp.Body)
|
|
if err != nil {
|
|
return fmt.Errorf("download failed: %w", err)
|
|
}
|
|
|
|
// Get the SHA-1 checksum
|
|
sha1sum := fmt.Sprintf("%x", hash.Sum(nil))
|
|
|
|
// Check if we already have this file
|
|
if config.SkipExisting {
|
|
existingPath, exists := findExistingFile(config.MetadataDir, sha1sum)
|
|
if exists {
|
|
if config.Verbose {
|
|
log.Printf("File with SHA-1 %s already exists at %s, skipping", sha1sum, existingPath)
|
|
}
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// Create the target filename based on SHA-1
|
|
targetFilename := sha1sum
|
|
if filepath.Ext(suggestedFilename) != "" {
|
|
// Append original extension if available
|
|
targetFilename = fmt.Sprintf("%s%s", sha1sum, filepath.Ext(suggestedFilename))
|
|
}
|
|
targetPath := filepath.Join(config.OutputDir, targetFilename)
|
|
|
|
// Close the temp file before copying its contents
|
|
tempFile.Close()
|
|
|
|
// Fix: Instead of using os.Rename, copy the file contents to handle cross-partition moves
|
|
if err := copyFileAcrossPartitions(tempFilePath, targetPath); err != nil {
|
|
return fmt.Errorf("failed to copy file: %w", err)
|
|
}
|
|
|
|
// Parse Last-Modified header
|
|
var lastModified time.Time
|
|
lastModifiedStr := resp.Header.Get("Last-Modified")
|
|
if lastModifiedStr != "" {
|
|
lastModified, _ = time.Parse(time.RFC1123, lastModifiedStr)
|
|
}
|
|
|
|
// Create metadata
|
|
metadata := FileMetadata{
|
|
URL: url,
|
|
ContentDisposition: contentDisposition,
|
|
SuggestedFilename: suggestedFilename,
|
|
ActualFilename: targetFilename,
|
|
SHA1Checksum: sha1sum,
|
|
ETag: resp.Header.Get("ETag"),
|
|
LastModified: lastModified,
|
|
DownloadedAt: time.Now(),
|
|
ContentType: resp.Header.Get("Content-Type"),
|
|
ContentLength: n,
|
|
}
|
|
|
|
// Write metadata to file
|
|
metadataPath := filepath.Join(config.MetadataDir, sha1sum+".json")
|
|
metadataJSON, err := json.MarshalIndent(metadata, "", " ")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to marshal metadata: %w", err)
|
|
}
|
|
|
|
if err := os.WriteFile(metadataPath, metadataJSON, 0644); err != nil {
|
|
return fmt.Errorf("failed to write metadata: %w", err)
|
|
}
|
|
|
|
if config.Verbose {
|
|
log.Printf("Successfully downloaded %s (%d bytes) to %s", url, n, targetPath)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// copyFileAcrossPartitions moves srcPath to dstPath by copying the contents
// and then deleting the source, which works even when the two paths live on
// different partitions (where os.Rename fails).
//
// If the copy fails, the partially written destination is removed, the source
// is left untouched, and the error is returned. Failure to delete the source
// after a successful copy is only logged, and nil is still returned.
func copyFileAcrossPartitions(srcPath, dstPath string) error {
	if err := copyFileContents(srcPath, dstPath); err != nil {
		return err
	}

	// Both files are closed by now, so the source can be removed safely.
	if err := os.Remove(srcPath); err != nil {
		// Continue even if we couldn't remove the temp file
		log.Printf("Warning: Failed to remove temporary file %s: %v", srcPath, err)
	}

	return nil
}

// copyFileContents writes the contents of srcPath to dstPath (created or
// truncated), syncs the data to disk and closes both files. On any failure
// the destination file is removed again.
func copyFileContents(srcPath, dstPath string) (err error) {
	srcFile, err := os.Open(srcPath)
	if err != nil {
		return fmt.Errorf("failed to open source file: %w", err)
	}
	// Read-only handle: nothing can be lost if closing it fails.
	defer srcFile.Close()

	dstFile, err := os.Create(dstPath)
	if err != nil {
		return fmt.Errorf("failed to create destination file: %w", err)
	}
	defer func() {
		// A close error on a written file can mean lost data, so report it
		// unless an earlier error is already being returned.
		if closeErr := dstFile.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("failed to close destination file: %w", closeErr)
		}
		if err != nil {
			// Best effort: do not leave a truncated file behind.
			os.Remove(dstPath)
		}
	}()

	if _, err = io.Copy(dstFile, srcFile); err != nil {
		return fmt.Errorf("failed to copy file contents: %w", err)
	}

	// Sync file to ensure all data is written to disk
	if err = dstFile.Sync(); err != nil {
		return fmt.Errorf("failed to sync file: %w", err)
	}

	return nil
}
|
|
|
|
// findExistingFile checks if a file with the given SHA-1 checksum already exists
|
|
func findExistingFile(metadataDir, sha1sum string) (string, bool) {
|
|
metadataPath := filepath.Join(metadataDir, sha1sum+".json")
|
|
_, err := os.Stat(metadataPath)
|
|
if err != nil {
|
|
return "", false
|
|
}
|
|
|
|
// Read the metadata to get the actual file path
|
|
data, err := os.ReadFile(metadataPath)
|
|
if err != nil {
|
|
return "", false
|
|
}
|
|
|
|
var metadata FileMetadata
|
|
if err := json.Unmarshal(data, &metadata); err != nil {
|
|
return "", false
|
|
}
|
|
|
|
return metadata.ActualFilename, true
|
|
}
|