fetch-content/main.go
Vincent Batts f78b17c047
main.go: actually ... first iteration from using claude.ai ...
Prompt:
```
I'm going to give you the transcript of me brainstorming an application that I want to write. I was thinking of golang, but it could be in C++ also.
Please organize my brainstorming so that I can begin building out this application:

This application is going to need to read in my list of URLs and use a wait group for the goroutines that go out and fetch each of those URLs. From each fetch there will be an HTTP response header, the Content-Disposition, that has the filename intended for that file, and then for each one of these files, maybe not individually, but as a whole JSON object for the whole download set,
record the URL and the associated Content-Disposition name, and then...
Okay, so each URL is probably going to be fetched with some kind of a watcher and writer so that it's calculating a SHA-1 sum for that file. That can't all be done in memory, so it might need to actually write it to some temp file as it's calculating the SHA-1 sum, and if it successfully downloads,
then it writes it to a file named based on that SHA-1 checksum, and also logs that data structure, JSON or whatever, that maps that SHA-1 checksum to the filename that was given to it. Maybe even some information like the ETag, and if the headers show any kind of Last-Modified date,
that would be relevant, as well as the Content-Disposition headers, so that you can see what the file should have been named, and then also the URL that it was fetched from. So there's correlating metadata of which file points to which URL and such. Because I'm pretty sure that
the filenames would end up having collisions just based on the last part of the URL path, and maybe even some of the filenames from the Content-Disposition headers might have collisions, so we can't rely on either one of those. I hope that even the SHA-1 hashes don't, so it might need to check
for the SHA-1 hash file's existence before it starts writing. Maybe even just check the other files' metadata for the ETag. I wonder if I should do some kind of a scan first for unique ETags. I'm not sure.
I feel like even the metadata, well, yes, it can be in memory. It might not need to be a JSON per file, I don't know, but it makes it clean to keep it as a JSON per file. It's just harder to traverse all of those if you're querying.
Maybe have all that in memory as well as per file, so that if you have two metadata entries referencing the same file, then you can sort that out. Heck, at that point it might just be able to pass through the original output from the Go spider,
which has a very predictable kind of format, you know, where each line is a JSON object, and
maybe just output that same JSON structure but including the new key values, in a new file or to standard out. Okay. As for the concurrency, I think this is just a wait group
from the sync library, and obviously some default number of concurrent connections. It would be nice if it would reuse the same HTTP connection for keep-alives. I don't know if that's possible, but it would make it faster and less work on the web server that we're hitting.
And then spinning off all those different goroutines would need a context to be passed around in case you need to cancel them. It probably also needs a signal handler to sit and watch for signals, in case you need to terminate or kill the whole ordeal.
Yeah, let's see here.
So the setup of this program is going to be to read in the list of URLs, and go ahead and scan for only the unique ones in case there are duplicates.

And check for the output metadata file, so you might want to have an output directory. Somehow come up with a checksum of the output there, maybe, or a checksum of the URL string, I don't know, or of our list of URL strings.
```
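
One idea in the brainstorm, passing the original Go-spider-style JSON lines back out with the new key/values added, is not implemented in main.go below. A minimal sketch of that step, assuming the crawler output is one JSON object per line with a "url" field, and that the downloader has already written its per-file metadata (with the "sha1_checksum" and "actual_filename" keys main.go uses) into a metadata/ directory, might look like:

```
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
)

func main() {
	// Index the per-file metadata JSON that the downloader writes, keyed by URL.
	byURL := map[string]map[string]any{}
	files, _ := filepath.Glob(filepath.Join("metadata", "*.json"))
	for _, f := range files {
		data, err := os.ReadFile(f)
		if err != nil {
			continue
		}
		var m map[string]any
		if err := json.Unmarshal(data, &m); err != nil {
			continue
		}
		if u, ok := m["url"].(string); ok {
			byURL[u] = m
		}
	}

	// Pass each input line through, adding the checksum and on-disk name when known.
	scanner := bufio.NewScanner(os.Stdin)
	for scanner.Scan() {
		var rec map[string]any
		if err := json.Unmarshal(scanner.Bytes(), &rec); err != nil {
			fmt.Println(scanner.Text()) // not a JSON object: pass through untouched
			continue
		}
		if u, ok := rec["url"].(string); ok {
			if m, found := byURL[u]; found {
				rec["sha1_checksum"] = m["sha1_checksum"]
				rec["actual_filename"] = m["actual_filename"]
			}
		}
		out, _ := json.Marshal(rec)
		fmt.Println(string(out))
	}
}
```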

Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>
2025-04-03 21:19:40 -04:00


package main

import (
	"context"
	"crypto/sha1"
	"encoding/json"
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"os/signal"
	"path/filepath"
	"regexp"
	"strings"
	"sync"
	"syscall"
	"time"
)
// FileMetadata stores information about a downloaded file
type FileMetadata struct {
	URL                string    `json:"url"`
	ContentDisposition string    `json:"content_disposition,omitempty"`
	SuggestedFilename  string    `json:"suggested_filename,omitempty"`
	ActualFilename     string    `json:"actual_filename"`
	SHA1Checksum       string    `json:"sha1_checksum"`
	ETag               string    `json:"etag,omitempty"`
	LastModified       time.Time `json:"last_modified,omitempty"`
	DownloadedAt       time.Time `json:"downloaded_at"`
	ContentType        string    `json:"content_type,omitempty"`
	ContentLength      int64     `json:"content_length,omitempty"`
}
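
// For illustration only, a metadata file as written further below looks
// roughly like the following (all values here are made up):
//
//	{
//	  "url": "https://example.com/archive.tar.gz",
//	  "content_disposition": "attachment; filename=\"archive.tar.gz\"",
//	  "suggested_filename": "archive.tar.gz",
//	  "actual_filename": "3f786850e387550fdab836ed7e6dc881de23001b.tar.gz",
//	  "sha1_checksum": "3f786850e387550fdab836ed7e6dc881de23001b",
//	  "etag": "\"abc123\"",
//	  "last_modified": "2025-03-30T12:00:00Z",
//	  "downloaded_at": "2025-04-03T21:19:40-04:00",
//	  "content_type": "application/gzip",
//	  "content_length": 1024
//	}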
// Config holds application configuration
type Config struct {
	Concurrency  int
	OutputDir    string
	MetadataDir  string
	Timeout      time.Duration
	RetryCount   int
	RetryDelay   time.Duration
	SkipExisting bool
	Verbose      bool
}
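
// Typical invocations, for illustration (the flag names are defined in main
// below; the "fetch-content" binary name and urls.txt file are assumptions):
//
//	fetch-content urls.txt
//	cat urls.txt | fetch-content -concurrency 4 -output downloads -verbose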
func main() {
	// Parse command line flags
	concurrency := flag.Int("concurrency", 10, "Number of concurrent downloads")
	outputDir := flag.String("output", "downloads", "Directory to store downloaded files")
	metadataDir := flag.String("metadata", "metadata", "Directory to store metadata files")
	timeout := flag.Duration("timeout", 5*time.Minute, "Download timeout")
	retryCount := flag.Int("retries", 3, "Number of retries for failed downloads")
	retryDelay := flag.Duration("retry-delay", 5*time.Second, "Delay between retries")
	skipExisting := flag.Bool("skip-existing", true, "Skip download if file with same checksum exists")
	verbose := flag.Bool("verbose", false, "Enable verbose logging")
	flag.Parse()

	// Create configuration
	config := Config{
		Concurrency:  *concurrency,
		OutputDir:    *outputDir,
		MetadataDir:  *metadataDir,
		Timeout:      *timeout,
		RetryCount:   *retryCount,
		RetryDelay:   *retryDelay,
		SkipExisting: *skipExisting,
		Verbose:      *verbose,
	}

	// Ensure output directories exist
	for _, dir := range []string{config.OutputDir, config.MetadataDir} {
		if err := os.MkdirAll(dir, 0755); err != nil {
			log.Fatalf("Failed to create directory %s: %v", dir, err)
		}
	}
	// Read URLs from stdin or file
	var urls []string
	if flag.NArg() > 0 {
		// Read from file
		content, err := os.ReadFile(flag.Arg(0))
		if err != nil {
			log.Fatalf("Failed to read URL file: %v", err)
		}
		urls = strings.Split(string(content), "\n")
	} else {
		// Read from stdin
		content, err := io.ReadAll(os.Stdin)
		if err != nil {
			log.Fatalf("Failed to read URLs from stdin: %v", err)
		}
		urls = strings.Split(string(content), "\n")
	}
	// Filter empty lines and deduplicate URLs
	uniqueURLs := make(map[string]struct{})
	var filteredURLs []string
	for _, url := range urls {
		url = strings.TrimSpace(url)
		if url == "" || strings.HasPrefix(url, "#") {
			continue
		}
		if _, exists := uniqueURLs[url]; !exists {
			uniqueURLs[url] = struct{}{}
			filteredURLs = append(filteredURLs, url)
		}
	}

	if len(filteredURLs) == 0 {
		log.Fatal("No valid URLs to download")
	}
	if config.Verbose {
		log.Printf("Found %d unique URLs to download", len(filteredURLs))
	}
	// Setup HTTP client with reasonable defaults; idle connections are kept
	// around so keep-alive connections can be reused across downloads.
	httpClient := &http.Client{
		Timeout: config.Timeout,
		Transport: &http.Transport{
			MaxIdleConnsPerHost: config.Concurrency,
			IdleConnTimeout:     90 * time.Second,
			DisableCompression:  true, // To ensure we get the exact file
		},
	}
	// Set up signal handling for graceful shutdown
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Create a channel to listen for OS signals
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

	// Start a goroutine to handle the signal
	go func() {
		sig := <-sigChan
		log.Printf("Received signal %v, initiating shutdown...", sig)
		cancel()
	}()
	// Process URLs concurrently
	var wg sync.WaitGroup
	urlChan := make(chan string, len(filteredURLs))

	// Start worker goroutines
	for i := 0; i < config.Concurrency; i++ {
		wg.Add(1)
		go func(workerID int) {
			defer wg.Done()
			for url := range urlChan {
				select {
				case <-ctx.Done():
					return // Context cancelled, stop processing
				default:
					if config.Verbose {
						log.Printf("Worker %d processing URL: %s", workerID, url)
					}
					downloadWithRetry(ctx, httpClient, url, config, workerID)
				}
			}
		}(i)
	}

	// Send URLs to workers; a labeled break is needed here because a bare
	// break would only exit the select, not the loop.
sendLoop:
	for _, url := range filteredURLs {
		select {
		case <-ctx.Done():
			break sendLoop
		case urlChan <- url:
			// URL sent to worker
		}
	}
	close(urlChan)

	// Wait for all downloads to complete
	wg.Wait()
	log.Println("All downloads completed or cancelled")
}
func downloadWithRetry(ctx context.Context, client *http.Client, url string, config Config, workerID int) {
	var err error
	for attempt := 0; attempt <= config.RetryCount; attempt++ {
		if attempt > 0 {
			log.Printf("Retry %d/%d for URL: %s", attempt, config.RetryCount, url)
			select {
			case <-ctx.Done():
				return
			case <-time.After(config.RetryDelay):
				// Continue with retry
			}
		}

		err = downloadURL(ctx, client, url, config, workerID)
		if err == nil || errors.Is(err, context.Canceled) {
			// Success, or the whole run was cancelled; errors.Is is used
			// because downloadURL wraps its errors with %w.
			return
		}
		log.Printf("Download error (attempt %d/%d): %v", attempt+1, config.RetryCount+1, err)
	}
	log.Printf("Failed to download after %d attempts: %s - %v", config.RetryCount+1, url, err)
}
func downloadURL(ctx context.Context, client *http.Client, url string, config Config, workerID int) error {
	// Create HTTP request with context for cancellation
	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
	if err != nil {
		return fmt.Errorf("failed to create request: %w", err)
	}

	// Set appropriate headers
	req.Header.Set("User-Agent", "URL-Downloader/1.0")

	// Perform the request
	resp, err := client.Do(req)
	if err != nil {
		return fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("HTTP error: %s", resp.Status)
	}

	// Extract filename from Content-Disposition header or URL.
	// This is a simple pattern; mime.ParseMediaType would handle more of the
	// header grammar, but this covers the common filename="..." case.
	suggestedFilename := ""
	contentDisposition := resp.Header.Get("Content-Disposition")
	if contentDisposition != "" {
		re := regexp.MustCompile(`filename=["']?([^"']+)["']?`)
		matches := re.FindStringSubmatch(contentDisposition)
		if len(matches) > 1 {
			suggestedFilename = matches[1]
		}
	}

	// If no filename from header, extract from URL
	if suggestedFilename == "" {
		urlPath := strings.Split(url, "/")
		if len(urlPath) > 0 {
			suggestedFilename = urlPath[len(urlPath)-1]
			// Remove query parameters if present
			suggestedFilename = strings.Split(suggestedFilename, "?")[0]
		}
	}

	// If still no filename, use a generic one with timestamp
	if suggestedFilename == "" {
		suggestedFilename = fmt.Sprintf("download-%d-%d", workerID, time.Now().Unix())
	}
	// Create temporary file for the download in the output directory, so the
	// final rename below stays on the same filesystem.
	tempFile, err := os.CreateTemp(config.OutputDir, "download-*")
	if err != nil {
		return fmt.Errorf("failed to create temp file: %w", err)
	}
	tempFilePath := tempFile.Name()
	defer func() {
		tempFile.Close()
		// Only remove the temp file if we didn't successfully move it
		if _, err := os.Stat(tempFilePath); err == nil {
			os.Remove(tempFilePath)
		}
	}()

	// Calculate SHA-1 while downloading
	hash := sha1.New()
	writer := io.MultiWriter(tempFile, hash)

	// Download the file
	n, err := io.Copy(writer, resp.Body)
	if err != nil {
		return fmt.Errorf("download failed: %w", err)
	}

	// Get the SHA-1 checksum
	sha1sum := fmt.Sprintf("%x", hash.Sum(nil))

	// Check if we already have this file
	if config.SkipExisting {
		existingPath, exists := findExistingFile(config.MetadataDir, sha1sum)
		if exists {
			if config.Verbose {
				log.Printf("File with SHA-1 %s already exists at %s, skipping", sha1sum, existingPath)
			}
			return nil
		}
	}

	// Create the target filename based on SHA-1
	targetFilename := sha1sum
	if filepath.Ext(suggestedFilename) != "" {
		// Append original extension if available
		targetFilename = fmt.Sprintf("%s%s", sha1sum, filepath.Ext(suggestedFilename))
	}
	targetPath := filepath.Join(config.OutputDir, targetFilename)

	// Close the temp file before moving it
	tempFile.Close()

	// Move the temp file to the target location
	if err := os.Rename(tempFilePath, targetPath); err != nil {
		return fmt.Errorf("failed to move file: %w", err)
	}
	// Parse Last-Modified header; http.ParseTime tries the standard HTTP
	// time formats rather than just RFC 1123.
	var lastModified time.Time
	lastModifiedStr := resp.Header.Get("Last-Modified")
	if lastModifiedStr != "" {
		lastModified, _ = http.ParseTime(lastModifiedStr)
	}

	// Create metadata
	metadata := FileMetadata{
		URL:                url,
		ContentDisposition: contentDisposition,
		SuggestedFilename:  suggestedFilename,
		ActualFilename:     targetFilename,
		SHA1Checksum:       sha1sum,
		ETag:               resp.Header.Get("ETag"),
		LastModified:       lastModified,
		DownloadedAt:       time.Now(),
		ContentType:        resp.Header.Get("Content-Type"),
		ContentLength:      n,
	}

	// Write metadata to file
	metadataPath := filepath.Join(config.MetadataDir, sha1sum+".json")
	metadataJSON, err := json.MarshalIndent(metadata, "", "  ")
	if err != nil {
		return fmt.Errorf("failed to marshal metadata: %w", err)
	}
	if err := os.WriteFile(metadataPath, metadataJSON, 0644); err != nil {
		return fmt.Errorf("failed to write metadata: %w", err)
	}

	if config.Verbose {
		log.Printf("Successfully downloaded %s (%d bytes) to %s", url, n, targetPath)
	}
	return nil
}
// findExistingFile checks if a file with the given SHA-1 checksum already exists
func findExistingFile(metadataDir, sha1sum string) (string, bool) {
	metadataPath := filepath.Join(metadataDir, sha1sum+".json")
	_, err := os.Stat(metadataPath)
	if err != nil {
		return "", false
	}

	// Read the metadata to get the actual file path
	data, err := os.ReadFile(metadataPath)
	if err != nil {
		return "", false
	}

	var metadata FileMetadata
	if err := json.Unmarshal(data, &metadata); err != nil {
		return "", false
	}

	return metadata.ActualFilename, true
}