Prompt: ``` I'm going to give you the transcript of me brainstorming an application that I want to write. I was thinking of Go, but it could be in C++ also. Please organize my brainstorming so that I can begin building out this application: This application is going to need to read in my list of URLs and use a wait group for the goroutines that go out and fetch each of those URLs. Each fetch will have an HTTP response with a Content-Disposition header carrying the filename intended for that file, and for each of these files (maybe not individually, but as a whole JSON object for the whole download set) record the URL and the associated Content-Disposition name. Each URL will probably be fetched with some kind of a watcher and writer so that it's calculating a SHA-1 sum for that file as it streams. That can't all be done in memory, so it might need to actually write to a temp file while calculating the SHA-1 sum, and if it successfully downloads, then it writes the file out under a name based on that SHA-1 checksum. It also logs a data structure, JSON or whatever, that maps that SHA-1 checksum to the filename it was given, maybe even some information like the ETag and any Last-Modified date from the headers, as well as the Content-Disposition header (so you can see what the file should have been named), and the URL it was fetched from. So there's correlating metadata recording which file points to which URL and such, because I'm pretty sure the file names would end up having collisions based just on the last part of the URL path, and maybe even some of the file names from the Content-Disposition headers might have collisions, so we can't rely on either one of those. I hope that even the SHA-1 hashes don't collide, so it might need to check for the SHA-1-named file's existence before it starts writing, maybe even check other files' metadata for the ETag. I wonder if I should do some kind of a scan first for unique ETags? I'm not sure. I feel like the metadata could all be in memory; it might not need to be a JSON per file, I don't know, but it makes it clean to keep it as a JSON per file. It's just harder to traverse all of those if you're querying, versus having all that in memory. Also, keeping it per file means that if you have two metadata entries referencing the same file, then you can sort that out. Heck, at that point, it might just be able to take the original output from the go spider, which has a very predictable kind of format (each line is a JSON), and output that same JSON structure but including the new key/values, in a new file or to standard out. Okay. As for the concurrency, I think this is just a wait group from the sync library, and obviously some default number of concurrent connections. It would be nice if it would reuse the same HTTP connection via keep-alives; I don't know if that's possible, but that would make it faster and less work for the web server that we're hitting. And then, spinning off all those different goroutines, a context would need to be passed around in case you need to cancel them. It probably also needs a signal handler, in case you need to terminate or kill the whole ordeal, to sit and watch for signals. Yeah, let's see here.
So the setup of this program is going to be: read in the list of URLs, scan for only the unique ones in case there are duplicates, and check for the output metadata file, so you might want to have an output directory. Somehow come up with a checksum of the output there, maybe a checksum of the URL string, I don't know, or of our list of URL strings. ``` Signed-off-by: Vincent Batts <vbatts@hashbangbash.com>

package main

import (
	"context"
	"crypto/sha1"
	"encoding/json"
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"mime"
	"net/http"
	"os"
	"os/signal"
	"path/filepath"
	"strings"
	"sync"
	"syscall"
	"time"
)

// FileMetadata stores information about a downloaded file
type FileMetadata struct {
	URL                string `json:"url"`
	ContentDisposition string `json:"content_disposition,omitempty"`
	SuggestedFilename  string `json:"suggested_filename,omitempty"`
	ActualFilename     string `json:"actual_filename"`
	SHA1Checksum       string `json:"sha1_checksum"`
	ETag               string `json:"etag,omitempty"`
	// omitempty has no effect on struct types like time.Time; a zero
	// value here means the server sent no Last-Modified header
	LastModified  time.Time `json:"last_modified"`
	DownloadedAt  time.Time `json:"downloaded_at"`
	ContentType   string    `json:"content_type,omitempty"`
	ContentLength int64     `json:"content_length,omitempty"`
}
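
// For reference, a metadata document as serialized to <sha1>.json looks
// roughly like this (illustrative values only, not real output):
//
//	{
//	  "url": "https://example.com/pkg/archive.tar.gz",
//	  "content_disposition": "attachment; filename=\"archive.tar.gz\"",
//	  "suggested_filename": "archive.tar.gz",
//	  "actual_filename": "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12.tar.gz",
//	  "sha1_checksum": "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12",
//	  "etag": "\"abc123\"",
//	  "last_modified": "2024-01-15T10:30:00Z",
//	  "downloaded_at": "2024-06-01T12:00:00Z",
//	  "content_type": "application/gzip",
//	  "content_length": 1048576
//	}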

// Config holds application configuration
type Config struct {
	Concurrency  int
	OutputDir    string
	MetadataDir  string
	Timeout      time.Duration
	RetryCount   int
	RetryDelay   time.Duration
	SkipExisting bool
	Verbose      bool
}

func main() {
	// Parse command line flags
	concurrency := flag.Int("concurrency", 10, "Number of concurrent downloads")
	outputDir := flag.String("output", "downloads", "Directory to store downloaded files")
	metadataDir := flag.String("metadata", "metadata", "Directory to store metadata files")
	timeout := flag.Duration("timeout", 5*time.Minute, "Download timeout")
	retryCount := flag.Int("retries", 3, "Number of retries for failed downloads")
	retryDelay := flag.Duration("retry-delay", 5*time.Second, "Delay between retries")
	skipExisting := flag.Bool("skip-existing", true, "Skip download if file with same checksum exists")
	verbose := flag.Bool("verbose", false, "Enable verbose logging")
	flag.Parse()

	// Create configuration
	config := Config{
		Concurrency:  *concurrency,
		OutputDir:    *outputDir,
		MetadataDir:  *metadataDir,
		Timeout:      *timeout,
		RetryCount:   *retryCount,
		RetryDelay:   *retryDelay,
		SkipExisting: *skipExisting,
		Verbose:      *verbose,
	}

	// Ensure output directories exist
	for _, dir := range []string{config.OutputDir, config.MetadataDir} {
		if err := os.MkdirAll(dir, 0755); err != nil {
			log.Fatalf("Failed to create directory %s: %v", dir, err)
		}
	}

	// Read URLs from a file argument, or from stdin when none is given
	var urls []string
	if flag.NArg() > 0 {
		// Read from file
		content, err := os.ReadFile(flag.Arg(0))
		if err != nil {
			log.Fatalf("Failed to read URL file: %v", err)
		}
		urls = strings.Split(string(content), "\n")
	} else {
		// Read from stdin
		content, err := io.ReadAll(os.Stdin)
		if err != nil {
			log.Fatalf("Failed to read URLs from stdin: %v", err)
		}
		urls = strings.Split(string(content), "\n")
	}
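
	// The expected input is one URL per line; blank lines and lines
	// beginning with "#" are skipped by the filter below, e.g.:
	//
	//	# mirror list
	//	https://example.com/a.tar.gz
	//	https://example.com/b.tar.gz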

	// Filter empty lines and deduplicate URLs
	uniqueURLs := make(map[string]struct{})
	var filteredURLs []string
	for _, url := range urls {
		url = strings.TrimSpace(url)
		if url == "" || strings.HasPrefix(url, "#") {
			continue
		}
		if _, exists := uniqueURLs[url]; !exists {
			uniqueURLs[url] = struct{}{}
			filteredURLs = append(filteredURLs, url)
		}
	}

	if len(filteredURLs) == 0 {
		log.Fatal("No valid URLs to download")
	}

	if config.Verbose {
		log.Printf("Found %d unique URLs to download", len(filteredURLs))
	}
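
	// Connection reuse: Go's http.Transport performs HTTP keep-alive by
	// default, so workers hitting the same host share connections (the
	// brainstorm asked whether this was possible); the transport below
	// just sizes the idle pool to match the worker count.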

	// Setup HTTP client with reasonable defaults
	httpClient := &http.Client{
		Timeout: config.Timeout,
		Transport: &http.Transport{
			MaxIdleConnsPerHost: config.Concurrency,
			IdleConnTimeout:     90 * time.Second,
			DisableCompression:  true, // To ensure we get the exact file
		},
	}

	// Set up signal handling for graceful shutdown
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Create a channel to listen for OS signals
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

	// Start a goroutine to handle the signal
	go func() {
		sig := <-sigChan
		log.Printf("Received signal %v, initiating shutdown...", sig)
		cancel()
	}()
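
	// (Since Go 1.16, signal.NotifyContext can collapse the channel and
	// goroutine above into a single call; the explicit form is kept here
	// for clarity.)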

	// Process URLs concurrently
	var wg sync.WaitGroup
	urlChan := make(chan string, len(filteredURLs))

	// Start worker goroutines
	for i := 0; i < config.Concurrency; i++ {
		wg.Add(1)
		go func(workerID int) {
			defer wg.Done()
			for url := range urlChan {
				select {
				case <-ctx.Done():
					return // Context cancelled, stop processing
				default:
					if config.Verbose {
						log.Printf("Worker %d processing URL: %s", workerID, url)
					}
					downloadWithRetry(ctx, httpClient, url, config, workerID)
				}
			}
		}(i)
	}
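
	// Because urlChan is buffered to hold every URL, the send loop below
	// never blocks on slow workers; the ctx.Done case only matters for
	// early cancellation.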

	// Send URLs to workers. The loop label matters: a bare break inside
	// select would only exit the select statement, not this for loop.
sendLoop:
	for _, url := range filteredURLs {
		select {
		case <-ctx.Done():
			break sendLoop
		case urlChan <- url:
			// URL sent to worker
		}
	}
	close(urlChan)

	// Wait for all downloads to complete
	wg.Wait()
	log.Println("All downloads completed or cancelled")
}
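
// Example invocations (binary name assumed), matching the flags defined in
// main above:
//
//	./url-downloader -concurrency 20 -verbose urls.txt
//	cat urls.txt | ./url-downloader -output downloads -metadata metadata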

func downloadWithRetry(ctx context.Context, client *http.Client, url string, config Config, workerID int) {
	var err error
	for attempt := 0; attempt <= config.RetryCount; attempt++ {
		if attempt > 0 {
			log.Printf("Retry %d/%d for URL: %s", attempt, config.RetryCount, url)
			select {
			case <-ctx.Done():
				return
			case <-time.After(config.RetryDelay):
				// Continue with retry
			}
		}

		err = downloadURL(ctx, client, url, config, workerID)
		// errors.Is is needed here: the http client wraps context.Canceled
		// in a *url.Error, so a plain == comparison would miss it
		if err == nil || errors.Is(err, context.Canceled) {
			return
		}

		log.Printf("Download error (attempt %d/%d): %v", attempt+1, config.RetryCount+1, err)
	}
	log.Printf("Failed to download after %d attempts: %s - %v", config.RetryCount+1, url, err)
}

func downloadURL(ctx context.Context, client *http.Client, url string, config Config, workerID int) error {
	// Create HTTP request with context for cancellation
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return fmt.Errorf("failed to create request: %w", err)
	}

	// Set appropriate headers
	req.Header.Set("User-Agent", "URL-Downloader/1.0")

	// Perform the request
	resp, err := client.Do(req)
	if err != nil {
		return fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("HTTP error: %s", resp.Status)
	}

	// Extract filename from the Content-Disposition header.
	// mime.ParseMediaType handles quoting and RFC 2231/5987 encoded
	// parameters, which an ad-hoc regexp would get wrong.
	suggestedFilename := ""
	contentDisposition := resp.Header.Get("Content-Disposition")
	if contentDisposition != "" {
		if _, params, err := mime.ParseMediaType(contentDisposition); err == nil {
			suggestedFilename = params["filename"]
		}
	}

	// If no filename from header, extract from URL
	if suggestedFilename == "" {
		urlPath := strings.Split(url, "/")
		if len(urlPath) > 0 {
			suggestedFilename = urlPath[len(urlPath)-1]
			// Remove query parameters if present
			suggestedFilename = strings.Split(suggestedFilename, "?")[0]
		}
	}

	// If still no filename, use a generic one with timestamp
	if suggestedFilename == "" {
		suggestedFilename = fmt.Sprintf("download-%d-%d", workerID, time.Now().Unix())
	}

	// Create a temporary file for the download. It lives in the output
	// directory so the final os.Rename below stays on one filesystem;
	// renaming out of the system temp dir can fail with EXDEV when the
	// two paths are on different mounts.
	tempFile, err := os.CreateTemp(config.OutputDir, ".download-*")
	if err != nil {
		return fmt.Errorf("failed to create temp file: %w", err)
	}
	tempFilePath := tempFile.Name()
	defer func() {
		tempFile.Close()
		// Only remove the temp file if we didn't successfully move it
		if _, err := os.Stat(tempFilePath); err == nil {
			os.Remove(tempFilePath)
		}
	}()

	// Calculate SHA-1 while downloading
	hash := sha1.New()
	writer := io.MultiWriter(tempFile, hash)

	// Download the file
	n, err := io.Copy(writer, resp.Body)
	if err != nil {
		return fmt.Errorf("download failed: %w", err)
	}

	// Get the SHA-1 checksum
	sha1sum := fmt.Sprintf("%x", hash.Sum(nil))
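
	// SHA-1 serves here purely as a content identifier for dedup, per the
	// brainstorm; it is not collision-resistant against an adversary, so
	// swap in crypto/sha256 if that ever matters.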

	// Check if we already have this file
	if config.SkipExisting {
		existingPath, exists := findExistingFile(config.MetadataDir, sha1sum)
		if exists {
			if config.Verbose {
				log.Printf("File with SHA-1 %s already exists at %s, skipping", sha1sum, existingPath)
			}
			return nil
		}
	}
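
	// Note that this dedup check can only run after the body has been
	// fully read, since the checksum comes from the stream itself. The
	// brainstorm's idea of a pre-scan for unique ETags could skip the
	// transfer entirely; see the sketch after findExistingFile below.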

	// Create the target filename based on SHA-1
	targetFilename := sha1sum
	if filepath.Ext(suggestedFilename) != "" {
		// Append original extension if available
		targetFilename = fmt.Sprintf("%s%s", sha1sum, filepath.Ext(suggestedFilename))
	}
	targetPath := filepath.Join(config.OutputDir, targetFilename)

	// Close the temp file before moving it
	tempFile.Close()

	// Move the temp file to the target location
	if err := os.Rename(tempFilePath, targetPath); err != nil {
		return fmt.Errorf("failed to move file: %w", err)
	}

	// Parse the Last-Modified header; http.ParseTime tries all three
	// date formats permitted by HTTP, not just RFC 1123
	var lastModified time.Time
	if lastModifiedStr := resp.Header.Get("Last-Modified"); lastModifiedStr != "" {
		lastModified, _ = http.ParseTime(lastModifiedStr)
	}

	// Create metadata
	metadata := FileMetadata{
		URL:                url,
		ContentDisposition: contentDisposition,
		SuggestedFilename:  suggestedFilename,
		ActualFilename:     targetFilename,
		SHA1Checksum:       sha1sum,
		ETag:               resp.Header.Get("ETag"),
		LastModified:       lastModified,
		DownloadedAt:       time.Now(),
		ContentType:        resp.Header.Get("Content-Type"),
		ContentLength:      n,
	}

	// Write metadata to file
	metadataPath := filepath.Join(config.MetadataDir, sha1sum+".json")
	metadataJSON, err := json.MarshalIndent(metadata, "", "  ")
	if err != nil {
		return fmt.Errorf("failed to marshal metadata: %w", err)
	}

	if err := os.WriteFile(metadataPath, metadataJSON, 0644); err != nil {
		return fmt.Errorf("failed to write metadata: %w", err)
	}

	if config.Verbose {
		log.Printf("Successfully downloaded %s (%d bytes) to %s", url, n, targetPath)
	}

	return nil
}

// findExistingFile checks if a file with the given SHA-1 checksum already exists
func findExistingFile(metadataDir, sha1sum string) (string, bool) {
	metadataPath := filepath.Join(metadataDir, sha1sum+".json")
	if _, err := os.Stat(metadataPath); err != nil {
		return "", false
	}

	// Read the metadata to get the actual file path
	data, err := os.ReadFile(metadataPath)
	if err != nil {
		return "", false
	}

	var metadata FileMetadata
	if err := json.Unmarshal(data, &metadata); err != nil {
		return "", false
	}

	return metadata.ActualFilename, true
}
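
// The brainstorm above floats the idea of scanning for known ETags before
// downloading. This is a minimal sketch of that, not wired into main: it
// issues a HEAD request and compares the ETag against recorded metadata.
// It assumes the server answers HEAD with the same ETag it would send on
// GET, which is common but not guaranteed.
func etagAlreadySeen(ctx context.Context, client *http.Client, url, metadataDir string) (bool, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodHead, url, nil)
	if err != nil {
		return false, err
	}
	resp, err := client.Do(req)
	if err != nil {
		return false, err
	}
	resp.Body.Close()

	etag := resp.Header.Get("ETag")
	if etag == "" {
		return false, nil // server gives us nothing to compare against
	}

	// Linear scan over the per-file JSON documents; fine for modest sets,
	// though an in-memory index would serve large ones better
	entries, err := os.ReadDir(metadataDir)
	if err != nil {
		return false, err
	}
	for _, entry := range entries {
		data, err := os.ReadFile(filepath.Join(metadataDir, entry.Name()))
		if err != nil {
			continue
		}
		var m FileMetadata
		if err := json.Unmarshal(data, &m); err == nil && m.ETag == etag {
			return true, nil
		}
	}
	return false, nil
}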