curl: check url of previous download (.json metadata w/ url, etag & lastModified)

This commit is contained in:
Olivier Chafik 2024-04-26 14:12:15 +01:00
parent e55dfde3b0
commit 0664e9b321

View file

@ -1888,11 +1888,16 @@ void llama_batch_add(
#ifdef LLAMA_USE_CURL #ifdef LLAMA_USE_CURL
static bool llama_download_file(CURL * curl, const char * url, const char * path) { static bool starts_with(const std::string & str, const std::string & prefix) {
// Stand-in until C++20's std::string::starts_with is available: rfind(prefix, 0) can only match at index 0.
return str.rfind(prefix, 0) == 0;
}
static bool llama_download_file(CURL * curl, const std::string & url, const std::string & path) {
bool force_download = false; bool force_download = false;
// Set the URL, allow to follow http redirection // Set the URL, allow to follow http redirection
curl_easy_setopt(curl, CURLOPT_URL, url); curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
#if defined(_WIN32) #if defined(_WIN32)
@ -1903,44 +1908,48 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
// Check if the file already exists locally // Check if the file already exists locally
struct stat model_file_info; struct stat model_file_info;
auto file_exists = (stat(path, &model_file_info) == 0); auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
// If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files // If the file exists, check for ${path_model}.json file
char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; // Alternatively check for legacy ${path_model}.etag & ${path_model}.lastModified files
char etag_path[PATH_MAX] = {0}; std::string metadata_path = path + ".json";
snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; std::string etag;
char last_modified_path[PATH_MAX] = {0}; std::string last_modified;
snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path); nlohmann::json metadata;
if (file_exists) { if (file_exists) {
auto * f_etag = fopen(etag_path, "r"); // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
if (f_etag) { std::ifstream metadata_in(metadata_path);
if (!fgets(etag, sizeof(etag), f_etag)) { if (metadata_in.good()) {
fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path); try {
} else { metadata_in >> metadata;
fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag); fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
if (metadata.contains("url") && metadata["url"].is_string()) {
auto previous_url = metadata["url"].get<std::string>();
if (previous_url != url) {
fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
return false;
} }
fclose(f_etag); }
if (metadata.contains("etag") && metadata["etag"].is_string()) {
etag = metadata["etag"];
}
if (metadata.contains("lastModified") && metadata["lastModified"].is_string()) {
last_modified = metadata["lastModified"];
}
} catch (const nlohmann::json::exception & e) {
fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
return false;
} }
auto * f_last_modified = fopen(last_modified_path, "r");
if (f_last_modified) {
if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
} else {
fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
last_modified);
}
fclose(f_last_modified);
} }
} }
// Send a HEAD request to retrieve the etag and last-modified headers // Send a HEAD request to retrieve the etag and last-modified headers
struct llama_load_model_from_url_headers { struct llama_load_model_from_url_headers {
char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; std::string etag;
char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; std::string last_modified;
}; };
llama_load_model_from_url_headers headers; llama_load_model_from_url_headers headers;
{ {
@ -1948,20 +1957,16 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
// Convert header field name to lowercase std::string header(buffer, n_items);
for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) { std::smatch match;
buffer[i] = tolower(buffer[i]); if (std::regex_match(header, match, std::regex("([^:]+): (.*)\r\n", std::regex_constants::multiline))) {
const std::string & key = match[1];
const std::string & value = match[2];
if (std::regex_match(key, match, std::regex("ETag", std::regex_constants::icase))) {
headers->etag = value;
} else if (std::regex_match(key, match, std::regex("Last-Modified", std::regex_constants::icase))) {
headers->last_modified = value;
} }
const char * etag_prefix = "etag: ";
if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
}
const char * last_modified_prefix = "last-modified: ";
if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) {
strncpy(headers->last_modified, buffer + strlen(last_modified_prefix),
n_items - strlen(last_modified_prefix) - 2); // Remove CRLF
} }
return n_items; return n_items;
}; };
@ -1988,28 +1993,29 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
} }
} }
// If the ETag or the Last-Modified headers are different: trigger a new download bool should_download = !file_exists || force_download;
bool should_download = !file_exists if (!should_download && !etag.empty() && !last_modified.empty()) {
|| force_download if (etag != headers.etag || last_modified != headers.last_modified) {
|| (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0) fprintf(stderr, "%s: ETag or Last-Modified headers are different: triggering a new download\n", __func__);
|| (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0); should_download = true;
}
}
if (should_download) { if (should_download) {
char path_temporary[PATH_MAX] = {0}; std::string path_temporary = path + ".downloadInProgress";
snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
if (file_exists) { if (file_exists) {
fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path); fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path) != 0) { if (remove(path.c_str()) != 0) {
curl_easy_cleanup(curl); curl_easy_cleanup(curl);
fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path); fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str());
return false; return false;
} }
} }
// Set the output file // Set the output file
auto * outfile = fopen(path_temporary, "wb"); auto * outfile = fopen(path_temporary.c_str(), "wb");
if (!outfile) { if (!outfile) {
curl_easy_cleanup(curl); curl_easy_cleanup(curl);
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path); fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
return false; return false;
} }
@ -2041,7 +2047,7 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
// start the download // start the download
fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified); llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
auto res = curl_easy_perform(curl); auto res = curl_easy_perform(curl);
if (res != CURLE_OK) { if (res != CURLE_OK) {
fclose(outfile); fclose(outfile);
@ -2062,30 +2068,18 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
// Clean up // Clean up
fclose(outfile); fclose(outfile);
// Write the new ETag to the .etag file // Write the updated JSON metadata file.
if (strlen(headers.etag) > 0) { metadata.update({
auto * etag_file = fopen(etag_path, "w"); {"url", url},
if (etag_file) { {"etag", headers.etag},
fputs(headers.etag, etag_file); {"lastModified", headers.last_modified}
fclose(etag_file); });
fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag); std::ofstream(metadata_path) << metadata.dump(4);
} fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
}
// Write the new lastModified to the .etag file if (rename(path_temporary.c_str(), path.c_str()) != 0) {
if (strlen(headers.last_modified) > 0) {
auto * last_modified_file = fopen(last_modified_path, "w");
if (last_modified_file) {
fputs(headers.last_modified, last_modified_file);
fclose(last_modified_file);
fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
headers.last_modified);
}
}
if (rename(path_temporary, path) != 0) {
curl_easy_cleanup(curl); curl_easy_cleanup(curl);
fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path); fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return false; return false;
} }
} }