examples: cache hf model when --model not provided
commit 5372f9bdb0
parent 6d72ed3fff

2 changed files with 8 additions and 3 deletions
@@ -1354,7 +1354,12 @@ void gpt_params_handle_model_default(gpt_params & params) {
             }
             params.hf_file = params.model;
         } else if (params.model.empty()) {
-            params.model = get_cache_directory() + string_split(params.hf_file, '/').back();
+            std::string cache_directory = get_cache_directory();
+            const bool success = create_directory_with_parents(cache_directory);
+            if (!success) {
+                throw std::runtime_error("failed to create cache directory: " + cache_directory);
+            }
+            params.model = cache_directory + string_split(params.hf_file, '/').back();
         }
     } else if (!params.model_url.empty()) {
         if (params.model.empty()) {
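For readers skimming the hunk above: when --model is omitted but --hf-file is set, the default model path becomes the cache directory plus the last path component of the Hugging Face file name, and the cache directory is created up front. Below is a minimal standalone sketch of that fallback; sketch_cache_directory and default_model_path are illustrative stand-ins (not the helpers in common.cpp), and the hard-coded "./cache/" fallback is an assumption made only for the sketch.

// Illustrative sketch only -- not the llama.cpp implementation.
#include <cstdlib>
#include <filesystem>
#include <stdexcept>
#include <string>
#include <system_error>

// Stand-in for get_cache_directory(): honours LLAMA_CACHE, otherwise falls
// back to a placeholder local directory (assumption for this sketch).
static std::string sketch_cache_directory() {
    if (const char * env = std::getenv("LLAMA_CACHE")) {
        return std::string(env) + "/";
    }
    return "./cache/";
}

// Mirrors the new branch in gpt_params_handle_model_default: create the cache
// directory, then derive the model path from the last component of --hf-file.
static std::string default_model_path(const std::string & hf_file) {
    const std::string cache_directory = sketch_cache_directory();
    std::error_code ec;
    std::filesystem::create_directories(cache_directory, ec);
    if (ec) {
        throw std::runtime_error("failed to create cache directory: " + cache_directory);
    }
    const auto pos = hf_file.find_last_of('/');
    const std::string file_name = (pos == std::string::npos) ? hf_file : hf_file.substr(pos + 1);
    return cache_directory + file_name;   // e.g. "<cache>/model-q4_0.gguf"
}

So a hypothetical --hf-file value of some-repo/model-q4_0.gguf would resolve to <cache>/model-q4_0.gguf when -m is absent.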
@@ -2549,12 +2554,10 @@ std::string get_cache_directory() {
         cache_directory += "llama.cpp";
         cache_directory += DIRECTORY_SEPARATOR;
     }
-
     const bool success = create_directory_with_parents(cache_directory);
     if (!success) {
         throw std::runtime_error("failed to create cache directory: " + cache_directory);
     }
-
     return cache_directory;
 }
 
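The hunk above only shows the tail of get_cache_directory(), where "llama.cpp" and a trailing separator are appended before the directory is created. The README text added in this commit says the location comes from the LLAMA_CACHE environment variable or an OS-specific local cache, so the sketch below shows one plausible way the cache root could be chosen before that suffix is appended. The branch layout and environment variables used here are assumptions, not the actual llama.cpp code.

// Illustrative sketch of choosing a per-OS cache root; assumption only.
#include <cstdlib>
#include <string>

static std::string sketch_cache_root() {
    if (const char * env = std::getenv("LLAMA_CACHE")) {
        return std::string(env) + "/";          // explicit override
    }
#if defined(__linux__)
    if (const char * xdg = std::getenv("XDG_CACHE_HOME")) {
        return std::string(xdg) + "/";          // XDG cache base
    }
    if (const char * home = std::getenv("HOME")) {
        return std::string(home) + "/.cache/";
    }
#elif defined(__APPLE__)
    if (const char * home = std::getenv("HOME")) {
        return std::string(home) + "/Library/Caches/";
    }
#elif defined(_WIN32)
    if (const char * appdata = std::getenv("LOCALAPPDATA")) {
        return std::string(appdata) + "\\";
    }
#endif
    return "./";                                 // last-resort fallback
}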
@@ -325,3 +325,5 @@ These options provide extra functionality and customization when running the LLa
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
+
+- `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.
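Taken together with the new -hfr/--hf-file documentation above, a hypothetical invocation (repository and file names invented purely for illustration) might look like:

./main -hfr username/some-model-GGUF -hff some-model.Q4_K_M.gguf -p "Hello"

With -m omitted, the downloaded file would land in the directory named by LLAMA_CACHE (or the OS-specific cache), per the behavior added in this commit.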