Merge branch 'master' into add_stop_token

2023-05-11 10:26:00 -07:00 · 2023-05-11 10:26:00 -07:00 · 099a07fb87
commit 099a07fb87
parent 331343ab0e 2510c1831f
26 changed files with 3100 additions and 928 deletions
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -139,6 +139,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.model = argv[i];
+        } else if (arg == "--lora") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_adapter = argv[i];
+            params.use_mmap = false;
+        } else if (arg == "--lora-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_base = argv[i];
        } else if (arg == "-i" || arg == "--interactive") {
            params.interactive = true;
        } else if (arg == "--embedding") {
@ -243,6 +256,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    }
    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
+    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "\n");
--- a/examples/common.h
+++ b/examples/common.h
@ -31,11 +31,12 @@ struct gpt_params {

    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
    std::string prompt = "";
-    std::string input_prefix = ""; // string to prefix user inputs with
-
-
+    std::string input_prefix = "";       // string to prefix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

+    std::string lora_adapter = "";  // lora adapter path
+    std::string lora_base = "";     // base model path for the lora adapter
+
    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -114,6 +114,17 @@ int main(int argc, char ** argv) {
        }
    }

+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(ctx,
+                                             params.lora_adapter.c_str(),
+                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return 1;
+        }
+    }
+
    // print system information
    {
        fprintf(stderr, "\n");
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -134,6 +134,17 @@ int main(int argc, char ** argv) {
        }
    }

+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(ctx,
+                                             params.lora_adapter.c_str(),
+                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return 1;
+        }
+    }
+
    // print system information
    {
        fprintf(stderr, "\n");
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@ -15,6 +15,8 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include <thread>
+#include <mutex>

 struct quantize_stats_params {
    std::string model = "models/7B/ggml-model-f16.bin";
@ -27,7 +29,6 @@ struct quantize_stats_params {
    std::vector<enum ggml_type> include_types;
 };

-const int64_t SCRATCH_ELEMENTS = 32*32;
 const size_t HISTOGRAM_BUCKETS = 150;
 const double HISTOGRAM_RANGE = 0.03;

@ -90,6 +91,13 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
    stats.num_samples += nelements;
 }

+// Fold the statistics gathered in `from` into the accumulator `into`.
+// Sample count, total error and every histogram bucket are summed; the
+// maximum error becomes the larger of the two maxima.
+void combine_error_stats(error_stats & into, const error_stats & from) {
+    for (size_t bucket = 0; bucket < HISTOGRAM_BUCKETS; ++bucket) {
+        into.error_histogram[bucket] += from.error_histogram[bucket];
+    }
+    // keep the current maximum unless `from` recorded a strictly larger error
+    into.max_error = (from.max_error > into.max_error) ? from.max_error : into.max_error;
+    into.total_error += from.total_error;
+    into.num_samples += from.num_samples;
+}
+
 double find_quantile(const error_stats & stats, double quantile) {
    double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);

@ -130,6 +138,36 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }

+// Run one quantize -> dequantize round trip on `chunk_size` elements of `layer`,
+// starting at element `offset`, and accumulate the resulting error into `stats`.
+//
+// For F16 layers the elements are first converted one by one into `input_scratch`.
+// For every other layer type the data is read in place through ggml_get_data_f32()
+// and the caller's `input_scratch` buffer is not used.
+// `quantized_scratch` receives the quantized chunk and `output_scratch` the
+// dequantized values; the buffers are sized by the caller and not checked here.
+// `use_reference` selects quantize_row_q_reference instead of quantize_row_q.
+void test_roundtrip_on_chunk(
+        const ggml_tensor * layer,
+        int64_t offset,
+        int64_t chunk_size,
+        const quantize_fns_t & qfns,
+        bool use_reference,
+        float * input_scratch,
+        char * quantized_scratch,
+        float * output_scratch,
+        error_stats & stats) {
+
+    if (layer->type == GGML_TYPE_F16) {
+        // 64-bit index: chunk_size is int64_t and may cover a whole layer,
+        // so the counter must have the same width as the bound
+        for (int64_t i = 0; i < chunk_size; i++) {
+            input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+        }
+    } else {
+        // no conversion needed: point directly at the layer's float data
+        input_scratch = ggml_get_data_f32(layer) + offset;
+    }
+
+    if (use_reference) {
+        qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+    } else {
+        qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+    }
+    qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+
+    // compare the original values against the round-tripped ones
+    update_error_stats(chunk_size, input_scratch, output_scratch, stats);
+}
+
+
 // Run quantization function for a single layer and update error stats
 void test_roundtrip_on_layer(
        std::string & name,
@ -137,40 +175,61 @@ void test_roundtrip_on_layer(
        const quantize_fns_t & qfns,
        bool use_reference,
        const ggml_tensor * layer,
-        float * input_scratch,
-        char *quantized_scratch,
-        float * output_scratch,
-        error_stats & total_error) {
+        std::vector<float> & input_scratch,
+        std::vector<char> & quantized_scratch,
+        std::vector<float> & output_scratch,
+        error_stats & total_error,
+        int max_thread = 0) {

    assert(tensor_is_contiguous(layer));
    error_stats layer_error {};
-    int64_t nelements = ggml_nelements(layer);
+    uint64_t nelements = ggml_nelements(layer);

-    for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
-        int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);
-
-        if (layer->type == GGML_TYPE_F16) {
-            for (int i = 0; i < chunk_size; i++) {
-                input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
-            }
-        } else {
-            input_scratch = ggml_get_data_f32(layer) + offset;
-        }
-
-        if (use_reference) {
-            qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
-        } else {
-            qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
-        }
-        qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
-
-        update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
-        if (print_layer_stats) {
-            update_error_stats(chunk_size, input_scratch, output_scratch, layer_error);
-        }
+    float* input_scratch_ptr = nullptr;
+    if (layer->type == GGML_TYPE_F16) {
+        if (input_scratch.size() < nelements) input_scratch.resize(nelements);
+        input_scratch_ptr = input_scratch.data();
    }
+    if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
+    if (output_scratch.size() < nelements) output_scratch.resize(nelements);
+
+    if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
+    int chunk_size = 32*512;
+    int num_chunks = (nelements + chunk_size - 1)/chunk_size;
+
+    if (num_chunks < 2 || max_thread < 2) {
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+                output_scratch.data(), print_layer_stats ? layer_error : total_error);
+    } else {
+        auto & stats = print_layer_stats ? layer_error : total_error;
+        std::mutex mutex;
+        uint64_t counter = 0;
+        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+             &quantized_scratch, &output_scratch, chunk_size] () {
+            error_stats local_stats {};
+            while (true) {
+                std::unique_lock<std::mutex> lock(mutex);
+                uint64_t offset = counter; counter += chunk_size;
+                if (offset >= nelements) {
+                    combine_error_stats(stats, local_stats);
+                    break;
+                }
+                lock.unlock();
+                uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                        quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
+            }
+        };
+        int nthread = std::min(num_chunks, max_thread);
+        std::vector<std::thread> workers(nthread-1);
+        for (auto& w : workers) w = std::thread(compute);
+        compute();
+        for (auto& w : workers) w.join();
+    }
+
    if (print_layer_stats) {
        print_error_stats(name, layer_error, false);
+        combine_error_stats(total_error, layer_error);
    }
 }

@ -181,6 +240,7 @@ int main(int argc, char ** argv) {

    // read command line

+    int max_thread = 0;
    bool invalid_param = false;
    std::string arg;
    for (int i = 1; i < argc; i++) {
@ -221,7 +281,7 @@ int main(int argc, char ** argv) {
                break;
            }
            int j;
-            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) i)) != 0; j++) {
+            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
                // find match
            }
            if (j < GGML_TYPE_COUNT) {
@ -230,6 +290,12 @@ int main(int argc, char ** argv) {
                fprintf(stderr, "error: %s not in list of types\n", argv[i]);
                invalid_param = true;
            }
+        } else if (arg == "-n" || arg == "--num-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            max_thread = atoi(argv[i]);
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            quantize_stats_print_usage(argc, argv);
@ -295,9 +361,9 @@ int main(int argc, char ** argv) {
    }
    printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
    // allocate scratch space
-    std::vector<float> input_scratch(SCRATCH_ELEMENTS);
-    std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
-    std::vector<float> output_scratch(SCRATCH_ELEMENTS);
+    std::vector<float> input_scratch;
+    std::vector<char> quantized_scratch;
+    std::vector<float> output_scratch;

    // loop throught quantization types
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
@ -328,10 +394,11 @@ int main(int argc, char ** argv) {
                        qfns,
                        params.reference,
                        kv_tensor.second,
-                        input_scratch.data(),
-                        quantized_scratch.data(),
-                        output_scratch.data(),
-                        global_stats
+                        input_scratch,
+                        quantized_scratch,
+                        output_scratch,
+                        global_stats,
+                        max_thread
                );
            }

--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -10,10 +10,12 @@
 int main(int argc, char ** argv) {
    ggml_time_init();

-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
+    if (argc < 4) {
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
        fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
        fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
+        fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
+        fprintf(stderr, "  type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
        return 1;
    }

@ -28,6 +30,7 @@ int main(int argc, char ** argv) {
    const std::string fname_out = argv[2];

    const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
+    int nthread = argc > 4 ? atoi(argv[4]) : 0;

    const int64_t t_main_start_us = ggml_time_us();

@ -37,7 +40,7 @@ int main(int argc, char ** argv) {
    {
        const int64_t t_start_us = ggml_time_us();

-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }