Merge branch 'master' into gg/flash-attn

2024-01-31 18:49:43 +02:00 · 2024-01-31 18:49:43 +02:00 · 2ddc9bbef1
commit 2ddc9bbef1
parent 3d03bcb7af d3bac7d584
30 changed files with 1758 additions and 1597 deletions
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@ -88,7 +88,7 @@ int main(int argc, char ** argv) {

    llama_model_params model_params = llama_model_default_params();

-    const std::vector<float> t_split (LLAMA_MAX_DEVICES, 0.0f);
+    const std::vector<float> t_split(llama_max_devices(), 0.0f);

    model_params.n_gpu_layers = n_gpu_layers;
    model_params.tensor_split = t_split.data();
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -160,7 +160,7 @@ struct cmd_params {
    std::vector<int> main_gpu;
    std::vector<bool> no_kv_offload;
    std::vector<bool> mul_mat_q;
-    std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
+    std::vector<std::vector<float>> tensor_split;
    int reps;
    bool verbose;
    output_formats output_format;
@ -179,7 +179,7 @@ static const cmd_params cmd_params_defaults = {
    /* main_gpu      */ {0},
    /* no_kv_offload */ {false},
    /* mul_mat_q     */ {true},
-    /* tensor_split  */ {{}},
+    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
    /* reps          */ 5,
    /* verbose       */ false,
    /* output_format */ MARKDOWN
@ -380,10 +380,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                const std::regex regex{R"([;/]+)"};
                std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
                std::vector<std::string> split_arg{it, {}};
-                GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+                GGML_ASSERT(split_arg.size() <= llama_max_devices());

-                std::array<float, LLAMA_MAX_DEVICES> tensor_split;
-                for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                std::vector<float> tensor_split(llama_max_devices());
+                for (size_t i = 0; i < llama_max_devices(); ++i) {
                    if (i < split_arg.size()) {
                        tensor_split[i] = std::stof(split_arg[i]);
                    } else {
@ -459,7 +459,7 @@ struct cmd_params_instance {
    int main_gpu;
    bool no_kv_offload;
    bool mul_mat_q;
-    std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+    std::vector<float> tensor_split;

    llama_model_params to_llama_mparams() const {
        llama_model_params mparams = llama_model_default_params();
@ -563,6 +563,7 @@ struct test {
    static const bool cuda;
    static const bool opencl;
    static const bool vulkan;
+    static const bool kompute;
    static const bool metal;
    static const bool gpu_blas;
    static const bool blas;
@ -581,7 +582,7 @@ struct test {
    int main_gpu;
    bool no_kv_offload;
    bool mul_mat_q;
-    std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+    std::vector<float> tensor_split;
    int n_prompt;
    int n_gen;
    std::string test_time;
@ -647,6 +648,9 @@ struct test {
        if (vulkan) {
            return "Vulkan";
        }
+        if (kompute) {
+            return "Kompute";
+        }
        if (metal) {
            return "Metal";
        }
@ -662,7 +666,7 @@ struct test {
    static const std::vector<std::string> & get_fields() {
        static const std::vector<std::string> fields = {
            "build_commit", "build_number",
-            "cuda", "opencl", "vulkan", "metal", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_threads", "type_k", "type_v",
@ -686,8 +690,9 @@ struct test {
            field == "avg_ns" || field == "stddev_ns") {
            return INT;
        }
-        if (field == "cuda" || field == "opencl"  || field == "vulkan"|| field == "metal" || field == "gpu_blas" || field == "blas" ||
-            field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
+        if (field == "cuda" || field == "opencl"  || field == "vulkan" || field == "kompute" || field == "metal" ||
+            field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" ||
+            field == "mul_mat_q") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@ -699,7 +704,7 @@ struct test {
    std::vector<std::string> get_values() const {
        std::string tensor_split_str;
        int max_nonzero = 0;
-        for (int i = 0; i < LLAMA_MAX_DEVICES; i++) {
+        for (size_t i = 0; i < llama_max_devices(); i++) {
            if (tensor_split[i] > 0) {
                max_nonzero = i;
            }
@ -714,7 +719,8 @@ struct test {
        }
        std::vector<std::string> values = {
            build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
+            std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
@ -743,6 +749,7 @@ const int         test::build_number = LLAMA_BUILD_NUMBER;
 const bool        test::cuda         = !!ggml_cpu_has_cublas();
 const bool        test::opencl       = !!ggml_cpu_has_clblast();
 const bool        test::vulkan       = !!ggml_cpu_has_vulkan();
+const bool        test::kompute      = !!ggml_cpu_has_kompute();
 const bool        test::metal        = !!ggml_cpu_has_metal();
 const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
 const bool        test::blas         = !!ggml_cpu_has_blas();
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@ -111,17 +111,71 @@ llama_print_timings:        eval time =    1279.03 ms /    18 runs   (   71.06 m
 llama_print_timings:       total time =   34570.79 ms
 ```

+## Orin compile and run
+### compile
+```sh
+make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
+```
+
+### run on Orin
+### case 1
+**input**
+```sh
+./llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    --image /data/local/tmp/demo.jpeg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:" \
+    --n-gpu-layers 999
+```
+**output**
+```sh
+
+encode_image_with_clip: image encoded in   296.62 ms by CLIP (    2.06 ms per image patch)
+
+ Susan Wise Bauer
+
+llama_print_timings:        load time =    1067.64 ms
+llama_print_timings:      sample time =       1.53 ms /     6 runs   (    0.25 ms per token,  3934.43 tokens per second)
+llama_print_timings: prompt eval time =     306.84 ms /   246 tokens (    1.25 ms per token,   801.72 tokens per second)
+llama_print_timings:        eval time =      91.50 ms /     6 runs   (   15.25 ms per token,    65.58 tokens per second)
+llama_print_timings:       total time =    1352.63 ms /   252 tokens
+```
+
+### case 2
+**input**
+```sh
+./llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
+    --n-gpu-layers 999
+
+```
+**output**
+```sh
+encode_image_with_clip: image encoded in   302.15 ms by CLIP (    2.10 ms per image patch)
+
+ The image features a cat lying in the grass.
+
+llama_print_timings:        load time =    1057.07 ms
+llama_print_timings:      sample time =       3.27 ms /    11 runs   (    0.30 ms per token,  3360.83 tokens per second)
+llama_print_timings: prompt eval time =     213.60 ms /   232 tokens (    0.92 ms per token,  1086.14 tokens per second)
+llama_print_timings:        eval time =     166.65 ms /    11 runs   (   15.15 ms per token,    66.01 tokens per second)
+llama_print_timings:       total time =    1365.47 ms /   243 tokens
+```
+
 ## Minor shortcomings
 The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.

 ## TODO

- [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
+- [x] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
 - [ ] Optimize LDP projector performance

      - Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
      - Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
- [ ] run MobileVLM on `Jetson Orin`
+- [x] run MobileVLM on `Jetson Orin`
 - [ ] Support more model variants, such as `MobileVLM-3B`.


--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1789,28 +1789,28 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("  -b N, --batch-size N      batch size for prompt processing (default: %d)\n", params.n_batch);
    printf("  --memory-f32              use f32 instead of f16 for memory key+value (default: disabled)\n");
    printf("                            not recommended: doubles context memory required and no measurable increase in quality\n");
-    if (llama_mlock_supported())
+    if (llama_supports_mlock())
    {
        printf("  --mlock                   force system to keep model in RAM rather than swapping or compressing\n");
    }
-    if (llama_mmap_supported())
+    if (llama_supports_mmap())
    {
        printf("  --no-mmap                 do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
    printf("  --numa                    attempt optimizations that help on some NUMA systems\n");
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    printf("  -ngl N, --n-gpu-layers N\n");
-    printf("                            number of layers to store in VRAM\n");
-    printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
-    printf("                            how to split the model across multiple GPUs, one of:\n");
-    printf("                              - none: use one GPU only\n");
-    printf("                              - layer (default): split layers and KV across GPUs\n");
-    printf("                              - row: split rows across GPUs\n");
-    printf("  -ts SPLIT --tensor-split SPLIT\n");
-    printf("                            fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
-    printf("  -mg i, --main-gpu i       the GPU to use for the model (with split-mode = none),\n");
-    printf("                            or for intermediate results and KV (with split-mode = row)\n");
-#endif
+    if (llama_supports_gpu_offload()) {
+        printf("  -ngl N, --n-gpu-layers N\n");
+        printf("                            number of layers to store in VRAM\n");
+        printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
+        printf("                            how to split the model across multiple GPUs, one of:\n");
+        printf("                              - none: use one GPU only\n");
+        printf("                              - layer (default): split layers and KV across GPUs\n");
+        printf("                              - row: split rows across GPUs\n");
+        printf("  -ts SPLIT --tensor-split SPLIT\n");
+        printf("                            fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
+        printf("  -mg i, --main-gpu i       the GPU to use for the model (with split-mode = none),\n");
+        printf("                            or for intermediate results and KV (with split-mode = row)\n");
+    }
    printf("  -m FNAME, --model FNAME\n");
    printf("                            model path (default: %s)\n", params.model.c_str());
    printf("  -a ALIAS, --alias ALIAS\n");
@ -2066,13 +2066,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                invalid_param = true;
                break;
            }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-            params.n_gpu_layers = std::stoi(argv[i]);
-#else
-            LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
+            if (llama_supports_gpu_offload()) {
+                params.n_gpu_layers = std::stoi(argv[i]);
+            } else {
+                LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
                        "See main README.md for information on enabling GPU BLAS support",
                        {{"n_gpu_layers", params.n_gpu_layers}});
-#endif
+            }
        }
        else if (arg == "--split-mode" || arg == "-sm")
        {
@ -2115,9 +2115,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            const std::regex regex{R"([,/]+)"};
            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
            std::vector<std::string> split_arg{it, {}};
-            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+            GGML_ASSERT(split_arg.size() <= llama_max_devices());

-            for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device)
+            for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device)
            {
                if (i_device < split_arg.size())
                {
--- a/examples/sycl/ls-sycl-device.cpp
+++ b/examples/sycl/ls-sycl-device.cpp
@ -1,7 +1,9 @@
-/*MIT license
-  Copyright (C) 2024 Intel Corporation
-  SPDX-License-Identifier: MIT
-*/
+//
+//  MIT license
+//  Copyright (C) 2024 Intel Corporation
+//  SPDX-License-Identifier: MIT
+//
+

 #include "ggml-sycl.h"

--- a/examples/sycl/win-build-sycl.bat
+++ b/examples/sycl/win-build-sycl.bat
@ -0,0 +1,23 @@
+
+::  MIT license
+::  Copyright (C) 2024 Intel Corporation
+::  SPDX-License-Identifier: MIT
+
+mkdir -p build
+cd build
+@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+
+::  for FP16
+::  faster for long-prompt inference
+::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+
+::  for FP32
+cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
+
+
+::  build example/main only
+::  make main
+
+::  build all binary
+make -j
+cd ..
--- a/examples/sycl/win-run-llama2.bat
+++ b/examples/sycl/win-run-llama2.bat
@ -0,0 +1,13 @@
+::  MIT license
+::  Copyright (C) 2024 Intel Corporation
+::  SPDX-License-Identifier: MIT
+
+INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+
+
+set GGML_SYCL_DEVICE=0
+rem set GGML_SYCL_DEBUG=1
+.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
+
+