Merge 'origin/master' into hipblas

Commit a4648c1e7c by Henri Vasserman, 2023-05-27 18:22:39 +03:00
6 changed files with 79 additions and 19 deletions

.github/workflows/build.yml

@@ -165,7 +165,7 @@ jobs:
           - build: 'clblast'
             defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'
+            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
     steps:
       - name: Clone

@@ -187,7 +187,7 @@ jobs:
           curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
           curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
           7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
-          rename-item $env:RUNNER_TEMP/clblast_release_dir clblast
+          rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
           foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
             $txt = Get-Content -Path $f -Raw
             $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8

@@ -213,7 +213,6 @@ jobs:
           cd build
           cmake .. ${{ matrix.defines }}
           cmake --build . --config Release
-          cp ../LICENSE ./bin/Release/llama.cpp.txt
       - name: Add clblast.dll
         id: add_clblast_dll

@@ -258,6 +257,7 @@ jobs:
         id: pack_artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
+          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
           7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
       - name: Upload artifacts

CMakeLists.txt

@@ -66,7 +66,7 @@ endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_BLAS       "llama: use BLAS" OFF)
-option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS     "llama: use cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y  "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")

examples/main/README.md

@@ -272,7 +272,7 @@ These options help improve the performance and memory usage of the LLaMA models.
 ### Prompt Caching

-- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs.
+- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.

 ### Quantization

examples/main/main.cpp

@@ -134,8 +134,6 @@ int main(int argc, char ** argv) {
         return 0;
     }

-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;

@@ -155,6 +153,7 @@ int main(int argc, char ** argv) {
                 return 1;
             }
             session_tokens.resize(n_token_count_out);
+            llama_set_rng_seed(ctx, params.seed);

             fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
         } else {
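The `llama_set_rng_seed` call added above matters because loading a session file restores the saved context state, including the sampling RNG; without reseeding, a restored run would ignore the seed requested for the current invocation (this is also why the README note above warns that a cached prompt is not an exact session replay). A minimal sketch of the load-then-reseed pattern, assuming the llama.h C API of this revision; the `load_session` helper itself is hypothetical:

    #include "llama.h"
    #include <vector>

    // Hypothetical helper: restore a session and honor the caller's seed.
    static bool load_session(llama_context * ctx, const char * path, int seed,
                             std::vector<llama_token> & session_tokens) {
        session_tokens.resize(llama_n_ctx(ctx));
        size_t n_token_count_out = 0;
        if (!llama_load_session_file(ctx, path, session_tokens.data(),
                                     session_tokens.size(), &n_token_count_out)) {
            return false; // no usable session file
        }
        session_tokens.resize(n_token_count_out);
        llama_set_rng_seed(ctx, seed); // override the RNG state restored from disk
        return true;
    }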
@@ -163,7 +162,16 @@ int main(int argc, char ** argv) {
     }

     // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> embd_inp;
+
+    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
+        // Add a space in front of the first character to match OG llama tokenizer behavior
+        params.prompt.insert(0, 1, ' ');
+
+        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    } else {
+        embd_inp = session_tokens;
+    }

     const int n_ctx = llama_n_ctx(ctx);

@@ -181,7 +189,9 @@ int main(int argc, char ** argv) {
             }
             n_matching_session_tokens++;
         }
-        if (n_matching_session_tokens >= embd_inp.size()) {
+        if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
+            fprintf(stderr, "%s: using full prompt from session file\n", __func__);
+        } else if (n_matching_session_tokens >= embd_inp.size()) {
             fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
         } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
             fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",

ggml.c (58 changes)

@@ -3494,7 +3494,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
 };
 static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");

-static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
+static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",

     "DUP",

@@ -3749,6 +3749,9 @@ const char * ggml_type_name(enum ggml_type type) {
     return GGML_TYPE_NAME[type];
 }

+const char * ggml_op_name(enum ggml_op op) {
+    return GGML_OP_NAME[op];
+}
+
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
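With `ggml_op_name` now exported, callers can print readable operation names for graph nodes, e.g. in ad-hoc debugging code. A small sketch, assuming a graph built elsewhere; the `print_graph_ops` helper is hypothetical:

    #include "ggml.h"
    #include <cstdio>

    // Hypothetical debug helper: list every node's operation by name.
    static void print_graph_ops(const struct ggml_cgraph * gf) {
        for (int i = 0; i < gf->n_nodes; i++) {
            std::printf("node %3d: op=%s\n", i, ggml_op_name(gf->nodes[i]->op));
        }
    }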
@@ -3805,6 +3808,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     return wtype;
 }

+size_t ggml_tensor_overhead(void) {
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+}
+
 static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }

@@ -4017,6 +4024,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }

+void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
+    ctx->no_alloc = no_alloc;
+}
+
 // IMPORTANT:
 // when creating "opt" tensors, always save and load the scratch buffer
 // this is an error prone process, but it is necessary to support inplace
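Taken together, `ggml_set_no_alloc` and the new `ggml_tensor_overhead` let a caller create tensors whose data is managed externally (e.g. mmap'd weights) while still budgeting context memory for the per-tensor headers. A sketch under those assumptions; the sizes are illustrative, not from this commit:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params;
        params.mem_size   = 64 * ggml_tensor_overhead(); // headers only, ~64 tensors
        params.mem_buffer = NULL;
        params.no_alloc   = false;
        struct ggml_context * ctx = ggml_init(params);

        ggml_set_no_alloc(ctx, true); // from here on, no data buffers are allocated
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);
        // a->data must be pointed at externally managed memory before use
        ggml_free(ctx);
        return 0;
    }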
@@ -4061,7 +4072,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);

     if (ctx->scratch.data == NULL || data != NULL) {
-        size_needed += sizeof(struct ggml_tensor);
+        size_needed += GGML_TENSOR_SIZE;

         if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",

@@ -4077,14 +4088,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
         };
     } else {
         if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
+            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                    __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
             assert(false);
             return NULL;
         }

-        if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
+        if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                    __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
+                    __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
             assert(false);
             return NULL;
         }

@@ -4093,7 +4105,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
     *obj_new = (struct ggml_object) {
         .offs = cur_end + GGML_OBJECT_SIZE,
-        .size = sizeof(struct ggml_tensor),
+        .size = GGML_TENSOR_SIZE,
         .next = NULL,
     };

@@ -13792,11 +13804,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         // reached a leaf node, not part of the gradient graph (e.g. a constant)
         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);

+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+        }
+
         cgraph->leafs[cgraph->n_leafs] = node;
         cgraph->n_leafs++;
     } else {
         GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);

+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+        }
+
         cgraph->nodes[cgraph->n_nodes] = node;
         cgraph->grads[cgraph->n_nodes] = node->grad;
         cgraph->n_nodes++;
@@ -14510,6 +14530,26 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     }
 }

+struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) {
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * leaf = cgraph->leafs[i];
+
+        if (strcmp(leaf->name, name) == 0) {
+            return leaf;
+        }
+    }
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        if (strcmp(node->name, name) == 0) {
+            return node;
+        }
+    }
+
+    return NULL;
+}
+
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
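Combined with the automatic `leaf_%d` / `node_%d` naming added to `ggml_visit_parents` above, every tensor in a built graph is now reachable by name. A usage sketch; the `inspect` function is hypothetical:

    #include "ggml.h"
    #include <cstdio>

    // Hypothetical: fetch a graph tensor by its auto-assigned name.
    static void inspect(struct ggml_tensor * output) {
        struct ggml_cgraph gf = ggml_build_forward(output);
        // nodes without an explicit name were auto-named node_0, node_1, ...
        struct ggml_tensor * t = ggml_get_tensor_by_name(&gf, "node_0");
        if (t != NULL) {
            std::printf("node_0: op=%s\n", ggml_op_name(t->op));
        }
    }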
@@ -14527,7 +14567,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
-                GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+                GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
                 (double) node->perf_time_us / 1000.0,

@@ -14541,7 +14581,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
                 i,
                 node->ne[0], node->ne[1],
-                GGML_OP_LABEL[node->op]);
+                GGML_OP_NAME[node->op]);
     }

@@ -14549,7 +14589,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
             continue;
         }

-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
+        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
     }

     GGML_PRINT("========================================\n");

ggml.h (12 changes)

@@ -198,6 +198,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
+#define GGML_MAX_NAME          32
 #define GGML_DEFAULT_N_THREADS 4

 #define GGML_ASSERT(x) \

@@ -372,11 +373,13 @@ extern "C" {
         void * data;

-        char name[32];
+        char name[GGML_MAX_NAME];

         char padding[16];
     };

+    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;

@@ -429,6 +432,7 @@ extern "C" {
     GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float

     GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);

     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

@@ -437,6 +441,9 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main

     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);

@@ -445,6 +452,7 @@ extern "C" {
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

     GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API void   ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);

     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,

@@ -970,6 +978,8 @@ extern "C" {
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);

+    GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name);
+
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);