diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d5c2cdea5..41f2dee28 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -165,7 +165,7 @@ jobs: - build: 'clblast' defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"' - build: 'openblas' - defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"' + defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' steps: - name: Clone @@ -187,7 +187,7 @@ jobs: curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z" curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE" 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z - rename-item $env:RUNNER_TEMP/clblast_release_dir clblast + rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) { $txt = Get-Content -Path $f -Raw $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8 @@ -213,7 +213,6 @@ jobs: cd build cmake .. ${{ matrix.defines }} cmake --build . --config Release - cp ../LICENSE ./bin/Release/llama.cpp.txt - name: Add clblast.dll id: add_clblast_dll @@ -258,6 +257,7 @@ jobs: id: pack_artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | + Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt 7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\* - name: Upload artifacts diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c0fb0573..999bd1742 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,7 +66,7 @@ endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_BLAS "llama: use BLAS" OFF) -option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic) +set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") diff --git a/examples/main/README.md b/examples/main/README.md index 7c03f92c8..e71ba6173 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -272,7 +272,7 @@ These options help improve the performance and memory usage of the LLaMA models. ### Prompt Caching -- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. +- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation. ### Quantization diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 47b418d97..c7c591537 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -134,8 +134,6 @@ int main(int argc, char ** argv) { return 0; } - // Add a space in front of the first character to match OG llama tokenizer behavior - params.prompt.insert(0, 1, ' '); std::string path_session = params.path_prompt_cache; std::vector session_tokens; @@ -155,6 +153,7 @@ int main(int argc, char ** argv) { return 1; } session_tokens.resize(n_token_count_out); + llama_set_rng_seed(ctx, params.seed); fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); } else { @@ -163,7 +162,16 @@ int main(int argc, char ** argv) { } // tokenize the prompt - auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); + std::vector embd_inp; + + if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); + + embd_inp = ::llama_tokenize(ctx, params.prompt, true); + } else { + embd_inp = session_tokens; + } const int n_ctx = llama_n_ctx(ctx); @@ -181,7 +189,9 @@ int main(int argc, char ** argv) { } n_matching_session_tokens++; } - if (n_matching_session_tokens >= embd_inp.size()) { + if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { + fprintf(stderr, "%s: using full prompt from session file\n", __func__); + } else if (n_matching_session_tokens >= embd_inp.size()) { fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__); } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", diff --git a/ggml.c b/ggml.c index 0fdb4b086..4f801000c 100644 --- a/ggml.c +++ b/ggml.c @@ -3494,7 +3494,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { }; static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated"); -static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { +static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "NONE", "DUP", @@ -3749,6 +3749,9 @@ const char * ggml_type_name(enum ggml_type type) { return GGML_TYPE_NAME[type]; } +const char * ggml_op_name(enum ggml_op op) { + return GGML_OP_NAME[op]; +} size_t ggml_element_size(const struct ggml_tensor * tensor) { return GGML_TYPE_SIZE[tensor->type]; @@ -3805,6 +3808,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { return wtype; } +size_t ggml_tensor_overhead(void) { + return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16; +} + static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } @@ -4017,6 +4024,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) return result; } +void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) { + ctx->no_alloc = no_alloc; +} + // IMPORTANT: // when creating "opt" tensors, always save and load the scratch buffer // this is an error prone process, but it is necessary to support inplace @@ -4061,7 +4072,7 @@ struct ggml_tensor * ggml_new_tensor_impl( struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end); if (ctx->scratch.data == NULL || data != NULL) { - size_needed += sizeof(struct ggml_tensor); + size_needed += GGML_TENSOR_SIZE; if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", @@ -4077,14 +4088,15 @@ struct ggml_tensor * ggml_new_tensor_impl( }; } else { if (ctx->scratch.offs + size_needed > ctx->scratch.size) { - GGML_PRINT("%s: not enough space in the scratch memory\n", __func__); + GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", + __func__, ctx->scratch.offs + size_needed, ctx->scratch.size); assert(false); return NULL; } - if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) { + if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) { GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size); + __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size); assert(false); return NULL; } @@ -4093,7 +4105,7 @@ struct ggml_tensor * ggml_new_tensor_impl( *obj_new = (struct ggml_object) { .offs = cur_end + GGML_OBJECT_SIZE, - .size = sizeof(struct ggml_tensor), + .size = GGML_TENSOR_SIZE, .next = NULL, }; @@ -13792,11 +13804,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * // reached a leaf node, not part of the gradient graph (e.g. a constant) GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES); + if (strlen(node->name) == 0) { + snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs); + } + cgraph->leafs[cgraph->n_leafs] = node; cgraph->n_leafs++; } else { GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES); + if (strlen(node->name) == 0) { + snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes); + } + cgraph->nodes[cgraph->n_nodes] = node; cgraph->grads[cgraph->n_nodes] = node->grad; cgraph->n_nodes++; @@ -14510,6 +14530,26 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { } } +struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) { + for (int i = 0; i < cgraph->n_leafs; i++) { + struct ggml_tensor * leaf = cgraph->leafs[i]; + + if (strcmp(leaf->name, name) == 0) { + return leaf; + } + } + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + if (strcmp(node->name, name) == 0) { + return node; + } + } + + return NULL; +} + void ggml_graph_print(const struct ggml_cgraph * cgraph) { int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0}; @@ -14527,7 +14567,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", i, node->ne[0], node->ne[1], node->ne[2], - GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, + GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, (double) node->perf_cycles / (double) ggml_cycles_per_ms(), (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs, (double) node->perf_time_us / 1000.0, @@ -14541,7 +14581,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n", i, node->ne[0], node->ne[1], - GGML_OP_LABEL[node->op]); + GGML_OP_NAME[node->op]); } for (int i = 0; i < GGML_OP_COUNT; i++) { @@ -14549,7 +14589,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { continue; } - GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0); + GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0); } GGML_PRINT("========================================\n"); diff --git a/ggml.h b/ggml.h index c22d93836..558138280 100644 --- a/ggml.h +++ b/ggml.h @@ -198,6 +198,7 @@ #define GGML_MAX_PARAMS 256 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_OPT 4 +#define GGML_MAX_NAME 32 #define GGML_DEFAULT_N_THREADS 4 #define GGML_ASSERT(x) \ @@ -372,11 +373,13 @@ extern "C" { void * data; - char name[32]; + char name[GGML_MAX_NAME]; char padding[16]; }; + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + // computation graph struct ggml_cgraph { int n_nodes; @@ -429,6 +432,7 @@ extern "C" { GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float GGML_API const char * ggml_type_name(enum ggml_type type); + GGML_API const char * ggml_op_name (enum ggml_op op); GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); @@ -437,6 +441,9 @@ extern "C" { // TODO: temporary until model loading of ggml examples is refactored GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + // use this to compute the memory overhead of a tensor + GGML_API size_t ggml_tensor_overhead(void); + // main GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); @@ -445,6 +452,7 @@ extern "C" { GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); + GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); GGML_API struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, @@ -970,6 +978,8 @@ extern "C" { GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name); + // print info and performance information for the graph GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);