Merge 'origin/master' into hipblas

Commit a4648c1e7c by Henri Vasserman, 2023-05-27 18:22:39 +03:00
6 changed files with 79 additions and 19 deletions

.github/workflows/build.yml

@@ -165,7 +165,7 @@ jobs:
           - build: 'clblast'
             defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'
+            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
     steps:
       - name: Clone

@@ -187,7 +187,7 @@ jobs:
           curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
           curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
           7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
-          rename-item $env:RUNNER_TEMP/clblast_release_dir clblast
+          rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
           foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
             $txt = Get-Content -Path $f -Raw
             $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8

@@ -213,7 +213,6 @@ jobs:
           cd build
           cmake .. ${{ matrix.defines }}
           cmake --build . --config Release
-          cp ../LICENSE ./bin/Release/llama.cpp.txt
       - name: Add clblast.dll
         id: add_clblast_dll

@@ -258,6 +257,7 @@ jobs:
         id: pack_artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
+          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
           7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
       - name: Upload artifacts

CMakeLists.txt

@@ -66,7 +66,7 @@ endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_BLAS       "llama: use BLAS" OFF)
-option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS     "llama: use cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y  "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")

examples/main/README.md

@@ -272,7 +272,7 @@ These options help improve the performance and memory usage of the LLaMA models.
 ### Prompt Caching

-- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs.
+- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.

 ### Quantization

examples/main/main.cpp

@@ -134,8 +134,6 @@ int main(int argc, char ** argv) {
         return 0;
     }

-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;

@@ -155,6 +153,7 @@ int main(int argc, char ** argv) {
                 return 1;
             }
             session_tokens.resize(n_token_count_out);
+            llama_set_rng_seed(ctx, params.seed);

             fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
         } else {
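The `llama_set_rng_seed` call added above matters because loading a session file restores the saved context state, including the sampling RNG; without reseeding, a restored run would ignore the seed requested for the current invocation (this is also why the README note above warns that a cached prompt is not an exact session replay). A minimal sketch of the load-then-reseed pattern, assuming the llama.h C API of this revision; the `load_session` helper itself is hypothetical:

    #include "llama.h"
    #include <vector>

    // Hypothetical helper: restore a session and honor the caller's seed.
    static bool load_session(llama_context * ctx, const char * path, int seed,
                             std::vector<llama_token> & session_tokens) {
        session_tokens.resize(llama_n_ctx(ctx));
        size_t n_token_count_out = 0;
        if (!llama_load_session_file(ctx, path, session_tokens.data(),
                                     session_tokens.size(), &n_token_count_out)) {
            return false; // no usable session file
        }
        session_tokens.resize(n_token_count_out);
        llama_set_rng_seed(ctx, seed); // override the RNG state restored from disk
        return true;
    }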
@@ -163,7 +162,16 @@ int main(int argc, char ** argv) {
     }

     // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> embd_inp;
+
+    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
+        // Add a space in front of the first character to match OG llama tokenizer behavior
+        params.prompt.insert(0, 1, ' ');
+
+        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    } else {
+        embd_inp = session_tokens;
+    }

     const int n_ctx = llama_n_ctx(ctx);

@@ -181,7 +189,9 @@ int main(int argc, char ** argv) {
             }
             n_matching_session_tokens++;
         }
-        if (n_matching_session_tokens >= embd_inp.size()) {
+        if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
+            fprintf(stderr, "%s: using full prompt from session file\n", __func__);
+        } else if (n_matching_session_tokens >= embd_inp.size()) {
             fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
         } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
             fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",

ggml.c (58 changes)

@@ -3494,7 +3494,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
 };
 static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");

-static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
+static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",

     "DUP",

@@ -3749,6 +3749,9 @@ const char * ggml_type_name(enum ggml_type type) {
     return GGML_TYPE_NAME[type];
 }

+const char * ggml_op_name(enum ggml_op op) {
+    return GGML_OP_NAME[op];
+}
+
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
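With `ggml_op_name` now exported, callers can print readable operation names for graph nodes, e.g. in ad-hoc debugging code. A small sketch, assuming a graph built elsewhere; the `print_graph_ops` helper is hypothetical:

    #include "ggml.h"
    #include <cstdio>

    // Hypothetical debug helper: list every node's operation by name.
    static void print_graph_ops(const struct ggml_cgraph * gf) {
        for (int i = 0; i < gf->n_nodes; i++) {
            std::printf("node %3d: op=%s\n", i, ggml_op_name(gf->nodes[i]->op));
        }
    }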
@@ -3805,6 +3808,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     return wtype;
 }

+size_t ggml_tensor_overhead(void) {
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+}
+
 static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }

@@ -4017,6 +4024,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }

+void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
+    ctx->no_alloc = no_alloc;
+}
+
 // IMPORTANT:
 // when creating "opt" tensors, always save and load the scratch buffer
 // this is an error prone process, but it is necessary to support inplace
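Taken together, `ggml_set_no_alloc` and the new `ggml_tensor_overhead` let a caller create tensors whose data is managed externally (e.g. mmap'd weights) while still budgeting context memory for the per-tensor headers. A sketch under those assumptions; the sizes are illustrative, not from this commit:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params;
        params.mem_size   = 64 * ggml_tensor_overhead(); // headers only, ~64 tensors
        params.mem_buffer = NULL;
        params.no_alloc   = false;
        struct ggml_context * ctx = ggml_init(params);

        ggml_set_no_alloc(ctx, true); // from here on, no data buffers are allocated
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);
        // a->data must be pointed at externally managed memory before use
        ggml_free(ctx);
        return 0;
    }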
@@ -4061,7 +4072,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);

     if (ctx->scratch.data == NULL || data != NULL) {
-        size_needed += sizeof(struct ggml_tensor);
+        size_needed += GGML_TENSOR_SIZE;

         if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",

@@ -4077,14 +4088,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
         };
     } else {
         if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
+            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                    __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
             assert(false);
             return NULL;
         }

-        if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
+        if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                    __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
+                    __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
             assert(false);
             return NULL;
         }

@@ -4093,7 +4105,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
     *obj_new = (struct ggml_object) {
         .offs = cur_end + GGML_OBJECT_SIZE,
-        .size = sizeof(struct ggml_tensor),
+        .size = GGML_TENSOR_SIZE,
         .next = NULL,
     };

@@ -13792,11 +13804,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         // reached a leaf node, not part of the gradient graph (e.g. a constant)
         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);

+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+        }
+
         cgraph->leafs[cgraph->n_leafs] = node;
         cgraph->n_leafs++;
     } else {
         GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);

+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+        }
+
         cgraph->nodes[cgraph->n_nodes] = node;
         cgraph->grads[cgraph->n_nodes] = node->grad;
         cgraph->n_nodes++;
@@ -14510,6 +14530,26 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     }
 }

+struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) {
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * leaf = cgraph->leafs[i];
+
+        if (strcmp(leaf->name, name) == 0) {
+            return leaf;
+        }
+    }
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        if (strcmp(node->name, name) == 0) {
+            return node;
+        }
+    }
+
+    return NULL;
+}
+
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
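Combined with the automatic `leaf_%d` / `node_%d` naming added to `ggml_visit_parents` above, every tensor in a built graph is now reachable by name. A usage sketch; the `inspect` function is hypothetical:

    #include "ggml.h"
    #include <cstdio>

    // Hypothetical: fetch a graph tensor by its auto-assigned name.
    static void inspect(struct ggml_tensor * output) {
        struct ggml_cgraph gf = ggml_build_forward(output);
        // nodes without an explicit name were auto-named node_0, node_1, ...
        struct ggml_tensor * t = ggml_get_tensor_by_name(&gf, "node_0");
        if (t != NULL) {
            std::printf("node_0: op=%s\n", ggml_op_name(t->op));
        }
    }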
@@ -14527,7 +14567,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
-                GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+                GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
                 (double) node->perf_time_us / 1000.0,

@@ -14541,7 +14581,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
                 i,
                 node->ne[0], node->ne[1],
-                GGML_OP_LABEL[node->op]);
+                GGML_OP_NAME[node->op]);
     }

@@ -14549,7 +14589,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
             continue;
         }

-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
+        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
     }

     GGML_PRINT("========================================\n");

ggml.h (12 changes)

@@ -198,6 +198,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
+#define GGML_MAX_NAME          32
 #define GGML_DEFAULT_N_THREADS 4

 #define GGML_ASSERT(x) \

@@ -372,11 +373,13 @@ extern "C" {
         void * data;

-        char name[32];
+        char name[GGML_MAX_NAME];

         char padding[16];
     };

+    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;

@@ -429,6 +432,7 @@ extern "C" {
     GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float

     GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);

     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

@@ -437,6 +441,9 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main

     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);

@@ -445,6 +452,7 @@ extern "C" {
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

     GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API void   ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);

     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,

@@ -970,6 +978,8 @@ extern "C" {
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);

+    GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name);
+
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);