From a18c19259a3cb9dec332d613e8f15704f678a468 Mon Sep 17 00:00:00 2001 From: Ben Siraphob Date: Wed, 22 Mar 2023 00:37:02 -0500 Subject: [PATCH 01/12] Fix Nix build --- flake.nix | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flake.nix b/flake.nix index da4bd7ba3..4c2717e0d 100644 --- a/flake.nix +++ b/flake.nix @@ -28,8 +28,8 @@ ]; installPhase = '' mkdir -p $out/bin - mv llama $out/bin/llama - mv quantize $out/bin/quantize + mv bin/main $out/bin/llama + mv bin/quantize $out/bin/quantize echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml chmod +x $out/bin/convert-pth-to-ggml From ea10d3ded2994106596ddf8e4ed02741b3e053e6 Mon Sep 17 00:00:00 2001 From: anzz1 Date: Thu, 23 Mar 2023 19:54:28 +0200 Subject: [PATCH 02/12] Command line args bounds checking (#424) * command line args bounds checking * unknown and invalid param exit codes 0 -> 1 --- utils.cpp | 101 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 17 deletions(-) diff --git a/utils.cpp b/utils.cpp index 1d5309c3a..45c9cabb1 100644 --- a/utils.cpp +++ b/utils.cpp @@ -26,41 +26,95 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency()); } + bool invalid_param = false; + std::string arg; for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; + arg = argv[i]; if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.seed = std::stoi(argv[i]); } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); } else if (arg == "-p" || arg == "--prompt") { - params.prompt = argv[++i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.prompt = argv[i]; } else if (arg == "-f" || arg == "--file") { - std::ifstream file(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); if (params.prompt.back() == '\n') { params.prompt.pop_back(); } } else if (arg == "-n" || arg == "--n_predict") { - params.n_predict = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); } else if (arg == "--top_k") { - params.top_k = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_k = std::stoi(argv[i]); } else if (arg == "-c" || arg == "--ctx_size") { - params.n_ctx = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); } else if (arg == "--memory_f16") { params.memory_f16 = true; } else if (arg == "--top_p") { - params.top_p = std::stof(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_p = std::stof(argv[i]); } else if (arg == "--temp") { - params.temp = std::stof(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.temp = std::stof(argv[i]); } else if (arg == "--repeat_last_n") { - params.repeat_last_n = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.repeat_last_n = std::stoi(argv[i]); } else if (arg == "--repeat_penalty") { - params.repeat_penalty = std::stof(argv[++i]); + if (++i >= argc) 
{ + invalid_param = true; + break; + } + params.repeat_penalty = std::stof(argv[i]); } else if (arg == "-b" || arg == "--batch_size") { - params.n_batch = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); } else if (arg == "-m" || arg == "--model") { - params.model = argv[++i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; } else if (arg == "-i" || arg == "--interactive") { params.interactive = true; } else if (arg == "--interactive-first") { @@ -70,13 +124,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else if (arg == "--color") { params.use_color = true; } else if (arg == "-r" || arg == "--reverse-prompt") { - params.antiprompt.push_back(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.antiprompt.push_back(argv[i]); } else if (arg == "--perplexity") { params.perplexity = true; } else if (arg == "--ignore-eos") { params.ignore_eos = true; } else if (arg == "--n_parts") { - params.n_parts = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_parts = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { gpt_print_usage(argc, argv, params); exit(0); @@ -85,9 +147,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); gpt_print_usage(argc, argv, params); - exit(0); + exit(1); } } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, params); + exit(1); + } return true; } From ad072fc5ad6f6905a7224ff6ea07c0644aa075b1 Mon Sep 17 00:00:00 2001 From: nusu-github <29514220+nusu-github@users.noreply.github.com> Date: Fri, 24 Mar 2023 05:16:48 +0900 Subject: [PATCH 03/12] Generate library with CMake (#430) * Generate library with CMake BUILD_SHARED_LIBS to allow llama library to be generated. * Turn ON PIC when BUILD_SHARED_LIBS is ON --- CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index d952afb4f..51af97c4d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -218,6 +218,9 @@ add_library(utils OBJECT target_include_directories(utils PUBLIC .) target_compile_features(utils PUBLIC cxx_std_11) # don't bump target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS}) +if (BUILD_SHARED_LIBS) + set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() add_library(ggml OBJECT ggml.c @@ -226,6 +229,9 @@ add_library(ggml OBJECT target_include_directories(ggml PUBLIC .) target_compile_features(ggml PUBLIC c_std_11) # don't bump target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS}) +if (BUILD_SHARED_LIBS) + set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() add_library(llama llama.cpp @@ -234,6 +240,10 @@ add_library(llama target_include_directories(llama PUBLIC .) 
target_compile_features(llama PUBLIC cxx_std_11) # don't bump target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS}) +if (BUILD_SHARED_LIBS) + set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) +endif() # # Executables From 20a1a4e09c522a80e2a0db51643d25fa38326065 Mon Sep 17 00:00:00 2001 From: Timmy Knight Date: Thu, 23 Mar 2023 10:18:13 -1000 Subject: [PATCH 04/12] Fix GPTQ converter (#423) * Fix GPTQ converter * Fix comment --------- Co-authored-by: Georgi Gerganov --- convert-gptq-to-ggml.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/convert-gptq-to-ggml.py b/convert-gptq-to-ggml.py index 7fccb4d56..6c77808fc 100644 --- a/convert-gptq-to-ggml.py +++ b/convert-gptq-to-ggml.py @@ -36,7 +36,8 @@ fname_out = sys.argv[3] fout = open(fname_out, "wb") -fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex +fout.write(struct.pack("i", 1)) # file version fout.write(struct.pack("i", n_vocab)) fout.write(struct.pack("i", n_embd)) fout.write(struct.pack("i", n_mult)) @@ -49,27 +50,21 @@ fout.write(struct.pack("i", 4)) # This loop unchanged from convert-pth-to-ggml.py: for i in range(tokenizer.vocab_size()): if tokenizer.is_unknown(i): - # "" token (translated as ??) text = " \u2047 ".encode("utf-8") - fout.write(struct.pack("i", len(text))) - fout.write(text) elif tokenizer.is_control(i): - # ""/"" tokens - fout.write(struct.pack("i", 0)) + text = b"" elif tokenizer.is_byte(i): - # "" tokens (which may be invalid UTF-8) piece = tokenizer.id_to_piece(i) if len(piece) != 6: - print("Invalid token: " + piece) + print(f"Invalid token: {piece}") sys.exit(1) byte_value = int(piece[3:-1], 16) - fout.write(struct.pack("i", 1)) - fout.write(struct.pack("B", byte_value)) + text = struct.pack("B", byte_value) else: - # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces. text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") - fout.write(struct.pack("i", len(text))) - fout.write(text) + fout.write(struct.pack("i", len(text))) + fout.write(text) + fout.write(struct.pack("f", tokenizer.get_score(i))) def write_header(shape, dst_name, ftype_cur): sname = dst_name.encode('utf-8') From 2e17dfd80a473099dacc0f41c9146d233c6a5972 Mon Sep 17 00:00:00 2001 From: rabidcopy Date: Thu, 23 Mar 2023 15:22:47 -0500 Subject: [PATCH 05/12] Replace EOS with newline to prevent context/memory being flushed by EOS in interactive mode (#333) * Improve interactive mode's coherence after EOS Aims to improve coherence and ability to resume the interactive session when the user is given input back after an end of text token is reached. Not sure what token 13 is or why it seems to help. See conversation for examples. 
* Make newline token a constant * dynamically determine newline token * relocate previous newline token const * cleanup whitespace * print a new line on end of text in interactive this may need to be looked into further when not using a reverse prompt * only print manual newline with reverse prompt fix formatting of reverse prompts so they don't end up at the end of the current line while not introducing unnecessary new lines otherwise * alternate approach to replace end of text tokens * Inject the reverse prompt again after eos in interactive mode * tokenize reverse prompt when needed makes this PR compatible with https://github.com/ggerganov/llama.cpp/pull/330 * tokenize and inject only first reverse prompt thanks to tjohnman * tokenize first reverse prompt once * add newline token * add newline token * tokenize/inject reverse prompt for refactor this doesn't seem right though * tokenize nothing for antiprompt if no reverse * Update main.cpp * Update main.cpp * tokenize and inject reverse prompt as needed this doesn't seem to work if the reverse prompt is tokenized outside earlier on * not needed * remove newline token * remove newline token * tokenize newline token * add space to comment * Update main.cpp Co-authored-by: Georgi Gerganov --------- Co-authored-by: Slaren <2141330+slaren@users.noreply.github.com> Co-authored-by: Georgi Gerganov --- main.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/main.cpp b/main.cpp index 431c94b52..5ba6d5a75 100644 --- a/main.cpp +++ b/main.cpp @@ -258,6 +258,9 @@ int main(int argc, char ** argv) { params.interactive = true; } + // determine newline token + auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); + fprintf(stderr, "\n"); fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); @@ -359,6 +362,16 @@ int main(int argc, char ** argv) { last_n_tokens.push_back(id); } + // replace end of text token with newline token when in interactive mode + if (id == llama_token_eos() && params.interactive) { + id = llama_token_newline.front(); + if (params.antiprompt.size() != 0) { + // tokenize and inject first reverse prompt + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + } + } + // add it to the context embd.push_back(id); @@ -451,12 +464,8 @@ int main(int argc, char ** argv) { // end of text token if (embd.back() == llama_token_eos()) { - if (params.interactive) { - is_interacting = true; - } else { - fprintf(stderr, " [end of text]\n"); - break; - } + fprintf(stderr, " [end of text]\n"); + break; } // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. 
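For readers skimming the hunks above, the control flow that PATCH 05/12 adds around the sampled token can be restated as a small standalone sketch. Every name below (the helper and its parameters) is illustrative only and does not exist in main.cpp; the real code works directly on embd_inp and the token vectors produced by ::llama_tokenize, exactly as shown in the diff.

    #include <cstdint>
    #include <vector>

    using token_id = int32_t;

    // Illustrative restatement of the interactive-mode EOS handling in PATCH 05/12:
    // when the sampler emits end-of-text, substitute the tokenized "\n" so the
    // context is not flushed, and queue the first reverse prompt (if any) so the
    // session hands control back to the user at a natural point.
    void soften_eos(token_id & sampled,
                    token_id eos_token,                              // llama_token_eos() in the patch
                    const std::vector<token_id> & newline_tokens,    // ::llama_tokenize(ctx, "\n", false)
                    const std::vector<token_id> & first_antiprompt,  // tokenized params.antiprompt.front()
                    std::vector<token_id> & pending_input) {         // embd_inp in main.cpp
        if (sampled != eos_token) {
            return;
        }
        sampled = newline_tokens.front();  // replace EOS with the newline token
        if (!first_antiprompt.empty()) {
            // inject the first reverse prompt so generation pauses where it normally would
            pending_input.insert(pending_input.end(),
                                 first_antiprompt.begin(), first_antiprompt.end());
        }
    }

In the patch this path only runs when params.interactive is set, which is why the later end-of-text check can unconditionally print " [end of text]" and stop: in interactive mode the EOS token has already been replaced before it reaches that check.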
From 0ba5a3a9a5efedb1aeecbbc70a4e9825542472d5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Mar 2023 22:32:02 +0200 Subject: [PATCH 06/12] Obsolete --- download-pth.py | 66 ------------------------------------------------- 1 file changed, 66 deletions(-) delete mode 100644 download-pth.py diff --git a/download-pth.py b/download-pth.py deleted file mode 100644 index 129532c0c..000000000 --- a/download-pth.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -import sys -from tqdm import tqdm -import requests - -if len(sys.argv) < 3: - print("Usage: download-pth.py dir-model model-type\n") - print(" model-type: Available models 7B, 13B, 30B or 65B") - sys.exit(1) - -modelsDir = sys.argv[1] -model = sys.argv[2] - -num = { - "7B": 1, - "13B": 2, - "30B": 4, - "65B": 8, -} - -if model not in num: - print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B") - sys.exit(1) - -print(f"Downloading model {model}") - -files = ["checklist.chk", "params.json"] - -for i in range(num[model]): - files.append(f"consolidated.0{i}.pth") - -resolved_path = os.path.abspath(os.path.join(modelsDir, model)) -os.makedirs(resolved_path, exist_ok=True) - -for file in files: - dest_path = os.path.join(resolved_path, file) - - if os.path.exists(dest_path): - print(f"Skip file download, it already exists: {file}") - continue - - url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}" - response = requests.get(url, stream=True) - with open(dest_path, 'wb') as f: - with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t: - for chunk in response.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - t.update(len(chunk)) - -files2 = ["tokenizer_checklist.chk", "tokenizer.model"] -for file in files2: - dest_path = os.path.join(modelsDir, file) - - if os.path.exists(dest_path): - print(f"Skip file download, it already exists: {file}") - continue - - url = f"https://agi.gpt4.org/llama/LLaMA/{file}" - response = requests.get(url, stream=True) - with open(dest_path, 'wb') as f: - with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t: - for chunk in response.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - t.update(len(chunk)) \ No newline at end of file From 4cc053b6d5e9df7ac21fa06b7208a70c156d4d7a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Mar 2023 22:39:44 +0200 Subject: [PATCH 07/12] Remove oboslete command from Docker script --- .devops/tools.sh | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.devops/tools.sh b/.devops/tools.sh index 352e04942..b0196b60d 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -16,11 +16,7 @@ elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then ./quantize $arg2 elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then ./main $arg2 -elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then - python3 ./download-pth.py $arg2 elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then - echo "Downloading model..." - python3 ./download-pth.py "$1" "$2" echo "Converting PTH to GGML..." 
for i in `ls $1/$2/ggml-model-f16.bin*`; do if [ -f "${i/f16/q4_0}" ]; then @@ -39,8 +35,6 @@ else echo " ex: \"/models/7B/\" 1" echo " --quantize (-q): Optimize with quantization process ggml" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" - echo " --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/" - echo " ex: \"/models/\" 7B" - echo " --all-in-one (-a): Execute --download, --convert & --quantize" + echo " --all-in-one (-a): Execute --convert & --quantize" echo " ex: \"/models/\" 7B" fi From 404e1da38ec8025707031a8027da14dc1590f952 Mon Sep 17 00:00:00 2001 From: Jed Fox Date: Thu, 23 Mar 2023 16:42:52 -0400 Subject: [PATCH 08/12] Fix quantize script not finding models in parent directory (#428) --- quantize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/quantize.py b/quantize.py index 6320b0a26..16b5963d3 100644 --- a/quantize.py +++ b/quantize.py @@ -57,6 +57,7 @@ def main(): # ) args = parser.parse_args() + args.models_path = os.path.abspath(args.models_path) if not os.path.isfile(args.quantize_script_path): print( From 483bab2e3d4a868fe679d8bb32827d2a4df214dc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Mar 2023 23:22:01 +0200 Subject: [PATCH 09/12] Avoid the transposed X branch in the Z = X * Y matrix multiplication (#439) Should make results reproducible for different number of threads and batch sizes --- llama.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index 7de3c19c8..d55219256 100644 --- a/llama.cpp +++ b/llama.cpp @@ -727,11 +727,13 @@ static bool llama_eval_internal( // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() struct ggml_tensor * V_trans = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - n_embd/n_head, n_head, n_past + N), - 1, 2, 0, 3); + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + n_embd/n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); // KQV = transpose(V) * KQ_soft_max struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); From 4870e455b3653f7d7769fa5772b2c90ffad088df Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Mar 2023 00:11:53 +0200 Subject: [PATCH 10/12] Fix memory allocation issues and seg faults --- llama.cpp | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/llama.cpp b/llama.cpp index d55219256..cf796cce3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -102,6 +102,9 @@ struct llama_context { // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; bool logits_all = false; + + // work buffer for transformer evaluation + std::vector buf_eval; }; struct llama_context_params llama_context_default_params() { @@ -627,27 +630,19 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; auto & mem_per_token = lctx.mem_per_token; + auto & buf_eval = lctx.buf_eval; - // TODO: fix this hardcoded size - static size_t buf_size = 512u*1024*1024; - static void * buf = malloc(buf_size); + if (mem_per_token*(n_past + N + 16) > buf_eval.size()) { + const size_t buf_size_new = 1.618*buf_eval.size(); - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - 
const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead - //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_eval.size(), buf_size_new); - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } + buf_eval.resize(buf_size_new); } struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, + /*.mem_size =*/ buf_eval.size(), + /*.mem_buffer =*/ buf_eval.data(), }; struct ggml_context * ctx0 = ggml_init(params); @@ -832,10 +827,11 @@ static bool llama_eval_internal( memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); } - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; + if (N == 1) { + mem_per_token = ggml_used_mem(ctx0)/(n_past + N); } - //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0)); + + //fprintf(stderr, "\nused_mem = %zu, %zu MB\n", ggml_used_mem(ctx0), ggml_used_mem(ctx0)/1024/1024); ggml_free(ctx0); @@ -1416,6 +1412,8 @@ struct llama_context * llama_init_from_file( return nullptr; } + ctx->buf_eval.resize(512u*1024u*1024u); + return ctx; } From 3cd8dde0d1357b7f11bdd25c45d5bf5e97e284a0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Mar 2023 06:22:28 +0200 Subject: [PATCH 11/12] Revert "Fix memory allocation issues and seg faults" This reverts commit 4870e455b3653f7d7769fa5772b2c90ffad088df. Will provide the correct fix later --- llama.cpp | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/llama.cpp b/llama.cpp index cf796cce3..d55219256 100644 --- a/llama.cpp +++ b/llama.cpp @@ -102,9 +102,6 @@ struct llama_context { // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; bool logits_all = false; - - // work buffer for transformer evaluation - std::vector buf_eval; }; struct llama_context_params llama_context_default_params() { @@ -630,19 +627,27 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; auto & mem_per_token = lctx.mem_per_token; - auto & buf_eval = lctx.buf_eval; - if (mem_per_token*(n_past + N + 16) > buf_eval.size()) { - const size_t buf_size_new = 1.618*buf_eval.size(); + // TODO: fix this hardcoded size + static size_t buf_size = 512u*1024*1024; + static void * buf = malloc(buf_size); - //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_eval.size(), buf_size_new); + if (mem_per_token > 0 && mem_per_token*N > buf_size) { + const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead + //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - buf_eval.resize(buf_size_new); + // reallocate + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } } struct ggml_init_params params = { - /*.mem_size =*/ buf_eval.size(), - /*.mem_buffer =*/ buf_eval.data(), + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, }; struct ggml_context * ctx0 = ggml_init(params); @@ -827,11 +832,10 @@ static bool llama_eval_internal( memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), 
sizeof(float)*n_vocab); } - if (N == 1) { - mem_per_token = ggml_used_mem(ctx0)/(n_past + N); + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; } - - //fprintf(stderr, "\nused_mem = %zu, %zu MB\n", ggml_used_mem(ctx0), ggml_used_mem(ctx0)/1024/1024); + //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0)); ggml_free(ctx0); @@ -1412,8 +1416,6 @@ struct llama_context * llama_init_from_file( return nullptr; } - ctx->buf_eval.resize(512u*1024u*1024u); - return ctx; } From b6b268d4415fd3b3e53f22b6619b724d4928f713 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Mar 2023 09:13:35 +0200 Subject: [PATCH 12/12] Add link to Roadmap discussion --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ee8dc1dcb..06799b5b3 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457) - New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370 -- [Added Alpaca support](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca) - Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64 - Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
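A closing note on PATCH 10/12 and its revert in PATCH 11/12: the state the revert returns to sizes the ggml evaluation scratch buffer from a measured bytes-per-token estimate and reallocates with roughly 30% headroom. The sketch below restates that heuristic with a std::vector and illustrative names; it is a simplification for reference under those assumptions, not the code that ships in llama.cpp.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Simplified restatement of the scratch-buffer heuristic restored by PATCH 11/12:
    // start from a fixed 512 MB buffer, learn bytes-per-token from the first
    // evaluation (ggml_used_mem(ctx0)/N in llama.cpp), then grow to 1.3x the
    // estimated requirement; the extra 30% covers ggml object overhead.
    struct eval_scratch {
        std::vector<uint8_t> buf = std::vector<uint8_t>(512u*1024*1024);
        size_t bytes_per_token   = 0;  // updated after an evaluation

        void ensure_capacity(size_t n_tokens) {
            const size_t needed = bytes_per_token*n_tokens;
            if (bytes_per_token > 0 && needed > buf.size()) {
                buf.resize(static_cast<size_t>(1.3*needed));
            }
        }
    };

PATCH 10/12 had replaced this with a per-context buf_eval grown by a factor of 1.618 and keyed on n_past + N; the revert message notes that a corrected fix will follow later.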