diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d39cd6bc3..0da01d5ba 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -333,6 +333,7 @@ jobs: mkdir build cd build cmake -G Xcode .. \ + -DLLAMA_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ @@ -361,6 +362,7 @@ jobs: mkdir build cd build cmake -G Xcode .. \ + -DLLAMA_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml new file mode 100644 index 000000000..bc08a72d0 --- /dev/null +++ b/.github/workflows/close-issue.yml @@ -0,0 +1,22 @@ +name: Close inactive issues +on: + schedule: + - cron: "42 0 * * *" + +jobs: + close-issues: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v5 + with: + days-before-issue-stale: 30 + days-before-issue-close: 14 + stale-issue-label: "stale" + stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." + close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." + days-before-pr-stale: -1 + days-before-pr-close: -1 + repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index e385e03f3..5e38b3547 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -25,17 +25,14 @@ jobs: strategy: matrix: sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug, Release] + build_type: [Debug] include: - build_type: Release sanitizer: "" - exclude: - - build_type: Release - sanitizer: ADDRESS - - build_type: Release + - build_type: Debug sanitizer: THREAD - - build_type: Release - sanitizer: UNDEFINED + disabled_on_pr: true + fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken container: image: ubuntu:latest @@ -81,13 +78,14 @@ jobs: - name: Tests id: server_integration_tests + if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | cd examples/server/tests PORT=8888 ./tests.sh - name: Slow tests id: server_integration_tests_slow - if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }} + if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} run: | cd examples/server/tests PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow @@ -124,13 +122,14 @@ jobs: - name: Tests id: server_integration_tests + if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | cd examples/server/tests behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp - name: Slow tests id: server_integration_tests_slow - if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }} + if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} run: | cd examples/server/tests behave.exe --stop --no-skipped --no-capture --tags slow diff --git a/.gitignore b/.gitignore index d28f4d1b8..1ad8d929b 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,8 @@ .vscode/ .idea/ +ggml-metal-embed.metal + lcov-report/ gcovr-report/ diff --git a/CMakeLists.txt b/CMakeLists.txt index a8abf4088..3ac2804a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,9 +200,6 @@ if (LLAMA_METAL) add_compile_definitions(GGML_METAL_NDEBUG) endif() - # get full path to the file - #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") - # copy ggml-common.h and ggml-metal.metal to bin directory configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) @@ -211,53 +208,62 @@ if (LLAMA_METAL) enable_language(ASM) add_compile_definitions(GGML_METAL_EMBED_LIBRARY) + set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h") set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") - set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s") + + # merge ggml-common.h and ggml-metal.metal into a single file + set(METALLIB_EMBED_ASM "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s") + set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal") add_custom_command( - OUTPUT ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY} - DEPENDS ${METALLIB_SOURCE} + OUTPUT ${METALLIB_EMBED_ASM} + COMMAND echo "Embedding Metal library" + COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e '/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED} + COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM} + COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM} + COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM} + COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM} + COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM} + COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM} + DEPENDS ggml-metal.metal ggml-common.h COMMENT "Generate assembly for embedded Metal library" ) - set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY}) - endif() - - if (LLAMA_METAL_SHADER_DEBUG) - # custom command to do the following: - # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air - # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib - # - # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works - # disabling fast math is needed in order to pass tests/test-backend-ops - # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 - # note: unfortunately, we have to call it default.metallib instead of ggml.metallib - # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 - set(XC_FLAGS -fno-fast-math -fno-inline -g) - if (LLAMA_QKK_64) - set(XC_FLAGS ${XC_FLAGS} -DQK_K=64) + set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM}) + else() + if (LLAMA_METAL_SHADER_DEBUG) + # custom command to do the following: + # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air + # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib + # + # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works + # disabling fast math is needed in order to pass tests/test-backend-ops + # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 + # note: unfortunately, we have to call it default.metallib instead of ggml.metallib + # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 + set(XC_FLAGS -fno-fast-math -fno-inline -g) + else() + set(XC_FLAGS -O3) endif() add_custom_command( OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - DEPENDS ggml-metal.metal + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal + DEPENDS ggml-metal.metal ggml-common.h COMMENT "Compiling Metal kernels" - ) + ) add_custom_target( ggml-metal ALL DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - ) - endif() + ) + endif() # LLAMA_METAL_EMBED_LIBRARY set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${FOUNDATION_LIBRARY} diff --git a/Makefile b/Makefile index db9968efb..c0f125036 100644 --- a/Makefile +++ b/Makefile @@ -553,19 +553,20 @@ endif endif # LLAMA_METAL ifdef LLAMA_METAL -ggml-metal.o: ggml-metal.m ggml-metal.h +ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h $(CC) $(CFLAGS) -c $< -o $@ ifdef LLAMA_METAL_EMBED_LIBRARY -ggml-metal-embed.o: ggml-metal.metal +ggml-metal-embed.o: ggml-metal.metal ggml-common.h @echo "Embedding Metal library" + @sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal $(eval TEMP_ASSEMBLY=$(shell mktemp)) - @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) - @echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) + @echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) @$(AS) $(TEMP_ASSEMBLY) -o $@ @rm -f ${TEMP_ASSEMBLY} endif diff --git a/README.md b/README.md index 61bedc3f8..c2f3342f0 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,7 @@ Typically finetunes of the base models below are supported as well. - [x] [CodeShell](https://github.com/WisdomShell/codeshell) - [x] [Gemma](https://ai.google.dev/gemma) - [x] [Mamba](https://github.com/state-spaces/mamba) +- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01) **Multimodal models:** @@ -133,6 +134,7 @@ Typically finetunes of the base models below are supported as well. - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) +- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) diff --git a/common/common.cpp b/common/common.cpp index 73b1b61ba..1b0ba8493 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -151,13 +151,17 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::replace(arg.begin(), arg.end(), '_', '-'); } + bool arg_found = false; if (arg == "-s" || arg == "--seed") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.seed = std::stoul(argv[i]); - } else if (arg == "-t" || arg == "--threads") { + } + if (arg == "-t" || arg == "--threads") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -166,7 +170,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads <= 0) { params.n_threads = std::thread::hardware_concurrency(); } - } else if (arg == "-tb" || arg == "--threads-batch") { + } + if (arg == "-tb" || arg == "--threads-batch") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -175,7 +181,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads_batch <= 0) { params.n_threads_batch = std::thread::hardware_concurrency(); } - } else if (arg == "-td" || arg == "--threads-draft") { + } + if (arg == "-td" || arg == "--threads-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -184,7 +192,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads_draft <= 0) { params.n_threads_draft = std::thread::hardware_concurrency(); } - } else if (arg == "-tbd" || arg == "--threads-batch-draft") { + } + if (arg == "-tbd" || arg == "--threads-batch-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -193,25 +203,37 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads_batch_draft <= 0) { params.n_threads_batch_draft = std::thread::hardware_concurrency(); } - } else if (arg == "-p" || arg == "--prompt") { + } + if (arg == "-p" || arg == "--prompt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.prompt = argv[i]; - } else if (arg == "-e" || arg == "--escape") { + } + if (arg == "-e" || arg == "--escape") { + arg_found = true; params.escape = true; - } else if (arg == "--prompt-cache") { + } + if (arg == "--prompt-cache") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.path_prompt_cache = argv[i]; - } else if (arg == "--prompt-cache-all") { + } + if (arg == "--prompt-cache-all") { + arg_found = true; params.prompt_cache_all = true; - } else if (arg == "--prompt-cache-ro") { + } + if (arg == "--prompt-cache-ro") { + arg_found = true; params.prompt_cache_ro = true; - } else if (arg == "-bf" || arg == "--binary-file") { + } + if (arg == "-bf" || arg == "--binary-file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -228,7 +250,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { ss << file.rdbuf(); params.prompt = ss.str(); fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); - } else if (arg == "-f" || arg == "--file") { + } + if (arg == "-f" || arg == "--file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -245,51 +269,67 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (!params.prompt.empty() && params.prompt.back() == '\n') { params.prompt.pop_back(); } - } else if (arg == "-n" || arg == "--n-predict") { + } + if (arg == "-n" || arg == "--n-predict") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_predict = std::stoi(argv[i]); - } else if (arg == "--top-k") { + } + if (arg == "--top-k") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.top_k = std::stoi(argv[i]); - } else if (arg == "-c" || arg == "--ctx-size") { + } + if (arg == "-c" || arg == "--ctx-size") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_ctx = std::stoi(argv[i]); - } else if (arg == "--grp-attn-n" || arg == "-gan") { + } + if (arg == "--grp-attn-n" || arg == "-gan") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.grp_attn_n = std::stoi(argv[i]); - } else if (arg == "--grp-attn-w" || arg == "-gaw") { + } + if (arg == "--grp-attn-w" || arg == "-gaw") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.grp_attn_w = std::stoi(argv[i]); - } else if (arg == "--rope-freq-base") { + } + if (arg == "--rope-freq-base") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.rope_freq_base = std::stof(argv[i]); - } else if (arg == "--rope-freq-scale") { + } + if (arg == "--rope-freq-scale") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = std::stof(argv[i]); - } else if (arg == "--rope-scaling") { + } + if (arg == "--rope-scaling") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -299,43 +339,57 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } - } else if (arg == "--rope-scale") { + } + if (arg == "--rope-scale") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = 1.0f/std::stof(argv[i]); - } else if (arg == "--yarn-orig-ctx") { + } + if (arg == "--yarn-orig-ctx") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_orig_ctx = std::stoi(argv[i]); - } else if (arg == "--yarn-ext-factor") { + } + if (arg == "--yarn-ext-factor") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_ext_factor = std::stof(argv[i]); - } else if (arg == "--yarn-attn-factor") { + } + if (arg == "--yarn-attn-factor") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_attn_factor = std::stof(argv[i]); - } else if (arg == "--yarn-beta-fast") { + } + if (arg == "--yarn-beta-fast") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_beta_fast = std::stof(argv[i]); - } else if (arg == "--yarn-beta-slow") { + } + if (arg == "--yarn-beta-slow") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_beta_slow = std::stof(argv[i]); - } else if (arg == "--pooling") { + } + if (arg == "--pooling") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -345,118 +399,156 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } else { invalid_param = true; break; } - } else if (arg == "--defrag-thold" || arg == "-dt") { + } + if (arg == "--defrag-thold" || arg == "-dt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.defrag_thold = std::stof(argv[i]); - } else if (arg == "--samplers") { + } + if (arg == "--samplers") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } const auto sampler_names = string_split(argv[i], ';'); sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); - } else if (arg == "--sampling-seq") { + } + if (arg == "--sampling-seq") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.samplers_sequence = sampler_types_from_chars(argv[i]); - } else if (arg == "--top-p") { + } + if (arg == "--top-p") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.top_p = std::stof(argv[i]); - } else if (arg == "--min-p") { + } + if (arg == "--min-p") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.min_p = std::stof(argv[i]); - } else if (arg == "--temp") { + } + if (arg == "--temp") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.temp = std::stof(argv[i]); sparams.temp = std::max(sparams.temp, 0.0f); - } else if (arg == "--tfs") { + } + if (arg == "--tfs") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.tfs_z = std::stof(argv[i]); - } else if (arg == "--typical") { + } + if (arg == "--typical") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.typical_p = std::stof(argv[i]); - } else if (arg == "--repeat-last-n") { + } + if (arg == "--repeat-last-n") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_last_n = std::stoi(argv[i]); sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); - } else if (arg == "--repeat-penalty") { + } + if (arg == "--repeat-penalty") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_repeat = std::stof(argv[i]); - } else if (arg == "--frequency-penalty") { + } + if (arg == "--frequency-penalty") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_freq = std::stof(argv[i]); - } else if (arg == "--presence-penalty") { + } + if (arg == "--presence-penalty") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_present = std::stof(argv[i]); - } else if (arg == "--dynatemp-range") { + } + if (arg == "--dynatemp-range") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.dynatemp_range = std::stof(argv[i]); - } else if (arg == "--dynatemp-exp") { + } + if (arg == "--dynatemp-exp") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.dynatemp_exponent = std::stof(argv[i]); - } else if (arg == "--mirostat") { + } + if (arg == "--mirostat") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.mirostat = std::stoi(argv[i]); - } else if (arg == "--mirostat-lr") { + } + if (arg == "--mirostat-lr") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.mirostat_eta = std::stof(argv[i]); - } else if (arg == "--mirostat-ent") { + } + if (arg == "--mirostat-ent") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.mirostat_tau = std::stof(argv[i]); - } else if (arg == "--cfg-negative-prompt") { + } + if (arg == "--cfg-negative-prompt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.cfg_negative_prompt = argv[i]; - } else if (arg == "--cfg-negative-prompt-file") { + } + if (arg == "--cfg-negative-prompt-file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -471,86 +563,114 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { sparams.cfg_negative_prompt.pop_back(); } - } else if (arg == "--cfg-scale") { + } + if (arg == "--cfg-scale") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.cfg_scale = std::stof(argv[i]); - } else if (arg == "-b" || arg == "--batch-size") { + } + if (arg == "-b" || arg == "--batch-size") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_batch = std::stoi(argv[i]); - } else if (arg == "-ub" || arg == "--ubatch-size") { + } + if (arg == "-ub" || arg == "--ubatch-size") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_ubatch = std::stoi(argv[i]); - } else if (arg == "--keep") { + } + if (arg == "--keep") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_keep = std::stoi(argv[i]); - } else if (arg == "--draft") { + } + if (arg == "--draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_draft = std::stoi(argv[i]); - } else if (arg == "--chunks") { + } + if (arg == "--chunks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_chunks = std::stoi(argv[i]); - } else if (arg == "-np" || arg == "--parallel") { + } + if (arg == "-np" || arg == "--parallel") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_parallel = std::stoi(argv[i]); - } else if (arg == "-ns" || arg == "--sequences") { + } + if (arg == "-ns" || arg == "--sequences") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_sequences = std::stoi(argv[i]); - } else if (arg == "--p-split" || arg == "-ps") { + } + if (arg == "--p-split" || arg == "-ps") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.p_split = std::stof(argv[i]); - } else if (arg == "-m" || arg == "--model") { + } + if (arg == "-m" || arg == "--model") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.model = argv[i]; - } else if (arg == "-md" || arg == "--model-draft") { + } + if (arg == "-md" || arg == "--model-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.model_draft = argv[i]; - } else if (arg == "-a" || arg == "--alias") { + } + if (arg == "-a" || arg == "--alias") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.model_alias = argv[i]; - } else if (arg == "--lora") { + } + if (arg == "--lora") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.lora_adapter.emplace_back(argv[i], 1.0f); params.use_mmap = false; - } else if (arg == "--lora-scaled") { + } + if (arg == "--lora-scaled") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -562,55 +682,127 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); params.use_mmap = false; - } else if (arg == "--lora-base") { + } + if (arg == "--lora-base") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.lora_base = argv[i]; - } else if (arg == "--mmproj") { + } + if (arg == "--control-vector") { + arg_found = true; + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vectors.push_back({ 1.0f, argv[i], }); + } + if (arg == "--control-vector-scaled") { + arg_found = true; + if (++i >= argc) { + invalid_param = true; + break; + } + const char * fname = argv[i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vectors.push_back({ std::stof(argv[i]), fname, }); + } + if (arg == "--control-vector-layer-range") { + arg_found = true; + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vector_layer_start = std::stoi(argv[i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vector_layer_end = std::stoi(argv[i]); + } + if (arg == "--mmproj") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.mmproj = argv[i]; - } else if (arg == "--image") { + } + if (arg == "--image") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.image = argv[i]; - } else if (arg == "-i" || arg == "--interactive") { + } + if (arg == "-i" || arg == "--interactive") { + arg_found = true; params.interactive = true; - } else if (arg == "--embedding") { + } + if (arg == "--embedding") { + arg_found = true; params.embedding = true; - } else if (arg == "--interactive-first") { + } + if (arg == "--interactive-first") { + arg_found = true; params.interactive_first = true; - } else if (arg == "-ins" || arg == "--instruct") { + } + if (arg == "-ins" || arg == "--instruct") { + arg_found = true; params.instruct = true; - } else if (arg == "-cml" || arg == "--chatml") { + } + if (arg == "-cml" || arg == "--chatml") { + arg_found = true; params.chatml = true; - } else if (arg == "--infill") { + } + if (arg == "--infill") { + arg_found = true; params.infill = true; - } else if (arg == "-dkvc" || arg == "--dump-kv-cache") { + } + if (arg == "-dkvc" || arg == "--dump-kv-cache") { + arg_found = true; params.dump_kv_cache = true; - } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + } + if (arg == "-nkvo" || arg == "--no-kv-offload") { + arg_found = true; params.no_kv_offload = true; - } else if (arg == "-ctk" || arg == "--cache-type-k") { + } + if (arg == "-ctk" || arg == "--cache-type-k") { + arg_found = true; params.cache_type_k = argv[++i]; - } else if (arg == "-ctv" || arg == "--cache-type-v") { + } + if (arg == "-ctv" || arg == "--cache-type-v") { + arg_found = true; params.cache_type_v = argv[++i]; - } else if (arg == "--multiline-input") { + } + if (arg == "--multiline-input") { + arg_found = true; params.multiline_input = true; - } else if (arg == "--simple-io") { + } + if (arg == "--simple-io") { + arg_found = true; params.simple_io = true; - } else if (arg == "-cb" || arg == "--cont-batching") { + } + if (arg == "-cb" || arg == "--cont-batching") { + arg_found = true; params.cont_batching = true; - } else if (arg == "--color") { + } + if (arg == "--color") { + arg_found = true; params.use_color = true; - } else if (arg == "--mlock") { + } + if (arg == "--mlock") { + arg_found = true; params.use_mlock = true; - } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + } + if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -620,7 +812,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } - } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + } + if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -630,7 +824,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } - } else if (arg == "--main-gpu" || arg == "-mg") { + } + if (arg == "--main-gpu" || arg == "-mg") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -639,7 +835,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { #ifndef GGML_USE_CUBLAS_SYCL fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL - } else if (arg == "--split-mode" || arg == "-sm") { + } + if (arg == "--split-mode" || arg == "-sm") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -663,7 +861,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL - } else if (arg == "--tensor-split" || arg == "-ts") { + } + if (arg == "--tensor-split" || arg == "-ts") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -688,9 +888,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { #ifndef GGML_USE_CUBLAS_SYCL_VULKAN fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL - } else if (arg == "--no-mmap") { + } + if (arg == "--no-mmap") { + arg_found = true; params.use_mmap = false; - } else if (arg == "--numa") { + } + if (arg == "--numa") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -700,17 +904,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { invalid_param = true; break; } - } else if (arg == "--verbose-prompt") { + } + if (arg == "--verbose-prompt") { + arg_found = true; params.verbose_prompt = true; - } else if (arg == "--no-display-prompt") { + } + if (arg == "--no-display-prompt") { + arg_found = true; params.display_prompt = false; - } else if (arg == "-r" || arg == "--reverse-prompt") { + } + if (arg == "-r" || arg == "--reverse-prompt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.antiprompt.emplace_back(argv[i]); - } else if (arg == "-ld" || arg == "--logdir") { + } + if (arg == "-ld" || arg == "--logdir") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -720,63 +932,93 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.logdir.back() != DIRECTORY_SEPARATOR) { params.logdir += DIRECTORY_SEPARATOR; } - } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + } + if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.logits_file = argv[i]; - } else if (arg == "--perplexity" || arg == "--all-logits") { + } + if (arg == "--perplexity" || arg == "--all-logits") { + arg_found = true; params.logits_all = true; - } else if (arg == "--ppl-stride") { + } + if (arg == "--ppl-stride") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.ppl_stride = std::stoi(argv[i]); - } else if (arg == "-ptc" || arg == "--print-token-count") { + } + if (arg == "-ptc" || arg == "--print-token-count") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_print = std::stoi(argv[i]); - } else if (arg == "--ppl-output-type") { + } + if (arg == "--ppl-output-type") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.ppl_output_type = std::stoi(argv[i]); - } else if (arg == "--hellaswag") { + } + if (arg == "--hellaswag") { + arg_found = true; params.hellaswag = true; - } else if (arg == "--hellaswag-tasks") { + } + if (arg == "--hellaswag-tasks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.hellaswag_tasks = std::stoi(argv[i]); - } else if (arg == "--winogrande") { + } + if (arg == "--winogrande") { + arg_found = true; params.winogrande = true; - } else if (arg == "--winogrande-tasks") { + } + if (arg == "--winogrande-tasks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.winogrande_tasks = std::stoi(argv[i]); - } else if (arg == "--multiple-choice") { + } + if (arg == "--multiple-choice") { + arg_found = true; params.multiple_choice = true; - } else if (arg == "--multiple-choice-tasks") { + } + if (arg == "--multiple-choice-tasks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.multiple_choice_tasks = std::stoi(argv[i]); - } else if (arg == "--kl-divergence") { + } + if (arg == "--kl-divergence") { + arg_found = true; params.kl_divergence = true; - } else if (arg == "--ignore-eos") { + } + if (arg == "--ignore-eos") { + arg_found = true; params.ignore_eos = true; - } else if (arg == "--no-penalize-nl") { + } + if (arg == "--no-penalize-nl") { + arg_found = true; sparams.penalize_nl = false; - } else if (arg == "-l" || arg == "--logit-bias") { + } + if (arg == "-l" || arg == "--logit-bias") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -795,36 +1037,51 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - } else if (arg == "-h" || arg == "--help") { + } + if (arg == "-h" || arg == "--help") { + arg_found = true; return false; - - } else if (arg == "--version") { + } + if (arg == "--version") { + arg_found = true; fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); - } else if (arg == "--random-prompt") { + } + if (arg == "--random-prompt") { + arg_found = true; params.random_prompt = true; - } else if (arg == "--in-prefix-bos") { + } + if (arg == "--in-prefix-bos") { + arg_found = true; params.input_prefix_bos = true; - } else if (arg == "--in-prefix") { + } + if (arg == "--in-prefix") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.input_prefix = argv[i]; - } else if (arg == "--in-suffix") { + } + if (arg == "--in-suffix") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.input_suffix = argv[i]; - } else if (arg == "--grammar") { + } + if (arg == "--grammar") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.grammar = argv[i]; - } else if (arg == "--grammar-file") { + } + if (arg == "--grammar-file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -840,7 +1097,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::istreambuf_iterator(), std::back_inserter(sparams.grammar) ); - } else if (arg == "--override-kv") { + } + if (arg == "--override-kv") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -883,10 +1142,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.kv_overrides.push_back(kvo); #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters - } else if ( log_param_single_parse( argv[i] ) ) { + } + if ( log_param_single_parse( argv[i] ) ) { + arg_found = true; // Do nothing, log_param_single_parse automatically does it's thing // and returns if a match was found and parsed. - } else if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) { + } + if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) { + arg_found = true; // We have a matching known parameter requiring an argument, // now we need to check if there is anything after this argv // and flag invalid_param or parse it. @@ -900,7 +1163,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } // End of Parse args for logging parameters #endif // LOG_DISABLE_LOGS - } else { + } + + if (!arg_found) { throw std::invalid_argument("error: unknown argument: " + arg); } } @@ -1095,6 +1360,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + printf(" --control-vector FNAME\n"); + printf(" add a control vector\n"); + printf(" --control-vector-scaled FNAME S\n"); + printf(" add a control vector with user defined scaling S\n"); + printf(" --control-vector-layer-range START END\n"); + printf(" layer range to apply the control vector(s) to, start and end inclusive\n"); printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); printf(" -md FNAME, --model-draft FNAME\n"); @@ -1360,6 +1631,30 @@ std::tuple llama_init_from_gpt_par return std::make_tuple(nullptr, nullptr); } + if (!params.control_vectors.empty()) { + if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; + if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); + + const auto cvec = llama_control_vector_load(params.control_vectors); + if (cvec.n_embd == -1) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + + int err = llama_control_vector_apply(lctx, + cvec.data.data(), + cvec.data.size(), + cvec.n_embd, + params.control_vector_layer_start, + params.control_vector_layer_end); + if (err) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + } + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); @@ -1877,3 +2172,173 @@ void llama_embd_normalize(const float * inp, float * out, int n) { } } +float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){ + double sum = 0.0; + double sum1 = 0.0; + double sum2 = 0.0; + + for (int i = 0; i < n; i++) { + sum += embd1[i] * embd2[i]; + sum1 += embd1[i] * embd1[i]; + sum2 += embd2[i] * embd2[i]; + } + + return sum / (sqrt(sum1) * sqrt(sum2)); +} + +// +// Control vector utils +// + +static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) { + int32_t n_tensors; + + size_t n_bytes = 0; + + uint32_t max_direction_layer = 0; + + llama_control_vector_data result = { -1, {} }; + + // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer + { + struct ggml_init_params meta_params = { + /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), + /* .mem_buffer = */ nullptr, + /* .no_alloc = */ true, + }; + ggml_context * meta_ctx = ggml_init(meta_params); + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ true, + /* .ctx = */ &meta_ctx, + }; + struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); + if (!meta_ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + return result; + } + + n_tensors = gguf_get_n_tensors(meta_ctx_gguf); + for (int i = 0; i < n_tensors; i++) { + std::string name = gguf_get_tensor_name(meta_ctx_gguf, i); + + // split on '.' + size_t dotpos = name.find('.'); + if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") { + try { + uint32_t layer = std::stoi(name.substr(dotpos + 1)); + if (layer == 0) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + if (layer > max_direction_layer) { + max_direction_layer = layer; + } + } catch (...) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + } + + struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str()); + if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + if (result.n_embd == -1) { + result.n_embd = ggml_nelements(tensor_meta); + } else if (ggml_nelements(tensor_meta) != result.n_embd) { + fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + n_bytes += ggml_nbytes(tensor_meta); + } + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + } + + if (n_tensors == 0) { + fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); + return result; + } + + // load and scale tensors into final control vector context + struct ggml_init_params ggml_params = { + /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes, + /* .mem_buffer = */ nullptr, + /* .no_alloc = */ false, + }; + struct ggml_context * ctx = ggml_init(ggml_params); + + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); + ggml_free(ctx); + return result; + } + + // do not store data for layer 0 (it's not used) + result.data.resize(result.n_embd * max_direction_layer); + + for (uint32_t il = 1; il <= max_direction_layer; il++) { + const std::string name = "direction." + std::to_string(il); + const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); + + float * dst = result.data.data() + result.n_embd * (il - 1); + + if (tensor) { + const float * src = (const float *) tensor->data; + for (int j = 0; j < result.n_embd; j++) { + dst[j] = src[j] * load_info.strength; + } + } else { + for (int j = 0; j < result.n_embd; j++) { + dst[j] = 0.0f; + } + } + } + + return result; +} + +llama_control_vector_data llama_control_vector_load(const std::vector & load_infos) { + llama_control_vector_data result = { -1, {} }; + + for (const auto & info : load_infos) { + auto cur = llama_control_vector_load_one(info); + + if (cur.n_embd == -1) { + return result; + } + if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) { + fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str()); + return result; + } + + if (result.n_embd == -1) { + result = std::move(cur); + } else { + for (size_t i = 0; i < cur.data.size(); i++) { + result.data[i] += cur.data[i]; + } + } + } + + if (result.n_embd == -1) { + fprintf(stderr, "%s: no vectors passed\n", __func__); + } + + return result; +} diff --git a/common/common.h b/common/common.h index 0f178b9eb..687f3425e 100644 --- a/common/common.h +++ b/common/common.h @@ -37,10 +37,13 @@ extern char const *LLAMA_COMMIT; extern char const *LLAMA_COMPILER; extern char const *LLAMA_BUILD_TARGET; +struct llama_control_vector_load_info; + +int32_t get_num_physical_cores(); + // // CLI argument parsing // -int32_t get_num_physical_cores(); struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed @@ -103,6 +106,11 @@ struct gpt_params { std::vector> lora_adapter; // lora adapter path with user defined scale std::string lora_base = ""; // base model path for the lora adapter + std::vector control_vectors; // control vector with user defined scale + + int32_t control_vector_layer_start = -1; // layer range for control vector + int32_t control_vector_layer_end = -1; // layer range for control vector + int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line // (which is more convenient to use for plotting) @@ -268,3 +276,25 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40 void llama_embd_normalize(const float * inp, float * out, int n); +float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n); + +// +// Control vector utils +// + +struct llama_control_vector_data { + int n_embd; + + // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd + std::vector data; +}; + +struct llama_control_vector_load_info { + float strength; + + std::string fname; +}; + +// Load control vectors, scale each by strength, and add them together. +// On error, returns {-1, empty} +llama_control_vector_data llama_control_vector_load(const std::vector & load_infos); diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5eee32016..cf1f98d66 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1965,6 +1965,23 @@ class MambaModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("CohereForCausalLM") +class CommandR2Model(Model): + model_arch = gguf.MODEL_ARCH.COMMAND_R + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # max_position_embeddings = 8192 in config.json but model was actually + # trained on 128k context length + self.hparams["max_position_embeddings"] = self.hparams["model_max_length"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + ###### CONVERSION LOGIC ###### diff --git a/convert.py b/convert.py index c15f8c47e..161430f3e 100755 --- a/convert.py +++ b/convert.py @@ -332,6 +332,9 @@ class Params: # class BpeVocab: + tokenizer_model = "gpt2" + name = "bpe" + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) if isinstance(self.bpe_tokenizer.get('model'), dict): @@ -390,6 +393,9 @@ class BpeVocab: class SentencePieceVocab: + tokenizer_model = "llama" + name = "spm" + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) added_tokens: dict[str, int] @@ -453,6 +459,9 @@ class SentencePieceVocab: class HfVocab: + tokenizer_model = "llama" + name = "hfft" + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None: try: from transformers import AutoTokenizer @@ -553,7 +562,15 @@ class HfVocab: return f"" -Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab" +class NoVocab: + tokenizer_model = "no_vocab" + name = "no_vocab" + + def __repr__(self) -> str: + return "" + + +Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab" # @@ -935,8 +952,10 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N # Handle special case where the model's vocab size is not set if params.n_vocab == -1: raise ValueError( - f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?" + f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}" ) + if isinstance(vocab, NoVocab): + return # model has no vocab # Check for a vocab size mismatch if params.n_vocab == vocab.vocab_size: @@ -977,6 +996,7 @@ class OutputFile: name = str(params.path_model.parent).split('/')[-1] self.gguf.add_name (name) + self.gguf.add_vocab_size (params.n_vocab) self.gguf.add_context_length (params.n_ctx) self.gguf.add_embedding_length (params.n_embd) self.gguf.add_block_count (params.n_layer) @@ -1013,21 +1033,9 @@ class OutputFile: if params.ftype is not None: self.gguf.add_file_type(params.ftype) - def handle_tokenizer_model(self, vocab: Vocab) -> str: - # Map the vocab types to the supported tokenizer models - tokenizer_model = { - SentencePieceVocab: "llama", - HfVocab: "llama", - BpeVocab: "gpt2", - }.get(type(vocab)) - - # Block if vocab type is not predefined - if tokenizer_model is None: - raise ValueError("Unknown vocab type: Not supported") - - return tokenizer_model - def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]: + assert not isinstance(vocab, NoVocab) + tokens = [] scores = [] toktypes = [] @@ -1043,11 +1051,8 @@ class OutputFile: return tokens, scores, toktypes def add_meta_vocab(self, vocab: Vocab) -> None: - # Handle the tokenizer model - tokenizer_model = self.handle_tokenizer_model(vocab) - # Ensure that tokenizer_model is added to the GGUF model - self.gguf.add_tokenizer_model(tokenizer_model) + self.gguf.add_tokenizer_model(vocab.tokenizer_model) # Extract model vocabulary for model conversion tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab) @@ -1074,6 +1079,26 @@ class OutputFile: def write_tensor_info(self) -> None: self.gguf.write_ti_data_to_file() + def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None: + ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency) + if ftype == GGMLFileType.MostlyQ8_0: + ndarrays = bounded_parallel_map( + OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency, + use_processpool_executor=True, + ) + else: + ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) + + start = time.time() + for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): + elapsed = time.time() - start + size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) + padi = len(str(len(model))) + print( + f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" + ) + self.gguf.write_tensor_data(ndarray) + def close(self) -> None: self.gguf.close() @@ -1082,7 +1107,7 @@ class OutputFile: fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, ) -> None: - check_vocab_size(params, vocab, pad_vocab = pad_vocab) + check_vocab_size(params, vocab, pad_vocab=pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -1120,8 +1145,11 @@ class OutputFile: # meta data of.add_meta_arch(params) - of.add_meta_vocab(vocab) - of.add_meta_special_vocab(svocab) + if isinstance(vocab, NoVocab): + of.gguf.add_tokenizer_model(vocab.tokenizer_model) + else: + of.add_meta_vocab(vocab) + of.add_meta_special_vocab(svocab) # tensor info for name, lazy_tensor in model.items(): @@ -1131,24 +1159,7 @@ class OutputFile: of.write_tensor_info() # tensor data - ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency) - if ftype == GGMLFileType.MostlyQ8_0: - ndarrays = bounded_parallel_map( - OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency, - use_processpool_executor=True, - ) - else: - ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) - - start = time.time() - for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): - elapsed = time.time() - start - size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) - padi = len(str(len(model))) - print( - f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" - ) - of.gguf.write_tensor_data(ndarray) + of.write_tensor_data(ftype, model, concurrency) of.close() @@ -1309,8 +1320,8 @@ class VocabFactory: return vtype, path raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}") - def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab: - load_merges = vocabtype == "bpe" + def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab: + load_merges = vocab.name == "bpe" n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None return gguf.SpecialVocab( model_parent_path, @@ -1319,30 +1330,34 @@ class VocabFactory: n_vocab=n_vocab, ) - def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: + def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab: vocab_type, path = self._select_file(vocab_types) print(f"Loading vocab file {path!r}, type {vocab_type!r}") added_tokens_path = path.parent / "added_tokens.json" - vocab: Vocab if vocab_type == "bpe": - vocab = BpeVocab( + return BpeVocab( path, added_tokens_path if added_tokens_path.exists() else None ) - elif vocab_type == "spm": - vocab = SentencePieceVocab( + if vocab_type == "spm": + return SentencePieceVocab( path, added_tokens_path if added_tokens_path.exists() else None ) - elif vocab_type == "hfft": - vocab = HfVocab( + if vocab_type == "hfft": + return HfVocab( path.parent, added_tokens_path if added_tokens_path.exists() else None ) + raise ValueError(vocab_type) + + def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: + vocab: Vocab + if len(vocab_types) == 1 and "no_vocab" in vocab_types: + vocab = NoVocab() else: - raise ValueError(vocab_type) + vocab = self._create_vocab_by_path(vocab_types) # FIXME: Respect --vocab-dir? special_vocab = self._create_special_vocab( vocab, - vocab_type, model_parent_path, ) return vocab, special_vocab @@ -1380,6 +1395,7 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab") parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft") @@ -1392,6 +1408,10 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") args = parser.parse_args(args_in) + if args.no_vocab: + if args.vocab_only: + raise ValueError("no need to specify --vocab-only if using --no-vocab") + args.vocab_type = "no_vocab" if args.dump_single: model_plus = lazy_load_file(args.model) @@ -1442,7 +1462,7 @@ def main(args_in: list[str] | None = None) -> None: print(f"Wrote {outfile}") return - if model_plus.vocab is not None and args.vocab_dir is None: + if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab: vocab = model_plus.vocab print(f"Vocab info: {vocab}") diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 49302a199..cbf9aa2b5 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -112,13 +112,20 @@ int main(int argc, char ** argv) { // tokenize the prompts and trim std::vector> inputs; for (const auto & prompt : prompts) { - auto inp = ::llama_tokenize(ctx, prompt, true); + auto inp = ::llama_tokenize(ctx, prompt, true, false); if (inp.size() > n_batch) { inp.resize(n_batch); } inputs.push_back(inp); } + // add eos if not present + for (auto & inp : inputs) { + if (inp.empty() || inp.back() != llama_token_eos(model)) { + inp.push_back(llama_token_eos(model)); + } + } + // tokenization stats if (params.verbose_prompt) { for (int i = 0; i < (int) inputs.size(); i++) { @@ -167,15 +174,26 @@ int main(int argc, char ** argv) { float * out = emb + p * n_embd; batch_decode(ctx, batch, out, s, n_embd); - // print first 3 embeddings - for (int j = 0; j < std::min(3, n_prompts); j++) { - fprintf(stderr, "embedding %d: ", j); - for (int i = 0; i < n_embd; i++) { - fprintf(stderr, "%f ", emb[j * n_embd + i]); + // print the first part of the embeddings + fprintf(stdout, "\n"); + for (int j = 0; j < n_prompts; j++) { + fprintf(stdout, "embedding %d: ", j); + for (int i = 0; i < std::min(16, n_embd); i++) { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); } - fprintf(stderr, "\n\n"); + fprintf(stdout, "\n"); + } + + // print cosine similarity matrix + fprintf(stdout, "\n"); + printf("cosine similarity matrix:\n\n"); + for (int i = 0; i < n_prompts; i++) { + for (int j = 0; j < n_prompts; j++) { + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f ", sim); + } + fprintf(stdout, "\n"); } - fprintf(stderr, "\n"); // clean up llama_print_timings(ctx); diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index e67be4fb2..5444503a5 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -211,6 +211,7 @@ static bool gguf_ex_read_1(const std::string & fname) { for (int j = 0; j < ggml_nelements(cur); ++j) { if (data[j] != 100 + i) { fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]); + gguf_free(ctx); return false; } } diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md new file mode 100644 index 000000000..64cc19204 --- /dev/null +++ b/examples/gritlm/README.md @@ -0,0 +1,62 @@ +## Generative Representational Instruction Tuning (GRIT) Example +[gritlm] a model which can generate embeddings as well as "normal" text +generation depending on the instructions in the prompt. + +* Paper: https://arxiv.org/pdf/2402.09906.pdf + +### Retrieval-Augmented Generation (RAG) use case +One use case for `gritlm` is to use it with RAG. If we recall how RAG works is +that we take documents that we want to use as context, to ground the large +language model (LLM), and we create token embeddings for them. We then store +these token embeddings in a vector database. + +When we perform a query, prompt the LLM, we will first create token embeddings +for the query and then search the vector database to retrieve the most +similar vectors, and return those documents so they can be passed to the LLM as +context. Then the query and the context will be passed to the LLM which will +have to _again_ create token embeddings for the query. But because gritlm is used +the first query can be cached and the second query tokenization generation does +not have to be performed at all. + +### Running the example +Download a Grit model: +```console +$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf +``` + +Run the example using the downloaded model: +```console +$ ./gritlm -m gritlm-7b_q4_1.gguf + +Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605 +Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103 +Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112 +Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547 + +Oh, brave adventurer, who dared to climb +The lofty peak of Mt. Fuji in the night, +When shadows lurk and ghosts do roam, +And darkness reigns, a fearsome sight. + +Thou didst set out, with heart aglow, +To conquer this mountain, so high, +And reach the summit, where the stars do glow, +And the moon shines bright, up in the sky. + +Through the mist and fog, thou didst press on, +With steadfast courage, and a steadfast will, +Through the darkness, thou didst not be gone, +But didst climb on, with a steadfast skill. + +At last, thou didst reach the summit's crest, +And gazed upon the world below, +And saw the beauty of the night's best, +And felt the peace, that only nature knows. + +Oh, brave adventurer, who dared to climb +The lofty peak of Mt. Fuji in the night, +Thou art a hero, in the eyes of all, +For thou didst conquer this mountain, so bright. +``` + +[gritlm]: https://github.com/ContextualAI/gritlm diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 3d4b085d6..52fd719b3 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -6,22 +6,6 @@ // #define GRIT_DEBUG -static float dot_product(const std::vector & v1, const std::vector & v2) { - float dot = 0.0f; - for (uint64_t i = 0; i < v1.size(); ++i) { - dot += v1[i] * v2[i]; - } - return dot; -} - -static float norm(const std::vector & v) { - return std::sqrt(dot_product(v, v)); -} - -static float cosine_similarity(const std::vector & v1, const std::vector & v2) { - return dot_product(v1, v2) / (norm(v1) * norm(v2)); -} - static std::vector> encode(llama_context * ctx, const std::vector & sentences, const std::string & instruction) { std::vector> result; @@ -203,10 +187,12 @@ int main(int argc, char * argv[]) { const std::vector> d_rep = encode(ctx, documents, gritlm_instruction("")); const std::vector> q_rep = encode(ctx, queries, gritlm_instruction(instruction)); - const float cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]); - const float cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]); - const float cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]); - const float cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]); + const int n_embd = llama_n_embd(mdl); + + const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); + const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); + const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd); + const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd); std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0); std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index bf94e7e7a..32eea7869 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -103,6 +104,7 @@ static std::string get_cpu_info() { } } } + fclose(f); } #endif // TODO: other platforms @@ -1122,15 +1124,19 @@ struct sql_printer : public printer { static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - //std::vector tokens(n_prompt, llama_token_bos(llama_get_model(ctx))); - //llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt, n_past, 0)); - //GGML_UNUSED(n_batch); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); + + std::vector tokens(n_batch); - std::vector tokens(n_batch, llama_token_bos(llama_get_model(ctx))); int n_processed = 0; while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); + tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; + for (int i = 1; i < n_tokens; i++) { + tokens[i] = std::rand() % n_vocab; + } llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0)); n_processed += n_tokens; } @@ -1141,11 +1147,15 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - llama_token token = llama_token_bos(llama_get_model(ctx)); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); + + llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; for (int i = 0; i < n_gen; i++) { llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0)); llama_synchronize(ctx); + token = std::rand() % n_vocab; } } diff --git a/examples/llava/README.md b/examples/llava/README.md index 35e6d9e5d..67cb0f22b 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -63,12 +63,20 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director ```console git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b ``` -2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: + +2) Install the required Python packages: + +```sh +pip install -r examples/llava/requirements.txt +``` + +3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: ```console python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/ ``` - you will find a llava.projector and a llava.clip file in your model directory -3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: + +4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: ```console mkdir vit cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin @@ -76,18 +84,18 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json ``` -4) Create the visual gguf model: +5) Create the visual gguf model: ```console python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision ``` - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP -5) Then convert the model to gguf format: +6) Then convert the model to gguf format: ```console python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown ``` -6) And finally we can run the llava-cli using the 1.6 model version: +7) And finally we can run the llava-cli using the 1.6 model version: ```console ./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 ``` diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 6653b815d..a0ed82d7e 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -995,6 +995,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { if (!new_clip->ctx_data) { fprintf(stderr, "%s: ggml_init() failed\n", __func__); clip_free(new_clip); + gguf_free(ctx); return nullptr; } @@ -1002,6 +1003,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { if (!fin) { printf("cannot open model file for loading tensors\n"); clip_free(new_clip); + gguf_free(ctx); return nullptr; } @@ -1023,6 +1025,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { if (!fin) { printf("%s: failed to seek for tensor %s\n", __func__, name); clip_free(new_clip); + gguf_free(ctx); return nullptr; } int num_bytes = ggml_nbytes(cur); @@ -1232,16 +1235,16 @@ struct clip_image_f32 * clip_image_f32_init() { void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } -void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) { - if (batch.size > 0) { - delete[] batch.data; - batch.size = 0; +void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { + if (batch->size > 0) { + delete[] batch->data; + batch->size = 0; } } -void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) { - if (batch.size > 0) { - delete[] batch.data; - batch.size = 0; +void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { + if (batch->size > 0) { + delete[] batch->data; + batch->size = 0; } } @@ -1494,7 +1497,7 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) { +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { bool pad_to_square = true; if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -1506,11 +1509,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli pad_to_square = false; } // free the previous res_imgs if any set - if (res_imgs.size > 0) { + if (res_imgs->size > 0) { clip_image_f32_batch_free(res_imgs); } - res_imgs.data = nullptr; - res_imgs.size = 0; + res_imgs->data = nullptr; + res_imgs->size = 0; // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 @@ -1565,11 +1568,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square patches.insert(patches.begin(), image_original_resize); // clip_image_f32_batch_init(patches.size()); - res_imgs.size = patches.size(); - res_imgs.data = new clip_image_f32[res_imgs.size]; + res_imgs->size = patches.size(); + res_imgs->data = new clip_image_f32[res_imgs->size]; int num=0; for (auto& patch : patches) { - normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std); + normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std); num++; } @@ -1657,9 +1660,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli // } // res_imgs.push_back(res); - res_imgs.size = 1; - res_imgs.data = new clip_image_f32[res_imgs.size]; - res_imgs.data[0] = *res; + res_imgs->size = 1; + res_imgs->data = new clip_image_f32[res_imgs->size]; + res_imgs->data[0] = *res; clip_image_f32_free(res); return true; @@ -1908,6 +1911,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i break; default: printf("Please use an input file in f32 or f16\n"); + gguf_free(ctx_out); return false; } diff --git a/examples/llava/clip.h b/examples/llava/clip.h index e5bd54924..45bdad689 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -60,8 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init(); CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); -CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch); -CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch); +CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); +CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); @@ -69,7 +69,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); /** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */ -CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs ); +CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 980128166..29764757a 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -223,7 +223,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli clip_image_f32_batch img_res_v; img_res_v.size = 0; img_res_v.data = nullptr; - if (!clip_image_preprocess(ctx_clip, img, img_res_v)) { + if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { fprintf(stderr, "%s: unable to preprocess image\n", __func__); delete[] img_res_v.data; return false; diff --git a/examples/llava/llava.h b/examples/llava/llava.h index 2d40f3f1d..19212f6e9 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -29,9 +29,9 @@ struct llava_image_embed { }; /** sanity check for clip <-> llava embed size match */ -LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); +LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip); -LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); +LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); /** build an image embed from image file bytes */ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index cfa9f96ec..a59a52d21 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -119,6 +119,10 @@ def step_server_metrics(context): def step_start_server(context): start_server_background(context) attempts = 0 + max_attempts = 20 + if 'GITHUB_ACTIONS' in os.environ: + max_attempts *= 2 + while True: with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: result = sock.connect_ex((context.server_fqdn, context.server_port)) @@ -126,7 +130,7 @@ def step_start_server(context): print("\x1b[33;46mserver started!\x1b[0m") return attempts += 1 - if attempts > 20: + if attempts > max_attempts: assert False, "server not started" print(f"waiting for server to start, connect error code = {result}...") time.sleep(0.1) @@ -943,6 +947,9 @@ async def wait_for_health_status(context, print(f"Starting checking for health for expected_health_status={expected_health_status}\n") interval = 0.5 counter = 0 + if 'GITHUB_ACTIONS' in os.environ: + timeout *= 2 + async with aiohttp.ClientSession() as session: while True: async with await session.get(f'{base_url}/health', params=params) as health_response: diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh index 26ad2f7da..f20391d7a 100755 --- a/examples/sycl/build.sh +++ b/examples/sycl/build.sh @@ -13,8 +13,11 @@ source /opt/intel/oneapi/setvars.sh #for FP32 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -#build example/main only +#build example/main #cmake --build . --config Release --target main +#build example/llama-bench +#cmake --build . --config Release --target llama-bench + #build all binary cmake --build . --config Release -v diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index 52f7c01a4..c979a52f6 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -9,18 +9,28 @@ source /opt/intel/oneapi/setvars.sh if [ $# -gt 0 ]; then GGML_SYCL_DEVICE=$1 + GGML_SYCL_SINGLE_GPU=1 else GGML_SYCL_DEVICE=0 fi -echo "use $GGML_SYCL_DEVICE as main GPU" + #export GGML_SYCL_DEBUG=1 #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer. -#use all GPUs with same max compute units -ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 +if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then + echo "use $GGML_SYCL_DEVICE as main GPU" + #use signle GPU only + ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none +else + #use multiple GPUs with same max compute units + ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 +fi #use main GPU only #ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none +#use multiple GPUs with same max compute units +#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 + diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7eafe8515..7d06e401b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -711,6 +711,7 @@ static bool load_checkpoint_file(const char * filename, struct my_llama_model * load_checkpoint_gguf(fctx, f_ggml_ctx, model, train); + gguf_free(fctx); return true; } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d1b5e52ba..db595409a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -11541,6 +11541,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev if (ggml_backend_is_cuda(event->backend)) { CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0)); } else { +#if 0 // untested auto wait_fn = [](void * user_data) { ggml_backend_event_t event = (ggml_backend_event_t)user_data; @@ -11548,6 +11549,8 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev }; CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event)); +#endif + GGML_ASSERT(false); } } diff --git a/ggml-metal.m b/ggml-metal.m index 3a5476c52..c3451a79b 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -280,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { id metal_library; // load library + // + // - first check if the library is embedded + // - then check if the library is in the bundle + // - if not found, load the source and compile it + // - if that fails, return NULL { NSBundle * bundle = nil; #ifdef SWIFT_PACKAGE @@ -287,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { #else bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; #endif + NSError * error = nil; - NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"]; - if (libPath != nil) { + +#if GGML_METAL_EMBED_LIBRARY + const bool try_metallib = false; +#else + const bool try_metallib = true; +#endif + + NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"]; + if (try_metallib && path_lib != nil) { // pre-compiled library found - NSURL * libURL = [NSURL fileURLWithPath:libPath]; - GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]); + NSURL * libURL = [NSURL fileURLWithPath:path_lib]; + GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]); + metal_library = [ctx->device newLibraryWithURL:libURL error:&error]; if (error) { GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); @@ -305,31 +319,34 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { extern const char ggml_metallib_start[]; extern const char ggml_metallib_end[]; - NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; + NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; #else GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); - NSString * sourcePath; - NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"]; + NSString * path_source; + NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"]; - GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil"); + GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil"); - if (ggmlMetalPathResources) { - sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"]; + if (path_resource) { + path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"]; } else { - sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; + path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; } - if (sourcePath == nil) { + + if (path_source == nil) { GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); - sourcePath = @"ggml-metal.metal"; + path_source = @"ggml-metal.metal"; } - GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]); - NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error]; + + GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]); + + NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error]; if (error) { GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } -#endif +#endif // GGML_METAL_EMBED_LIBRARY @autoreleasepool { // dictionary of preprocessor macros diff --git a/ggml-metal.metal b/ggml-metal.metal index ebf2f5b47..63de56325 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -4,9 +4,6 @@ #include -#define GGML_COMMON_IMPL_METAL -#include "ggml-common.h" - using namespace metal; #define MAX(x, y) ((x) > (y) ? (x) : (y)) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 15aeb993f..c7bed0c8f 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -24,10 +25,9 @@ #include #include #include - #include #include - +#include #include #include @@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp #define __dpct_noinline__ __attribute__((noinline)) #endif + +std::string get_device_type_name(const sycl::device &Device) { + auto DeviceType = Device.get_info(); + switch (DeviceType) { + case sycl::info::device_type::cpu: + return "cpu"; + case sycl::info::device_type::gpu: + return "gpu"; + case sycl::info::device_type::host: + return "host"; + case sycl::info::device_type::accelerator: + return "acc"; + default: + return "unknown"; + } +} + +std::string get_device_backend_and_type(const sycl::device &device) { + std::stringstream device_type; + sycl::backend backend = device.get_backend(); + device_type << backend << ":" << get_device_type_name(device); + return device_type.str(); +} + namespace dpct { typedef sycl::queue *queue_ptr; @@ -942,17 +966,65 @@ namespace dpct private: mutable std::recursive_mutex m_mutex; + static bool compare_dev(sycl::device &device1, sycl::device &device2) + { + dpct::device_info prop1; + dpct::get_device_info(prop1, device1); + dpct::device_info prop2; + dpct::get_device_info(prop2, device2); + return prop1.get_max_compute_units() > prop2.get_max_compute_units(); + } + static int convert_backend_index(std::string & backend) { + if (backend == "ext_oneapi_level_zero:gpu") return 0; + if (backend == "opencl:gpu") return 1; + if (backend == "opencl:cpu") return 2; + if (backend == "opencl:acc") return 3; + printf("convert_backend_index: can't handle backend=%s\n", backend.c_str()); + GGML_ASSERT(false); + } + static bool compare_backend(std::string &backend1, std::string &backend2) { + return convert_backend_index(backend1) < convert_backend_index(backend2); + } dev_mgr() { sycl::device default_device = sycl::device(sycl::default_selector_v); _devs.push_back(std::make_shared(default_device)); - std::vector sycl_all_devs = - sycl::device::get_devices(sycl::info::device_type::all); + std::vector sycl_all_devs; // Collect other devices except for the default device. if (default_device.is_cpu()) _cpu_device = 0; + + auto Platforms = sycl::platform::get_platforms(); + // Keep track of the number of devices per backend + std::map DeviceNums; + std::map> backend_devices; + + while (!Platforms.empty()) { + auto Platform = Platforms.back(); + Platforms.pop_back(); + auto devices = Platform.get_devices(); + std::string backend_type = get_device_backend_and_type(devices[0]); + for (const auto &device : devices) { + backend_devices[backend_type].push_back(device); + } + } + + std::vector keys; + for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) { + keys.push_back(it->first); + } + std::sort(keys.begin(), keys.end(), compare_backend); + + for (auto &key : keys) { + std::vector devs = backend_devices[key]; + std::sort(devs.begin(), devs.end(), compare_dev); + for (const auto &dev : devs) { + sycl_all_devs.push_back(dev); + } + } + for (auto &dev : sycl_all_devs) { if (dev == default_device) @@ -3202,6 +3274,11 @@ static int g_work_group_size = 0; #define GGML_SYCL_MMV_Y 1 #endif +enum ggml_sycl_backend_gpu_mode { + SYCL_UNSET_GPU_MODE = -1, + SYCL_SINGLE_GPU_MODE = 0, + SYCL_MUL_GPU_MODE +}; static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); @@ -3401,12 +3478,31 @@ class sycl_gpu_mgr { int work_group_size = 0; std::string gpus_list = ""; + /* + Use all GPUs with same top max compute units + */ sycl_gpu_mgr() { detect_sycl_gpu_list_with_max_cu(); get_allow_gpus(); create_context_with_gpus(); } + /* + Only use the assigned GPU + */ + sycl_gpu_mgr(int main_gpu_id) { + sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id); + dpct::device_info prop; + dpct::get_device_info(prop, device); + gpus.push_back(main_gpu_id); + devices.push_back(device); + work_group_size = prop.get_max_work_group_size(); + max_compute_units = prop.get_max_compute_units(); + + get_allow_gpus(); + create_context_with_gpus(); + } + void create_context_with_gpus() { sycl::context ctx = sycl::context(devices); assert(gpus.size() > 0); @@ -3422,7 +3518,7 @@ class sycl_gpu_mgr { gpus_list += std::to_string(gpus[i]); gpus_list += ","; } - if (gpus_list.length() > 2) { + if (gpus_list.length() > 1) { gpus_list.pop_back(); } } @@ -3451,7 +3547,7 @@ class sycl_gpu_mgr { dpct::device_info prop; dpct::get_device_info(prop, device); if (max_compute_units == prop.get_max_compute_units() && - prop.get_major_version() == 1) { + is_ext_oneapi_device(device)) { gpus.push_back(id); devices.push_back(device); work_group_size = prop.get_max_work_group_size(); @@ -3471,8 +3567,8 @@ class sycl_gpu_mgr { if (gpus[i] == id) return i; } - assert(false); - return -1; + printf("miss to get device index by id=%d\n", id); + GGML_ASSERT(false); } int get_next_index(int id) { @@ -3481,8 +3577,16 @@ class sycl_gpu_mgr { if (gpus[i] == id) return i; } - assert(false); - return -1; + GGML_ASSERT(false); + } + + bool is_ext_oneapi_device(const sycl::device &dev) { + sycl::backend dev_backend = dev.get_backend(); + if (dev_backend == sycl::backend::ext_oneapi_level_zero || + dev_backend == sycl::backend::ext_oneapi_cuda || + dev_backend == sycl::backend::ext_oneapi_hip) + return true; + return false; } }; @@ -3491,11 +3595,14 @@ static int g_device_count = -1; static int g_all_sycl_device_count = -1; static int g_main_device = -1; static int g_main_device_id = -1; +static bool g_ggml_backend_sycl_buffer_type_initialized = false; static std::array g_default_tensor_split = {}; static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0}; +static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE; + struct sycl_device_capabilities { int cc; // compute capability bool vmm; // virtual memory support @@ -13190,17 +13297,20 @@ bool ggml_sycl_loaded(void) { return g_sycl_loaded; } -void print_device_detail(int id) { +void print_device_detail(int id, sycl::device &device, std::string device_type) { + dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR( - dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id)))); - sycl::device cur_device = dpct::dev_mgr::instance().get_device(id); + dpct::get_device_info(prop, device))); + std::string version; version += std::to_string(prop.get_major_version()); version += "."; version += std::to_string(prop.get_minor_version()); - fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id, + device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), ""); + + fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(), prop.get_name(), version.c_str(), prop.get_max_compute_units(), prop.get_max_work_group_size(), prop.get_max_sub_group_size(), prop.get_global_mem_size()); @@ -13208,19 +13318,35 @@ void print_device_detail(int id) { void ggml_backend_sycl_print_sycl_devices() { int device_count = dpct::dev_mgr::instance().device_count(); + std::map DeviceNums; fprintf(stderr, "found %d SYCL devices:\n", device_count); - fprintf(stderr, "|ID| Name |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n"); - fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n"); + fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n"); + fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n"); + fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n"); for (int id = 0; id < device_count; ++id) { - print_device_detail(id); + sycl::device device = dpct::dev_mgr::instance().get_device(id); + sycl::backend backend = device.get_backend(); + std::string backend_type = get_device_backend_and_type(device); + int type_id=DeviceNums[backend_type]++; + std::stringstream device_type; + device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]"; + print_device_detail(id, device, device_type.str()); } } void print_gpu_device_list() { - fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n", - g_sycl_gpu_mgr->get_gpu_count(), - g_sycl_gpu_mgr->gpus_list.c_str(), - g_sycl_gpu_mgr->max_compute_units); + GGML_ASSERT(g_sycl_gpu_mgr); + + char* hint=NULL; + if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) { + hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n"; + } else { + hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n"; + } + fprintf(stderr, hint, + g_sycl_gpu_mgr->get_gpu_count(), + g_sycl_gpu_mgr->gpus_list.c_str(), + g_sycl_gpu_mgr->max_compute_units); } int get_sycl_env(const char *env_name, int default_val) { @@ -13256,23 +13382,6 @@ void ggml_init_sycl() try { #else fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); #endif - if (CHECK_TRY_ERROR(g_all_sycl_device_count = - dpct::dev_mgr::instance().device_count()) != 0) { - initialized = true; - g_sycl_loaded = false; - return; - } - GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); - ggml_backend_sycl_print_sycl_devices(); - - if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr(); - - g_device_count = g_sycl_gpu_mgr->get_gpu_count(); - g_work_group_size = g_sycl_gpu_mgr->work_group_size; - - print_gpu_device_list(); - - int64_t total_vram = 0; /* NOT REMOVE, keep it for next optimize for XMX. #if defined(SYCL_USE_XMX) @@ -13281,49 +13390,15 @@ void ggml_init_sycl() try { fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); #endif */ - for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) { - g_device_caps[id].vmm = 0; - g_device_caps[id].device_id = -1; - g_device_caps[id].cc = 0; - g_tensor_split[id] = 0; - g_default_tensor_split[id] = 0; + + if (CHECK_TRY_ERROR(g_all_sycl_device_count = + dpct::dev_mgr::instance().device_count()) != 0) { + initialized = true; + g_sycl_loaded = false; + return; } - - for (int i = 0; i < g_device_count; ++i) { - int device_id = g_sycl_gpu_mgr->gpus[i]; - g_device_caps[i].vmm = 0; - - dpct::device_info prop; - SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(device_id)))); - - g_default_tensor_split[i] = total_vram; - total_vram += prop.get_global_mem_size(); - - g_device_caps[i].cc = - 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - } - - for (int i = 0; i < g_device_count; ++i) { - g_default_tensor_split[i] /= total_vram; - } - - for (int i = 0; i < g_device_count; ++i) { - SYCL_CHECK(ggml_sycl_set_device(i)); - - // create sycl streams - for (int is = 0; is < MAX_STREAMS; ++is) { - SYCL_CHECK(CHECK_TRY_ERROR( - g_syclStreams[i][is] = - dpct::get_current_device().create_queue( - g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device()))); - } - - const dpct::queue_ptr stream = g_syclStreams[i][0]; - // create sycl handle - SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream)); - } - + GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); + ggml_backend_sycl_print_sycl_devices(); initialized = true; g_sycl_loaded = true; } @@ -13334,6 +13409,63 @@ catch (sycl::exception const &exc) { std::exit(1); } +void ggml_init_by_gpus(int device_count) try { + g_device_count = device_count; + g_work_group_size = g_sycl_gpu_mgr->work_group_size; + + int64_t total_vram = 0; + + print_gpu_device_list(); + + for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) { + g_device_caps[id].vmm = 0; + g_device_caps[id].device_id = -1; + g_device_caps[id].cc = 0; + g_tensor_split[id] = 0; + g_default_tensor_split[id] = 0; + } + + for (int i = 0; i < g_device_count; ++i) { + int device_id = g_sycl_gpu_mgr->gpus[i]; + g_device_caps[i].vmm = 0; + + dpct::device_info prop; + SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(device_id)))); + + g_default_tensor_split[i] = total_vram; + total_vram += prop.get_global_mem_size(); + + g_device_caps[i].cc = + 100 * prop.get_major_version() + 10 * prop.get_minor_version(); + } + + for (int i = 0; i < g_device_count; ++i) { + g_default_tensor_split[i] /= total_vram; + } + + for (int i = 0; i < g_device_count; ++i) { + SYCL_CHECK(ggml_sycl_set_device(i)); + + // create sycl streams + for (int is = 0; is < MAX_STREAMS; ++is) { + SYCL_CHECK(CHECK_TRY_ERROR( + g_syclStreams[i][is] = + dpct::get_current_device().create_queue( + g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device()))); + } + + const dpct::queue_ptr stream = g_syclStreams[i][0]; + // create sycl handle + SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream)); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + void *ggml_sycl_host_malloc(size_t size) try { if (getenv("GGML_SYCL_NO_PINNED") != nullptr) { return nullptr; @@ -16737,22 +16869,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { /* .is_host = */ nullptr, }; -ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { +ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) { + if (device_index>=g_device_count or device_index<0) { + printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", + device_index, g_device_count-1); + GGML_ASSERT(device_indexgpus[i])}, }; } - ggml_backend_sycl_buffer_type_initialized = true; + g_ggml_backend_sycl_buffer_type_initialized = true; } - - return &ggml_backend_sycl_buffer_types[device]; + return &ggml_backend_sycl_buffer_types[device_index]; } // sycl split buffer type @@ -17504,11 +17638,42 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) { return g_sycl_gpu_mgr->get_index(device_id); } +GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) { + return g_sycl_gpu_mgr->gpus[device_index]; +} + +GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) { + GGML_ASSERT(main_gpu_idget_gpu_count()); + g_ggml_backend_sycl_buffer_type_initialized = false; +} + +GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() { + if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) { + return; + } + + fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n"); + + if (g_sycl_gpu_mgr) { + delete g_sycl_gpu_mgr; + } + g_sycl_gpu_mgr = new sycl_gpu_mgr(); + g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE; + ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count()); + g_ggml_backend_sycl_buffer_type_initialized = false; +} + extern "C" int ggml_backend_sycl_reg_devices(); int ggml_backend_sycl_reg_devices() { - if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr(); - g_device_count = g_sycl_gpu_mgr->get_gpu_count(); + ggml_backend_sycl_set_mul_device_mode(); assert(g_device_count>0); for (int i = 0; i < g_device_count; i++) { int id = g_sycl_gpu_mgr->gpus[i]; diff --git a/ggml-sycl.h b/ggml-sycl.h index bf5b11b36..c549a64a1 100644 --- a/ggml-sycl.h +++ b/ggml-sycl.h @@ -29,6 +29,11 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); +// TODO: these are temporary +// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670 +GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index); +GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id); +GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode(); #ifdef __cplusplus } #endif diff --git a/ggml.c b/ggml.c index fbc66f65b..fa23cb3c4 100644 --- a/ggml.c +++ b/ggml.c @@ -470,6 +470,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(int32_t), .is_quantized = false, }, + [GGML_TYPE_I64] = { + .type_name = "i64", + .blck_size = 1, + .type_size = sizeof(int64_t), + .is_quantized = false, + }, + [GGML_TYPE_F64] = { + .type_name = "f64", + .blck_size = 1, + .type_size = sizeof(double), + .is_quantized = false, + .nrows = 1, + }, [GGML_TYPE_F32] = { .type_name = "f32", .blck_size = 1, @@ -918,6 +931,101 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE #endif +#elif defined(__AVX512F__) + +#define GGML_SIMD + +// F32 AVX512 + +#define GGML_F32_STEP 64 +#define GGML_F32_EPR 16 + +#define GGML_F32x16 __m512 +#define GGML_F32x16_ZERO _mm512_setzero_ps() +#define GGML_F32x16_SET1(x) _mm512_set1_ps(x) +#define GGML_F32x16_LOAD _mm512_loadu_ps +#define GGML_F32x16_STORE _mm512_storeu_ps +// _mm512_fmadd_ps is defined in AVX512F so no guard is required +#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) +#define GGML_F32x16_ADD _mm512_add_ps +#define GGML_F32x16_MUL _mm512_mul_ps +#define GGML_F32x16_REDUCE(res, x) \ +do { \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + res = _mm512_reduce_add_ps(x[0]); \ +} while (0) + +// TODO: is this optimal ? + +#define GGML_F32_VEC GGML_F32x16 +#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x16_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD +#define GGML_F32_VEC_STORE GGML_F32x16_STORE +#define GGML_F32_VEC_FMA GGML_F32x16_FMA +#define GGML_F32_VEC_ADD GGML_F32x16_ADD +#define GGML_F32_VEC_MUL GGML_F32x16_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE + +// F16 AVX512 + +// F16 AVX + +#define GGML_F16_STEP 64 +#define GGML_F16_EPR 16 + +// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead + +#define GGML_F32Cx16 __m512 +#define GGML_F32Cx16_ZERO _mm512_setzero_ps() +#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x) + +// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F +// so F16C guard isn't required +#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x))) +#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0)) + +#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) +#define GGML_F32Cx16_ADD _mm512_add_ps +#define GGML_F32Cx16_MUL _mm512_mul_ps +#define GGML_F32Cx16_REDUCE(res, x) \ +do { \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + res = _mm512_reduce_add_ps(x[0]); \ +} while (0) + +#define GGML_F16_VEC GGML_F32Cx16 +#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO +#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL +#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE + #elif defined(__AVX__) #define GGML_SIMD @@ -12418,6 +12526,8 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: + case GGML_TYPE_I64: + case GGML_TYPE_F64: case GGML_TYPE_COUNT: { GGML_ASSERT(false); @@ -12504,6 +12614,8 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: + case GGML_TYPE_I64: + case GGML_TYPE_F64: case GGML_TYPE_COUNT: { GGML_ASSERT(false); diff --git a/ggml.h b/ggml.h index 1171088a9..c937d4a53 100644 --- a/ggml.h +++ b/ggml.h @@ -337,24 +337,24 @@ extern "C" { struct ggml_object; struct ggml_context; + // NOTE: always add types at the end of the enum to keep backward compatibility enum ggml_type { - GGML_TYPE_F32 = 0, - GGML_TYPE_F16 = 1, - GGML_TYPE_Q4_0 = 2, - GGML_TYPE_Q4_1 = 3, + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, // GGML_TYPE_Q4_2 = 4, support has been removed - // GGML_TYPE_Q4_3 (5) support has been removed - GGML_TYPE_Q5_0 = 6, - GGML_TYPE_Q5_1 = 7, - GGML_TYPE_Q8_0 = 8, - GGML_TYPE_Q8_1 = 9, - // k-quantizations - GGML_TYPE_Q2_K = 10, - GGML_TYPE_Q3_K = 11, - GGML_TYPE_Q4_K = 12, - GGML_TYPE_Q5_K = 13, - GGML_TYPE_Q6_K = 14, - GGML_TYPE_Q8_K = 15, + // GGML_TYPE_Q4_3 = 5, support has been removed + GGML_TYPE_Q5_0 = 6, + GGML_TYPE_Q5_1 = 7, + GGML_TYPE_Q8_0 = 8, + GGML_TYPE_Q8_1 = 9, + GGML_TYPE_Q2_K = 10, + GGML_TYPE_Q3_K = 11, + GGML_TYPE_Q4_K = 12, + GGML_TYPE_Q5_K = 13, + GGML_TYPE_Q6_K = 14, + GGML_TYPE_Q8_K = 15, GGML_TYPE_IQ2_XXS = 16, GGML_TYPE_IQ2_XS = 17, GGML_TYPE_IQ3_XXS = 18, @@ -363,9 +363,11 @@ extern "C" { GGML_TYPE_IQ3_S = 21, GGML_TYPE_IQ2_S = 22, GGML_TYPE_IQ4_XS = 23, - GGML_TYPE_I8, - GGML_TYPE_I16, - GGML_TYPE_I32, + GGML_TYPE_I8 = 24, + GGML_TYPE_I16 = 25, + GGML_TYPE_I32 = 26, + GGML_TYPE_I64 = 27, + GGML_TYPE_F64 = 28, GGML_TYPE_COUNT, }; @@ -383,20 +385,20 @@ extern "C" { // model file types enum ggml_ftype { - GGML_FTYPE_UNKNOWN = -1, - GGML_FTYPE_ALL_F32 = 0, - GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + GGML_FTYPE_UNKNOWN = -1, + GGML_FTYPE_ALL_F32 = 0, + GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors - GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors - GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors - GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors - GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors + GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors + GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors + GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b23badb10..4a4facb06 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -32,6 +32,7 @@ class Keys: FILE_TYPE = "general.file_type" class LLM: + VOCAB_SIZE = "{arch}.vocab_size" CONTEXT_LENGTH = "{arch}.context_length" EMBEDDING_LENGTH = "{arch}.embedding_length" BLOCK_COUNT = "{arch}.block_count" @@ -41,6 +42,7 @@ class Keys: EXPERT_COUNT = "{arch}.expert_count" EXPERT_USED_COUNT = "{arch}.expert_used_count" POOLING_TYPE = "{arch}.pooling_type" + LOGIT_SCALE = "{arch}.logit_scale" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -120,6 +122,7 @@ class MODEL_ARCH(IntEnum): GEMMA = auto() STARCODER2 = auto() MAMBA = auto() + COMMAND_R = auto() class MODEL_TENSOR(IntEnum): @@ -186,6 +189,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.MAMBA: "mamba", + MODEL_ARCH.COMMAND_R: "command-r", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -578,6 +582,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.SSM_D, MODEL_TENSOR.SSM_OUT, ], + MODEL_ARCH.COMMAND_R: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } @@ -661,6 +677,11 @@ class GGMLQuantizationType(IntEnum): IQ3_S = 21 IQ2_S = 22 IQ4_XS = 23 + I8 = 24 + I16 = 25 + I32 = 26 + I64 = 27 + F64 = 28 class GGUFEndian(IntEnum): @@ -727,6 +748,11 @@ GGML_QUANT_SIZES = { GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), + GGMLQuantizationType.I8: (1, 1), + GGMLQuantizationType.I16: (1, 2), + GGMLQuantizationType.I32: (1, 4), + GGMLQuantizationType.I64: (1, 8), + GGMLQuantizationType.F64: (1, 8), } @@ -746,6 +772,7 @@ KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE # LLM +KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 5b6d4ba6b..33afac552 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -242,12 +242,27 @@ class GGUFReader: n_bytes = n_elems * type_size // block_size data_offs = int(start_offs + offset_tensor[0]) item_type: npt.DTypeLike - if ggml_type == GGMLQuantizationType.F32: - item_count = n_elems - item_type = np.float32 - elif ggml_type == GGMLQuantizationType.F16: + if ggml_type == GGMLQuantizationType.F16: item_count = n_elems item_type = np.float16 + elif ggml_type == GGMLQuantizationType.F32: + item_count = n_elems + item_type = np.float32 + elif ggml_type == GGMLQuantizationType.F64: + item_count = n_elems + item_type = np.float64 + elif ggml_type == GGMLQuantizationType.I8: + item_count = n_elems + item_type = np.int8 + elif ggml_type == GGMLQuantizationType.I16: + item_count = n_elems + item_type = np.int16 + elif ggml_type == GGMLQuantizationType.I32: + item_count = n_elems + item_type = np.int32 + elif ggml_type == GGMLQuantizationType.I64: + item_count = n_elems + item_type = np.int64 else: item_count = n_bytes item_type = np.uint8 diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index e49c5db68..2ae6c814b 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -196,9 +196,6 @@ class GGUFWriter: if self.state is not WriterState.EMPTY: raise ValueError(f'Expected output file to be empty, got {self.state}') - if raw_dtype is None and tensor_dtype not in (np.float32, np.float16): - raise ValueError("Only F32 and F16 tensors are supported for now") - encoded_name = name.encode("utf8") self.ti_data += self._pack("Q", len(encoded_name)) self.ti_data += encoded_name @@ -207,7 +204,22 @@ class GGUFWriter: for i in range(n_dims): self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i]) if raw_dtype is None: - dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16 + if tensor_dtype == np.float16: + dtype = GGMLQuantizationType.F16 + elif tensor_dtype == np.float32: + dtype = GGMLQuantizationType.F32 + elif tensor_dtype == np.float64: + dtype = GGMLQuantizationType.F64 + elif tensor_dtype == np.int8: + dtype = GGMLQuantizationType.I8 + elif tensor_dtype == np.int16: + dtype = GGMLQuantizationType.I16 + elif tensor_dtype == np.int32: + dtype = GGMLQuantizationType.I32 + elif tensor_dtype == np.int64: + dtype = GGMLQuantizationType.I64 + else: + raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now") else: dtype = raw_dtype self.ti_data += self._pack("I", dtype) @@ -313,6 +325,9 @@ class GGUFWriter: self.data_alignment = alignment self.add_uint32(Keys.General.ALIGNMENT, alignment) + def add_vocab_size(self, size: int) -> None: + self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size) + def add_context_length(self, length: int) -> None: self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length) @@ -346,6 +361,9 @@ class GGUFWriter: def add_clamp_kqv(self, value: float) -> None: self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value) + def add_logit_scale(self, value: float) -> None: + self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value) + def add_expert_count(self, count: int) -> None: self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count) diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 9789c2c87..96396e04e 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.7.0" +version = "0.8.0" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ diff --git a/llama.cpp b/llama.cpp index 38e7036a7..e4db288dd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -214,6 +214,7 @@ enum llm_arch { LLM_ARCH_GEMMA, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, + LLM_ARCH_COMMAND_R, LLM_ARCH_UNKNOWN, }; @@ -243,6 +244,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, + { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -258,6 +260,7 @@ enum llm_kv { LLM_KV_GENERAL_SOURCE_URL, LLM_KV_GENERAL_SOURCE_HF_REPO, + LLM_KV_VOCAB_SIZE, LLM_KV_CONTEXT_LENGTH, LLM_KV_EMBEDDING_LENGTH, LLM_KV_BLOCK_COUNT, @@ -267,6 +270,7 @@ enum llm_kv { LLM_KV_EXPERT_COUNT, LLM_KV_EXPERT_USED_COUNT, LLM_KV_POOLING_TYPE, + LLM_KV_LOGIT_SCALE, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -321,6 +325,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" }, { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" }, + { LLM_KV_VOCAB_SIZE, "%s.vocab_size" }, { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, { LLM_KV_BLOCK_COUNT, "%s.block_count" }, @@ -330,6 +335,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" }, { LLM_KV_POOLING_TYPE , "%s.pooling_type" }, + { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -836,6 +842,21 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, }, }, + { + LLM_ARCH_COMMAND_R, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1595,6 +1616,7 @@ enum e_model { MODEL_20B, MODEL_30B, MODEL_34B, + MODEL_35B, MODEL_40B, MODEL_65B, MODEL_70B, @@ -1641,6 +1663,7 @@ struct llama_hparams { float f_clamp_kqv = 0.0f; float f_max_alibi_bias = 0.0f; + float f_logit_scale = 0.0f; bool causal_attn = true; bool need_kq_pos = false; @@ -1871,6 +1894,31 @@ struct llama_kv_cache { } }; +struct llama_control_vector { + std::vector tensors; // per layer + std::vector ctxs; + std::vector bufs; + + int32_t layer_start = -1; + int32_t layer_end = -1; + + ggml_tensor * tensor_for(int il) const { + if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) { + return nullptr; + } + return tensors[il]; + } + + ~llama_control_vector() { + for (struct ggml_context * ctx : ctxs) { + ggml_free(ctx); + } + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } + } +}; + struct llama_vocab { using id = int32_t; using token = std::string; @@ -2085,6 +2133,9 @@ struct llama_context { struct ggml_tensor * inp_s_mask; // F32 [1, kv_size] struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch] + // control vectors + struct llama_control_vector cvec; + #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; #endif @@ -3229,6 +3280,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_20B: return "20B"; case MODEL_30B: return "30B"; case MODEL_34B: return "34B"; + case MODEL_35B: return "35B"; case MODEL_40B: return "40B"; case MODEL_65B: return "65B"; case MODEL_70B: return "70B"; @@ -3242,10 +3294,11 @@ static const char * llama_model_type_name(e_model type) { static const char * llama_model_vocab_type_name(enum llama_vocab_type type){ switch (type) { - case LLAMA_VOCAB_TYPE_SPM: return "SPM"; - case LLAMA_VOCAB_TYPE_BPE: return "BPE"; - case LLAMA_VOCAB_TYPE_WPM: return "WPM"; - default: return "unknown"; + case LLAMA_VOCAB_TYPE_NONE: return "no vocab"; + case LLAMA_VOCAB_TYPE_SPM: return "SPM"; + case LLAMA_VOCAB_TYPE_BPE: return "BPE"; + case LLAMA_VOCAB_TYPE_WPM: return "WPM"; + default: return "unknown"; } } @@ -3277,14 +3330,14 @@ static void llm_load_hparams( ml.get_key(LLM_KV_GENERAL_NAME, model.name, false); // get hparams kv - ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab); - ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); - ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); - ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff); - ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head); - ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer); - ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false); - ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); + ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab); + ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); + ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); + ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff); + ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head); + ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); + ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); + ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS); GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert); @@ -3620,6 +3673,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_COMMAND_R: + { + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_35B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -3645,30 +3707,25 @@ static void llm_load_vocab( const auto kv = LLM_KV(model.arch); - const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); - if (token_idx == -1) { - throw std::runtime_error("cannot find tokenizer vocab in model file\n"); - } - - const float * scores = nullptr; - const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str()); - if (score_idx != -1) { - scores = (const float * ) gguf_get_arr_data(ctx, score_idx); - } - - const int * toktypes = nullptr; - const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); - if (toktype_idx != -1) { - toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); - } - // determine vocab type { std::string tokenizer_name; ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name); - if (tokenizer_name == "llama") { + if (tokenizer_name == "no_vocab") { + vocab.type = LLAMA_VOCAB_TYPE_NONE; + + // default special tokens + vocab.special_bos_id = -1; + vocab.special_eos_id = -1; + vocab.special_unk_id = -1; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + vocab.linefeed_id = -1; + + return; + } else if (tokenizer_name == "llama") { vocab.type = LLAMA_VOCAB_TYPE_SPM; // default special tokens @@ -3734,6 +3791,23 @@ static void llm_load_vocab( } } + const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); + if (token_idx == -1) { + throw std::runtime_error("cannot find tokenizer vocab in model file\n"); + } + + const float * scores = nullptr; + const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str()); + if (score_idx != -1) { + scores = (const float * ) gguf_get_arr_data(ctx, score_idx); + } + + const int * toktypes = nullptr; + const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); + if (toktype_idx != -1) { + toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); + } + const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); vocab.id_to_token.resize(n_vocab); @@ -3929,10 +4003,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias); + LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale); LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); - LLAMA_LOG_INFO("%s: causal attm = %d\n", __func__, hparams.causal_attn); + LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn); LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type); LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type); @@ -4903,6 +4978,37 @@ static bool llm_load_tensors( layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}); } } break; + case LLM_ARCH_COMMAND_R: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + // init output from the input tok embed + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -5023,7 +5129,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam llm_load_print_meta(ml, model); - if (model.hparams.n_vocab != model.vocab.id_to_token.size()) { + if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE && + model.hparams.n_vocab != model.vocab.id_to_token.size()) { throw std::runtime_error("vocab size mismatch"); } @@ -5048,6 +5155,16 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam } #endif +#ifdef GGML_USE_SYCL + if (params.split_mode == LLAMA_SPLIT_MODE_NONE) { + ggml_backend_sycl_set_single_device_mode(params.main_gpu); + //SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index. + params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu); + } else { + ggml_backend_sycl_set_mul_device_mode(); + } +#endif + if (!llm_load_tensors( ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data @@ -5842,6 +5959,12 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); + } cb(cur, "l_out", il); // input for next layer @@ -5877,7 +6000,7 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr; // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); @@ -5927,7 +6050,6 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -8289,6 +8411,121 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_command_r() { + + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const float f_logit_scale = hparams.f_logit_scale; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + struct ggml_tensor * ffn_inp = cur; + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + struct ggml_tensor * attn_out = cur; + + // feed-forward network + { + cur = llm_build_ffn(ctx0, ffn_inp, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + // add together residual + FFN + self-attention + cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx0, cur, attn_out); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + + if (f_logit_scale) { + cur = ggml_scale(ctx0, cur, f_logit_scale); + } + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -8471,6 +8708,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_mamba(); } break; + case LLM_ARCH_COMMAND_R: + { + result = llm.build_command_r(); + } break; default: GGML_ASSERT(false); } @@ -9036,8 +9277,8 @@ static int llama_decode_internal( //llama_synchronize(&lctx); // decide if we need to defrag the kv cache - if (cparams.defrag_thold >= 0.0f) { - const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens_all)/float(kv_self.n) : 0.0f; + if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { + const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > cparams.defrag_thold) { @@ -9069,6 +9310,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { // number of cells moved uint32_t n_moves = 0; + // each move requires 6*n_layer tensors (see build_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer); + // determine which KV cells to move where // // cell i moves to ids[i] @@ -9095,15 +9341,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { nh++; } - // each move requires 6*n_layer tensors (see build_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - // - if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) { - // the graph is too big, we cannot move more cells - break; - } - uint32_t nf = 0; uint32_t is = n_kv - 1; @@ -9133,11 +9370,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { // are we moving a continuous block of memory? bool cont = false; + // should we stop searching for the next move? + bool stop = false; + // go back and move the nf cells to the hole for (; i1 < n_kv; ++i1) { auto & cell1 = kv_self.cells[i1]; if (cell1.is_empty() || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + cont = false; continue; } @@ -9164,6 +9409,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { } } + if (stop || n_moves == max_moves) { + break; + } + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); i0 += nh - 1; @@ -9353,26 +9602,32 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) { } static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL; } static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN; } static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL; } static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE; } static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED; } static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { + GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE); GGML_ASSERT(llama_is_byte_token(vocab, id)); const auto& token_data = vocab.id_to_token.at(id); switch (llama_vocab_get_type(vocab)) { @@ -9393,6 +9648,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { } static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { + GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE); static const char * hex = "0123456789ABCDEF"; switch (llama_vocab_get_type(vocab)) { case LLAMA_VOCAB_TYPE_SPM: { @@ -10224,6 +10480,8 @@ static std::vector llama_tokenize_internal(const llama_vocab & } } } break; + case LLAMA_VOCAB_TYPE_NONE: + GGML_ASSERT(false); } return output; @@ -11944,7 +12202,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n return new_type; } -static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector & workers, const int nthread) { +static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector & workers, const int nthread) { std::mutex mutex; int counter = 0; size_t new_size = 0; @@ -12888,23 +13146,22 @@ struct llama_context * llama_new_context_with_model( if (model->n_gpu_layers > 0) { // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { - int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu); - ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index); + ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index); + int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu); llama_free(ctx); return nullptr; } ctx->backends.push_back(backend); } else { // LLAMA_SPLIT_LAYER requires a backend for each GPU - int id_list[GGML_SYCL_MAX_DEVICES]; - ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) { - int device_id = id_list[i]; ggml_backend_t backend = ggml_backend_sycl_init(i); if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i); + int id_list[GGML_SYCL_MAX_DEVICES]; + ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i); llama_free(ctx); return nullptr; } @@ -13105,6 +13362,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_ORION: case LLM_ARCH_INTERNLM2: case LLM_ARCH_MINICPM: + case LLM_ARCH_COMMAND_R: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 @@ -13130,7 +13388,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { } int32_t llama_n_vocab(const struct llama_model * model) { - return model->vocab.id_to_token.size(); + return model->hparams.n_vocab; } int32_t llama_n_ctx_train(const struct llama_model * model) { @@ -13141,6 +13399,10 @@ int32_t llama_n_embd(const struct llama_model * model) { return model->hparams.n_embd; } +int32_t llama_n_layer(const struct llama_model * model) { + return model->hparams.n_layer; +} + float llama_rope_freq_scale_train(const struct llama_model * model) { return model->hparams.rope_freq_scale_train; } @@ -13240,6 +13502,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const } } +static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) { + GGML_ASSERT(cvec.tensors.empty()); + GGML_ASSERT(cvec.ctxs.empty()); + GGML_ASSERT(cvec.bufs.empty()); + + // count layer buffer types + std::map buft_layer_count; + for (int64_t i = 0; i < model.hparams.n_layer; i++) { + buft_layer_count[model.buft_layer[i].buft]++; + } + + // allocate contexts + std::map ctx_map; + for (auto & it : buft_layer_count) { + int n_layers = it.second; + struct ggml_init_params params = { + /*.mem_size =*/ n_layers * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context * ctx = ggml_init(params); + if (!ctx) { + LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__); + return 1; + } + ctx_map[it.first] = ctx; + } + + // make tensors + cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0 + for (size_t il = 1; il < model.hparams.n_layer; il++) { + struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft); + ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd); + cvec.tensors.push_back(tensor); + } + + // allocate tensors / buffers and zero + for (auto it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__); + return false; + } + ggml_backend_buffer_clear(buf, 0); + cvec.ctxs.push_back(ctx); + cvec.bufs.push_back(buf); + } + + return true; +} + +int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) { + const llama_model & model = lctx->model; + llama_control_vector & cvec = lctx->cvec; + + if (data == nullptr) { + // disable the current control vector (but leave allocated for later) + cvec.layer_start = -1; + cvec.layer_end = -1; + return 0; + } + + if (n_embd != (int) model.hparams.n_embd) { + LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__); + return 1; + } + + if (cvec.tensors.empty()) { + if (!llama_control_vector_init(cvec, model)) { + return 1; + } + } + + cvec.layer_start = il_start; + cvec.layer_end = il_end; + + for (size_t il = 1; il < model.hparams.n_layer; il++) { + assert(cvec.tensors[il] != nullptr); + + const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present + if (off + n_embd <= len) { + ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il])); + } + } + + return 0; +} + struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) { struct llama_kv_cache_view result = { /*.n_cells = */ 0, @@ -13954,14 +14306,17 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id } const char * llama_token_get_text(const struct llama_model * model, llama_token token) { + GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); return model->vocab.id_to_token[token].text.c_str(); } float llama_token_get_score(const struct llama_model * model, llama_token token) { + GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); return model->vocab.id_to_token[token].score; } llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) { + GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); return model->vocab.id_to_token[token].type; } @@ -14206,6 +14561,26 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "model\n"; } + } else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) { + // OrionStarAI/Orion-14B-Chat + std::string system_prompt = ""; + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + // there is no system message support, we will merge it with user prompt + system_prompt = message->content; + continue; + } else if (role == "user") { + ss << "Human: "; + if (!system_prompt.empty()) { + ss << system_prompt << "\n\n"; + system_prompt = ""; + } + ss << message->content << "\n\nAssistant: "; + } else { + ss << message->content << ""; + } + } } else { // template not supported return -1; diff --git a/llama.h b/llama.h index 2d16cc9b9..40dcf54e3 100644 --- a/llama.h +++ b/llama.h @@ -59,9 +59,10 @@ extern "C" { typedef int32_t llama_seq_id; enum llama_vocab_type { - LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece - LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding - LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece + LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab + LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece + LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding + LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece }; // note: these values should be synchronized with ggml_rope @@ -387,6 +388,7 @@ extern "C" { LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_n_embd (const struct llama_model * model); + LLAMA_API int32_t llama_n_layer (const struct llama_model * model); // Get the model's RoPE frequency scaling factor LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model); @@ -434,10 +436,24 @@ extern "C" { // Returns 0 on success LLAMA_API int32_t llama_model_apply_lora_from_file( const struct llama_model * model, - const char * path_lora, - float scale, - const char * path_base_model, - int32_t n_threads); + const char * path_lora, + float scale, + const char * path_base_model, + int32_t n_threads); + + // Apply a loaded control vector to a llama_context, or if data is NULL, clear + // the currently loaded vector. + // n_embd should be the size of a single layer's control, and data should point + // to an n_embd x n_layers buffer starting from layer 1. + // il_start and il_end are the layer range the vector should apply to (both inclusive) + // See llama_control_vector_load in common to load a control vector. + LLAMA_API int32_t llama_control_vector_apply( + struct llama_context * lctx, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end); // // KV cache diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index fa2eb577b..6e9e4bd1e 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -31,6 +31,8 @@ int main(void) { "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}", // google/gemma-7b-it "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\\n' + message['content'] | trim + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\\n'}}{% endif %}", + // OrionStarAI/Orion-14B-Chat + "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}", }; std::vector expected_output = { // teknium/OpenHermes-2.5-Mistral-7B @@ -45,6 +47,8 @@ int main(void) { "system\nYou are a helpful assistant\nuser\nHello\nassistant\nHi there\nuser\nWho are you\nassistant\n I am an assistant \nuser\nAnother question\nassistant\n", // google/gemma-7b-it "user\nYou are a helpful assistant\n\nHello\nmodel\nHi there\nuser\nWho are you\nmodel\nI am an assistant\nuser\nAnother question\nmodel\n", + // OrionStarAI/Orion-14B-Chat + "Human: You are a helpful assistant\n\nHello\n\nAssistant: Hi thereHuman: Who are you\n\nAssistant: I am an assistant Human: Another question\n\nAssistant: ", }; std::vector formatted_chat(1024); int32_t res;