diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f3326a5fb..886d33d2d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -552,35 +552,44 @@ jobs: -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO -# TODO: tmp disabled. see for possible re-enable: -# https://github.com/ggerganov/llama.cpp/pull/10525 -# macOS-latest-swift: -# runs-on: macos-latest -# -# strategy: -# matrix: -# destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] -# -# steps: -# - name: Clone -# id: checkout -# uses: actions/checkout@v4 -# -# - name: Dependencies -# id: depends -# continue-on-error: true -# run: | -# brew update -# -# - name: xcodebuild for swift package -# id: xcodebuild -# run: | -# xcodebuild -scheme llama -destination "${{ matrix.destination }}" -# -# - name: Build Swift Example -# id: make_build_swift_example -# run: | -# make swift + macOS-latest-swift: + runs-on: macos-latest + + strategy: + matrix: + destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + + - name: Build llama.cpp with CMake + id: cmake_build + run: | + sysctl -a + mkdir build + cd build + cmake -G Xcode .. \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" + cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) + sudo cmake --install . --config Release + + - name: xcodebuild for swift package + id: xcodebuild + run: | + xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}" windows-msys2: runs-on: windows-latest @@ -1104,6 +1113,29 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Build + id: cmake_build + run: | + sysctl -a + mkdir build + cd build + cmake -G Xcode .. \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_SYSTEM_NAME=iOS \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + sudo cmake --install . 
--config Release + + - name: xcodebuild for swift package + id: xcodebuild + run: | + xcodebuild -scheme llama-Package -destination 'generic/platform=iOS' + - name: Build Xcode project run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build @@ -1131,23 +1163,6 @@ jobs: ./gradlew build --no-daemon -# freeBSD-latest: -# runs-on: macos-12 -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Build -# uses: cross-platform-actions/action@v0.19.0 -# with: -# operating_system: freebsd -# version: '13.2' -# hypervisor: 'qemu' -# run: | -# sudo pkg update -# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas -# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu` - release: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} diff --git a/Package.swift b/Package.swift index 3afeb2f19..01c996d24 100644 --- a/Package.swift +++ b/Package.swift @@ -2,60 +2,6 @@ import PackageDescription -var sources = [ - "src/llama.cpp", - "src/llama-vocab.cpp", - "src/llama-grammar.cpp", - "src/llama-sampling.cpp", - "src/unicode.cpp", - "src/unicode-data.cpp", - "ggml/src/ggml.c", - "ggml/src/ggml-alloc.c", - "ggml/src/ggml-backend.cpp", - "ggml/src/ggml-backend-reg.cpp", - "ggml/src/ggml-cpu/ggml-cpu.c", - "ggml/src/ggml-cpu/ggml-cpu.cpp", - "ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp", - "ggml/src/ggml-cpu/ggml-cpu-hbm.cpp", - "ggml/src/ggml-cpu/ggml-cpu-quants.c", - "ggml/src/ggml-cpu/ggml-cpu-traits.cpp", - "ggml/src/ggml-threading.cpp", - "ggml/src/ggml-quants.c", -] - -var resources: [Resource] = [] -var linkerSettings: [LinkerSetting] = [] -var cSettings: [CSetting] = [ - .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), - .unsafeFlags(["-fno-objc-arc"]), - .headerSearchPath("ggml/src"), - .headerSearchPath("ggml/src/ggml-cpu"), - // NOTE: NEW_LAPACK will required iOS version 16.4+ - // We should consider add this in the future when we drop support for iOS 14 - // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) - // .define("ACCELERATE_NEW_LAPACK"), - // .define("ACCELERATE_LAPACK_ILP64") - .define("GGML_USE_CPU"), -] - - -#if canImport(Darwin) -sources.append("ggml/src/ggml-common.h") -sources.append("ggml/src/ggml-metal/ggml-metal.m") -resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal")) -linkerSettings.append(.linkedFramework("Accelerate")) -cSettings.append( - contentsOf: [ - .define("GGML_USE_ACCELERATE"), - .define("GGML_USE_METAL"), - ] -) -#endif - -#if os(Linux) - cSettings.append(.define("_GNU_SOURCE")) -#endif - let package = Package( name: "llama", platforms: [ @@ -68,26 +14,6 @@ let package = Package( .library(name: "llama", targets: ["llama"]), ], targets: [ - .target( - name: "llama", - path: ".", - exclude: [ - "build", - "cmake", - "examples", - "scripts", - "models", - "tests", - "CMakeLists.txt", - "Makefile", - "ggml/src/ggml-metal-embed.metal" - ], - sources: sources, - resources: resources, - publicHeadersPath: "spm-headers", - cSettings: cSettings, - linkerSettings: linkerSettings - ) - ], - cxxLanguageStandard: .cxx17 + .systemLibrary(name: "llama", pkgConfig: "llama"), + ] ) diff --git a/Sources/llama/llama.h b/Sources/llama/llama.h new file mode 100644 index 000000000..41725880e --- /dev/null +++ b/Sources/llama/llama.h @@ -0,0 +1,4 @@ 
+#pragma once + +#include + diff --git a/Sources/llama/module.modulemap b/Sources/llama/module.modulemap new file mode 100644 index 000000000..d010555b1 --- /dev/null +++ b/Sources/llama/module.modulemap @@ -0,0 +1,5 @@ +module llama [system] { + header "llama.h" + link "llama" + export * +} diff --git a/cmake/llama.pc.in b/cmake/llama.pc.in index 326acbb61..0b2b6bcfa 100644 --- a/cmake/llama.pc.in +++ b/cmake/llama.pc.in @@ -6,5 +6,5 @@ includedir=${prefix}/include Name: llama Description: Port of Facebook's LLaMA model in C/C++ Version: @PROJECT_VERSION@ -Libs: -L${libdir} -lllama +Libs: -L${libdir} -lggml -lggml-base -lllama Cflags: -I${includedir} diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 65cd4eb51..998c673d5 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -210,20 +210,20 @@ actor LlamaContext { llama_kv_cache_clear(context) - let t_pp_start = ggml_time_us() + let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000; if llama_decode(context, batch) != 0 { print("llama_decode() failed during prompt") } llama_synchronize(context) - let t_pp_end = ggml_time_us() + let t_pp_end = DispatchTime.now().uptimeNanoseconds / 1000; // bench text generation llama_kv_cache_clear(context) - let t_tg_start = ggml_time_us() + let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000; for i in 0..= 0) { + ret.push_back({"timings", timings.to_json()}); + } + + return ret; + } }; struct server_task_result_cmpl_partial : server_task_result { int index = 0; std::string content; - bool truncated; int32_t n_decoded; int32_t n_prompt_tokens; - stop_type stop = STOP_TYPE_NONE; - std::vector probs_output; result_timings timings; @@ -573,20 +607,19 @@ struct server_task_result_cmpl_partial : server_task_result { } virtual bool is_stop() override { - return stop != STOP_TYPE_NONE; + return false; // in stream mode, partial responses are not considered stop } virtual json to_json() override { - if (oaicompat) { - return to_json_oaicompat(); - } - bool is_stop = stop != STOP_TYPE_NONE; + return oaicompat ? 
to_json_oaicompat() : to_json_non_oaicompat(); + } + + json to_json_non_oaicompat() { // non-OAI-compat JSON json res = json { {"index", index}, {"content", content}, - {"stop_type", stop_type_to_str(stop)}, - {"stop", is_stop}, + {"stop", false}, {"id_slot", id_slot}, {"tokens_predicted", n_decoded}, {"tokens_evaluated", n_prompt_tokens}, @@ -598,72 +631,54 @@ struct server_task_result_cmpl_partial : server_task_result { if (!probs_output.empty()) { res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); } - if (is_stop) { - res.push_back({"truncated", truncated}); - } return res; } json to_json_oaicompat() { bool first = n_decoded == 0; - - std::string finish_reason; - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } else if (stop == STOP_TYPE_LIMIT) { - finish_reason = "length"; - } - std::time_t t = std::time(0); - json choices; - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = json{{"choices", json::array({json{ - {"finish_reason", nullptr}, + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"}}; - - json second_ret = json{ - {"choices", json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"content", content}}} - }})}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } + {"delta", json{{"role", "assistant"}}}}}); } else { - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}} + }})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + return std::vector({initial_ret, second_ret}); } + } else { + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); } json ret = json { @@ -678,14 +693,6 @@ struct server_task_result_cmpl_partial : server_task_result { ret.push_back({"timings", timings.to_json()}); } - if (!finish_reason.empty()) { - ret.push_back({"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens}, - }}); - } - return std::vector({ret}); } }; @@ -1888,12 +1895,9 @@ struct server_context { res->index = slot.index; res->content = tkn.text_to_send; - res->truncated = slot.truncated; res->n_decoded = 
slot.n_decoded; res->n_prompt_tokens = slot.n_prompt_tokens; - res->stop = slot.stop; - res->verbose = slot.params.verbose; res->oaicompat = slot.params.oaicompat; res->oaicompat_chat = slot.params.oaicompat_chat; @@ -1924,12 +1928,6 @@ struct server_context { } void send_final_response(server_slot & slot) { - if (slot.params.stream) { - // if in stream mode, send the last partial response - send_partial_response(slot, {0, "", {}}); - return; - } - auto res = std::make_unique(); res->id = slot.id_task; res->id_slot = slot.id; @@ -1948,6 +1946,7 @@ struct server_context { res->stop = slot.stop; res->verbose = slot.params.verbose; + res->stream = slot.params.stream; res->oaicompat = slot.params.oaicompat; res->oaicompat_chat = slot.params.oaicompat_chat; res->oaicompat_model = slot.params.oaicompat_model; @@ -2100,7 +2099,10 @@ struct server_context { return; } - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + GGML_ASSERT( + dynamic_cast(result.get()) != nullptr + || dynamic_cast(result.get()) != nullptr + ); if (!result_handler(result)) { cancel_tasks(id_tasks); break; @@ -3482,6 +3484,11 @@ int main(int argc, char ** argv) { json data = json::parse(req.body); // validate input + if (data.contains("prompt") && !data.at("prompt").is_string()) { + // prompt is optional + res_error(res, format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST)); + } + if (!data.contains("input_prefix")) { res_error(res, format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST)); } @@ -3491,9 +3498,11 @@ int main(int argc, char ** argv) { } if (data.contains("input_extra") && !data.at("input_extra").is_array()) { + // input_extra is optional res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); return; } + json input_extra = json_value(data, "input_extra", json::array()); for (const auto & chunk : input_extra) { // { "text": string, "filename": string } @@ -3509,6 +3518,21 @@ int main(int argc, char ** argv) { } data["input_extra"] = input_extra; // default to empty array if it's not exist + std::string prompt = json_value(data, "prompt", std::string()); + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true); + SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); + data["prompt"] = format_infill( + ctx_server.ctx, + data.at("input_prefix"), + data.at("input_suffix"), + data.at("input_extra"), + ctx_server.params_base.n_batch, + ctx_server.params_base.n_predict, + ctx_server.slots[0].n_ctx, // TODO: there should be a better way + ctx_server.params_base.spm_infill, + tokenized_prompts[0] + ); + return handle_completions_generic(SERVER_TASK_TYPE_INFILL, data, res); }; diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index 1c3aa77de..7f4f9cd03 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -42,10 +42,16 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp }) content = "" for data in res: + assert "stop" in data and type(data["stop"]) == bool if data["stop"]: assert data["timings"]["prompt_n"] == n_prompt assert data["timings"]["predicted_n"] == n_predicted assert data["truncated"] == truncated + assert data["stop_type"] == "limit" + assert "generation_settings" in data + assert server.n_predict is not None + assert 
data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict) + assert data["generation_settings"]["seed"] == server.seed assert match_regex(re_content, content) else: content += data["content"] diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py index 6a6d40a1c..ad4b8192a 100644 --- a/examples/server/tests/unit/test_infill.py +++ b/examples/server/tests/unit/test_infill.py @@ -13,28 +13,28 @@ def test_infill_without_input_extra(): global server server.start() res = server.make_request("POST", "/infill", data={ - "prompt": "Complete this", - "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_", + "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", + "prompt": " int n_threads = llama_", "input_suffix": "}\n", }) assert res.status_code == 200 - assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"]) + assert match_regex("(Ann|small|shiny)+", res.body["content"]) def test_infill_with_input_extra(): global server server.start() res = server.make_request("POST", "/infill", data={ - "prompt": "Complete this", "input_extra": [{ "filename": "llama.h", "text": "LLAMA_API int32_t llama_n_threads();\n" }], - "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_", + "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", + "prompt": " int n_threads = llama_", "input_suffix": "}\n", }) assert res.status_code == 200 - assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"]) + assert match_regex("(Dad|excited|park)+", res.body["content"]) @pytest.mark.parametrize("input_extra", [ @@ -48,10 +48,30 @@ def test_invalid_input_extra_req(input_extra): global server server.start() res = server.make_request("POST", "/infill", data={ - "prompt": "Complete this", "input_extra": [input_extra], - "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_", + "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", + "prompt": " int n_threads = llama_", "input_suffix": "}\n", }) assert res.status_code == 400 assert "error" in res.body + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test") +def test_with_qwen_model(): + global server + server.model_file = None + server.model_hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-IQ3_XXS-GGUF" + server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf" + server.start(timeout_seconds=600) + res = server.make_request("POST", "/infill", data={ + "input_extra": [{ + "filename": "llama.h", + "text": "LLAMA_API int32_t llama_n_threads();\n" + }], + "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", + "prompt": " int n_threads = llama_", + "input_suffix": "}\n", + }) + assert res.status_code == 200 + assert res.body["content"] == "n_threads();\n printf(\"Number of threads: %d\\n\", n_threads);\n return 0;\n" diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index 69215eaa4..7c89b9cd3 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -371,3 +371,6 @@ def match_regex(regex: str, text: str) -> bool: ).search(text) is not None ) + +def is_slow_test_allowed(): + return os.environ.get("SLOW_TESTS") == "1" or os.environ.get("SLOW_TESTS") == "ON" diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp b/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp index 74630dc7f..495f966bd 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +++ 
b/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp @@ -16,6 +16,5 @@ void main() { if (i >= p.KX) { return; } - - data_d[i] = D_TYPE(tanh(data_a[i])); + data_d[i] = D_TYPE(1. - 2. / (exp(2.*data_a[i]) + 1.)); }
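
Note on the final hunk (ggml-vulkan tanh.comp): the replacement expression is the standard exponential form of tanh, so the kernel's output is mathematically unchanged; the likely motivation is sidestepping the builtin tanh() on drivers where it misbehaves for large inputs, though that rationale is an assumption on my part and is not stated in the patch. The identity follows directly from the definition:

    tanh(x) = (e^x - e^{-x}) / (e^x + e^{-x})
            = (e^{2x} - 1) / (e^{2x} + 1)     (multiply numerator and denominator by e^x)
            = 1 - 2 / (e^{2x} + 1)

which is exactly what the shader now computes as D_TYPE(1. - 2. / (exp(2.*data_a[i]) + 1.)).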