Merge https://github.com/ggerganov/llama.cpp into vulkan
This commit is contained in:
commit
c7bc42cea0
12 changed files with 229 additions and 227 deletions
107
.github/workflows/build.yml
vendored
107
.github/workflows/build.yml
vendored
|
@ -552,35 +552,44 @@ jobs:
|
|||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
# TODO: tmp disabled. see for possible re-enable:
|
||||
# https://github.com/ggerganov/llama.cpp/pull/10525
|
||||
# macOS-latest-swift:
|
||||
# runs-on: macos-latest
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Dependencies
|
||||
# id: depends
|
||||
# continue-on-error: true
|
||||
# run: |
|
||||
# brew update
|
||||
#
|
||||
# - name: xcodebuild for swift package
|
||||
# id: xcodebuild
|
||||
# run: |
|
||||
# xcodebuild -scheme llama -destination "${{ matrix.destination }}"
|
||||
#
|
||||
# - name: Build Swift Example
|
||||
# id: make_build_swift_example
|
||||
# run: |
|
||||
# make swift
|
||||
macOS-latest-swift:
|
||||
runs-on: macos-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
continue-on-error: true
|
||||
run: |
|
||||
brew update
|
||||
|
||||
- name: Build llama.cpp with CMake
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -G Xcode .. \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
|
||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
sudo cmake --install . --config Release
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}"
|
||||
|
||||
windows-msys2:
|
||||
runs-on: windows-latest
|
||||
|
@ -1104,6 +1113,29 @@ jobs:
|
|||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -G Xcode .. \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
sudo cmake --install . --config Release
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
xcodebuild -scheme llama-Package -destination 'generic/platform=iOS'
|
||||
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||
|
||||
|
@ -1131,23 +1163,6 @@ jobs:
|
|||
|
||||
./gradlew build --no-daemon
|
||||
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Build
|
||||
# uses: cross-platform-actions/action@v0.19.0
|
||||
# with:
|
||||
# operating_system: freebsd
|
||||
# version: '13.2'
|
||||
# hypervisor: 'qemu'
|
||||
# run: |
|
||||
# sudo pkg update
|
||||
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
|
||||
# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
|
|
|
@ -2,60 +2,6 @@
|
|||
|
||||
import PackageDescription
|
||||
|
||||
var sources = [
|
||||
"src/llama.cpp",
|
||||
"src/llama-vocab.cpp",
|
||||
"src/llama-grammar.cpp",
|
||||
"src/llama-sampling.cpp",
|
||||
"src/unicode.cpp",
|
||||
"src/unicode-data.cpp",
|
||||
"ggml/src/ggml.c",
|
||||
"ggml/src/ggml-alloc.c",
|
||||
"ggml/src/ggml-backend.cpp",
|
||||
"ggml/src/ggml-backend-reg.cpp",
|
||||
"ggml/src/ggml-cpu/ggml-cpu.c",
|
||||
"ggml/src/ggml-cpu/ggml-cpu.cpp",
|
||||
"ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp",
|
||||
"ggml/src/ggml-cpu/ggml-cpu-hbm.cpp",
|
||||
"ggml/src/ggml-cpu/ggml-cpu-quants.c",
|
||||
"ggml/src/ggml-cpu/ggml-cpu-traits.cpp",
|
||||
"ggml/src/ggml-threading.cpp",
|
||||
"ggml/src/ggml-quants.c",
|
||||
]
|
||||
|
||||
var resources: [Resource] = []
|
||||
var linkerSettings: [LinkerSetting] = []
|
||||
var cSettings: [CSetting] = [
|
||||
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
|
||||
.unsafeFlags(["-fno-objc-arc"]),
|
||||
.headerSearchPath("ggml/src"),
|
||||
.headerSearchPath("ggml/src/ggml-cpu"),
|
||||
// NOTE: NEW_LAPACK will required iOS version 16.4+
|
||||
// We should consider add this in the future when we drop support for iOS 14
|
||||
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
|
||||
// .define("ACCELERATE_NEW_LAPACK"),
|
||||
// .define("ACCELERATE_LAPACK_ILP64")
|
||||
.define("GGML_USE_CPU"),
|
||||
]
|
||||
|
||||
|
||||
#if canImport(Darwin)
|
||||
sources.append("ggml/src/ggml-common.h")
|
||||
sources.append("ggml/src/ggml-metal/ggml-metal.m")
|
||||
resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal"))
|
||||
linkerSettings.append(.linkedFramework("Accelerate"))
|
||||
cSettings.append(
|
||||
contentsOf: [
|
||||
.define("GGML_USE_ACCELERATE"),
|
||||
.define("GGML_USE_METAL"),
|
||||
]
|
||||
)
|
||||
#endif
|
||||
|
||||
#if os(Linux)
|
||||
cSettings.append(.define("_GNU_SOURCE"))
|
||||
#endif
|
||||
|
||||
let package = Package(
|
||||
name: "llama",
|
||||
platforms: [
|
||||
|
@ -68,26 +14,6 @@ let package = Package(
|
|||
.library(name: "llama", targets: ["llama"]),
|
||||
],
|
||||
targets: [
|
||||
.target(
|
||||
name: "llama",
|
||||
path: ".",
|
||||
exclude: [
|
||||
"build",
|
||||
"cmake",
|
||||
"examples",
|
||||
"scripts",
|
||||
"models",
|
||||
"tests",
|
||||
"CMakeLists.txt",
|
||||
"Makefile",
|
||||
"ggml/src/ggml-metal-embed.metal"
|
||||
],
|
||||
sources: sources,
|
||||
resources: resources,
|
||||
publicHeadersPath: "spm-headers",
|
||||
cSettings: cSettings,
|
||||
linkerSettings: linkerSettings
|
||||
)
|
||||
],
|
||||
cxxLanguageStandard: .cxx17
|
||||
.systemLibrary(name: "llama", pkgConfig: "llama"),
|
||||
]
|
||||
)
|
||||
|
|
4
Sources/llama/llama.h
Normal file
4
Sources/llama/llama.h
Normal file
|
@ -0,0 +1,4 @@
|
|||
#pragma once
|
||||
|
||||
#include <llama.h>
|
||||
|
5
Sources/llama/module.modulemap
Normal file
5
Sources/llama/module.modulemap
Normal file
|
@ -0,0 +1,5 @@
|
|||
module llama [system] {
|
||||
header "llama.h"
|
||||
link "llama"
|
||||
export *
|
||||
}
|
|
@ -6,5 +6,5 @@ includedir=${prefix}/include
|
|||
Name: llama
|
||||
Description: Port of Facebook's LLaMA model in C/C++
|
||||
Version: @PROJECT_VERSION@
|
||||
Libs: -L${libdir} -lllama
|
||||
Libs: -L${libdir} -lggml -lggml-base -lllama
|
||||
Cflags: -I${includedir}
|
||||
|
|
|
@ -210,20 +210,20 @@ actor LlamaContext {
|
|||
|
||||
llama_kv_cache_clear(context)
|
||||
|
||||
let t_pp_start = ggml_time_us()
|
||||
let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
|
||||
|
||||
if llama_decode(context, batch) != 0 {
|
||||
print("llama_decode() failed during prompt")
|
||||
}
|
||||
llama_synchronize(context)
|
||||
|
||||
let t_pp_end = ggml_time_us()
|
||||
let t_pp_end = DispatchTime.now().uptimeNanoseconds / 1000;
|
||||
|
||||
// bench text generation
|
||||
|
||||
llama_kv_cache_clear(context)
|
||||
|
||||
let t_tg_start = ggml_time_us()
|
||||
let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
|
||||
|
||||
for i in 0..<tg {
|
||||
llama_batch_clear(&batch)
|
||||
|
@ -238,7 +238,7 @@ actor LlamaContext {
|
|||
llama_synchronize(context)
|
||||
}
|
||||
|
||||
let t_tg_end = ggml_time_us()
|
||||
let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
|
||||
|
||||
llama_kv_cache_clear(context)
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
objects = {
|
||||
|
||||
/* Begin PBXBuildFile section */
|
||||
1809696D2D05A39F00400EE8 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = 1809696C2D05A39F00400EE8 /* llama */; };
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
|
||||
79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; };
|
||||
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
|
||||
|
@ -17,7 +18,6 @@
|
|||
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
|
||||
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
|
||||
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
|
||||
DF810E132B4A5BA200301144 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = DF810E122B4A5BA200301144 /* llama */; };
|
||||
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
|
||||
/* End PBXBuildFile section */
|
||||
|
||||
|
@ -42,7 +42,7 @@
|
|||
isa = PBXFrameworksBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
DF810E132B4A5BA200301144 /* llama in Frameworks */,
|
||||
1809696D2D05A39F00400EE8 /* llama in Frameworks */,
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
|
||||
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
|
||||
);
|
||||
|
@ -151,7 +151,7 @@
|
|||
);
|
||||
name = llama.swiftui;
|
||||
packageProductDependencies = (
|
||||
DF810E122B4A5BA200301144 /* llama */,
|
||||
1809696C2D05A39F00400EE8 /* llama */,
|
||||
);
|
||||
productName = llama.swiftui;
|
||||
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
|
||||
|
@ -429,7 +429,7 @@
|
|||
/* End XCConfigurationList section */
|
||||
|
||||
/* Begin XCSwiftPackageProductDependency section */
|
||||
DF810E122B4A5BA200301144 /* llama */ = {
|
||||
1809696C2D05A39F00400EE8 /* llama */ = {
|
||||
isa = XCSwiftPackageProductDependency;
|
||||
productName = llama;
|
||||
};
|
||||
|
|
|
@ -392,7 +392,7 @@ struct server_task_result {
|
|||
return false;
|
||||
}
|
||||
virtual bool is_stop() {
|
||||
// only used by server_task_result_cmpl_partial
|
||||
// only used by server_task_result_cmpl_*
|
||||
return false;
|
||||
}
|
||||
virtual int get_index() {
|
||||
|
@ -478,14 +478,20 @@ struct server_task_result_cmpl_final : server_task_result {
|
|||
return index;
|
||||
}
|
||||
|
||||
virtual bool is_stop() override {
|
||||
return true; // in stream mode, final responses are considered stop
|
||||
}
|
||||
|
||||
virtual json to_json() override {
|
||||
return oaicompat ? to_json_oaicompat_chat() : to_json_non_oaicompat();
|
||||
return oaicompat
|
||||
? (stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat())
|
||||
: to_json_non_oaicompat();
|
||||
}
|
||||
|
||||
json to_json_non_oaicompat() {
|
||||
json res = json {
|
||||
{"index", index},
|
||||
{"content", content},
|
||||
{"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk
|
||||
{"id_slot", id_slot},
|
||||
{"stop", true},
|
||||
{"model", oaicompat_model},
|
||||
|
@ -546,18 +552,46 @@ struct server_task_result_cmpl_final : server_task_result {
|
|||
|
||||
return res;
|
||||
}
|
||||
|
||||
json to_json_oaicompat_chat_stream() {
|
||||
std::time_t t = std::time(0);
|
||||
std::string finish_reason = "length";
|
||||
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
|
||||
finish_reason = "stop";
|
||||
}
|
||||
|
||||
json choices = json::array({json{{"finish_reason", finish_reason},
|
||||
{"index", 0},
|
||||
{"delta", json::object()}}});
|
||||
|
||||
json ret = json {
|
||||
{"choices", choices},
|
||||
{"created", t},
|
||||
{"id", oaicompat_cmpl_id},
|
||||
{"model", oaicompat_model},
|
||||
{"object", "chat.completion.chunk"},
|
||||
{"usage", json {
|
||||
{"completion_tokens", n_decoded},
|
||||
{"prompt_tokens", n_prompt_tokens},
|
||||
{"total_tokens", n_decoded + n_prompt_tokens},
|
||||
}},
|
||||
};
|
||||
|
||||
if (timings.prompt_n >= 0) {
|
||||
ret.push_back({"timings", timings.to_json()});
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
struct server_task_result_cmpl_partial : server_task_result {
|
||||
int index = 0;
|
||||
std::string content;
|
||||
|
||||
bool truncated;
|
||||
int32_t n_decoded;
|
||||
int32_t n_prompt_tokens;
|
||||
|
||||
stop_type stop = STOP_TYPE_NONE;
|
||||
|
||||
std::vector<completion_token_output> probs_output;
|
||||
result_timings timings;
|
||||
|
||||
|
@ -573,20 +607,19 @@ struct server_task_result_cmpl_partial : server_task_result {
|
|||
}
|
||||
|
||||
virtual bool is_stop() override {
|
||||
return stop != STOP_TYPE_NONE;
|
||||
return false; // in stream mode, partial responses are not considered stop
|
||||
}
|
||||
|
||||
virtual json to_json() override {
|
||||
if (oaicompat) {
|
||||
return to_json_oaicompat();
|
||||
}
|
||||
bool is_stop = stop != STOP_TYPE_NONE;
|
||||
return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat();
|
||||
}
|
||||
|
||||
json to_json_non_oaicompat() {
|
||||
// non-OAI-compat JSON
|
||||
json res = json {
|
||||
{"index", index},
|
||||
{"content", content},
|
||||
{"stop_type", stop_type_to_str(stop)},
|
||||
{"stop", is_stop},
|
||||
{"stop", false},
|
||||
{"id_slot", id_slot},
|
||||
{"tokens_predicted", n_decoded},
|
||||
{"tokens_evaluated", n_prompt_tokens},
|
||||
|
@ -598,72 +631,54 @@ struct server_task_result_cmpl_partial : server_task_result {
|
|||
if (!probs_output.empty()) {
|
||||
res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output);
|
||||
}
|
||||
if (is_stop) {
|
||||
res.push_back({"truncated", truncated});
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
json to_json_oaicompat() {
|
||||
bool first = n_decoded == 0;
|
||||
|
||||
std::string finish_reason;
|
||||
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
|
||||
finish_reason = "stop";
|
||||
} else if (stop == STOP_TYPE_LIMIT) {
|
||||
finish_reason = "length";
|
||||
}
|
||||
|
||||
std::time_t t = std::time(0);
|
||||
|
||||
json choices;
|
||||
|
||||
if (!finish_reason.empty()) {
|
||||
choices = json::array({json{{"finish_reason", finish_reason},
|
||||
{"index", 0},
|
||||
{"delta", json::object()}}});
|
||||
} else {
|
||||
if (first) {
|
||||
if (content.empty()) {
|
||||
choices = json::array({json{{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{{"role", "assistant"}}}}});
|
||||
} else {
|
||||
// We have to send this as two updates to conform to openai behavior
|
||||
json initial_ret = json{{"choices", json::array({json{
|
||||
{"finish_reason", nullptr},
|
||||
if (first) {
|
||||
if (content.empty()) {
|
||||
choices = json::array({json{{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{
|
||||
{"role", "assistant"}
|
||||
}}}})},
|
||||
{"created", t},
|
||||
{"id", oaicompat_cmpl_id},
|
||||
{"model", oaicompat_model},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
json second_ret = json{
|
||||
{"choices", json::array({json{{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{
|
||||
{"content", content}}}
|
||||
}})},
|
||||
{"created", t},
|
||||
{"id", oaicompat_cmpl_id},
|
||||
{"model", oaicompat_model},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
return std::vector<json>({initial_ret, second_ret});
|
||||
}
|
||||
{"delta", json{{"role", "assistant"}}}}});
|
||||
} else {
|
||||
choices = json::array({json{
|
||||
{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta",
|
||||
json{
|
||||
{"content", content},
|
||||
}},
|
||||
}});
|
||||
// We have to send this as two updates to conform to openai behavior
|
||||
json initial_ret = json{{"choices", json::array({json{
|
||||
{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{
|
||||
{"role", "assistant"}
|
||||
}}}})},
|
||||
{"created", t},
|
||||
{"id", oaicompat_cmpl_id},
|
||||
{"model", oaicompat_model},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
json second_ret = json{
|
||||
{"choices", json::array({json{{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{
|
||||
{"content", content}}}
|
||||
}})},
|
||||
{"created", t},
|
||||
{"id", oaicompat_cmpl_id},
|
||||
{"model", oaicompat_model},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
return std::vector<json>({initial_ret, second_ret});
|
||||
}
|
||||
} else {
|
||||
choices = json::array({json{
|
||||
{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta",
|
||||
json{
|
||||
{"content", content},
|
||||
}},
|
||||
}});
|
||||
}
|
||||
|
||||
json ret = json {
|
||||
|
@ -678,14 +693,6 @@ struct server_task_result_cmpl_partial : server_task_result {
|
|||
ret.push_back({"timings", timings.to_json()});
|
||||
}
|
||||
|
||||
if (!finish_reason.empty()) {
|
||||
ret.push_back({"usage", json {
|
||||
{"completion_tokens", n_decoded},
|
||||
{"prompt_tokens", n_prompt_tokens},
|
||||
{"total_tokens", n_decoded + n_prompt_tokens},
|
||||
}});
|
||||
}
|
||||
|
||||
return std::vector<json>({ret});
|
||||
}
|
||||
};
|
||||
|
@ -1888,12 +1895,9 @@ struct server_context {
|
|||
res->index = slot.index;
|
||||
res->content = tkn.text_to_send;
|
||||
|
||||
res->truncated = slot.truncated;
|
||||
res->n_decoded = slot.n_decoded;
|
||||
res->n_prompt_tokens = slot.n_prompt_tokens;
|
||||
|
||||
res->stop = slot.stop;
|
||||
|
||||
res->verbose = slot.params.verbose;
|
||||
res->oaicompat = slot.params.oaicompat;
|
||||
res->oaicompat_chat = slot.params.oaicompat_chat;
|
||||
|
@ -1924,12 +1928,6 @@ struct server_context {
|
|||
}
|
||||
|
||||
void send_final_response(server_slot & slot) {
|
||||
if (slot.params.stream) {
|
||||
// if in stream mode, send the last partial response
|
||||
send_partial_response(slot, {0, "", {}});
|
||||
return;
|
||||
}
|
||||
|
||||
auto res = std::make_unique<server_task_result_cmpl_final>();
|
||||
res->id = slot.id_task;
|
||||
res->id_slot = slot.id;
|
||||
|
@ -1948,6 +1946,7 @@ struct server_context {
|
|||
res->stop = slot.stop;
|
||||
|
||||
res->verbose = slot.params.verbose;
|
||||
res->stream = slot.params.stream;
|
||||
res->oaicompat = slot.params.oaicompat;
|
||||
res->oaicompat_chat = slot.params.oaicompat_chat;
|
||||
res->oaicompat_model = slot.params.oaicompat_model;
|
||||
|
@ -2100,7 +2099,10 @@ struct server_context {
|
|||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(dynamic_cast<server_task_result_cmpl_partial*>(result.get()) != nullptr);
|
||||
GGML_ASSERT(
|
||||
dynamic_cast<server_task_result_cmpl_partial*>(result.get()) != nullptr
|
||||
|| dynamic_cast<server_task_result_cmpl_final*>(result.get()) != nullptr
|
||||
);
|
||||
if (!result_handler(result)) {
|
||||
cancel_tasks(id_tasks);
|
||||
break;
|
||||
|
@ -3482,6 +3484,11 @@ int main(int argc, char ** argv) {
|
|||
json data = json::parse(req.body);
|
||||
|
||||
// validate input
|
||||
if (data.contains("prompt") && !data.at("prompt").is_string()) {
|
||||
// prompt is optional
|
||||
res_error(res, format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST));
|
||||
}
|
||||
|
||||
if (!data.contains("input_prefix")) {
|
||||
res_error(res, format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST));
|
||||
}
|
||||
|
@ -3491,9 +3498,11 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
if (data.contains("input_extra") && !data.at("input_extra").is_array()) {
|
||||
// input_extra is optional
|
||||
res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST));
|
||||
return;
|
||||
}
|
||||
|
||||
json input_extra = json_value(data, "input_extra", json::array());
|
||||
for (const auto & chunk : input_extra) {
|
||||
// { "text": string, "filename": string }
|
||||
|
@ -3509,6 +3518,21 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
data["input_extra"] = input_extra; // default to empty array if it's not exist
|
||||
|
||||
std::string prompt = json_value(data, "prompt", std::string());
|
||||
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
|
||||
SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
|
||||
data["prompt"] = format_infill(
|
||||
ctx_server.ctx,
|
||||
data.at("input_prefix"),
|
||||
data.at("input_suffix"),
|
||||
data.at("input_extra"),
|
||||
ctx_server.params_base.n_batch,
|
||||
ctx_server.params_base.n_predict,
|
||||
ctx_server.slots[0].n_ctx, // TODO: there should be a better way
|
||||
ctx_server.params_base.spm_infill,
|
||||
tokenized_prompts[0]
|
||||
);
|
||||
|
||||
return handle_completions_generic(SERVER_TASK_TYPE_INFILL, data, res);
|
||||
};
|
||||
|
||||
|
|
|
@ -42,10 +42,16 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp
|
|||
})
|
||||
content = ""
|
||||
for data in res:
|
||||
assert "stop" in data and type(data["stop"]) == bool
|
||||
if data["stop"]:
|
||||
assert data["timings"]["prompt_n"] == n_prompt
|
||||
assert data["timings"]["predicted_n"] == n_predicted
|
||||
assert data["truncated"] == truncated
|
||||
assert data["stop_type"] == "limit"
|
||||
assert "generation_settings" in data
|
||||
assert server.n_predict is not None
|
||||
assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict)
|
||||
assert data["generation_settings"]["seed"] == server.seed
|
||||
assert match_regex(re_content, content)
|
||||
else:
|
||||
content += data["content"]
|
||||
|
|
|
@ -13,28 +13,28 @@ def test_infill_without_input_extra():
|
|||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/infill", data={
|
||||
"prompt": "Complete this",
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
|
||||
"prompt": " int n_threads = llama_",
|
||||
"input_suffix": "}\n",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"])
|
||||
assert match_regex("(Ann|small|shiny)+", res.body["content"])
|
||||
|
||||
|
||||
def test_infill_with_input_extra():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/infill", data={
|
||||
"prompt": "Complete this",
|
||||
"input_extra": [{
|
||||
"filename": "llama.h",
|
||||
"text": "LLAMA_API int32_t llama_n_threads();\n"
|
||||
}],
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
|
||||
"prompt": " int n_threads = llama_",
|
||||
"input_suffix": "}\n",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"])
|
||||
assert match_regex("(Dad|excited|park)+", res.body["content"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input_extra", [
|
||||
|
@ -48,10 +48,30 @@ def test_invalid_input_extra_req(input_extra):
|
|||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/infill", data={
|
||||
"prompt": "Complete this",
|
||||
"input_extra": [input_extra],
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
|
||||
"prompt": " int n_threads = llama_",
|
||||
"input_suffix": "}\n",
|
||||
})
|
||||
assert res.status_code == 400
|
||||
assert "error" in res.body
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
|
||||
def test_with_qwen_model():
|
||||
global server
|
||||
server.model_file = None
|
||||
server.model_hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-IQ3_XXS-GGUF"
|
||||
server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf"
|
||||
server.start(timeout_seconds=600)
|
||||
res = server.make_request("POST", "/infill", data={
|
||||
"input_extra": [{
|
||||
"filename": "llama.h",
|
||||
"text": "LLAMA_API int32_t llama_n_threads();\n"
|
||||
}],
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
|
||||
"prompt": " int n_threads = llama_",
|
||||
"input_suffix": "}\n",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert res.body["content"] == "n_threads();\n printf(\"Number of threads: %d\\n\", n_threads);\n return 0;\n"
|
||||
|
|
|
@ -371,3 +371,6 @@ def match_regex(regex: str, text: str) -> bool:
|
|||
).search(text)
|
||||
is not None
|
||||
)
|
||||
|
||||
def is_slow_test_allowed():
|
||||
return os.environ.get("SLOW_TESTS") == "1" or os.environ.get("SLOW_TESTS") == "ON"
|
||||
|
|
|
@ -16,6 +16,5 @@ void main() {
|
|||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
data_d[i] = D_TYPE(tanh(data_a[i]));
|
||||
data_d[i] = D_TYPE(1. - 2. / (exp(2.*data_a[i]) + 1.));
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue