Merge commit '81bc9214a3' into nomic-vulkan

This commit is contained in:
Jared Van Bortel 2023-12-13 17:25:15 -05:00
commit 747e1eafcf
61 changed files with 3432 additions and 591 deletions

View file

@ -32,6 +32,7 @@ else()
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(lookahead)
add_subdirectory(train-text-from-scratch)
if (LLAMA_METAL)
add_subdirectory(metal)

View file

@ -155,7 +155,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");

View file

@ -1,4 +1,4 @@
This is a swift clone of `examples/batched`.
$ `make`
$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`

View file

@ -153,7 +153,7 @@ while n_cur <= n_len {
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? -> mark the stream as finished
if new_token_id == llama_token_eos(context) || n_cur == n_len {
if new_token_id == llama_token_eos(model) || n_cur == n_len {
i_batch[i] = -1
// print("")
if n_parallel > 1 {
@ -215,9 +215,10 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end
llama_print_timings(context)
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let n_tokens = text.count + (add_bos ? 1 : 0)
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
var swiftTokens: [llama_token] = []
for i in 0 ..< tokenCount {
swiftTokens.append(tokens[Int(i)])
@ -230,18 +231,15 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
var result = [CChar](repeating: 0, count: 8)
let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
if nTokens < 0 {
if result.count >= -Int(nTokens) {
result.removeLast(-Int(nTokens))
} else {
result.removeAll()
}
let actualTokensCount = -Int(nTokens)
result = .init(repeating: 0, count: actualTokensCount)
let check = llama_token_to_piece(
model,
token,
&result,
Int32(result.count)
)
assert(check == nTokens)
assert(check == actualTokensCount)
} else {
result.removeLast(result.count - Int(nTokens))
}
@ -259,5 +257,4 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
buffer = []
return bufferString
}
return nil
}

1
examples/llama.swiftui/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
xcuserdata

View file

@ -0,0 +1,7 @@
# llama.swiftui
Local inference of llama.cpp on an iPhone.
So far I only tested with starcoder 1B model, but it can most likely handle 7B models as well.
https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545

View file

@ -0,0 +1,208 @@
import Foundation
// import llama
enum LlamaError: Error {
case couldNotInitializeContext
}
actor LlamaContext {
private var model: OpaquePointer
private var context: OpaquePointer
private var batch: llama_batch
private var tokens_list: [llama_token]
/// This variable is used to store temporarily invalid cchars
private var temporary_invalid_cchars: [CChar]
var n_len: Int32 = 512
var n_cur: Int32 = 0
var n_decode: Int32 = 0
init(model: OpaquePointer, context: OpaquePointer) {
self.model = model
self.context = context
self.tokens_list = []
self.batch = llama_batch_init(512, 0, 1)
self.temporary_invalid_cchars = []
}
deinit {
llama_free(context)
llama_free_model(model)
llama_backend_free()
}
static func createContext(path: String) throws -> LlamaContext {
llama_backend_init(false)
let model_params = llama_model_default_params()
let model = llama_load_model_from_file(path, model_params)
guard let model else {
print("Could not load model at \(path)")
throw LlamaError.couldNotInitializeContext
}
var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 2048
ctx_params.n_threads = 8
ctx_params.n_threads_batch = 8
let context = llama_new_context_with_model(model, ctx_params)
guard let context else {
print("Could not load context!")
throw LlamaError.couldNotInitializeContext
}
return LlamaContext(model: model, context: context)
}
func get_n_tokens() -> Int32 {
return batch.n_tokens;
}
func completion_init(text: String) {
print("attempting to complete \"\(text)\"")
tokens_list = tokenize(text: text, add_bos: true)
temporary_invalid_cchars = []
let n_ctx = llama_n_ctx(context)
let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)")
if n_kv_req > n_ctx {
print("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
}
for id in tokens_list {
print(String(cString: token_to_piece(token: id) + [0]))
}
// batch = llama_batch_init(512, 0) // done in init()
batch.n_tokens = Int32(tokens_list.count)
for i1 in 0..<batch.n_tokens {
let i = Int(i1)
batch.token[i] = tokens_list[i]
batch.pos[i] = i1
batch.n_seq_id[Int(i)] = 1
batch.seq_id[Int(i)]![0] = 0
batch.logits[i] = 0
}
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
if llama_decode(context, batch) != 0 {
print("llama_decode() failed")
}
n_cur = batch.n_tokens
}
func completion_loop() -> String {
var new_token_id: llama_token = 0
let n_vocab = llama_n_vocab(model)
let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
var candidates = Array<llama_token_data>()
candidates.reserveCapacity(Int(n_vocab))
for token_id in 0..<n_vocab {
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
}
candidates.withUnsafeMutableBufferPointer() { buffer in
var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
new_token_id = llama_sample_token_greedy(context, &candidates_p)
}
if new_token_id == llama_token_eos(context) || n_cur == n_len {
print("\n")
let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
return new_token_str
}
let new_token_cchars = token_to_piece(token: new_token_id)
temporary_invalid_cchars.append(contentsOf: new_token_cchars)
let new_token_str: String
if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
temporary_invalid_cchars.removeAll()
new_token_str = string
} else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
// in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
let string = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
new_token_str = string
} else {
new_token_str = ""
}
print(new_token_str)
// tokens_list.append(new_token_id)
batch.n_tokens = 0
batch.token[Int(batch.n_tokens)] = new_token_id
batch.pos[Int(batch.n_tokens)] = n_cur
batch.n_seq_id[Int(batch.n_tokens)] = 1
batch.seq_id[Int(batch.n_tokens)]![0] = 0
batch.logits[Int(batch.n_tokens)] = 1 // true
batch.n_tokens += 1
n_decode += 1
n_cur += 1
if llama_decode(context, batch) != 0 {
print("failed to evaluate llama!")
}
return new_token_str
}
func clear() {
tokens_list.removeAll()
temporary_invalid_cchars.removeAll()
}
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
var swiftTokens: [llama_token] = []
for i in 0..<tokenCount {
swiftTokens.append(tokens[Int(i)])
}
tokens.deallocate()
return swiftTokens
}
/// - note: The result does not contain null-terminator
private func token_to_piece(token: llama_token) -> [CChar] {
let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
result.initialize(repeating: Int8(0), count: 8)
defer {
result.deallocate()
}
let nTokens = llama_token_to_piece(model, token, result, 8)
if nTokens < 0 {
let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
defer {
newResult.deallocate()
}
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
return Array(bufferPointer)
} else {
let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
return Array(bufferPointer)
}
}
}

View file

@ -0,0 +1,5 @@
//
// Use this file to import your target's public headers that you would like to expose to Swift.
//
#import "llama.h"

View file

@ -0,0 +1,481 @@
// !$*UTF8*$!
{
archiveVersion = 1;
classes = {
};
objectVersion = 56;
objects = {
/* Begin PBXBuildFile section */
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; };
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; };
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; };
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = "<group>"; };
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
8A1C83702AC328BD0096AF73 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
isa = PBXGroup;
children = (
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
542376092B0D9C40008E6A1C /* ggml-backend.h */,
542376062B0D9BEA008E6A1C /* ggml-quants.h */,
542376072B0D9BFB008E6A1C /* ggml-quants.c */,
549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
549479C62AC9E0F200E0F78B /* ggml-metal.h */,
549479C52AC9E0F200E0F78B /* ggml-metal.m */,
542EA09B2AC8723900A8AEE9 /* ggml.c */,
542EA09C2AC8723900A8AEE9 /* ggml.h */,
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
542EA0A12AC8729100A8AEE9 /* llama.cpp */,
542EA0A22AC8729100A8AEE9 /* llama.h */,
);
name = llama.cpp;
sourceTree = "<group>";
};
8A1C836A2AC328BD0096AF73 = {
isa = PBXGroup;
children = (
8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
8A907F312AC7134E006146EA /* llama.cpp.swift */,
8A3F84232AC4C891005E2EE8 /* models */,
8A1C83752AC328BD0096AF73 /* llama.swiftui */,
8A1C83742AC328BD0096AF73 /* Products */,
8A39BE082AC7601000BFEB40 /* Frameworks */,
);
sourceTree = "<group>";
};
8A1C83742AC328BD0096AF73 /* Products */ = {
isa = PBXGroup;
children = (
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
);
name = Products;
sourceTree = "<group>";
};
8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
isa = PBXGroup;
children = (
8A3F84102AC4BD85005E2EE8 /* Resources */,
8A9F7C4B2AC332DC008AE1EA /* Models */,
8A9F7C4A2AC332BF008AE1EA /* UI */,
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
8A1C837C2AC328BE0096AF73 /* Preview Content */,
);
path = llama.swiftui;
sourceTree = "<group>";
};
8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
isa = PBXGroup;
children = (
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
);
path = "Preview Content";
sourceTree = "<group>";
};
8A39BE082AC7601000BFEB40 /* Frameworks */ = {
isa = PBXGroup;
children = (
549479CA2AC9E16000E0F78B /* Metal.framework */,
8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
);
name = Frameworks;
sourceTree = "<group>";
};
8A3F84102AC4BD85005E2EE8 /* Resources */ = {
isa = PBXGroup;
children = (
8A3F84112AC4BD8C005E2EE8 /* models */,
);
path = Resources;
sourceTree = "<group>";
};
8A3F84112AC4BD8C005E2EE8 /* models */ = {
isa = PBXGroup;
children = (
8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */,
);
path = models;
sourceTree = "<group>";
};
8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
isa = PBXGroup;
children = (
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
8A907F322AC7134E006146EA /* LibLlama.swift */,
);
path = llama.cpp.swift;
sourceTree = "<group>";
};
8A9F7C4A2AC332BF008AE1EA /* UI */ = {
isa = PBXGroup;
children = (
8A1C83782AC328BD0096AF73 /* ContentView.swift */,
);
path = UI;
sourceTree = "<group>";
};
8A9F7C4B2AC332DC008AE1EA /* Models */ = {
isa = PBXGroup;
children = (
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
);
path = Models;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
isa = PBXNativeTarget;
buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
buildPhases = (
8A1C836F2AC328BD0096AF73 /* Sources */,
8A1C83702AC328BD0096AF73 /* Frameworks */,
8A1C83712AC328BD0096AF73 /* Resources */,
);
buildRules = (
);
dependencies = (
);
name = llama.swiftui;
packageProductDependencies = (
);
productName = llama.swiftui;
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
productType = "com.apple.product-type.application";
};
/* End PBXNativeTarget section */
/* Begin PBXProject section */
8A1C836B2AC328BD0096AF73 /* Project object */ = {
isa = PBXProject;
attributes = {
BuildIndependentTargetsInParallel = 1;
LastSwiftUpdateCheck = 1500;
LastUpgradeCheck = 1500;
TargetAttributes = {
8A1C83722AC328BD0096AF73 = {
CreatedOnToolsVersion = 15.0;
LastSwiftMigration = 1500;
};
};
};
buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
compatibilityVersion = "Xcode 14.0";
developmentRegion = en;
hasScannedForEncodings = 0;
knownRegions = (
en,
Base,
);
mainGroup = 8A1C836A2AC328BD0096AF73;
packageReferences = (
);
productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
projectDirPath = "";
projectRoot = "";
targets = (
8A1C83722AC328BD0096AF73 /* llama.swiftui */,
);
};
/* End PBXProject section */
/* Begin PBXResourcesBuildPhase section */
8A1C83712AC328BD0096AF73 /* Resources */ = {
isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647;
files = (
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
8A3F84242AC4C891005E2EE8 /* models in Resources */,
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXResourcesBuildPhase section */
/* Begin PBXSourcesBuildPhase section */
8A1C836F2AC328BD0096AF73 /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXSourcesBuildPhase section */
/* Begin XCBuildConfiguration section */
8A1C837F2AC328BE0096AF73 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = dwarf;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_TESTABILITY = YES;
ENABLE_USER_SCRIPT_SANDBOXING = YES;
GCC_C_LANGUAGE_STANDARD = gnu17;
GCC_DYNAMIC_NO_PIC = NO;
GCC_NO_COMMON_BLOCKS = YES;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1",
"$(inherited)",
);
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
MTL_FAST_MATH = YES;
ONLY_ACTIVE_ARCH = YES;
SDKROOT = iphoneos;
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
};
name = Debug;
};
8A1C83802AC328BE0096AF73 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
ENABLE_NS_ASSERTIONS = NO;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_USER_SCRIPT_SANDBOXING = YES;
GCC_C_LANGUAGE_STANDARD = gnu17;
GCC_NO_COMMON_BLOCKS = YES;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
MTL_ENABLE_DEBUG_INFO = NO;
MTL_FAST_MATH = YES;
SDKROOT = iphoneos;
SWIFT_COMPILATION_MODE = wholemodule;
VALIDATE_PRODUCT = YES;
};
name = Release;
};
8A1C83822AC328BE0096AF73 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
DEVELOPMENT_TEAM = STLSG3FG8Q;
ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
MARKETING_VERSION = 1.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Debug;
};
8A1C83832AC328BE0096AF73 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
DEVELOPMENT_TEAM = STLSG3FG8Q;
ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
MARKETING_VERSION = 1.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Release;
};
/* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */
8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
isa = XCConfigurationList;
buildConfigurations = (
8A1C837F2AC328BE0096AF73 /* Debug */,
8A1C83802AC328BE0096AF73 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
isa = XCConfigurationList;
buildConfigurations = (
8A1C83822AC328BE0096AF73 /* Debug */,
8A1C83832AC328BE0096AF73 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
};
rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
}

View file

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<Workspace
version = "1.0">
<FileRef
location = "self:">
</FileRef>
</Workspace>

View file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>IDEDidComputeMac32BitWarning</key>
<true/>
</dict>
</plist>

View file

@ -0,0 +1,11 @@
{
"colors" : [
{
"idiom" : "universal"
}
],
"info" : {
"author" : "xcode",
"version" : 1
}
}

View file

@ -0,0 +1,13 @@
{
"images" : [
{
"idiom" : "universal",
"platform" : "ios",
"size" : "1024x1024"
}
],
"info" : {
"author" : "xcode",
"version" : 1
}
}

View file

@ -0,0 +1,6 @@
{
"info" : {
"author" : "xcode",
"version" : 1
}
}

View file

@ -0,0 +1,45 @@
import Foundation
@MainActor
class LlamaState: ObservableObject {
@Published var messageLog = ""
private var llamaContext: LlamaContext?
private var modelUrl: URL? {
Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models")
// Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
}
init() {
do {
try loadModel()
} catch {
messageLog += "Error!\n"
}
}
private func loadModel() throws {
messageLog += "Loading model...\n"
if let modelUrl {
llamaContext = try LlamaContext.createContext(path: modelUrl.path())
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
} else {
messageLog += "Could not locate model\n"
}
}
func complete(text: String) async {
guard let llamaContext else {
return
}
messageLog += "Attempting to complete text...\n"
await llamaContext.completion_init(text: text)
messageLog += "\(text)"
while await llamaContext.n_cur <= llamaContext.n_len {
let result = await llamaContext.completion_loop()
messageLog += "\(result)"
}
await llamaContext.clear()
messageLog += "\n\ndone\n"
}
}

View file

@ -0,0 +1,6 @@
{
"info" : {
"author" : "xcode",
"version" : 1
}
}

View file

@ -0,0 +1,42 @@
import SwiftUI
struct ContentView: View {
@StateObject var llamaState = LlamaState()
@State private var multiLineText = ""
var body: some View {
VStack {
ScrollView(.vertical) {
Text(llamaState.messageLog)
}
TextEditor(text: $multiLineText)
.frame(height: 200)
.padding()
.border(Color.gray, width: 0.5)
Button(action: {
sendText()
}) {
Text("Send")
.padding()
.background(Color.blue)
.foregroundColor(.white)
.cornerRadius(8)
}
}
.padding()
}
func sendText() {
Task {
await llamaState.complete(text: multiLineText)
multiLineText = ""
}
}
}
/*
#Preview {
ContentView()
}
*/

View file

@ -0,0 +1,10 @@
import SwiftUI
@main
struct llama_swiftuiApp: App {
var body: some Scene {
WindowGroup {
ContentView()
}
}
}

View file

@ -5,7 +5,7 @@ import json
import torch
import numpy as np
from gguf import *
from transformers import CLIPModel, CLIPProcessor
from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
TEXT = "clip.text"
VISION = "clip.vision"
@ -78,11 +78,19 @@ ap.add_argument("--text-only", action="store_true", required=False,
help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
# with proper
args = ap.parse_args()
@ -96,15 +104,22 @@ if args.use_f32:
# output in the same directory as the model if output_dir is None
dir_model = args.model_dir
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
vocab = json.load(f)
tokens = [key for key in vocab]
if args.clip_model_is_vision:
vocab = None
tokens = None
else:
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
vocab = json.load(f)
tokens = [key for key in vocab]
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
config = json.load(f)
v_hparams = config["vision_config"]
t_hparams = config["text_config"]
if args.clip_model_is_vision:
v_hparams = config
t_hparams = None
else:
v_hparams = config["vision_config"]
t_hparams = config["text_config"]
# possible data types
# ftype == 0 -> float32
@ -117,9 +132,12 @@ ftype = 1
if args.use_f32:
ftype = 0
model = CLIPModel.from_pretrained(dir_model)
processor = CLIPProcessor.from_pretrained(dir_model)
if args.clip_model_is_vision:
model = CLIPVisionModel.from_pretrained(dir_model)
processor = None
else:
model = CLIPModel.from_pretrained(dir_model)
processor = CLIPProcessor.from_pretrained(dir_model)
fname_middle = None
has_text_encoder = True
@ -128,13 +146,13 @@ has_llava_projector = False
if args.text_only:
fname_middle = "text-"
has_vision_encoder = False
elif args.vision_only:
fname_middle = "vision-"
has_text_encoder = False
elif args.llava_projector is not None:
fname_middle = "mmproj-"
has_text_encoder = False
has_llava_projector = True
elif args.vision_only:
fname_middle = "vision-"
has_text_encoder = False
else:
fname_middle = ""
@ -182,8 +200,12 @@ if has_vision_encoder:
block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
if processor is not None:
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
else:
image_mean = args.image_mean if args.image_mean is not None else default_image_mean
image_std = args.image_std if args.image_std is not None else default_image_std
fout.add_array("clip.vision.image_mean", image_mean)
fout.add_array("clip.vision.image_std", image_std)

View file

@ -0,0 +1,5 @@
set(TARGET lookahead)
add_executable(${TARGET} lookahead.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -0,0 +1,7 @@
# llama.cpp/examples/lookahead
Demonstartion of lookahead decoding technique:
https://lmsys.org/blog/2023-11-21-lookahead-decoding/
More info: https://github.com/ggerganov/llama.cpp/pull/4207

View file

@ -0,0 +1,487 @@
#include "common.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
struct ngram_data {
bool active = false;
llama_seq_id seq_id = -1;
std::vector<int> i_batch;
std::vector<llama_token> tokens;
};
// n-gram container
struct ngram_container {
ngram_container(int n_vocab, int N, int G) {
cnt.resize(n_vocab);
head.resize(n_vocab);
tokens.resize(n_vocab * G * (N - 1));
}
int n_total = 0;
std::vector<int> cnt;
std::vector<int> head;
// [n_vocab][G][N - 1]
// for each token of the vocab, keep a ring-buffer of capacity G of n-grams of size N - 1
std::vector<llama_token> tokens;
};
int main(int argc, char ** argv) {
gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
const int W = 15; // lookahead window
const int N = 5; // n-gram size
const int G = 15; // max verification n-grams
const bool dump_kv_cache = params.dump_kv_cache;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("lookahead", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp
llama_backend_init(params.numa);
llama_model * model = NULL;
llama_context * ctx = NULL;
// load the target model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
// Tokenize the prompt
const bool add_bos = llama_should_add_bos_token(model);
LOG("add_bos tgt: %d\n", add_bos);
std::vector<llama_token> inp;
std::vector<llama_token> all;
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
all = inp;
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
fprintf(stderr, "\n\n");
for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
const int n_input = inp.size();
const auto t_enc_start = ggml_time_us();
// eval the prompt
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
for (int s = 1; s < W + G + 1; ++s) {
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
}
const auto t_enc_end = ggml_time_us();
int n_predict = 0;
int n_accept = 0;
int n_past = inp.size();
llama_token id = 0;
// used to determine end of generation
bool has_eos = false;
// for each decoded batch, we have at most W + G + 1 distinct sequences:
// seq_id == 0 : the current input token
// seq_id [1, W] : tokens from the past N - 1 Jacobi iterations
// seq_id [W + 1, W + G] : verification n-grams
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
// target model sampling context
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
// verification n-grams
std::vector<ngram_data> ngrams_cur(G);
// tokens for the past N - 1 Jacobi iterations
std::vector<llama_token> tokens_j_prev(W);
std::vector<std::vector<llama_token>> tokens_j(N - 1);
for (int j = 0; j < N - 1; j++) {
tokens_j[j].resize(W);
for (int i = 0; i < W; i++) {
// there are different ways to init these tokens
if (0) {
// initialize randomly from the prompt tokens
tokens_j[j][i] = all[1 + rand() % (all.size() - 1)];
} else {
// initialize with a sequence of increasing numbers
tokens_j[j][i] = 100 + i;
}
}
}
std::vector<llama_seq_id> seq_id_look;
// the input token belongs both to all sequences
std::vector<llama_seq_id> seq_id_all(W + G + 1);
for (int i = 0; i < W + G + 1; i++) {
seq_id_all[i] = i;
}
// here we keep adding new n-grams as we go
ngram_container ngrams_observed(llama_n_vocab(model), N, G);
// debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
const auto t_dec_start = ggml_time_us();
// sample first token
{
id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
llama_sampling_accept(ctx_sampling, ctx, id, true);
{
const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str());
fflush(stdout);
}
}
while (true) {
// debug
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
dump_kv_cache_view_seqs(kvc_view, 40);
}
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
//
// Example for W = 5, N = 4, G = 2:
// (I = input, L = lookahead, V = verification)
//
// Batch: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
// T: -2 -2 -2 -2 -1 -1 -1 -1 -1 0 0 0 0 0 0
// Info: I L L L L L L L L L L L L L L V V V V V V
// Pos: 0 1 2 3 4 1 2 3 4 5 2 3 4 5 6 1 2 3 1 2 3 (+ n_past)
// Logits: 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
// ---------------------------------------------------------------------
// Seq: 0
// 1 1 1
// 2 2 2 2
// 3 3 3 3 3
// 4 4 4 4 4 4
// 5 5 5 5 5 5 5
// 6 6 6 6
// 7 7 7 7
// ---------------------------------------------------------------------
// | | | | | | | | | | |
// V V V V V | | | | | |
// j_tokens | | | | | |
// V V V V V V
// id
{
llama_batch_clear(batch);
// current token - first token of the first level
llama_batch_add(batch, id, n_past, seq_id_all, true);
// verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
{
const int g_cur = ngrams_observed.cnt[id];
ngrams_cur.resize(g_cur);
for (int g = 0; g < g_cur; g++) {
ngrams_cur[g].active = true;
ngrams_cur[g].tokens.resize(N);
ngrams_cur[g].i_batch.resize(N);
ngrams_cur[g].seq_id = W + 1 + g;
ngrams_cur[g].i_batch[0] = 0;
ngrams_cur[g].tokens [0] = id;
}
for (int j = 0; j < N - 1; j++) {
for (int g = 0; g < g_cur; g++) {
const int idx = id*(N - 1)*G + g*(N - 1);
const llama_token t = ngrams_observed.tokens[idx + j];
ngrams_cur[g].tokens [j + 1] = t;
ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
}
}
}
// fill the remaining W - 1 tokens for the first level
for (int i = 1; i < W; i++) {
seq_id_look.resize(W - i);
for (int j = 0; j < W - i; j++) {
seq_id_look[j] = i + j + 1;
}
llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
}
// fill the rest of the levels
for (int j = 1; j < N - 1; j++) {
for (int i = 0; i < W; i++) {
llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
}
}
}
if (llama_decode(ctx, batch) != 0) {
fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
return 1;
}
int seq_id_best = 0;
for (int v = 0; v < N; ++v) {
int i_batch = 0;
// if no active ngrams are left, it means the sampled token does not pass the verification
if (v > 0) {
for (int g = 0; g < (int) ngrams_cur.size(); g++) {
if (ngrams_cur[g].active) {
i_batch = ngrams_cur[g].i_batch[v];
seq_id_best = ngrams_cur[g].seq_id;
++n_accept;
break;
}
}
// no more matches -> create a new batch
if (i_batch == 0) {
break;
}
}
// sample the next token
id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
llama_sampling_accept(ctx_sampling, ctx, id, true);
// print
{
const std::string token_str = llama_token_to_piece(ctx, id);
if (v == 0) {
printf("%s", token_str.c_str());
} else {
// print light cyan
printf("\033[0;96m%s\033[0m", token_str.c_str());
}
fflush(stdout);
if (id == llama_token_eos(model)) {
has_eos = true;
}
all.push_back(id);
}
++n_predict;
++n_past;
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
break;
}
// verify across active n-grams
for (int g = 0; g < (int) ngrams_cur.size(); g++) {
if (ngrams_cur[g].active) {
if (v == N - 1) {
ngrams_cur[g].active = false;
} else {
if (id != ngrams_cur[g].tokens[v + 1]) {
ngrams_cur[g].active = false;
}
}
}
}
// print known n-grams starting with token id (debug)
if (0 && v == 0) {
if (ngrams_observed.cnt[id] > 0) {
printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
}
for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
printf(" - ngram %2d: ", i);
const int idx = id*(N - 1)*G + i*(N - 1);
for (int j = 0; j < N - 1; j++) {
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
printf("%s", token_str.c_str());
}
printf("\n");
}
}
// update lookahead tokens
{
for (int i = 0; i < W; i++) {
tokens_j_prev[i] = tokens_j[0][i];
}
for (int j = 0; j < N - 2; j++) {
tokens_j[j] = tokens_j[j + 1];
}
if (v == 0) {
// sample from the last level
for (int i = 0; i < W; i++) {
tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
}
} else {
for (int i = 0; i < W; i++) {
// there are different ways to init these tokens
if (0) {
// random init
tokens_j[N - 2][i] = all[1 + rand() % (all.size() - 1)];
} else {
// init from the previous level
tokens_j[N - 2][i] = tokens_j[0][i];
}
}
}
}
// update observed ngrams
if (v == 0) {
// the first token of the n-gram is determined by the index in the container so it is not stored
std::vector<llama_token> ngram(N - 1);
// n-gram generation
// ref: https://github.com/hao-ai-lab/LookaheadDecoding/issues/14#issuecomment-1826198518
for (int f = 0; f < W; ++f) {
const int ft = tokens_j_prev[f]; // first token of the n-gram
for (int j = 0; j < N - 1; ++j) {
ngram[j] = tokens_j[j][f];
}
// filter-out repeating n-grams
{
bool is_unique = true;
for (int k = 0; k < ngrams_observed.cnt[ft]; ++k) {
const int idx = ft*(N - 1)*G + k*(N - 1);
bool is_match = true;
for (int j = 0; j < N - 1; ++j) {
if (ngrams_observed.tokens[idx + j] != ngram[j]) {
is_match = false;
break;
}
}
if (is_match) {
is_unique = false;
break;
}
}
if (!is_unique) {
continue;
}
}
const int head = ngrams_observed.head[ft];
const int idx = ft*(N - 1)*G + head*(N - 1);
for (int i = 0; i < N - 1; i++) {
ngrams_observed.tokens[idx + i] = ngram[i];
}
ngrams_observed.cnt[ft] = std::min(G, ngrams_observed.cnt[ft] + 1);
ngrams_observed.head[ft] = (head + 1) % G;
ngrams_observed.n_total++;
}
}
}
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
break;
}
// KV cache management
// if no verification token matched, we simply remove all cells from this batch -> no fragmentation
llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
if (seq_id_best != 0) {
// if a verification token matched, we keep the best sequence and remove the rest
// this leads to some KV cache fragmentation
llama_kv_cache_seq_keep(ctx, seq_id_best);
llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1);
llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1);
for (int s = 1; s < W + G + 1; ++s) {
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
}
}
}
auto t_dec_end = ggml_time_us();
LOG_TEE("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_TEE("\n");
LOG_TEE("W = %2d\n", W);
LOG_TEE("N = %2d\n", N);
LOG_TEE("G = %2d\n", G);
LOG_TEE("\n");
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_accept = %d\n", n_accept);
llama_print_timings(ctx);
llama_kv_cache_view_free(&kvc_view);
llama_sampling_free(ctx_sampling);
llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
fprintf(stderr, "\n\n");
return 0;
}

View file

@ -104,6 +104,12 @@ static void sigint_handler(int signo) {
}
#endif
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
}
int main(int argc, char ** argv) {
gpt_params params;
g_params = &params;
@ -117,6 +123,7 @@ int main(int argc, char ** argv) {
log_set_target(log_filename_generator("main", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
// TODO: Dump params ?
@ -438,6 +445,7 @@ int main(int argc, char ** argv) {
}
}
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");

View file

@ -234,6 +234,55 @@ node index.js
- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.
*Options:*
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such are `mirostat` are supported.
*Examples:*
You can use either Python `openai` library with appropriate checkpoints:
```python
import openai
client = openai.OpenAI(
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
api_key = "sk-no-key-required"
)
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
{"role": "user", "content": "Write a limerick about python exceptions"}
]
)
print(completion.choices[0].message)
```
... or raw HTTP requests:
```shell
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "system",
"content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
},
{
"role": "user",
"content": "Write a limerick about python exceptions"
}
]
}'
```
## More examples
### Change system prompt on runtime

View file

@ -11,10 +11,10 @@ app = Flask(__name__)
slot_id = -1
parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: '\\nUSER: ')", default="\\nUSER: ")
parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: '\\nASSISTANT: ')", default="\\nASSISTANT: ")
parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: '\\nASSISTANT's RULE: ')", default="\\nASSISTANT's RULE: ")
parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')
parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: 'USER: ')", default="USER: ")
parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: 'ASSISTANT: ')", default="ASSISTANT: ")
parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: 'ASSISTANT's RULE: ')", default="ASSISTANT's RULE: ")
parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '</s>')", default="</s>")
parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080')
parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="")
@ -34,19 +34,19 @@ def is_present(json, key):
#convert chat to prompt
def convert_chat(messages):
prompt = "" + args.chat_prompt.replace("\\n", "\n")
system_n = args.system_name.replace("\\n", "\n")
user_n = args.user_name.replace("\\n", "\n")
ai_n = args.ai_name.replace("\\n", "\n")
stop = args.stop.replace("\\n", "\n")
system_n = args.system_name
user_n = args.user_name
ai_n = args.ai_name
stop = args.stop
prompt = "" + args.chat_prompt + stop
for line in messages:
if (line["role"] == "system"):
prompt += f"{system_n}{line['content']}"
prompt += f"{system_n}{line['content']}{stop}"
if (line["role"] == "user"):
prompt += f"{user_n}{line['content']}"
prompt += f"{user_n}{line['content']}{stop}"
if (line["role"] == "assistant"):
prompt += f"{ai_n}{line['content']}{stop}"
prompt += ai_n.rstrip()
@ -70,6 +70,7 @@ def make_postData(body, chat=False, stream=False):
if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"]
if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"]
if(is_present(body, "seed")): postData["seed"] = body["seed"]
if(is_present(body, "grammar")): postData["grammar"] = body["grammar"]
if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()]
if (args.stop != ""):
postData["stop"] = [args.stop]
@ -130,7 +131,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
}
]
}
slot_id = data["slot_id"]
slot_id = data.get("slot_id")
if (chat):
if (start):
resData["choices"][0]["delta"] = {
@ -150,11 +151,13 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
return resData
@app.route('/chat/completions', methods=['POST'])
@app.route('/v1/chat/completions', methods=['POST'])
@app.route('/chat/completions', methods=['POST', 'OPTIONS'])
@app.route('/v1/chat/completions', methods=['POST', 'OPTIONS'])
def chat_completions():
if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
return Response(status=403)
if request.method == 'OPTIONS':
return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
body = request.get_json()
stream = False
tokenize = False
@ -177,20 +180,22 @@ def chat_completions():
data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
time_now = int(time.time())
resData = make_resData_stream({}, chat=True, time_now=time_now, start=True)
yield 'data: {}\n'.format(json.dumps(resData))
yield 'data: {}\n\n'.format(json.dumps(resData))
for line in data.iter_lines():
if line:
decoded_line = line.decode('utf-8')
resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now)
yield 'data: {}\n'.format(json.dumps(resData))
return Response(generate(), mimetype='text/event-stream')
yield 'data: {}\n\n'.format(json.dumps(resData))
return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
@app.route('/completions', methods=['POST'])
@app.route('/v1/completions', methods=['POST'])
@app.route('/completions', methods=['POST', 'OPTIONS'])
@app.route('/v1/completions', methods=['POST', 'OPTIONS'])
def completion():
if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
return Response(status=403)
if request.method == 'OPTIONS':
return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
body = request.get_json()
stream = False
tokenize = False
@ -216,8 +221,8 @@ def completion():
if line:
decoded_line = line.decode('utf-8')
resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now)
yield 'data: {}\n'.format(json.dumps(resData))
return Response(generate(), mimetype='text/event-stream')
yield 'data: {}\n\n'.format(json.dumps(resData))
return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
if __name__ == '__main__':
app.run(args.host, port=args.port)

View file

@ -29,6 +29,8 @@
#define SERVER_VERBOSE 1
#endif
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
using json = nlohmann::json;
struct server_params
@ -59,6 +61,10 @@ static bool server_verbose = false;
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
json oaicompat_completion_params_parse(const json &body);
std::string format_chatml(std::vector<json> messages);
//
// base64 utils (TODO: move to common in the future)
//
@ -149,15 +155,23 @@ struct task_server {
json data;
bool infill_mode = false;
bool embedding_mode = false;
int multitask_id = -1;
};
struct task_result {
int id;
int multitask_id = -1;
bool stop;
bool error;
json result_json;
};
struct task_multi {
int id;
std::set<int> subtasks_remaining{};
std::vector<task_result> results{};
};
// TODO: can become bool if we can't find use of more states
enum slot_state
{
@ -378,6 +392,9 @@ struct llama_client_slot
bool stopped_word = false;
bool stopped_limit = false;
bool oaicompat = false;
std::string oaicompat_model;
std::string stopping_word;
// sampling
@ -397,6 +414,9 @@ struct llama_client_slot
double t_prompt_processing; // ms
double t_token_generation; // ms
// multitasks
int multitask_id = -1;
void reset() {
num_prompt_tokens = 0;
generated_text = "";
@ -477,7 +497,7 @@ struct llama_client_slot
};
}
void print_timings() {
void print_timings() const {
LOG_TEE("\n");
LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
@ -520,7 +540,8 @@ struct llama_server_context
std::vector<task_server> queue_tasks;
std::vector<task_result> queue_results;
std::mutex mutex_tasks;
std::vector<task_multi> queue_multitasks;
std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
std::mutex mutex_results;
~llama_server_context()
@ -609,6 +630,11 @@ struct llama_server_context
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
{
// TODO: currently, we tokenize using special tokens by default
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
// but it's better compared to completely ignoring ChatML and other chat templates
const bool TMP_FORCE_SPECIAL = true;
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
// or the first element of the json_prompt array is a string.
std::vector<llama_token> prompt_tokens;
@ -624,12 +650,12 @@ struct llama_server_context
std::vector<llama_token> p;
if (first)
{
p = ::llama_tokenize(ctx, s, add_bos);
p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
first = false;
}
else
{
p = ::llama_tokenize(ctx, s, false);
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
}
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
}
@ -646,7 +672,7 @@ struct llama_server_context
else
{
auto s = json_prompt.template get<std::string>();
prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
}
return prompt_tokens;
@ -677,6 +703,14 @@ struct llama_server_context
slot_params default_params;
llama_sampling_params default_sparams;
if (data.count("__oaicompat") != 0) {
slot->oaicompat = true;
slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
} else {
slot->oaicompat = false;
slot->oaicompat_model = "";
}
slot->params.stream = json_value(data, "stream", false);
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
@ -1090,16 +1124,40 @@ struct llama_server_context
return slot.images.size() > 0;
}
void send_error(int id, std::string error)
void send_error(task_server& task, std::string error)
{
std::lock_guard<std::mutex> lock(mutex_results);
task_result res;
res.id = id;
res.id = task.id;
res.multitask_id = task.multitask_id;
res.stop = false;
res.error = true;
res.result_json = { { "content", error } };
queue_results.push_back(res);
}
void add_multi_task(int id, std::vector<int>& sub_ids)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
task_multi multi;
multi.id = id;
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
queue_multitasks.push_back(multi);
}
void update_multi_task(int multitask_id, int subtask_id, task_result& result)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
for (auto& multitask : queue_multitasks)
{
if (multitask.id == multitask_id)
{
multitask.subtasks_remaining.erase(subtask_id);
multitask.results.push_back(result);
}
}
}
json get_model_props()
{
return get_formated_generation(slots[0]);
@ -1144,6 +1202,7 @@ struct llama_server_context
std::lock_guard<std::mutex> lock(mutex_results);
task_result res;
res.id = slot.task_id;
res.multitask_id = slot.multitask_id;
res.error = false;
res.stop = false;
@ -1169,6 +1228,12 @@ struct llama_server_context
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
}
if (slot.oaicompat)
{
res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
res.result_json["model"] = slot.oaicompat_model;
}
queue_results.push_back(res);
}
@ -1177,6 +1242,7 @@ struct llama_server_context
std::lock_guard<std::mutex> lock(mutex_results);
task_result res;
res.id = slot.task_id;
res.multitask_id = slot.multitask_id;
res.error = false;
res.stop = true;
@ -1216,6 +1282,18 @@ struct llama_server_context
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
}
if (slot.oaicompat)
{
res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
res.result_json["model"] = slot.oaicompat_model;
}
// parent multitask, if any, needs to be updated
if (slot.multitask_id != -1)
{
update_multi_task(slot.multitask_id, slot.task_id, res);
}
queue_results.push_back(res);
}
@ -1224,6 +1302,7 @@ struct llama_server_context
std::lock_guard<std::mutex> lock(mutex_results);
task_result res;
res.id = slot.task_id;
res.multitask_id = slot.multitask_id;
res.error = false;
res.stop = true;
@ -1250,15 +1329,26 @@ struct llama_server_context
queue_results.push_back(res);
}
int request_completion(json data, bool infill, bool embedding)
int request_completion(json data, bool infill, bool embedding, int multitask_id)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
std::unique_lock<std::mutex> lock(mutex_tasks);
task_server task;
task.id = id_gen++;
task.data = data;
task.target_id = 0;
task.data = std::move(data);
task.infill_mode = infill;
task.embedding_mode = embedding;
task.type = COMPLETION_TASK;
task.multitask_id = multitask_id;
// when a completion task's prompt array is not a singleton, we split it into multiple requests
if (task.data.at("prompt").size() > 1)
{
lock.unlock(); // entering new func scope
return split_multiprompt_task(task);
}
// otherwise, it's a single-prompt task, we actually queue it
queue_tasks.push_back(task);
return task.id;
}
@ -1277,8 +1367,17 @@ struct llama_server_context
for (int i = 0; i < (int) queue_results.size(); i++)
{
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
if (queue_results[i].multitask_id == task_id)
{
update_multi_task(task_id, queue_results[i].id, queue_results[i]);
queue_results.erase(queue_results.begin() + i);
continue;
}
if (queue_results[i].id == task_id)
{
assert(queue_results[i].multitask_id == -1);
task_result res = queue_results[i];
queue_results.erase(queue_results.begin() + i);
return res;
@ -1368,6 +1467,27 @@ struct llama_server_context
queue_tasks.push_back(task);
}
int split_multiprompt_task(task_server& multiprompt_task)
{
int prompt_count = multiprompt_task.data.at("prompt").size();
assert(prompt_count > 1);
int multitask_id = id_gen++;
std::vector<int> subtask_ids(prompt_count);
for (int i = 0; i < prompt_count; i++)
{
json subtask_data = multiprompt_task.data;
subtask_data["prompt"] = subtask_data["prompt"][i];
// subtasks inherit everything else (infill mode, embedding mode, etc.)
subtask_ids[i] = request_completion(subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
}
// queue up the multitask so we can track its subtask progression
add_multi_task(multitask_id, subtask_ids);
return multitask_id;
}
void process_tasks()
{
std::lock_guard<std::mutex> lock(mutex_tasks);
@ -1383,7 +1503,7 @@ struct llama_server_context
{
LOG_TEE("slot unavailable\n");
// send error result
send_error(task.id, "slot unavailable");
send_error(task, "slot unavailable");
return;
}
@ -1397,11 +1517,12 @@ struct llama_server_context
slot->infill = task.infill_mode;
slot->embedding = task.embedding_mode;
slot->task_id = task.id;
slot->multitask_id = task.multitask_id;
if (!launch_slot_with_data(slot, task.data))
{
// send error result
send_error(task.id, "internal_error");
send_error(task, "internal_error");
break;
}
} break;
@ -1417,6 +1538,38 @@ struct llama_server_context
} break;
}
}
// remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
auto queue_iterator = queue_multitasks.begin();
while (queue_iterator != queue_multitasks.end())
{
if (queue_iterator->subtasks_remaining.empty())
{
// all subtasks done == multitask is done
task_result aggregate_result;
aggregate_result.id = queue_iterator->id;
aggregate_result.stop = true;
aggregate_result.error = false;
// collect json results into one json result
std::vector<json> result_jsons;
for (auto& subres : queue_iterator->results)
{
result_jsons.push_back(subres.result_json);
aggregate_result.error = aggregate_result.error && subres.error;
}
aggregate_result.result_json = json{ "results", result_jsons };
std::lock_guard<std::mutex> lock(mutex_results);
queue_results.push_back(aggregate_result);
queue_iterator = queue_multitasks.erase(queue_iterator);
}
else
{
++queue_iterator;
}
}
}
bool update_slots() {
@ -1808,6 +1961,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -spf FNAME, --system-prompt-file FNAME\n");
printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-disable disables logging to a file.\n");
printf("\n");
}
@ -2162,6 +2316,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.mmproj = argv[i];
}
else if (arg == "--log-disable")
{
log_set_target(stdout);
LOG_INFO("logging to file is disabled.", {});
}
else
{
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@ -2178,6 +2337,232 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
}
static std::string random_string()
{
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
std::random_device rd;
std::mt19937 generator(rd());
std::string result(32, ' ');
for (int i = 0; i < 32; ++i) {
result[i] = str[generator() % str.size()];
}
return result;
}
static std::string gen_chatcmplid()
{
std::stringstream chatcmplid;
chatcmplid << "chatcmpl-" << random_string();
return chatcmplid.str();
}
std::string format_chatml(std::vector<json> messages)
{
std::ostringstream chatml_msgs;
for (auto it = messages.begin(); it != messages.end(); ++it) {
chatml_msgs << "<|im_start|>"
<< json_value(*it, "role", std::string("user")) << '\n';
chatml_msgs << json_value(*it, "content", std::string(""))
<< "<|im_end|>\n";
}
chatml_msgs << "<|im_start|>assistant" << '\n';
return chatml_msgs.str();
}
/* llama.cpp completion api semantics */
json oaicompat_completion_params_parse(
const json &body /* openai api json semantics */)
{
json llama_params;
llama_params["__oaicompat"] = true;
// Map OpenAI parameters to llama.cpp parameters
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
llama_params["temperature"] = json_value(body, "temperature", 0.8);
llama_params["top_k"] = json_value(body, "top_k", 40);
llama_params["top_p"] = json_value(body, "top_p", 0.95);
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
llama_params["seed"] = json_value(body, "seed", 0);
llama_params["stream"] = json_value(body, "stream", false);
llama_params["mirostat"] = json_value(body, "mirostat", false);
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0);
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0);
llama_params["penalize_nl"] = json_value(body, "penalize_nl", false);
llama_params["typical_p"] = json_value(body, "typical_p", 0.0);
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0);
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);
if (llama_params.count("grammar") != 0) {
llama_params["grammar"] = json_value(body, "grammar", json::object());
}
// Handle 'stop' field
if (body.contains("stop") && body["stop"].is_string()) {
llama_params["stop"] = json::array({body["stop"].get<std::string>()});
} else {
llama_params["stop"] = json_value(body, "stop", json::array());
}
// Ensure there is ChatML-specific end sequence among stop words
llama_params["stop"].push_back("<|im_end|>");
return llama_params;
}
static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
{
json result = response.result_json;
bool stopped_word = result.count("stopped_word") != 0;
bool stopped_eos = json_value(result, "stopped_eos", false);
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason = "length";
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
json choices =
streaming ? json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}})
: json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"message", json{{"content", content},
{"role", "assistant"}}}}});
std::time_t t = std::time(0);
json res =
json{{"choices", choices},
{"created", t},
{"model",
json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
{"usage",
json{{"completion_tokens", num_tokens_predicted},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
{"id", gen_chatcmplid()}};
if (server_verbose) {
res["__verbose"] = result;
}
if (result.contains("completion_probabilities")) {
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
}
return res;
}
// return value is vector as there is one case where we might need to generate two responses
static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
json result = response.result_json;
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
return std::vector<json>({response.result_json});
}
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
bool stopped_word = json_value(result, "stopped_word", false);
bool stopped_eos = json_value(result, "stopped_eos", false);
bool stopped_limit = json_value(result, "stopped_limit", false);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason;
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
if (stopped_limit) {
finish_reason = "length";
}
std::time_t t = std::time(0);
json choices;
if (!finish_reason.empty()) {
choices = json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}});
} else {
if (first) {
if (content.empty()) {
choices = json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{{"role", "assistant"}}}}});
} else {
// We have to send this as two updates to conform to openai behavior
json initial_ret = json{{"choices", json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"role", "assistant"}
}}}})},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
json second_ret = json{
{"choices", json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"content", content}}}
}})},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
return std::vector<json>({initial_ret, second_ret});
}
} else {
// Some idiosyncrasy in task processing logic makes several trailing calls
// with empty content, we ignore these at the calee site.
if (content.empty()) {
return std::vector<json>({json::object()});
}
choices = json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta",
json{
{"content", content},
}},
}});
}
}
json ret = json{{"choices", choices},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
return std::vector<json>({ret});
}
static json format_partial_response(
llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
) {
@ -2333,7 +2718,7 @@ int main(int argc, char **argv)
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
{
json data = json::parse(req.body);
const int task_id = llama.request_completion(data, false, false);
const int task_id = llama.request_completion(data, false, false, -1);
if (!json_value(data, "stream", false)) {
std::string completion_text;
task_result result = llama.next_result(task_id);
@ -2354,9 +2739,9 @@ int main(int argc, char **argv)
task_result result = llama.next_result(task_id);
if (!result.error) {
const std::string str =
"data: " +
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n";
"data: " +
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n";
LOG_VERBOSE("data stream", {
{ "to_send", str }
});
@ -2369,9 +2754,9 @@ int main(int argc, char **argv)
}
} else {
const std::string str =
"error: " +
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n";
"error: " +
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n";
LOG_VERBOSE("data stream", {
{ "to_send", str }
});
@ -2396,10 +2781,102 @@ int main(int argc, char **argv)
}
});
svr.Get("/v1/models", [&params](const httplib::Request&, httplib::Response& res)
{
std::time_t t = std::time(0);
json models = {
{"object", "list"},
{"data", {
{
{"id", params.model_alias},
{"object", "model"},
{"created", t},
{"owned_by", "llamacpp"}
},
}}
};
res.set_content(models.dump(), "application/json");
});
// TODO: add mount point without "/v1" prefix -- how?
svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res)
{
json data = oaicompat_completion_params_parse(json::parse(req.body));
const int task_id = llama.request_completion(data, false, false, -1);
if (!json_value(data, "stream", false)) {
std::string completion_text;
task_result result = llama.next_result(task_id);
if (!result.error && result.stop) {
json oaicompat_result = format_final_response_oaicompat(data, result);
res.set_content(oaicompat_result.dump(-1, ' ', false,
json::error_handler_t::replace),
"application/json");
} else {
res.status = 500;
res.set_content(result.result_json["content"], "text/plain");
return;
}
} else {
const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) {
while (true) {
task_result llama_result = llama.next_result(task_id);
if (!llama_result.error) {
std::vector<json> result_array = format_partial_response_oaicompat( llama_result);
for (auto it = result_array.begin(); it != result_array.end(); ++it)
{
if (!it->empty()) {
const std::string str =
"data: " +
it->dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n";
LOG_VERBOSE("data stream", {{"to_send", str}});
if (!sink.write(str.c_str(), str.size())) {
return false;
}
}
}
if (llama_result.stop) {
break;
}
} else {
const std::string str =
"error: " +
llama_result.result_json.dump(-1, ' ', false,
json::error_handler_t::replace) +
"\n\n";
LOG_VERBOSE("data stream", {{"to_send", str}});
if (!sink.write(str.c_str(), str.size())) {
return false;
}
break;
}
}
sink.done();
return true;
};
auto on_complete = [task_id, &llama](bool) {
// cancel request
llama.request_cancel(task_id);
};
res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
}
});
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
{
json data = json::parse(req.body);
const int task_id = llama.request_completion(data, true, false);
const int task_id = llama.request_completion(data, true, false, -1);
if (!json_value(data, "stream", false)) {
std::string completion_text;
task_result result = llama.next_result(task_id);
@ -2503,7 +2980,7 @@ int main(int argc, char **argv)
{
prompt = "";
}
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
task_result result = llama.next_result(task_id);
return res.set_content(result.result_json.dump(), "application/json");
});

View file

@ -75,7 +75,7 @@ int main(int argc, char ** argv) {
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__);
return 1;
}

View file

@ -0,0 +1,8 @@
# llama.cpp/examples/speculative
Demonstartion of speculative decoding and tree-based speculative decoding techniques
More info:
- https://github.com/ggerganov/llama.cpp/pull/2926
- https://github.com/ggerganov/llama.cpp/pull/3624

View file

@ -203,8 +203,9 @@ int main(int argc, char ** argv) {
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
printf("%s", token_str.c_str());
fflush(stdout);
if (!params.use_color) {
printf("%s", token_str.c_str());
}
if (id == llama_token_eos(model_tgt)) {
has_eos = true;
@ -236,10 +237,18 @@ int main(int argc, char ** argv) {
++n_past_tgt;
++n_past_dft;
++i_dft;
if (params.use_color) {
// Color token according to its origin sequence
printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
fflush(stdout);
}
continue;
}
}
if (params.use_color) {
printf("%s", token_str.c_str());
}
fflush(stdout);
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());

View file

@ -1295,10 +1295,6 @@ int main(int argc, char ** argv) {
opt_cb_data.last_save_iter = opt->iter;
}
if (alloc) {
ggml_allocr_free(alloc);
}
ggml_free(opt->ctx);
free_train_state(train);
ggml_free(model.ctx);