Merge branch 'master' into gg/per-layer-kv
ggml-ci
This commit is contained in:
commit
680a99e792
15 changed files with 569 additions and 176 deletions
|
@ -215,9 +215,10 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end
|
|||
llama_print_timings(context)
|
||||
|
||||
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||
let n_tokens = text.count + (add_bos ? 1 : 0)
|
||||
let utf8Count = text.utf8.count
|
||||
let n_tokens = utf8Count + (add_bos ? 1 : 0)
|
||||
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
|
||||
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
|
||||
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
|
||||
var swiftTokens: [llama_token] = []
|
||||
for i in 0 ..< tokenCount {
|
||||
swiftTokens.append(tokens[Int(i)])
|
||||
|
|
|
@ -11,6 +11,8 @@ actor LlamaContext {
|
|||
private var context: OpaquePointer
|
||||
private var batch: llama_batch
|
||||
private var tokens_list: [llama_token]
|
||||
/// This variable is used to store temporarily invalid cchars
|
||||
private var temporary_invalid_cchars: [CChar]
|
||||
|
||||
var n_len: Int32 = 512
|
||||
var n_cur: Int32 = 0
|
||||
|
@ -21,6 +23,7 @@ actor LlamaContext {
|
|||
self.context = context
|
||||
self.tokens_list = []
|
||||
self.batch = llama_batch_init(512, 0, 1)
|
||||
self.temporary_invalid_cchars = []
|
||||
}
|
||||
|
||||
deinit {
|
||||
|
@ -61,6 +64,7 @@ actor LlamaContext {
|
|||
print("attempting to complete \"\(text)\"")
|
||||
|
||||
tokens_list = tokenize(text: text, add_bos: true)
|
||||
temporary_invalid_cchars = []
|
||||
|
||||
let n_ctx = llama_n_ctx(context)
|
||||
let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
|
||||
|
@ -72,7 +76,7 @@ actor LlamaContext {
|
|||
}
|
||||
|
||||
for id in tokens_list {
|
||||
print(token_to_piece(token: id))
|
||||
print(String(cString: token_to_piece(token: id) + [0]))
|
||||
}
|
||||
|
||||
// batch = llama_batch_init(512, 0) // done in init()
|
||||
|
@ -115,10 +119,25 @@ actor LlamaContext {
|
|||
|
||||
if new_token_id == llama_token_eos(context) || n_cur == n_len {
|
||||
print("\n")
|
||||
return ""
|
||||
let new_token_str = String(cString: temporary_invalid_cchars + [0])
|
||||
temporary_invalid_cchars.removeAll()
|
||||
return new_token_str
|
||||
}
|
||||
|
||||
let new_token_str = token_to_piece(token: new_token_id)
|
||||
let new_token_cchars = token_to_piece(token: new_token_id)
|
||||
temporary_invalid_cchars.append(contentsOf: new_token_cchars)
|
||||
let new_token_str: String
|
||||
if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
|
||||
temporary_invalid_cchars.removeAll()
|
||||
new_token_str = string
|
||||
} else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
|
||||
// in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
|
||||
let string = String(cString: temporary_invalid_cchars + [0])
|
||||
temporary_invalid_cchars.removeAll()
|
||||
new_token_str = string
|
||||
} else {
|
||||
new_token_str = ""
|
||||
}
|
||||
print(new_token_str)
|
||||
// tokens_list.append(new_token_id)
|
||||
|
||||
|
@ -144,12 +163,14 @@ actor LlamaContext {
|
|||
|
||||
func clear() {
|
||||
tokens_list.removeAll()
|
||||
temporary_invalid_cchars.removeAll()
|
||||
}
|
||||
|
||||
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||
let n_tokens = text.count + (add_bos ? 1 : 0)
|
||||
let utf8Count = text.utf8.count
|
||||
let n_tokens = utf8Count + (add_bos ? 1 : 0)
|
||||
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
|
||||
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false)
|
||||
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
|
||||
|
||||
var swiftTokens: [llama_token] = []
|
||||
for i in 0..<tokenCount {
|
||||
|
@ -161,7 +182,8 @@ actor LlamaContext {
|
|||
return swiftTokens
|
||||
}
|
||||
|
||||
private func token_to_piece(token: llama_token) -> String {
|
||||
/// - note: The result does not contain null-terminator
|
||||
private func token_to_piece(token: llama_token) -> [CChar] {
|
||||
let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
|
||||
result.initialize(repeating: Int8(0), count: 8)
|
||||
defer {
|
||||
|
@ -175,10 +197,12 @@ actor LlamaContext {
|
|||
defer {
|
||||
newResult.deallocate()
|
||||
}
|
||||
_ = llama_token_to_piece(model, token, newResult, -nTokens)
|
||||
return String(cString: newResult)
|
||||
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
|
||||
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
|
||||
return Array(bufferPointer)
|
||||
} else {
|
||||
return String(cString: result)
|
||||
let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
|
||||
return Array(bufferPointer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -437,6 +437,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
||||
LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
|
|
|
@ -2383,6 +2383,7 @@ json oaicompat_completion_params_parse(
|
|||
|
||||
// Map OpenAI parameters to llama.cpp parameters
|
||||
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
|
||||
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
||||
llama_params["temperature"] = json_value(body, "temperature", 0.8);
|
||||
llama_params["top_k"] = json_value(body, "top_k", 40);
|
||||
llama_params["top_p"] = json_value(body, "top_p", 0.95);
|
||||
|
|
|
@ -75,7 +75,7 @@ int main(int argc, char ** argv) {
|
|||
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
||||
if (n_kv_req > n_ctx) {
|
||||
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
|
||||
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
|
||||
LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
|
@ -203,8 +203,9 @@ int main(int argc, char ** argv) {
|
|||
|
||||
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
|
||||
|
||||
printf("%s", token_str.c_str());
|
||||
fflush(stdout);
|
||||
if (!params.use_color) {
|
||||
printf("%s", token_str.c_str());
|
||||
}
|
||||
|
||||
if (id == llama_token_eos(model_tgt)) {
|
||||
has_eos = true;
|
||||
|
@ -236,10 +237,18 @@ int main(int argc, char ** argv) {
|
|||
++n_past_tgt;
|
||||
++n_past_dft;
|
||||
++i_dft;
|
||||
|
||||
if (params.use_color) {
|
||||
// Color token according to its origin sequence
|
||||
printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (params.use_color) {
|
||||
printf("%s", token_str.c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
|
||||
|
||||
|
|
|
@ -1295,10 +1295,6 @@ int main(int argc, char ** argv) {
|
|||
opt_cb_data.last_save_iter = opt->iter;
|
||||
}
|
||||
|
||||
if (alloc) {
|
||||
ggml_allocr_free(alloc);
|
||||
}
|
||||
|
||||
ggml_free(opt->ctx);
|
||||
free_train_state(train);
|
||||
ggml_free(model.ctx);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue