From d7b800b8bc490a221acbd83c575206a907f2f6e2 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 3 Dec 2023 10:58:16 +0200
Subject: [PATCH 01/10] llama : pad KV cache size (#4280)

* llama : pad KV cache size to 32

* metal : try to improve batched decoding
---
 ggml-metal.m | 2 +-
 llama.cpp    | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 6cfacf64f..3343bc8a3 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1083,7 +1083,7 @@ void ggml_metal_graph_compute(
 
                 // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                 // to the matrix-vector kernel
-                int ne11_mm_min = 1;
+                int ne11_mm_min = src0t == GGML_TYPE_F16 ? 1 : 16;
 
 #if 0
                 // the numbers below are measured on M2 Ultra for 7B and 13B models
diff --git a/llama.cpp b/llama.cpp
index 3f5d663cf..fd905ade7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5744,8 +5744,7 @@ static int llama_decode_internal(
         // a heuristic, to avoid attending the full cache if it is not yet utilized
        // after enough generations, the benefit from this heuristic disappears
        // if we start defragmenting the cache, the benefit from this will be more important
-        //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
-        kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
+        kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
 
         //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

From 6949b50df56ee58a2d76d45487942cb211c08629 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rickard=20Ed=C3=A9n?=
Date: Sun, 3 Dec 2023 10:03:25 +0100
Subject: [PATCH 02/10] py : add grammar to oai like api (#4294)

---
 examples/server/api_like_OAI.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py
index 830c056d4..607fe49d3 100755
--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@@ -70,6 +70,7 @@ def make_postData(body, chat=False, stream=False):
     if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"]
     if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"]
     if(is_present(body, "seed")): postData["seed"] = body["seed"]
+    if(is_present(body, "grammar")): postData["grammar"] = body["grammar"]
     if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()]
     if (args.stop != ""):
         postData["stop"] = [args.stop]

From 33e171d1e9fc4903f9314b490d77fb8d58331b63 Mon Sep 17 00:00:00 2001
From: Ed Lee
Date: Sun, 3 Dec 2023 01:10:43 -0800
Subject: [PATCH 03/10] server : fix OpenAI API `stop` field to be optional (#4299)

(cherry picked from commit Mozilla-Ocho/llamafile@e8c92bcb84ae3bcbf0d617b7ee6a5413bcbd58af)
---
 examples/server/server.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0fd42dcba..911f7bbe1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2410,9 +2410,7 @@ json oaicompat_completion_params_parse(
     }
 
     // Handle 'stop' field
-    if (body["stop"].is_null()) {
-        llama_params["stop"] = json::array({});
-    } else if (body["stop"].is_string()) {
+    if (body.contains("stop") && body["stop"].is_string()) {
         llama_params["stop"] = json::array({body["stop"].get<std::string>()});
     } else {
         llama_params["stop"] = json_value(body, "stop", json::array());

From adf3de4f69ff7e44131222f05f9c7447ac0be3cb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 3 Dec 2023 15:56:22 +0200
Subject: [PATCH 04/10] ggml : fix soft max out-of-bounds access (#4307)

ggml-ci
---
 ggml.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index e2687ef4f..cecb12700 100644
--- a/ggml.c
+++ b/ggml.c
@@ -15629,7 +15629,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_DIAG_MASK_ZERO:
         case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
         case GGML_OP_SOFT_MAX_BACK:
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK:
@@ -15645,6 +15644,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1; //TODO
             } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+            } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
                 n_tasks = n_threads;

From fbbc42827b2949b95bcde23ce47bb47d006c895d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 3 Dec 2023 15:56:35 +0200
Subject: [PATCH 05/10] ggml : reuse ggml_get_n_tasks() in ggml_graph_plan() (#4308)

* ggml : fix soft max out-of-bounds access

ggml-ci

* ggml : reuse ggml_get_n_tasks() in ggml_graph_plan()

ggml-ci
---
 ggml.c | 23 ++---------------------
 1 file changed, 2 insertions(+), 21 deletions(-)

diff --git a/ggml.c b/ggml.c
index cecb12700..f743df1f3 100644
--- a/ggml.c
+++ b/ggml.c
@@ -15879,18 +15879,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
     // thread scheduling for the different operations + work buffer size estimation
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        int n_tasks = 1;
-
         struct ggml_tensor * node = cgraph->nodes[i];
 
+        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+
         size_t cur = 0;
 
         switch (node->op) {
             case GGML_OP_CPY:
             case GGML_OP_DUP:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     }
                 } break;
             case GGML_OP_ADD:
             case GGML_OP_ADD1:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
             case GGML_OP_ACC:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                     }
                 } break;
@@ -15935,16 +15929,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_OUT_PROD:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
             case GGML_OP_SOFT_MAX:
                 {
-                    n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
-
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 } break;
             case GGML_OP_CONV_TRANSPOSE_1D:
                 {
@@ -15974,7 +15964,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_IM2COL:
                 {
-                    n_tasks = n_threads;
                 } break;
             case GGML_OP_CONV_TRANSPOSE_2D:
                 {
@@ -15992,8 +15981,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_ATTN:
                 {
-                    n_tasks = n_threads;
-
                     const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
 
                     if (node->src[1]->type == GGML_TYPE_F32) {
                         cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
                     }
@@ -16006,8 +15993,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_FF:
                 {
-                    n_tasks = n_threads;
-
                     if (node->src[1]->type == GGML_TYPE_F32) {
                         cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
                     }
@@ -16018,8 +16003,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_ATTN_BACK:
                 {
-                    n_tasks = n_threads;
-
                     const int64_t D = node->src[0]->ne[0];
                     const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
                     const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
                     if (node->src[1]->type == GGML_TYPE_F32) {
                         cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
                     }
@@ -16034,8 +16017,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 
             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {
-                    n_tasks = n_threads;
-
                     cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
                 } break;
             case GGML_OP_COUNT:

From 4fa44e84adb4c78e1885694cc3513982d4af2b08 Mon Sep 17 00:00:00 2001
From: Ikko Eltociear Ashimine
Date: Mon, 4 Dec 2023 16:57:35 +0900
Subject: [PATCH 06/10] grammar-parser : fix typo (#4318)

preceeding -> preceding
---
 common/grammar-parser.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp
index ff51cc803..bf89a96f3 100644
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -190,7 +190,7 @@ namespace grammar_parser {
             pos = parse_space(pos + 1, is_nested);
         } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
             if (last_sym_start == out_elements.size()) {
-                throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
+                throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
             }
 
             // apply transformation to previous symbol (last_sym_start to end) according to

From 5c9f90cba1cc6b0a2a7d19ee5dcb73cad6331d30 Mon Sep 17 00:00:00 2001
From: Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
Date: Mon, 4 Dec 2023 22:43:45 +0900
Subject: [PATCH 07/10] swift : fix prompt tokenization logic (#4321)

---
 examples/batched.swift/Sources/main.swift             | 5 +++--
 examples/llama.swiftui/llama.cpp.swift/LibLlama.swift | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index ce9d80d9b..4d0005349 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -215,9 +215,10 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end
 llama_print_timings(context)
 
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-    let n_tokens = text.count + (add_bos ? 1 : 0)
+    let utf8Count = text.utf8.count
+    let n_tokens = utf8Count + (add_bos ? 1 : 0)
     let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
     var swiftTokens: [llama_token] = []
     for i in 0 ..< tokenCount {
         swiftTokens.append(tokens[Int(i)])
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index 09b36d9e6..f828106fb 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -147,9 +147,10 @@ actor LlamaContext {
     }
 
     private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-        let n_tokens = text.count + (add_bos ? 1 : 0)
+        let utf8Count = text.utf8.count
+        let n_tokens = utf8Count + (add_bos ? 1 : 0)
         let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-        let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false)
+        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
         var swiftTokens: [llama_token] = []
         for i in 0..<tokenCount {

Date: Tue, 5 Dec 2023 01:03:49 +0900
Subject: [PATCH 08/10] swift : fix concatenation method to avoid invalid UTF8 stringfication (#4325)

---
 .../llama.cpp.swift/LibLlama.swift | 37 +++++++++++++++----
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index f828106fb..3754f0551 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -11,6 +11,8 @@ actor LlamaContext {
     private var context: OpaquePointer
     private var batch: llama_batch
     private var tokens_list: [llama_token]
+    /// This variable is used to store temporarily invalid cchars
+    private var temporary_invalid_cchars: [CChar]
 
     var n_len: Int32 = 512
     var n_cur: Int32 = 0
@@ -21,6 +23,7 @@ actor LlamaContext {
         self.context = context
         self.tokens_list = []
         self.batch = llama_batch_init(512, 0, 1)
+        self.temporary_invalid_cchars = []
     }
 
     deinit {
@@ -61,6 +64,7 @@ actor LlamaContext {
         print("attempting to complete \"\(text)\"")
 
         tokens_list = tokenize(text: text, add_bos: true)
+        temporary_invalid_cchars = []
 
         let n_ctx = llama_n_ctx(context)
         let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
@@ -72,7 +76,7 @@ actor LlamaContext {
         }
 
         for id in tokens_list {
-            print(token_to_piece(token: id))
+            print(String(cString: token_to_piece(token: id) + [0]))
         }
 
         // batch = llama_batch_init(512, 0) // done in init()
@@ -115,10 +119,25 @@ actor LlamaContext {
 
         if new_token_id == llama_token_eos(context) || n_cur == n_len {
             print("\n")
-            return ""
+            let new_token_str = String(cString: temporary_invalid_cchars + [0])
+            temporary_invalid_cchars.removeAll()
+            return new_token_str
         }
 
-        let new_token_str = token_to_piece(token: new_token_id)
+        let new_token_cchars = token_to_piece(token: new_token_id)
+        temporary_invalid_cchars.append(contentsOf: new_token_cchars)
+        let new_token_str: String
+        if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
+            temporary_invalid_cchars.removeAll()
+            new_token_str = string
+        } else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
+            // in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
+            let string = String(cString: temporary_invalid_cchars + [0])
+            temporary_invalid_cchars.removeAll()
+            new_token_str = string
+        } else {
+            new_token_str = ""
+        }
         print(new_token_str)
         // tokens_list.append(new_token_id)
@@ -144,6 +163,7 @@ actor LlamaContext {
 
     func clear() {
         tokens_list.removeAll()
+        temporary_invalid_cchars.removeAll()
     }
 
     private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
@@ -162,7 +182,8 @@ actor LlamaContext {
         return swiftTokens
     }
 
-    private func token_to_piece(token: llama_token) -> String {
+    /// - note: The result does not contain null-terminator
+    private func token_to_piece(token: llama_token) -> [CChar] {
         let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
         result.initialize(repeating: Int8(0), count: 8)
         defer {
@@ -176,10 +197,12 @@ actor LlamaContext {
             defer {
                 newResult.deallocate()
             }
-            _ = llama_token_to_piece(model, token, newResult, -nTokens)
-            return String(cString: newResult)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
+            return Array(bufferPointer)
         } else {
-            return String(cString: result)
+            let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
+            return Array(bufferPointer)
         }
     }
 }

From 23b5e12eb5a76489b4c3ee22213a081da68b1809 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Mon, 4 Dec 2023 17:04:21 +0100
Subject: [PATCH 09/10] simple : update error message for KV cache check (#4324)

This commit updates the error message that is printed when the KV cache
is not big enough to hold all the prompt and generated tokens.
Specifically it removes the reference to n_parallel and replaces it
with n_len.

Signed-off-by: Daniel Bevenius
---
 examples/simple/simple.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 374aef6f1..9cfde8308 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -75,7 +75,7 @@ int main(int argc, char ** argv) {
     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if (n_kv_req > n_ctx) {
         LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
+        LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__);
         return 1;
     }

From e4b76bbe316ee50fb17d9ac29e654c0edf830eba Mon Sep 17 00:00:00 2001
From: kchro3 <62481661+kchro3@users.noreply.github.com>
Date: Mon, 4 Dec 2023 23:29:46 -0800
Subject: [PATCH 10/10] swift : revert compiler checks for swift package (#4332)

---
 Package.swift | 46 ++++++++++++++++------------------------------
 1 file changed, 16 insertions(+), 30 deletions(-)

diff --git a/Package.swift b/Package.swift
index 5b3bd72ca..18d610d69 100644
--- a/Package.swift
+++ b/Package.swift
@@ -2,33 +2,14 @@
 
 import PackageDescription
 
-#if arch(arm) || arch(arm64)
-let platforms: [SupportedPlatform]? = [
-    .macOS(.v12),
-    .iOS(.v14),
-    .watchOS(.v4),
-    .tvOS(.v14)
-]
-let exclude: [String] = []
-let resources: [Resource] = [
-    .process("ggml-metal.metal")
-]
-let additionalSources: [String] = ["ggml-metal.m"]
-let additionalSettings: [CSetting] = [
-    .unsafeFlags(["-fno-objc-arc"]),
-    .define("GGML_USE_METAL")
-]
-#else
-let platforms: [SupportedPlatform]? = nil
-let exclude: [String] = ["ggml-metal.metal"]
-let resources: [Resource] = []
-let additionalSources: [String] = []
-let additionalSettings: [CSetting] = []
-#endif
-
 let package = Package(
     name: "llama",
-    platforms: platforms,
+    platforms: [
+        .macOS(.v12),
+        .iOS(.v14),
+        .watchOS(.v4),
+        .tvOS(.v14)
+    ],
     products: [
         .library(name: "llama", targets: ["llama"]),
     ],
@@ -36,25 +17,30 @@ let package = Package(
         .target(
             name: "llama",
             path: ".",
-            exclude: exclude,
+            exclude: [],
             sources: [
                 "ggml.c",
                 "llama.cpp",
                 "ggml-alloc.c",
                 "ggml-backend.c",
                 "ggml-quants.c",
-            ] + additionalSources,
-            resources: resources,
+                "ggml-metal.m",
+            ],
+            resources: [
+                .process("ggml-metal.metal")
+            ],
             publicHeadersPath: "spm-headers",
             cSettings: [
                 .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE")
+                .define("GGML_USE_ACCELERATE"),
+                .unsafeFlags(["-fno-objc-arc"]),
+                .define("GGML_USE_METAL"),
                 // NOTE: NEW_LAPACK will required iOS version 16.4+
                 // We should consider add this in the future when we drop support for iOS 14
                 // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
                 // .define("ACCELERATE_NEW_LAPACK"),
                 // .define("ACCELERATE_LAPACK_ILP64")
-            ] + additionalSettings,
+            ],
             linkerSettings: [
                 .linkedFramework("Accelerate")
             ]