diff --git a/Package.swift b/Package.swift index 5b3bd72ca..18d610d69 100644 --- a/Package.swift +++ b/Package.swift @@ -2,33 +2,14 @@ import PackageDescription -#if arch(arm) || arch(arm64) -let platforms: [SupportedPlatform]? = [ - .macOS(.v12), - .iOS(.v14), - .watchOS(.v4), - .tvOS(.v14) -] -let exclude: [String] = [] -let resources: [Resource] = [ - .process("ggml-metal.metal") -] -let additionalSources: [String] = ["ggml-metal.m"] -let additionalSettings: [CSetting] = [ - .unsafeFlags(["-fno-objc-arc"]), - .define("GGML_USE_METAL") -] -#else -let platforms: [SupportedPlatform]? = nil -let exclude: [String] = ["ggml-metal.metal"] -let resources: [Resource] = [] -let additionalSources: [String] = [] -let additionalSettings: [CSetting] = [] -#endif - let package = Package( name: "llama", - platforms: platforms, + platforms: [ + .macOS(.v12), + .iOS(.v14), + .watchOS(.v4), + .tvOS(.v14) + ], products: [ .library(name: "llama", targets: ["llama"]), ], @@ -36,25 +17,30 @@ let package = Package( .target( name: "llama", path: ".", - exclude: exclude, + exclude: [], sources: [ "ggml.c", "llama.cpp", "ggml-alloc.c", "ggml-backend.c", "ggml-quants.c", - ] + additionalSources, - resources: resources, + "ggml-metal.m", + ], + resources: [ + .process("ggml-metal.metal") + ], publicHeadersPath: "spm-headers", cSettings: [ .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), - .define("GGML_USE_ACCELERATE") + .define("GGML_USE_ACCELERATE"), + .unsafeFlags(["-fno-objc-arc"]), + .define("GGML_USE_METAL"), // NOTE: NEW_LAPACK will required iOS version 16.4+ // We should consider add this in the future when we drop support for iOS 14 // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) // .define("ACCELERATE_NEW_LAPACK"), // .define("ACCELERATE_LAPACK_ILP64") - ] + additionalSettings, + ], linkerSettings: [ .linkedFramework("Accelerate") ] diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index ff51cc803..bf89a96f3 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -190,7 +190,7 @@ namespace grammar_parser { pos = parse_space(pos + 1, is_nested); } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator if (last_sym_start == out_elements.size()) { - throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos); + throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos); } // apply transformation to previous symbol (last_sym_start to end) according to diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index ce9d80d9b..4d0005349 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -215,9 +215,10 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end llama_print_timings(context) private func tokenize(text: String, add_bos: Bool) -> [llama_token] { - let n_tokens = text.count + (add_bos ? 1 : 0) + let utf8Count = text.utf8.count + let n_tokens = utf8Count + (add_bos ? 1 : 0) let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) - let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) + let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) var swiftTokens: [llama_token] = [] for i in 0 ..< tokenCount { swiftTokens.append(tokens[Int(i)]) diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 09b36d9e6..3754f0551 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -11,6 +11,8 @@ actor LlamaContext { private var context: OpaquePointer private var batch: llama_batch private var tokens_list: [llama_token] + /// This variable is used to store temporarily invalid cchars + private var temporary_invalid_cchars: [CChar] var n_len: Int32 = 512 var n_cur: Int32 = 0 @@ -21,6 +23,7 @@ actor LlamaContext { self.context = context self.tokens_list = [] self.batch = llama_batch_init(512, 0, 1) + self.temporary_invalid_cchars = [] } deinit { @@ -61,6 +64,7 @@ actor LlamaContext { print("attempting to complete \"\(text)\"") tokens_list = tokenize(text: text, add_bos: true) + temporary_invalid_cchars = [] let n_ctx = llama_n_ctx(context) let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count) @@ -72,7 +76,7 @@ actor LlamaContext { } for id in tokens_list { - print(token_to_piece(token: id)) + print(String(cString: token_to_piece(token: id) + [0])) } // batch = llama_batch_init(512, 0) // done in init() @@ -115,10 +119,25 @@ actor LlamaContext { if new_token_id == llama_token_eos(context) || n_cur == n_len { print("\n") - return "" + let new_token_str = String(cString: temporary_invalid_cchars + [0]) + temporary_invalid_cchars.removeAll() + return new_token_str } - let new_token_str = token_to_piece(token: new_token_id) + let new_token_cchars = token_to_piece(token: new_token_id) + temporary_invalid_cchars.append(contentsOf: new_token_cchars) + let new_token_str: String + if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) { + temporary_invalid_cchars.removeAll() + new_token_str = string + } else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) { + // in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string + let string = String(cString: temporary_invalid_cchars + [0]) + temporary_invalid_cchars.removeAll() + new_token_str = string + } else { + new_token_str = "" + } print(new_token_str) // tokens_list.append(new_token_id) @@ -144,12 +163,14 @@ actor LlamaContext { func clear() { tokens_list.removeAll() + temporary_invalid_cchars.removeAll() } private func tokenize(text: String, add_bos: Bool) -> [llama_token] { - let n_tokens = text.count + (add_bos ? 1 : 0) + let utf8Count = text.utf8.count + let n_tokens = utf8Count + (add_bos ? 1 : 0) let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) - let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false) + let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false) var swiftTokens: [llama_token] = [] for i in 0.. String { + /// - note: The result does not contain null-terminator + private func token_to_piece(token: llama_token) -> [CChar] { let result = UnsafeMutablePointer.allocate(capacity: 8) result.initialize(repeating: Int8(0), count: 8) defer { @@ -175,10 +197,12 @@ actor LlamaContext { defer { newResult.deallocate() } - _ = llama_token_to_piece(model, token, newResult, -nTokens) - return String(cString: newResult) + let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens) + let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens)) + return Array(bufferPointer) } else { - return String(cString: result) + let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens)) + return Array(bufferPointer) } } } diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py index 830c056d4..607fe49d3 100755 --- a/examples/server/api_like_OAI.py +++ b/examples/server/api_like_OAI.py @@ -70,6 +70,7 @@ def make_postData(body, chat=False, stream=False): if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"] if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"] if(is_present(body, "seed")): postData["seed"] = body["seed"] + if(is_present(body, "grammar")): postData["grammar"] = body["grammar"] if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()] if (args.stop != ""): postData["stop"] = [args.stop] diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 0fd42dcba..911f7bbe1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2410,9 +2410,7 @@ json oaicompat_completion_params_parse( } // Handle 'stop' field - if (body["stop"].is_null()) { - llama_params["stop"] = json::array({}); - } else if (body["stop"].is_string()) { + if (body.contains("stop") && body["stop"].is_string()) { llama_params["stop"] = json::array({body["stop"].get()}); } else { llama_params["stop"] = json_value(body, "stop", json::array()); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 374aef6f1..9cfde8308 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -75,7 +75,7 @@ int main(int argc, char ** argv) { // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); - LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); + LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__); return 1; } diff --git a/ggml-metal.m b/ggml-metal.m index 6cfacf64f..3343bc8a3 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1083,7 +1083,7 @@ void ggml_metal_graph_compute( // find the break-even point where the matrix-matrix kernel becomes more efficient compared // to the matrix-vector kernel - int ne11_mm_min = 1; + int ne11_mm_min = src0t == GGML_TYPE_F16 ? 1 : 16; #if 0 // the numbers below are measured on M2 Ultra for 7B and 13B models diff --git a/ggml.c b/ggml.c index e2687ef4f..f743df1f3 100644 --- a/ggml.c +++ b/ggml.c @@ -15629,7 +15629,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_DIAG_MASK_ZERO: case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: @@ -15645,6 +15644,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = 1; //TODO } break; + case GGML_OP_SOFT_MAX: + { + n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0])); + } break; case GGML_OP_CONV_TRANSPOSE_1D: { n_tasks = n_threads; @@ -15876,18 +15879,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { // thread scheduling for the different operations + work buffer size estimation for (int i = 0; i < cgraph->n_nodes; i++) { - int n_tasks = 1; - struct ggml_tensor * node = cgraph->nodes[i]; + const int n_tasks = ggml_get_n_tasks(node, n_threads); + size_t cur = 0; switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: { - n_tasks = n_threads; - if (ggml_is_quantized(node->type)) { cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; } @@ -15895,16 +15896,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_ADD: case GGML_OP_ADD1: { - n_tasks = n_threads; - if (ggml_is_quantized(node->src[0]->type)) { cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; } } break; case GGML_OP_ACC: { - n_tasks = n_threads; - if (ggml_is_quantized(node->src[0]->type)) { cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; } @@ -15932,16 +15929,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } break; case GGML_OP_OUT_PROD: { - n_tasks = n_threads; - if (ggml_is_quantized(node->src[0]->type)) { cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; } } break; case GGML_OP_SOFT_MAX: { - n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0])); - cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; } break; case GGML_OP_CONV_TRANSPOSE_1D: @@ -15971,7 +15964,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } break; case GGML_OP_IM2COL: { - n_tasks = n_threads; } break; case GGML_OP_CONV_TRANSPOSE_2D: { @@ -15989,8 +15981,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } break; case GGML_OP_FLASH_ATTN: { - n_tasks = n_threads; - const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); if (node->src[1]->type == GGML_TYPE_F32) { @@ -16003,8 +15993,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } break; case GGML_OP_FLASH_FF: { - n_tasks = n_threads; - if (node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 @@ -16015,8 +16003,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } break; case GGML_OP_FLASH_ATTN_BACK: { - n_tasks = n_threads; - const int64_t D = node->src[0]->ne[0]; const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back @@ -16031,8 +16017,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_CROSS_ENTROPY_LOSS: { - n_tasks = n_threads; - cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); } break; case GGML_OP_COUNT: diff --git a/llama.cpp b/llama.cpp index 3f5d663cf..fd905ade7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5744,8 +5744,7 @@ static int llama_decode_internal( // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA? - kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self))); + kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);