From 8a5be3bd5885d79ad84aadf32bb8c1a67bd43c19 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Fri, 15 Dec 2023 22:16:15 -0500 Subject: [PATCH 01/20] llama : sanity checks for access to logits (#4274) Co-authored-by: Georgi Gerganov --- llama.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/llama.cpp b/llama.cpp index eddb70859..58fe7492e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1505,6 +1505,10 @@ struct llama_context { // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; +#ifndef NDEBUG + // guard against access to unset logits + std::vector logits_valid; +#endif bool logits_all = false; // input embedding (1-dimensional array: [n_embd]) @@ -6150,6 +6154,14 @@ static int llama_decode_internal( { auto & logits_out = lctx.logits; +#ifndef NDEBUG + auto & logits_valid = lctx.logits_valid; + logits_valid.clear(); + logits_valid.resize(n_tokens); + + logits_out.clear(); +#endif + if (batch.logits) { logits_out.resize(n_vocab * n_tokens); for (uint32_t i = 0; i < n_tokens; i++) { @@ -6157,13 +6169,22 @@ static int llama_decode_internal( continue; } memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); +#ifndef NDEBUG + logits_valid[i] = true; +#endif } } else if (lctx.logits_all) { logits_out.resize(n_vocab * n_tokens); memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); +#ifndef NDEBUG + std::fill(logits_valid.begin(), logits_valid.end(), true); +#endif } else { logits_out.resize(n_vocab); memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); +#ifndef NDEBUG + logits_valid[n_tokens - 1] = true; +#endif } } @@ -10052,6 +10073,7 @@ float * llama_get_logits(struct llama_context * ctx) { } float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { + assert(ctx->logits_valid.at(i)); return ctx->logits.data() + i*ctx->model.hparams.n_vocab; } From c6c4fc081c1df1c60a9bfe3e6a3fd086f1a29ec7 Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 16 Dec 2023 18:58:46 +0100 Subject: [PATCH 02/20] lora : add support for non-llama models (#3333) * lora : add support for non-llama models ggml-ci * avoid leaking ggml_context on failure cleanup ggml-ci * lora : allow 1d tensors * lora : include embd and output layers in size calculation * fix style --- convert-lora-to-ggml.py | 86 +++++++++++++------------- llama.cpp | 133 ++++++++++++++++++++-------------------- llama.h | 1 + 3 files changed, 114 insertions(+), 106 deletions(-) diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index a937410dd..53bb8a3d9 100755 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -3,7 +3,6 @@ from __future__ import annotations import json import os -import re import struct import sys from typing import Any, BinaryIO, Sequence @@ -11,43 +10,15 @@ from typing import Any, BinaryIO, Sequence import numpy as np import torch +from pathlib import Path +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + + NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} -HF_SUBLAYER_TO_GGML = { - "self_attn.q_proj": "attn_q", - "self_attn.k_proj": "attn_k", - "self_attn.v_proj": "attn_v", - "self_attn.o_proj": "attn_output", - "mlp.gate_proj": "ffn_gate", - "mlp.down_proj": "ffn_down", - "mlp.up_proj": "ffn_up", - "input_layernorm": "attn_norm", - "post_attention_layernorm": "ffn_norm", -} - - -def translate_tensor_name(t: str) -> str: - match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t) - if match: - nn = match.group(1) - sub_layer = match.group(2) - lora_type = match.group(3) - - sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer) - if sub_layer_renamed is None: - print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}") - sys.exit(1) - - output_string = ( - f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}" - ) - return output_string - else: - print(f"Error: unrecognized tensor {t}") - sys.exit(1) - - def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None: fout.write(b"ggla"[::-1]) # magic (ggml lora) fout.write(struct.pack("i", 1)) # file version @@ -61,9 +32,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None: fout.write(struct.pack("i", int(params["lora_alpha"]))) -def write_tensor_header( - self, name: str, shape: Sequence[int], data_type: np.dtype[Any] -) -> None: +def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None: sname = name.encode("utf-8") fout.write( struct.pack( @@ -78,11 +47,12 @@ def write_tensor_header( fout.seek((fout.tell() + 31) & -32) -if len(sys.argv) != 2: - print(f"Usage: python {sys.argv[0]} ") +if len(sys.argv) < 2: + print(f"Usage: python {sys.argv[0]} [arch]") print( "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" ) + print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)") sys.exit(1) input_json = os.path.join(sys.argv[1], "adapter_config.json") @@ -90,6 +60,14 @@ input_model = os.path.join(sys.argv[1], "adapter_model.bin") output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") model = torch.load(input_model, map_location="cpu") +arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" + +if arch_name not in gguf.MODEL_ARCH_NAMES.values(): + print(f"Error: unsupported architecture {arch_name}") + sys.exit(1) + +arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)] +name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone with open(input_json, "r") as f: params = json.load(f) @@ -117,6 +95,7 @@ with open(output_path, "wb") as fout: write_file_header(fout, params) for k, v in model.items(): + orig_k = k if k.endswith(".default.weight"): k = k.replace(".default.weight", ".weight") if k in ["llama_proj.weight", "llama_proj.bias"]: @@ -129,7 +108,32 @@ with open(output_path, "wb") as fout: v = v.float() t = v.detach().numpy() - tname = translate_tensor_name(k) + + prefix = "base_model.model." + if k.startswith(prefix): + k = k[len(prefix) :] + + lora_suffixes = (".lora_A.weight", ".lora_B.weight") + if k.endswith(lora_suffixes): + suffix = k[-len(lora_suffixes[0]):] + k = k[: -len(lora_suffixes[0])] + else: + print(f"Error: unrecognized tensor name {orig_k}") + sys.exit(1) + + tname = name_map.get_name(k) + if tname is None: + print(f"Error: could not map tensor name {orig_k}") + print(" Note: the arch parameter must be specified if the model is not llama") + sys.exit(1) + + if suffix == ".lora_A.weight": + tname += ".weight.loraA" + elif suffix == ".lora_B.weight": + tname += ".weight.loraB" + else: + assert False + print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") write_tensor_header(fout, tname, t.shape, t.dtype) t.tofile(fout) diff --git a/llama.cpp b/llama.cpp index 58fe7492e..f49214c13 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8647,53 +8647,60 @@ static int llama_apply_lora_from_file_internal( const int64_t t_start_lora_us = ggml_time_us(); - auto fin = std::ifstream(path_lora, std::ios::binary); - if (!fin) { - LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora); - return 1; - } + llama_file fin(path_lora, "rb"); // verify magic and version { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - uint32_t format_version; - fin.read((char *) &format_version, sizeof(format_version)); + uint32_t magic = fin.read_u32(); + if (magic != LLAMA_FILE_MAGIC_GGLA) { + LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); + return 1; + } + uint32_t format_version = fin.read_u32(); if (format_version != 1) { LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); return 1; } } - int32_t lora_r; - int32_t lora_alpha; - fin.read((char *) &lora_r, sizeof(lora_r)); - fin.read((char *) &lora_alpha, sizeof(lora_alpha)); + int32_t lora_r = fin.read_u32(); + int32_t lora_alpha = fin.read_u32(); float scaling = scale * (float)lora_alpha / (float)lora_r; LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); + // create a name -> tensor map of the model to accelerate lookups + // find the max tensor size to estimate the required temporary buffer size + size_t max_tensor_size = 0; + std::unordered_map model_tensors; + for (const auto & kv : model.tensors_by_name) { + model_tensors.insert(kv); + size_t f32_size = ggml_nelements(kv.second) * sizeof(float); + max_tensor_size = std::max(max_tensor_size, f32_size); + } + // create a temporary ggml context to store the lora tensors - // todo: calculate size from biggest possible tensor - std::vector lora_buf(1024ull * 1024ull * 1024ull); + // TODO: use ggml-alloc + size_t lora_ctx_size = max_tensor_size * 3; + LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0); + std::vector lora_buf(lora_ctx_size); + struct ggml_init_params params; params.mem_size = lora_buf.size(); params.mem_buffer = lora_buf.data(); params.no_alloc = false; - ggml_context * lora_ctx = ggml_init(params); - std::unordered_map lora_tensors; + using unique_context = std::unique_ptr; - // create a name -> tensor map of the model to accelerate lookups - std::unordered_map model_tensors; - for (const auto & kv : model.tensors_by_name) { - model_tensors.insert(kv); - } + unique_context lora_ctx(nullptr, ggml_free); + lora_ctx.reset(ggml_init(params)); + std::unordered_map lora_tensors; // load base model std::unique_ptr ml; - ggml_context * base_ctx = NULL; + + unique_context base_ctx(nullptr, ggml_free); std::vector base_buf; if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); @@ -8702,6 +8709,7 @@ static int llama_apply_lora_from_file_internal( size_t ctx_size; size_t mmapped_size; ml->calc_sizes(ctx_size, mmapped_size); + base_buf.resize(ctx_size); ggml_init_params base_params; @@ -8709,9 +8717,9 @@ static int llama_apply_lora_from_file_internal( base_params.mem_buffer = base_buf.data(); base_params.no_alloc = ml->use_mmap; - base_ctx = ggml_init(base_params); + base_ctx.reset(ggml_init(base_params)); - // maybe this should in llama_model_loader + // maybe this should be in llama_model_loader if (ml->use_mmap) { ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa())); } @@ -8724,27 +8732,35 @@ static int llama_apply_lora_from_file_internal( std::vector work_buffer; while (true) { + if (fin.tell() == fin.size) { + // eof + break; + } + int32_t n_dims; - int32_t length; + int32_t name_len; int32_t ftype; - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ftype), sizeof(ftype)); - if (fin.eof()) { - break; + fin.read_raw(&n_dims, sizeof(n_dims)); + fin.read_raw(&name_len, sizeof(name_len)); + fin.read_raw(&ftype, sizeof(ftype)); + + if (n_dims != 1 && n_dims != 2) { + LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); + return 1; } int32_t ne[2] = { 1, 1 }; for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + fin.read_raw(&ne[i], sizeof(ne[i])); } std::string name; { + GGML_ASSERT(name_len <= 1024); char buf[1024]; - fin.read(buf, length); - name = std::string(buf, length); + fin.read_raw(buf, name_len); + name = std::string(buf, name_len); } // check for lora suffix and get the type of tensor @@ -8758,7 +8774,7 @@ static int llama_apply_lora_from_file_internal( std::string lora_type = name.substr(pos + lora_suffix.length()); std::string base_name = name; base_name.erase(pos); - // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); + // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str()); if (model_tensors.find(base_name) == model_tensors.end()) { LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); @@ -8777,22 +8793,15 @@ static int llama_apply_lora_from_file_internal( return false; } } - ggml_tensor * lora_tensor; - if (n_dims == 2) { - lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); - } - else { - LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); - return 1; - } - ggml_set_name(lora_tensor, "lora_tensor"); + ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]); + ggml_set_name(lora_tensor, name.c_str()); // load tensor data - size_t offset = fin.tellg(); + size_t offset = fin.tell(); size_t tensor_data_size = ggml_nbytes(lora_tensor); offset = (offset + 31) & -32; - fin.seekg(offset); - fin.read((char*)lora_tensor->data, tensor_data_size); + fin.seek(offset, SEEK_SET); + fin.read_raw(lora_tensor->data, tensor_data_size); lora_tensors[name] = lora_tensor; @@ -8822,13 +8831,11 @@ static int llama_apply_lora_from_file_internal( // load from base model if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) { - // TODO: throw LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); return 1; } - // TODO: not tested!! maybe not working! - base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); + base_t = ml->create_tensor(base_ctx.get(), base_name, { dest_t->ne[0], dest_t->ne[1] }, GGML_BACKEND_CPU); ml->load_data_for(base_t); } else { base_t = dest_t; @@ -8857,43 +8864,45 @@ static int llama_apply_lora_from_file_internal( } // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB); offload_func(BA); ggml_set_name(BA, "BA"); if (scaling != 1.0f) { - ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx.get(), scaling); ggml_set_name(scale_tensor, "scale_tensor"); - BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); + BA = ggml_scale_inplace(lora_ctx.get(), BA, scale_tensor); offload_func(BA); ggml_set_name(BA, "BA_scaled"); } ggml_tensor * r; if (base_t == dest_t) { - r = ggml_add_inplace(lora_ctx, dest_t, BA); + r = ggml_add_inplace(lora_ctx.get(), dest_t, BA); offload_func_force_inplace(r); ggml_set_name(r, "r_add_inplace"); } else { - r = ggml_add(lora_ctx, base_t, BA); + r = ggml_add(lora_ctx.get(), base_t, BA); offload_func(r); ggml_set_name(r, "r_add"); - r = ggml_cpy(lora_ctx, r, dest_t); + r = ggml_cpy(lora_ctx.get(), r, dest_t); offload_func(r); ggml_set_name(r, "r_cpy"); } - struct ggml_cgraph * gf = ggml_new_graph(lora_ctx); + struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get()); ggml_build_forward_expand(gf, r); ggml_graph_compute_helper(work_buffer, gf, n_threads); + // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other + GGML_ASSERT(lora_tensors.size() == 2); + // we won't need these tensors again, reset the context to save memory - ggml_free(lora_ctx); - lora_ctx = ggml_init(params); + lora_ctx.reset(ggml_init(params)); lora_tensors.clear(); n_tensors++; @@ -8903,12 +8912,6 @@ static int llama_apply_lora_from_file_internal( } } - // TODO: this should be in a destructor, it will leak on failure - ggml_free(lora_ctx); - if (base_ctx) { - ggml_free(base_ctx); - } - const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); diff --git a/llama.h b/llama.h index 45a65cacb..15ab4f80e 100644 --- a/llama.h +++ b/llama.h @@ -39,6 +39,7 @@ #define LLAMA_MAX_RNG_STATE (64*1024) +#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN From 5daa5f54fdcd2b5228add1a4c43a1897b2168f35 Mon Sep 17 00:00:00 2001 From: Bach Le Date: Sun, 17 Dec 2023 18:57:33 +0800 Subject: [PATCH 03/20] Link to cublas dynamically on Windows even with LLAMA_STATIC (#4506) --- CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 57b43c136..e3cd43ab3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -291,7 +291,12 @@ if (LLAMA_CUBLAS) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE}) if (LLAMA_STATIC) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + if (WIN32) + # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + else () + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + endif() else() set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() From 62bd52b7bf90819e75f427a95a484cd5eee0b3c7 Mon Sep 17 00:00:00 2001 From: mzcu Date: Sun, 17 Dec 2023 15:54:37 +0100 Subject: [PATCH 04/20] server : allow requests larger than 8K (#4500) --- examples/server/server.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5f93dcb66..a9f8b3747 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -10,7 +10,8 @@ // crash the server in debug mode, otherwise send an http 500 error #define CPPHTTPLIB_NO_EXCEPTIONS 1 #endif - +// increase max payload length to allow use of larger context size +#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576 #include "httplib.h" #include "json.hpp" From eb16dae7e70ca97396190698b29c0f9ee3388e88 Mon Sep 17 00:00:00 2001 From: Alexey Parfenov Date: Sun, 17 Dec 2023 14:56:09 +0000 Subject: [PATCH 05/20] server : fix possible ambiguity in content type charset (#4501) --- examples/server/server.cpp | 44 +++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a9f8b3747..be7b5b95e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2699,7 +2699,7 @@ int main(int argc, char **argv) } // API key is invalid or not provided - res.set_content("Unauthorized: Invalid API Key", "text/plain"); + res.set_content("Unauthorized: Invalid API Key", "text/plain; charset=utf-8"); res.status = 401; // Unauthorized LOG_WARNING("Unauthorized: Invalid API Key", {}); @@ -2714,28 +2714,28 @@ int main(int argc, char **argv) // this is only called if no index.html is found in the public --path svr.Get("/", [](const httplib::Request &, httplib::Response &res) { - res.set_content(reinterpret_cast(&index_html), index_html_len, "text/html"); + res.set_content(reinterpret_cast(&index_html), index_html_len, "text/html; charset=utf-8"); return false; }); // this is only called if no index.js is found in the public --path svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res) { - res.set_content(reinterpret_cast(&index_js), index_js_len, "text/javascript"); + res.set_content(reinterpret_cast(&index_js), index_js_len, "text/javascript; charset=utf-8"); return false; }); // this is only called if no index.html is found in the public --path svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res) { - res.set_content(reinterpret_cast(&completion_js), completion_js_len, "application/javascript"); + res.set_content(reinterpret_cast(&completion_js), completion_js_len, "application/javascript; charset=utf-8"); return false; }); // this is only called if no index.html is found in the public --path svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res) { - res.set_content(reinterpret_cast(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript"); + res.set_content(reinterpret_cast(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8"); return false; }); @@ -2746,7 +2746,7 @@ int main(int argc, char **argv) { "user_name", llama.name_user.c_str() }, { "assistant_name", llama.name_assistant.c_str() } }; - res.set_content(data.dump(), "application/json"); + res.set_content(data.dump(), "application/json; charset=utf-8"); }); svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) @@ -2760,12 +2760,12 @@ int main(int argc, char **argv) std::string completion_text; task_result result = llama.next_result(task_id); if (!result.error && result.stop) { - res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); + res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); } else { res.status = 404; - res.set_content(result.result_json["content"], "text/plain"); + res.set_content(result.result_json["content"], "text/plain; charset=utf-8"); return; } } else { @@ -2836,7 +2836,7 @@ int main(int argc, char **argv) }} }; - res.set_content(models.dump(), "application/json"); + res.set_content(models.dump(), "application/json; charset=utf-8"); }); // TODO: add mount point without "/v1" prefix -- how? @@ -2858,10 +2858,10 @@ int main(int argc, char **argv) res.set_content(oaicompat_result.dump(-1, ' ', false, json::error_handler_t::replace), - "application/json"); + "application/json; charset=utf-8"); } else { res.status = 500; - res.set_content(result.result_json["content"], "text/plain"); + res.set_content(result.result_json["content"], "text/plain; charset=utf-8"); return; } } else { @@ -2925,12 +2925,12 @@ int main(int argc, char **argv) task_result result = llama.next_result(task_id); if (!result.error && result.stop) { - res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); + res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); } else { res.status = 404; - res.set_content(result.result_json["content"], "text/plain"); + res.set_content(result.result_json["content"], "text/plain; charset=utf-8"); return; } } else { @@ -2979,11 +2979,11 @@ int main(int argc, char **argv) svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res) { const json data = llama.get_model_props(); - return res.set_content(data.dump(), "application/json"); + return res.set_content(data.dump(), "application/json; charset=utf-8"); }); svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res) - { return res.set_content("", "application/json"); }); + { return res.set_content("", "application/json; charset=utf-8"); }); svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res) { @@ -2994,7 +2994,7 @@ int main(int argc, char **argv) tokens = llama.tokenize(body["content"], false); } const json data = format_tokenizer_response(tokens); - return res.set_content(data.dump(), "application/json"); + return res.set_content(data.dump(), "application/json; charset=utf-8"); }); svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res) @@ -3008,7 +3008,7 @@ int main(int argc, char **argv) } const json data = format_detokenized_response(content); - return res.set_content(data.dump(), "application/json"); + return res.set_content(data.dump(), "application/json; charset=utf-8"); }); svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res) @@ -3025,7 +3025,7 @@ int main(int argc, char **argv) } const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1); task_result result = llama.next_result(task_id); - return res.set_content(result.result_json.dump(), "application/json"); + return res.set_content(result.result_json.dump(), "application/json; charset=utf-8"); }); svr.set_logger(log_server_request); @@ -3046,7 +3046,7 @@ int main(int argc, char **argv) { snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); } - res.set_content(buf, "text/plain"); + res.set_content(buf, "text/plain; charset=utf-8"); res.status = 500; }); @@ -3054,15 +3054,15 @@ int main(int argc, char **argv) { if (res.status == 401) { - res.set_content("Unauthorized", "text/plain"); + res.set_content("Unauthorized", "text/plain; charset=utf-8"); } if (res.status == 400) { - res.set_content("Invalid request", "text/plain"); + res.set_content("Invalid request", "text/plain; charset=utf-8"); } else if (res.status == 404) { - res.set_content("File Not Found", "text/plain"); + res.set_content("File Not Found", "text/plain; charset=utf-8"); res.status = 404; } }); From 8edd2b40fdbcafbf630f2cf29306b29d5cb48c42 Mon Sep 17 00:00:00 2001 From: AdithyanI Date: Sun, 17 Dec 2023 15:57:56 +0100 Subject: [PATCH 06/20] server : fix grammar being ignored (#4494) Fix bug in identifying the grammar. --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index be7b5b95e..c97efe97d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2414,7 +2414,7 @@ json oaicompat_completion_params_parse( llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0); - if (llama_params.count("grammar") != 0) { + if (body.count("grammar") != 0) { llama_params["grammar"] = json_value(body, "grammar", json::object()); } From 0ffc92d2d23a789625f018840469af045be1e3c0 Mon Sep 17 00:00:00 2001 From: olexiyb Date: Sun, 17 Dec 2023 17:02:16 +0200 Subject: [PATCH 07/20] server : disable llm logs if SERVER_VERBOSE is off (#3792) --- examples/server/server.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c97efe97d..04038530f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2645,6 +2645,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con int main(int argc, char **argv) { +#if SERVER_VERBOSE != 1 + log_disable(); +#endif // own arguments required by this example gpt_params params; server_params sparams; From 45668633fdb522a925c3dafc1ecf426f539efb27 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 17 Dec 2023 16:05:56 +0100 Subject: [PATCH 08/20] finetune : keep allocs alive until all allocations are done (#4486) --- examples/finetune/finetune.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index b9849e8c9..6a668d764 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1620,8 +1620,6 @@ int main(int argc, char ** argv) { opt->params.adam.gclip = params.common.adam_gclip; opt->params.adam.eps_f = params.common.adam_eps_f; - ggml_allocr * alloc = NULL; - printf("%s: init model\n", __func__); bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train); @@ -1725,10 +1723,9 @@ int main(int argc, char ** argv) { // allocate input tensors mem_input_data.resize(max_input_size); - alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); - ggml_allocr_alloc(alloc, tokens_input); - ggml_allocr_alloc(alloc, target_probs); - ggml_allocr_free(alloc); + ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); + ggml_allocr_alloc(alloc_inps, tokens_input); + ggml_allocr_alloc(alloc_inps, target_probs); // context for compute tensors without their data const size_t estimated_compute_size_wo_data = ( @@ -1755,7 +1752,7 @@ int main(int argc, char ** argv) { // find best evaluation order for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { ctx_compute = ggml_init(ctx_compute_params); - alloc = ggml_allocr_new_measure(tensor_alignment); + ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = (enum ggml_cgraph_eval_order) order; gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); @@ -1788,7 +1785,7 @@ int main(int argc, char ** argv) { // allocate compute tensors mem_compute_data.resize(max_compute_size); ctx_compute = ggml_init(ctx_compute_params); - alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); + ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = best_order; gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); @@ -1804,6 +1801,8 @@ int main(int argc, char ** argv) { params.common.use_checkpointing ); ggml_allocr_free(alloc); + ggml_allocr_free(alloc_inps); + // tokenize data std::vector train_tokens; From 919c40660fd27157b391b5832d2a577d5afef4cb Mon Sep 17 00:00:00 2001 From: Matheus Gabriel Alves Silva Date: Sun, 17 Dec 2023 12:23:33 -0300 Subject: [PATCH 09/20] build : Check the ROCm installation location (#4485) * build : Check the ROCm installation location * more generic approach * fixup! It was returning the path instead of the command output * fixup! Trailing whitespace --- Makefile | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index fb775ae5b..8273f8400 100644 --- a/Makefile +++ b/Makefile @@ -439,9 +439,15 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h endif # LLAMA_CLBLAST ifdef LLAMA_HIPBLAS - ROCM_PATH ?= /opt/rocm - HIPCC ?= $(ROCM_PATH)/bin/hipcc - GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) + + ifeq ($(wildcard /opt/rocm),) + ROCM_PATH ?= /usr + GPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) + else + ROCM_PATH ?= /opt/rocm + GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) + endif + HIPCC ?= $(ROCM_PATH)/bin/hipcc LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_MMV_Y ?= 1 LLAMA_CUDA_KQUANTS_ITER ?= 2 From f7f468a97dceec2f8fe8b1ed7a2091083446ebc7 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Sun, 17 Dec 2023 10:45:46 -0500 Subject: [PATCH 10/20] gguf-py : fail fast on nonsensical special token IDs (#4489) --- gguf-py/gguf/vocab.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index de3e5edb5..76924d8f2 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -109,8 +109,10 @@ class SpecialVocab: return True def _set_special_token(self, typ: str, tid: Any) -> None: - if not isinstance(tid, int) or tid < 0: + if not isinstance(tid, int): return + if tid < 0: + raise ValueError(f'invalid value for special token type {typ}: {tid}') if self.n_vocab is None or tid < self.n_vocab: if typ in self.special_token_ids: return From 800a489e4a8be199122259a995b1ee9dd7fae320 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 17 Dec 2023 19:38:41 +0200 Subject: [PATCH 11/20] llama.swiftui : add bench functionality (#4483) * llama.swiftui : add bench button * llama.swiftui : initial bench functionality * force to use n_gpu_layers on simulator * add download buttons & expose llamaState.loadModel * update project.pbxproj * comment #Preview & fix editorconfig check * gitignore : xcode stuff * llama.swiftui : UX improvements * llama.swiftui : avoid data copy via "downloadTask" * llama.swiftui : remove model from project * llama : remove "mostly" from model infos * llama.swiftui : improve bench --------- Co-authored-by: jhen --- .editorconfig | 3 + examples/llama.swiftui/.gitignore | 1 + .../llama.cpp.swift/LibLlama.swift | 182 +++- .../llama.swiftui.xcodeproj/project.pbxproj | 898 +++++++++--------- .../llama.swiftui/Models/LlamaState.swift | 52 +- .../llama.swiftui/UI/ContentView.swift | 114 ++- .../llama.swiftui/UI/DownloadButton.swift | 122 +++ llama.cpp | 33 +- 8 files changed, 895 insertions(+), 510 deletions(-) create mode 100644 examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift diff --git a/.editorconfig b/.editorconfig index a56e9ccc8..16d16b3b5 100644 --- a/.editorconfig +++ b/.editorconfig @@ -23,3 +23,6 @@ insert_final_newline = unset [examples/server/public/*] indent_size = 2 + +[examples/llama.swiftui/llama.swiftui.xcodeproj/*] +indent_style = tab diff --git a/examples/llama.swiftui/.gitignore b/examples/llama.swiftui/.gitignore index 9bce6af39..e585a2a4f 100644 --- a/examples/llama.swiftui/.gitignore +++ b/examples/llama.swiftui/.gitignore @@ -1 +1,2 @@ xcuserdata +xcshareddata diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 3754f0551..272e1fd8a 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -6,16 +6,34 @@ enum LlamaError: Error { case couldNotInitializeContext } +func llama_batch_clear(_ batch: inout llama_batch) { + batch.n_tokens = 0 +} + +func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], _ logits: Bool) { + batch.token [Int(batch.n_tokens)] = id + batch.pos [Int(batch.n_tokens)] = pos + batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count) + for i in 0.. LlamaContext { + static func create_context(path: String) throws -> LlamaContext { llama_backend_init(false) - let model_params = llama_model_default_params() + var model_params = llama_model_default_params() +#if targetEnvironment(simulator) + model_params.n_gpu_layers = 0 + print("Running on simulator, force use n_gpu_layers = 0") +#endif let model = llama_load_model_from_file(path, model_params) guard let model else { print("Could not load model at \(path)") throw LlamaError.couldNotInitializeContext } + + let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2)) + print("Using \(n_threads) threads") + var ctx_params = llama_context_default_params() - ctx_params.seed = 1234 + ctx_params.seed = 1234 ctx_params.n_ctx = 2048 - ctx_params.n_threads = 8 - ctx_params.n_threads_batch = 8 + ctx_params.n_threads = UInt32(n_threads) + ctx_params.n_threads_batch = UInt32(n_threads) let context = llama_new_context_with_model(model, ctx_params) guard let context else { @@ -56,6 +83,26 @@ actor LlamaContext { return LlamaContext(model: model, context: context) } + func model_info() -> String { + let result = UnsafeMutablePointer.allocate(capacity: 256) + result.initialize(repeating: Int8(0), count: 256) + defer { + result.deallocate() + } + + // TODO: this is probably very stupid way to get the string from C + + let nChars = llama_model_desc(model, result, 256) + let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nChars)) + + var SwiftString = "" + for char in bufferPointer { + SwiftString.append(Character(UnicodeScalar(UInt8(char)))) + } + + return SwiftString + } + func get_n_tokens() -> Int32 { return batch.n_tokens; } @@ -79,16 +126,11 @@ actor LlamaContext { print(String(cString: token_to_piece(token: id) + [0])) } - // batch = llama_batch_init(512, 0) // done in init() - batch.n_tokens = Int32(tokens_list.count) + llama_batch_clear(&batch) - for i1 in 0.. String { + var pp_avg: Double = 0 + var tg_avg: Double = 0 + + var pp_std: Double = 0 + var tg_std: Double = 0 + + for r in 0.. 1 { + pp_std = sqrt(pp_std / Double(nr - 1) - pp_avg * pp_avg * Double(nr) / Double(nr - 1)) + tg_std = sqrt(tg_std / Double(nr - 1) - tg_avg * tg_avg * Double(nr) / Double(nr - 1)) + } else { + pp_std = 0 + tg_std = 0 + } + + let model_desc = model_info(); + let model_size = String(format: "%.2f GiB", Double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0); + let model_n_params = String(format: "%.2f B", Double(llama_model_n_params(model)) / 1e9); + let backend = "Metal"; + let pp_avg_str = String(format: "%.2f", pp_avg); + let tg_avg_str = String(format: "%.2f", tg_avg); + let pp_std_str = String(format: "%.2f", pp_std); + let tg_std_str = String(format: "%.2f", tg_std); + + var result = "" + + result += String("| model | size | params | backend | test | t/s |\n") + result += String("| --- | --- | --- | --- | --- | --- |\n") + result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | pp \(pp) | \(pp_avg_str) ± \(pp_std_str) |\n") + result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | tg \(tg) | \(tg_avg_str) ± \(tg_std_str) |\n") + + return result; + } + func clear() { tokens_list.removeAll() temporary_invalid_cchars.removeAll() + llama_kv_cache_clear(context) } private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let utf8Count = text.utf8.count - let n_tokens = utf8Count + (add_bos ? 1 : 0) + let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1 let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false) diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj index bc1fd15ce..2e6159928 100644 --- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj +++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj @@ -1,481 +1,483 @@ // !$*UTF8*$! { - archiveVersion = 1; - classes = { - }; - objectVersion = 56; - objects = { + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { /* Begin PBXBuildFile section */ - 542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; }; - 5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; }; - 542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; }; - 542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; }; - 542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; }; - 542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; }; - 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; }; - 549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; }; - 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; }; - 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; }; - 8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; }; - 8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; }; - 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; }; - 8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; }; - 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; }; - 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; }; + 542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; }; + 5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; }; + 542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; }; + 542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; }; + 542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; }; + 542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; }; + 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; }; + 549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; }; + 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; }; + 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; }; + 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; }; + 8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; }; + 8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; }; + 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; }; + 8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; }; + 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; }; + 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ - 542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = ""; }; - 542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = ""; }; - 542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = ""; }; - 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = ""; }; - 542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = ""; }; - 542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = ""; }; - 542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = ""; }; - 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = ""; }; - 542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = ""; }; - 542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = ""; }; - 549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = ""; }; - 549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = ""; }; - 549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = ""; }; - 549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; }; - 8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = ""; }; - 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; }; - 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = ""; }; - 8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; - 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; - 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = ""; }; - 8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; - 8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = ""; }; - 8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = ""; }; - 8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = ""; }; - 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = ""; }; + 542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = ""; }; + 542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = ""; }; + 542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = ""; }; + 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = ""; }; + 542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = ""; }; + 542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = ""; }; + 542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = ""; }; + 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = ""; }; + 542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = ""; }; + 542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = ""; }; + 549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = ""; }; + 549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = ""; }; + 549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = ""; }; + 549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; }; + 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = ""; }; + 8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = ""; }; + 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; }; + 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = ""; }; + 8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = ""; }; + 8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; + 8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = ""; }; + 8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = ""; }; + 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ - 8A1C83702AC328BD0096AF73 /* Frameworks */ = { - isa = PBXFrameworksBuildPhase; - buildActionMask = 2147483647; - files = ( - 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */, - 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; + 8A1C83702AC328BD0096AF73 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */, + 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; /* End PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ - 8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = { - isa = PBXGroup; - children = ( - 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */, - 542376092B0D9C40008E6A1C /* ggml-backend.h */, - 542376062B0D9BEA008E6A1C /* ggml-quants.h */, - 542376072B0D9BFB008E6A1C /* ggml-quants.c */, - 549479C82AC9E10B00E0F78B /* ggml-metal.metal */, - 549479C62AC9E0F200E0F78B /* ggml-metal.h */, - 549479C52AC9E0F200E0F78B /* ggml-metal.m */, - 542EA09B2AC8723900A8AEE9 /* ggml.c */, - 542EA09C2AC8723900A8AEE9 /* ggml.h */, - 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */, - 542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */, - 542EA0A12AC8729100A8AEE9 /* llama.cpp */, - 542EA0A22AC8729100A8AEE9 /* llama.h */, - ); - name = llama.cpp; - sourceTree = ""; - }; - 8A1C836A2AC328BD0096AF73 = { - isa = PBXGroup; - children = ( - 8A08D1F62AC7383900FE6CD4 /* llama.cpp */, - 8A907F312AC7134E006146EA /* llama.cpp.swift */, - 8A3F84232AC4C891005E2EE8 /* models */, - 8A1C83752AC328BD0096AF73 /* llama.swiftui */, - 8A1C83742AC328BD0096AF73 /* Products */, - 8A39BE082AC7601000BFEB40 /* Frameworks */, - ); - sourceTree = ""; - }; - 8A1C83742AC328BD0096AF73 /* Products */ = { - isa = PBXGroup; - children = ( - 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */, - ); - name = Products; - sourceTree = ""; - }; - 8A1C83752AC328BD0096AF73 /* llama.swiftui */ = { - isa = PBXGroup; - children = ( - 8A3F84102AC4BD85005E2EE8 /* Resources */, - 8A9F7C4B2AC332DC008AE1EA /* Models */, - 8A9F7C4A2AC332BF008AE1EA /* UI */, - 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */, - 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */, - 8A1C837C2AC328BE0096AF73 /* Preview Content */, - ); - path = llama.swiftui; - sourceTree = ""; - }; - 8A1C837C2AC328BE0096AF73 /* Preview Content */ = { - isa = PBXGroup; - children = ( - 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */, - ); - path = "Preview Content"; - sourceTree = ""; - }; - 8A39BE082AC7601000BFEB40 /* Frameworks */ = { - isa = PBXGroup; - children = ( - 549479CA2AC9E16000E0F78B /* Metal.framework */, - 8A39BE092AC7601000BFEB40 /* Accelerate.framework */, - ); - name = Frameworks; - sourceTree = ""; - }; - 8A3F84102AC4BD85005E2EE8 /* Resources */ = { - isa = PBXGroup; - children = ( - 8A3F84112AC4BD8C005E2EE8 /* models */, - ); - path = Resources; - sourceTree = ""; - }; - 8A3F84112AC4BD8C005E2EE8 /* models */ = { - isa = PBXGroup; - children = ( - 8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */, - ); - path = models; - sourceTree = ""; - }; - 8A907F312AC7134E006146EA /* llama.cpp.swift */ = { - isa = PBXGroup; - children = ( - 8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */, - 8A907F322AC7134E006146EA /* LibLlama.swift */, - ); - path = llama.cpp.swift; - sourceTree = ""; - }; - 8A9F7C4A2AC332BF008AE1EA /* UI */ = { - isa = PBXGroup; - children = ( - 8A1C83782AC328BD0096AF73 /* ContentView.swift */, - ); - path = UI; - sourceTree = ""; - }; - 8A9F7C4B2AC332DC008AE1EA /* Models */ = { - isa = PBXGroup; - children = ( - 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */, - ); - path = Models; - sourceTree = ""; - }; + 8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = { + isa = PBXGroup; + children = ( + 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */, + 542376092B0D9C40008E6A1C /* ggml-backend.h */, + 542376062B0D9BEA008E6A1C /* ggml-quants.h */, + 542376072B0D9BFB008E6A1C /* ggml-quants.c */, + 549479C82AC9E10B00E0F78B /* ggml-metal.metal */, + 549479C62AC9E0F200E0F78B /* ggml-metal.h */, + 549479C52AC9E0F200E0F78B /* ggml-metal.m */, + 542EA09B2AC8723900A8AEE9 /* ggml.c */, + 542EA09C2AC8723900A8AEE9 /* ggml.h */, + 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */, + 542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */, + 542EA0A12AC8729100A8AEE9 /* llama.cpp */, + 542EA0A22AC8729100A8AEE9 /* llama.h */, + ); + name = llama.cpp; + sourceTree = ""; + }; + 8A1C836A2AC328BD0096AF73 = { + isa = PBXGroup; + children = ( + 8A08D1F62AC7383900FE6CD4 /* llama.cpp */, + 8A907F312AC7134E006146EA /* llama.cpp.swift */, + 8A3F84232AC4C891005E2EE8 /* models */, + 8A1C83752AC328BD0096AF73 /* llama.swiftui */, + 8A1C83742AC328BD0096AF73 /* Products */, + 8A39BE082AC7601000BFEB40 /* Frameworks */, + ); + sourceTree = ""; + }; + 8A1C83742AC328BD0096AF73 /* Products */ = { + isa = PBXGroup; + children = ( + 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */, + ); + name = Products; + sourceTree = ""; + }; + 8A1C83752AC328BD0096AF73 /* llama.swiftui */ = { + isa = PBXGroup; + children = ( + 8A3F84102AC4BD85005E2EE8 /* Resources */, + 8A9F7C4B2AC332DC008AE1EA /* Models */, + 8A9F7C4A2AC332BF008AE1EA /* UI */, + 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */, + 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */, + 8A1C837C2AC328BE0096AF73 /* Preview Content */, + ); + path = llama.swiftui; + sourceTree = ""; + }; + 8A1C837C2AC328BE0096AF73 /* Preview Content */ = { + isa = PBXGroup; + children = ( + 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */, + ); + path = "Preview Content"; + sourceTree = ""; + }; + 8A39BE082AC7601000BFEB40 /* Frameworks */ = { + isa = PBXGroup; + children = ( + 549479CA2AC9E16000E0F78B /* Metal.framework */, + 8A39BE092AC7601000BFEB40 /* Accelerate.framework */, + ); + name = Frameworks; + sourceTree = ""; + }; + 8A3F84102AC4BD85005E2EE8 /* Resources */ = { + isa = PBXGroup; + children = ( + 8A3F84112AC4BD8C005E2EE8 /* models */, + ); + path = Resources; + sourceTree = ""; + }; + 8A3F84112AC4BD8C005E2EE8 /* models */ = { + isa = PBXGroup; + children = ( + ); + path = models; + sourceTree = ""; + }; + 8A907F312AC7134E006146EA /* llama.cpp.swift */ = { + isa = PBXGroup; + children = ( + 8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */, + 8A907F322AC7134E006146EA /* LibLlama.swift */, + ); + path = llama.cpp.swift; + sourceTree = ""; + }; + 8A9F7C4A2AC332BF008AE1EA /* UI */ = { + isa = PBXGroup; + children = ( + 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */, + 8A1C83782AC328BD0096AF73 /* ContentView.swift */, + ); + path = UI; + sourceTree = ""; + }; + 8A9F7C4B2AC332DC008AE1EA /* Models */ = { + isa = PBXGroup; + children = ( + 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */, + ); + path = Models; + sourceTree = ""; + }; /* End PBXGroup section */ /* Begin PBXNativeTarget section */ - 8A1C83722AC328BD0096AF73 /* llama.swiftui */ = { - isa = PBXNativeTarget; - buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */; - buildPhases = ( - 8A1C836F2AC328BD0096AF73 /* Sources */, - 8A1C83702AC328BD0096AF73 /* Frameworks */, - 8A1C83712AC328BD0096AF73 /* Resources */, - ); - buildRules = ( - ); - dependencies = ( - ); - name = llama.swiftui; - packageProductDependencies = ( - ); - productName = llama.swiftui; - productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */; - productType = "com.apple.product-type.application"; - }; + 8A1C83722AC328BD0096AF73 /* llama.swiftui */ = { + isa = PBXNativeTarget; + buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */; + buildPhases = ( + 8A1C836F2AC328BD0096AF73 /* Sources */, + 8A1C83702AC328BD0096AF73 /* Frameworks */, + 8A1C83712AC328BD0096AF73 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = llama.swiftui; + packageProductDependencies = ( + ); + productName = llama.swiftui; + productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */; + productType = "com.apple.product-type.application"; + }; /* End PBXNativeTarget section */ /* Begin PBXProject section */ - 8A1C836B2AC328BD0096AF73 /* Project object */ = { - isa = PBXProject; - attributes = { - BuildIndependentTargetsInParallel = 1; - LastSwiftUpdateCheck = 1500; - LastUpgradeCheck = 1500; - TargetAttributes = { - 8A1C83722AC328BD0096AF73 = { - CreatedOnToolsVersion = 15.0; - LastSwiftMigration = 1500; - }; - }; - }; - buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */; - compatibilityVersion = "Xcode 14.0"; - developmentRegion = en; - hasScannedForEncodings = 0; - knownRegions = ( - en, - Base, - ); - mainGroup = 8A1C836A2AC328BD0096AF73; - packageReferences = ( - ); - productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */; - projectDirPath = ""; - projectRoot = ""; - targets = ( - 8A1C83722AC328BD0096AF73 /* llama.swiftui */, - ); - }; + 8A1C836B2AC328BD0096AF73 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + 8A1C83722AC328BD0096AF73 = { + CreatedOnToolsVersion = 15.0; + LastSwiftMigration = 1500; + }; + }; + }; + buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = 8A1C836A2AC328BD0096AF73; + packageReferences = ( + ); + productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 8A1C83722AC328BD0096AF73 /* llama.swiftui */, + ); + }; /* End PBXProject section */ /* Begin PBXResourcesBuildPhase section */ - 8A1C83712AC328BD0096AF73 /* Resources */ = { - isa = PBXResourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - 542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */, - 8A3F84242AC4C891005E2EE8 /* models in Resources */, - 8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */, - 8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; + 8A1C83712AC328BD0096AF73 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */, + 8A3F84242AC4C891005E2EE8 /* models in Resources */, + 8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */, + 8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; /* End PBXResourcesBuildPhase section */ /* Begin PBXSourcesBuildPhase section */ - 8A1C836F2AC328BD0096AF73 /* Sources */ = { - isa = PBXSourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - 542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */, - 549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */, - 542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */, - 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */, - 542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */, - 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */, - 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */, - 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */, - 542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */, - 5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; + 8A1C836F2AC328BD0096AF73 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */, + 549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */, + 542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */, + 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */, + 542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */, + 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */, + 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */, + 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */, + 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */, + 542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */, + 5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; /* End PBXSourcesBuildPhase section */ /* Begin XCBuildConfiguration section */ - 8A1C837F2AC328BE0096AF73 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; - CLANG_ANALYZER_NONNULL = YES; - CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - CLANG_ENABLE_OBJC_WEAK = YES; - CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; - CLANG_WARN_BOOL_CONVERSION = YES; - CLANG_WARN_COMMA = YES; - CLANG_WARN_CONSTANT_CONVERSION = YES; - CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; - CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; - CLANG_WARN_DOCUMENTATION_COMMENTS = YES; - CLANG_WARN_EMPTY_BODY = YES; - CLANG_WARN_ENUM_CONVERSION = YES; - CLANG_WARN_INFINITE_RECURSION = YES; - CLANG_WARN_INT_CONVERSION = YES; - CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; - CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; - CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; - CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; - CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; - CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; - CLANG_WARN_STRICT_PROTOTYPES = YES; - CLANG_WARN_SUSPICIOUS_MOVE = YES; - CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; - CLANG_WARN_UNREACHABLE_CODE = YES; - CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = dwarf; - ENABLE_STRICT_OBJC_MSGSEND = YES; - ENABLE_TESTABILITY = YES; - ENABLE_USER_SCRIPT_SANDBOXING = YES; - GCC_C_LANGUAGE_STANDARD = gnu17; - GCC_DYNAMIC_NO_PIC = NO; - GCC_NO_COMMON_BLOCKS = YES; - GCC_OPTIMIZATION_LEVEL = 0; - GCC_PREPROCESSOR_DEFINITIONS = ( - "DEBUG=1", - "$(inherited)", - ); - GCC_WARN_64_TO_32_BIT_CONVERSION = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNDECLARED_SELECTOR = YES; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - GCC_WARN_UNUSED_FUNCTION = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 17.0; - LOCALIZATION_PREFERS_STRING_CATALOGS = YES; - MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; - MTL_FAST_MATH = YES; - ONLY_ACTIVE_ARCH = YES; - SDKROOT = iphoneos; - SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; - SWIFT_OPTIMIZATION_LEVEL = "-Onone"; - }; - name = Debug; - }; - 8A1C83802AC328BE0096AF73 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; - CLANG_ANALYZER_NONNULL = YES; - CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - CLANG_ENABLE_OBJC_WEAK = YES; - CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; - CLANG_WARN_BOOL_CONVERSION = YES; - CLANG_WARN_COMMA = YES; - CLANG_WARN_CONSTANT_CONVERSION = YES; - CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; - CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; - CLANG_WARN_DOCUMENTATION_COMMENTS = YES; - CLANG_WARN_EMPTY_BODY = YES; - CLANG_WARN_ENUM_CONVERSION = YES; - CLANG_WARN_INFINITE_RECURSION = YES; - CLANG_WARN_INT_CONVERSION = YES; - CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; - CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; - CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; - CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; - CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; - CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; - CLANG_WARN_STRICT_PROTOTYPES = YES; - CLANG_WARN_SUSPICIOUS_MOVE = YES; - CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; - CLANG_WARN_UNREACHABLE_CODE = YES; - CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; - ENABLE_NS_ASSERTIONS = NO; - ENABLE_STRICT_OBJC_MSGSEND = YES; - ENABLE_USER_SCRIPT_SANDBOXING = YES; - GCC_C_LANGUAGE_STANDARD = gnu17; - GCC_NO_COMMON_BLOCKS = YES; - GCC_WARN_64_TO_32_BIT_CONVERSION = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNDECLARED_SELECTOR = YES; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - GCC_WARN_UNUSED_FUNCTION = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 17.0; - LOCALIZATION_PREFERS_STRING_CATALOGS = YES; - MTL_ENABLE_DEBUG_INFO = NO; - MTL_FAST_MATH = YES; - SDKROOT = iphoneos; - SWIFT_COMPILATION_MODE = wholemodule; - VALIDATE_PRODUCT = YES; - }; - name = Release; - }; - 8A1C83822AC328BE0096AF73 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CLANG_ENABLE_MODULES = YES; - CODE_SIGN_STYLE = Automatic; - CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\""; - DEVELOPMENT_TEAM = STLSG3FG8Q; - ENABLE_PREVIEWS = YES; - GENERATE_INFOPLIST_FILE = YES; - INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; - INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; - INFOPLIST_KEY_UILaunchScreen_Generation = YES; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; - LD_RUNPATH_SEARCH_PATHS = ( - "$(inherited)", - "@executable_path/Frameworks", - ); - MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift"; - PRODUCT_NAME = "$(TARGET_NAME)"; - SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h"; - SWIFT_OPTIMIZATION_LEVEL = "-Onone"; - SWIFT_VERSION = 5.0; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Debug; - }; - 8A1C83832AC328BE0096AF73 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CLANG_ENABLE_MODULES = YES; - CODE_SIGN_STYLE = Automatic; - CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\""; - DEVELOPMENT_TEAM = STLSG3FG8Q; - ENABLE_PREVIEWS = YES; - GENERATE_INFOPLIST_FILE = YES; - INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; - INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; - INFOPLIST_KEY_UILaunchScreen_Generation = YES; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; - LD_RUNPATH_SEARCH_PATHS = ( - "$(inherited)", - "@executable_path/Frameworks", - ); - MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift"; - PRODUCT_NAME = "$(TARGET_NAME)"; - SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h"; - SWIFT_VERSION = 5.0; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Release; - }; + 8A1C837F2AC328BE0096AF73 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + 8A1C83802AC328BE0096AF73 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + 8A1C83822AC328BE0096AF73 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\""; + DEVELOPMENT_TEAM = STLSG3FG8Q; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + 8A1C83832AC328BE0096AF73 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\""; + DEVELOPMENT_TEAM = STLSG3FG8Q; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h"; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; /* End XCBuildConfiguration section */ /* Begin XCConfigurationList section */ - 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - 8A1C837F2AC328BE0096AF73 /* Debug */, - 8A1C83802AC328BE0096AF73 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; - 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - 8A1C83822AC328BE0096AF73 /* Debug */, - 8A1C83832AC328BE0096AF73 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; + 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 8A1C837F2AC328BE0096AF73 /* Debug */, + 8A1C83802AC328BE0096AF73 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 8A1C83822AC328BE0096AF73 /* Debug */, + 8A1C83832AC328BE0096AF73 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; /* End XCConfigurationList section */ - }; - rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */; + }; + rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */; } diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift index babc60cdc..3393eb242 100644 --- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift +++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift @@ -3,24 +3,26 @@ import Foundation @MainActor class LlamaState: ObservableObject { @Published var messageLog = "" + @Published var cacheCleared = false private var llamaContext: LlamaContext? - private var modelUrl: URL? { - Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models") + private var defaultModelUrl: URL? { + Bundle.main.url(forResource: "ggml-model", withExtension: "gguf", subdirectory: "models") // Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models") } + init() { do { - try loadModel() + try loadModel(modelUrl: defaultModelUrl) } catch { messageLog += "Error!\n" } } - private func loadModel() throws { + func loadModel(modelUrl: URL?) throws { messageLog += "Loading model...\n" if let modelUrl { - llamaContext = try LlamaContext.createContext(path: modelUrl.path()) + llamaContext = try LlamaContext.create_context(path: modelUrl.path()) messageLog += "Loaded model \(modelUrl.lastPathComponent)\n" } else { messageLog += "Could not locate model\n" @@ -31,7 +33,7 @@ class LlamaState: ObservableObject { guard let llamaContext else { return } - messageLog += "Attempting to complete text...\n" + await llamaContext.completion_init(text: text) messageLog += "\(text)" @@ -42,4 +44,42 @@ class LlamaState: ObservableObject { await llamaContext.clear() messageLog += "\n\ndone\n" } + + func bench() async { + guard let llamaContext else { + return + } + + messageLog += "\n" + messageLog += "Running benchmark...\n" + messageLog += "Model info: " + messageLog += await llamaContext.model_info() + "\n" + + let t_start = DispatchTime.now().uptimeNanoseconds + await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up + let t_end = DispatchTime.now().uptimeNanoseconds + + let t_heat = Double(t_end - t_start) / 1_000_000_000.0 + messageLog += "Heat up time: \(t_heat) seconds, please wait...\n" + + // if more than 5 seconds, then we're probably running on a slow device + if t_heat > 5.0 { + messageLog += "Heat up time is too long, aborting benchmark\n" + return + } + + let result = await llamaContext.bench(pp: 512, tg: 128, pl: 1, nr: 3) + + messageLog += "\(result)" + messageLog += "\n" + } + + func clear() async { + guard let llamaContext else { + return + } + + await llamaContext.clear() + messageLog = "" + } } diff --git a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift index 0bd16a806..219bf4dc1 100644 --- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift +++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift @@ -5,24 +5,97 @@ struct ContentView: View { @State private var multiLineText = "" + private static func cleanupModelCaches() { + // Delete all models (*.gguf) + let fileManager = FileManager.default + let documentsUrl = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0] + do { + let fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil) + for fileURL in fileURLs { + if fileURL.pathExtension == "gguf" { + try fileManager.removeItem(at: fileURL) + } + } + } catch { + print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)") + } + } + var body: some View { VStack { - ScrollView(.vertical) { + ScrollView(.vertical, showsIndicators: true) { Text(llamaState.messageLog) + .font(.system(size: 12)) + .frame(maxWidth: .infinity, alignment: .leading) + .padding() + .onTapGesture { + UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil) + } } TextEditor(text: $multiLineText) - .frame(height: 200) + .frame(height: 80) .padding() .border(Color.gray, width: 0.5) - Button(action: { - sendText() - }) { - Text("Send") - .padding() - .background(Color.blue) - .foregroundColor(.white) - .cornerRadius(8) + + HStack { + Button("Send") { + sendText() + } + .padding(8) + .background(Color.blue) + .foregroundColor(.white) + .cornerRadius(8) + + Button("Bench") { + bench() + } + .padding(8) + .background(Color.blue) + .foregroundColor(.white) + .cornerRadius(8) + + Button("Clear") { + clear() + } + .padding(8) + .background(Color.blue) + .foregroundColor(.white) + .cornerRadius(8) + + Button("Copy") { + UIPasteboard.general.string = llamaState.messageLog + } + .padding(8) + .background(Color.blue) + .foregroundColor(.white) + .cornerRadius(8) + } + + VStack { + DownloadButton( + llamaState: llamaState, + modelName: "TinyLlama-1.1B (Q4_0)", + modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true", + filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf" + ) + .font(.system(size: 12)) + .padding(.top, 4) + + DownloadButton( + llamaState: llamaState, + modelName: "TinyLlama-1.1B (Q8_0)", + modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true", + filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf" + ) + .font(.system(size: 12)) + + Button("Clear downloaded models") { + ContentView.cleanupModelCaches() + llamaState.cacheCleared = true + } + .padding(8) + .font(.system(size: 12)) } } .padding() @@ -34,9 +107,20 @@ struct ContentView: View { multiLineText = "" } } + + func bench() { + Task { + await llamaState.bench() + } + } + + func clear() { + Task { + await llamaState.clear() + } + } } -/* -#Preview { - ContentView() -} -*/ + +//#Preview { +// ContentView() +//} diff --git a/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift b/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift new file mode 100644 index 000000000..4bd75cb69 --- /dev/null +++ b/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift @@ -0,0 +1,122 @@ +import SwiftUI + +struct DownloadButton: View { + @ObservedObject private var llamaState: LlamaState + private var modelName: String + private var modelUrl: String + private var filename: String + + @State private var status: String + + @State private var downloadTask: URLSessionDownloadTask? + @State private var progress = 0.0 + @State private var observation: NSKeyValueObservation? + + private static func getFileURL(filename: String) -> URL { + FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename) + } + + private func checkFileExistenceAndUpdateStatus() { + } + + init(llamaState: LlamaState, modelName: String, modelUrl: String, filename: String) { + self.llamaState = llamaState + self.modelName = modelName + self.modelUrl = modelUrl + self.filename = filename + + let fileURL = DownloadButton.getFileURL(filename: filename) + status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download" + } + + private func download() { + status = "downloading" + print("Downloading model \(modelName) from \(modelUrl)") + guard let url = URL(string: modelUrl) else { return } + let fileURL = DownloadButton.getFileURL(filename: filename) + + downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in + if let error = error { + print("Error: \(error.localizedDescription)") + return + } + + guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else { + print("Server error!") + return + } + + do { + if let temporaryURL = temporaryURL { + try FileManager.default.copyItem(at: temporaryURL, to: fileURL) + print("Writing to \(filename) completed") + + llamaState.cacheCleared = false + + status = "downloaded" + } + } catch let err { + print("Error: \(err.localizedDescription)") + } + } + + observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in + self.progress = progress.fractionCompleted + } + + downloadTask?.resume() + } + + var body: some View { + VStack { + if status == "download" { + Button(action: download) { + Text("Download " + modelName) + } + } else if status == "downloading" { + Button(action: { + downloadTask?.cancel() + status = "download" + }) { + Text("\(modelName) (Downloading \(Int(progress * 100))%)") + } + } else if status == "downloaded" { + Button(action: { + let fileURL = DownloadButton.getFileURL(filename: filename) + if !FileManager.default.fileExists(atPath: fileURL.path) { + download() + return + } + do { + try llamaState.loadModel(modelUrl: fileURL) + } catch let err { + print("Error: \(err.localizedDescription)") + } + }) { + Text("\(modelName) (Downloaded)") + } + } else { + Text("Unknown status") + } + } + .onDisappear() { + downloadTask?.cancel() + } + .onChange(of: llamaState.cacheCleared) { newValue in + if newValue { + downloadTask?.cancel() + let fileURL = DownloadButton.getFileURL(filename: filename) + status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download" + } + } + } +} + +// #Preview { +// DownloadButton( +// llamaState: LlamaState(), +// modelName: "TheBloke / TinyLlama-1.1B-1T-OpenOrca-GGUF (Q4_0)", +// modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true", +// filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf" +// ) +// } diff --git a/llama.cpp b/llama.cpp index f49214c13..fd9fd6ed9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2397,25 +2397,25 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { switch (ftype) { case LLAMA_FTYPE_ALL_F32: return "all F32"; - case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; - case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; - case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; + case LLAMA_FTYPE_MOSTLY_F16: return "F16"; + case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; + case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: - return "mostly Q4_1, some F16"; - case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; - case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; - case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; + return "Q4_1, some F16"; + case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; + case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1"; + case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; // K-quants - case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K"; - case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large"; - case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K"; + case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K"; + case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; + case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; default: return "unknown, may not work"; } @@ -2533,6 +2533,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { + case 22: model.type = e_model::MODEL_1B; break; case 26: model.type = e_model::MODEL_3B; break; case 32: model.type = e_model::MODEL_7B; break; case 40: model.type = e_model::MODEL_13B; break; From b1306c439490c7fa4ec33594500d980d1e9e15e6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 17 Dec 2023 20:16:23 +0200 Subject: [PATCH 12/20] readme : update hot topics --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index edbe6ba57..01aef2afc 100644 --- a/README.md +++ b/README.md @@ -10,11 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ### Hot topics +- Collecting Apple Silicon performance stats: + - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167 + - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508 - Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406 -- **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309** -- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 -- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167 ---- From 2994f0c5a2e8c96955b422dedc93ec2595d16b82 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Sun, 17 Dec 2023 19:39:02 -0500 Subject: [PATCH 13/20] decode : fix logits_valid for legacy API (#4516) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index fd9fd6ed9..d6d575f9e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6184,7 +6184,7 @@ static int llama_decode_internal( logits_out.resize(n_vocab); memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); #ifndef NDEBUG - logits_valid[n_tokens - 1] = true; + logits_valid[0] = true; #endif } } From 3c04bf6da89eaf4c7d317e0518f0687dfcbf2de7 Mon Sep 17 00:00:00 2001 From: hankcs Date: Mon, 18 Dec 2023 05:14:58 -0800 Subject: [PATCH 14/20] llama : fix try_override for bool_value which always return true (#4519) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index d6d575f9e..99facbf77 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1937,7 +1937,7 @@ namespace GGUFMeta { target = override->bool_value; return true; } - return true; + return false; } template From b9e74f9bca5fdf7d0a22ed25e7a9626335fdfa48 Mon Sep 17 00:00:00 2001 From: Ebey Abraham Date: Mon, 18 Dec 2023 17:27:47 +0000 Subject: [PATCH 15/20] llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490) * phi2 implementation * fix breaking change * phi-2 : various fixes * phi-2 : use layer norm eps * py : whitespaces * llama : fix meta KV override bug * convert : phi don't add BOS token * convert : revert "added_tokens_decoder" change * phi-2 : scale Q instead of KQ for better precision * ggml : fix NeoX rope to rotate just first n_dims * cuda : less diff in the rope_neox kernel * ggml : add ggml_mul_mat_set_prec ggml-ci * Update ggml-cuda.cu Co-authored-by: slaren * Update ggml-cuda.cu Co-authored-by: slaren * cuda : ggml_cuda_op_mul_mat_cublas support F32 precision * cuda : remove oboslete comment --------- Co-authored-by: Ebey Abraham Co-authored-by: Georgi Gerganov Co-authored-by: slaren --- convert-hf-to-gguf.py | 22 +++ ggml-cuda.cu | 117 +++++++++---- ggml-metal.metal | 13 +- ggml.c | 46 ++++- ggml.h | 12 ++ gguf-py/gguf/constants.py | 13 ++ gguf-py/gguf/tensor_mapping.py | 8 + llama.cpp | 307 +++++++++++++++++++++++++++++---- tests/test-backend-ops.cpp | 1 + 9 files changed, 463 insertions(+), 76 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index e46a7813a..e71a96c48 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -182,6 +182,8 @@ class Model: return QwenModel if model_architecture == "MixtralForCausalLM": return MixtralModel + if model_architecture == "PhiForCausalLM": + return Phi2Model return Model def _is_model_safetensors(self) -> bool: @@ -221,6 +223,8 @@ class Model: return gguf.MODEL_ARCH.QWEN if arch == "MixtralForCausalLM": return gguf.MODEL_ARCH.LLAMA + if arch == "PhiForCausalLM": + return gguf.MODEL_ARCH.PHI2 raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -980,6 +984,24 @@ class QwenModel(Model): print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) + +class Phi2Model(Model): + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("Phi2") + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_add_bos_token(False) + + ###### CONVERSION LOGIC ###### diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0a63c1ecf..d0f3d8034 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4998,7 +4998,16 @@ static __global__ void rope_neox( const int ib = col / n_dims; const int ic = col % n_dims; - const int i = row*ncols + ib*n_dims + ic/2; + if (ib > 0) { + const int i = row*ncols + ib*n_dims + ic; + + dst[i + 0] = x[i + 0]; + dst[i + 1] = x[i + 1]; + + return; + } + + const int i = row*ncols + ib*n_dims + ic/2; const int i2 = row/p_delta_rows; float cur_rot = inv_ndims * ic - ib; @@ -7057,6 +7066,7 @@ inline void ggml_cuda_op_upscale( (void) src1; (void) dst; + (void) src1_dd; } inline void ggml_cuda_op_pad( @@ -7073,6 +7083,7 @@ inline void ggml_cuda_op_pad( (void) src1; (void) dst; + (void) src1_dd; } inline void ggml_cuda_op_rms_norm( @@ -7376,7 +7387,7 @@ inline void ggml_cuda_op_mul_mat_cublas( const int compute_capability = g_compute_capabilities[id]; - if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { + if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 half * src0_as_f16 = nullptr; size_t src0_as = 0; @@ -8300,27 +8311,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor } static __global__ void k_compute_batched_ptrs( - const half * src0_as_f16, const half * src1_as_f16, half * dst_f16, + const half * src0_as_f16, const half * src1_as_f16, char * dst, const void ** ptrs_src, void ** ptrs_dst, - int ne12, int ne13, - int ne23, - int nb02, int nb03, - int nb12, int nb13, - int nb2, int nb3, - int r2, int r3) { - int i13 = blockIdx.x * blockDim.x + threadIdx.x; - int i12 = blockIdx.y * blockDim.y + threadIdx.y; + int64_t ne12, int64_t ne13, + int64_t ne23, + size_t nb02, size_t nb03, + size_t nb12, size_t nb13, + size_t nbd2, size_t nbd3, + int64_t r2, int64_t r3) { + int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x; + int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y; if (i13 >= ne13 || i12 >= ne12) { return; } - int i03 = i13 / r3; - int i02 = i12 / r2; + int64_t i03 = i13 / r3; + int64_t i02 = i12 / r2; ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2; - ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2; + ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; } static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -8376,7 +8387,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream); size_t dst_as = 0; - half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as); + + half * dst_f16 = nullptr; + char * dst_t = nullptr; + + cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; + cudaDataType_t cu_data_type = CUDA_R_16F; + + // dst strides + size_t nbd2 = dst->nb[2]; + size_t nbd3 = dst->nb[3]; + + const half alpha_f16 = 1.0f; + const half beta_f16 = 0.0f; + + const float alpha_f32 = 1.0f; + const float beta_f32 = 0.0f; + + const void * alpha = &alpha_f16; + const void * beta = &beta_f16; + + if (dst->op_params[0] == GGML_PREC_DEFAULT) { + dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as); + dst_t = (char *) dst_f16; + + nbd2 /= sizeof(float) / sizeof(half); + nbd3 /= sizeof(float) / sizeof(half); + } else { + dst_t = (char *) dst_ddf; + + cu_compute_type = CUBLAS_COMPUTE_32F; + cu_data_type = CUDA_R_32F; + + alpha = &alpha_f32; + beta = &beta_f32; + } GGML_ASSERT(ne12 % ne02 == 0); GGML_ASSERT(ne13 % ne03 == 0); @@ -8385,9 +8430,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const const int64_t r2 = ne12/ne02; const int64_t r3 = ne13/ne03; - const half alpha_f16 = 1.0f; - const half beta_f16 = 0.0f; - #if 0 // use cublasGemmEx { @@ -8397,12 +8439,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const int i02 = i12 / r2; CUBLAS_CHECK( - cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), - (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), - &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01, - CUBLAS_COMPUTE_16F, + alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), + (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), + beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01, + cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } @@ -8414,11 +8456,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const CUBLAS_CHECK( cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA - (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB - &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC + alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA + (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB + beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC ne12*ne13, - CUBLAS_COMPUTE_16F, + cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { // use cublasGemmBatchedEx @@ -8435,24 +8477,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const dim3 block_dims(ne13, ne12); k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( - src0_as_f16, src1_as_f16, dst_f16, + src0_as_f16, src1_as_f16, dst_t, ptrs_src, ptrs_dst, ne12, ne13, ne23, nb02, nb03, nb12, nb13, - dst->nb[2], dst->nb[3], + nbd2, nbd3, r2, r3); CUDA_CHECK(cudaGetLastError()); CUBLAS_CHECK( cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half), - (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float), - &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01, + alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half), + (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float), + beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01, ne23, - CUBLAS_COMPUTE_16F, + cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); if (ptrs_src_s != 0) { @@ -8464,11 +8506,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const } #endif - const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); - to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); + if (dst->op_params[0] == GGML_PREC_DEFAULT) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); + to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); + + ggml_cuda_pool_free(dst_f16, dst_as); + } ggml_cuda_pool_free(src1_as_f16, src1_as); - ggml_cuda_pool_free(dst_f16, dst_as); } static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { diff --git a/ggml-metal.metal b/ggml-metal.metal index fe0ada445..d5b54e112 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1702,8 +1702,9 @@ kernel void kernel_rope( dst_data[1] = x0*sin_theta + x1*cos_theta; } } else { - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { + for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) { + if (ic < n_dims) { + const int64_t ib = 0; // simplified from `(ib * n_dims + ic) * inv_ndims` const float cur_rot = inv_ndims*ic - ib; @@ -1722,6 +1723,14 @@ kernel void kernel_rope( dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + const int64_t i0 = ic; + + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; } } } diff --git a/ggml.c b/ggml.c index ad546a731..6da65bd92 100644 --- a/ggml.c +++ b/ggml.c @@ -4098,6 +4098,14 @@ struct ggml_tensor * ggml_mul_mat( return result; } +void ggml_mul_mat_set_prec( + struct ggml_tensor * a, + enum ggml_prec prec) { + const int32_t prec_i32 = (int32_t) prec; + + ggml_set_op_params_i32(a, 0, prec_i32); +} + // ggml_mul_mat_id struct ggml_tensor * ggml_mul_mat_id( @@ -9168,6 +9176,8 @@ static void ggml_compute_forward_norm_f32( float eps; memcpy(&eps, dst->op_params, sizeof(float)); + GGML_ASSERT(eps > 0.0f); + // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -9237,6 +9247,8 @@ static void ggml_compute_forward_rms_norm_f32( float eps; memcpy(&eps, dst->op_params, sizeof(float)); + GGML_ASSERT(eps > 0.0f); + // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -11562,10 +11574,13 @@ static void ggml_compute_forward_rope_f32( } } else { // TODO: this might be wrong for ne0 != n_dims - need double check - // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + // it seems we have to rope just the first n_dims elements and do nothing with the rest + // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26 theta_base *= freq_scale; - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < n_dims; ic += 2) { + for (int64_t ic = 0; ic < ne0; ic += 2) { + if (ic < n_dims) { + const int64_t ib = 0; + // simplified from `(ib * n_dims + ic) * inv_ndims` float cur_rot = inv_ndims * ic - ib; @@ -11588,6 +11603,14 @@ static void ggml_compute_forward_rope_f32( dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + const int64_t i0 = ic; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; } } } @@ -11715,10 +11738,13 @@ static void ggml_compute_forward_rope_f16( } } else { // TODO: this might be wrong for ne0 != n_dims - need double check - // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + // it seems we have to rope just the first n_dims elements and do nothing with the rest + // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26 theta_base *= freq_scale; - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < n_dims; ic += 2) { + for (int64_t ic = 0; ic < ne0; ic += 2) { + if (ic < n_dims) { + const int64_t ib = 0; + // simplified from `(ib * n_dims + ic) * inv_ndims` float cur_rot = inv_ndims * ic - ib; @@ -11741,6 +11767,14 @@ static void ggml_compute_forward_rope_f16( dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } else { + const int64_t i0 = ic; + + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; } } } diff --git a/ggml.h b/ggml.h index 68f7833b6..f1003984f 100644 --- a/ggml.h +++ b/ggml.h @@ -343,6 +343,12 @@ extern "C" { GGML_TYPE_COUNT, }; + // precision + enum ggml_prec { + GGML_PREC_DEFAULT, + GGML_PREC_F32, + }; + enum ggml_backend_type { GGML_BACKEND_CPU = 0, GGML_BACKEND_GPU = 10, @@ -1057,6 +1063,12 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // change the precision of a matrix multiplication + // set to GGML_PREC_F32 for higher precision (useful for phi-2) + GGML_API void ggml_mul_mat_set_prec( + struct ggml_tensor * a, + enum ggml_prec prec); + // indirect matrix multiplication // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b) GGML_API struct ggml_tensor * ggml_mul_mat_id( diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 12133882b..390dca049 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -95,6 +95,7 @@ class MODEL_ARCH(IntEnum): BLOOM = auto() STABLELM = auto() QWEN = auto() + PHI2 = auto() class MODEL_TENSOR(IntEnum): @@ -140,6 +141,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.QWEN: "qwen", + MODEL_ARCH.PHI2: "phi2", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -350,6 +352,17 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_ARCH.GPT2: [ # TODO ], + MODEL_ARCH.PHI2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ] # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0115ea1c6..6fcbdbc1c 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -17,6 +17,7 @@ class TensorNameMap: "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert "language_model.embedding.word_embeddings", # persimmon + "transformer.embd.wte", # phi2 ), # Token type embeddings @@ -41,6 +42,7 @@ class TensorNameMap: "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen "output", # llama-pth bloom "word_embeddings_for_head", # persimmon + "lm_head.linear", # phi2 ), # Output norm @@ -53,6 +55,7 @@ class TensorNameMap: "transformer.norm_f", # mpt "ln_f", # refact bloom qwen "language_model.encoder.final_layernorm", # persimmon + "lm_head.ln", # phi2 ), # Rope frequencies @@ -75,6 +78,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.output.LayerNorm", # bert "language_model.encoder.layers.{bid}.input_layernorm", # persimmon "model.layers.{bid}.ln1", # yi + "transformer.h.{bid}.ln", # phi2 ), # Attention norm 2 @@ -90,6 +94,7 @@ class TensorNameMap: "transformer.h.{bid}.self_attention.query_key_value", # falcon "h.{bid}.self_attention.query_key_value", # bloom "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon + "transformer.h.{bid}.mixer.Wqkv", # phi2 ), # Attention query @@ -128,6 +133,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.output.dense", # bert "transformer.h.{bid}.attn.out_proj", # gpt-j "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon + "transformer.h.{bid}.mixer.out_proj", # phi2 ), # Rotary embeddings @@ -167,6 +173,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.fc_in", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon "transformer.h.{bid}.mlp.w1", # qwen + "transformer.h.{bid}.mlp.fc1", # phi2 ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -198,6 +205,7 @@ class TensorNameMap: "encoder.layer.{bid}.output.dense", # bert "transformer.h.{bid}.mlp.fc_out", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon + "transformer.h.{bid}.mlp.fc2", # phi2 ), MODEL_TENSOR.FFN_DOWN_EXP: ( diff --git a/llama.cpp b/llama.cpp index 99facbf77..edd2910b3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -195,6 +195,7 @@ enum llm_arch { LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, LLM_ARCH_QWEN, + LLM_ARCH_PHI2, LLM_ARCH_UNKNOWN, }; @@ -212,6 +213,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_QWEN, "qwen" }, + { LLM_ARCH_PHI2, "phi2" }, }; enum llm_kv { @@ -550,6 +552,19 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_PHI2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, @@ -1420,6 +1435,7 @@ struct llama_model { struct ggml_tensor * output_norm; struct ggml_tensor * output_norm_b; struct ggml_tensor * output; + struct ggml_tensor * output_b; std::vector layers; @@ -2635,6 +2651,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_PHI2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_3B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -2987,7 +3012,7 @@ static void llm_load_tensors( (void) main_gpu; - enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; + enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU; #ifdef GGML_USE_CUBLAS @@ -3630,7 +3655,73 @@ static void llm_load_tensors( } } } break; + case LLM_ARCH_PHI2: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + // output + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + backend_norm = llama_backend_offload; + backend_output = llama_backend_offload; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + vram_weights += ggml_nbytes(model.output_norm_b); + vram_weights += ggml_nbytes(model.output); + vram_weights += ggml_nbytes(model.output_b); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b); + } + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -3991,6 +4082,7 @@ static struct ggml_tensor * llm_build_ffn( // if max_alibi_bias > 0 then apply ALiBi static struct ggml_tensor * llm_build_kqv( struct ggml_context * ctx, + const llama_model & model, const llama_hparams & hparams, const llama_kv_cache & kv, struct ggml_tensor * wo, @@ -4002,6 +4094,7 @@ static struct ggml_tensor * llm_build_kqv( int32_t n_tokens, int32_t n_kv, float max_alibi_bias, + float scale, const llm_build_cb & cb, int il) { const int64_t n_embd = hparams.n_embd; @@ -4024,6 +4117,12 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); + if (model.arch == LLM_ARCH_PHI2) { + // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs + // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + } + if (max_alibi_bias > 0.0f) { // temporary branch until we figure out how to handle ggml_alibi through ggml_add kq = ggml_scale(ctx, kq, kq_scale); @@ -4043,7 +4142,7 @@ static struct ggml_tensor * llm_build_kqv( kq = ggml_soft_max(ctx, kq); cb(kq, "kq_soft_max", il); } else { - kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head))); + kq = ggml_soft_max_ext(ctx, kq, kq_mask, scale); cb(kq, "kq_soft_max_ext", il); } @@ -4250,9 +4349,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4433,9 +4532,9 @@ struct llm_build_context { // apply ALiBi for 13B model const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f; - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4557,9 +4656,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4657,9 +4756,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4866,9 +4965,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); // TODO: not tested, could be broken - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4957,9 +5056,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5054,9 +5153,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5148,9 +5247,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5261,9 +5360,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5320,15 +5419,15 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); // KQ_scale - struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed @@ -5378,9 +5477,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5422,6 +5521,122 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); + return gf; + } + struct ggml_cgraph * build_phi2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + struct ggml_tensor * cur; + struct ggml_tensor * attn_norm_output; + struct ggml_tensor * ffn_output; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // Q_scale + struct ggml_tensor * Q_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(Q_scale, "Q_scale", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + attn_norm_output = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(attn_norm_output, "attn_norm", il); + + // self-attention + { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_custom( + ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Qcur = ggml_scale(ctx0, Qcur, Q_scale); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, model, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il); + cb(cur, "kqv_out", il); + } + + // FF + { + ffn_output = llm_build_ffn(ctx0, attn_norm_output, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(ffn_output, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_output); + cb(cur, "l_out", il); + + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "l_out", il); + + inpL = cur; + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output_no_bias", -1); + + cur = ggml_add(ctx0, cur, model.output_b); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + return gf; } }; @@ -5437,7 +5652,7 @@ enum llm_offload_func_e { OFFLOAD_FUNC_FRC, // force offload OFFLOAD_FUNC_KQV, OFFLOAD_FUNC_NR, - OFFLOAD_FUNC_EMB, + OFFLOAD_FUNC_EMB, // embeddings OFFLOAD_FUNC_OUT, }; @@ -5522,6 +5737,7 @@ static const std::unordered_map k_offload_map { "pos_embd", OFFLOAD_FUNC_NR }, { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope) + { "Q_scale", OFFLOAD_FUNC_FRC }, { "KQ_scale", OFFLOAD_FUNC_FRC }, { "KQ_mask", OFFLOAD_FUNC_FRC }, { "K_shift", OFFLOAD_FUNC_FRC }, @@ -5606,6 +5822,7 @@ static const std::unordered_map k_offload_map { "l_out", OFFLOAD_FUNC }, { "result_norm", OFFLOAD_FUNC_EMB }, + { "result_output_no_bias", OFFLOAD_FUNC_EMB }, { "result_output", OFFLOAD_FUNC_OUT }, }; @@ -5623,6 +5840,7 @@ static struct ggml_cgraph * llama_build_graph( bool alloc_inp_tokens = false; bool alloc_inp_embd = false; bool alloc_inp_pos = false; + bool alloc_inp_Q_scale = false; bool alloc_inp_KQ_scale = false; bool alloc_inp_KQ_mask = false; bool alloc_inp_K_shift = false; @@ -5690,7 +5908,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_pos = true; } - if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { + if (!alloc_inp_Q_scale && strcmp(name, "Q_scale") == 0) { ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -5698,6 +5916,23 @@ static struct ggml_cgraph * llama_build_graph( ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); } + alloc_inp_Q_scale = true; + } + + if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_embd_head = model.hparams.n_embd_head(); + if (model.arch == LLM_ARCH_PHI2) { + // with phi2, we scale the Q to avoid precision issues + // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66 + ggml_set_f32(cur, 1.0f); + } else { + ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); + } + } + alloc_inp_KQ_scale = true; } @@ -5922,6 +6157,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_qwen(); } break; + case LLM_ARCH_PHI2: + { + result = llm.build_phi2(); + } break; default: GGML_ASSERT(false); } @@ -6055,12 +6294,16 @@ static int llama_decode_internal( ggml_allocr_alloc_graph(lctx.alloc, gf); - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + // the output is always the last tensor in the graph + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + GGML_ASSERT(strcmp(res->name, "result_output") == 0); + + // the embeddings could be the second to last tensor, or the third to last tensor struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; - - GGML_ASSERT(strcmp(res->name, "result_output") == 0); - GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); - + if (strcmp(embeddings->name, "result_norm") != 0) { + embeddings = gf->nodes[gf->n_nodes - 3]; + GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + } #ifdef GGML_USE_CUBLAS for (int i = 0; i < gf->n_leafs; i++) { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index df2c3fb6e..f04b9438a 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1555,6 +1555,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512)); // neox (falcon 40B) test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512)); // neox (falcon 40B) test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512)); // neox (stablelm) + test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512)); // neox (phi-2) } test_cases.emplace_back(new test_alibi()); From 6ff39b129d0281d045f83d515e51b7197b44b253 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Dec 2023 20:05:12 +0200 Subject: [PATCH 16/20] llama.swiftui : add more models --- .../llama.cpp.swift/LibLlama.swift | 2 +- .../llama.swiftui/UI/ContentView.swift | 31 +++++++++++++++++-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 272e1fd8a..464fb3277 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -203,7 +203,7 @@ actor LlamaContext { var pp_std: Double = 0 var tg_std: Double = 0 - for r in 0.. Date: Mon, 18 Dec 2023 20:17:43 +0200 Subject: [PATCH 17/20] llama.swiftui : add tinyllama 1.1B F16 --- .../llama.swiftui/llama.swiftui/UI/ContentView.swift | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift index 9cbe8efd6..c78f107b3 100644 --- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift +++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift @@ -91,6 +91,15 @@ struct ContentView: View { ) .font(.system(size: 12)) + DownloadButton( + llamaState: llamaState, + modelName: "TinyLlama-1.1B (F16, 2.2 GiB)", + modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true", + filename: "tinyllama-1.1b-f16.gguf" + ) + .font(.system(size: 12)) + .frame(maxWidth: .infinity, alignment: .leading) + DownloadButton( llamaState: llamaState, modelName: "Phi-2.7B (Q4_0, 1.6 GiB)", @@ -98,7 +107,6 @@ struct ContentView: View { filename: "phi-2-q4_0.gguf" ) .font(.system(size: 12)) - .frame(maxWidth: .infinity, alignment: .leading) DownloadButton( llamaState: llamaState, @@ -107,6 +115,7 @@ struct ContentView: View { filename: "phi-2-q8_0.gguf" ) .font(.system(size: 12)) + .frame(maxWidth: .infinity, alignment: .leading) DownloadButton( llamaState: llamaState, @@ -115,7 +124,6 @@ struct ContentView: View { filename: "mistral-7b-v0.1.Q4_0.gguf" ) .font(.system(size: 12)) - .frame(maxWidth: .infinity, alignment: .leading) Button("Clear downloaded models") { ContentView.cleanupModelCaches() From a7aee47b98e45539d491071b25778b833b77e387 Mon Sep 17 00:00:00 2001 From: arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com> Date: Mon, 18 Dec 2023 22:33:45 +0100 Subject: [PATCH 18/20] ggml-cuda: Fix HIP build (#4528) regression of #4490 Adds defines for two new datatypes cublasComputeType_t, cudaDataType_t. Currently using deprecated hipblasDatatype_t since newer ones very recent. --- ggml-cuda.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d0f3d8034..f20846fef 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -31,6 +31,7 @@ #define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_32F HIPBLAS_R_32F #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 #define cublasCreate hipblasCreate #define cublasGemmEx hipblasGemmEx #define cublasGemmBatchedEx hipblasGemmBatchedEx @@ -40,6 +41,7 @@ #define cublasSetStream hipblasSetStream #define cublasSgemm hipblasSgemm #define cublasStatus_t hipblasStatus_t +#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess From 328b83de23b33240e28f4e74900d1d06726f5eb1 Mon Sep 17 00:00:00 2001 From: Eric Sommerlade Date: Tue, 19 Dec 2023 16:17:01 +0000 Subject: [PATCH 19/20] ggml : fixed check for _MSC_VER (#4535) Co-authored-by: Eric Sommerlade --- ggml.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.h b/ggml.h index f1003984f..beacdc8be 100644 --- a/ggml.h +++ b/ggml.h @@ -303,7 +303,7 @@ extern "C" { #if defined(__ARM_NEON) && defined(__CUDACC__) typedef half ggml_fp16_t; -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) && !defined(_MSC_VER) typedef __fp16 ggml_fp16_t; #else typedef uint16_t ggml_fp16_t; From 799fc2268989482054944c902874cca76337580f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 20 Dec 2023 15:41:22 +0100 Subject: [PATCH 20/20] CUDA: Faster Mixtral prompt processing (#4538) * CUDA: make MoE tensors contiguous for batch size>1 * Update ggml-cuda.cu Co-authored-by: slaren --------- Co-authored-by: slaren --- ggml-cuda.cu | 118 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 93 insertions(+), 25 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f20846fef..9f4b188cb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7830,6 +7830,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) { } #ifdef NDEBUG + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); + CUDA_CHECK(cudaDeviceSynchronize()); + } + for (int id = 0; id < g_device_count; ++id) { CUDA_CHECK(ggml_cuda_set_device(id)); @@ -7881,8 +7886,6 @@ static void ggml_cuda_op_mul_mat( const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - ggml_cuda_set_peer_access(ne11); - GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); @@ -8781,16 +8784,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + const int64_t nb11 = src1->nb[1]; + const int64_t nb1 = dst->nb[1]; + const struct ggml_tensor * ids = src0; const int32_t id = ((int32_t *) dst->op_params)[0]; const int32_t n_as = ((int32_t *) dst->op_params)[1]; std::vector ids_host(ggml_nbytes(ids)); + const cudaStream_t stream = g_cudaStreams[g_main_device][0]; + if (ids->backend == GGML_BACKEND_GPU) { const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); - CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } else { memcpy(ids_host.data(), ids->data, ggml_nbytes(ids)); } @@ -8804,37 +8812,93 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s ggml_tensor src1_row = *src1; ggml_tensor dst_row = *dst; - src1_row.ne[1] = 1; - dst_row.ne[1] = 1; - - src1_row.nb[2] = src1_row.nb[1]; - dst_row.nb[2] = dst_row.nb[1]; - - src1_row.nb[3] = src1_row.nb[1]; - dst_row.nb[3] = dst_row.nb[1]; - src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; + char * src1_original = (char *) src1_extra->data_device[g_main_device]; + char * dst_original = (char *) dst_extra->data_device[g_main_device]; - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - //int32_t row_id; - //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); - //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + if (src1->ne[1] == 1) { + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + //int32_t row_id; + //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); + //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); - const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); - GGML_ASSERT(row_id >= 0 && row_id < n_as); + GGML_ASSERT(row_id >= 0 && row_id < n_as); - const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + const struct ggml_tensor * src0_row = dst->src[row_id + 2]; - src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1]; - src1_row.data = (char *) src1->data + i01*src1->nb[1]; + src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1]; + src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set? - dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1]; - dst_row.data = (char *) dst->data + i01*dst->nb[1]; + dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1]; + dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set? - ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + } + } else { + size_t as_src1, as_dst; + char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1); + char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst); + + src1_row_extra.data_device[g_main_device] = src1_contiguous; + dst_row_extra.data_device[g_main_device] = dst_contiguous; + + for (int32_t row_id = 0; row_id < n_as; ++row_id) { + const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + + int64_t num_src1_rows = 0; + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + if (row_id_i != row_id) { + continue; + } + + GGML_ASSERT(row_id >= 0 && row_id < n_as); + + CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11, + nb11, cudaMemcpyDeviceToDevice, stream)); + num_src1_rows++; + } + + if (num_src1_rows == 0) { + continue; + } + + src1_row.ne[1] = num_src1_rows; + dst_row.ne[1] = num_src1_rows; + + src1_row.nb[1] = nb11; + src1_row.nb[2] = num_src1_rows*nb11; + src1_row.nb[3] = num_src1_rows*nb11; + + dst_row.nb[1] = nb1; + dst_row.nb[2] = num_src1_rows*nb1; + dst_row.nb[3] = num_src1_rows*nb1; + + ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + + num_src1_rows = 0; + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + if (row_id_i != row_id) { + continue; + } + + GGML_ASSERT(row_id >= 0 && row_id < n_as); + + CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1, + nb1, cudaMemcpyDeviceToDevice, stream)); + num_src1_rows++; + } + } + + ggml_cuda_pool_free(src1_contiguous, as_src1); + ggml_cuda_pool_free(dst_contiguous, as_dst); } } @@ -9370,6 +9434,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ return false; } + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) { + ggml_cuda_set_peer_access(tensor->src[1]->ne[1]); + } + if (params->ith != 0) { return true; }