Merge branch 'master' into HEAD

commit a8d2a6f3ef
Georgi Gerganov, 2023-12-18 10:17:55 +02:00
19 changed files with 1350 additions and 706 deletions

.editorconfig

@@ -23,3 +23,6 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2
+
+[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
+indent_style = tab

CMakeLists.txt

@@ -291,7 +291,12 @@ if (LLAMA_CUBLAS)
         add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})

     if (LLAMA_STATIC)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+        if (WIN32)
+            # As of 12.3.1 the CUDA Toolkit for Windows does not offer a static cublas library
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+        else ()
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+        endif()
     else()
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()

Makefile

@@ -439,9 +439,15 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 endif # LLAMA_CLBLAST

 ifdef LLAMA_HIPBLAS
-	ROCM_PATH ?= /opt/rocm
-	HIPCC ?= $(ROCM_PATH)/bin/hipcc
-	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+
+	ifeq ($(wildcard /opt/rocm),)
+	ROCM_PATH ?= /usr
+	GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
+	else
+	ROCM_PATH ?= /opt/rocm
+	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+	endif
+	HIPCC ?= $(ROCM_PATH)/bin/hipcc
 	LLAMA_CUDA_DMMV_X ?= 32
 	LLAMA_CUDA_MMV_Y ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2

README.md

@@ -10,11 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics

+- Collecting Apple Silicon performance stats:
+  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
+  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
 - Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
-- **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309**
-- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
-- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167

 ----

convert-lora-to-ggml.py

@@ -3,7 +3,6 @@ from __future__ import annotations
 import json
 import os
-import re
 import struct
 import sys
 from typing import Any, BinaryIO, Sequence
@@ -11,43 +10,15 @@ from typing import Any, BinaryIO, Sequence
 import numpy as np
 import torch
+from pathlib import Path
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf

 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}

-HF_SUBLAYER_TO_GGML = {
-    "self_attn.q_proj": "attn_q",
-    "self_attn.k_proj": "attn_k",
-    "self_attn.v_proj": "attn_v",
-    "self_attn.o_proj": "attn_output",
-    "mlp.gate_proj": "ffn_gate",
-    "mlp.down_proj": "ffn_down",
-    "mlp.up_proj": "ffn_up",
-    "input_layernorm": "attn_norm",
-    "post_attention_layernorm": "ffn_norm",
-}
-
-
-def translate_tensor_name(t: str) -> str:
-    match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
-    if match:
-        nn = match.group(1)
-        sub_layer = match.group(2)
-        lora_type = match.group(3)
-        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
-        if sub_layer_renamed is None:
-            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
-            sys.exit(1)
-
-        output_string = (
-            f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
-        )
-        return output_string
-    else:
-        print(f"Error: unrecognized tensor {t}")
-        sys.exit(1)

 def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
     fout.write(b"ggla"[::-1])  # magic (ggml lora)
     fout.write(struct.pack("i", 1))  # file version
@@ -61,9 +32,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
     fout.write(struct.pack("i", int(params["lora_alpha"])))

-def write_tensor_header(
-    self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
-) -> None:
+def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
     sname = name.encode("utf-8")
     fout.write(
         struct.pack(
@@ -78,11 +47,12 @@ def write_tensor_header(
     fout.seek((fout.tell() + 31) & -32)

-if len(sys.argv) != 2:
-    print(f"Usage: python {sys.argv[0]} <path>")
+if len(sys.argv) < 2:
+    print(f"Usage: python {sys.argv[0]} <path> [arch]")
     print(
         "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
     )
+    print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
     sys.exit(1)

 input_json = os.path.join(sys.argv[1], "adapter_config.json")
@@ -90,6 +60,14 @@ input_model = os.path.join(sys.argv[1], "adapter_model.bin")
 output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

 model = torch.load(input_model, map_location="cpu")
+
+arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
+
+if arch_name not in gguf.MODEL_ARCH_NAMES.values():
+    print(f"Error: unsupported architecture {arch_name}")
+    sys.exit(1)
+
+arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
+name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone

 with open(input_json, "r") as f:
     params = json.load(f)
@@ -117,6 +95,7 @@ with open(output_path, "wb") as fout:
     write_file_header(fout, params)
     for k, v in model.items():
+        orig_k = k
         if k.endswith(".default.weight"):
             k = k.replace(".default.weight", ".weight")
         if k in ["llama_proj.weight", "llama_proj.bias"]:
@@ -129,7 +108,32 @@ with open(output_path, "wb") as fout:
             v = v.float()

         t = v.detach().numpy()
-        tname = translate_tensor_name(k)
+
+        prefix = "base_model.model."
+        if k.startswith(prefix):
+            k = k[len(prefix) :]
+
+        lora_suffixes = (".lora_A.weight", ".lora_B.weight")
+        if k.endswith(lora_suffixes):
+            suffix = k[-len(lora_suffixes[0]):]
+            k = k[: -len(lora_suffixes[0])]
+        else:
+            print(f"Error: unrecognized tensor name {orig_k}")
+            sys.exit(1)
+
+        tname = name_map.get_name(k)
+        if tname is None:
+            print(f"Error: could not map tensor name {orig_k}")
+            print(" Note: the arch parameter must be specified if the model is not llama")
+            sys.exit(1)
+
+        if suffix == ".lora_A.weight":
+            tname += ".weight.loraA"
+        elif suffix == ".lora_B.weight":
+            tname += ".weight.loraB"
+        else:
+            assert False
+
         print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
         write_tensor_header(fout, tname, t.shape, t.dtype)
         t.tofile(fout)

examples/finetune/finetune.cpp

@@ -1620,8 +1620,6 @@ int main(int argc, char ** argv) {
     opt->params.adam.gclip = params.common.adam_gclip;
     opt->params.adam.eps_f = params.common.adam_eps_f;

-    ggml_allocr * alloc = NULL;
-
     printf("%s: init model\n", __func__);
     bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);
@@ -1725,10 +1723,9 @@ int main(int argc, char ** argv) {
     // allocate input tensors
     mem_input_data.resize(max_input_size);
-    alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
-    ggml_allocr_alloc(alloc, tokens_input);
-    ggml_allocr_alloc(alloc, target_probs);
-    ggml_allocr_free(alloc);
+    ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
+    ggml_allocr_alloc(alloc_inps, tokens_input);
+    ggml_allocr_alloc(alloc_inps, target_probs);

     // context for compute tensors without their data
     const size_t estimated_compute_size_wo_data = (
@@ -1755,7 +1752,7 @@ int main(int argc, char ** argv) {
     // find best evaluation order
     for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
         ctx_compute = ggml_init(ctx_compute_params);
-        alloc = ggml_allocr_new_measure(tensor_alignment);
+        ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment);
         gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
         gf->order = (enum ggml_cgraph_eval_order) order;
         gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@@ -1788,7 +1785,7 @@ int main(int argc, char ** argv) {
     // allocate compute tensors
     mem_compute_data.resize(max_compute_size);
     ctx_compute = ggml_init(ctx_compute_params);
-    alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
+    ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
     gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
     gf->order = best_order;
     gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@@ -1804,6 +1801,8 @@ int main(int argc, char ** argv) {
         params.common.use_checkpointing
     );
     ggml_allocr_free(alloc);
+    ggml_allocr_free(alloc_inps);
+
     // tokenize data
     std::vector<llama_token> train_tokens;

examples/llama.swiftui/.gitignore

@@ -1 +1,2 @@
 xcuserdata
+xcshareddata

examples/llama.swiftui/llama.cpp.swift/LibLlama.swift

@@ -6,16 +6,34 @@ enum LlamaError: Error {
     case couldNotInitializeContext
 }

+func llama_batch_clear(_ batch: inout llama_batch) {
+    batch.n_tokens = 0
+}
+
+func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], _ logits: Bool) {
+    batch.token   [Int(batch.n_tokens)] = id
+    batch.pos     [Int(batch.n_tokens)] = pos
+    batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count)
+    for i in 0..<seq_ids.count {
+        batch.seq_id[Int(batch.n_tokens)]![Int(i)] = seq_ids[i]
+    }
+    batch.logits  [Int(batch.n_tokens)] = logits ? 1 : 0
+
+    batch.n_tokens += 1
+}
+
 actor LlamaContext {
     private var model: OpaquePointer
     private var context: OpaquePointer
     private var batch: llama_batch
     private var tokens_list: [llama_token]
     /// This variable is used to store temporarily invalid cchars
     private var temporary_invalid_cchars: [CChar]

-    var n_len: Int32 = 512
+    var n_len: Int32 = 64
     var n_cur: Int32 = 0
     var n_decode: Int32 = 0

     init(model: OpaquePointer, context: OpaquePointer) {
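A minimal usage sketch (not part of the commit) of the two helpers introduced above: staging a prompt in the actor's `batch` before calling `llama_decode`. It assumes `batch` was allocated with `llama_batch_init` in `init()` and that `tokens` is a `[llama_token]` returned by `tokenize(text:add_bos:)`; this mirrors what `completion_init` does further down.

```swift
llama_batch_clear(&batch)
for (i, id) in tokens.enumerated() {
    // all prompt tokens share sequence 0; no logits are needed for them
    llama_batch_add(&batch, id, Int32(i), [0], false)
}
// request logits only for the last prompt token before llama_decode(context, batch)
batch.logits[Int(batch.n_tokens) - 1] = 1
```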
@@ -27,25 +45,34 @@ actor LlamaContext {
     }

     deinit {
+        llama_batch_free(batch)
         llama_free(context)
         llama_free_model(model)
         llama_backend_free()
     }

-    static func createContext(path: String) throws -> LlamaContext {
+    static func create_context(path: String) throws -> LlamaContext {
         llama_backend_init(false)
-        let model_params = llama_model_default_params()
+        var model_params = llama_model_default_params()

+#if targetEnvironment(simulator)
+        model_params.n_gpu_layers = 0
+        print("Running on simulator, force use n_gpu_layers = 0")
+#endif
         let model = llama_load_model_from_file(path, model_params)
         guard let model else {
             print("Could not load model at \(path)")
             throw LlamaError.couldNotInitializeContext
         }
+
+        let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
+        print("Using \(n_threads) threads")
+
         var ctx_params = llama_context_default_params()
         ctx_params.seed = 1234
         ctx_params.n_ctx = 2048
-        ctx_params.n_threads = 8
-        ctx_params.n_threads_batch = 8
+        ctx_params.n_threads       = UInt32(n_threads)
+        ctx_params.n_threads_batch = UInt32(n_threads)

         let context = llama_new_context_with_model(model, ctx_params)
         guard let context else {
@@ -56,6 +83,26 @@ actor LlamaContext {
         return LlamaContext(model: model, context: context)
     }

+    func model_info() -> String {
+        let result = UnsafeMutablePointer<Int8>.allocate(capacity: 256)
+        result.initialize(repeating: Int8(0), count: 256)
+        defer {
+            result.deallocate()
+        }
+
+        // TODO: this is probably very stupid way to get the string from C
+
+        let nChars = llama_model_desc(model, result, 256)
+        let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nChars))
+
+        var SwiftString = ""
+        for char in bufferPointer {
+            SwiftString.append(Character(UnicodeScalar(UInt8(char))))
+        }
+
+        return SwiftString
+    }
+
     func get_n_tokens() -> Int32 {
         return batch.n_tokens;
     }
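The byte-by-byte copy in `model_info()` above works but, as its own TODO notes, it could likely be shortened. A possible simplification (an assumption, not in the commit) relies on `llama_model_desc` writing a NUL-terminated string into the buffer:

```swift
let nChars = llama_model_desc(model, result, 256)
// assumes the 256-byte buffer is NUL-terminated by llama_model_desc,
// so String(cString:) can decode it directly instead of appending characters one by one
let info = nChars > 0 ? String(cString: result) : ""
```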
@@ -79,16 +126,11 @@ actor LlamaContext {
             print(String(cString: token_to_piece(token: id) + [0]))
         }

-        // batch = llama_batch_init(512, 0) // done in init()
-        batch.n_tokens = Int32(tokens_list.count)
+        llama_batch_clear(&batch)

-        for i1 in 0..<batch.n_tokens {
+        for i1 in 0..<tokens_list.count {
             let i = Int(i1)
-            batch.token[i] = tokens_list[i]
-            batch.pos[i] = i1
-            batch.n_seq_id[Int(i)] = 1
-            batch.seq_id[Int(i)]![0] = 0
-            batch.logits[i] = 0
+            llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
         }

         batch.logits[Int(batch.n_tokens) - 1] = 1 // true
@@ -141,18 +183,11 @@ actor LlamaContext {
         print(new_token_str)
         // tokens_list.append(new_token_id)

-        batch.n_tokens = 0
-
-        batch.token[Int(batch.n_tokens)] = new_token_id
-        batch.pos[Int(batch.n_tokens)] = n_cur
-        batch.n_seq_id[Int(batch.n_tokens)] = 1
-        batch.seq_id[Int(batch.n_tokens)]![0] = 0
-        batch.logits[Int(batch.n_tokens)] = 1 // true
-        batch.n_tokens += 1
+        llama_batch_clear(&batch)
+        llama_batch_add(&batch, new_token_id, n_cur, [0], true)

         n_decode += 1
+        n_cur += 1

-        n_cur += 1

         if llama_decode(context, batch) != 0 {
             print("failed to evaluate llama!")
@@ -161,14 +196,111 @@ actor LlamaContext {
         return new_token_str
     }

+    func bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) -> String {
+        var pp_avg: Double = 0
+        var tg_avg: Double = 0
+
+        var pp_std: Double = 0
+        var tg_std: Double = 0
+
+        for r in 0..<nr {
+            // bench prompt processing
+
+            llama_batch_clear(&batch)
+
+            let n_tokens = pp
+
+            for i in 0..<n_tokens {
+                llama_batch_add(&batch, 0, Int32(i), [0], false)
+            }
+            batch.logits[Int(batch.n_tokens) - 1] = 1 // true
+
+            llama_kv_cache_clear(context)
+
+            let t_pp_start = ggml_time_us()
+
+            if llama_decode(context, batch) != 0 {
+                print("llama_decode() failed during prompt")
+            }
+
+            let t_pp_end = ggml_time_us()
+
+            // bench text generation
+
+            llama_kv_cache_clear(context)
+
+            let t_tg_start = ggml_time_us()
+
+            for i in 0..<tg {
+                llama_batch_clear(&batch)
+
+                for j in 0..<pl {
+                    llama_batch_add(&batch, 0, Int32(i), [Int32(j)], true)
+                }
+
+                if llama_decode(context, batch) != 0 {
+                    print("llama_decode() failed during text generation")
+                }
+            }
+
+            let t_tg_end = ggml_time_us()
+
+            llama_kv_cache_clear(context)
+
+            let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
+            let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
+
+            let speed_pp = Double(pp) / t_pp
+            let speed_tg = Double(pl*tg) / t_tg
+
+            pp_avg += speed_pp
+            tg_avg += speed_tg
+
+            pp_std += speed_pp * speed_pp
+            tg_std += speed_tg * speed_tg
+
+            print("pp \(speed_pp) t/s, tg \(speed_tg) t/s")
+        }
+
+        pp_avg /= Double(nr)
+        tg_avg /= Double(nr)
+
+        if nr > 1 {
+            pp_std = sqrt(pp_std / Double(nr - 1) - pp_avg * pp_avg * Double(nr) / Double(nr - 1))
+            tg_std = sqrt(tg_std / Double(nr - 1) - tg_avg * tg_avg * Double(nr) / Double(nr - 1))
+        } else {
+            pp_std = 0
+            tg_std = 0
+        }
+
+        let model_desc     = model_info();
+        let model_size     = String(format: "%.2f GiB", Double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0);
+        let model_n_params = String(format: "%.2f B", Double(llama_model_n_params(model)) / 1e9);
+        let backend        = "Metal";
+        let pp_avg_str     = String(format: "%.2f", pp_avg);
+        let tg_avg_str     = String(format: "%.2f", tg_avg);
+        let pp_std_str     = String(format: "%.2f", pp_std);
+        let tg_std_str     = String(format: "%.2f", tg_std);
+
+        var result = ""
+
+        result += String("| model | size | params | backend | test | t/s |\n")
+        result += String("| --- | --- | --- | --- | --- | --- |\n")
+        result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | pp \(pp) | \(pp_avg_str) ± \(pp_std_str) |\n")
+        result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | tg \(tg) | \(tg_avg_str) ± \(tg_std_str) |\n")
+
+        return result;
+    }
+
     func clear() {
         tokens_list.removeAll()
         temporary_invalid_cchars.removeAll()
+        llama_kv_cache_clear(context)
     }

     private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
         let utf8Count = text.utf8.count
-        let n_tokens = utf8Count + (add_bos ? 1 : 0)
+        let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
         let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
         let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
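A hypothetical call site for the new `bench` method (the instance name here is illustrative, not from the commit): `LlamaContext` is an actor, so the call must be awaited from an async context, and the returned markdown table can be printed or shown in the UI.

```swift
// pp = prompt tokens, tg = generated tokens, pl = parallel sequences, nr = repetitions
let report = await llamaContext.bench(pp: 512, tg: 128, pl: 1, nr: 3)
print(report)
```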

examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj

@@ -1,481 +1,483 @@
 // !$*UTF8*$!
 {
 	archiveVersion = 1;
 	classes = {
 	};
 	objectVersion = 56;
 	objects = {

 /* Begin PBXBuildFile section */
-		542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; };
-		5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; };
+		542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
+		5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
 		542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
 		542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
-		542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; };
+		542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
 		542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
 		549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
 		549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
+		7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
 		8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
 		8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
 		8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
 		8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
 		8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
 		8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
 		8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
 		8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
 /* End PBXBuildFile section */

 /* Begin PBXFileReference section */
 		542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
 		542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
 		542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
 		5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
 		542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
 		542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
 		542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
 		542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
 		542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
 		542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
 		549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
 		549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
 		549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
 		549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
+		7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
 		8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
 		8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
 		8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
 		8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
 		8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
 		8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
-		8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = "<group>"; };
 		8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
 		8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
 		8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
 /* End PBXFileReference section */

 /* Begin PBXFrameworksBuildPhase section */
 		8A1C83702AC328BD0096AF73 /* Frameworks */ = {
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
 				549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
 				8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
 /* End PBXFrameworksBuildPhase section */

 /* Begin PBXGroup section */
 		8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
 			isa = PBXGroup;
 			children = (
 				5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
 				542376092B0D9C40008E6A1C /* ggml-backend.h */,
 				542376062B0D9BEA008E6A1C /* ggml-quants.h */,
 				542376072B0D9BFB008E6A1C /* ggml-quants.c */,
 				549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
 				549479C62AC9E0F200E0F78B /* ggml-metal.h */,
 				549479C52AC9E0F200E0F78B /* ggml-metal.m */,
 				542EA09B2AC8723900A8AEE9 /* ggml.c */,
 				542EA09C2AC8723900A8AEE9 /* ggml.h */,
 				542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
 				542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
 				542EA0A12AC8729100A8AEE9 /* llama.cpp */,
 				542EA0A22AC8729100A8AEE9 /* llama.h */,
 			);
 			name = llama.cpp;
 			sourceTree = "<group>";
 		};
 		8A1C836A2AC328BD0096AF73 = {
 			isa = PBXGroup;
 			children = (
 				8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
 				8A907F312AC7134E006146EA /* llama.cpp.swift */,
 				8A3F84232AC4C891005E2EE8 /* models */,
 				8A1C83752AC328BD0096AF73 /* llama.swiftui */,
 				8A1C83742AC328BD0096AF73 /* Products */,
 				8A39BE082AC7601000BFEB40 /* Frameworks */,
 			);
 			sourceTree = "<group>";
 		};
 		8A1C83742AC328BD0096AF73 /* Products */ = {
 			isa = PBXGroup;
 			children = (
 				8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
 			);
 			name = Products;
 			sourceTree = "<group>";
 		};
 		8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
 			isa = PBXGroup;
 			children = (
 				8A3F84102AC4BD85005E2EE8 /* Resources */,
 				8A9F7C4B2AC332DC008AE1EA /* Models */,
 				8A9F7C4A2AC332BF008AE1EA /* UI */,
 				8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
 				8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
 				8A1C837C2AC328BE0096AF73 /* Preview Content */,
 			);
 			path = llama.swiftui;
 			sourceTree = "<group>";
 		};
 		8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
 			isa = PBXGroup;
 			children = (
 				8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
 			);
 			path = "Preview Content";
 			sourceTree = "<group>";
 		};
 		8A39BE082AC7601000BFEB40 /* Frameworks */ = {
 			isa = PBXGroup;
 			children = (
 				549479CA2AC9E16000E0F78B /* Metal.framework */,
 				8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
 			);
 			name = Frameworks;
 			sourceTree = "<group>";
 		};
 		8A3F84102AC4BD85005E2EE8 /* Resources */ = {
 			isa = PBXGroup;
 			children = (
 				8A3F84112AC4BD8C005E2EE8 /* models */,
 			);
 			path = Resources;
 			sourceTree = "<group>";
 		};
 		8A3F84112AC4BD8C005E2EE8 /* models */ = {
 			isa = PBXGroup;
 			children = (
-				8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */,
 			);
 			path = models;
 			sourceTree = "<group>";
 		};
 		8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
 			isa = PBXGroup;
 			children = (
 				8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
 				8A907F322AC7134E006146EA /* LibLlama.swift */,
 			);
 			path = llama.cpp.swift;
 			sourceTree = "<group>";
 		};
 		8A9F7C4A2AC332BF008AE1EA /* UI */ = {
 			isa = PBXGroup;
 			children = (
+				7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
 				8A1C83782AC328BD0096AF73 /* ContentView.swift */,
 			);
 			path = UI;
 			sourceTree = "<group>";
 		};
 		8A9F7C4B2AC332DC008AE1EA /* Models */ = {
 			isa = PBXGroup;
 			children = (
 				8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
 			);
 			path = Models;
 			sourceTree = "<group>";
 		};
 /* End PBXGroup section */

 /* Begin PBXNativeTarget section */
 		8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
 			isa = PBXNativeTarget;
 			buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
 			buildPhases = (
 				8A1C836F2AC328BD0096AF73 /* Sources */,
 				8A1C83702AC328BD0096AF73 /* Frameworks */,
 				8A1C83712AC328BD0096AF73 /* Resources */,
 			);
 			buildRules = (
 			);
 			dependencies = (
 			);
 			name = llama.swiftui;
 			packageProductDependencies = (
 			);
 			productName = llama.swiftui;
 			productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
 			productType = "com.apple.product-type.application";
 		};
 /* End PBXNativeTarget section */

 /* Begin PBXProject section */
 		8A1C836B2AC328BD0096AF73 /* Project object */ = {
 			isa = PBXProject;
 			attributes = {
 				BuildIndependentTargetsInParallel = 1;
 				LastSwiftUpdateCheck = 1500;
 				LastUpgradeCheck = 1500;
 				TargetAttributes = {
 					8A1C83722AC328BD0096AF73 = {
 						CreatedOnToolsVersion = 15.0;
 						LastSwiftMigration = 1500;
 					};
 				};
 			};
 			buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
 			compatibilityVersion = "Xcode 14.0";
 			developmentRegion = en;
 			hasScannedForEncodings = 0;
 			knownRegions = (
 				en,
 				Base,
 			);
 			mainGroup = 8A1C836A2AC328BD0096AF73;
 			packageReferences = (
 			);
 			productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
 			projectDirPath = "";
 			projectRoot = "";
 			targets = (
 				8A1C83722AC328BD0096AF73 /* llama.swiftui */,
 			);
 		};
 /* End PBXProject section */

 /* Begin PBXResourcesBuildPhase section */
 		8A1C83712AC328BD0096AF73 /* Resources */ = {
 			isa = PBXResourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
 				542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
 				8A3F84242AC4C891005E2EE8 /* models in Resources */,
 				8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
 				8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
 /* End PBXResourcesBuildPhase section */

 /* Begin PBXSourcesBuildPhase section */
 		8A1C836F2AC328BD0096AF73 /* Sources */ = {
 			isa = PBXSourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
 				542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
 				549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
 				542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
 				8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
 				542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
 				8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
 				8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
 				8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
+				7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */,
 				542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
 				5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
 /* End PBXSourcesBuildPhase section */

 /* Begin XCBuildConfiguration section */
 		8A1C837F2AC328BE0096AF73 /* Debug */ = {
 			isa = XCBuildConfiguration;
 			buildSettings = {
 				ALWAYS_SEARCH_USER_PATHS = NO;
 				ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
 				CLANG_ANALYZER_NONNULL = YES;
 				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
 				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
 				CLANG_ENABLE_MODULES = YES;
 				CLANG_ENABLE_OBJC_ARC = YES;
 				CLANG_ENABLE_OBJC_WEAK = YES;
 				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
 				CLANG_WARN_BOOL_CONVERSION = YES;
 				CLANG_WARN_COMMA = YES;
 				CLANG_WARN_CONSTANT_CONVERSION = YES;
 				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
 				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
 				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
 				CLANG_WARN_EMPTY_BODY = YES;
 				CLANG_WARN_ENUM_CONVERSION = YES;
 				CLANG_WARN_INFINITE_RECURSION = YES;
 				CLANG_WARN_INT_CONVERSION = YES;
 				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
 				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
 				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
 				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
 				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
 				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
 				CLANG_WARN_STRICT_PROTOTYPES = YES;
 				CLANG_WARN_SUSPICIOUS_MOVE = YES;
 				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
 				CLANG_WARN_UNREACHABLE_CODE = YES;
 				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
 				COPY_PHASE_STRIP = NO;
 				DEBUG_INFORMATION_FORMAT = dwarf;
 				ENABLE_STRICT_OBJC_MSGSEND = YES;
 				ENABLE_TESTABILITY = YES;
 				ENABLE_USER_SCRIPT_SANDBOXING = YES;
 				GCC_C_LANGUAGE_STANDARD = gnu17;
 				GCC_DYNAMIC_NO_PIC = NO;
 				GCC_NO_COMMON_BLOCKS = YES;
 				GCC_OPTIMIZATION_LEVEL = 0;
 				GCC_PREPROCESSOR_DEFINITIONS = (
 					"DEBUG=1",
 					"$(inherited)",
 				);
 				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
 				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
 				GCC_WARN_UNDECLARED_SELECTOR = YES;
 				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
 				GCC_WARN_UNUSED_FUNCTION = YES;
 				GCC_WARN_UNUSED_VARIABLE = YES;
 				IPHONEOS_DEPLOYMENT_TARGET = 17.0;
 				LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
 				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
 				MTL_FAST_MATH = YES;
 				ONLY_ACTIVE_ARCH = YES;
 				SDKROOT = iphoneos;
 				SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
 				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
 			};
 			name = Debug;
 		};
 		8A1C83802AC328BE0096AF73 /* Release */ = {
 			isa = XCBuildConfiguration;
 			buildSettings = {
 				ALWAYS_SEARCH_USER_PATHS = NO;
 				ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
 				CLANG_ANALYZER_NONNULL = YES;
 				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
 				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
 				CLANG_ENABLE_MODULES = YES;
 				CLANG_ENABLE_OBJC_ARC = YES;
 				CLANG_ENABLE_OBJC_WEAK = YES;
 				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
 				CLANG_WARN_BOOL_CONVERSION = YES;
 				CLANG_WARN_COMMA = YES;
 				CLANG_WARN_CONSTANT_CONVERSION = YES;
 				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
 				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
 				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
 				CLANG_WARN_EMPTY_BODY = YES;
 				CLANG_WARN_ENUM_CONVERSION = YES;
 				CLANG_WARN_INFINITE_RECURSION = YES;
 				CLANG_WARN_INT_CONVERSION = YES;
 				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
 				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
 				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
 				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
 				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
 				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
 				CLANG_WARN_STRICT_PROTOTYPES = YES;
 				CLANG_WARN_SUSPICIOUS_MOVE = YES;
 				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
 				CLANG_WARN_UNREACHABLE_CODE = YES;
 				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
 				COPY_PHASE_STRIP = NO;
 				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
 				ENABLE_NS_ASSERTIONS = NO;
 				ENABLE_STRICT_OBJC_MSGSEND = YES;
 				ENABLE_USER_SCRIPT_SANDBOXING = YES;
 				GCC_C_LANGUAGE_STANDARD = gnu17;
 				GCC_NO_COMMON_BLOCKS = YES;
 				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
 				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
 				GCC_WARN_UNDECLARED_SELECTOR = YES;
 				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
 				GCC_WARN_UNUSED_FUNCTION = YES;
 				GCC_WARN_UNUSED_VARIABLE = YES;
 				IPHONEOS_DEPLOYMENT_TARGET = 17.0;
 				LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				MTL_FAST_MATH = YES;
 				SDKROOT = iphoneos;
 				SWIFT_COMPILATION_MODE = wholemodule;
 				VALIDATE_PRODUCT = YES;
 			};
 			name = Release;
 		};
 		8A1C83822AC328BE0096AF73 /* Debug */ = {
 			isa = XCBuildConfiguration;
 			buildSettings = {
 				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
 				ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
 				CLANG_ENABLE_MODULES = YES;
 				CODE_SIGN_STYLE = Automatic;
 				CURRENT_PROJECT_VERSION = 1;
 				DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
 				DEVELOPMENT_TEAM = STLSG3FG8Q;
 				ENABLE_PREVIEWS = YES;
 				GENERATE_INFOPLIST_FILE = YES;
 				INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
 				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
 				INFOPLIST_KEY_UILaunchScreen_Generation = YES;
 				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
 				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
 				LD_RUNPATH_SEARCH_PATHS = (
 					"$(inherited)",
 					"@executable_path/Frameworks",
 				);
 				MARKETING_VERSION = 1.0;
 				PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SWIFT_EMIT_LOC_STRINGS = YES;
 				SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
 				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
 				SWIFT_VERSION = 5.0;
 				TARGETED_DEVICE_FAMILY = "1,2";
 			};
 			name = Debug;
 		};
 		8A1C83832AC328BE0096AF73 /* Release */ = {
 			isa = XCBuildConfiguration;
 			buildSettings = {
 				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
 				ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
 				CLANG_ENABLE_MODULES = YES;
 				CODE_SIGN_STYLE = Automatic;
 				CURRENT_PROJECT_VERSION = 1;
 				DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
 				DEVELOPMENT_TEAM = STLSG3FG8Q;
 				ENABLE_PREVIEWS = YES;
 				GENERATE_INFOPLIST_FILE = YES;
 				INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
 				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
 				INFOPLIST_KEY_UILaunchScreen_Generation = YES;
 				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
 				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
IPHONEOS_DEPLOYMENT_TARGET = 16.0; IPHONEOS_DEPLOYMENT_TARGET = 16.0;
LD_RUNPATH_SEARCH_PATHS = ( LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)", "$(inherited)",
"@executable_path/Frameworks", "@executable_path/Frameworks",
); );
MARKETING_VERSION = 1.0; MARKETING_VERSION = 1.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift"; PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
PRODUCT_NAME = "$(TARGET_NAME)"; PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES; SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h"; SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
SWIFT_VERSION = 5.0; SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2"; TARGETED_DEVICE_FAMILY = "1,2";
}; };
name = Release; name = Release;
}; };
/* End XCBuildConfiguration section */ /* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */ /* Begin XCConfigurationList section */
8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = { 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
isa = XCConfigurationList; isa = XCConfigurationList;
buildConfigurations = ( buildConfigurations = (
8A1C837F2AC328BE0096AF73 /* Debug */, 8A1C837F2AC328BE0096AF73 /* Debug */,
8A1C83802AC328BE0096AF73 /* Release */, 8A1C83802AC328BE0096AF73 /* Release */,
); );
defaultConfigurationIsVisible = 0; defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release; defaultConfigurationName = Release;
}; };
8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = { 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
isa = XCConfigurationList; isa = XCConfigurationList;
buildConfigurations = ( buildConfigurations = (
8A1C83822AC328BE0096AF73 /* Debug */, 8A1C83822AC328BE0096AF73 /* Debug */,
8A1C83832AC328BE0096AF73 /* Release */, 8A1C83832AC328BE0096AF73 /* Release */,
); );
defaultConfigurationIsVisible = 0; defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release; defaultConfigurationName = Release;
}; };
/* End XCConfigurationList section */ /* End XCConfigurationList section */
}; };
rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */; rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
} }

View file

@ -3,24 +3,26 @@ import Foundation
@MainActor @MainActor
class LlamaState: ObservableObject { class LlamaState: ObservableObject {
@Published var messageLog = "" @Published var messageLog = ""
@Published var cacheCleared = false
private var llamaContext: LlamaContext? private var llamaContext: LlamaContext?
private var modelUrl: URL? { private var defaultModelUrl: URL? {
Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models") Bundle.main.url(forResource: "ggml-model", withExtension: "gguf", subdirectory: "models")
// Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models") // Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
} }
init() { init() {
do { do {
try loadModel() try loadModel(modelUrl: defaultModelUrl)
} catch { } catch {
messageLog += "Error!\n" messageLog += "Error!\n"
} }
} }
private func loadModel() throws { func loadModel(modelUrl: URL?) throws {
messageLog += "Loading model...\n" messageLog += "Loading model...\n"
if let modelUrl { if let modelUrl {
llamaContext = try LlamaContext.createContext(path: modelUrl.path()) llamaContext = try LlamaContext.create_context(path: modelUrl.path())
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n" messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
} else { } else {
messageLog += "Could not locate model\n" messageLog += "Could not locate model\n"
@ -31,7 +33,7 @@ class LlamaState: ObservableObject {
guard let llamaContext else { guard let llamaContext else {
return return
} }
messageLog += "Attempting to complete text...\n"
await llamaContext.completion_init(text: text) await llamaContext.completion_init(text: text)
messageLog += "\(text)" messageLog += "\(text)"
@ -42,4 +44,42 @@ class LlamaState: ObservableObject {
await llamaContext.clear() await llamaContext.clear()
messageLog += "\n\ndone\n" messageLog += "\n\ndone\n"
} }
func bench() async {
guard let llamaContext else {
return
}
messageLog += "\n"
messageLog += "Running benchmark...\n"
messageLog += "Model info: "
messageLog += await llamaContext.model_info() + "\n"
let t_start = DispatchTime.now().uptimeNanoseconds
await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
let t_end = DispatchTime.now().uptimeNanoseconds
let t_heat = Double(t_end - t_start) / 1_000_000_000.0
messageLog += "Heat up time: \(t_heat) seconds, please wait...\n"
// if more than 5 seconds, then we're probably running on a slow device
if t_heat > 5.0 {
messageLog += "Heat up time is too long, aborting benchmark\n"
return
}
let result = await llamaContext.bench(pp: 512, tg: 128, pl: 1, nr: 3)
messageLog += "\(result)"
messageLog += "\n"
}
func clear() async {
guard let llamaContext else {
return
}
await llamaContext.clear()
messageLog = ""
}
} }

View file

@ -5,24 +5,97 @@ struct ContentView: View {
@State private var multiLineText = "" @State private var multiLineText = ""
private static func cleanupModelCaches() {
// Delete all models (*.gguf)
let fileManager = FileManager.default
let documentsUrl = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
do {
let fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil)
for fileURL in fileURLs {
if fileURL.pathExtension == "gguf" {
try fileManager.removeItem(at: fileURL)
}
}
} catch {
print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)")
}
}
var body: some View { var body: some View {
VStack { VStack {
ScrollView(.vertical) { ScrollView(.vertical, showsIndicators: true) {
Text(llamaState.messageLog) Text(llamaState.messageLog)
.font(.system(size: 12))
.frame(maxWidth: .infinity, alignment: .leading)
.padding()
.onTapGesture {
UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
}
} }
TextEditor(text: $multiLineText) TextEditor(text: $multiLineText)
.frame(height: 200) .frame(height: 80)
.padding() .padding()
.border(Color.gray, width: 0.5) .border(Color.gray, width: 0.5)
Button(action: {
sendText() HStack {
}) { Button("Send") {
Text("Send") sendText()
.padding() }
.background(Color.blue) .padding(8)
.foregroundColor(.white) .background(Color.blue)
.cornerRadius(8) .foregroundColor(.white)
.cornerRadius(8)
Button("Bench") {
bench()
}
.padding(8)
.background(Color.blue)
.foregroundColor(.white)
.cornerRadius(8)
Button("Clear") {
clear()
}
.padding(8)
.background(Color.blue)
.foregroundColor(.white)
.cornerRadius(8)
Button("Copy") {
UIPasteboard.general.string = llamaState.messageLog
}
.padding(8)
.background(Color.blue)
.foregroundColor(.white)
.cornerRadius(8)
}
VStack {
DownloadButton(
llamaState: llamaState,
modelName: "TinyLlama-1.1B (Q4_0)",
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
)
.font(.system(size: 12))
.padding(.top, 4)
DownloadButton(
llamaState: llamaState,
modelName: "TinyLlama-1.1B (Q8_0)",
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true",
filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf"
)
.font(.system(size: 12))
Button("Clear downloaded models") {
ContentView.cleanupModelCaches()
llamaState.cacheCleared = true
}
.padding(8)
.font(.system(size: 12))
} }
} }
.padding() .padding()
@ -34,9 +107,20 @@ struct ContentView: View {
multiLineText = "" multiLineText = ""
} }
} }
func bench() {
Task {
await llamaState.bench()
}
}
func clear() {
Task {
await llamaState.clear()
}
}
} }
/*
#Preview { //#Preview {
ContentView() // ContentView()
} //}
*/

View file

@ -0,0 +1,122 @@
import SwiftUI
struct DownloadButton: View {
@ObservedObject private var llamaState: LlamaState
private var modelName: String
private var modelUrl: String
private var filename: String
@State private var status: String
@State private var downloadTask: URLSessionDownloadTask?
@State private var progress = 0.0
@State private var observation: NSKeyValueObservation?
private static func getFileURL(filename: String) -> URL {
FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename)
}
private func checkFileExistenceAndUpdateStatus() {
}
init(llamaState: LlamaState, modelName: String, modelUrl: String, filename: String) {
self.llamaState = llamaState
self.modelName = modelName
self.modelUrl = modelUrl
self.filename = filename
let fileURL = DownloadButton.getFileURL(filename: filename)
status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
}
private func download() {
status = "downloading"
print("Downloading model \(modelName) from \(modelUrl)")
guard let url = URL(string: modelUrl) else { return }
let fileURL = DownloadButton.getFileURL(filename: filename)
downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
if let error = error {
print("Error: \(error.localizedDescription)")
return
}
guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
print("Server error!")
return
}
do {
if let temporaryURL = temporaryURL {
try FileManager.default.copyItem(at: temporaryURL, to: fileURL)
print("Writing to \(filename) completed")
llamaState.cacheCleared = false
status = "downloaded"
}
} catch let err {
print("Error: \(err.localizedDescription)")
}
}
observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in
self.progress = progress.fractionCompleted
}
downloadTask?.resume()
}
var body: some View {
VStack {
if status == "download" {
Button(action: download) {
Text("Download " + modelName)
}
} else if status == "downloading" {
Button(action: {
downloadTask?.cancel()
status = "download"
}) {
Text("\(modelName) (Downloading \(Int(progress * 100))%)")
}
} else if status == "downloaded" {
Button(action: {
let fileURL = DownloadButton.getFileURL(filename: filename)
if !FileManager.default.fileExists(atPath: fileURL.path) {
download()
return
}
do {
try llamaState.loadModel(modelUrl: fileURL)
} catch let err {
print("Error: \(err.localizedDescription)")
}
}) {
Text("\(modelName) (Downloaded)")
}
} else {
Text("Unknown status")
}
}
.onDisappear() {
downloadTask?.cancel()
}
.onChange(of: llamaState.cacheCleared) { newValue in
if newValue {
downloadTask?.cancel()
let fileURL = DownloadButton.getFileURL(filename: filename)
status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
}
}
}
}
// #Preview {
// DownloadButton(
// llamaState: LlamaState(),
// modelName: "TheBloke / TinyLlama-1.1B-1T-OpenOrca-GGUF (Q4_0)",
// modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
// filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
// )
// }

View file

@ -34,7 +34,8 @@ export async function* llama(prompt, params = {}, config = {}) {
headers: { headers: {
'Connection': 'keep-alive', 'Connection': 'keep-alive',
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'Accept': 'text/event-stream' 'Accept': 'text/event-stream',
...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
}, },
signal: controller.signal, signal: controller.signal,
}); });

View file

@ -235,7 +235,8 @@
grammar: '', grammar: '',
n_probs: 0, // no completion_probabilities, n_probs: 0, // no completion_probabilities,
image_data: [], image_data: [],
cache_prompt: true cache_prompt: true,
api_key: ''
}) })
/* START: Support for storing prompt templates and parameters in browsers LocalStorage */ /* START: Support for storing prompt templates and parameters in browsers LocalStorage */
@ -790,6 +791,10 @@
<fieldset> <fieldset>
${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })} ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
</fieldset> </fieldset>
<fieldset>
<label for="api_key">API Key</label>
<input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
</fieldset>
</details> </details>
</form> </form>
` `

View file

@ -10,7 +10,8 @@
// crash the server in debug mode, otherwise send an http 500 error // crash the server in debug mode, otherwise send an http 500 error
#define CPPHTTPLIB_NO_EXCEPTIONS 1 #define CPPHTTPLIB_NO_EXCEPTIONS 1
#endif #endif
// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
#include "httplib.h" #include "httplib.h"
#include "json.hpp" #include "json.hpp"
@ -36,6 +37,7 @@ using json = nlohmann::json;
struct server_params struct server_params
{ {
std::string hostname = "127.0.0.1"; std::string hostname = "127.0.0.1";
std::string api_key;
std::string public_path = "examples/server/public"; std::string public_path = "examples/server/public";
int32_t port = 8080; int32_t port = 8080;
int32_t read_timeout = 600; int32_t read_timeout = 600;
@ -1953,6 +1955,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
@ -2002,6 +2005,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
sparams.public_path = argv[i]; sparams.public_path = argv[i];
} }
else if (arg == "--api-key")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
sparams.api_key = argv[i];
}
else if (arg == "--timeout" || arg == "-to") else if (arg == "--timeout" || arg == "-to")
{ {
if (++i >= argc) if (++i >= argc)
@ -2402,7 +2414,7 @@ json oaicompat_completion_params_parse(
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0); llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);
if (llama_params.count("grammar") != 0) { if (body.count("grammar") != 0) {
llama_params["grammar"] = json_value(body, "grammar", json::object()); llama_params["grammar"] = json_value(body, "grammar", json::object());
} }
@ -2633,6 +2645,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
#if SERVER_VERBOSE != 1
log_disable();
#endif
// own arguments required by this example // own arguments required by this example
gpt_params params; gpt_params params;
server_params sparams; server_params sparams;
@ -2669,6 +2684,32 @@ int main(int argc, char **argv)
httplib::Server svr; httplib::Server svr;
// Middleware for API key validation
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
// If API key is not set, skip validation
if (sparams.api_key.empty()) {
return true;
}
// Check for API key in the header
auto auth_header = req.get_header_value("Authorization");
std::string prefix = "Bearer ";
if (auth_header.substr(0, prefix.size()) == prefix) {
std::string received_api_key = auth_header.substr(prefix.size());
if (received_api_key == sparams.api_key) {
return true; // API key is valid
}
}
// API key is invalid or not provided
res.set_content("Unauthorized: Invalid API Key", "text/plain; charset=utf-8");
res.status = 401; // Unauthorized
LOG_WARNING("Unauthorized: Invalid API Key", {});
return false;
};
svr.set_default_headers({{"Server", "llama.cpp"}, svr.set_default_headers({{"Server", "llama.cpp"},
{"Access-Control-Allow-Origin", "*"}, {"Access-Control-Allow-Origin", "*"},
{"Access-Control-Allow-Headers", "content-type"}}); {"Access-Control-Allow-Headers", "content-type"}});
@ -2676,28 +2717,28 @@ int main(int argc, char **argv)
// this is only called if no index.html is found in the public --path // this is only called if no index.html is found in the public --path
svr.Get("/", [](const httplib::Request &, httplib::Response &res) svr.Get("/", [](const httplib::Request &, httplib::Response &res)
{ {
res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html"); res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html; charset=utf-8");
return false; return false;
}); });
// this is only called if no index.js is found in the public --path // this is only called if no index.js is found in the public --path
svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res) svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res)
{ {
res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript"); res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript; charset=utf-8");
return false; return false;
}); });
// this is only called if no index.html is found in the public --path // this is only called if no index.html is found in the public --path
svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res) svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res)
{ {
res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript"); res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript; charset=utf-8");
return false; return false;
}); });
// this is only called if no index.html is found in the public --path // this is only called if no index.html is found in the public --path
svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res) svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res)
{ {
res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript"); res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8");
return false; return false;
}); });
@ -2708,23 +2749,26 @@ int main(int argc, char **argv)
{ "user_name", llama.name_user.c_str() }, { "user_name", llama.name_user.c_str() },
{ "assistant_name", llama.name_assistant.c_str() } { "assistant_name", llama.name_assistant.c_str() }
}; };
res.set_content(data.dump(), "application/json"); res.set_content(data.dump(), "application/json; charset=utf-8");
}); });
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res) svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
{ {
if (!validate_api_key(req, res)) {
return;
}
json data = json::parse(req.body); json data = json::parse(req.body);
const int task_id = llama.request_completion(data, false, false, -1); const int task_id = llama.request_completion(data, false, false, -1);
if (!json_value(data, "stream", false)) { if (!json_value(data, "stream", false)) {
std::string completion_text; std::string completion_text;
task_result result = llama.next_result(task_id); task_result result = llama.next_result(task_id);
if (!result.error && result.stop) { if (!result.error && result.stop) {
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
} }
else else
{ {
res.status = 404; res.status = 404;
res.set_content(result.result_json["content"], "text/plain"); res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
return; return;
} }
} else { } else {
@ -2795,12 +2839,15 @@ int main(int argc, char **argv)
}} }}
}; };
res.set_content(models.dump(), "application/json"); res.set_content(models.dump(), "application/json; charset=utf-8");
}); });
// TODO: add mount point without "/v1" prefix -- how? // TODO: add mount point without "/v1" prefix -- how?
svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res) svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
{ {
if (!validate_api_key(req, res)) {
return;
}
json data = oaicompat_completion_params_parse(json::parse(req.body)); json data = oaicompat_completion_params_parse(json::parse(req.body));
const int task_id = llama.request_completion(data, false, false, -1); const int task_id = llama.request_completion(data, false, false, -1);
@ -2814,10 +2861,10 @@ int main(int argc, char **argv)
res.set_content(oaicompat_result.dump(-1, ' ', false, res.set_content(oaicompat_result.dump(-1, ' ', false,
json::error_handler_t::replace), json::error_handler_t::replace),
"application/json"); "application/json; charset=utf-8");
} else { } else {
res.status = 500; res.status = 500;
res.set_content(result.result_json["content"], "text/plain"); res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
return; return;
} }
} else { } else {
@ -2869,8 +2916,11 @@ int main(int argc, char **argv)
} }
}); });
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res) svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
{ {
if (!validate_api_key(req, res)) {
return;
}
json data = json::parse(req.body); json data = json::parse(req.body);
const int task_id = llama.request_completion(data, true, false, -1); const int task_id = llama.request_completion(data, true, false, -1);
if (!json_value(data, "stream", false)) { if (!json_value(data, "stream", false)) {
@ -2878,12 +2928,12 @@ int main(int argc, char **argv)
task_result result = llama.next_result(task_id); task_result result = llama.next_result(task_id);
if (!result.error && result.stop) if (!result.error && result.stop)
{ {
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
} }
else else
{ {
res.status = 404; res.status = 404;
res.set_content(result.result_json["content"], "text/plain"); res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
return; return;
} }
} else { } else {
@ -2932,11 +2982,11 @@ int main(int argc, char **argv)
svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res) svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res)
{ {
const json data = llama.get_model_props(); const json data = llama.get_model_props();
return res.set_content(data.dump(), "application/json"); return res.set_content(data.dump(), "application/json; charset=utf-8");
}); });
svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res) svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res)
{ return res.set_content("", "application/json"); }); { return res.set_content("", "application/json; charset=utf-8"); });
svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res) svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
{ {
@ -2947,7 +2997,7 @@ int main(int argc, char **argv)
tokens = llama.tokenize(body["content"], false); tokens = llama.tokenize(body["content"], false);
} }
const json data = format_tokenizer_response(tokens); const json data = format_tokenizer_response(tokens);
return res.set_content(data.dump(), "application/json"); return res.set_content(data.dump(), "application/json; charset=utf-8");
}); });
svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res) svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
@ -2961,7 +3011,7 @@ int main(int argc, char **argv)
} }
const json data = format_detokenized_response(content); const json data = format_detokenized_response(content);
return res.set_content(data.dump(), "application/json"); return res.set_content(data.dump(), "application/json; charset=utf-8");
}); });
svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res) svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
@ -2978,7 +3028,7 @@ int main(int argc, char **argv)
} }
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1); const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
task_result result = llama.next_result(task_id); task_result result = llama.next_result(task_id);
return res.set_content(result.result_json.dump(), "application/json"); return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
}); });
svr.set_logger(log_server_request); svr.set_logger(log_server_request);
@ -2999,19 +3049,23 @@ int main(int argc, char **argv)
{ {
snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
} }
res.set_content(buf, "text/plain"); res.set_content(buf, "text/plain; charset=utf-8");
res.status = 500; res.status = 500;
}); });
svr.set_error_handler([](const httplib::Request &, httplib::Response &res) svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
{ {
if (res.status == 401)
{
res.set_content("Unauthorized", "text/plain; charset=utf-8");
}
if (res.status == 400) if (res.status == 400)
{ {
res.set_content("Invalid request", "text/plain"); res.set_content("Invalid request", "text/plain; charset=utf-8");
} }
else if (res.status != 500) else if (res.status == 404)
{ {
res.set_content("File Not Found", "text/plain"); res.set_content("File Not Found", "text/plain; charset=utf-8");
res.status = 404; res.status = 404;
} }
}); });
@ -3032,11 +3086,15 @@ int main(int argc, char **argv)
// to make it ctrl+clickable: // to make it ctrl+clickable:
LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
LOG_INFO("HTTP server listening", { std::unordered_map<std::string, std::string> log_data;
{"hostname", sparams.hostname}, log_data["hostname"] = sparams.hostname;
{"port", sparams.port}, log_data["port"] = std::to_string(sparams.port);
});
if (!sparams.api_key.empty()) {
log_data["api_key"] = "api_key: ****" + sparams.api_key.substr(sparams.api_key.length() - 4);
}
LOG_INFO("HTTP server listening", log_data);
// run the HTTP server in a thread - see comment below // run the HTTP server in a thread - see comment below
std::thread t([&]() std::thread t([&]()
{ {
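
When --api-key is set, the validate_api_key middleware above answers 401 for any request to /completion, /infill or /v1/chat/completions that lacks a matching "Authorization: Bearer <key>" header. A rough client-side sketch using the same cpp-httplib header the server already bundles; the host, port, prompt and key below are placeholder assumptions, not part of this commit:

// Hypothetical client: send a completion request with the Bearer token
// expected by validate_api_key(). All concrete values are illustrative.
#include "httplib.h"
#include <iostream>
#include <string>

int main() {
    httplib::Client cli("127.0.0.1", 8080);              // assumed host/port
    httplib::Headers headers = {
        { "Authorization", "Bearer my-secret-key" },      // must match --api-key
    };
    const std::string body = R"({"prompt": "Hello", "n_predict": 16})";

    // Without a valid key the server responds 401 Unauthorized.
    if (auto res = cli.Post("/completion", headers, body, "application/json")) {
        std::cout << res->status << "\n" << res->body << "\n";
    } else {
        std::cerr << "request failed\n";
    }
    return 0;
}

The web UI side of the same feature is the api_key parameter added to completion.js and the API Key field added to index.html earlier in this diff, which attach the same Authorization header to the browser's fetch when the key is non-empty.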

237 ggml.c
View file

@ -9584,16 +9584,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
} }
#endif #endif
// off1 = offset in i11 and i1
// cne1 = ne11 and ne1
// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
static void ggml_compute_forward_mul_mat( static void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
struct ggml_tensor * dst, struct ggml_tensor * dst) {
int64_t off1, int64_t cne1) {
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
@ -9661,9 +9656,9 @@ static void ggml_compute_forward_mul_mat(
const int64_t i03 = i13/r3; const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2; const int64_t i02 = i12/r2;
const void * x = (char *) src0->data + i02*nb02 + i03*nb03; const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13); const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3); float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) { if (type != GGML_TYPE_F32) {
float * const wdata = params->wdata; float * const wdata = params->wdata;
@ -9680,7 +9675,7 @@ static void ggml_compute_forward_mul_mat(
} }
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
cne1, ne01, ne10, ne1, ne01, ne10,
1.0f, y, ne10, 1.0f, y, ne10,
x, ne00, x, ne00,
0.0f, d, ne01); 0.0f, d, ne01);
@ -9721,8 +9716,8 @@ static void ggml_compute_forward_mul_mat(
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10); const size_t row_size = ggml_row_size(vec_dot_type, ne10);
const int64_t nr0 = ne01; // src0 rows const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1*ne12*ne13; // src1 rows const int64_t nr1 = ne1*ne12*ne13; // src1 rows
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
@ -9764,9 +9759,9 @@ static void ggml_compute_forward_mul_mat(
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
const int64_t i13 = (ir1/(ne12*cne1)); const int64_t i13 = (ir1/(ne12*ne1));
const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1; const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1; const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
// broadcast src0 into src1 // broadcast src0 into src1
const int64_t i03 = i13/r3; const int64_t i03 = i13/r3;
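
With off1/cne1 removed, ggml_compute_forward_mul_mat decomposes the flat src1 row index ir1 back into (i11, i12, i13) using ne1 directly, as in the hunk above. A tiny self-contained check of that index arithmetic; the extents are arbitrary illustrative values:

// Sketch: verify that the (i11, i12, i13) decomposition used above inverts
// the flat row index ir1 = i11 + i12*ne1 + i13*ne12*ne1.
#include <cassert>
#include <cstdint>

int main() {
    const int64_t ne1 = 4, ne12 = 3, ne13 = 2;   // illustrative extents

    for (int64_t ir1 = 0; ir1 < ne1*ne12*ne13; ++ir1) {
        const int64_t i13 = ir1/(ne12*ne1);
        const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
        const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);

        assert(ir1 == i11 + i12*ne1 + i13*ne12*ne1); // round-trips for every row
    }
    return 0;
}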
@ -9806,28 +9801,191 @@ static void ggml_compute_forward_mul_mat(
static void ggml_compute_forward_mul_mat_id( static void ggml_compute_forward_mul_mat_id(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * ids,
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
// during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
return;
}
const struct ggml_tensor * ids = src0; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
const int nth = params->nth;
const enum ggml_type type = src0->type;
const bool src1_cont = ggml_is_contiguous(src1);
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
// broadcast factors
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
// row groups
const int id = ggml_get_op_params_i32(dst, 0); const int id = ggml_get_op_params_i32(dst, 0);
const int n_as = ggml_get_op_params_i32(dst, 1); const int n_as = ggml_get_op_params_i32(dst, 1);
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { char * wdata_src1_end = (src1->type == vec_dot_type) ?
const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]); (char *) params->wdata :
(char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
GGML_ASSERT(row_id >= 0 && row_id < n_as); int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11]
const struct ggml_tensor * src0_row = dst->src[row_id + 2]; #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
if (params->type == GGML_TASK_INIT) {
char * wdata = params->wdata;
if (src1->type != vec_dot_type) {
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
assert(params->wsize >= ne11*ne12*ne13*row_size);
assert(src1->type == GGML_TYPE_F32);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
wdata += row_size;
}
}
}
}
// initialize matrix_row_counts
GGML_ASSERT(wdata == wdata_src1_end);
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
// group rows by src0 matrix
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(row_id >= 0 && row_id < n_as);
MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
matrix_row_counts[row_id] += 1;
}
return;
} }
if (params->type == GGML_TASK_FINALIZE) {
return;
}
// compute each matrix multiplication in sequence
for (int cur_a = 0; cur_a < n_as; ++cur_a) {
const int64_t cne1 = matrix_row_counts[cur_a];
if (cne1 == 0) {
continue;
}
const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1*ne12*ne13; // src1 rows
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
// distribute the thread work across the inner or outer loop based on which one is larger
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
const int64_t ith0 = ith % nth0;
const int64_t ith1 = ith / nth0;
const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
const int64_t ir010 = dr0*ith0;
const int64_t ir011 = MIN(ir010 + dr0, nr0);
const int64_t ir110 = dr1*ith1;
const int64_t ir111 = MIN(ir110 + dr1, nr1);
//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
// threads with no work simply yield (not sure if it helps)
if (ir010 >= ir011 || ir110 >= ir111) {
sched_yield();
continue;
}
assert(ne12 % ne02 == 0);
assert(ne13 % ne03 == 0);
// block-tiling attempt
const int64_t blck_0 = 16;
const int64_t blck_1 = 16;
// attempt to reduce false-sharing (does not seem to make a difference)
float tmp[16];
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
// broadcast src0 into src1
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
const int64_t i1 = i11;
const int64_t i2 = i12;
const int64_t i3 = i13;
const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
// the original src1 data pointer, so we should index using the indices directly
// TODO: this is a bit of a hack, we should probably have a better way to handle this
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
: (i11*nb11 + i12*nb12 + i13*nb13));
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
//}
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
}
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
}
}
}
}
#undef MMID_MATRIX_ROW
} }
// ggml_compute_forward_out_prod // ggml_compute_forward_out_prod
@ -14217,7 +14375,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
} break; } break;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
{ {
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]); ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
} break; } break;
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
{ {
@ -16017,7 +16175,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break; } break;
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
{ {
// FIXME: blas
n_tasks = n_threads; n_tasks = n_threads;
} break; } break;
case GGML_OP_OUT_PROD: case GGML_OP_OUT_PROD:
@ -16351,20 +16508,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
} break; } break;
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
{ {
const struct ggml_tensor * a = node->src[2]; const struct ggml_tensor * src0 = node->src[2];
const struct ggml_tensor * b = node->src[1]; const struct ggml_tensor * src1 = node->src[1];
const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type; const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (src1->type != vec_dot_type) {
if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) { cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
if (a->type != GGML_TYPE_F32) {
// here we need memory just for single 2D matrix from src0
cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
}
} else
#endif
if (b->type != vec_dot_type) {
cur = ggml_row_size(vec_dot_type, ggml_nelements(b));
} }
const int n_as = ggml_get_op_params_i32(node, 1);
cur = GGML_PAD(cur, sizeof(int64_t)); // align
cur += n_as * sizeof(int64_t); // matrix_row_counts
cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
} break; } break;
case GGML_OP_OUT_PROD: case GGML_OP_OUT_PROD:
{ {
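
The rewritten ggml_compute_forward_mul_mat_id above no longer calls ggml_compute_forward_mul_mat once per ids row: during GGML_TASK_INIT it buckets the src1 rows by their expert id into matrix_row_counts / matrix_rows (the MMID_MATRIX_ROW scratch area), and the compute phase then runs one batched multiplication per expert over its grouped rows. A standalone sketch of just that grouping step, with std::vector standing in for the flat wdata layout; the expert count and ids are made-up values:

// Sketch of the row-grouping pass in ggml_compute_forward_mul_mat_id,
// using vectors instead of the wdata scratch area and MMID_MATRIX_ROW macro.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<int32_t> ids = {2, 0, 2, 1, 0, 2}; // expert id per src1 row
    const int n_as = 3;                                  // number of expert matrices

    std::vector<int64_t>              matrix_row_counts(n_as, 0); // rows per expert
    std::vector<std::vector<int64_t>> matrix_rows(n_as);          // row indices per expert

    for (int64_t i01 = 0; i01 < (int64_t) ids.size(); ++i01) {
        const int32_t row_id = ids[i01];
        matrix_rows[row_id].push_back(i01);  // MMID_MATRIX_ROW(row_id, count) = i01
        matrix_row_counts[row_id] += 1;
    }

    // Compute-phase analogue: one pass per expert, skipping experts with no rows.
    for (int cur_a = 0; cur_a < n_as; ++cur_a) {
        if (matrix_row_counts[cur_a] == 0) {
            continue;
        }
        std::printf("expert %d gets src1 rows:", cur_a);
        for (int64_t i11 : matrix_rows[cur_a]) {
            std::printf(" %lld", (long long) i11);   // the i11 used to index src1/dst
        }
        std::printf("\n");
    }
    return 0;
}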

View file

@ -109,8 +109,10 @@ class SpecialVocab:
return True return True
def _set_special_token(self, typ: str, tid: Any) -> None: def _set_special_token(self, typ: str, tid: Any) -> None:
if not isinstance(tid, int) or tid < 0: if not isinstance(tid, int):
return return
if tid < 0:
raise ValueError(f'invalid value for special token type {typ}: {tid}')
if self.n_vocab is None or tid < self.n_vocab: if self.n_vocab is None or tid < self.n_vocab:
if typ in self.special_token_ids: if typ in self.special_token_ids:
return return

188 llama.cpp
View file

@ -1521,6 +1521,10 @@ struct llama_context {
// decode output (2-dimensional array: [n_tokens][n_vocab]) // decode output (2-dimensional array: [n_tokens][n_vocab])
std::vector<float> logits; std::vector<float> logits;
#ifndef NDEBUG
// guard against access to unset logits
std::vector<bool> logits_valid;
#endif
bool logits_all = false; bool logits_all = false;
// input embedding (1-dimensional array: [n_embd]) // input embedding (1-dimensional array: [n_embd])
@ -2409,25 +2413,25 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
switch (ftype) { switch (ftype) {
case LLAMA_FTYPE_ALL_F32: return "all F32"; case LLAMA_FTYPE_ALL_F32: return "all F32";
case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; case LLAMA_FTYPE_MOSTLY_F16: return "F16";
case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
return "mostly Q4_1, some F16"; return "Q4_1, some F16";
case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
// K-quants // K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K"; case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small"; case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large"; case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small"; case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small"; case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K"; case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
default: return "unknown, may not work"; default: return "unknown, may not work";
} }
@ -2545,6 +2549,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 22: model.type = e_model::MODEL_1B; break;
case 26: model.type = e_model::MODEL_3B; break; case 26: model.type = e_model::MODEL_3B; break;
case 32: model.type = e_model::MODEL_7B; break; case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_13B; break; case 40: model.type = e_model::MODEL_13B; break;
@ -6386,6 +6391,14 @@ static int llama_decode_internal(
{ {
auto & logits_out = lctx.logits; auto & logits_out = lctx.logits;
#ifndef NDEBUG
auto & logits_valid = lctx.logits_valid;
logits_valid.clear();
logits_valid.resize(n_tokens);
logits_out.clear();
#endif
if (batch.logits) { if (batch.logits) {
logits_out.resize(n_vocab * n_tokens); logits_out.resize(n_vocab * n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) { for (uint32_t i = 0; i < n_tokens; i++) {
@ -6393,13 +6406,22 @@ static int llama_decode_internal(
continue; continue;
} }
memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
#ifndef NDEBUG
logits_valid[i] = true;
#endif
} }
} else if (lctx.logits_all) { } else if (lctx.logits_all) {
logits_out.resize(n_vocab * n_tokens); logits_out.resize(n_vocab * n_tokens);
memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
#ifndef NDEBUG
std::fill(logits_valid.begin(), logits_valid.end(), true);
#endif
} else { } else {
logits_out.resize(n_vocab); logits_out.resize(n_vocab);
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
#ifndef NDEBUG
logits_valid[0] = true;
#endif
} }
} }
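
The #ifndef NDEBUG additions above pair lctx.logits with a per-token logits_valid vector, so debug builds record exactly which logit rows were written during decode and can trap reads of rows that never were. A minimal standalone sketch of that pattern, detached from llama_context; the struct and its layout are illustrative only:

// Debug-only validity tracking for a flattened [n_tokens][n_vocab] logits buffer.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

struct logits_buffer {
    size_t n_vocab = 0;
    std::vector<float> logits;        // n_tokens * n_vocab floats
#ifndef NDEBUG
    std::vector<bool>  logits_valid;  // one flag per token row
#endif

    void reset(size_t n_tokens, size_t n_vocab_) {
        n_vocab = n_vocab_;
        logits.assign(n_tokens*n_vocab, 0.0f);
#ifndef NDEBUG
        logits_valid.assign(n_tokens, false);
#endif
    }

    void write_row(size_t i, const float * src) {
        std::copy(src, src + n_vocab, logits.begin() + i*n_vocab);
#ifndef NDEBUG
        logits_valid[i] = true;       // mirrors the memcpy paths above
#endif
    }

    const float * row(size_t i) const {
#ifndef NDEBUG
        assert(logits_valid[i] && "reading logits that were never computed");
#endif
        return logits.data() + i*n_vocab;
    }
};

int main() {
    logits_buffer buf;
    buf.reset(/*n_tokens=*/2, /*n_vocab=*/4);
    const float row0[4] = {0.1f, 0.2f, 0.3f, 0.4f};
    buf.write_row(0, row0);
    (void) buf.row(0);    // fine: row 0 was written
    // (void) buf.row(1); // would assert in a debug build: row 1 never written
    return 0;
}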
@ -8862,53 +8884,60 @@ static int llama_apply_lora_from_file_internal(
const int64_t t_start_lora_us = ggml_time_us(); const int64_t t_start_lora_us = ggml_time_us();
auto fin = std::ifstream(path_lora, std::ios::binary); llama_file fin(path_lora, "rb");
if (!fin) {
LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
return 1;
}
// verify magic and version // verify magic and version
{ {
uint32_t magic; uint32_t magic = fin.read_u32();
fin.read((char *) &magic, sizeof(magic)); if (magic != LLAMA_FILE_MAGIC_GGLA) {
uint32_t format_version; LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
fin.read((char *) &format_version, sizeof(format_version)); return 1;
}
uint32_t format_version = fin.read_u32();
if (format_version != 1) { if (format_version != 1) {
LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
return 1; return 1;
} }
} }
int32_t lora_r; int32_t lora_r = fin.read_u32();
int32_t lora_alpha; int32_t lora_alpha = fin.read_u32();
fin.read((char *) &lora_r, sizeof(lora_r));
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
float scaling = scale * (float)lora_alpha / (float)lora_r; float scaling = scale * (float)lora_alpha / (float)lora_r;
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
// create a name -> tensor map of the model to accelerate lookups
// find the max tensor size to estimate the required temporary buffer size
size_t max_tensor_size = 0;
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
for (const auto & kv : model.tensors_by_name) {
model_tensors.insert(kv);
size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
max_tensor_size = std::max(max_tensor_size, f32_size);
}
// create a temporary ggml context to store the lora tensors // create a temporary ggml context to store the lora tensors
// todo: calculate size from biggest possible tensor // TODO: use ggml-alloc
std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull); size_t lora_ctx_size = max_tensor_size * 3;
LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
std::vector<uint8_t> lora_buf(lora_ctx_size);
struct ggml_init_params params; struct ggml_init_params params;
params.mem_size = lora_buf.size(); params.mem_size = lora_buf.size();
params.mem_buffer = lora_buf.data(); params.mem_buffer = lora_buf.data();
params.no_alloc = false; params.no_alloc = false;
ggml_context * lora_ctx = ggml_init(params); using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
// create a name -> tensor map of the model to accelerate lookups unique_context lora_ctx(nullptr, ggml_free);
std::unordered_map<std::string, struct ggml_tensor*> model_tensors; lora_ctx.reset(ggml_init(params));
for (const auto & kv : model.tensors_by_name) { std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
model_tensors.insert(kv);
}
// load base model // load base model
std::unique_ptr<llama_model_loader> ml; std::unique_ptr<llama_model_loader> ml;
ggml_context * base_ctx = NULL;
unique_context base_ctx(nullptr, ggml_free);
std::vector<uint8_t> base_buf; std::vector<uint8_t> base_buf;
if (path_base_model) { if (path_base_model) {
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
@ -8917,6 +8946,7 @@ static int llama_apply_lora_from_file_internal(
size_t ctx_size; size_t ctx_size;
size_t mmapped_size; size_t mmapped_size;
ml->calc_sizes(ctx_size, mmapped_size); ml->calc_sizes(ctx_size, mmapped_size);
base_buf.resize(ctx_size); base_buf.resize(ctx_size);
ggml_init_params base_params; ggml_init_params base_params;
@ -8924,9 +8954,9 @@ static int llama_apply_lora_from_file_internal(
base_params.mem_buffer = base_buf.data(); base_params.mem_buffer = base_buf.data();
base_params.no_alloc = ml->use_mmap; base_params.no_alloc = ml->use_mmap;
base_ctx = ggml_init(base_params); base_ctx.reset(ggml_init(base_params));
// maybe this should in llama_model_loader // maybe this should be in llama_model_loader
if (ml->use_mmap) { if (ml->use_mmap) {
ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa())); ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
} }
@ -8939,27 +8969,35 @@ static int llama_apply_lora_from_file_internal(
std::vector<uint8_t> work_buffer; std::vector<uint8_t> work_buffer;
while (true) { while (true) {
if (fin.tell() == fin.size) {
// eof
break;
}
int32_t n_dims; int32_t n_dims;
int32_t length; int32_t name_len;
int32_t ftype; int32_t ftype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims)); fin.read_raw(&n_dims, sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length)); fin.read_raw(&name_len, sizeof(name_len));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype)); fin.read_raw(&ftype, sizeof(ftype));
if (fin.eof()) {
break; if (n_dims != 1 && n_dims != 2) {
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
return 1;
} }
int32_t ne[2] = { 1, 1 }; int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) { for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i])); fin.read_raw(&ne[i], sizeof(ne[i]));
} }
std::string name; std::string name;
{ {
GGML_ASSERT(name_len <= 1024);
char buf[1024]; char buf[1024];
fin.read(buf, length); fin.read_raw(buf, name_len);
name = std::string(buf, length); name = std::string(buf, name_len);
} }
// check for lora suffix and get the type of tensor // check for lora suffix and get the type of tensor
@ -8973,7 +9011,7 @@ static int llama_apply_lora_from_file_internal(
std::string lora_type = name.substr(pos + lora_suffix.length()); std::string lora_type = name.substr(pos + lora_suffix.length());
std::string base_name = name; std::string base_name = name;
base_name.erase(pos); base_name.erase(pos);
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
if (model_tensors.find(base_name) == model_tensors.end()) { if (model_tensors.find(base_name) == model_tensors.end()) {
LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
@ -8992,22 +9030,15 @@ static int llama_apply_lora_from_file_internal(
return false; return false;
} }
} }
ggml_tensor * lora_tensor; ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
if (n_dims == 2) { ggml_set_name(lora_tensor, name.c_str());
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
}
else {
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
return 1;
}
ggml_set_name(lora_tensor, "lora_tensor");
// load tensor data // load tensor data
size_t offset = fin.tellg(); size_t offset = fin.tell();
size_t tensor_data_size = ggml_nbytes(lora_tensor); size_t tensor_data_size = ggml_nbytes(lora_tensor);
offset = (offset + 31) & -32; offset = (offset + 31) & -32;
fin.seekg(offset); fin.seek(offset, SEEK_SET);
fin.read((char*)lora_tensor->data, tensor_data_size); fin.read_raw(lora_tensor->data, tensor_data_size);
lora_tensors[name] = lora_tensor; lora_tensors[name] = lora_tensor;
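// The bit trick above rounds the data offset up to the next multiple of 32
// (a sketch of the arithmetic only; the values are examples):
//   (100 + 31) & -32 == 128
//   (128 + 31) & -32 == 128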
@ -9037,13 +9068,11 @@ static int llama_apply_lora_from_file_internal(
// load from base model // load from base model
if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) { if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
// TODO: throw
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
return 1; return 1;
} }
// TODO: not tested!! maybe not working! base_t = ml->create_tensor(base_ctx.get(), base_name, { dest_t->ne[0], dest_t->ne[1] }, GGML_BACKEND_CPU);
base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
ml->load_data_for(base_t); ml->load_data_for(base_t);
} else { } else {
base_t = dest_t; base_t = dest_t;
@ -9072,43 +9101,45 @@ static int llama_apply_lora_from_file_internal(
} }
// w = w + BA*s // w = w + BA*s
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
offload_func(BA); offload_func(BA);
ggml_set_name(BA, "BA"); ggml_set_name(BA, "BA");
if (scaling != 1.0f) { if (scaling != 1.0f) {
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx.get(), scaling);
ggml_set_name(scale_tensor, "scale_tensor"); ggml_set_name(scale_tensor, "scale_tensor");
BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); BA = ggml_scale_inplace(lora_ctx.get(), BA, scale_tensor);
offload_func(BA); offload_func(BA);
ggml_set_name(BA, "BA_scaled"); ggml_set_name(BA, "BA_scaled");
} }
ggml_tensor * r; ggml_tensor * r;
if (base_t == dest_t) { if (base_t == dest_t) {
r = ggml_add_inplace(lora_ctx, dest_t, BA); r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
offload_func_force_inplace(r); offload_func_force_inplace(r);
ggml_set_name(r, "r_add_inplace"); ggml_set_name(r, "r_add_inplace");
} }
else { else {
r = ggml_add(lora_ctx, base_t, BA); r = ggml_add(lora_ctx.get(), base_t, BA);
offload_func(r); offload_func(r);
ggml_set_name(r, "r_add"); ggml_set_name(r, "r_add");
r = ggml_cpy(lora_ctx, r, dest_t); r = ggml_cpy(lora_ctx.get(), r, dest_t);
offload_func(r); offload_func(r);
ggml_set_name(r, "r_cpy"); ggml_set_name(r, "r_cpy");
} }
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx); struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
ggml_build_forward_expand(gf, r); ggml_build_forward_expand(gf, r);
ggml_graph_compute_helper(work_buffer, gf, n_threads); ggml_graph_compute_helper(work_buffer, gf, n_threads);
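// In LoRA terms the graph above evaluates the update from the source comment
// "w = w + BA*s" (a sketch; the value of `scaling` is set earlier from the
// adapter parameters):
//   W' = W + scaling * (B * A)
// When base_t == dest_t the add happens in place on the destination weights,
// otherwise the result is copied back into dest_t via ggml_cpy.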
// the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
GGML_ASSERT(lora_tensors.size() == 2);
// we won't need these tensors again, reset the context to save memory // we won't need these tensors again, reset the context to save memory
ggml_free(lora_ctx); lora_ctx.reset(ggml_init(params));
lora_ctx = ggml_init(params);
lora_tensors.clear(); lora_tensors.clear();
n_tensors++; n_tensors++;
@ -9118,12 +9149,6 @@ static int llama_apply_lora_from_file_internal(
} }
} }
// TODO: this should be in a destructor, it will leak on failure
ggml_free(lora_ctx);
if (base_ctx) {
ggml_free(base_ctx);
}
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
@ -10288,6 +10313,7 @@ float * llama_get_logits(struct llama_context * ctx) {
} }
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
assert(ctx->logits_valid.at(i));
return ctx->logits.data() + i*ctx->model.hparams.n_vocab; return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
} }
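// Minimal usage sketch for the accessor above (assumes a context with a decoded
// batch; the helper calls are the public llama.h API, shown here for illustration only):
//   const int     n_vocab = llama_n_vocab(llama_get_model(ctx));
//   const float * row     = llama_get_logits_ith(ctx, i);  // n_vocab logits for batch position i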

View file

@ -39,6 +39,7 @@
#define LLAMA_MAX_RNG_STATE (64*1024) #define LLAMA_MAX_RNG_STATE (64*1024)
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
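// For reference: 0x67, 0x67, 0x6c, 0x61 are the ASCII codes for 'g', 'g', 'l', 'a',
// so the new constant spells out the 'ggla' LoRA adapter magic named in the comment.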