resolve merge conflict

This commit is contained in:
manikbhandari 2023-12-20 06:28:20 -05:00
commit 5d5b6088b5
24 changed files with 1684 additions and 762 deletions

View file

@ -23,3 +23,6 @@ insert_final_newline = unset
[examples/server/public/*] [examples/server/public/*]
indent_size = 2 indent_size = 2
[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
indent_style = tab

View file

@ -291,7 +291,12 @@ if (LLAMA_CUBLAS)
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE}) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
if (LLAMA_STATIC) if (LLAMA_STATIC)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) if (WIN32)
# As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else ()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
endif()
else() else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif() endif()

View file

@ -439,9 +439,15 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
endif # LLAMA_CLBLAST endif # LLAMA_CLBLAST
ifdef LLAMA_HIPBLAS ifdef LLAMA_HIPBLAS
ROCM_PATH ?= /opt/rocm
HIPCC ?= $(ROCM_PATH)/bin/hipcc ifeq ($(wildcard /opt/rocm),)
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) ROCM_PATH ?= /usr
GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
else
ROCM_PATH ?= /opt/rocm
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
endif
HIPCC ?= $(ROCM_PATH)/bin/hipcc
LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1 LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2 LLAMA_CUDA_KQUANTS_ITER ?= 2

View file

@ -10,11 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
### Hot topics ### Hot topics
- Collecting Apple Silicon performance stats:
- M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
- A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406 - Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
- **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309**
- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
---- ----

View file

@ -185,6 +185,8 @@ class Model:
return MixtralModel return MixtralModel
if model_architecture == "GPT2LMHeadModel": if model_architecture == "GPT2LMHeadModel":
return GPT2Model return GPT2Model
if model_architecture == "PhiForCausalLM":
return Phi2Model
return Model return Model
def _is_model_safetensors(self) -> bool: def _is_model_safetensors(self) -> bool:
@ -226,6 +228,8 @@ class Model:
return gguf.MODEL_ARCH.LLAMA return gguf.MODEL_ARCH.LLAMA
if arch == "GPT2LMHeadModel": if arch == "GPT2LMHeadModel":
return gguf.MODEL_ARCH.GPT2 return gguf.MODEL_ARCH.GPT2
if arch == "PhiForCausalLM":
return gguf.MODEL_ARCH.PHI2
raise NotImplementedError(f'Architecture "{arch}" not supported!') raise NotImplementedError(f'Architecture "{arch}" not supported!')
@ -1046,6 +1050,23 @@ class GPT2Model(Model):
print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor("output.weight", data) self.gguf_writer.add_tensor("output.weight", data)
class Phi2Model(Model):
def set_gguf_parameters(self):
block_count = self.hparams["n_layer"]
self.gguf_writer.add_name("Phi2")
self.gguf_writer.add_context_length(self.hparams["n_positions"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(self.hparams["n_head"])
self.gguf_writer.add_head_count_kv(self.hparams["n_head"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"])
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_add_bos_token(False)
###### CONVERSION LOGIC ###### ###### CONVERSION LOGIC ######

View file

@ -3,7 +3,6 @@ from __future__ import annotations
import json import json
import os import os
import re
import struct import struct
import sys import sys
from typing import Any, BinaryIO, Sequence from typing import Any, BinaryIO, Sequence
@ -11,43 +10,15 @@ from typing import Any, BinaryIO, Sequence
import numpy as np import numpy as np
import torch import torch
from pathlib import Path
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
HF_SUBLAYER_TO_GGML = {
"self_attn.q_proj": "attn_q",
"self_attn.k_proj": "attn_k",
"self_attn.v_proj": "attn_v",
"self_attn.o_proj": "attn_output",
"mlp.gate_proj": "ffn_gate",
"mlp.down_proj": "ffn_down",
"mlp.up_proj": "ffn_up",
"input_layernorm": "attn_norm",
"post_attention_layernorm": "ffn_norm",
}
def translate_tensor_name(t: str) -> str:
match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
if match:
nn = match.group(1)
sub_layer = match.group(2)
lora_type = match.group(3)
sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
if sub_layer_renamed is None:
print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
sys.exit(1)
output_string = (
f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
)
return output_string
else:
print(f"Error: unrecognized tensor {t}")
sys.exit(1)
def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None: def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
fout.write(b"ggla"[::-1]) # magic (ggml lora) fout.write(b"ggla"[::-1]) # magic (ggml lora)
fout.write(struct.pack("i", 1)) # file version fout.write(struct.pack("i", 1)) # file version
@ -61,9 +32,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
fout.write(struct.pack("i", int(params["lora_alpha"]))) fout.write(struct.pack("i", int(params["lora_alpha"])))
def write_tensor_header( def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
) -> None:
sname = name.encode("utf-8") sname = name.encode("utf-8")
fout.write( fout.write(
struct.pack( struct.pack(
@ -78,11 +47,12 @@ def write_tensor_header(
fout.seek((fout.tell() + 31) & -32) fout.seek((fout.tell() + 31) & -32)
if len(sys.argv) != 2: if len(sys.argv) < 2:
print(f"Usage: python {sys.argv[0]} <path>") print(f"Usage: python {sys.argv[0]} <path> [arch]")
print( print(
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
) )
print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
sys.exit(1) sys.exit(1)
input_json = os.path.join(sys.argv[1], "adapter_config.json") input_json = os.path.join(sys.argv[1], "adapter_config.json")
@ -90,6 +60,14 @@ input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
model = torch.load(input_model, map_location="cpu") model = torch.load(input_model, map_location="cpu")
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
print(f"Error: unsupported architecture {arch_name}")
sys.exit(1)
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
with open(input_json, "r") as f: with open(input_json, "r") as f:
params = json.load(f) params = json.load(f)
@ -117,6 +95,7 @@ with open(output_path, "wb") as fout:
write_file_header(fout, params) write_file_header(fout, params)
for k, v in model.items(): for k, v in model.items():
orig_k = k
if k.endswith(".default.weight"): if k.endswith(".default.weight"):
k = k.replace(".default.weight", ".weight") k = k.replace(".default.weight", ".weight")
if k in ["llama_proj.weight", "llama_proj.bias"]: if k in ["llama_proj.weight", "llama_proj.bias"]:
@ -129,7 +108,32 @@ with open(output_path, "wb") as fout:
v = v.float() v = v.float()
t = v.detach().numpy() t = v.detach().numpy()
tname = translate_tensor_name(k)
prefix = "base_model.model."
if k.startswith(prefix):
k = k[len(prefix) :]
lora_suffixes = (".lora_A.weight", ".lora_B.weight")
if k.endswith(lora_suffixes):
suffix = k[-len(lora_suffixes[0]):]
k = k[: -len(lora_suffixes[0])]
else:
print(f"Error: unrecognized tensor name {orig_k}")
sys.exit(1)
tname = name_map.get_name(k)
if tname is None:
print(f"Error: could not map tensor name {orig_k}")
print(" Note: the arch parameter must be specified if the model is not llama")
sys.exit(1)
if suffix == ".lora_A.weight":
tname += ".weight.loraA"
elif suffix == ".lora_B.weight":
tname += ".weight.loraB"
else:
assert False
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype) write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout) t.tofile(fout)

View file

@ -1620,8 +1620,6 @@ int main(int argc, char ** argv) {
opt->params.adam.gclip = params.common.adam_gclip; opt->params.adam.gclip = params.common.adam_gclip;
opt->params.adam.eps_f = params.common.adam_eps_f; opt->params.adam.eps_f = params.common.adam_eps_f;
ggml_allocr * alloc = NULL;
printf("%s: init model\n", __func__); printf("%s: init model\n", __func__);
bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train); bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);
@ -1725,10 +1723,9 @@ int main(int argc, char ** argv) {
// allocate input tensors // allocate input tensors
mem_input_data.resize(max_input_size); mem_input_data.resize(max_input_size);
alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
ggml_allocr_alloc(alloc, tokens_input); ggml_allocr_alloc(alloc_inps, tokens_input);
ggml_allocr_alloc(alloc, target_probs); ggml_allocr_alloc(alloc_inps, target_probs);
ggml_allocr_free(alloc);
// context for compute tensors without their data // context for compute tensors without their data
const size_t estimated_compute_size_wo_data = ( const size_t estimated_compute_size_wo_data = (
@ -1755,7 +1752,7 @@ int main(int argc, char ** argv) {
// find best evaluation order // find best evaluation order
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
ctx_compute = ggml_init(ctx_compute_params); ctx_compute = ggml_init(ctx_compute_params);
alloc = ggml_allocr_new_measure(tensor_alignment); ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment);
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf->order = (enum ggml_cgraph_eval_order) order; gf->order = (enum ggml_cgraph_eval_order) order;
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1788,7 +1785,7 @@ int main(int argc, char ** argv) {
// allocate compute tensors // allocate compute tensors
mem_compute_data.resize(max_compute_size); mem_compute_data.resize(max_compute_size);
ctx_compute = ggml_init(ctx_compute_params); ctx_compute = ggml_init(ctx_compute_params);
alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf->order = best_order; gf->order = best_order;
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1804,6 +1801,8 @@ int main(int argc, char ** argv) {
params.common.use_checkpointing params.common.use_checkpointing
); );
ggml_allocr_free(alloc); ggml_allocr_free(alloc);
ggml_allocr_free(alloc_inps);
// tokenize data // tokenize data
std::vector<llama_token> train_tokens; std::vector<llama_token> train_tokens;

View file

@ -1 +1,2 @@
xcuserdata xcuserdata
xcshareddata

View file

@ -6,16 +6,34 @@ enum LlamaError: Error {
case couldNotInitializeContext case couldNotInitializeContext
} }
func llama_batch_clear(_ batch: inout llama_batch) {
batch.n_tokens = 0
}
func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], _ logits: Bool) {
batch.token [Int(batch.n_tokens)] = id
batch.pos [Int(batch.n_tokens)] = pos
batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count)
for i in 0..<seq_ids.count {
batch.seq_id[Int(batch.n_tokens)]![Int(i)] = seq_ids[i]
}
batch.logits [Int(batch.n_tokens)] = logits ? 1 : 0
batch.n_tokens += 1
}
actor LlamaContext { actor LlamaContext {
private var model: OpaquePointer private var model: OpaquePointer
private var context: OpaquePointer private var context: OpaquePointer
private var batch: llama_batch private var batch: llama_batch
private var tokens_list: [llama_token] private var tokens_list: [llama_token]
/// This variable is used to store temporarily invalid cchars /// This variable is used to store temporarily invalid cchars
private var temporary_invalid_cchars: [CChar] private var temporary_invalid_cchars: [CChar]
var n_len: Int32 = 512 var n_len: Int32 = 64
var n_cur: Int32 = 0 var n_cur: Int32 = 0
var n_decode: Int32 = 0 var n_decode: Int32 = 0
init(model: OpaquePointer, context: OpaquePointer) { init(model: OpaquePointer, context: OpaquePointer) {
@ -27,25 +45,34 @@ actor LlamaContext {
} }
deinit { deinit {
llama_batch_free(batch)
llama_free(context) llama_free(context)
llama_free_model(model) llama_free_model(model)
llama_backend_free() llama_backend_free()
} }
static func createContext(path: String) throws -> LlamaContext { static func create_context(path: String) throws -> LlamaContext {
llama_backend_init(false) llama_backend_init(false)
let model_params = llama_model_default_params() var model_params = llama_model_default_params()
#if targetEnvironment(simulator)
model_params.n_gpu_layers = 0
print("Running on simulator, force use n_gpu_layers = 0")
#endif
let model = llama_load_model_from_file(path, model_params) let model = llama_load_model_from_file(path, model_params)
guard let model else { guard let model else {
print("Could not load model at \(path)") print("Could not load model at \(path)")
throw LlamaError.couldNotInitializeContext throw LlamaError.couldNotInitializeContext
} }
let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
print("Using \(n_threads) threads")
var ctx_params = llama_context_default_params() var ctx_params = llama_context_default_params()
ctx_params.seed = 1234 ctx_params.seed = 1234
ctx_params.n_ctx = 2048 ctx_params.n_ctx = 2048
ctx_params.n_threads = 8 ctx_params.n_threads = UInt32(n_threads)
ctx_params.n_threads_batch = 8 ctx_params.n_threads_batch = UInt32(n_threads)
let context = llama_new_context_with_model(model, ctx_params) let context = llama_new_context_with_model(model, ctx_params)
guard let context else { guard let context else {
@ -56,6 +83,26 @@ actor LlamaContext {
return LlamaContext(model: model, context: context) return LlamaContext(model: model, context: context)
} }
func model_info() -> String {
let result = UnsafeMutablePointer<Int8>.allocate(capacity: 256)
result.initialize(repeating: Int8(0), count: 256)
defer {
result.deallocate()
}
// TODO: this is probably very stupid way to get the string from C
let nChars = llama_model_desc(model, result, 256)
let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nChars))
var SwiftString = ""
for char in bufferPointer {
SwiftString.append(Character(UnicodeScalar(UInt8(char))))
}
return SwiftString
}
func get_n_tokens() -> Int32 { func get_n_tokens() -> Int32 {
return batch.n_tokens; return batch.n_tokens;
} }
@ -79,16 +126,11 @@ actor LlamaContext {
print(String(cString: token_to_piece(token: id) + [0])) print(String(cString: token_to_piece(token: id) + [0]))
} }
// batch = llama_batch_init(512, 0) // done in init() llama_batch_clear(&batch)
batch.n_tokens = Int32(tokens_list.count)
for i1 in 0..<batch.n_tokens { for i1 in 0..<tokens_list.count {
let i = Int(i1) let i = Int(i1)
batch.token[i] = tokens_list[i] llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
batch.pos[i] = i1
batch.n_seq_id[Int(i)] = 1
batch.seq_id[Int(i)]![0] = 0
batch.logits[i] = 0
} }
batch.logits[Int(batch.n_tokens) - 1] = 1 // true batch.logits[Int(batch.n_tokens) - 1] = 1 // true
@ -141,18 +183,11 @@ actor LlamaContext {
print(new_token_str) print(new_token_str)
// tokens_list.append(new_token_id) // tokens_list.append(new_token_id)
batch.n_tokens = 0 llama_batch_clear(&batch)
llama_batch_add(&batch, new_token_id, n_cur, [0], true)
batch.token[Int(batch.n_tokens)] = new_token_id
batch.pos[Int(batch.n_tokens)] = n_cur
batch.n_seq_id[Int(batch.n_tokens)] = 1
batch.seq_id[Int(batch.n_tokens)]![0] = 0
batch.logits[Int(batch.n_tokens)] = 1 // true
batch.n_tokens += 1
n_decode += 1 n_decode += 1
n_cur += 1
n_cur += 1
if llama_decode(context, batch) != 0 { if llama_decode(context, batch) != 0 {
print("failed to evaluate llama!") print("failed to evaluate llama!")
@ -161,14 +196,111 @@ actor LlamaContext {
return new_token_str return new_token_str
} }
func bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) -> String {
var pp_avg: Double = 0
var tg_avg: Double = 0
var pp_std: Double = 0
var tg_std: Double = 0
for _ in 0..<nr {
// bench prompt processing
llama_batch_clear(&batch)
let n_tokens = pp
for i in 0..<n_tokens {
llama_batch_add(&batch, 0, Int32(i), [0], false)
}
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
llama_kv_cache_clear(context)
let t_pp_start = ggml_time_us()
if llama_decode(context, batch) != 0 {
print("llama_decode() failed during prompt")
}
let t_pp_end = ggml_time_us()
// bench text generation
llama_kv_cache_clear(context)
let t_tg_start = ggml_time_us()
for i in 0..<tg {
llama_batch_clear(&batch)
for j in 0..<pl {
llama_batch_add(&batch, 0, Int32(i), [Int32(j)], true)
}
if llama_decode(context, batch) != 0 {
print("llama_decode() failed during text generation")
}
}
let t_tg_end = ggml_time_us()
llama_kv_cache_clear(context)
let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
let speed_pp = Double(pp) / t_pp
let speed_tg = Double(pl*tg) / t_tg
pp_avg += speed_pp
tg_avg += speed_tg
pp_std += speed_pp * speed_pp
tg_std += speed_tg * speed_tg
print("pp \(speed_pp) t/s, tg \(speed_tg) t/s")
}
pp_avg /= Double(nr)
tg_avg /= Double(nr)
if nr > 1 {
pp_std = sqrt(pp_std / Double(nr - 1) - pp_avg * pp_avg * Double(nr) / Double(nr - 1))
tg_std = sqrt(tg_std / Double(nr - 1) - tg_avg * tg_avg * Double(nr) / Double(nr - 1))
} else {
pp_std = 0
tg_std = 0
}
let model_desc = model_info();
let model_size = String(format: "%.2f GiB", Double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0);
let model_n_params = String(format: "%.2f B", Double(llama_model_n_params(model)) / 1e9);
let backend = "Metal";
let pp_avg_str = String(format: "%.2f", pp_avg);
let tg_avg_str = String(format: "%.2f", tg_avg);
let pp_std_str = String(format: "%.2f", pp_std);
let tg_std_str = String(format: "%.2f", tg_std);
var result = ""
result += String("| model | size | params | backend | test | t/s |\n")
result += String("| --- | --- | --- | --- | --- | --- |\n")
result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | pp \(pp) | \(pp_avg_str) ± \(pp_std_str) |\n")
result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | tg \(tg) | \(tg_avg_str) ± \(tg_std_str) |\n")
return result;
}
func clear() { func clear() {
tokens_list.removeAll() tokens_list.removeAll()
temporary_invalid_cchars.removeAll() temporary_invalid_cchars.removeAll()
llama_kv_cache_clear(context)
} }
private func tokenize(text: String, add_bos: Bool) -> [llama_token] { private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let utf8Count = text.utf8.count let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0) let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens) let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false) let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)

View file

@ -1,481 +1,483 @@
// !$*UTF8*$! // !$*UTF8*$!
{ {
archiveVersion = 1; archiveVersion = 1;
classes = { classes = {
}; };
objectVersion = 56; objectVersion = 56;
objects = { objects = {
/* Begin PBXBuildFile section */ /* Begin PBXBuildFile section */
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; }; 542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; }; 5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; }; 542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; }; 542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; }; 542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; }; 542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; }; 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; }; 549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; }; 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; }; 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; }; 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; }; 8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; }; 8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; }; 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; }; 8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; }; 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
/* End PBXBuildFile section */ /* End PBXBuildFile section */
/* Begin PBXFileReference section */ /* Begin PBXFileReference section */
542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; }; 542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; }; 542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; }; 542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; }; 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; }; 542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; }; 542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; }; 542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; }; 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; }; 542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; }; 542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; }; 549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; }; 549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; }; 549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; }; 549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; }; 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; }; 8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; }; 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; 8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; }; 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = "<group>"; }; 8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; }; 8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; }; 8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; }; 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */ /* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */ /* Begin PBXFrameworksBuildPhase section */
8A1C83702AC328BD0096AF73 /* Frameworks */ = { 8A1C83702AC328BD0096AF73 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase; isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647; buildActionMask = 2147483647;
files = ( files = (
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */, 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */, 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
); );
runOnlyForDeploymentPostprocessing = 0; runOnlyForDeploymentPostprocessing = 0;
}; };
/* End PBXFrameworksBuildPhase section */ /* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */ /* Begin PBXGroup section */
8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = { 8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */, 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
542376092B0D9C40008E6A1C /* ggml-backend.h */, 542376092B0D9C40008E6A1C /* ggml-backend.h */,
542376062B0D9BEA008E6A1C /* ggml-quants.h */, 542376062B0D9BEA008E6A1C /* ggml-quants.h */,
542376072B0D9BFB008E6A1C /* ggml-quants.c */, 542376072B0D9BFB008E6A1C /* ggml-quants.c */,
549479C82AC9E10B00E0F78B /* ggml-metal.metal */, 549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
549479C62AC9E0F200E0F78B /* ggml-metal.h */, 549479C62AC9E0F200E0F78B /* ggml-metal.h */,
549479C52AC9E0F200E0F78B /* ggml-metal.m */, 549479C52AC9E0F200E0F78B /* ggml-metal.m */,
542EA09B2AC8723900A8AEE9 /* ggml.c */, 542EA09B2AC8723900A8AEE9 /* ggml.c */,
542EA09C2AC8723900A8AEE9 /* ggml.h */, 542EA09C2AC8723900A8AEE9 /* ggml.h */,
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */, 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */, 542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
542EA0A12AC8729100A8AEE9 /* llama.cpp */, 542EA0A12AC8729100A8AEE9 /* llama.cpp */,
542EA0A22AC8729100A8AEE9 /* llama.h */, 542EA0A22AC8729100A8AEE9 /* llama.h */,
); );
name = llama.cpp; name = llama.cpp;
sourceTree = "<group>"; sourceTree = "<group>";
}; };
8A1C836A2AC328BD0096AF73 = { 8A1C836A2AC328BD0096AF73 = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
8A08D1F62AC7383900FE6CD4 /* llama.cpp */, 8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
8A907F312AC7134E006146EA /* llama.cpp.swift */, 8A907F312AC7134E006146EA /* llama.cpp.swift */,
8A3F84232AC4C891005E2EE8 /* models */, 8A3F84232AC4C891005E2EE8 /* models */,
8A1C83752AC328BD0096AF73 /* llama.swiftui */, 8A1C83752AC328BD0096AF73 /* llama.swiftui */,
8A1C83742AC328BD0096AF73 /* Products */, 8A1C83742AC328BD0096AF73 /* Products */,
8A39BE082AC7601000BFEB40 /* Frameworks */, 8A39BE082AC7601000BFEB40 /* Frameworks */,
); );
sourceTree = "<group>"; sourceTree = "<group>";
}; };
8A1C83742AC328BD0096AF73 /* Products */ = { 8A1C83742AC328BD0096AF73 /* Products */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */, 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
); );
name = Products; name = Products;
sourceTree = "<group>"; sourceTree = "<group>";
}; };
8A1C83752AC328BD0096AF73 /* llama.swiftui */ = { 8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
8A3F84102AC4BD85005E2EE8 /* Resources */, 8A3F84102AC4BD85005E2EE8 /* Resources */,
8A9F7C4B2AC332DC008AE1EA /* Models */, 8A9F7C4B2AC332DC008AE1EA /* Models */,
8A9F7C4A2AC332BF008AE1EA /* UI */, 8A9F7C4A2AC332BF008AE1EA /* UI */,
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */, 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */, 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
8A1C837C2AC328BE0096AF73 /* Preview Content */, 8A1C837C2AC328BE0096AF73 /* Preview Content */,
); );
path = llama.swiftui; path = llama.swiftui;
sourceTree = "<group>"; sourceTree = "<group>";
}; };
8A1C837C2AC328BE0096AF73 /* Preview Content */ = { 8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */, 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
); );
path = "Preview Content"; path = "Preview Content";
sourceTree = "<group>"; sourceTree = "<group>";
}; };
8A39BE082AC7601000BFEB40 /* Frameworks */ = { 8A39BE082AC7601000BFEB40 /* Frameworks */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
549479CA2AC9E16000E0F78B /* Metal.framework */, 549479CA2AC9E16000E0F78B /* Metal.framework */,
8A39BE092AC7601000BFEB40 /* Accelerate.framework */, 8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
); );
name = Frameworks; name = Frameworks;
sourceTree = "<group>"; sourceTree = "<group>";
}; };
8A3F84102AC4BD85005E2EE8 /* Resources */ = { 8A3F84102AC4BD85005E2EE8 /* Resources */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
8A3F84112AC4BD8C005E2EE8 /* models */, 8A3F84112AC4BD8C005E2EE8 /* models */,
); );
path = Resources; path = Resources;
sourceTree = "<group>"; sourceTree = "<group>";
}; };
8A3F84112AC4BD8C005E2EE8 /* models */ = { 8A3F84112AC4BD8C005E2EE8 /* models */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */, );
); path = models;
path = models; sourceTree = "<group>";
sourceTree = "<group>"; };
}; 8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
8A907F312AC7134E006146EA /* llama.cpp.swift */ = { isa = PBXGroup;
isa = PBXGroup; children = (
children = ( 8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */, 8A907F322AC7134E006146EA /* LibLlama.swift */,
8A907F322AC7134E006146EA /* LibLlama.swift */, );
); path = llama.cpp.swift;
path = llama.cpp.swift; sourceTree = "<group>";
sourceTree = "<group>"; };
}; 8A9F7C4A2AC332BF008AE1EA /* UI */ = {
8A9F7C4A2AC332BF008AE1EA /* UI */ = { isa = PBXGroup;
isa = PBXGroup; children = (
children = ( 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
8A1C83782AC328BD0096AF73 /* ContentView.swift */, 8A1C83782AC328BD0096AF73 /* ContentView.swift */,
); );
path = UI; path = UI;
sourceTree = "<group>"; sourceTree = "<group>";
}; };
8A9F7C4B2AC332DC008AE1EA /* Models */ = { 8A9F7C4B2AC332DC008AE1EA /* Models */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */, 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
); );
path = Models; path = Models;
sourceTree = "<group>"; sourceTree = "<group>";
}; };
/* End PBXGroup section */ /* End PBXGroup section */
/* Begin PBXNativeTarget section */ /* Begin PBXNativeTarget section */
8A1C83722AC328BD0096AF73 /* llama.swiftui */ = { 8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
isa = PBXNativeTarget; isa = PBXNativeTarget;
buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */; buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
buildPhases = ( buildPhases = (
8A1C836F2AC328BD0096AF73 /* Sources */, 8A1C836F2AC328BD0096AF73 /* Sources */,
8A1C83702AC328BD0096AF73 /* Frameworks */, 8A1C83702AC328BD0096AF73 /* Frameworks */,
8A1C83712AC328BD0096AF73 /* Resources */, 8A1C83712AC328BD0096AF73 /* Resources */,
); );
buildRules = ( buildRules = (
); );
dependencies = ( dependencies = (
); );
name = llama.swiftui; name = llama.swiftui;
packageProductDependencies = ( packageProductDependencies = (
); );
productName = llama.swiftui; productName = llama.swiftui;
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */; productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
productType = "com.apple.product-type.application"; productType = "com.apple.product-type.application";
}; };
/* End PBXNativeTarget section */ /* End PBXNativeTarget section */
/* Begin PBXProject section */ /* Begin PBXProject section */
8A1C836B2AC328BD0096AF73 /* Project object */ = { 8A1C836B2AC328BD0096AF73 /* Project object */ = {
isa = PBXProject; isa = PBXProject;
attributes = { attributes = {
BuildIndependentTargetsInParallel = 1; BuildIndependentTargetsInParallel = 1;
LastSwiftUpdateCheck = 1500; LastSwiftUpdateCheck = 1500;
LastUpgradeCheck = 1500; LastUpgradeCheck = 1500;
TargetAttributes = { TargetAttributes = {
8A1C83722AC328BD0096AF73 = { 8A1C83722AC328BD0096AF73 = {
CreatedOnToolsVersion = 15.0; CreatedOnToolsVersion = 15.0;
LastSwiftMigration = 1500; LastSwiftMigration = 1500;
}; };
}; };
}; };
buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */; buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
compatibilityVersion = "Xcode 14.0"; compatibilityVersion = "Xcode 14.0";
developmentRegion = en; developmentRegion = en;
hasScannedForEncodings = 0; hasScannedForEncodings = 0;
knownRegions = ( knownRegions = (
en, en,
Base, Base,
); );
mainGroup = 8A1C836A2AC328BD0096AF73; mainGroup = 8A1C836A2AC328BD0096AF73;
packageReferences = ( packageReferences = (
); );
productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */; productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
projectDirPath = ""; projectDirPath = "";
projectRoot = ""; projectRoot = "";
targets = ( targets = (
8A1C83722AC328BD0096AF73 /* llama.swiftui */, 8A1C83722AC328BD0096AF73 /* llama.swiftui */,
); );
}; };
/* End PBXProject section */ /* End PBXProject section */
/* Begin PBXResourcesBuildPhase section */ /* Begin PBXResourcesBuildPhase section */
8A1C83712AC328BD0096AF73 /* Resources */ = { 8A1C83712AC328BD0096AF73 /* Resources */ = {
isa = PBXResourcesBuildPhase; isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647; buildActionMask = 2147483647;
files = ( files = (
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */, 542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
8A3F84242AC4C891005E2EE8 /* models in Resources */, 8A3F84242AC4C891005E2EE8 /* models in Resources */,
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */, 8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */, 8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
); );
runOnlyForDeploymentPostprocessing = 0; runOnlyForDeploymentPostprocessing = 0;
}; };
/* End PBXResourcesBuildPhase section */ /* End PBXResourcesBuildPhase section */
/* Begin PBXSourcesBuildPhase section */ /* Begin PBXSourcesBuildPhase section */
8A1C836F2AC328BD0096AF73 /* Sources */ = { 8A1C836F2AC328BD0096AF73 /* Sources */ = {
isa = PBXSourcesBuildPhase; isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647; buildActionMask = 2147483647;
files = ( files = (
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */, 542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */, 549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */, 542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */, 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */, 542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */, 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */, 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */, 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */, 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */,
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */, 542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
); 5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
runOnlyForDeploymentPostprocessing = 0; );
}; runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXSourcesBuildPhase section */ /* End PBXSourcesBuildPhase section */
/* Begin XCBuildConfiguration section */ /* Begin XCBuildConfiguration section */
8A1C837F2AC328BE0096AF73 /* Debug */ = { 8A1C837F2AC328BE0096AF73 /* Debug */ = {
isa = XCBuildConfiguration; isa = XCBuildConfiguration;
buildSettings = { buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO; ALWAYS_SEARCH_USER_PATHS = NO;
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES; CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES; CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES; CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES; CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES; CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES; CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES; CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES; CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES; CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES; CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES; CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES; CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
COPY_PHASE_STRIP = NO; COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = dwarf; DEBUG_INFORMATION_FORMAT = dwarf;
ENABLE_STRICT_OBJC_MSGSEND = YES; ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_TESTABILITY = YES; ENABLE_TESTABILITY = YES;
ENABLE_USER_SCRIPT_SANDBOXING = YES; ENABLE_USER_SCRIPT_SANDBOXING = YES;
GCC_C_LANGUAGE_STANDARD = gnu17; GCC_C_LANGUAGE_STANDARD = gnu17;
GCC_DYNAMIC_NO_PIC = NO; GCC_DYNAMIC_NO_PIC = NO;
GCC_NO_COMMON_BLOCKS = YES; GCC_NO_COMMON_BLOCKS = YES;
GCC_OPTIMIZATION_LEVEL = 0; GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = ( GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1", "DEBUG=1",
"$(inherited)", "$(inherited)",
); );
GCC_WARN_64_TO_32_BIT_CONVERSION = YES; GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES; GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES; GCC_WARN_UNUSED_VARIABLE = YES;
IPHONEOS_DEPLOYMENT_TARGET = 17.0; IPHONEOS_DEPLOYMENT_TARGET = 17.0;
LOCALIZATION_PREFERS_STRING_CATALOGS = YES; LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
MTL_FAST_MATH = YES; MTL_FAST_MATH = YES;
ONLY_ACTIVE_ARCH = YES; ONLY_ACTIVE_ARCH = YES;
SDKROOT = iphoneos; SDKROOT = iphoneos;
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
SWIFT_OPTIMIZATION_LEVEL = "-Onone"; SWIFT_OPTIMIZATION_LEVEL = "-Onone";
}; };
name = Debug; name = Debug;
}; };
8A1C83802AC328BE0096AF73 /* Release */ = { 8A1C83802AC328BE0096AF73 /* Release */ = {
isa = XCBuildConfiguration; isa = XCBuildConfiguration;
buildSettings = { buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO; ALWAYS_SEARCH_USER_PATHS = NO;
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES; CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES; CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES; CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES; CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES; CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES; CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES; CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES; CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES; CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES; CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES; CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES; CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
COPY_PHASE_STRIP = NO; COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
ENABLE_NS_ASSERTIONS = NO; ENABLE_NS_ASSERTIONS = NO;
ENABLE_STRICT_OBJC_MSGSEND = YES; ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_USER_SCRIPT_SANDBOXING = YES; ENABLE_USER_SCRIPT_SANDBOXING = YES;
GCC_C_LANGUAGE_STANDARD = gnu17; GCC_C_LANGUAGE_STANDARD = gnu17;
GCC_NO_COMMON_BLOCKS = YES; GCC_NO_COMMON_BLOCKS = YES;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES; GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES; GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES; GCC_WARN_UNUSED_VARIABLE = YES;
IPHONEOS_DEPLOYMENT_TARGET = 17.0; IPHONEOS_DEPLOYMENT_TARGET = 17.0;
LOCALIZATION_PREFERS_STRING_CATALOGS = YES; LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
MTL_ENABLE_DEBUG_INFO = NO; MTL_ENABLE_DEBUG_INFO = NO;
MTL_FAST_MATH = YES; MTL_FAST_MATH = YES;
SDKROOT = iphoneos; SDKROOT = iphoneos;
SWIFT_COMPILATION_MODE = wholemodule; SWIFT_COMPILATION_MODE = wholemodule;
VALIDATE_PRODUCT = YES; VALIDATE_PRODUCT = YES;
}; };
name = Release; name = Release;
}; };
8A1C83822AC328BE0096AF73 /* Debug */ = { 8A1C83822AC328BE0096AF73 /* Debug */ = {
isa = XCBuildConfiguration; isa = XCBuildConfiguration;
buildSettings = { buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_MODULES = YES;
CODE_SIGN_STYLE = Automatic; CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1; CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\""; DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
DEVELOPMENT_TEAM = STLSG3FG8Q; DEVELOPMENT_TEAM = STLSG3FG8Q;
ENABLE_PREVIEWS = YES; ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES; GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchScreen_Generation = YES; INFOPLIST_KEY_UILaunchScreen_Generation = YES;
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
IPHONEOS_DEPLOYMENT_TARGET = 16.0; IPHONEOS_DEPLOYMENT_TARGET = 16.0;
LD_RUNPATH_SEARCH_PATHS = ( LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)", "$(inherited)",
"@executable_path/Frameworks", "@executable_path/Frameworks",
); );
MARKETING_VERSION = 1.0; MARKETING_VERSION = 1.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift"; PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
PRODUCT_NAME = "$(TARGET_NAME)"; PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES; SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h"; SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
SWIFT_OPTIMIZATION_LEVEL = "-Onone"; SWIFT_OPTIMIZATION_LEVEL = "-Onone";
SWIFT_VERSION = 5.0; SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2"; TARGETED_DEVICE_FAMILY = "1,2";
}; };
name = Debug; name = Debug;
}; };
8A1C83832AC328BE0096AF73 /* Release */ = { 8A1C83832AC328BE0096AF73 /* Release */ = {
isa = XCBuildConfiguration; isa = XCBuildConfiguration;
buildSettings = { buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_MODULES = YES;
CODE_SIGN_STYLE = Automatic; CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1; CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\""; DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
DEVELOPMENT_TEAM = STLSG3FG8Q; DEVELOPMENT_TEAM = STLSG3FG8Q;
ENABLE_PREVIEWS = YES; ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES; GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchScreen_Generation = YES; INFOPLIST_KEY_UILaunchScreen_Generation = YES;
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
IPHONEOS_DEPLOYMENT_TARGET = 16.0; IPHONEOS_DEPLOYMENT_TARGET = 16.0;
LD_RUNPATH_SEARCH_PATHS = ( LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)", "$(inherited)",
"@executable_path/Frameworks", "@executable_path/Frameworks",
); );
MARKETING_VERSION = 1.0; MARKETING_VERSION = 1.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift"; PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
PRODUCT_NAME = "$(TARGET_NAME)"; PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES; SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h"; SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
SWIFT_VERSION = 5.0; SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2"; TARGETED_DEVICE_FAMILY = "1,2";
}; };
name = Release; name = Release;
}; };
/* End XCBuildConfiguration section */ /* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */ /* Begin XCConfigurationList section */
8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = { 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
isa = XCConfigurationList; isa = XCConfigurationList;
buildConfigurations = ( buildConfigurations = (
8A1C837F2AC328BE0096AF73 /* Debug */, 8A1C837F2AC328BE0096AF73 /* Debug */,
8A1C83802AC328BE0096AF73 /* Release */, 8A1C83802AC328BE0096AF73 /* Release */,
); );
defaultConfigurationIsVisible = 0; defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release; defaultConfigurationName = Release;
}; };
8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = { 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
isa = XCConfigurationList; isa = XCConfigurationList;
buildConfigurations = ( buildConfigurations = (
8A1C83822AC328BE0096AF73 /* Debug */, 8A1C83822AC328BE0096AF73 /* Debug */,
8A1C83832AC328BE0096AF73 /* Release */, 8A1C83832AC328BE0096AF73 /* Release */,
); );
defaultConfigurationIsVisible = 0; defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release; defaultConfigurationName = Release;
}; };
/* End XCConfigurationList section */ /* End XCConfigurationList section */
}; };
rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */; rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
} }

View file

@ -3,24 +3,26 @@ import Foundation
@MainActor @MainActor
class LlamaState: ObservableObject { class LlamaState: ObservableObject {
@Published var messageLog = "" @Published var messageLog = ""
@Published var cacheCleared = false
private var llamaContext: LlamaContext? private var llamaContext: LlamaContext?
private var modelUrl: URL? { private var defaultModelUrl: URL? {
Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models") Bundle.main.url(forResource: "ggml-model", withExtension: "gguf", subdirectory: "models")
// Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models") // Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
} }
init() { init() {
do { do {
try loadModel() try loadModel(modelUrl: defaultModelUrl)
} catch { } catch {
messageLog += "Error!\n" messageLog += "Error!\n"
} }
} }
private func loadModel() throws { func loadModel(modelUrl: URL?) throws {
messageLog += "Loading model...\n" messageLog += "Loading model...\n"
if let modelUrl { if let modelUrl {
llamaContext = try LlamaContext.createContext(path: modelUrl.path()) llamaContext = try LlamaContext.create_context(path: modelUrl.path())
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n" messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
} else { } else {
messageLog += "Could not locate model\n" messageLog += "Could not locate model\n"
@ -31,7 +33,7 @@ class LlamaState: ObservableObject {
guard let llamaContext else { guard let llamaContext else {
return return
} }
messageLog += "Attempting to complete text...\n"
await llamaContext.completion_init(text: text) await llamaContext.completion_init(text: text)
messageLog += "\(text)" messageLog += "\(text)"
@ -42,4 +44,42 @@ class LlamaState: ObservableObject {
await llamaContext.clear() await llamaContext.clear()
messageLog += "\n\ndone\n" messageLog += "\n\ndone\n"
} }
func bench() async {
guard let llamaContext else {
return
}
messageLog += "\n"
messageLog += "Running benchmark...\n"
messageLog += "Model info: "
messageLog += await llamaContext.model_info() + "\n"
let t_start = DispatchTime.now().uptimeNanoseconds
await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
let t_end = DispatchTime.now().uptimeNanoseconds
let t_heat = Double(t_end - t_start) / 1_000_000_000.0
messageLog += "Heat up time: \(t_heat) seconds, please wait...\n"
// if more than 5 seconds, then we're probably running on a slow device
if t_heat > 5.0 {
messageLog += "Heat up time is too long, aborting benchmark\n"
return
}
let result = await llamaContext.bench(pp: 512, tg: 128, pl: 1, nr: 3)
messageLog += "\(result)"
messageLog += "\n"
}
func clear() async {
guard let llamaContext else {
return
}
await llamaContext.clear()
messageLog = ""
}
} }

View file

@ -5,24 +5,132 @@ struct ContentView: View {
@State private var multiLineText = "" @State private var multiLineText = ""
private static func cleanupModelCaches() {
// Delete all models (*.gguf)
let fileManager = FileManager.default
let documentsUrl = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
do {
let fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil)
for fileURL in fileURLs {
if fileURL.pathExtension == "gguf" {
try fileManager.removeItem(at: fileURL)
}
}
} catch {
print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)")
}
}
var body: some View { var body: some View {
VStack { VStack {
ScrollView(.vertical) { ScrollView(.vertical, showsIndicators: true) {
Text(llamaState.messageLog) Text(llamaState.messageLog)
.font(.system(size: 12))
.frame(maxWidth: .infinity, alignment: .leading)
.padding()
.onTapGesture {
UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
}
} }
TextEditor(text: $multiLineText) TextEditor(text: $multiLineText)
.frame(height: 200) .frame(height: 80)
.padding() .padding()
.border(Color.gray, width: 0.5) .border(Color.gray, width: 0.5)
Button(action: {
sendText() HStack {
}) { Button("Send") {
Text("Send") sendText()
.padding() }
.background(Color.blue) .padding(8)
.foregroundColor(.white) .background(Color.blue)
.cornerRadius(8) .foregroundColor(.white)
.cornerRadius(8)
Button("Bench") {
bench()
}
.padding(8)
.background(Color.blue)
.foregroundColor(.white)
.cornerRadius(8)
Button("Clear") {
clear()
}
.padding(8)
.background(Color.blue)
.foregroundColor(.white)
.cornerRadius(8)
Button("Copy") {
UIPasteboard.general.string = llamaState.messageLog
}
.padding(8)
.background(Color.blue)
.foregroundColor(.white)
.cornerRadius(8)
}
VStack {
DownloadButton(
llamaState: llamaState,
modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
)
.font(.system(size: 12))
.padding(.top, 4)
.frame(maxWidth: .infinity, alignment: .leading)
DownloadButton(
llamaState: llamaState,
modelName: "TinyLlama-1.1B (Q8_0, 1.1 GiB)",
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true",
filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf"
)
.font(.system(size: 12))
DownloadButton(
llamaState: llamaState,
modelName: "TinyLlama-1.1B (F16, 2.2 GiB)",
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
filename: "tinyllama-1.1b-f16.gguf"
)
.font(.system(size: 12))
.frame(maxWidth: .infinity, alignment: .leading)
DownloadButton(
llamaState: llamaState,
modelName: "Phi-2.7B (Q4_0, 1.6 GiB)",
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
filename: "phi-2-q4_0.gguf"
)
.font(.system(size: 12))
DownloadButton(
llamaState: llamaState,
modelName: "Phi-2.7B (Q8_0, 2.8 GiB)",
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
filename: "phi-2-q8_0.gguf"
)
.font(.system(size: 12))
.frame(maxWidth: .infinity, alignment: .leading)
DownloadButton(
llamaState: llamaState,
modelName: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)",
modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
filename: "mistral-7b-v0.1.Q4_0.gguf"
)
.font(.system(size: 12))
Button("Clear downloaded models") {
ContentView.cleanupModelCaches()
llamaState.cacheCleared = true
}
.padding(8)
.font(.system(size: 12))
} }
} }
.padding() .padding()
@ -34,9 +142,20 @@ struct ContentView: View {
multiLineText = "" multiLineText = ""
} }
} }
func bench() {
Task {
await llamaState.bench()
}
}
func clear() {
Task {
await llamaState.clear()
}
}
} }
/*
#Preview { //#Preview {
ContentView() // ContentView()
} //}
*/

View file

@ -0,0 +1,122 @@
import SwiftUI
struct DownloadButton: View {
@ObservedObject private var llamaState: LlamaState
private var modelName: String
private var modelUrl: String
private var filename: String
@State private var status: String
@State private var downloadTask: URLSessionDownloadTask?
@State private var progress = 0.0
@State private var observation: NSKeyValueObservation?
private static func getFileURL(filename: String) -> URL {
FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename)
}
private func checkFileExistenceAndUpdateStatus() {
}
init(llamaState: LlamaState, modelName: String, modelUrl: String, filename: String) {
self.llamaState = llamaState
self.modelName = modelName
self.modelUrl = modelUrl
self.filename = filename
let fileURL = DownloadButton.getFileURL(filename: filename)
status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
}
private func download() {
status = "downloading"
print("Downloading model \(modelName) from \(modelUrl)")
guard let url = URL(string: modelUrl) else { return }
let fileURL = DownloadButton.getFileURL(filename: filename)
downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
if let error = error {
print("Error: \(error.localizedDescription)")
return
}
guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
print("Server error!")
return
}
do {
if let temporaryURL = temporaryURL {
try FileManager.default.copyItem(at: temporaryURL, to: fileURL)
print("Writing to \(filename) completed")
llamaState.cacheCleared = false
status = "downloaded"
}
} catch let err {
print("Error: \(err.localizedDescription)")
}
}
observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in
self.progress = progress.fractionCompleted
}
downloadTask?.resume()
}
var body: some View {
VStack {
if status == "download" {
Button(action: download) {
Text("Download " + modelName)
}
} else if status == "downloading" {
Button(action: {
downloadTask?.cancel()
status = "download"
}) {
Text("\(modelName) (Downloading \(Int(progress * 100))%)")
}
} else if status == "downloaded" {
Button(action: {
let fileURL = DownloadButton.getFileURL(filename: filename)
if !FileManager.default.fileExists(atPath: fileURL.path) {
download()
return
}
do {
try llamaState.loadModel(modelUrl: fileURL)
} catch let err {
print("Error: \(err.localizedDescription)")
}
}) {
Text("\(modelName) (Downloaded)")
}
} else {
Text("Unknown status")
}
}
.onDisappear() {
downloadTask?.cancel()
}
.onChange(of: llamaState.cacheCleared) { newValue in
if newValue {
downloadTask?.cancel()
let fileURL = DownloadButton.getFileURL(filename: filename)
status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
}
}
}
}
// #Preview {
// DownloadButton(
// llamaState: LlamaState(),
// modelName: "TheBloke / TinyLlama-1.1B-1T-OpenOrca-GGUF (Q4_0)",
// modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
// filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
// )
// }

View file

@ -10,7 +10,8 @@
// crash the server in debug mode, otherwise send an http 500 error // crash the server in debug mode, otherwise send an http 500 error
#define CPPHTTPLIB_NO_EXCEPTIONS 1 #define CPPHTTPLIB_NO_EXCEPTIONS 1
#endif #endif
// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
#include "httplib.h" #include "httplib.h"
#include "json.hpp" #include "json.hpp"
@ -2413,7 +2414,7 @@ json oaicompat_completion_params_parse(
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0); llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);
if (llama_params.count("grammar") != 0) { if (body.count("grammar") != 0) {
llama_params["grammar"] = json_value(body, "grammar", json::object()); llama_params["grammar"] = json_value(body, "grammar", json::object());
} }
@ -2644,6 +2645,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
#if SERVER_VERBOSE != 1
log_disable();
#endif
// own arguments required by this example // own arguments required by this example
gpt_params params; gpt_params params;
server_params sparams; server_params sparams;
@ -2698,7 +2702,7 @@ int main(int argc, char **argv)
} }
// API key is invalid or not provided // API key is invalid or not provided
res.set_content("Unauthorized: Invalid API Key", "text/plain"); res.set_content("Unauthorized: Invalid API Key", "text/plain; charset=utf-8");
res.status = 401; // Unauthorized res.status = 401; // Unauthorized
LOG_WARNING("Unauthorized: Invalid API Key", {}); LOG_WARNING("Unauthorized: Invalid API Key", {});
@ -2713,28 +2717,28 @@ int main(int argc, char **argv)
// this is only called if no index.html is found in the public --path // this is only called if no index.html is found in the public --path
svr.Get("/", [](const httplib::Request &, httplib::Response &res) svr.Get("/", [](const httplib::Request &, httplib::Response &res)
{ {
res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html"); res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html; charset=utf-8");
return false; return false;
}); });
// this is only called if no index.js is found in the public --path // this is only called if no index.js is found in the public --path
svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res) svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res)
{ {
res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript"); res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript; charset=utf-8");
return false; return false;
}); });
// this is only called if no index.html is found in the public --path // this is only called if no index.html is found in the public --path
svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res) svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res)
{ {
res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript"); res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript; charset=utf-8");
return false; return false;
}); });
// this is only called if no index.html is found in the public --path // this is only called if no index.html is found in the public --path
svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res) svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res)
{ {
res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript"); res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8");
return false; return false;
}); });
@ -2745,7 +2749,7 @@ int main(int argc, char **argv)
{ "user_name", llama.name_user.c_str() }, { "user_name", llama.name_user.c_str() },
{ "assistant_name", llama.name_assistant.c_str() } { "assistant_name", llama.name_assistant.c_str() }
}; };
res.set_content(data.dump(), "application/json"); res.set_content(data.dump(), "application/json; charset=utf-8");
}); });
svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
@ -2759,12 +2763,12 @@ int main(int argc, char **argv)
std::string completion_text; std::string completion_text;
task_result result = llama.next_result(task_id); task_result result = llama.next_result(task_id);
if (!result.error && result.stop) { if (!result.error && result.stop) {
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
} }
else else
{ {
res.status = 404; res.status = 404;
res.set_content(result.result_json["content"], "text/plain"); res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
return; return;
} }
} else { } else {
@ -2835,7 +2839,7 @@ int main(int argc, char **argv)
}} }}
}; };
res.set_content(models.dump(), "application/json"); res.set_content(models.dump(), "application/json; charset=utf-8");
}); });
// TODO: add mount point without "/v1" prefix -- how? // TODO: add mount point without "/v1" prefix -- how?
@ -2857,10 +2861,10 @@ int main(int argc, char **argv)
res.set_content(oaicompat_result.dump(-1, ' ', false, res.set_content(oaicompat_result.dump(-1, ' ', false,
json::error_handler_t::replace), json::error_handler_t::replace),
"application/json"); "application/json; charset=utf-8");
} else { } else {
res.status = 500; res.status = 500;
res.set_content(result.result_json["content"], "text/plain"); res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
return; return;
} }
} else { } else {
@ -2924,12 +2928,12 @@ int main(int argc, char **argv)
task_result result = llama.next_result(task_id); task_result result = llama.next_result(task_id);
if (!result.error && result.stop) if (!result.error && result.stop)
{ {
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
} }
else else
{ {
res.status = 404; res.status = 404;
res.set_content(result.result_json["content"], "text/plain"); res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
return; return;
} }
} else { } else {
@ -2978,11 +2982,11 @@ int main(int argc, char **argv)
svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res) svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res)
{ {
const json data = llama.get_model_props(); const json data = llama.get_model_props();
return res.set_content(data.dump(), "application/json"); return res.set_content(data.dump(), "application/json; charset=utf-8");
}); });
svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res) svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res)
{ return res.set_content("", "application/json"); }); { return res.set_content("", "application/json; charset=utf-8"); });
svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res) svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
{ {
@ -2993,7 +2997,7 @@ int main(int argc, char **argv)
tokens = llama.tokenize(body["content"], false); tokens = llama.tokenize(body["content"], false);
} }
const json data = format_tokenizer_response(tokens); const json data = format_tokenizer_response(tokens);
return res.set_content(data.dump(), "application/json"); return res.set_content(data.dump(), "application/json; charset=utf-8");
}); });
svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res) svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
@ -3007,7 +3011,7 @@ int main(int argc, char **argv)
} }
const json data = format_detokenized_response(content); const json data = format_detokenized_response(content);
return res.set_content(data.dump(), "application/json"); return res.set_content(data.dump(), "application/json; charset=utf-8");
}); });
svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res) svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
@ -3024,7 +3028,7 @@ int main(int argc, char **argv)
} }
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1); const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
task_result result = llama.next_result(task_id); task_result result = llama.next_result(task_id);
return res.set_content(result.result_json.dump(), "application/json"); return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
}); });
svr.set_logger(log_server_request); svr.set_logger(log_server_request);
@ -3045,7 +3049,7 @@ int main(int argc, char **argv)
{ {
snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
} }
res.set_content(buf, "text/plain"); res.set_content(buf, "text/plain; charset=utf-8");
res.status = 500; res.status = 500;
}); });
@ -3053,15 +3057,15 @@ int main(int argc, char **argv)
{ {
if (res.status == 401) if (res.status == 401)
{ {
res.set_content("Unauthorized", "text/plain"); res.set_content("Unauthorized", "text/plain; charset=utf-8");
} }
if (res.status == 400) if (res.status == 400)
{ {
res.set_content("Invalid request", "text/plain"); res.set_content("Invalid request", "text/plain; charset=utf-8");
} }
else if (res.status == 404) else if (res.status == 404)
{ {
res.set_content("File Not Found", "text/plain"); res.set_content("File Not Found", "text/plain; charset=utf-8");
res.status = 404; res.status = 404;
} }
}); });

View file

@ -31,6 +31,7 @@
#define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_32F HIPBLAS_R_32F #define CUDA_R_32F HIPBLAS_R_32F
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate #define cublasCreate hipblasCreate
#define cublasGemmEx hipblasGemmEx #define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx #define cublasGemmBatchedEx hipblasGemmBatchedEx
@ -40,6 +41,7 @@
#define cublasSetStream hipblasSetStream #define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm #define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t #define cublasStatus_t hipblasStatus_t
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@ -4998,7 +5000,16 @@ static __global__ void rope_neox(
const int ib = col / n_dims; const int ib = col / n_dims;
const int ic = col % n_dims; const int ic = col % n_dims;
const int i = row*ncols + ib*n_dims + ic/2; if (ib > 0) {
const int i = row*ncols + ib*n_dims + ic;
dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1];
return;
}
const int i = row*ncols + ib*n_dims + ic/2;
const int i2 = row/p_delta_rows; const int i2 = row/p_delta_rows;
float cur_rot = inv_ndims * ic - ib; float cur_rot = inv_ndims * ic - ib;
@ -7057,6 +7068,7 @@ inline void ggml_cuda_op_upscale(
(void) src1; (void) src1;
(void) dst; (void) dst;
(void) src1_dd;
} }
inline void ggml_cuda_op_pad( inline void ggml_cuda_op_pad(
@ -7073,6 +7085,7 @@ inline void ggml_cuda_op_pad(
(void) src1; (void) src1;
(void) dst; (void) dst;
(void) src1_dd;
} }
inline void ggml_cuda_op_rms_norm( inline void ggml_cuda_op_rms_norm(
@ -7376,7 +7389,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
const int compute_capability = g_compute_capabilities[id]; const int compute_capability = g_compute_capabilities[id];
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
half * src0_as_f16 = nullptr; half * src0_as_f16 = nullptr;
size_t src0_as = 0; size_t src0_as = 0;
@ -7817,6 +7830,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
} }
#ifdef NDEBUG #ifdef NDEBUG
for (int id = 0; id < g_device_count; ++id) {
CUDA_CHECK(ggml_cuda_set_device(id));
CUDA_CHECK(cudaDeviceSynchronize());
}
for (int id = 0; id < g_device_count; ++id) { for (int id = 0; id < g_device_count; ++id) {
CUDA_CHECK(ggml_cuda_set_device(id)); CUDA_CHECK(ggml_cuda_set_device(id));
@ -7868,8 +7886,6 @@ static void ggml_cuda_op_mul_mat(
const int nb2 = dst->nb[2]; const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3]; const int nb3 = dst->nb[3];
ggml_cuda_set_peer_access(ne11);
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
@ -8300,27 +8316,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
} }
static __global__ void k_compute_batched_ptrs( static __global__ void k_compute_batched_ptrs(
const half * src0_as_f16, const half * src1_as_f16, half * dst_f16, const half * src0_as_f16, const half * src1_as_f16, char * dst,
const void ** ptrs_src, void ** ptrs_dst, const void ** ptrs_src, void ** ptrs_dst,
int ne12, int ne13, int64_t ne12, int64_t ne13,
int ne23, int64_t ne23,
int nb02, int nb03, size_t nb02, size_t nb03,
int nb12, int nb13, size_t nb12, size_t nb13,
int nb2, int nb3, size_t nbd2, size_t nbd3,
int r2, int r3) { int64_t r2, int64_t r3) {
int i13 = blockIdx.x * blockDim.x + threadIdx.x; int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
int i12 = blockIdx.y * blockDim.y + threadIdx.y; int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
if (i13 >= ne13 || i12 >= ne12) { if (i13 >= ne13 || i12 >= ne12) {
return; return;
} }
int i03 = i13 / r3; int64_t i03 = i13 / r3;
int i02 = i12 / r2; int64_t i02 = i12 / r2;
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2; ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2; ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
} }
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -8376,7 +8392,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream); to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
size_t dst_as = 0; size_t dst_as = 0;
half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
half * dst_f16 = nullptr;
char * dst_t = nullptr;
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
cudaDataType_t cu_data_type = CUDA_R_16F;
// dst strides
size_t nbd2 = dst->nb[2];
size_t nbd3 = dst->nb[3];
const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f;
const float alpha_f32 = 1.0f;
const float beta_f32 = 0.0f;
const void * alpha = &alpha_f16;
const void * beta = &beta_f16;
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
dst_t = (char *) dst_f16;
nbd2 /= sizeof(float) / sizeof(half);
nbd3 /= sizeof(float) / sizeof(half);
} else {
dst_t = (char *) dst_ddf;
cu_compute_type = CUBLAS_COMPUTE_32F;
cu_data_type = CUDA_R_32F;
alpha = &alpha_f32;
beta = &beta_f32;
}
GGML_ASSERT(ne12 % ne02 == 0); GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0); GGML_ASSERT(ne13 % ne03 == 0);
@ -8385,9 +8435,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
const int64_t r2 = ne12/ne02; const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03; const int64_t r3 = ne13/ne03;
const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f;
#if 0 #if 0
// use cublasGemmEx // use cublasGemmEx
{ {
@ -8397,12 +8444,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
int i02 = i12 / r2; int i02 = i12 / r2;
CUBLAS_CHECK( CUBLAS_CHECK(
cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10, ne01, ne11, ne10,
&alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
&beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01, beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
CUBLAS_COMPUTE_16F, cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} }
} }
@ -8414,11 +8461,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
CUBLAS_CHECK( CUBLAS_CHECK(
cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10, ne01, ne11, ne10,
&alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
(const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
&beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
ne12*ne13, ne12*ne13,
CUBLAS_COMPUTE_16F, cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else { } else {
// use cublasGemmBatchedEx // use cublasGemmBatchedEx
@ -8435,24 +8482,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
dim3 block_dims(ne13, ne12); dim3 block_dims(ne13, ne12);
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
src0_as_f16, src1_as_f16, dst_f16, src0_as_f16, src1_as_f16, dst_t,
ptrs_src, ptrs_dst, ptrs_src, ptrs_dst,
ne12, ne13, ne12, ne13,
ne23, ne23,
nb02, nb03, nb02, nb03,
nb12, nb13, nb12, nb13,
dst->nb[2], dst->nb[3], nbd2, nbd3,
r2, r3); r2, r3);
CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaGetLastError());
CUBLAS_CHECK( CUBLAS_CHECK(
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10, ne01, ne11, ne10,
&alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half), alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float), (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
&beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01, beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
ne23, ne23,
CUBLAS_COMPUTE_16F, cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
if (ptrs_src_s != 0) { if (ptrs_src_s != 0) {
@ -8464,11 +8511,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
} }
#endif #endif
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); if (dst->op_params[0] == GGML_PREC_DEFAULT) {
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
ggml_cuda_pool_free(dst_f16, dst_as);
}
ggml_cuda_pool_free(src1_as_f16, src1_as); ggml_cuda_pool_free(src1_as_f16, src1_as);
ggml_cuda_pool_free(dst_f16, dst_as);
} }
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -8734,16 +8784,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
const int64_t nb11 = src1->nb[1];
const int64_t nb1 = dst->nb[1];
const struct ggml_tensor * ids = src0; const struct ggml_tensor * ids = src0;
const int32_t id = ((int32_t *) dst->op_params)[0]; const int32_t id = ((int32_t *) dst->op_params)[0];
const int32_t n_as = ((int32_t *) dst->op_params)[1]; const int32_t n_as = ((int32_t *) dst->op_params)[1];
std::vector<char> ids_host(ggml_nbytes(ids)); std::vector<char> ids_host(ggml_nbytes(ids));
const cudaStream_t stream = g_cudaStreams[g_main_device][0];
if (ids->backend == GGML_BACKEND_GPU) { if (ids->backend == GGML_BACKEND_GPU) {
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); CUDA_CHECK(cudaStreamSynchronize(stream));
} else { } else {
memcpy(ids_host.data(), ids->data, ggml_nbytes(ids)); memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
} }
@ -8757,37 +8812,93 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
ggml_tensor src1_row = *src1; ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst; ggml_tensor dst_row = *dst;
src1_row.ne[1] = 1;
dst_row.ne[1] = 1;
src1_row.nb[2] = src1_row.nb[1];
dst_row.nb[2] = dst_row.nb[1];
src1_row.nb[3] = src1_row.nb[1];
dst_row.nb[3] = dst_row.nb[1];
src1_row.extra = &src1_row_extra; src1_row.extra = &src1_row_extra;
dst_row.extra = &dst_row_extra; dst_row.extra = &dst_row_extra;
char * src1_original = (char *) src1_extra->data_device[g_main_device];
char * dst_original = (char *) dst_extra->data_device[g_main_device];
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { if (src1->ne[1] == 1) {
//int32_t row_id; for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
//CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); //int32_t row_id;
//CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
//CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(row_id >= 0 && row_id < n_as); GGML_ASSERT(row_id >= 0 && row_id < n_as);
const struct ggml_tensor * src0_row = dst->src[row_id + 2]; const struct ggml_tensor * src0_row = dst->src[row_id + 2];
src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1]; src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
src1_row.data = (char *) src1->data + i01*src1->nb[1]; src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1]; dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
dst_row.data = (char *) dst->data + i01*dst->nb[1]; dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
}
} else {
size_t as_src1, as_dst;
char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);
src1_row_extra.data_device[g_main_device] = src1_contiguous;
dst_row_extra.data_device[g_main_device] = dst_contiguous;
for (int32_t row_id = 0; row_id < n_as; ++row_id) {
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
int64_t num_src1_rows = 0;
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
if (row_id_i != row_id) {
continue;
}
GGML_ASSERT(row_id >= 0 && row_id < n_as);
CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
nb11, cudaMemcpyDeviceToDevice, stream));
num_src1_rows++;
}
if (num_src1_rows == 0) {
continue;
}
src1_row.ne[1] = num_src1_rows;
dst_row.ne[1] = num_src1_rows;
src1_row.nb[1] = nb11;
src1_row.nb[2] = num_src1_rows*nb11;
src1_row.nb[3] = num_src1_rows*nb11;
dst_row.nb[1] = nb1;
dst_row.nb[2] = num_src1_rows*nb1;
dst_row.nb[3] = num_src1_rows*nb1;
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
num_src1_rows = 0;
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
if (row_id_i != row_id) {
continue;
}
GGML_ASSERT(row_id >= 0 && row_id < n_as);
CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
nb1, cudaMemcpyDeviceToDevice, stream));
num_src1_rows++;
}
}
ggml_cuda_pool_free(src1_contiguous, as_src1);
ggml_cuda_pool_free(dst_contiguous, as_dst);
} }
} }
@ -9323,6 +9434,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
return false; return false;
} }
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
}
if (params->ith != 0) { if (params->ith != 0) {
return true; return true;
} }

View file

@ -1702,8 +1702,9 @@ kernel void kernel_rope(
dst_data[1] = x0*sin_theta + x1*cos_theta; dst_data[1] = x0*sin_theta + x1*cos_theta;
} }
} else { } else {
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { if (ic < n_dims) {
const int64_t ib = 0;
// simplified from `(ib * n_dims + ic) * inv_ndims` // simplified from `(ib * n_dims + ic) * inv_ndims`
const float cur_rot = inv_ndims*ic - ib; const float cur_rot = inv_ndims*ic - ib;
@ -1722,6 +1723,14 @@ kernel void kernel_rope(
dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
} else {
const int64_t i0 = ic;
device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
} }
} }
} }

46
ggml.c
View file

@ -4098,6 +4098,14 @@ struct ggml_tensor * ggml_mul_mat(
return result; return result;
} }
void ggml_mul_mat_set_prec(
struct ggml_tensor * a,
enum ggml_prec prec) {
const int32_t prec_i32 = (int32_t) prec;
ggml_set_op_params_i32(a, 0, prec_i32);
}
// ggml_mul_mat_id // ggml_mul_mat_id
struct ggml_tensor * ggml_mul_mat_id( struct ggml_tensor * ggml_mul_mat_id(
@ -9168,6 +9176,8 @@ static void ggml_compute_forward_norm_f32(
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps > 0.0f);
// TODO: optimize // TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i02 = 0; i02 < ne02; i02++) {
@ -9237,6 +9247,8 @@ static void ggml_compute_forward_rms_norm_f32(
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps > 0.0f);
// TODO: optimize // TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i02 = 0; i02 < ne02; i02++) {
@ -11562,10 +11574,13 @@ static void ggml_compute_forward_rope_f32(
} }
} else { } else {
// TODO: this might be wrong for ne0 != n_dims - need double check // TODO: this might be wrong for ne0 != n_dims - need double check
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 // it seems we have to rope just the first n_dims elements and do nothing with the rest
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
theta_base *= freq_scale; theta_base *= freq_scale;
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < ne0; ic += 2) {
for (int64_t ic = 0; ic < n_dims; ic += 2) { if (ic < n_dims) {
const int64_t ib = 0;
// simplified from `(ib * n_dims + ic) * inv_ndims` // simplified from `(ib * n_dims + ic) * inv_ndims`
float cur_rot = inv_ndims * ic - ib; float cur_rot = inv_ndims * ic - ib;
@ -11588,6 +11603,14 @@ static void ggml_compute_forward_rope_f32(
dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
} else {
const int64_t i0 = ic;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
} }
} }
} }
@ -11715,10 +11738,13 @@ static void ggml_compute_forward_rope_f16(
} }
} else { } else {
// TODO: this might be wrong for ne0 != n_dims - need double check // TODO: this might be wrong for ne0 != n_dims - need double check
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 // it seems we have to rope just the first n_dims elements and do nothing with the rest
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
theta_base *= freq_scale; theta_base *= freq_scale;
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < ne0; ic += 2) {
for (int64_t ic = 0; ic < n_dims; ic += 2) { if (ic < n_dims) {
const int64_t ib = 0;
// simplified from `(ib * n_dims + ic) * inv_ndims` // simplified from `(ib * n_dims + ic) * inv_ndims`
float cur_rot = inv_ndims * ic - ib; float cur_rot = inv_ndims * ic - ib;
@ -11741,6 +11767,14 @@ static void ggml_compute_forward_rope_f16(
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
} else {
const int64_t i0 = ic;
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
} }
} }
} }

14
ggml.h
View file

@ -303,7 +303,7 @@ extern "C" {
#if defined(__ARM_NEON) && defined(__CUDACC__) #if defined(__ARM_NEON) && defined(__CUDACC__)
typedef half ggml_fp16_t; typedef half ggml_fp16_t;
#elif defined(__ARM_NEON) #elif defined(__ARM_NEON) && !defined(_MSC_VER)
typedef __fp16 ggml_fp16_t; typedef __fp16 ggml_fp16_t;
#else #else
typedef uint16_t ggml_fp16_t; typedef uint16_t ggml_fp16_t;
@ -343,6 +343,12 @@ extern "C" {
GGML_TYPE_COUNT, GGML_TYPE_COUNT,
}; };
// precision
enum ggml_prec {
GGML_PREC_DEFAULT,
GGML_PREC_F32,
};
enum ggml_backend_type { enum ggml_backend_type {
GGML_BACKEND_CPU = 0, GGML_BACKEND_CPU = 0,
GGML_BACKEND_GPU = 10, GGML_BACKEND_GPU = 10,
@ -1057,6 +1063,12 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// change the precision of a matrix multiplication
// set to GGML_PREC_F32 for higher precision (useful for phi-2)
GGML_API void ggml_mul_mat_set_prec(
struct ggml_tensor * a,
enum ggml_prec prec);
// indirect matrix multiplication // indirect matrix multiplication
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b) // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
GGML_API struct ggml_tensor * ggml_mul_mat_id( GGML_API struct ggml_tensor * ggml_mul_mat_id(

View file

@ -95,6 +95,7 @@ class MODEL_ARCH(IntEnum):
BLOOM = auto() BLOOM = auto()
STABLELM = auto() STABLELM = auto()
QWEN = auto() QWEN = auto()
PHI2 = auto()
class MODEL_TENSOR(IntEnum): class MODEL_TENSOR(IntEnum):
@ -140,6 +141,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.BLOOM: "bloom",
MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.QWEN: "qwen", MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.PHI2: "phi2",
} }
TENSOR_NAMES: dict[MODEL_TENSOR, str] = { TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -359,6 +361,17 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.PHI2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
]
# TODO # TODO
} }

View file

@ -18,6 +18,7 @@ class TensorNameMap:
"embeddings.word_embeddings", # bert "embeddings.word_embeddings", # bert
"language_model.embedding.word_embeddings", # persimmon "language_model.embedding.word_embeddings", # persimmon
"wte", # gpt2 "wte", # gpt2
"transformer.embd.wte", # phi2
), ),
# Token type embeddings # Token type embeddings
@ -43,6 +44,7 @@ class TensorNameMap:
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen
"output", # llama-pth bloom "output", # llama-pth bloom
"word_embeddings_for_head", # persimmon "word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2
), ),
# Output norm # Output norm
@ -55,6 +57,7 @@ class TensorNameMap:
"transformer.norm_f", # mpt "transformer.norm_f", # mpt
"ln_f", # refact bloom qwen gpt2 "ln_f", # refact bloom qwen gpt2
"language_model.encoder.final_layernorm", # persimmon "language_model.encoder.final_layernorm", # persimmon
"lm_head.ln", # phi2
), ),
# Rope frequencies # Rope frequencies
@ -78,6 +81,7 @@ class TensorNameMap:
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"model.layers.{bid}.ln1", # yi "model.layers.{bid}.ln1", # yi
"h.{bid}.ln_1", # gpt2 "h.{bid}.ln_1", # gpt2
"transformer.h.{bid}.ln", # phi2
), ),
# Attention norm 2 # Attention norm 2
@ -94,6 +98,7 @@ class TensorNameMap:
"h.{bid}.self_attention.query_key_value", # bloom "h.{bid}.self_attention.query_key_value", # bloom
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
"h.{bid}.attn.c_attn", # gpt2 "h.{bid}.attn.c_attn", # gpt2
"transformer.h.{bid}.mixer.Wqkv", # phi2
), ),
# Attention query # Attention query
@ -133,6 +138,7 @@ class TensorNameMap:
"transformer.h.{bid}.attn.out_proj", # gpt-j "transformer.h.{bid}.attn.out_proj", # gpt-j
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
"h.{bid}.attn.c_proj", # gpt2 "h.{bid}.attn.c_proj", # gpt2
"transformer.h.{bid}.mixer.out_proj", # phi2
), ),
# Rotary embeddings # Rotary embeddings
@ -174,6 +180,7 @@ class TensorNameMap:
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
"transformer.h.{bid}.mlp.w1", # qwen "transformer.h.{bid}.mlp.w1", # qwen
"h.{bid}.mlp.c_fc", # gpt2 "h.{bid}.mlp.c_fc", # gpt2
"transformer.h.{bid}.mlp.fc1", # phi2
), ),
MODEL_TENSOR.FFN_UP_EXP: ( MODEL_TENSOR.FFN_UP_EXP: (
@ -206,6 +213,7 @@ class TensorNameMap:
"transformer.h.{bid}.mlp.fc_out", # gpt-j "transformer.h.{bid}.mlp.fc_out", # gpt-j
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
"h.{bid}.mlp.c_proj", # gpt2 "h.{bid}.mlp.c_proj", # gpt2
"transformer.h.{bid}.mlp.fc2", # phi2
), ),
MODEL_TENSOR.FFN_DOWN_EXP: ( MODEL_TENSOR.FFN_DOWN_EXP: (

View file

@ -109,8 +109,10 @@ class SpecialVocab:
return True return True
def _set_special_token(self, typ: str, tid: Any) -> None: def _set_special_token(self, typ: str, tid: Any) -> None:
if not isinstance(tid, int) or tid < 0: if not isinstance(tid, int):
return return
if tid < 0:
raise ValueError(f'invalid value for special token type {typ}: {tid}')
if self.n_vocab is None or tid < self.n_vocab: if self.n_vocab is None or tid < self.n_vocab:
if typ in self.special_token_ids: if typ in self.special_token_ids:
return return

503
llama.cpp
View file

@ -195,6 +195,7 @@ enum llm_arch {
LLM_ARCH_BLOOM, LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM, LLM_ARCH_STABLELM,
LLM_ARCH_QWEN, LLM_ARCH_QWEN,
LLM_ARCH_PHI2,
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };
@ -212,6 +213,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
{ LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_STABLELM, "stablelm" },
{ LLM_ARCH_QWEN, "qwen" }, { LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_PHI2, "phi2" },
}; };
enum llm_kv { enum llm_kv {
@ -559,6 +561,19 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
}, },
}, },
{
LLM_ARCH_PHI2,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{ {
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
@ -1430,6 +1445,7 @@ struct llama_model {
struct ggml_tensor * output_norm; struct ggml_tensor * output_norm;
struct ggml_tensor * output_norm_b; struct ggml_tensor * output_norm_b;
struct ggml_tensor * output; struct ggml_tensor * output;
struct ggml_tensor * output_b;
std::vector<llama_layer> layers; std::vector<llama_layer> layers;
@ -1515,6 +1531,10 @@ struct llama_context {
// decode output (2-dimensional array: [n_tokens][n_vocab]) // decode output (2-dimensional array: [n_tokens][n_vocab])
std::vector<float> logits; std::vector<float> logits;
#ifndef NDEBUG
// guard against access to unset logits
std::vector<bool> logits_valid;
#endif
bool logits_all = false; bool logits_all = false;
// input embedding (1-dimensional array: [n_embd]) // input embedding (1-dimensional array: [n_embd])
@ -1943,7 +1963,7 @@ namespace GGUFMeta {
target = override->bool_value; target = override->bool_value;
return true; return true;
} }
return true; return false;
} }
template<typename OT> template<typename OT>
@ -2403,25 +2423,25 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
switch (ftype) { switch (ftype) {
case LLAMA_FTYPE_ALL_F32: return "all F32"; case LLAMA_FTYPE_ALL_F32: return "all F32";
case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; case LLAMA_FTYPE_MOSTLY_F16: return "F16";
case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
return "mostly Q4_1, some F16"; return "Q4_1, some F16";
case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
// K-quants // K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K"; case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small"; case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large"; case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small"; case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small"; case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K"; case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
default: return "unknown, may not work"; default: return "unknown, may not work";
} }
@ -2540,6 +2560,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 22: model.type = e_model::MODEL_1B; break;
case 26: model.type = e_model::MODEL_3B; break; case 26: model.type = e_model::MODEL_3B; break;
case 32: model.type = e_model::MODEL_7B; break; case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_13B; break; case 40: model.type = e_model::MODEL_13B; break;
@ -2641,6 +2662,15 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_PHI2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
case 32: model.type = e_model::MODEL_3B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_GPT2: case LLM_ARCH_GPT2:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@ -3001,7 +3031,7 @@ static void llm_load_tensors(
(void) main_gpu; (void) main_gpu;
enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU; enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
@ -3644,6 +3674,73 @@ static void llm_load_tensors(
} }
} }
} break; } break;
case LLM_ARCH_PHI2:
{
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
// output
{
ggml_backend_type backend_norm;
ggml_backend_type backend_output;
if (n_gpu_layers > int(n_layer)) {
backend_norm = llama_backend_offload;
backend_output = llama_backend_offload;
} else {
backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU;
}
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) {
vram_weights += ggml_nbytes(model.output_norm);
vram_weights += ggml_nbytes(model.output_norm_b);
vram_weights += ggml_nbytes(model.output);
vram_weights += ggml_nbytes(model.output_b);
}
}
const uint32_t n_ff = hparams.n_ff;
const int i_gpu_start = n_layer - n_gpu_layers;
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) +
ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b);
}
}
} break;
case LLM_ARCH_GPT2: case LLM_ARCH_GPT2:
{ {
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
@ -3716,7 +3813,6 @@ static void llm_load_tensors(
} }
} }
} break; } break;
default: default:
throw std::runtime_error("unknown architecture"); throw std::runtime_error("unknown architecture");
} }
@ -4077,6 +4173,7 @@ static struct ggml_tensor * llm_build_ffn(
// if max_alibi_bias > 0 then apply ALiBi // if max_alibi_bias > 0 then apply ALiBi
static struct ggml_tensor * llm_build_kqv( static struct ggml_tensor * llm_build_kqv(
struct ggml_context * ctx, struct ggml_context * ctx,
const llama_model & model,
const llama_hparams & hparams, const llama_hparams & hparams,
const llama_kv_cache & kv, const llama_kv_cache & kv,
struct ggml_tensor * wo, struct ggml_tensor * wo,
@ -4088,6 +4185,7 @@ static struct ggml_tensor * llm_build_kqv(
int32_t n_tokens, int32_t n_tokens,
int32_t n_kv, int32_t n_kv,
float max_alibi_bias, float max_alibi_bias,
float scale,
const llm_build_cb & cb, const llm_build_cb & cb,
int il) { int il) {
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
@ -4110,6 +4208,12 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
cb(kq, "kq", il); cb(kq, "kq", il);
if (model.arch == LLM_ARCH_PHI2) {
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
}
if (max_alibi_bias > 0.0f) { if (max_alibi_bias > 0.0f) {
// temporary branch until we figure out how to handle ggml_alibi through ggml_add // temporary branch until we figure out how to handle ggml_alibi through ggml_add
kq = ggml_scale(ctx, kq, kq_scale); kq = ggml_scale(ctx, kq, kq_scale);
@ -4129,7 +4233,7 @@ static struct ggml_tensor * llm_build_kqv(
kq = ggml_soft_max(ctx, kq); kq = ggml_soft_max(ctx, kq);
cb(kq, "kq_soft_max", il); cb(kq, "kq_soft_max", il);
} else { } else {
kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head))); kq = ggml_soft_max_ext(ctx, kq, kq_mask, scale);
cb(kq, "kq_soft_max_ext", il); cb(kq, "kq_soft_max_ext", il);
} }
@ -4336,9 +4440,9 @@ struct llm_build_context {
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
cur = llm_build_kqv(ctx0, hparams, kv_self, cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -4519,9 +4623,9 @@ struct llm_build_context {
// apply ALiBi for 13B model // apply ALiBi for 13B model
const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f; const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
cur = llm_build_kqv(ctx0, hparams, kv_self, cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il); Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -4643,9 +4747,9 @@ struct llm_build_context {
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
cur = llm_build_kqv(ctx0, hparams, kv_self, cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -4743,9 +4847,9 @@ struct llm_build_context {
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
cur = llm_build_kqv(ctx0, hparams, kv_self, cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -4952,9 +5056,9 @@ struct llm_build_context {
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
// TODO: not tested, could be broken // TODO: not tested, could be broken
cur = llm_build_kqv(ctx0, hparams, kv_self, cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5043,9 +5147,9 @@ struct llm_build_context {
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
cur = llm_build_kqv(ctx0, hparams, kv_self, cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5140,9 +5244,9 @@ struct llm_build_context {
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
cur = llm_build_kqv(ctx0, hparams, kv_self, cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5234,9 +5338,9 @@ struct llm_build_context {
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
cur = llm_build_kqv(ctx0, hparams, kv_self, cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il); Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5347,9 +5451,9 @@ struct llm_build_context {
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
cur = llm_build_kqv(ctx0, hparams, kv_self, cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5406,15 +5510,15 @@ struct llm_build_context {
cb(inpL, "inp_embd", -1); cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
cb(inp_pos, "inp_pos", -1); cb(inp_pos, "inp_pos", -1);
// KQ_scale // KQ_scale
struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
cb(KQ_scale, "KQ_scale", -1); cb(KQ_scale, "KQ_scale", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads) // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1); cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed // shift the entire K-cache if needed
@ -5464,9 +5568,9 @@ struct llm_build_context {
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
cur = llm_build_kqv(ctx0, hparams, kv_self, cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5510,6 +5614,122 @@ struct llm_build_context {
return gf; return gf;
} }
struct ggml_cgraph * build_phi2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_tensor * cur;
struct ggml_tensor * attn_norm_output;
struct ggml_tensor * ffn_output;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
cb(inp_pos, "inp_pos", -1);
// Q_scale
struct ggml_tensor * Q_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
cb(Q_scale, "Q_scale", -1);
// KQ_scale
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
cb(KQ_scale, "KQ_scale", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm,
model.layers[il].attn_norm_b,
LLM_NORM, cb, il);
cb(attn_norm_output, "attn_norm", il);
// self-attention
{
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
cb(cur, "wqkv", il);
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
cb(cur, "bqkv", il);
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_custom(
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Qcur = ggml_scale(ctx0, Qcur, Q_scale);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom(
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo,
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
cb(cur, "kqv_out", il);
}
// FF
{
ffn_output = llm_build_ffn(ctx0, attn_norm_output,
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(ffn_output, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, ffn_output);
cb(cur, "l_out", il);
cur = ggml_add(ctx0, cur, inpL);
cb(cur, "l_out", il);
inpL = cur;
}
cur = llm_build_norm(ctx0, inpL, hparams,
model.output_norm,
model.output_norm_b,
LLM_NORM, cb, -1);
cb(cur, "result_norm", -1);
cur = ggml_mul_mat(ctx0, model.output, cur);
cb(cur, "result_output_no_bias", -1);
cur = ggml_add(ctx0, cur, model.output_b);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_gpt2() { struct ggml_cgraph * build_gpt2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@ -5565,9 +5785,9 @@ struct llm_build_context {
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
cur = llm_build_kqv(ctx0, hparams, kv_self, cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5621,7 +5841,7 @@ enum llm_offload_func_e {
OFFLOAD_FUNC_FRC, // force offload OFFLOAD_FUNC_FRC, // force offload
OFFLOAD_FUNC_KQV, OFFLOAD_FUNC_KQV,
OFFLOAD_FUNC_NR, OFFLOAD_FUNC_NR,
OFFLOAD_FUNC_EMB, OFFLOAD_FUNC_EMB, // embeddings
OFFLOAD_FUNC_OUT, OFFLOAD_FUNC_OUT,
}; };
@ -5706,6 +5926,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
{ "pos_embd", OFFLOAD_FUNC_NR }, { "pos_embd", OFFLOAD_FUNC_NR },
{ "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope) { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
{ "Q_scale", OFFLOAD_FUNC_FRC },
{ "KQ_scale", OFFLOAD_FUNC_FRC }, { "KQ_scale", OFFLOAD_FUNC_FRC },
{ "KQ_mask", OFFLOAD_FUNC_FRC }, { "KQ_mask", OFFLOAD_FUNC_FRC },
{ "K_shift", OFFLOAD_FUNC_FRC }, { "K_shift", OFFLOAD_FUNC_FRC },
@ -5790,6 +6011,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
{ "l_out", OFFLOAD_FUNC }, { "l_out", OFFLOAD_FUNC },
{ "result_norm", OFFLOAD_FUNC_EMB }, { "result_norm", OFFLOAD_FUNC_EMB },
{ "result_output_no_bias", OFFLOAD_FUNC_EMB },
{ "result_output", OFFLOAD_FUNC_OUT }, { "result_output", OFFLOAD_FUNC_OUT },
}; };
@ -5807,6 +6029,7 @@ static struct ggml_cgraph * llama_build_graph(
bool alloc_inp_tokens = false; bool alloc_inp_tokens = false;
bool alloc_inp_embd = false; bool alloc_inp_embd = false;
bool alloc_inp_pos = false; bool alloc_inp_pos = false;
bool alloc_inp_Q_scale = false;
bool alloc_inp_KQ_scale = false; bool alloc_inp_KQ_scale = false;
bool alloc_inp_KQ_mask = false; bool alloc_inp_KQ_mask = false;
bool alloc_inp_K_shift = false; bool alloc_inp_K_shift = false;
@ -5874,7 +6097,7 @@ static struct ggml_cgraph * llama_build_graph(
alloc_inp_pos = true; alloc_inp_pos = true;
} }
if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { if (!alloc_inp_Q_scale && strcmp(name, "Q_scale") == 0) {
ggml_allocr_alloc(lctx.alloc, cur); ggml_allocr_alloc(lctx.alloc, cur);
if (!ggml_allocr_is_measure(lctx.alloc)) { if (!ggml_allocr_is_measure(lctx.alloc)) {
@ -5882,6 +6105,23 @@ static struct ggml_cgraph * llama_build_graph(
ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head)));
} }
alloc_inp_Q_scale = true;
}
if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) {
ggml_allocr_alloc(lctx.alloc, cur);
if (!ggml_allocr_is_measure(lctx.alloc)) {
const int64_t n_embd_head = model.hparams.n_embd_head();
if (model.arch == LLM_ARCH_PHI2) {
// with phi2, we scale the Q to avoid precision issues
// ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
ggml_set_f32(cur, 1.0f);
} else {
ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head)));
}
}
alloc_inp_KQ_scale = true; alloc_inp_KQ_scale = true;
} }
@ -6106,6 +6346,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_qwen(); result = llm.build_qwen();
} break; } break;
case LLM_ARCH_PHI2:
{
result = llm.build_phi2();
} break;
case LLM_ARCH_GPT2: case LLM_ARCH_GPT2:
{ {
result = llm.build_gpt2(); result = llm.build_gpt2();
@ -6243,12 +6487,16 @@ static int llama_decode_internal(
ggml_allocr_alloc_graph(lctx.alloc, gf); ggml_allocr_alloc_graph(lctx.alloc, gf);
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; // the output is always the last tensor in the graph
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
// the embeddings could be the second to last tensor, or the third to last tensor
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
if (strcmp(embeddings->name, "result_norm") != 0) {
GGML_ASSERT(strcmp(res->name, "result_output") == 0); embeddings = gf->nodes[gf->n_nodes - 3];
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
}
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
for (int i = 0; i < gf->n_leafs; i++) { for (int i = 0; i < gf->n_leafs; i++) {
@ -6343,6 +6591,14 @@ static int llama_decode_internal(
{ {
auto & logits_out = lctx.logits; auto & logits_out = lctx.logits;
#ifndef NDEBUG
auto & logits_valid = lctx.logits_valid;
logits_valid.clear();
logits_valid.resize(n_tokens);
logits_out.clear();
#endif
if (batch.logits) { if (batch.logits) {
logits_out.resize(n_vocab * n_tokens); logits_out.resize(n_vocab * n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) { for (uint32_t i = 0; i < n_tokens; i++) {
@ -6350,13 +6606,22 @@ static int llama_decode_internal(
continue; continue;
} }
memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
#ifndef NDEBUG
logits_valid[i] = true;
#endif
} }
} else if (lctx.logits_all) { } else if (lctx.logits_all) {
logits_out.resize(n_vocab * n_tokens); logits_out.resize(n_vocab * n_tokens);
memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
#ifndef NDEBUG
std::fill(logits_valid.begin(), logits_valid.end(), true);
#endif
} else { } else {
logits_out.resize(n_vocab); logits_out.resize(n_vocab);
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
#ifndef NDEBUG
logits_valid[0] = true;
#endif
} }
} }
@ -8819,53 +9084,60 @@ static int llama_apply_lora_from_file_internal(
const int64_t t_start_lora_us = ggml_time_us(); const int64_t t_start_lora_us = ggml_time_us();
auto fin = std::ifstream(path_lora, std::ios::binary); llama_file fin(path_lora, "rb");
if (!fin) {
LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
return 1;
}
// verify magic and version // verify magic and version
{ {
uint32_t magic; uint32_t magic = fin.read_u32();
fin.read((char *) &magic, sizeof(magic)); if (magic != LLAMA_FILE_MAGIC_GGLA) {
uint32_t format_version; LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
fin.read((char *) &format_version, sizeof(format_version)); return 1;
}
uint32_t format_version = fin.read_u32();
if (format_version != 1) { if (format_version != 1) {
LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
return 1; return 1;
} }
} }
int32_t lora_r; int32_t lora_r = fin.read_u32();
int32_t lora_alpha; int32_t lora_alpha = fin.read_u32();
fin.read((char *) &lora_r, sizeof(lora_r));
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
float scaling = scale * (float)lora_alpha / (float)lora_r; float scaling = scale * (float)lora_alpha / (float)lora_r;
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
// create a name -> tensor map of the model to accelerate lookups
// find the max tensor size to estimate the required temporary buffer size
size_t max_tensor_size = 0;
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
for (const auto & kv : model.tensors_by_name) {
model_tensors.insert(kv);
size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
max_tensor_size = std::max(max_tensor_size, f32_size);
}
// create a temporary ggml context to store the lora tensors // create a temporary ggml context to store the lora tensors
// todo: calculate size from biggest possible tensor // TODO: use ggml-alloc
std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull); size_t lora_ctx_size = max_tensor_size * 3;
LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
std::vector<uint8_t> lora_buf(lora_ctx_size);
struct ggml_init_params params; struct ggml_init_params params;
params.mem_size = lora_buf.size(); params.mem_size = lora_buf.size();
params.mem_buffer = lora_buf.data(); params.mem_buffer = lora_buf.data();
params.no_alloc = false; params.no_alloc = false;
ggml_context * lora_ctx = ggml_init(params); using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
// create a name -> tensor map of the model to accelerate lookups unique_context lora_ctx(nullptr, ggml_free);
std::unordered_map<std::string, struct ggml_tensor*> model_tensors; lora_ctx.reset(ggml_init(params));
for (const auto & kv : model.tensors_by_name) { std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
model_tensors.insert(kv);
}
// load base model // load base model
std::unique_ptr<llama_model_loader> ml; std::unique_ptr<llama_model_loader> ml;
ggml_context * base_ctx = NULL;
unique_context base_ctx(nullptr, ggml_free);
std::vector<uint8_t> base_buf; std::vector<uint8_t> base_buf;
if (path_base_model) { if (path_base_model) {
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
@ -8874,6 +9146,7 @@ static int llama_apply_lora_from_file_internal(
size_t ctx_size; size_t ctx_size;
size_t mmapped_size; size_t mmapped_size;
ml->calc_sizes(ctx_size, mmapped_size); ml->calc_sizes(ctx_size, mmapped_size);
base_buf.resize(ctx_size); base_buf.resize(ctx_size);
ggml_init_params base_params; ggml_init_params base_params;
@ -8881,9 +9154,9 @@ static int llama_apply_lora_from_file_internal(
base_params.mem_buffer = base_buf.data(); base_params.mem_buffer = base_buf.data();
base_params.no_alloc = ml->use_mmap; base_params.no_alloc = ml->use_mmap;
base_ctx = ggml_init(base_params); base_ctx.reset(ggml_init(base_params));
// maybe this should in llama_model_loader // maybe this should be in llama_model_loader
if (ml->use_mmap) { if (ml->use_mmap) {
ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa())); ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
} }
@ -8896,27 +9169,35 @@ static int llama_apply_lora_from_file_internal(
std::vector<uint8_t> work_buffer; std::vector<uint8_t> work_buffer;
while (true) { while (true) {
if (fin.tell() == fin.size) {
// eof
break;
}
int32_t n_dims; int32_t n_dims;
int32_t length; int32_t name_len;
int32_t ftype; int32_t ftype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims)); fin.read_raw(&n_dims, sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length)); fin.read_raw(&name_len, sizeof(name_len));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype)); fin.read_raw(&ftype, sizeof(ftype));
if (fin.eof()) {
break; if (n_dims != 1 && n_dims != 2) {
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
return 1;
} }
int32_t ne[2] = { 1, 1 }; int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) { for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i])); fin.read_raw(&ne[i], sizeof(ne[i]));
} }
std::string name; std::string name;
{ {
GGML_ASSERT(name_len <= 1024);
char buf[1024]; char buf[1024];
fin.read(buf, length); fin.read_raw(buf, name_len);
name = std::string(buf, length); name = std::string(buf, name_len);
} }
// check for lora suffix and get the type of tensor // check for lora suffix and get the type of tensor
@ -8930,7 +9211,7 @@ static int llama_apply_lora_from_file_internal(
std::string lora_type = name.substr(pos + lora_suffix.length()); std::string lora_type = name.substr(pos + lora_suffix.length());
std::string base_name = name; std::string base_name = name;
base_name.erase(pos); base_name.erase(pos);
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
if (model_tensors.find(base_name) == model_tensors.end()) { if (model_tensors.find(base_name) == model_tensors.end()) {
LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
@ -8949,22 +9230,15 @@ static int llama_apply_lora_from_file_internal(
return false; return false;
} }
} }
ggml_tensor * lora_tensor; ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
if (n_dims == 2) { ggml_set_name(lora_tensor, name.c_str());
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
}
else {
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
return 1;
}
ggml_set_name(lora_tensor, "lora_tensor");
// load tensor data // load tensor data
size_t offset = fin.tellg(); size_t offset = fin.tell();
size_t tensor_data_size = ggml_nbytes(lora_tensor); size_t tensor_data_size = ggml_nbytes(lora_tensor);
offset = (offset + 31) & -32; offset = (offset + 31) & -32;
fin.seekg(offset); fin.seek(offset, SEEK_SET);
fin.read((char*)lora_tensor->data, tensor_data_size); fin.read_raw(lora_tensor->data, tensor_data_size);
lora_tensors[name] = lora_tensor; lora_tensors[name] = lora_tensor;
@ -8994,13 +9268,11 @@ static int llama_apply_lora_from_file_internal(
// load from base model // load from base model
if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) { if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
// TODO: throw
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
return 1; return 1;
} }
// TODO: not tested!! maybe not working! base_t = ml->create_tensor(base_ctx.get(), base_name, { dest_t->ne[0], dest_t->ne[1] }, GGML_BACKEND_CPU);
base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
ml->load_data_for(base_t); ml->load_data_for(base_t);
} else { } else {
base_t = dest_t; base_t = dest_t;
@ -9029,43 +9301,45 @@ static int llama_apply_lora_from_file_internal(
} }
// w = w + BA*s // w = w + BA*s
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
offload_func(BA); offload_func(BA);
ggml_set_name(BA, "BA"); ggml_set_name(BA, "BA");
if (scaling != 1.0f) { if (scaling != 1.0f) {
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx.get(), scaling);
ggml_set_name(scale_tensor, "scale_tensor"); ggml_set_name(scale_tensor, "scale_tensor");
BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); BA = ggml_scale_inplace(lora_ctx.get(), BA, scale_tensor);
offload_func(BA); offload_func(BA);
ggml_set_name(BA, "BA_scaled"); ggml_set_name(BA, "BA_scaled");
} }
ggml_tensor * r; ggml_tensor * r;
if (base_t == dest_t) { if (base_t == dest_t) {
r = ggml_add_inplace(lora_ctx, dest_t, BA); r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
offload_func_force_inplace(r); offload_func_force_inplace(r);
ggml_set_name(r, "r_add_inplace"); ggml_set_name(r, "r_add_inplace");
} }
else { else {
r = ggml_add(lora_ctx, base_t, BA); r = ggml_add(lora_ctx.get(), base_t, BA);
offload_func(r); offload_func(r);
ggml_set_name(r, "r_add"); ggml_set_name(r, "r_add");
r = ggml_cpy(lora_ctx, r, dest_t); r = ggml_cpy(lora_ctx.get(), r, dest_t);
offload_func(r); offload_func(r);
ggml_set_name(r, "r_cpy"); ggml_set_name(r, "r_cpy");
} }
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx); struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
ggml_build_forward_expand(gf, r); ggml_build_forward_expand(gf, r);
ggml_graph_compute_helper(work_buffer, gf, n_threads); ggml_graph_compute_helper(work_buffer, gf, n_threads);
// the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
GGML_ASSERT(lora_tensors.size() == 2);
// we won't need these tensors again, reset the context to save memory // we won't need these tensors again, reset the context to save memory
ggml_free(lora_ctx); lora_ctx.reset(ggml_init(params));
lora_ctx = ggml_init(params);
lora_tensors.clear(); lora_tensors.clear();
n_tensors++; n_tensors++;
@ -9075,12 +9349,6 @@ static int llama_apply_lora_from_file_internal(
} }
} }
// TODO: this should be in a destructor, it will leak on failure
ggml_free(lora_ctx);
if (base_ctx) {
ggml_free(base_ctx);
}
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
@ -10245,6 +10513,7 @@ float * llama_get_logits(struct llama_context * ctx) {
} }
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
assert(ctx->logits_valid.at(i));
return ctx->logits.data() + i*ctx->model.hparams.n_vocab; return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
} }

View file

@ -39,6 +39,7 @@
#define LLAMA_MAX_RNG_STATE (64*1024) #define LLAMA_MAX_RNG_STATE (64*1024)
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN

View file

@ -1555,6 +1555,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512)); // neox (falcon 40B) test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512)); // neox (falcon 40B)
test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512)); // neox (falcon 40B) test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512)); // neox (falcon 40B)
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512)); // neox (stablelm) test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512)); // neox (stablelm)
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512)); // neox (phi-2)
} }
test_cases.emplace_back(new test_alibi()); test_cases.emplace_back(new test_alibi());