resolve merge conflict
This commit is contained in:
commit
5d5b6088b5
24 changed files with 1684 additions and 762 deletions
|
@ -23,3 +23,6 @@ insert_final_newline = unset
|
|||
|
||||
[examples/server/public/*]
|
||||
indent_size = 2
|
||||
|
||||
[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
|
||||
indent_style = tab
|
||||
|
|
|
@ -291,7 +291,12 @@ if (LLAMA_CUBLAS)
|
|||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
|
||||
|
||||
if (LLAMA_STATIC)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
if (WIN32)
|
||||
# As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
|
||||
else ()
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
endif()
|
||||
else()
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||
endif()
|
||||
|
|
12
Makefile
12
Makefile
|
@ -439,9 +439,15 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
|
|||
endif # LLAMA_CLBLAST
|
||||
|
||||
ifdef LLAMA_HIPBLAS
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
HIPCC ?= $(ROCM_PATH)/bin/hipcc
|
||||
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
|
||||
|
||||
ifeq ($(wildcard /opt/rocm),)
|
||||
ROCM_PATH ?= /usr
|
||||
GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
|
||||
else
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
|
||||
endif
|
||||
HIPCC ?= $(ROCM_PATH)/bin/hipcc
|
||||
LLAMA_CUDA_DMMV_X ?= 32
|
||||
LLAMA_CUDA_MMV_Y ?= 1
|
||||
LLAMA_CUDA_KQUANTS_ITER ?= 2
|
||||
|
|
|
@ -10,11 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
|||
|
||||
### Hot topics
|
||||
|
||||
- Collecting Apple Silicon performance stats:
|
||||
- M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
|
||||
- A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
|
||||
- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
|
||||
- **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309**
|
||||
- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
|
||||
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
|
||||
- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
|
||||
|
||||
----
|
||||
|
||||
|
|
|
@ -185,6 +185,8 @@ class Model:
|
|||
return MixtralModel
|
||||
if model_architecture == "GPT2LMHeadModel":
|
||||
return GPT2Model
|
||||
if model_architecture == "PhiForCausalLM":
|
||||
return Phi2Model
|
||||
return Model
|
||||
|
||||
def _is_model_safetensors(self) -> bool:
|
||||
|
@ -226,6 +228,8 @@ class Model:
|
|||
return gguf.MODEL_ARCH.LLAMA
|
||||
if arch == "GPT2LMHeadModel":
|
||||
return gguf.MODEL_ARCH.GPT2
|
||||
if arch == "PhiForCausalLM":
|
||||
return gguf.MODEL_ARCH.PHI2
|
||||
|
||||
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||
|
||||
|
@ -1046,6 +1050,23 @@ class GPT2Model(Model):
|
|||
print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
|
||||
class Phi2Model(Model):
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["n_layer"]
|
||||
|
||||
self.gguf_writer.add_name("Phi2")
|
||||
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_head_count(self.hparams["n_head"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["n_head"])
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
||||
self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
|
|
@ -3,7 +3,6 @@ from __future__ import annotations
|
|||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import struct
|
||||
import sys
|
||||
from typing import Any, BinaryIO, Sequence
|
||||
|
@ -11,43 +10,15 @@ from typing import Any, BinaryIO, Sequence
|
|||
import numpy as np
|
||||
import torch
|
||||
|
||||
from pathlib import Path
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
|
||||
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
|
||||
|
||||
|
||||
HF_SUBLAYER_TO_GGML = {
|
||||
"self_attn.q_proj": "attn_q",
|
||||
"self_attn.k_proj": "attn_k",
|
||||
"self_attn.v_proj": "attn_v",
|
||||
"self_attn.o_proj": "attn_output",
|
||||
"mlp.gate_proj": "ffn_gate",
|
||||
"mlp.down_proj": "ffn_down",
|
||||
"mlp.up_proj": "ffn_up",
|
||||
"input_layernorm": "attn_norm",
|
||||
"post_attention_layernorm": "ffn_norm",
|
||||
}
|
||||
|
||||
|
||||
def translate_tensor_name(t: str) -> str:
|
||||
match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
|
||||
if match:
|
||||
nn = match.group(1)
|
||||
sub_layer = match.group(2)
|
||||
lora_type = match.group(3)
|
||||
|
||||
sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
|
||||
if sub_layer_renamed is None:
|
||||
print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
|
||||
sys.exit(1)
|
||||
|
||||
output_string = (
|
||||
f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
|
||||
)
|
||||
return output_string
|
||||
else:
|
||||
print(f"Error: unrecognized tensor {t}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
|
||||
fout.write(b"ggla"[::-1]) # magic (ggml lora)
|
||||
fout.write(struct.pack("i", 1)) # file version
|
||||
|
@ -61,9 +32,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
|
|||
fout.write(struct.pack("i", int(params["lora_alpha"])))
|
||||
|
||||
|
||||
def write_tensor_header(
|
||||
self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
|
||||
) -> None:
|
||||
def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
|
||||
sname = name.encode("utf-8")
|
||||
fout.write(
|
||||
struct.pack(
|
||||
|
@ -78,11 +47,12 @@ def write_tensor_header(
|
|||
fout.seek((fout.tell() + 31) & -32)
|
||||
|
||||
|
||||
if len(sys.argv) != 2:
|
||||
print(f"Usage: python {sys.argv[0]} <path>")
|
||||
if len(sys.argv) < 2:
|
||||
print(f"Usage: python {sys.argv[0]} <path> [arch]")
|
||||
print(
|
||||
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
|
||||
)
|
||||
print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
|
||||
sys.exit(1)
|
||||
|
||||
input_json = os.path.join(sys.argv[1], "adapter_config.json")
|
||||
|
@ -90,6 +60,14 @@ input_model = os.path.join(sys.argv[1], "adapter_model.bin")
|
|||
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
|
||||
|
||||
model = torch.load(input_model, map_location="cpu")
|
||||
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
|
||||
|
||||
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
|
||||
print(f"Error: unsupported architecture {arch_name}")
|
||||
sys.exit(1)
|
||||
|
||||
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
|
||||
name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
|
||||
|
||||
with open(input_json, "r") as f:
|
||||
params = json.load(f)
|
||||
|
@ -117,6 +95,7 @@ with open(output_path, "wb") as fout:
|
|||
|
||||
write_file_header(fout, params)
|
||||
for k, v in model.items():
|
||||
orig_k = k
|
||||
if k.endswith(".default.weight"):
|
||||
k = k.replace(".default.weight", ".weight")
|
||||
if k in ["llama_proj.weight", "llama_proj.bias"]:
|
||||
|
@ -129,7 +108,32 @@ with open(output_path, "wb") as fout:
|
|||
v = v.float()
|
||||
|
||||
t = v.detach().numpy()
|
||||
tname = translate_tensor_name(k)
|
||||
|
||||
prefix = "base_model.model."
|
||||
if k.startswith(prefix):
|
||||
k = k[len(prefix) :]
|
||||
|
||||
lora_suffixes = (".lora_A.weight", ".lora_B.weight")
|
||||
if k.endswith(lora_suffixes):
|
||||
suffix = k[-len(lora_suffixes[0]):]
|
||||
k = k[: -len(lora_suffixes[0])]
|
||||
else:
|
||||
print(f"Error: unrecognized tensor name {orig_k}")
|
||||
sys.exit(1)
|
||||
|
||||
tname = name_map.get_name(k)
|
||||
if tname is None:
|
||||
print(f"Error: could not map tensor name {orig_k}")
|
||||
print(" Note: the arch parameter must be specified if the model is not llama")
|
||||
sys.exit(1)
|
||||
|
||||
if suffix == ".lora_A.weight":
|
||||
tname += ".weight.loraA"
|
||||
elif suffix == ".lora_B.weight":
|
||||
tname += ".weight.loraB"
|
||||
else:
|
||||
assert False
|
||||
|
||||
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
|
||||
write_tensor_header(fout, tname, t.shape, t.dtype)
|
||||
t.tofile(fout)
|
||||
|
|
|
@ -1620,8 +1620,6 @@ int main(int argc, char ** argv) {
|
|||
opt->params.adam.gclip = params.common.adam_gclip;
|
||||
opt->params.adam.eps_f = params.common.adam_eps_f;
|
||||
|
||||
ggml_allocr * alloc = NULL;
|
||||
|
||||
printf("%s: init model\n", __func__);
|
||||
bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);
|
||||
|
||||
|
@ -1725,10 +1723,9 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// allocate input tensors
|
||||
mem_input_data.resize(max_input_size);
|
||||
alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
|
||||
ggml_allocr_alloc(alloc, tokens_input);
|
||||
ggml_allocr_alloc(alloc, target_probs);
|
||||
ggml_allocr_free(alloc);
|
||||
ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
|
||||
ggml_allocr_alloc(alloc_inps, tokens_input);
|
||||
ggml_allocr_alloc(alloc_inps, target_probs);
|
||||
|
||||
// context for compute tensors without their data
|
||||
const size_t estimated_compute_size_wo_data = (
|
||||
|
@ -1755,7 +1752,7 @@ int main(int argc, char ** argv) {
|
|||
// find best evaluation order
|
||||
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
|
||||
ctx_compute = ggml_init(ctx_compute_params);
|
||||
alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||
ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||
gf->order = (enum ggml_cgraph_eval_order) order;
|
||||
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||
|
@ -1788,7 +1785,7 @@ int main(int argc, char ** argv) {
|
|||
// allocate compute tensors
|
||||
mem_compute_data.resize(max_compute_size);
|
||||
ctx_compute = ggml_init(ctx_compute_params);
|
||||
alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
|
||||
ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
|
||||
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||
gf->order = best_order;
|
||||
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||
|
@ -1804,6 +1801,8 @@ int main(int argc, char ** argv) {
|
|||
params.common.use_checkpointing
|
||||
);
|
||||
ggml_allocr_free(alloc);
|
||||
ggml_allocr_free(alloc_inps);
|
||||
|
||||
|
||||
// tokenize data
|
||||
std::vector<llama_token> train_tokens;
|
||||
|
|
1
examples/llama.swiftui/.gitignore
vendored
1
examples/llama.swiftui/.gitignore
vendored
|
@ -1 +1,2 @@
|
|||
xcuserdata
|
||||
xcshareddata
|
||||
|
|
|
@ -6,16 +6,34 @@ enum LlamaError: Error {
|
|||
case couldNotInitializeContext
|
||||
}
|
||||
|
||||
func llama_batch_clear(_ batch: inout llama_batch) {
|
||||
batch.n_tokens = 0
|
||||
}
|
||||
|
||||
func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], _ logits: Bool) {
|
||||
batch.token [Int(batch.n_tokens)] = id
|
||||
batch.pos [Int(batch.n_tokens)] = pos
|
||||
batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count)
|
||||
for i in 0..<seq_ids.count {
|
||||
batch.seq_id[Int(batch.n_tokens)]![Int(i)] = seq_ids[i]
|
||||
}
|
||||
batch.logits [Int(batch.n_tokens)] = logits ? 1 : 0
|
||||
|
||||
batch.n_tokens += 1
|
||||
}
|
||||
|
||||
actor LlamaContext {
|
||||
private var model: OpaquePointer
|
||||
private var context: OpaquePointer
|
||||
private var batch: llama_batch
|
||||
private var tokens_list: [llama_token]
|
||||
|
||||
/// This variable is used to store temporarily invalid cchars
|
||||
private var temporary_invalid_cchars: [CChar]
|
||||
|
||||
var n_len: Int32 = 512
|
||||
var n_len: Int32 = 64
|
||||
var n_cur: Int32 = 0
|
||||
|
||||
var n_decode: Int32 = 0
|
||||
|
||||
init(model: OpaquePointer, context: OpaquePointer) {
|
||||
|
@ -27,25 +45,34 @@ actor LlamaContext {
|
|||
}
|
||||
|
||||
deinit {
|
||||
llama_batch_free(batch)
|
||||
llama_free(context)
|
||||
llama_free_model(model)
|
||||
llama_backend_free()
|
||||
}
|
||||
|
||||
static func createContext(path: String) throws -> LlamaContext {
|
||||
static func create_context(path: String) throws -> LlamaContext {
|
||||
llama_backend_init(false)
|
||||
let model_params = llama_model_default_params()
|
||||
var model_params = llama_model_default_params()
|
||||
|
||||
#if targetEnvironment(simulator)
|
||||
model_params.n_gpu_layers = 0
|
||||
print("Running on simulator, force use n_gpu_layers = 0")
|
||||
#endif
|
||||
let model = llama_load_model_from_file(path, model_params)
|
||||
guard let model else {
|
||||
print("Could not load model at \(path)")
|
||||
throw LlamaError.couldNotInitializeContext
|
||||
}
|
||||
|
||||
let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
|
||||
print("Using \(n_threads) threads")
|
||||
|
||||
var ctx_params = llama_context_default_params()
|
||||
ctx_params.seed = 1234
|
||||
ctx_params.seed = 1234
|
||||
ctx_params.n_ctx = 2048
|
||||
ctx_params.n_threads = 8
|
||||
ctx_params.n_threads_batch = 8
|
||||
ctx_params.n_threads = UInt32(n_threads)
|
||||
ctx_params.n_threads_batch = UInt32(n_threads)
|
||||
|
||||
let context = llama_new_context_with_model(model, ctx_params)
|
||||
guard let context else {
|
||||
|
@ -56,6 +83,26 @@ actor LlamaContext {
|
|||
return LlamaContext(model: model, context: context)
|
||||
}
|
||||
|
||||
func model_info() -> String {
|
||||
let result = UnsafeMutablePointer<Int8>.allocate(capacity: 256)
|
||||
result.initialize(repeating: Int8(0), count: 256)
|
||||
defer {
|
||||
result.deallocate()
|
||||
}
|
||||
|
||||
// TODO: this is probably very stupid way to get the string from C
|
||||
|
||||
let nChars = llama_model_desc(model, result, 256)
|
||||
let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nChars))
|
||||
|
||||
var SwiftString = ""
|
||||
for char in bufferPointer {
|
||||
SwiftString.append(Character(UnicodeScalar(UInt8(char))))
|
||||
}
|
||||
|
||||
return SwiftString
|
||||
}
|
||||
|
||||
func get_n_tokens() -> Int32 {
|
||||
return batch.n_tokens;
|
||||
}
|
||||
|
@ -79,16 +126,11 @@ actor LlamaContext {
|
|||
print(String(cString: token_to_piece(token: id) + [0]))
|
||||
}
|
||||
|
||||
// batch = llama_batch_init(512, 0) // done in init()
|
||||
batch.n_tokens = Int32(tokens_list.count)
|
||||
llama_batch_clear(&batch)
|
||||
|
||||
for i1 in 0..<batch.n_tokens {
|
||||
for i1 in 0..<tokens_list.count {
|
||||
let i = Int(i1)
|
||||
batch.token[i] = tokens_list[i]
|
||||
batch.pos[i] = i1
|
||||
batch.n_seq_id[Int(i)] = 1
|
||||
batch.seq_id[Int(i)]![0] = 0
|
||||
batch.logits[i] = 0
|
||||
llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
|
||||
}
|
||||
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
|
||||
|
||||
|
@ -141,18 +183,11 @@ actor LlamaContext {
|
|||
print(new_token_str)
|
||||
// tokens_list.append(new_token_id)
|
||||
|
||||
batch.n_tokens = 0
|
||||
|
||||
batch.token[Int(batch.n_tokens)] = new_token_id
|
||||
batch.pos[Int(batch.n_tokens)] = n_cur
|
||||
batch.n_seq_id[Int(batch.n_tokens)] = 1
|
||||
batch.seq_id[Int(batch.n_tokens)]![0] = 0
|
||||
batch.logits[Int(batch.n_tokens)] = 1 // true
|
||||
batch.n_tokens += 1
|
||||
llama_batch_clear(&batch)
|
||||
llama_batch_add(&batch, new_token_id, n_cur, [0], true)
|
||||
|
||||
n_decode += 1
|
||||
|
||||
n_cur += 1
|
||||
n_cur += 1
|
||||
|
||||
if llama_decode(context, batch) != 0 {
|
||||
print("failed to evaluate llama!")
|
||||
|
@ -161,14 +196,111 @@ actor LlamaContext {
|
|||
return new_token_str
|
||||
}
|
||||
|
||||
func bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) -> String {
|
||||
var pp_avg: Double = 0
|
||||
var tg_avg: Double = 0
|
||||
|
||||
var pp_std: Double = 0
|
||||
var tg_std: Double = 0
|
||||
|
||||
for _ in 0..<nr {
|
||||
// bench prompt processing
|
||||
|
||||
llama_batch_clear(&batch)
|
||||
|
||||
let n_tokens = pp
|
||||
|
||||
for i in 0..<n_tokens {
|
||||
llama_batch_add(&batch, 0, Int32(i), [0], false)
|
||||
}
|
||||
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
|
||||
|
||||
llama_kv_cache_clear(context)
|
||||
|
||||
let t_pp_start = ggml_time_us()
|
||||
|
||||
if llama_decode(context, batch) != 0 {
|
||||
print("llama_decode() failed during prompt")
|
||||
}
|
||||
|
||||
let t_pp_end = ggml_time_us()
|
||||
|
||||
// bench text generation
|
||||
|
||||
llama_kv_cache_clear(context)
|
||||
|
||||
let t_tg_start = ggml_time_us()
|
||||
|
||||
for i in 0..<tg {
|
||||
llama_batch_clear(&batch)
|
||||
|
||||
for j in 0..<pl {
|
||||
llama_batch_add(&batch, 0, Int32(i), [Int32(j)], true)
|
||||
}
|
||||
|
||||
if llama_decode(context, batch) != 0 {
|
||||
print("llama_decode() failed during text generation")
|
||||
}
|
||||
}
|
||||
|
||||
let t_tg_end = ggml_time_us()
|
||||
|
||||
llama_kv_cache_clear(context)
|
||||
|
||||
let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
|
||||
let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
|
||||
|
||||
let speed_pp = Double(pp) / t_pp
|
||||
let speed_tg = Double(pl*tg) / t_tg
|
||||
|
||||
pp_avg += speed_pp
|
||||
tg_avg += speed_tg
|
||||
|
||||
pp_std += speed_pp * speed_pp
|
||||
tg_std += speed_tg * speed_tg
|
||||
|
||||
print("pp \(speed_pp) t/s, tg \(speed_tg) t/s")
|
||||
}
|
||||
|
||||
pp_avg /= Double(nr)
|
||||
tg_avg /= Double(nr)
|
||||
|
||||
if nr > 1 {
|
||||
pp_std = sqrt(pp_std / Double(nr - 1) - pp_avg * pp_avg * Double(nr) / Double(nr - 1))
|
||||
tg_std = sqrt(tg_std / Double(nr - 1) - tg_avg * tg_avg * Double(nr) / Double(nr - 1))
|
||||
} else {
|
||||
pp_std = 0
|
||||
tg_std = 0
|
||||
}
|
||||
|
||||
let model_desc = model_info();
|
||||
let model_size = String(format: "%.2f GiB", Double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0);
|
||||
let model_n_params = String(format: "%.2f B", Double(llama_model_n_params(model)) / 1e9);
|
||||
let backend = "Metal";
|
||||
let pp_avg_str = String(format: "%.2f", pp_avg);
|
||||
let tg_avg_str = String(format: "%.2f", tg_avg);
|
||||
let pp_std_str = String(format: "%.2f", pp_std);
|
||||
let tg_std_str = String(format: "%.2f", tg_std);
|
||||
|
||||
var result = ""
|
||||
|
||||
result += String("| model | size | params | backend | test | t/s |\n")
|
||||
result += String("| --- | --- | --- | --- | --- | --- |\n")
|
||||
result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | pp \(pp) | \(pp_avg_str) ± \(pp_std_str) |\n")
|
||||
result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | tg \(tg) | \(tg_avg_str) ± \(tg_std_str) |\n")
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
func clear() {
|
||||
tokens_list.removeAll()
|
||||
temporary_invalid_cchars.removeAll()
|
||||
llama_kv_cache_clear(context)
|
||||
}
|
||||
|
||||
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||
let utf8Count = text.utf8.count
|
||||
let n_tokens = utf8Count + (add_bos ? 1 : 0)
|
||||
let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
|
||||
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
|
||||
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
|
||||
|
||||
|
|
|
@ -1,481 +1,483 @@
|
|||
// !$*UTF8*$!
|
||||
{
|
||||
archiveVersion = 1;
|
||||
classes = {
|
||||
};
|
||||
objectVersion = 56;
|
||||
objects = {
|
||||
archiveVersion = 1;
|
||||
classes = {
|
||||
};
|
||||
objectVersion = 56;
|
||||
objects = {
|
||||
|
||||
/* Begin PBXBuildFile section */
|
||||
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; };
|
||||
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; };
|
||||
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
|
||||
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
|
||||
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; };
|
||||
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
|
||||
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
|
||||
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
|
||||
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
|
||||
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
|
||||
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
|
||||
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
|
||||
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
|
||||
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
|
||||
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
|
||||
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
|
||||
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
|
||||
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
|
||||
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
|
||||
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
|
||||
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
|
||||
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
|
||||
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
|
||||
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
|
||||
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
|
||||
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
|
||||
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
|
||||
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
|
||||
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
|
||||
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
|
||||
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
|
||||
/* End PBXBuildFile section */
|
||||
|
||||
/* Begin PBXFileReference section */
|
||||
542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
|
||||
542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
|
||||
542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
|
||||
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
|
||||
542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
|
||||
542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
|
||||
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
|
||||
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
|
||||
542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
|
||||
542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
|
||||
549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
|
||||
549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
|
||||
549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
|
||||
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
|
||||
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
|
||||
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
|
||||
8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
|
||||
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
|
||||
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
|
||||
8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
|
||||
8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = "<group>"; };
|
||||
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
|
||||
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
|
||||
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
|
||||
542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
|
||||
542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
|
||||
542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
|
||||
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
|
||||
542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
|
||||
542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
|
||||
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
|
||||
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
|
||||
542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
|
||||
542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
|
||||
549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
|
||||
549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
|
||||
549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
|
||||
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
|
||||
7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
|
||||
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
|
||||
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
|
||||
8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
|
||||
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
|
||||
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
|
||||
8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
|
||||
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
|
||||
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
|
||||
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
|
||||
/* End PBXFileReference section */
|
||||
|
||||
/* Begin PBXFrameworksBuildPhase section */
|
||||
8A1C83702AC328BD0096AF73 /* Frameworks */ = {
|
||||
isa = PBXFrameworksBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
|
||||
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
8A1C83702AC328BD0096AF73 /* Frameworks */ = {
|
||||
isa = PBXFrameworksBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
|
||||
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXFrameworksBuildPhase section */
|
||||
|
||||
/* Begin PBXGroup section */
|
||||
8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
|
||||
542376092B0D9C40008E6A1C /* ggml-backend.h */,
|
||||
542376062B0D9BEA008E6A1C /* ggml-quants.h */,
|
||||
542376072B0D9BFB008E6A1C /* ggml-quants.c */,
|
||||
549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
|
||||
549479C62AC9E0F200E0F78B /* ggml-metal.h */,
|
||||
549479C52AC9E0F200E0F78B /* ggml-metal.m */,
|
||||
542EA09B2AC8723900A8AEE9 /* ggml.c */,
|
||||
542EA09C2AC8723900A8AEE9 /* ggml.h */,
|
||||
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
|
||||
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
|
||||
542EA0A12AC8729100A8AEE9 /* llama.cpp */,
|
||||
542EA0A22AC8729100A8AEE9 /* llama.h */,
|
||||
);
|
||||
name = llama.cpp;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C836A2AC328BD0096AF73 = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
|
||||
8A907F312AC7134E006146EA /* llama.cpp.swift */,
|
||||
8A3F84232AC4C891005E2EE8 /* models */,
|
||||
8A1C83752AC328BD0096AF73 /* llama.swiftui */,
|
||||
8A1C83742AC328BD0096AF73 /* Products */,
|
||||
8A39BE082AC7601000BFEB40 /* Frameworks */,
|
||||
);
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C83742AC328BD0096AF73 /* Products */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
|
||||
);
|
||||
name = Products;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A3F84102AC4BD85005E2EE8 /* Resources */,
|
||||
8A9F7C4B2AC332DC008AE1EA /* Models */,
|
||||
8A9F7C4A2AC332BF008AE1EA /* UI */,
|
||||
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
|
||||
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
|
||||
8A1C837C2AC328BE0096AF73 /* Preview Content */,
|
||||
);
|
||||
path = llama.swiftui;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
|
||||
);
|
||||
path = "Preview Content";
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A39BE082AC7601000BFEB40 /* Frameworks */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
549479CA2AC9E16000E0F78B /* Metal.framework */,
|
||||
8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
|
||||
);
|
||||
name = Frameworks;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A3F84102AC4BD85005E2EE8 /* Resources */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A3F84112AC4BD8C005E2EE8 /* models */,
|
||||
);
|
||||
path = Resources;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A3F84112AC4BD8C005E2EE8 /* models */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */,
|
||||
);
|
||||
path = models;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
|
||||
8A907F322AC7134E006146EA /* LibLlama.swift */,
|
||||
);
|
||||
path = llama.cpp.swift;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A9F7C4A2AC332BF008AE1EA /* UI */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A1C83782AC328BD0096AF73 /* ContentView.swift */,
|
||||
);
|
||||
path = UI;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A9F7C4B2AC332DC008AE1EA /* Models */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
|
||||
);
|
||||
path = Models;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
|
||||
542376092B0D9C40008E6A1C /* ggml-backend.h */,
|
||||
542376062B0D9BEA008E6A1C /* ggml-quants.h */,
|
||||
542376072B0D9BFB008E6A1C /* ggml-quants.c */,
|
||||
549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
|
||||
549479C62AC9E0F200E0F78B /* ggml-metal.h */,
|
||||
549479C52AC9E0F200E0F78B /* ggml-metal.m */,
|
||||
542EA09B2AC8723900A8AEE9 /* ggml.c */,
|
||||
542EA09C2AC8723900A8AEE9 /* ggml.h */,
|
||||
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
|
||||
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
|
||||
542EA0A12AC8729100A8AEE9 /* llama.cpp */,
|
||||
542EA0A22AC8729100A8AEE9 /* llama.h */,
|
||||
);
|
||||
name = llama.cpp;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C836A2AC328BD0096AF73 = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
|
||||
8A907F312AC7134E006146EA /* llama.cpp.swift */,
|
||||
8A3F84232AC4C891005E2EE8 /* models */,
|
||||
8A1C83752AC328BD0096AF73 /* llama.swiftui */,
|
||||
8A1C83742AC328BD0096AF73 /* Products */,
|
||||
8A39BE082AC7601000BFEB40 /* Frameworks */,
|
||||
);
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C83742AC328BD0096AF73 /* Products */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
|
||||
);
|
||||
name = Products;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A3F84102AC4BD85005E2EE8 /* Resources */,
|
||||
8A9F7C4B2AC332DC008AE1EA /* Models */,
|
||||
8A9F7C4A2AC332BF008AE1EA /* UI */,
|
||||
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
|
||||
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
|
||||
8A1C837C2AC328BE0096AF73 /* Preview Content */,
|
||||
);
|
||||
path = llama.swiftui;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
|
||||
);
|
||||
path = "Preview Content";
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A39BE082AC7601000BFEB40 /* Frameworks */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
549479CA2AC9E16000E0F78B /* Metal.framework */,
|
||||
8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
|
||||
);
|
||||
name = Frameworks;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A3F84102AC4BD85005E2EE8 /* Resources */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A3F84112AC4BD8C005E2EE8 /* models */,
|
||||
);
|
||||
path = Resources;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A3F84112AC4BD8C005E2EE8 /* models */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
);
|
||||
path = models;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
|
||||
8A907F322AC7134E006146EA /* LibLlama.swift */,
|
||||
);
|
||||
path = llama.cpp.swift;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A9F7C4A2AC332BF008AE1EA /* UI */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
|
||||
8A1C83782AC328BD0096AF73 /* ContentView.swift */,
|
||||
);
|
||||
path = UI;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A9F7C4B2AC332DC008AE1EA /* Models */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
|
||||
);
|
||||
path = Models;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
/* End PBXGroup section */
|
||||
|
||||
/* Begin PBXNativeTarget section */
|
||||
8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
|
||||
isa = PBXNativeTarget;
|
||||
buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
|
||||
buildPhases = (
|
||||
8A1C836F2AC328BD0096AF73 /* Sources */,
|
||||
8A1C83702AC328BD0096AF73 /* Frameworks */,
|
||||
8A1C83712AC328BD0096AF73 /* Resources */,
|
||||
);
|
||||
buildRules = (
|
||||
);
|
||||
dependencies = (
|
||||
);
|
||||
name = llama.swiftui;
|
||||
packageProductDependencies = (
|
||||
);
|
||||
productName = llama.swiftui;
|
||||
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
|
||||
productType = "com.apple.product-type.application";
|
||||
};
|
||||
8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
|
||||
isa = PBXNativeTarget;
|
||||
buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
|
||||
buildPhases = (
|
||||
8A1C836F2AC328BD0096AF73 /* Sources */,
|
||||
8A1C83702AC328BD0096AF73 /* Frameworks */,
|
||||
8A1C83712AC328BD0096AF73 /* Resources */,
|
||||
);
|
||||
buildRules = (
|
||||
);
|
||||
dependencies = (
|
||||
);
|
||||
name = llama.swiftui;
|
||||
packageProductDependencies = (
|
||||
);
|
||||
productName = llama.swiftui;
|
||||
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
|
||||
productType = "com.apple.product-type.application";
|
||||
};
|
||||
/* End PBXNativeTarget section */
|
||||
|
||||
/* Begin PBXProject section */
|
||||
8A1C836B2AC328BD0096AF73 /* Project object */ = {
|
||||
isa = PBXProject;
|
||||
attributes = {
|
||||
BuildIndependentTargetsInParallel = 1;
|
||||
LastSwiftUpdateCheck = 1500;
|
||||
LastUpgradeCheck = 1500;
|
||||
TargetAttributes = {
|
||||
8A1C83722AC328BD0096AF73 = {
|
||||
CreatedOnToolsVersion = 15.0;
|
||||
LastSwiftMigration = 1500;
|
||||
};
|
||||
};
|
||||
};
|
||||
buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
|
||||
compatibilityVersion = "Xcode 14.0";
|
||||
developmentRegion = en;
|
||||
hasScannedForEncodings = 0;
|
||||
knownRegions = (
|
||||
en,
|
||||
Base,
|
||||
);
|
||||
mainGroup = 8A1C836A2AC328BD0096AF73;
|
||||
packageReferences = (
|
||||
);
|
||||
productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
|
||||
projectDirPath = "";
|
||||
projectRoot = "";
|
||||
targets = (
|
||||
8A1C83722AC328BD0096AF73 /* llama.swiftui */,
|
||||
);
|
||||
};
|
||||
8A1C836B2AC328BD0096AF73 /* Project object */ = {
|
||||
isa = PBXProject;
|
||||
attributes = {
|
||||
BuildIndependentTargetsInParallel = 1;
|
||||
LastSwiftUpdateCheck = 1500;
|
||||
LastUpgradeCheck = 1500;
|
||||
TargetAttributes = {
|
||||
8A1C83722AC328BD0096AF73 = {
|
||||
CreatedOnToolsVersion = 15.0;
|
||||
LastSwiftMigration = 1500;
|
||||
};
|
||||
};
|
||||
};
|
||||
buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
|
||||
compatibilityVersion = "Xcode 14.0";
|
||||
developmentRegion = en;
|
||||
hasScannedForEncodings = 0;
|
||||
knownRegions = (
|
||||
en,
|
||||
Base,
|
||||
);
|
||||
mainGroup = 8A1C836A2AC328BD0096AF73;
|
||||
packageReferences = (
|
||||
);
|
||||
productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
|
||||
projectDirPath = "";
|
||||
projectRoot = "";
|
||||
targets = (
|
||||
8A1C83722AC328BD0096AF73 /* llama.swiftui */,
|
||||
);
|
||||
};
|
||||
/* End PBXProject section */
|
||||
|
||||
/* Begin PBXResourcesBuildPhase section */
|
||||
8A1C83712AC328BD0096AF73 /* Resources */ = {
|
||||
isa = PBXResourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
|
||||
8A3F84242AC4C891005E2EE8 /* models in Resources */,
|
||||
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
|
||||
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
8A1C83712AC328BD0096AF73 /* Resources */ = {
|
||||
isa = PBXResourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
|
||||
8A3F84242AC4C891005E2EE8 /* models in Resources */,
|
||||
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
|
||||
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXResourcesBuildPhase section */
|
||||
|
||||
/* Begin PBXSourcesBuildPhase section */
|
||||
8A1C836F2AC328BD0096AF73 /* Sources */ = {
|
||||
isa = PBXSourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
|
||||
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
|
||||
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
|
||||
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
|
||||
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
|
||||
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
|
||||
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
|
||||
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
|
||||
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
|
||||
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
8A1C836F2AC328BD0096AF73 /* Sources */ = {
|
||||
isa = PBXSourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
|
||||
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
|
||||
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
|
||||
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
|
||||
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
|
||||
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
|
||||
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
|
||||
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
|
||||
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */,
|
||||
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
|
||||
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXSourcesBuildPhase section */
|
||||
|
||||
/* Begin XCBuildConfiguration section */
|
||||
8A1C837F2AC328BE0096AF73 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
|
||||
CLANG_ANALYZER_NONNULL = YES;
|
||||
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
|
||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CLANG_ENABLE_OBJC_ARC = YES;
|
||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
|
||||
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||
CLANG_WARN_COMMA = YES;
|
||||
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
|
||||
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
|
||||
CLANG_WARN_EMPTY_BODY = YES;
|
||||
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||
CLANG_WARN_INFINITE_RECURSION = YES;
|
||||
CLANG_WARN_INT_CONVERSION = YES;
|
||||
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
|
||||
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
|
||||
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
|
||||
CLANG_WARN_STRICT_PROTOTYPES = YES;
|
||||
CLANG_WARN_SUSPICIOUS_MOVE = YES;
|
||||
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
|
||||
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||
COPY_PHASE_STRIP = NO;
|
||||
DEBUG_INFORMATION_FORMAT = dwarf;
|
||||
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||
ENABLE_TESTABILITY = YES;
|
||||
ENABLE_USER_SCRIPT_SANDBOXING = YES;
|
||||
GCC_C_LANGUAGE_STANDARD = gnu17;
|
||||
GCC_DYNAMIC_NO_PIC = NO;
|
||||
GCC_NO_COMMON_BLOCKS = YES;
|
||||
GCC_OPTIMIZATION_LEVEL = 0;
|
||||
GCC_PREPROCESSOR_DEFINITIONS = (
|
||||
"DEBUG=1",
|
||||
"$(inherited)",
|
||||
);
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
|
||||
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
|
||||
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
|
||||
MTL_FAST_MATH = YES;
|
||||
ONLY_ACTIVE_ARCH = YES;
|
||||
SDKROOT = iphoneos;
|
||||
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
|
||||
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
8A1C83802AC328BE0096AF73 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
|
||||
CLANG_ANALYZER_NONNULL = YES;
|
||||
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
|
||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CLANG_ENABLE_OBJC_ARC = YES;
|
||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
|
||||
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||
CLANG_WARN_COMMA = YES;
|
||||
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
|
||||
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
|
||||
CLANG_WARN_EMPTY_BODY = YES;
|
||||
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||
CLANG_WARN_INFINITE_RECURSION = YES;
|
||||
CLANG_WARN_INT_CONVERSION = YES;
|
||||
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
|
||||
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
|
||||
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
|
||||
CLANG_WARN_STRICT_PROTOTYPES = YES;
|
||||
CLANG_WARN_SUSPICIOUS_MOVE = YES;
|
||||
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
|
||||
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||
COPY_PHASE_STRIP = NO;
|
||||
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
|
||||
ENABLE_NS_ASSERTIONS = NO;
|
||||
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||
ENABLE_USER_SCRIPT_SANDBOXING = YES;
|
||||
GCC_C_LANGUAGE_STANDARD = gnu17;
|
||||
GCC_NO_COMMON_BLOCKS = YES;
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
|
||||
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
|
||||
MTL_ENABLE_DEBUG_INFO = NO;
|
||||
MTL_FAST_MATH = YES;
|
||||
SDKROOT = iphoneos;
|
||||
SWIFT_COMPILATION_MODE = wholemodule;
|
||||
VALIDATE_PRODUCT = YES;
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
8A1C83822AC328BE0096AF73 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
CURRENT_PROJECT_VERSION = 1;
|
||||
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
|
||||
DEVELOPMENT_TEAM = STLSG3FG8Q;
|
||||
ENABLE_PREVIEWS = YES;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
|
||||
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
|
||||
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
|
||||
LD_RUNPATH_SEARCH_PATHS = (
|
||||
"$(inherited)",
|
||||
"@executable_path/Frameworks",
|
||||
);
|
||||
MARKETING_VERSION = 1.0;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
|
||||
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
|
||||
SWIFT_VERSION = 5.0;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
8A1C83832AC328BE0096AF73 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
CURRENT_PROJECT_VERSION = 1;
|
||||
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
|
||||
DEVELOPMENT_TEAM = STLSG3FG8Q;
|
||||
ENABLE_PREVIEWS = YES;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
|
||||
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
|
||||
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
|
||||
LD_RUNPATH_SEARCH_PATHS = (
|
||||
"$(inherited)",
|
||||
"@executable_path/Frameworks",
|
||||
);
|
||||
MARKETING_VERSION = 1.0;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
|
||||
SWIFT_VERSION = 5.0;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
8A1C837F2AC328BE0096AF73 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
|
||||
CLANG_ANALYZER_NONNULL = YES;
|
||||
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
|
||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CLANG_ENABLE_OBJC_ARC = YES;
|
||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
|
||||
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||
CLANG_WARN_COMMA = YES;
|
||||
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
|
||||
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
|
||||
CLANG_WARN_EMPTY_BODY = YES;
|
||||
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||
CLANG_WARN_INFINITE_RECURSION = YES;
|
||||
CLANG_WARN_INT_CONVERSION = YES;
|
||||
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
|
||||
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
|
||||
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
|
||||
CLANG_WARN_STRICT_PROTOTYPES = YES;
|
||||
CLANG_WARN_SUSPICIOUS_MOVE = YES;
|
||||
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
|
||||
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||
COPY_PHASE_STRIP = NO;
|
||||
DEBUG_INFORMATION_FORMAT = dwarf;
|
||||
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||
ENABLE_TESTABILITY = YES;
|
||||
ENABLE_USER_SCRIPT_SANDBOXING = YES;
|
||||
GCC_C_LANGUAGE_STANDARD = gnu17;
|
||||
GCC_DYNAMIC_NO_PIC = NO;
|
||||
GCC_NO_COMMON_BLOCKS = YES;
|
||||
GCC_OPTIMIZATION_LEVEL = 0;
|
||||
GCC_PREPROCESSOR_DEFINITIONS = (
|
||||
"DEBUG=1",
|
||||
"$(inherited)",
|
||||
);
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
|
||||
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
|
||||
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
|
||||
MTL_FAST_MATH = YES;
|
||||
ONLY_ACTIVE_ARCH = YES;
|
||||
SDKROOT = iphoneos;
|
||||
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
|
||||
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
8A1C83802AC328BE0096AF73 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
|
||||
CLANG_ANALYZER_NONNULL = YES;
|
||||
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
|
||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CLANG_ENABLE_OBJC_ARC = YES;
|
||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
|
||||
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||
CLANG_WARN_COMMA = YES;
|
||||
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
|
||||
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
|
||||
CLANG_WARN_EMPTY_BODY = YES;
|
||||
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||
CLANG_WARN_INFINITE_RECURSION = YES;
|
||||
CLANG_WARN_INT_CONVERSION = YES;
|
||||
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
|
||||
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
|
||||
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
|
||||
CLANG_WARN_STRICT_PROTOTYPES = YES;
|
||||
CLANG_WARN_SUSPICIOUS_MOVE = YES;
|
||||
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
|
||||
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||
COPY_PHASE_STRIP = NO;
|
||||
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
|
||||
ENABLE_NS_ASSERTIONS = NO;
|
||||
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||
ENABLE_USER_SCRIPT_SANDBOXING = YES;
|
||||
GCC_C_LANGUAGE_STANDARD = gnu17;
|
||||
GCC_NO_COMMON_BLOCKS = YES;
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
|
||||
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
|
||||
MTL_ENABLE_DEBUG_INFO = NO;
|
||||
MTL_FAST_MATH = YES;
|
||||
SDKROOT = iphoneos;
|
||||
SWIFT_COMPILATION_MODE = wholemodule;
|
||||
VALIDATE_PRODUCT = YES;
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
8A1C83822AC328BE0096AF73 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
CURRENT_PROJECT_VERSION = 1;
|
||||
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
|
||||
DEVELOPMENT_TEAM = STLSG3FG8Q;
|
||||
ENABLE_PREVIEWS = YES;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
|
||||
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
|
||||
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
|
||||
LD_RUNPATH_SEARCH_PATHS = (
|
||||
"$(inherited)",
|
||||
"@executable_path/Frameworks",
|
||||
);
|
||||
MARKETING_VERSION = 1.0;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
|
||||
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
|
||||
SWIFT_VERSION = 5.0;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
8A1C83832AC328BE0096AF73 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
CURRENT_PROJECT_VERSION = 1;
|
||||
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
|
||||
DEVELOPMENT_TEAM = STLSG3FG8Q;
|
||||
ENABLE_PREVIEWS = YES;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
|
||||
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
|
||||
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
|
||||
LD_RUNPATH_SEARCH_PATHS = (
|
||||
"$(inherited)",
|
||||
"@executable_path/Frameworks",
|
||||
);
|
||||
MARKETING_VERSION = 1.0;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
|
||||
SWIFT_VERSION = 5.0;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
/* End XCBuildConfiguration section */
|
||||
|
||||
/* Begin XCConfigurationList section */
|
||||
8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
8A1C837F2AC328BE0096AF73 /* Debug */,
|
||||
8A1C83802AC328BE0096AF73 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
8A1C83822AC328BE0096AF73 /* Debug */,
|
||||
8A1C83832AC328BE0096AF73 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
8A1C837F2AC328BE0096AF73 /* Debug */,
|
||||
8A1C83802AC328BE0096AF73 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
8A1C83822AC328BE0096AF73 /* Debug */,
|
||||
8A1C83832AC328BE0096AF73 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
/* End XCConfigurationList section */
|
||||
};
|
||||
rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
|
||||
};
|
||||
rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
|
||||
}
|
||||
|
|
|
@ -3,24 +3,26 @@ import Foundation
|
|||
@MainActor
|
||||
class LlamaState: ObservableObject {
|
||||
@Published var messageLog = ""
|
||||
@Published var cacheCleared = false
|
||||
|
||||
private var llamaContext: LlamaContext?
|
||||
private var modelUrl: URL? {
|
||||
Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models")
|
||||
private var defaultModelUrl: URL? {
|
||||
Bundle.main.url(forResource: "ggml-model", withExtension: "gguf", subdirectory: "models")
|
||||
// Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
|
||||
}
|
||||
|
||||
init() {
|
||||
do {
|
||||
try loadModel()
|
||||
try loadModel(modelUrl: defaultModelUrl)
|
||||
} catch {
|
||||
messageLog += "Error!\n"
|
||||
}
|
||||
}
|
||||
|
||||
private func loadModel() throws {
|
||||
func loadModel(modelUrl: URL?) throws {
|
||||
messageLog += "Loading model...\n"
|
||||
if let modelUrl {
|
||||
llamaContext = try LlamaContext.createContext(path: modelUrl.path())
|
||||
llamaContext = try LlamaContext.create_context(path: modelUrl.path())
|
||||
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
|
||||
} else {
|
||||
messageLog += "Could not locate model\n"
|
||||
|
@ -31,7 +33,7 @@ class LlamaState: ObservableObject {
|
|||
guard let llamaContext else {
|
||||
return
|
||||
}
|
||||
messageLog += "Attempting to complete text...\n"
|
||||
|
||||
await llamaContext.completion_init(text: text)
|
||||
messageLog += "\(text)"
|
||||
|
||||
|
@ -42,4 +44,42 @@ class LlamaState: ObservableObject {
|
|||
await llamaContext.clear()
|
||||
messageLog += "\n\ndone\n"
|
||||
}
|
||||
|
||||
func bench() async {
|
||||
guard let llamaContext else {
|
||||
return
|
||||
}
|
||||
|
||||
messageLog += "\n"
|
||||
messageLog += "Running benchmark...\n"
|
||||
messageLog += "Model info: "
|
||||
messageLog += await llamaContext.model_info() + "\n"
|
||||
|
||||
let t_start = DispatchTime.now().uptimeNanoseconds
|
||||
await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
|
||||
let t_end = DispatchTime.now().uptimeNanoseconds
|
||||
|
||||
let t_heat = Double(t_end - t_start) / 1_000_000_000.0
|
||||
messageLog += "Heat up time: \(t_heat) seconds, please wait...\n"
|
||||
|
||||
// if more than 5 seconds, then we're probably running on a slow device
|
||||
if t_heat > 5.0 {
|
||||
messageLog += "Heat up time is too long, aborting benchmark\n"
|
||||
return
|
||||
}
|
||||
|
||||
let result = await llamaContext.bench(pp: 512, tg: 128, pl: 1, nr: 3)
|
||||
|
||||
messageLog += "\(result)"
|
||||
messageLog += "\n"
|
||||
}
|
||||
|
||||
func clear() async {
|
||||
guard let llamaContext else {
|
||||
return
|
||||
}
|
||||
|
||||
await llamaContext.clear()
|
||||
messageLog = ""
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,24 +5,132 @@ struct ContentView: View {
|
|||
|
||||
@State private var multiLineText = ""
|
||||
|
||||
private static func cleanupModelCaches() {
|
||||
// Delete all models (*.gguf)
|
||||
let fileManager = FileManager.default
|
||||
let documentsUrl = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
|
||||
do {
|
||||
let fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil)
|
||||
for fileURL in fileURLs {
|
||||
if fileURL.pathExtension == "gguf" {
|
||||
try fileManager.removeItem(at: fileURL)
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)")
|
||||
}
|
||||
}
|
||||
|
||||
var body: some View {
|
||||
VStack {
|
||||
ScrollView(.vertical) {
|
||||
ScrollView(.vertical, showsIndicators: true) {
|
||||
Text(llamaState.messageLog)
|
||||
.font(.system(size: 12))
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
.padding()
|
||||
.onTapGesture {
|
||||
UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
|
||||
}
|
||||
}
|
||||
|
||||
TextEditor(text: $multiLineText)
|
||||
.frame(height: 200)
|
||||
.frame(height: 80)
|
||||
.padding()
|
||||
.border(Color.gray, width: 0.5)
|
||||
Button(action: {
|
||||
sendText()
|
||||
}) {
|
||||
Text("Send")
|
||||
.padding()
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
|
||||
HStack {
|
||||
Button("Send") {
|
||||
sendText()
|
||||
}
|
||||
.padding(8)
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
|
||||
Button("Bench") {
|
||||
bench()
|
||||
}
|
||||
.padding(8)
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
|
||||
Button("Clear") {
|
||||
clear()
|
||||
}
|
||||
.padding(8)
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
|
||||
Button("Copy") {
|
||||
UIPasteboard.general.string = llamaState.messageLog
|
||||
}
|
||||
.padding(8)
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
}
|
||||
|
||||
VStack {
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",
|
||||
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
|
||||
filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
.padding(.top, 4)
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
modelName: "TinyLlama-1.1B (Q8_0, 1.1 GiB)",
|
||||
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true",
|
||||
filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
modelName: "TinyLlama-1.1B (F16, 2.2 GiB)",
|
||||
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
|
||||
filename: "tinyllama-1.1b-f16.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
modelName: "Phi-2.7B (Q4_0, 1.6 GiB)",
|
||||
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
|
||||
filename: "phi-2-q4_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
modelName: "Phi-2.7B (Q8_0, 2.8 GiB)",
|
||||
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
|
||||
filename: "phi-2-q8_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
modelName: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)",
|
||||
modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
|
||||
filename: "mistral-7b-v0.1.Q4_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
|
||||
Button("Clear downloaded models") {
|
||||
ContentView.cleanupModelCaches()
|
||||
llamaState.cacheCleared = true
|
||||
}
|
||||
.padding(8)
|
||||
.font(.system(size: 12))
|
||||
}
|
||||
}
|
||||
.padding()
|
||||
|
@ -34,9 +142,20 @@ struct ContentView: View {
|
|||
multiLineText = ""
|
||||
}
|
||||
}
|
||||
|
||||
func bench() {
|
||||
Task {
|
||||
await llamaState.bench()
|
||||
}
|
||||
}
|
||||
|
||||
func clear() {
|
||||
Task {
|
||||
await llamaState.clear()
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
#Preview {
|
||||
ContentView()
|
||||
}
|
||||
*/
|
||||
|
||||
//#Preview {
|
||||
// ContentView()
|
||||
//}
|
||||
|
|
122
examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift
Normal file
122
examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift
Normal file
|
@ -0,0 +1,122 @@
|
|||
import SwiftUI
|
||||
|
||||
struct DownloadButton: View {
|
||||
@ObservedObject private var llamaState: LlamaState
|
||||
private var modelName: String
|
||||
private var modelUrl: String
|
||||
private var filename: String
|
||||
|
||||
@State private var status: String
|
||||
|
||||
@State private var downloadTask: URLSessionDownloadTask?
|
||||
@State private var progress = 0.0
|
||||
@State private var observation: NSKeyValueObservation?
|
||||
|
||||
private static func getFileURL(filename: String) -> URL {
|
||||
FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename)
|
||||
}
|
||||
|
||||
private func checkFileExistenceAndUpdateStatus() {
|
||||
}
|
||||
|
||||
init(llamaState: LlamaState, modelName: String, modelUrl: String, filename: String) {
|
||||
self.llamaState = llamaState
|
||||
self.modelName = modelName
|
||||
self.modelUrl = modelUrl
|
||||
self.filename = filename
|
||||
|
||||
let fileURL = DownloadButton.getFileURL(filename: filename)
|
||||
status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
|
||||
}
|
||||
|
||||
private func download() {
|
||||
status = "downloading"
|
||||
print("Downloading model \(modelName) from \(modelUrl)")
|
||||
guard let url = URL(string: modelUrl) else { return }
|
||||
let fileURL = DownloadButton.getFileURL(filename: filename)
|
||||
|
||||
downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
|
||||
if let error = error {
|
||||
print("Error: \(error.localizedDescription)")
|
||||
return
|
||||
}
|
||||
|
||||
guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
|
||||
print("Server error!")
|
||||
return
|
||||
}
|
||||
|
||||
do {
|
||||
if let temporaryURL = temporaryURL {
|
||||
try FileManager.default.copyItem(at: temporaryURL, to: fileURL)
|
||||
print("Writing to \(filename) completed")
|
||||
|
||||
llamaState.cacheCleared = false
|
||||
|
||||
status = "downloaded"
|
||||
}
|
||||
} catch let err {
|
||||
print("Error: \(err.localizedDescription)")
|
||||
}
|
||||
}
|
||||
|
||||
observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in
|
||||
self.progress = progress.fractionCompleted
|
||||
}
|
||||
|
||||
downloadTask?.resume()
|
||||
}
|
||||
|
||||
var body: some View {
|
||||
VStack {
|
||||
if status == "download" {
|
||||
Button(action: download) {
|
||||
Text("Download " + modelName)
|
||||
}
|
||||
} else if status == "downloading" {
|
||||
Button(action: {
|
||||
downloadTask?.cancel()
|
||||
status = "download"
|
||||
}) {
|
||||
Text("\(modelName) (Downloading \(Int(progress * 100))%)")
|
||||
}
|
||||
} else if status == "downloaded" {
|
||||
Button(action: {
|
||||
let fileURL = DownloadButton.getFileURL(filename: filename)
|
||||
if !FileManager.default.fileExists(atPath: fileURL.path) {
|
||||
download()
|
||||
return
|
||||
}
|
||||
do {
|
||||
try llamaState.loadModel(modelUrl: fileURL)
|
||||
} catch let err {
|
||||
print("Error: \(err.localizedDescription)")
|
||||
}
|
||||
}) {
|
||||
Text("\(modelName) (Downloaded)")
|
||||
}
|
||||
} else {
|
||||
Text("Unknown status")
|
||||
}
|
||||
}
|
||||
.onDisappear() {
|
||||
downloadTask?.cancel()
|
||||
}
|
||||
.onChange(of: llamaState.cacheCleared) { newValue in
|
||||
if newValue {
|
||||
downloadTask?.cancel()
|
||||
let fileURL = DownloadButton.getFileURL(filename: filename)
|
||||
status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// #Preview {
|
||||
// DownloadButton(
|
||||
// llamaState: LlamaState(),
|
||||
// modelName: "TheBloke / TinyLlama-1.1B-1T-OpenOrca-GGUF (Q4_0)",
|
||||
// modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
|
||||
// filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
|
||||
// )
|
||||
// }
|
|
@ -10,7 +10,8 @@
|
|||
// crash the server in debug mode, otherwise send an http 500 error
|
||||
#define CPPHTTPLIB_NO_EXCEPTIONS 1
|
||||
#endif
|
||||
|
||||
// increase max payload length to allow use of larger context size
|
||||
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
|
||||
#include "httplib.h"
|
||||
#include "json.hpp"
|
||||
|
||||
|
@ -2413,7 +2414,7 @@ json oaicompat_completion_params_parse(
|
|||
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
|
||||
llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);
|
||||
|
||||
if (llama_params.count("grammar") != 0) {
|
||||
if (body.count("grammar") != 0) {
|
||||
llama_params["grammar"] = json_value(body, "grammar", json::object());
|
||||
}
|
||||
|
||||
|
@ -2644,6 +2645,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
|
|||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
#if SERVER_VERBOSE != 1
|
||||
log_disable();
|
||||
#endif
|
||||
// own arguments required by this example
|
||||
gpt_params params;
|
||||
server_params sparams;
|
||||
|
@ -2698,7 +2702,7 @@ int main(int argc, char **argv)
|
|||
}
|
||||
|
||||
// API key is invalid or not provided
|
||||
res.set_content("Unauthorized: Invalid API Key", "text/plain");
|
||||
res.set_content("Unauthorized: Invalid API Key", "text/plain; charset=utf-8");
|
||||
res.status = 401; // Unauthorized
|
||||
|
||||
LOG_WARNING("Unauthorized: Invalid API Key", {});
|
||||
|
@ -2713,28 +2717,28 @@ int main(int argc, char **argv)
|
|||
// this is only called if no index.html is found in the public --path
|
||||
svr.Get("/", [](const httplib::Request &, httplib::Response &res)
|
||||
{
|
||||
res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html");
|
||||
res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html; charset=utf-8");
|
||||
return false;
|
||||
});
|
||||
|
||||
// this is only called if no index.js is found in the public --path
|
||||
svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res)
|
||||
{
|
||||
res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript");
|
||||
res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript; charset=utf-8");
|
||||
return false;
|
||||
});
|
||||
|
||||
// this is only called if no index.html is found in the public --path
|
||||
svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res)
|
||||
{
|
||||
res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
|
||||
res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript; charset=utf-8");
|
||||
return false;
|
||||
});
|
||||
|
||||
// this is only called if no index.html is found in the public --path
|
||||
svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res)
|
||||
{
|
||||
res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript");
|
||||
res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8");
|
||||
return false;
|
||||
});
|
||||
|
||||
|
@ -2745,7 +2749,7 @@ int main(int argc, char **argv)
|
|||
{ "user_name", llama.name_user.c_str() },
|
||||
{ "assistant_name", llama.name_assistant.c_str() }
|
||||
};
|
||||
res.set_content(data.dump(), "application/json");
|
||||
res.set_content(data.dump(), "application/json; charset=utf-8");
|
||||
});
|
||||
|
||||
svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
|
||||
|
@ -2759,12 +2763,12 @@ int main(int argc, char **argv)
|
|||
std::string completion_text;
|
||||
task_result result = llama.next_result(task_id);
|
||||
if (!result.error && result.stop) {
|
||||
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
|
||||
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
|
||||
}
|
||||
else
|
||||
{
|
||||
res.status = 404;
|
||||
res.set_content(result.result_json["content"], "text/plain");
|
||||
res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
|
@ -2835,7 +2839,7 @@ int main(int argc, char **argv)
|
|||
}}
|
||||
};
|
||||
|
||||
res.set_content(models.dump(), "application/json");
|
||||
res.set_content(models.dump(), "application/json; charset=utf-8");
|
||||
});
|
||||
|
||||
// TODO: add mount point without "/v1" prefix -- how?
|
||||
|
@ -2857,10 +2861,10 @@ int main(int argc, char **argv)
|
|||
|
||||
res.set_content(oaicompat_result.dump(-1, ' ', false,
|
||||
json::error_handler_t::replace),
|
||||
"application/json");
|
||||
"application/json; charset=utf-8");
|
||||
} else {
|
||||
res.status = 500;
|
||||
res.set_content(result.result_json["content"], "text/plain");
|
||||
res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
|
@ -2924,12 +2928,12 @@ int main(int argc, char **argv)
|
|||
task_result result = llama.next_result(task_id);
|
||||
if (!result.error && result.stop)
|
||||
{
|
||||
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
|
||||
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
|
||||
}
|
||||
else
|
||||
{
|
||||
res.status = 404;
|
||||
res.set_content(result.result_json["content"], "text/plain");
|
||||
res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
|
@ -2978,11 +2982,11 @@ int main(int argc, char **argv)
|
|||
svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res)
|
||||
{
|
||||
const json data = llama.get_model_props();
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
return res.set_content(data.dump(), "application/json; charset=utf-8");
|
||||
});
|
||||
|
||||
svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res)
|
||||
{ return res.set_content("", "application/json"); });
|
||||
{ return res.set_content("", "application/json; charset=utf-8"); });
|
||||
|
||||
svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||
{
|
||||
|
@ -2993,7 +2997,7 @@ int main(int argc, char **argv)
|
|||
tokens = llama.tokenize(body["content"], false);
|
||||
}
|
||||
const json data = format_tokenizer_response(tokens);
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
return res.set_content(data.dump(), "application/json; charset=utf-8");
|
||||
});
|
||||
|
||||
svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||
|
@ -3007,7 +3011,7 @@ int main(int argc, char **argv)
|
|||
}
|
||||
|
||||
const json data = format_detokenized_response(content);
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
return res.set_content(data.dump(), "application/json; charset=utf-8");
|
||||
});
|
||||
|
||||
svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||
|
@ -3024,7 +3028,7 @@ int main(int argc, char **argv)
|
|||
}
|
||||
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
|
||||
task_result result = llama.next_result(task_id);
|
||||
return res.set_content(result.result_json.dump(), "application/json");
|
||||
return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
|
||||
});
|
||||
|
||||
svr.set_logger(log_server_request);
|
||||
|
@ -3045,7 +3049,7 @@ int main(int argc, char **argv)
|
|||
{
|
||||
snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
|
||||
}
|
||||
res.set_content(buf, "text/plain");
|
||||
res.set_content(buf, "text/plain; charset=utf-8");
|
||||
res.status = 500;
|
||||
});
|
||||
|
||||
|
@ -3053,15 +3057,15 @@ int main(int argc, char **argv)
|
|||
{
|
||||
if (res.status == 401)
|
||||
{
|
||||
res.set_content("Unauthorized", "text/plain");
|
||||
res.set_content("Unauthorized", "text/plain; charset=utf-8");
|
||||
}
|
||||
if (res.status == 400)
|
||||
{
|
||||
res.set_content("Invalid request", "text/plain");
|
||||
res.set_content("Invalid request", "text/plain; charset=utf-8");
|
||||
}
|
||||
else if (res.status == 404)
|
||||
{
|
||||
res.set_content("File Not Found", "text/plain");
|
||||
res.set_content("File Not Found", "text/plain; charset=utf-8");
|
||||
res.status = 404;
|
||||
}
|
||||
});
|
||||
|
|
237
ggml-cuda.cu
237
ggml-cuda.cu
|
@ -31,6 +31,7 @@
|
|||
#define CUDA_R_16F HIPBLAS_R_16F
|
||||
#define CUDA_R_32F HIPBLAS_R_32F
|
||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
|
||||
#define cublasCreate hipblasCreate
|
||||
#define cublasGemmEx hipblasGemmEx
|
||||
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
||||
|
@ -40,6 +41,7 @@
|
|||
#define cublasSetStream hipblasSetStream
|
||||
#define cublasSgemm hipblasSgemm
|
||||
#define cublasStatus_t hipblasStatus_t
|
||||
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
|
||||
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
|
||||
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
||||
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
||||
|
@ -4998,7 +5000,16 @@ static __global__ void rope_neox(
|
|||
const int ib = col / n_dims;
|
||||
const int ic = col % n_dims;
|
||||
|
||||
const int i = row*ncols + ib*n_dims + ic/2;
|
||||
if (ib > 0) {
|
||||
const int i = row*ncols + ib*n_dims + ic;
|
||||
|
||||
dst[i + 0] = x[i + 0];
|
||||
dst[i + 1] = x[i + 1];
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const int i = row*ncols + ib*n_dims + ic/2;
|
||||
const int i2 = row/p_delta_rows;
|
||||
|
||||
float cur_rot = inv_ndims * ic - ib;
|
||||
|
@ -7057,6 +7068,7 @@ inline void ggml_cuda_op_upscale(
|
|||
|
||||
(void) src1;
|
||||
(void) dst;
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
inline void ggml_cuda_op_pad(
|
||||
|
@ -7073,6 +7085,7 @@ inline void ggml_cuda_op_pad(
|
|||
|
||||
(void) src1;
|
||||
(void) dst;
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
inline void ggml_cuda_op_rms_norm(
|
||||
|
@ -7376,7 +7389,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|||
|
||||
const int compute_capability = g_compute_capabilities[id];
|
||||
|
||||
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
|
||||
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
||||
half * src0_as_f16 = nullptr;
|
||||
size_t src0_as = 0;
|
||||
|
@ -7817,6 +7830,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
|
|||
}
|
||||
|
||||
#ifdef NDEBUG
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
||||
|
||||
|
@ -7868,8 +7886,6 @@ static void ggml_cuda_op_mul_mat(
|
|||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
|
||||
ggml_cuda_set_peer_access(ne11);
|
||||
|
||||
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
|
||||
|
@ -8300,27 +8316,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
|||
}
|
||||
|
||||
static __global__ void k_compute_batched_ptrs(
|
||||
const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
|
||||
const half * src0_as_f16, const half * src1_as_f16, char * dst,
|
||||
const void ** ptrs_src, void ** ptrs_dst,
|
||||
int ne12, int ne13,
|
||||
int ne23,
|
||||
int nb02, int nb03,
|
||||
int nb12, int nb13,
|
||||
int nb2, int nb3,
|
||||
int r2, int r3) {
|
||||
int i13 = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int i12 = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
int64_t ne12, int64_t ne13,
|
||||
int64_t ne23,
|
||||
size_t nb02, size_t nb03,
|
||||
size_t nb12, size_t nb13,
|
||||
size_t nbd2, size_t nbd3,
|
||||
int64_t r2, int64_t r3) {
|
||||
int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (i13 >= ne13 || i12 >= ne12) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i03 = i13 / r3;
|
||||
int i02 = i12 / r2;
|
||||
int64_t i03 = i13 / r3;
|
||||
int64_t i02 = i12 / r2;
|
||||
|
||||
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
||||
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
|
||||
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
|
||||
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
|
@ -8376,7 +8392,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
|
||||
|
||||
size_t dst_as = 0;
|
||||
half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
|
||||
|
||||
half * dst_f16 = nullptr;
|
||||
char * dst_t = nullptr;
|
||||
|
||||
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
||||
cudaDataType_t cu_data_type = CUDA_R_16F;
|
||||
|
||||
// dst strides
|
||||
size_t nbd2 = dst->nb[2];
|
||||
size_t nbd3 = dst->nb[3];
|
||||
|
||||
const half alpha_f16 = 1.0f;
|
||||
const half beta_f16 = 0.0f;
|
||||
|
||||
const float alpha_f32 = 1.0f;
|
||||
const float beta_f32 = 0.0f;
|
||||
|
||||
const void * alpha = &alpha_f16;
|
||||
const void * beta = &beta_f16;
|
||||
|
||||
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
|
||||
dst_t = (char *) dst_f16;
|
||||
|
||||
nbd2 /= sizeof(float) / sizeof(half);
|
||||
nbd3 /= sizeof(float) / sizeof(half);
|
||||
} else {
|
||||
dst_t = (char *) dst_ddf;
|
||||
|
||||
cu_compute_type = CUBLAS_COMPUTE_32F;
|
||||
cu_data_type = CUDA_R_32F;
|
||||
|
||||
alpha = &alpha_f32;
|
||||
beta = &beta_f32;
|
||||
}
|
||||
|
||||
GGML_ASSERT(ne12 % ne02 == 0);
|
||||
GGML_ASSERT(ne13 % ne03 == 0);
|
||||
|
@ -8385,9 +8435,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
const int64_t r2 = ne12/ne02;
|
||||
const int64_t r3 = ne13/ne03;
|
||||
|
||||
const half alpha_f16 = 1.0f;
|
||||
const half beta_f16 = 0.0f;
|
||||
|
||||
#if 0
|
||||
// use cublasGemmEx
|
||||
{
|
||||
|
@ -8397,12 +8444,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
int i02 = i12 / r2;
|
||||
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
&alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
|
||||
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
|
||||
&beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
|
||||
CUBLAS_COMPUTE_16F,
|
||||
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
|
||||
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
|
||||
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
|
||||
cu_compute_type,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
}
|
||||
}
|
||||
|
@ -8414,11 +8461,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
CUBLAS_CHECK(
|
||||
cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
&alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
|
||||
(const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
|
||||
&beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
|
||||
alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
|
||||
(const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
|
||||
beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
|
||||
ne12*ne13,
|
||||
CUBLAS_COMPUTE_16F,
|
||||
cu_compute_type,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
} else {
|
||||
// use cublasGemmBatchedEx
|
||||
|
@ -8435,24 +8482,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
|
||||
dim3 block_dims(ne13, ne12);
|
||||
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
|
||||
src0_as_f16, src1_as_f16, dst_f16,
|
||||
src0_as_f16, src1_as_f16, dst_t,
|
||||
ptrs_src, ptrs_dst,
|
||||
ne12, ne13,
|
||||
ne23,
|
||||
nb02, nb03,
|
||||
nb12, nb13,
|
||||
dst->nb[2], dst->nb[3],
|
||||
nbd2, nbd3,
|
||||
r2, r3);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
&alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
||||
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
||||
&beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
|
||||
alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
||||
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
||||
beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
|
||||
ne23,
|
||||
CUBLAS_COMPUTE_16F,
|
||||
cu_compute_type,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
|
||||
if (ptrs_src_s != 0) {
|
||||
|
@ -8464,11 +8511,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
}
|
||||
#endif
|
||||
|
||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
|
||||
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
|
||||
|
||||
ggml_cuda_pool_free(dst_f16, dst_as);
|
||||
}
|
||||
|
||||
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
||||
ggml_cuda_pool_free(dst_f16, dst_as);
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
|
@ -8734,16 +8784,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
|||
|
||||
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
|
||||
|
||||
const int64_t nb11 = src1->nb[1];
|
||||
const int64_t nb1 = dst->nb[1];
|
||||
|
||||
const struct ggml_tensor * ids = src0;
|
||||
const int32_t id = ((int32_t *) dst->op_params)[0];
|
||||
const int32_t n_as = ((int32_t *) dst->op_params)[1];
|
||||
|
||||
std::vector<char> ids_host(ggml_nbytes(ids));
|
||||
|
||||
const cudaStream_t stream = g_cudaStreams[g_main_device][0];
|
||||
|
||||
if (ids->backend == GGML_BACKEND_GPU) {
|
||||
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
|
||||
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
|
||||
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
|
||||
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
} else {
|
||||
memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
|
||||
}
|
||||
|
@ -8757,37 +8812,93 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
|||
ggml_tensor src1_row = *src1;
|
||||
ggml_tensor dst_row = *dst;
|
||||
|
||||
src1_row.ne[1] = 1;
|
||||
dst_row.ne[1] = 1;
|
||||
|
||||
src1_row.nb[2] = src1_row.nb[1];
|
||||
dst_row.nb[2] = dst_row.nb[1];
|
||||
|
||||
src1_row.nb[3] = src1_row.nb[1];
|
||||
dst_row.nb[3] = dst_row.nb[1];
|
||||
|
||||
src1_row.extra = &src1_row_extra;
|
||||
dst_row.extra = &dst_row_extra;
|
||||
|
||||
char * src1_original = (char *) src1_extra->data_device[g_main_device];
|
||||
char * dst_original = (char *) dst_extra->data_device[g_main_device];
|
||||
|
||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
||||
//int32_t row_id;
|
||||
//CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
|
||||
//CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
|
||||
if (src1->ne[1] == 1) {
|
||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
||||
//int32_t row_id;
|
||||
//CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
|
||||
//CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
|
||||
|
||||
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
||||
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
||||
|
||||
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
||||
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
||||
|
||||
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
|
||||
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
|
||||
|
||||
src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
|
||||
src1_row.data = (char *) src1->data + i01*src1->nb[1];
|
||||
src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
|
||||
src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
|
||||
|
||||
dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
|
||||
dst_row.data = (char *) dst->data + i01*dst->nb[1];
|
||||
dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
|
||||
dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
|
||||
|
||||
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
|
||||
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
|
||||
}
|
||||
} else {
|
||||
size_t as_src1, as_dst;
|
||||
char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
|
||||
char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);
|
||||
|
||||
src1_row_extra.data_device[g_main_device] = src1_contiguous;
|
||||
dst_row_extra.data_device[g_main_device] = dst_contiguous;
|
||||
|
||||
for (int32_t row_id = 0; row_id < n_as; ++row_id) {
|
||||
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
|
||||
|
||||
int64_t num_src1_rows = 0;
|
||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
||||
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
||||
|
||||
if (row_id_i != row_id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
|
||||
nb11, cudaMemcpyDeviceToDevice, stream));
|
||||
num_src1_rows++;
|
||||
}
|
||||
|
||||
if (num_src1_rows == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
src1_row.ne[1] = num_src1_rows;
|
||||
dst_row.ne[1] = num_src1_rows;
|
||||
|
||||
src1_row.nb[1] = nb11;
|
||||
src1_row.nb[2] = num_src1_rows*nb11;
|
||||
src1_row.nb[3] = num_src1_rows*nb11;
|
||||
|
||||
dst_row.nb[1] = nb1;
|
||||
dst_row.nb[2] = num_src1_rows*nb1;
|
||||
dst_row.nb[3] = num_src1_rows*nb1;
|
||||
|
||||
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
|
||||
|
||||
num_src1_rows = 0;
|
||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
||||
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
||||
|
||||
if (row_id_i != row_id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
|
||||
nb1, cudaMemcpyDeviceToDevice, stream));
|
||||
num_src1_rows++;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_cuda_pool_free(src1_contiguous, as_src1);
|
||||
ggml_cuda_pool_free(dst_contiguous, as_dst);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -9323,6 +9434,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|||
return false;
|
||||
}
|
||||
|
||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
|
||||
}
|
||||
|
||||
if (params->ith != 0) {
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -1702,8 +1702,9 @@ kernel void kernel_rope(
|
|||
dst_data[1] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
} else {
|
||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
||||
for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
|
||||
for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
|
||||
if (ic < n_dims) {
|
||||
const int64_t ib = 0;
|
||||
|
||||
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
||||
const float cur_rot = inv_ndims*ic - ib;
|
||||
|
@ -1722,6 +1723,14 @@ kernel void kernel_rope(
|
|||
|
||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
} else {
|
||||
const int64_t i0 = ic;
|
||||
|
||||
device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
dst_data[0] = src[0];
|
||||
dst_data[1] = src[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
46
ggml.c
46
ggml.c
|
@ -4098,6 +4098,14 @@ struct ggml_tensor * ggml_mul_mat(
|
|||
return result;
|
||||
}
|
||||
|
||||
void ggml_mul_mat_set_prec(
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_prec prec) {
|
||||
const int32_t prec_i32 = (int32_t) prec;
|
||||
|
||||
ggml_set_op_params_i32(a, 0, prec_i32);
|
||||
}
|
||||
|
||||
// ggml_mul_mat_id
|
||||
|
||||
struct ggml_tensor * ggml_mul_mat_id(
|
||||
|
@ -9168,6 +9176,8 @@ static void ggml_compute_forward_norm_f32(
|
|||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
GGML_ASSERT(eps > 0.0f);
|
||||
|
||||
// TODO: optimize
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
|
@ -9237,6 +9247,8 @@ static void ggml_compute_forward_rms_norm_f32(
|
|||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
GGML_ASSERT(eps > 0.0f);
|
||||
|
||||
// TODO: optimize
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
|
@ -11562,10 +11574,13 @@ static void ggml_compute_forward_rope_f32(
|
|||
}
|
||||
} else {
|
||||
// TODO: this might be wrong for ne0 != n_dims - need double check
|
||||
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
||||
// it seems we have to rope just the first n_dims elements and do nothing with the rest
|
||||
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
|
||||
theta_base *= freq_scale;
|
||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
||||
for (int64_t ic = 0; ic < ne0; ic += 2) {
|
||||
if (ic < n_dims) {
|
||||
const int64_t ib = 0;
|
||||
|
||||
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
||||
float cur_rot = inv_ndims * ic - ib;
|
||||
|
||||
|
@ -11588,6 +11603,14 @@ static void ggml_compute_forward_rope_f32(
|
|||
|
||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
} else {
|
||||
const int64_t i0 = ic;
|
||||
|
||||
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
dst_data[0] = src[0];
|
||||
dst_data[1] = src[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -11715,10 +11738,13 @@ static void ggml_compute_forward_rope_f16(
|
|||
}
|
||||
} else {
|
||||
// TODO: this might be wrong for ne0 != n_dims - need double check
|
||||
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
||||
// it seems we have to rope just the first n_dims elements and do nothing with the rest
|
||||
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
|
||||
theta_base *= freq_scale;
|
||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
||||
for (int64_t ic = 0; ic < ne0; ic += 2) {
|
||||
if (ic < n_dims) {
|
||||
const int64_t ib = 0;
|
||||
|
||||
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
||||
float cur_rot = inv_ndims * ic - ib;
|
||||
|
||||
|
@ -11741,6 +11767,14 @@ static void ggml_compute_forward_rope_f16(
|
|||
|
||||
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
||||
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
||||
} else {
|
||||
const int64_t i0 = ic;
|
||||
|
||||
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
dst_data[0] = src[0];
|
||||
dst_data[1] = src[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
14
ggml.h
14
ggml.h
|
@ -303,7 +303,7 @@ extern "C" {
|
|||
|
||||
#if defined(__ARM_NEON) && defined(__CUDACC__)
|
||||
typedef half ggml_fp16_t;
|
||||
#elif defined(__ARM_NEON)
|
||||
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
typedef __fp16 ggml_fp16_t;
|
||||
#else
|
||||
typedef uint16_t ggml_fp16_t;
|
||||
|
@ -343,6 +343,12 @@ extern "C" {
|
|||
GGML_TYPE_COUNT,
|
||||
};
|
||||
|
||||
// precision
|
||||
enum ggml_prec {
|
||||
GGML_PREC_DEFAULT,
|
||||
GGML_PREC_F32,
|
||||
};
|
||||
|
||||
enum ggml_backend_type {
|
||||
GGML_BACKEND_CPU = 0,
|
||||
GGML_BACKEND_GPU = 10,
|
||||
|
@ -1057,6 +1063,12 @@ extern "C" {
|
|||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// change the precision of a matrix multiplication
|
||||
// set to GGML_PREC_F32 for higher precision (useful for phi-2)
|
||||
GGML_API void ggml_mul_mat_set_prec(
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_prec prec);
|
||||
|
||||
// indirect matrix multiplication
|
||||
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
|
||||
GGML_API struct ggml_tensor * ggml_mul_mat_id(
|
||||
|
|
|
@ -95,6 +95,7 @@ class MODEL_ARCH(IntEnum):
|
|||
BLOOM = auto()
|
||||
STABLELM = auto()
|
||||
QWEN = auto()
|
||||
PHI2 = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
|
@ -140,6 +141,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|||
MODEL_ARCH.BLOOM: "bloom",
|
||||
MODEL_ARCH.STABLELM: "stablelm",
|
||||
MODEL_ARCH.QWEN: "qwen",
|
||||
MODEL_ARCH.PHI2: "phi2",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
|
@ -359,6 +361,17 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.PHI2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
]
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@ class TensorNameMap:
|
|||
"embeddings.word_embeddings", # bert
|
||||
"language_model.embedding.word_embeddings", # persimmon
|
||||
"wte", # gpt2
|
||||
"transformer.embd.wte", # phi2
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
|
@ -43,6 +44,7 @@ class TensorNameMap:
|
|||
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen
|
||||
"output", # llama-pth bloom
|
||||
"word_embeddings_for_head", # persimmon
|
||||
"lm_head.linear", # phi2
|
||||
),
|
||||
|
||||
# Output norm
|
||||
|
@ -55,6 +57,7 @@ class TensorNameMap:
|
|||
"transformer.norm_f", # mpt
|
||||
"ln_f", # refact bloom qwen gpt2
|
||||
"language_model.encoder.final_layernorm", # persimmon
|
||||
"lm_head.ln", # phi2
|
||||
),
|
||||
|
||||
# Rope frequencies
|
||||
|
@ -78,6 +81,7 @@ class TensorNameMap:
|
|||
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln1", # yi
|
||||
"h.{bid}.ln_1", # gpt2
|
||||
"transformer.h.{bid}.ln", # phi2
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
|
@ -94,6 +98,7 @@ class TensorNameMap:
|
|||
"h.{bid}.self_attention.query_key_value", # bloom
|
||||
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
|
||||
"h.{bid}.attn.c_attn", # gpt2
|
||||
"transformer.h.{bid}.mixer.Wqkv", # phi2
|
||||
),
|
||||
|
||||
# Attention query
|
||||
|
@ -133,6 +138,7 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
|
||||
"h.{bid}.attn.c_proj", # gpt2
|
||||
"transformer.h.{bid}.mixer.out_proj", # phi2
|
||||
),
|
||||
|
||||
# Rotary embeddings
|
||||
|
@ -174,6 +180,7 @@ class TensorNameMap:
|
|||
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
|
||||
"transformer.h.{bid}.mlp.w1", # qwen
|
||||
"h.{bid}.mlp.c_fc", # gpt2
|
||||
"transformer.h.{bid}.mlp.fc1", # phi2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_EXP: (
|
||||
|
@ -206,6 +213,7 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.mlp.fc_out", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
|
||||
"h.{bid}.mlp.c_proj", # gpt2
|
||||
"transformer.h.{bid}.mlp.fc2", # phi2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
|
|
|
@ -109,8 +109,10 @@ class SpecialVocab:
|
|||
return True
|
||||
|
||||
def _set_special_token(self, typ: str, tid: Any) -> None:
|
||||
if not isinstance(tid, int) or tid < 0:
|
||||
if not isinstance(tid, int):
|
||||
return
|
||||
if tid < 0:
|
||||
raise ValueError(f'invalid value for special token type {typ}: {tid}')
|
||||
if self.n_vocab is None or tid < self.n_vocab:
|
||||
if typ in self.special_token_ids:
|
||||
return
|
||||
|
|
503
llama.cpp
503
llama.cpp
|
@ -195,6 +195,7 @@ enum llm_arch {
|
|||
LLM_ARCH_BLOOM,
|
||||
LLM_ARCH_STABLELM,
|
||||
LLM_ARCH_QWEN,
|
||||
LLM_ARCH_PHI2,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
|
@ -212,6 +213,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
|||
{ LLM_ARCH_BLOOM, "bloom" },
|
||||
{ LLM_ARCH_STABLELM, "stablelm" },
|
||||
{ LLM_ARCH_QWEN, "qwen" },
|
||||
{ LLM_ARCH_PHI2, "phi2" },
|
||||
};
|
||||
|
||||
enum llm_kv {
|
||||
|
@ -559,6 +561,19 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_PHI2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
|
@ -1430,6 +1445,7 @@ struct llama_model {
|
|||
struct ggml_tensor * output_norm;
|
||||
struct ggml_tensor * output_norm_b;
|
||||
struct ggml_tensor * output;
|
||||
struct ggml_tensor * output_b;
|
||||
|
||||
std::vector<llama_layer> layers;
|
||||
|
||||
|
@ -1515,6 +1531,10 @@ struct llama_context {
|
|||
|
||||
// decode output (2-dimensional array: [n_tokens][n_vocab])
|
||||
std::vector<float> logits;
|
||||
#ifndef NDEBUG
|
||||
// guard against access to unset logits
|
||||
std::vector<bool> logits_valid;
|
||||
#endif
|
||||
bool logits_all = false;
|
||||
|
||||
// input embedding (1-dimensional array: [n_embd])
|
||||
|
@ -1943,7 +1963,7 @@ namespace GGUFMeta {
|
|||
target = override->bool_value;
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
template<typename OT>
|
||||
|
@ -2403,25 +2423,25 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|||
|
||||
switch (ftype) {
|
||||
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
||||
case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
|
||||
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
||||
return "mostly Q4_1, some F16";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
|
||||
return "Q4_1, some F16";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
|
||||
|
||||
// K-quants
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
|
||||
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
|
@ -2540,6 +2560,7 @@ static void llm_load_hparams(
|
|||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 22: model.type = e_model::MODEL_1B; break;
|
||||
case 26: model.type = e_model::MODEL_3B; break;
|
||||
case 32: model.type = e_model::MODEL_7B; break;
|
||||
case 40: model.type = e_model::MODEL_13B; break;
|
||||
|
@ -2641,6 +2662,15 @@ static void llm_load_hparams(
|
|||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_PHI2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 32: model.type = e_model::MODEL_3B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
|
@ -3001,7 +3031,7 @@ static void llm_load_tensors(
|
|||
|
||||
(void) main_gpu;
|
||||
|
||||
enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
|
||||
enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
|
||||
enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
|
@ -3644,6 +3674,73 @@ static void llm_load_tensors(
|
|||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_PHI2:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
|
||||
// output
|
||||
{
|
||||
ggml_backend_type backend_norm;
|
||||
ggml_backend_type backend_output;
|
||||
|
||||
if (n_gpu_layers > int(n_layer)) {
|
||||
backend_norm = llama_backend_offload;
|
||||
backend_output = llama_backend_offload;
|
||||
} else {
|
||||
backend_norm = GGML_BACKEND_CPU;
|
||||
backend_output = GGML_BACKEND_CPU;
|
||||
}
|
||||
|
||||
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
||||
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
|
||||
|
||||
if (backend_norm == GGML_BACKEND_GPU) {
|
||||
vram_weights += ggml_nbytes(model.output_norm);
|
||||
vram_weights += ggml_nbytes(model.output_norm_b);
|
||||
vram_weights += ggml_nbytes(model.output);
|
||||
vram_weights += ggml_nbytes(model.output_b);
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
||||
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
|
||||
|
||||
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
||||
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
vram_weights +=
|
||||
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
|
||||
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
|
||||
ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
|
||||
ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) +
|
||||
ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b);
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
|
@ -3716,7 +3813,6 @@ static void llm_load_tensors(
|
|||
}
|
||||
}
|
||||
} break;
|
||||
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
|
@ -4077,6 +4173,7 @@ static struct ggml_tensor * llm_build_ffn(
|
|||
// if max_alibi_bias > 0 then apply ALiBi
|
||||
static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_context * ctx,
|
||||
const llama_model & model,
|
||||
const llama_hparams & hparams,
|
||||
const llama_kv_cache & kv,
|
||||
struct ggml_tensor * wo,
|
||||
|
@ -4088,6 +4185,7 @@ static struct ggml_tensor * llm_build_kqv(
|
|||
int32_t n_tokens,
|
||||
int32_t n_kv,
|
||||
float max_alibi_bias,
|
||||
float scale,
|
||||
const llm_build_cb & cb,
|
||||
int il) {
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
@ -4110,6 +4208,12 @@ static struct ggml_tensor * llm_build_kqv(
|
|||
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
||||
cb(kq, "kq", il);
|
||||
|
||||
if (model.arch == LLM_ARCH_PHI2) {
|
||||
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
||||
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
||||
}
|
||||
|
||||
if (max_alibi_bias > 0.0f) {
|
||||
// temporary branch until we figure out how to handle ggml_alibi through ggml_add
|
||||
kq = ggml_scale(ctx, kq, kq_scale);
|
||||
|
@ -4129,7 +4233,7 @@ static struct ggml_tensor * llm_build_kqv(
|
|||
kq = ggml_soft_max(ctx, kq);
|
||||
cb(kq, "kq_soft_max", il);
|
||||
} else {
|
||||
kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
|
||||
kq = ggml_soft_max_ext(ctx, kq, kq_mask, scale);
|
||||
cb(kq, "kq_soft_max_ext", il);
|
||||
}
|
||||
|
||||
|
@ -4336,9 +4440,9 @@ struct llm_build_context {
|
|||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
|
@ -4519,9 +4623,9 @@ struct llm_build_context {
|
|||
// apply ALiBi for 13B model
|
||||
const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
|
||||
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il);
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
|
@ -4643,9 +4747,9 @@ struct llm_build_context {
|
|||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
|
@ -4743,9 +4847,9 @@ struct llm_build_context {
|
|||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
|
@ -4952,9 +5056,9 @@ struct llm_build_context {
|
|||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
// TODO: not tested, could be broken
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
||||
Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
|
@ -5043,9 +5147,9 @@ struct llm_build_context {
|
|||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
|
@ -5140,9 +5244,9 @@ struct llm_build_context {
|
|||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
|
@ -5234,9 +5338,9 @@ struct llm_build_context {
|
|||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il);
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
|
@ -5347,9 +5451,9 @@ struct llm_build_context {
|
|||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
|
@ -5406,15 +5510,15 @@ struct llm_build_context {
|
|||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_scale
|
||||
struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||
cb(KQ_scale, "KQ_scale", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
|
@ -5464,9 +5568,9 @@ struct llm_build_context {
|
|||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
|
@ -5510,6 +5614,122 @@ struct llm_build_context {
|
|||
|
||||
return gf;
|
||||
}
|
||||
struct ggml_cgraph * build_phi2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * attn_norm_output;
|
||||
struct ggml_tensor * ffn_output;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// Q_scale
|
||||
struct ggml_tensor * Q_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||
cb(Q_scale, "Q_scale", -1);
|
||||
|
||||
// KQ_scale
|
||||
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||
cb(KQ_scale, "KQ_scale", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm,
|
||||
model.layers[il].attn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(attn_norm_output, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
||||
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
||||
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Qcur = ggml_scale(ctx0, Qcur, Q_scale);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
// FF
|
||||
{
|
||||
ffn_output = llm_build_ffn(ctx0, attn_norm_output,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(ffn_output, "ffn_out", il);
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_output);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.output_norm,
|
||||
model.output_norm_b,
|
||||
LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output_no_bias", -1);
|
||||
|
||||
cur = ggml_add(ctx0, cur, model.output_b);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
struct ggml_cgraph * build_gpt2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
|
@ -5565,9 +5785,9 @@ struct llm_build_context {
|
|||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
|
@ -5621,7 +5841,7 @@ enum llm_offload_func_e {
|
|||
OFFLOAD_FUNC_FRC, // force offload
|
||||
OFFLOAD_FUNC_KQV,
|
||||
OFFLOAD_FUNC_NR,
|
||||
OFFLOAD_FUNC_EMB,
|
||||
OFFLOAD_FUNC_EMB, // embeddings
|
||||
OFFLOAD_FUNC_OUT,
|
||||
};
|
||||
|
||||
|
@ -5706,6 +5926,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
|
|||
{ "pos_embd", OFFLOAD_FUNC_NR },
|
||||
|
||||
{ "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
|
||||
{ "Q_scale", OFFLOAD_FUNC_FRC },
|
||||
{ "KQ_scale", OFFLOAD_FUNC_FRC },
|
||||
{ "KQ_mask", OFFLOAD_FUNC_FRC },
|
||||
{ "K_shift", OFFLOAD_FUNC_FRC },
|
||||
|
@ -5790,6 +6011,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
|
|||
{ "l_out", OFFLOAD_FUNC },
|
||||
|
||||
{ "result_norm", OFFLOAD_FUNC_EMB },
|
||||
{ "result_output_no_bias", OFFLOAD_FUNC_EMB },
|
||||
{ "result_output", OFFLOAD_FUNC_OUT },
|
||||
};
|
||||
|
||||
|
@ -5807,6 +6029,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|||
bool alloc_inp_tokens = false;
|
||||
bool alloc_inp_embd = false;
|
||||
bool alloc_inp_pos = false;
|
||||
bool alloc_inp_Q_scale = false;
|
||||
bool alloc_inp_KQ_scale = false;
|
||||
bool alloc_inp_KQ_mask = false;
|
||||
bool alloc_inp_K_shift = false;
|
||||
|
@ -5874,7 +6097,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|||
alloc_inp_pos = true;
|
||||
}
|
||||
|
||||
if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) {
|
||||
if (!alloc_inp_Q_scale && strcmp(name, "Q_scale") == 0) {
|
||||
ggml_allocr_alloc(lctx.alloc, cur);
|
||||
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
|
@ -5882,6 +6105,23 @@ static struct ggml_cgraph * llama_build_graph(
|
|||
ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head)));
|
||||
}
|
||||
|
||||
alloc_inp_Q_scale = true;
|
||||
}
|
||||
|
||||
if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) {
|
||||
ggml_allocr_alloc(lctx.alloc, cur);
|
||||
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
const int64_t n_embd_head = model.hparams.n_embd_head();
|
||||
if (model.arch == LLM_ARCH_PHI2) {
|
||||
// with phi2, we scale the Q to avoid precision issues
|
||||
// ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
|
||||
ggml_set_f32(cur, 1.0f);
|
||||
} else {
|
||||
ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head)));
|
||||
}
|
||||
}
|
||||
|
||||
alloc_inp_KQ_scale = true;
|
||||
}
|
||||
|
||||
|
@ -6106,6 +6346,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|||
{
|
||||
result = llm.build_qwen();
|
||||
} break;
|
||||
case LLM_ARCH_PHI2:
|
||||
{
|
||||
result = llm.build_phi2();
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
result = llm.build_gpt2();
|
||||
|
@ -6243,12 +6487,16 @@ static int llama_decode_internal(
|
|||
|
||||
ggml_allocr_alloc_graph(lctx.alloc, gf);
|
||||
|
||||
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
||||
// the output is always the last tensor in the graph
|
||||
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
||||
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
|
||||
|
||||
// the embeddings could be the second to last tensor, or the third to last tensor
|
||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
||||
|
||||
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
|
||||
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
||||
|
||||
if (strcmp(embeddings->name, "result_norm") != 0) {
|
||||
embeddings = gf->nodes[gf->n_nodes - 3];
|
||||
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
for (int i = 0; i < gf->n_leafs; i++) {
|
||||
|
@ -6343,6 +6591,14 @@ static int llama_decode_internal(
|
|||
{
|
||||
auto & logits_out = lctx.logits;
|
||||
|
||||
#ifndef NDEBUG
|
||||
auto & logits_valid = lctx.logits_valid;
|
||||
logits_valid.clear();
|
||||
logits_valid.resize(n_tokens);
|
||||
|
||||
logits_out.clear();
|
||||
#endif
|
||||
|
||||
if (batch.logits) {
|
||||
logits_out.resize(n_vocab * n_tokens);
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
|
@ -6350,13 +6606,22 @@ static int llama_decode_internal(
|
|||
continue;
|
||||
}
|
||||
memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
|
||||
#ifndef NDEBUG
|
||||
logits_valid[i] = true;
|
||||
#endif
|
||||
}
|
||||
} else if (lctx.logits_all) {
|
||||
logits_out.resize(n_vocab * n_tokens);
|
||||
memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
|
||||
#ifndef NDEBUG
|
||||
std::fill(logits_valid.begin(), logits_valid.end(), true);
|
||||
#endif
|
||||
} else {
|
||||
logits_out.resize(n_vocab);
|
||||
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
|
||||
#ifndef NDEBUG
|
||||
logits_valid[0] = true;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -8819,53 +9084,60 @@ static int llama_apply_lora_from_file_internal(
|
|||
|
||||
const int64_t t_start_lora_us = ggml_time_us();
|
||||
|
||||
auto fin = std::ifstream(path_lora, std::ios::binary);
|
||||
if (!fin) {
|
||||
LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
|
||||
return 1;
|
||||
}
|
||||
llama_file fin(path_lora, "rb");
|
||||
|
||||
// verify magic and version
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
uint32_t format_version;
|
||||
fin.read((char *) &format_version, sizeof(format_version));
|
||||
uint32_t magic = fin.read_u32();
|
||||
if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
||||
LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
uint32_t format_version = fin.read_u32();
|
||||
if (format_version != 1) {
|
||||
LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t lora_r;
|
||||
int32_t lora_alpha;
|
||||
fin.read((char *) &lora_r, sizeof(lora_r));
|
||||
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
|
||||
int32_t lora_r = fin.read_u32();
|
||||
int32_t lora_alpha = fin.read_u32();
|
||||
float scaling = scale * (float)lora_alpha / (float)lora_r;
|
||||
|
||||
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
||||
|
||||
// create a name -> tensor map of the model to accelerate lookups
|
||||
// find the max tensor size to estimate the required temporary buffer size
|
||||
size_t max_tensor_size = 0;
|
||||
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
||||
for (const auto & kv : model.tensors_by_name) {
|
||||
model_tensors.insert(kv);
|
||||
size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
|
||||
max_tensor_size = std::max(max_tensor_size, f32_size);
|
||||
}
|
||||
|
||||
// create a temporary ggml context to store the lora tensors
|
||||
// todo: calculate size from biggest possible tensor
|
||||
std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
|
||||
// TODO: use ggml-alloc
|
||||
size_t lora_ctx_size = max_tensor_size * 3;
|
||||
LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
|
||||
std::vector<uint8_t> lora_buf(lora_ctx_size);
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = lora_buf.size();
|
||||
params.mem_buffer = lora_buf.data();
|
||||
params.no_alloc = false;
|
||||
|
||||
ggml_context * lora_ctx = ggml_init(params);
|
||||
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
||||
using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
|
||||
|
||||
// create a name -> tensor map of the model to accelerate lookups
|
||||
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
||||
for (const auto & kv : model.tensors_by_name) {
|
||||
model_tensors.insert(kv);
|
||||
}
|
||||
unique_context lora_ctx(nullptr, ggml_free);
|
||||
lora_ctx.reset(ggml_init(params));
|
||||
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
||||
|
||||
// load base model
|
||||
std::unique_ptr<llama_model_loader> ml;
|
||||
ggml_context * base_ctx = NULL;
|
||||
|
||||
unique_context base_ctx(nullptr, ggml_free);
|
||||
std::vector<uint8_t> base_buf;
|
||||
if (path_base_model) {
|
||||
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
||||
|
@ -8874,6 +9146,7 @@ static int llama_apply_lora_from_file_internal(
|
|||
size_t ctx_size;
|
||||
size_t mmapped_size;
|
||||
ml->calc_sizes(ctx_size, mmapped_size);
|
||||
|
||||
base_buf.resize(ctx_size);
|
||||
|
||||
ggml_init_params base_params;
|
||||
|
@ -8881,9 +9154,9 @@ static int llama_apply_lora_from_file_internal(
|
|||
base_params.mem_buffer = base_buf.data();
|
||||
base_params.no_alloc = ml->use_mmap;
|
||||
|
||||
base_ctx = ggml_init(base_params);
|
||||
base_ctx.reset(ggml_init(base_params));
|
||||
|
||||
// maybe this should in llama_model_loader
|
||||
// maybe this should be in llama_model_loader
|
||||
if (ml->use_mmap) {
|
||||
ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
|
||||
}
|
||||
|
@ -8896,27 +9169,35 @@ static int llama_apply_lora_from_file_internal(
|
|||
std::vector<uint8_t> work_buffer;
|
||||
|
||||
while (true) {
|
||||
if (fin.tell() == fin.size) {
|
||||
// eof
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t name_len;
|
||||
int32_t ftype;
|
||||
|
||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
fin.read_raw(&n_dims, sizeof(n_dims));
|
||||
fin.read_raw(&name_len, sizeof(name_len));
|
||||
fin.read_raw(&ftype, sizeof(ftype));
|
||||
|
||||
if (n_dims != 1 && n_dims != 2) {
|
||||
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
fin.read_raw(&ne[i], sizeof(ne[i]));
|
||||
}
|
||||
|
||||
std::string name;
|
||||
{
|
||||
GGML_ASSERT(name_len <= 1024);
|
||||
char buf[1024];
|
||||
fin.read(buf, length);
|
||||
name = std::string(buf, length);
|
||||
fin.read_raw(buf, name_len);
|
||||
name = std::string(buf, name_len);
|
||||
}
|
||||
|
||||
// check for lora suffix and get the type of tensor
|
||||
|
@ -8930,7 +9211,7 @@ static int llama_apply_lora_from_file_internal(
|
|||
std::string lora_type = name.substr(pos + lora_suffix.length());
|
||||
std::string base_name = name;
|
||||
base_name.erase(pos);
|
||||
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
|
||||
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
|
||||
|
||||
if (model_tensors.find(base_name) == model_tensors.end()) {
|
||||
LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
||||
|
@ -8949,22 +9230,15 @@ static int llama_apply_lora_from_file_internal(
|
|||
return false;
|
||||
}
|
||||
}
|
||||
ggml_tensor * lora_tensor;
|
||||
if (n_dims == 2) {
|
||||
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
|
||||
}
|
||||
else {
|
||||
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
||||
return 1;
|
||||
}
|
||||
ggml_set_name(lora_tensor, "lora_tensor");
|
||||
ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
|
||||
ggml_set_name(lora_tensor, name.c_str());
|
||||
|
||||
// load tensor data
|
||||
size_t offset = fin.tellg();
|
||||
size_t offset = fin.tell();
|
||||
size_t tensor_data_size = ggml_nbytes(lora_tensor);
|
||||
offset = (offset + 31) & -32;
|
||||
fin.seekg(offset);
|
||||
fin.read((char*)lora_tensor->data, tensor_data_size);
|
||||
fin.seek(offset, SEEK_SET);
|
||||
fin.read_raw(lora_tensor->data, tensor_data_size);
|
||||
|
||||
lora_tensors[name] = lora_tensor;
|
||||
|
||||
|
@ -8994,13 +9268,11 @@ static int llama_apply_lora_from_file_internal(
|
|||
|
||||
// load from base model
|
||||
if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
|
||||
// TODO: throw
|
||||
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
// TODO: not tested!! maybe not working!
|
||||
base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
|
||||
base_t = ml->create_tensor(base_ctx.get(), base_name, { dest_t->ne[0], dest_t->ne[1] }, GGML_BACKEND_CPU);
|
||||
ml->load_data_for(base_t);
|
||||
} else {
|
||||
base_t = dest_t;
|
||||
|
@ -9029,43 +9301,45 @@ static int llama_apply_lora_from_file_internal(
|
|||
}
|
||||
|
||||
// w = w + BA*s
|
||||
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
||||
ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
|
||||
offload_func(BA);
|
||||
ggml_set_name(BA, "BA");
|
||||
|
||||
if (scaling != 1.0f) {
|
||||
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
|
||||
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx.get(), scaling);
|
||||
ggml_set_name(scale_tensor, "scale_tensor");
|
||||
|
||||
BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
|
||||
BA = ggml_scale_inplace(lora_ctx.get(), BA, scale_tensor);
|
||||
offload_func(BA);
|
||||
ggml_set_name(BA, "BA_scaled");
|
||||
}
|
||||
|
||||
ggml_tensor * r;
|
||||
if (base_t == dest_t) {
|
||||
r = ggml_add_inplace(lora_ctx, dest_t, BA);
|
||||
r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
|
||||
offload_func_force_inplace(r);
|
||||
ggml_set_name(r, "r_add_inplace");
|
||||
}
|
||||
else {
|
||||
r = ggml_add(lora_ctx, base_t, BA);
|
||||
r = ggml_add(lora_ctx.get(), base_t, BA);
|
||||
offload_func(r);
|
||||
ggml_set_name(r, "r_add");
|
||||
|
||||
r = ggml_cpy(lora_ctx, r, dest_t);
|
||||
r = ggml_cpy(lora_ctx.get(), r, dest_t);
|
||||
offload_func(r);
|
||||
ggml_set_name(r, "r_cpy");
|
||||
}
|
||||
|
||||
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
|
||||
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
|
||||
ggml_build_forward_expand(gf, r);
|
||||
|
||||
ggml_graph_compute_helper(work_buffer, gf, n_threads);
|
||||
|
||||
// the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
|
||||
GGML_ASSERT(lora_tensors.size() == 2);
|
||||
|
||||
// we won't need these tensors again, reset the context to save memory
|
||||
ggml_free(lora_ctx);
|
||||
lora_ctx = ggml_init(params);
|
||||
lora_ctx.reset(ggml_init(params));
|
||||
lora_tensors.clear();
|
||||
|
||||
n_tensors++;
|
||||
|
@ -9075,12 +9349,6 @@ static int llama_apply_lora_from_file_internal(
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: this should be in a destructor, it will leak on failure
|
||||
ggml_free(lora_ctx);
|
||||
if (base_ctx) {
|
||||
ggml_free(base_ctx);
|
||||
}
|
||||
|
||||
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
||||
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
|
||||
|
||||
|
@ -10245,6 +10513,7 @@ float * llama_get_logits(struct llama_context * ctx) {
|
|||
}
|
||||
|
||||
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
||||
assert(ctx->logits_valid.at(i));
|
||||
return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
|
||||
}
|
||||
|
||||
|
|
1
llama.h
1
llama.h
|
@ -39,6 +39,7 @@
|
|||
|
||||
#define LLAMA_MAX_RNG_STATE (64*1024)
|
||||
|
||||
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
|
|
|
@ -1555,6 +1555,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512)); // neox (falcon 40B)
|
||||
test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512)); // neox (falcon 40B)
|
||||
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512)); // neox (stablelm)
|
||||
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512)); // neox (phi-2)
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_alibi());
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue