goerch 2023-08-17 04:55:26 +02:00
commit d864596e0a
29 changed files with 2506 additions and 8197 deletions

View file

@@ -529,7 +529,6 @@ endif()
 add_library(llama
             llama.cpp
             llama.h
-            llama-util.h
             )

 target_include_directories(llama PUBLIC .)

View file

@ -1,5 +1,5 @@
# Define the default target now so that it is always the first target # Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf gguf-llama-simple gptneox-main BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf gptneox-main
# Binaries only useful for tests # Binaries only useful for tests
TEST_TARGETS = tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0 TEST_TARGETS = tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
@ -329,10 +329,7 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
OBJS += ggml-alloc.o OBJS += ggml-alloc.o
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
gguf-llama.o: gguf-llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h gguf-llama.h gguf-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
common.o: examples/common.cpp examples/common.h common.o: examples/common.cpp examples/common.h
@ -388,10 +385,7 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
gguf: examples/gguf/gguf.cpp build-info.h ggml.o gguf-llama.o $(OBJS) gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
gguf-llama-simple: examples/gguf/gguf-llama-simple.cpp build-info.h ggml.o gguf-llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
gptneox-main: gptneox-main.cpp ggml.o $(OBJS) gptneox-main: gptneox-main.cpp ggml.o $(OBJS)

View file

@@ -1,50 +0,0 @@
GGUF_MAGIC = 0x47475546
GGUF_VERSION = 1
GGUF_DEFAULT_ALIGNMENT = 32
# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_ALIGNMENT = "general.alignment"
KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_FILE_TYPE = "general.file_type"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
# LLM
KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length"
KEY_LLM_BLOCK_COUNT = "{llm}.block_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout"
# attention
KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{llm}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS = "{llm}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{llm}.attention.layer_norm_rms_epsilon"
# RoPE
KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count"
KEY_ROPE_SCALE = "{llm}.rope.scale"
# tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
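The per-architecture keys in the deleted file above are stored as "{llm}" format templates rather than finished strings; a writer is expected to substitute the architecture name before emitting the key. A minimal sketch of that substitution in Python (the "llama"/"gptneox" values and the expand_key helper are illustrative, not part of this commit):

KEY_LLM_CONTEXT_LENGTH   = "{llm}.context_length"
KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count"

def expand_key(template: str, arch: str) -> str:
    # "{llm}.context_length" formatted with "llama" becomes "llama.context_length"
    return template.format(llm=arch)

print(expand_key(KEY_LLM_CONTEXT_LENGTH, "llama"))      # llama.context_length
print(expand_key(KEY_ATTENTION_HEAD_COUNT, "gptneox"))  # gptneox.attention.head_count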

View file

@@ -1,15 +1,15 @@
 # HF gptneox--> gguf conversion

 import gguf
-import gguf_namemap as tmap
 import os
 import sys
 import struct
 import json
 import numpy as np
+import torch
 from typing import Any, List
 from pathlib import Path
-import torch
 from transformers import AutoTokenizer

 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -188,7 +188,7 @@ if Path(dir_model + "/tokenizer.json").is_file():

 # TENSORS

-tensor_map = tmap.get_tensor_namemap(block_count)
+tensor_map = gguf.get_tensor_name_map(block_count)

 # tensor info
 print("gguf: get tensor metadata")

View file

@@ -3,18 +3,17 @@
 # HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model

 import gguf
-import gguf_namemap as tmap
 import os
 import sys
 import struct
 import json
 import numpy as np
 import torch
 from typing import Any, List
 from pathlib import Path
 from sentencepiece import SentencePieceProcessor

 #NDArray = np.ndarray[Any, Any]
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@@ -96,6 +95,7 @@ gguf_writer.add_architecture(llm_arch)
 gguf_writer.add_name(last_dir)
 gguf_writer.add_file_type( "All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
 gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout(llm_arch, "Meta AI original pth")
 gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"])
 gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"])
 gguf_writer.add_block_count(llm_arch, block_count)
@@ -188,7 +188,7 @@ if Path(dir_model + "/tokenizer.json").is_file():

 # TENSORS

-tensor_map = tmap.get_tensor_namemap(block_count)
+tensor_map = gguf.get_tensor_name_map(block_count)

 # tensor info
 print("gguf: get tensor metadata")
@@ -260,7 +260,6 @@ for part_name in part_names:
     for name in model_part.keys():
         data = model_part[name]
         old_dtype = data.dtype

         # we don't need these

View file

@@ -1,8 +1,6 @@
 # HF llama --> gguf conversion

 import gguf
-import gguf_namemap as tmap
 import os
 import sys
 import struct
@@ -18,7 +16,9 @@ from sentencepiece import SentencePieceProcessor
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'

-def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+# reverse HF permute back to original pth layout
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
+def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
     if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
             .swapaxes(1, 2)
@@ -93,11 +93,21 @@ if "_name_or_path" in hparams:
 else:
     hf_repo=""

+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+else:
+    print("gguf: can not find ctx length parameter.")
+    sys.exit()
+
 gguf_writer.add_architecture(llm_arch)
 gguf_writer.add_name(last_dir)
 gguf_writer.add_file_type("All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
 gguf_writer.add_source_hf_repo(hf_repo)
-gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"])
+gguf_writer.add_tensor_data_layout(llm_arch, "Meta AI original pth")
+gguf_writer.add_context_length(llm_arch, ctx_length)
 gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"])
 gguf_writer.add_block_count(llm_arch, block_count)
 gguf_writer.add_feed_forward_length(llm_arch, hparams["intermediate_size"])
@@ -189,7 +199,7 @@ if Path(dir_model + "/tokenizer.json").is_file():

 # TENSORS

-tensor_map = tmap.get_tensor_namemap(block_count)
+tensor_map = gguf.get_tensor_name_map(block_count)

 # tensor info
 print("gguf: get tensor metadata")
@@ -218,9 +228,9 @@ for part_name in part_names:
         data = data.squeeze().numpy()

-        # permute these
+        # reverse permute these
         if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
-            data = permute(data, head_count, head_count_kv)
+            data = reverse_hf_permute(data, head_count, head_count_kv)

         # map tensor names
         if name.endswith(".weight") and name[:-7] in tensor_map:
@@ -287,9 +297,9 @@ for part_name in part_names:
         data = data.squeeze().numpy()

-        # permute these
+        # reverse permute these
         if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
-            data = permute(data, head_count, head_count_kv)
+            data = reverse_hf_permute(data, head_count, head_count_kv)

         # map tensor names
         if name.endswith(".weight") and name[:-7] in tensor_map:
@@ -315,7 +325,7 @@ for part_name in part_names:
             if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                 data = data.astype(np.float16)

-            print( name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+            print(name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype))

             gguf_writer.write_tensor_to_file(data)
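The converter above renames permute to reverse_hf_permute because the Q/K projection weights are written back in the original Meta pth layout instead of the interleaved layout produced by the Hugging Face checkpoint conversion. A self-contained sketch of the same reshape/swapaxes shuffle on a dummy weight matrix (the toy shapes are illustrative only, not taken from this commit):

import numpy as np

def reverse_hf_permute(weights, n_head, n_kv_head=None):
    # Same axis shuffle as in the script above: undo the HF head interleaving.
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

# Toy example: 4 heads, 8 rows total; the row order changes, the shape does not.
w = np.arange(8 * 3, dtype=np.float32).reshape(8, 3)
assert reverse_hf_permute(w, n_head=4).shape == w.shape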

View file

@@ -104,7 +104,7 @@ TENSORS_SET = set(TENSORS_LIST)

 def find_n_mult(n_ff: int, n_embd: int) -> int:
     # hardcoded magic range
-    for n_mult in range(256, 1, -1):
+    for n_mult in range(8192, 1, -1):
         calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
         if calc_ff == n_ff:
             return n_mult
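Widening the search range matters for the larger LLaMA-2 checkpoints: with the previous upper bound of 256 there is no n_mult that reproduces their feed-forward width. A quick check, assuming the published 70B sizes of n_embd = 8192 and n_ff = 28672 (the function body is the one shown in the hunk above):

def find_n_mult(n_ff: int, n_embd: int) -> int:
    for n_mult in range(8192, 1, -1):
        calc_ff = (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult
        if calc_ff == n_ff:
            return n_mult
    raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")

# LLaMA-2 70B: the first (largest) matching multiple is 7168, unreachable with range(256, 1, -1).
print(find_n_mult(28672, 8192))  # 7168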
@@ -118,6 +118,7 @@ class Params:
     n_mult:  int
     n_head:  int
     n_layer: int
+    n_kv_head: Optional[int]  # This parameter is only used for Llama 2

     @staticmethod
     def guessed(model: 'LazyModel') -> 'Params':
@@ -144,6 +145,7 @@ class Params:
             n_mult  = 256,
             n_head  = n_head,
             n_layer = n_layer,
+            n_kv_head = None,
         )

     @staticmethod
@@ -155,6 +157,7 @@ class Params:
         n_head  = config["num_attention_heads"];
         n_layer = config["num_hidden_layers"];
         n_ff    = config["intermediate_size"];
+        n_kv_head = config.get("num_key_value_heads")

         n_mult = find_n_mult(n_ff, n_embd);
@@ -164,6 +167,7 @@ class Params:
             n_mult  = n_mult,
             n_head  = n_head,
             n_layer = n_layer,
+            n_kv_head = n_kv_head,
         )

     # LLaMA v2 70B params.json
@@ -187,6 +191,7 @@ class Params:
             n_mult  = n_mult,
             n_head  = n_head,
             n_layer = n_layer,
+            n_kv_head = None,
         )

     @staticmethod
@@ -293,7 +298,9 @@ class SentencePieceVocab:

 Vocab = Union[BpeVocab, SentencePieceVocab]

-def permute(weights: NDArray, n_head: int) -> NDArray:
+def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head //= n_kv_head
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
             .swapaxes(1, 2)
             .reshape(weights.shape))
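The extra n_kv_head argument handles grouped-query attention: the K projection has only n_kv_head groups of rows, so the interleaving has to be undone over n_head // n_kv_head groups rather than over all n_head query heads (LLaMA-2 70B uses 64 query heads and 8 KV heads). A toy illustration with made-up shapes (the head_dim of 4 is purely for illustration):

import numpy as np

def permute(weights, n_head, n_kv_head=None):
    # Same as convert.py above: shrink the group count for GQA checkpoints.
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

n_head, n_kv_head, head_dim = 64, 8, 4
wq = np.zeros((n_head * head_dim, 256), dtype=np.float32)     # query projection: 64 heads
wk = np.zeros((n_kv_head * head_dim, 256), dtype=np.float32)  # shared K projection: 8 heads
assert permute(wq, n_head).shape == wq.shape
assert permute(wk, n_head, n_kv_head).shape == wk.shape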
@@ -305,7 +312,7 @@ class Tensor(metaclass=ABCMeta):
     @abstractmethod
     def astype(self, data_type: DataType) -> 'Tensor': ...
     @abstractmethod
-    def permute(self, n_head: int) -> 'Tensor': ...
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...
     @abstractmethod
     def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
     @abstractmethod
@@ -343,8 +350,8 @@ class UnquantizedTensor(Tensor):
         r = self.ndarray.shape[0] // 3
         return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])

-    def permute(self, n_head: int) -> 'UnquantizedTensor':
-        return UnquantizedTensor(permute(self.ndarray, n_head))
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
+        return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))


 def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
@@ -367,18 +374,18 @@ GGMLCompatibleTensor = Union[UnquantizedTensor]

 class DeferredPermutedTensor(Tensor):
-    def __init__(self, base: Tensor, n_head: int) -> None:
+    def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
         self.base = base
         self.n_head = n_head
         self.data_type = self.base.data_type

     def astype(self, data_type: DataType) -> Tensor:
-        return self.base.astype(data_type).permute(self.n_head)
+        return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)

     def to_ggml(self) -> GGMLCompatibleTensor:
-        return self.base.to_ggml().permute(self.n_head)
+        return self.base.to_ggml().permute(self.n_head, self.n_kv_head)

-    def permute(self, n_head: int) -> Tensor:
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
         raise Exception("shouldn't permute twice")
@@ -474,10 +481,10 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
     return ModelPlus(model, paths, format, vocab)

-def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
+def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
     def load() -> Tensor:
-        return lazy_tensor.load().permute(n_head)
-    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
+        return lazy_tensor.load().permute(n_head, n_kv_head)
+    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description)

 def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
     def load() -> Tensor:
@@ -502,7 +509,7 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" in model:
             out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
-            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
+            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
             out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
             out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)

View file

@@ -170,18 +170,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "-gqa" || arg == "--gqa") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_gqa = std::stoi(argv[i]);
-        } else if (arg == "-eps" || arg == "--rms-norm-eps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rms_norm_eps = std::stof(argv[i]);
         } else if (arg == "--rope-freq-base") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -546,8 +534,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
     fprintf(stdout, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
     fprintf(stdout, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
     fprintf(stdout, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
@@ -638,8 +624,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.n_ctx        = params.n_ctx;
     lparams.n_batch      = params.n_batch;
-    lparams.n_gqa        = params.n_gqa;
-    lparams.rms_norm_eps = params.rms_norm_eps;
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.main_gpu     = params.main_gpu;
     lparams.tensor_split = params.tensor_split;

View file

@@ -23,14 +23,12 @@ struct gpt_params {
     int32_t n_predict                       = -1;   // new tokens to predict
     int32_t n_ctx                           = 512;  // context size
     int32_t n_batch                         = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_gqa                           = 1;    // grouped-query attention factor (TODO: move to hparams)
     int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt
     int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
     int32_t n_gpu_layers                    = 0;    // number of layers to store in VRAM
     int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
     int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
-    float   rms_norm_eps                    = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
     float   rope_freq_base                  = 10000.0f; // RoPE base frequency
     float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor

View file

@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
 #include <unordered_map>
 #include <vector>
 #include <cassert>
@@ -502,7 +503,7 @@ bool is_ggml_file(const char *filename) {
         return false;
     }
     uint32_t magic = file.read_u32();
-    return magic == LLAMA_FILE_MAGIC;
+    return magic == GGUF_MAGIC;
 }

 void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
@@ -590,75 +591,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     if (file.fp == NULL) {
         return;
     }

-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
-    // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);
-
-    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
-
-    // stuff AK weights into GG weights one by one.
-    // w->token_embedding_table -> model->tok_embeddings
-    // float*                   -> struct ggml_tensor
-    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
-    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
-
-    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
-    //print_row(model->norm, 0);
-
-    // for rms-att-weight
-    int row_length = model->hparams.n_embd;
-    const auto & hparams = model->hparams;
-    //int n_ff = model->hparams.n_embd;
-    int n_ff = get_n_ff(&hparams);
-
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
-        auto & layer = model->layers[i];
-        // 1d
-        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
-
-        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
-        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
-
-        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
-        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
-    }
-    // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output); // ?
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
-    }
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+    (void) w;
+    // // write_magic
+    // file.write_u32(LLAMA_FILE_MAGIC);   // magic
+    // file.write_u32(LLAMA_FILE_VERSION); // version
+    // // write_hparams
+    // file.write_u32(model->hparams.n_vocab);
+    // file.write_u32(model->hparams.n_embd);
+    // file.write_u32(model->hparams.n_mult);
+    // file.write_u32(model->hparams.n_head);
+    // file.write_u32(model->hparams.n_layer);
+    // file.write_u32(model->hparams.n_rot);
+    // file.write_u32(LLAMA_FTYPE_ALL_F32);
+    //
+    // // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+    // uint32_t n_vocab = model->hparams.n_vocab;
+    // for (uint32_t i = 0; i < n_vocab; i++) {
+    //     const auto & token_score = vocab->id_to_token.at(i);
+    //     file.write_u32((uint32_t) token_score.tok.size());
+    //     file.write_raw(token_score.tok.data(), token_score.tok.size());
+    //     file.write_raw(&token_score.score, sizeof(token_score.score));
+    // }
+    //
+    // // stuff AK weights into GG weights one by one.
+    // // w->token_embedding_table -> model->tok_embeddings
+    // // float*                   -> struct ggml_tensor
+    // stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    // stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
+    //
+    // stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+    // //print_row(model->norm, 0);
+    //
+    // // for rms-att-weight
+    // int row_length = model->hparams.n_embd;
+    // const auto & hparams = model->hparams;
+    // //int n_ff = model->hparams.n_embd;
+    // int n_ff = get_n_ff(&hparams);
+    //
+    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+    //     auto & layer = model->layers[i];
+    //     // 1d
+    //     stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+    //
+    //     // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+    //     stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+    //
+    //     stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+    //     stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+    // }
+    // // write tensors
+    // write_tensor(&file, model->tok_embeddings);
+    // write_tensor(&file, model->norm);
+    // write_tensor(&file, model->output); // ?
+    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+    //     auto & layer = model->layers[i];
+    //
+    //     write_tensor(&file, layer.attention_norm);
+    //     write_tensor(&file, layer.wq);
+    //     write_tensor(&file, layer.wk);
+    //     write_tensor(&file, layer.wv);
+    //     write_tensor(&file, layer.wo);
+    //     write_tensor(&file, layer.ffn_norm);
+    //     write_tensor(&file, layer.w1);
+    //     write_tensor(&file, layer.w2);
+    //     write_tensor(&file, layer.w3);
+    // }
 }

 struct train_params get_default_train_params() {

View file

@@ -1,126 +0,0 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "common.h"
#include "gguf-llama.h"
#include "build-info.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
return 1 ;
}
if (argc >= 2) {
params.model = argv[1];
}
if (argc >= 3) {
params.prompt = argv[2];
}
if (params.prompt.empty()) {
params.prompt = "Hello my name is";
}
// init LLM
llama_backend_init(params.numa);
llama_context_params ctx_params = llama_context_default_params();
llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
// tokenize the prompt
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
if ((int) tokens_list.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
return 1;
}
fprintf(stderr, "\n\n");
for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
}
fflush(stderr);
// main loop
// The LLM keeps a contextual cache memory of previous token evaluation.
// Usually, once this cache is full, it is required to recompute a compressed context based on previous
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
// example, we will just stop the loop once this cache is full or once an end of stream is detected.
while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
// evaluate the transformer
if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
tokens_list.clear();
// sample the next token
llama_token new_token_id = 0;
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
// is it an end of stream ?
if (new_token_id == llama_token_eos()) {
fprintf(stderr, " [end of text]\n");
break;
}
// print the new token :
printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
fflush(stdout);
// push this new token for next evaluation
tokens_list.push_back(new_token_id);
}
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}

View file

@@ -1,6 +1,5 @@
 #include "ggml.h"
-#include "gguf-util.h"
-#include "gguf-llama.h"
+#include "llama.h"

 #include <cstdio>
 #include <cinttypes>
@@ -21,133 +20,22 @@ static std::string to_string(const T & val) {
     return ss.str();
 }

-void gguf_ex_write_str(std::ofstream & fout, const std::string & val) {
-    const int32_t n = val.size();
-    fout.write((const char *) &n, sizeof(n));
-    fout.write(val.c_str(), n);
-}
-
-void gguf_ex_write_i32(std::ofstream & fout, int32_t val) {
-    fout.write((const char *) &val, sizeof(val));
-}
-
-void gguf_ex_write_u64(std::ofstream & fout, size_t val) {
-    fout.write((const char *) &val, sizeof(val));
-}
-
-template<typename T>
-void gguf_ex_write_val(std::ofstream & fout, const std::string & key, enum gguf_type type, const T & val) {
-    gguf_ex_write_str(fout, key);
-    fout.write((const char *) &type, sizeof(type));
-    fout.write((const char *) &val, sizeof(val));
-
-    fprintf(stdout, "%s: write param: %s = %s\n", __func__, key.c_str(), to_string(val).c_str());
-}
-
-template<>
-void gguf_ex_write_val<std::string>(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::string & val) {
-    gguf_ex_write_str(fout, key);
-    fout.write((const char *) &type, sizeof(type));
-
-    const int32_t n = val.size();
-    fout.write((const char *) &n, sizeof(n));
-    fout.write(val.c_str(), n);
-
-    fprintf(stdout, "%s: write param: %s = %s\n", __func__, key.c_str(), val.c_str());
-}
-
-template<typename T>
-void gguf_ex_write_arr(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::vector<T> & val) {
-    gguf_ex_write_str(fout, key);
-    {
-        const enum gguf_type tarr = GGUF_TYPE_ARRAY;
-        fout.write((const char *) &tarr, sizeof(tarr));
-    }
-
-    const int32_t n = val.size();
-    fout.write((const char *) &type, sizeof(type));
-    fout.write((const char *) &n, sizeof(n));
-    fout.write((const char *) val.data(), n * sizeof(T));
-
-    fprintf(stdout, "%s: write param: %s = [", __func__, key.c_str());
-    for (int i = 0; i < n; ++i) {
-        fprintf(stdout, "%s", to_string(val[i]).c_str());
-        if (i < n - 1) {
-            fprintf(stdout, ", ");
-        }
-    }
-    fprintf(stdout, "]\n");
-}
-
-template<>
-void gguf_ex_write_arr<std::string>(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::vector<std::string> & val) {
-    gguf_ex_write_str(fout, key);
-    {
-        const enum gguf_type tarr = GGUF_TYPE_ARRAY;
-        fout.write((const char *) &tarr, sizeof(tarr));
-    }
-
-    const int32_t n = val.size();
-    fout.write((const char *) &type, sizeof(type));
-    fout.write((const char *) &n, sizeof(n));
-    for (int i = 0; i < n; ++i) {
-        const int32_t nstr = val[i].size();
-        fout.write((const char *) &nstr, sizeof(nstr));
-        fout.write(val[i].c_str(), nstr);
-    }
-
-    fprintf(stdout, "%s: write param: %s = [", __func__, key.c_str());
-    for (int i = 0; i < n; ++i) {
-        fprintf(stdout, "%s", val[i].c_str());
-        if (i < n - 1) {
-            fprintf(stdout, ", ");
-        }
-    }
-    fprintf(stdout, "]\n");
-}
-
 bool gguf_ex_write(const std::string & fname) {
-    std::ofstream fout(fname.c_str(), std::ios::binary);
-
-    {
-        const int32_t magic = GGUF_MAGIC;
-        fout.write((const char *) &magic, sizeof(magic));
-    }
-
-    {
-        const int32_t version = GGUF_VERSION;
-        fout.write((const char *) &version, sizeof(version));
-    }
-
-    // NOTE: these have to match the output below!
-    const int n_tensors = 10;
-    const int n_kv      = 12;
-
-    fout.write((const char*) &n_tensors, sizeof(n_tensors));
-    fout.write((const char*) &n_kv,      sizeof(n_kv));
-
-    fprintf(stdout, "%s: write header\n", __func__);
-
-    // kv data
-    {
-        gguf_ex_write_val< uint8_t>(fout, "some.parameter.uint8",   GGUF_TYPE_UINT8,   0x12);
-        gguf_ex_write_val<  int8_t>(fout, "some.parameter.int8",    GGUF_TYPE_INT8,   -0x13);
-        gguf_ex_write_val<uint16_t>(fout, "some.parameter.uint16",  GGUF_TYPE_UINT16,  0x1234);
-        gguf_ex_write_val< int16_t>(fout, "some.parameter.int16",   GGUF_TYPE_INT16,  -0x1235);
-        gguf_ex_write_val<uint32_t>(fout, "some.parameter.uint32",  GGUF_TYPE_UINT32,  0x12345678);
-        gguf_ex_write_val< int32_t>(fout, "some.parameter.int32",   GGUF_TYPE_INT32,  -0x12345679);
-        gguf_ex_write_val<float>   (fout, "some.parameter.float32", GGUF_TYPE_FLOAT32, 0.123456789f);
-        gguf_ex_write_val<bool>    (fout, "some.parameter.bool",    GGUF_TYPE_BOOL,    true);
-        gguf_ex_write_val<std::string>(fout, "some.parameter.string", GGUF_TYPE_STRING, "hello world");
-
-        gguf_ex_write_arr<int16_t>    (fout, "some.parameter.arr.i16", GGUF_TYPE_INT16,   { 1, 2, 3, 4, });
-        gguf_ex_write_arr<float>      (fout, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, { 3.145f, 2.718f, 1.414f, });
-        gguf_ex_write_arr<std::string>(fout, "some.parameter.arr.str", GGUF_TYPE_STRING,  { "hello", "world", "!" });
-    }
-
-    uint64_t offset_tensor = 0;
+    struct gguf_context * ctx = gguf_init_empty();
+
+    gguf_set_val_u8  (ctx, "some.parameter.uint8",    0x12);
+    gguf_set_val_i8  (ctx, "some.parameter.int8",    -0x13);
+    gguf_set_val_u16 (ctx, "some.parameter.uint16",   0x1234);
+    gguf_set_val_i16 (ctx, "some.parameter.int16",   -0x1235);
+    gguf_set_val_u32 (ctx, "some.parameter.uint32",   0x12345678);
+    gguf_set_val_i32 (ctx, "some.parameter.int32",   -0x12345679);
+    gguf_set_val_f32 (ctx, "some.parameter.float32",  0.123456789f);
+    gguf_set_val_bool(ctx, "some.parameter.bool",     true);
+    gguf_set_val_str (ctx, "some.parameter.string",   "hello world");
+
+    gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16,   std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
+    gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
+    gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);

     struct ggml_init_params params = {
         /*.mem_size   =*/ 128ull*1024ull*1024ull,
@@ -157,6 +45,8 @@ bool gguf_ex_write(const std::string & fname) {

     struct ggml_context * ctx_data = ggml_init(params);

+    const int n_tensors = 10;
+
     // tensor infos
     for (int i = 0; i < n_tensors; ++i) {
         const std::string name = "tensor_" + to_string(i);
@@ -178,58 +68,15 @@ bool gguf_ex_write(const std::string & fname) {
             }
         }

-        fprintf(stdout, "%s: tensor: %s, %d dims, ne = [", __func__, name.c_str(), n_dims);
-        for (int j = 0; j < 4; ++j) {
-            fprintf(stdout, "%s%3d", j == 0 ? "" : ", ", (int) cur->ne[j]);
-        }
-        fprintf(stdout, "], offset_tensor = %6" PRIu64 "\n", offset_tensor);
-
-        gguf_ex_write_str(fout, name);
-        gguf_ex_write_i32(fout, n_dims);
-        for (int j = 0; j < n_dims; ++j) {
-            gguf_ex_write_i32(fout, cur->ne[j]);
-        }
-        gguf_ex_write_i32(fout, cur->type);
-        gguf_ex_write_u64(fout, offset_tensor);
-
-        offset_tensor += GGML_PAD(ggml_nbytes(cur), GGUF_DEFAULT_ALIGNMENT);
+        gguf_add_tensor(ctx, cur);
     }

-    const uint64_t offset_data = GGML_PAD((uint64_t) fout.tellp(), GGUF_DEFAULT_ALIGNMENT);
-
-    fprintf(stdout, "%s: data offset = %" PRIu64 "\n", __func__, offset_data);
-
-    {
-        const size_t pad = offset_data - fout.tellp();
-
-        for (size_t j = 0; j < pad; ++j) {
-            fout.put(0);
-        }
-    }
-
-    for (int i = 0; i < n_tensors; ++i) {
-        fprintf(stdout, "%s: writing tensor %d data\n", __func__, i);
-
-        const std::string name = "tensor_" + to_string(i);
-
-        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str());
-
-        fout.write((const char *) cur->data, ggml_nbytes(cur));
-
-        {
-            const size_t pad = GGML_PAD(ggml_nbytes(cur), GGUF_DEFAULT_ALIGNMENT) - ggml_nbytes(cur);
-
-            for (size_t j = 0; j < pad; ++j) {
-                fout.put(0);
-            }
-        }
-    }
-
-    fout.close();
+    gguf_write_to_file(ctx, fname.c_str(), false);

     fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());

     ggml_free(ctx_data);
+    gguf_free(ctx);

     return true;
 }
@@ -345,8 +192,16 @@ bool gguf_ex_read_1(const std::string & fname) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

-        fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n",
-                __func__, i, cur->n_dims, cur->name, cur->data);
+        fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+
+        // print first 10 elements
+        const float * data = (const float *) cur->data;
+
+        printf("%s data[:10] : ", name);
+        for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
+            printf("%f ", data[j]);
+        }
+        printf("\n\n");

         // check data
         {
@@ -369,48 +224,6 @@ bool gguf_ex_read_1(const std::string & fname) {
     return true;
 }

-// read just the tensor info and mmap the data in user code
-bool gguf_ex_read_2(const std::string & fname) {
-    struct ggml_context * ctx_data = NULL;
-
-    struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &ctx_data,
-    };
-
-    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
-
-    struct gguf_file file(fname.c_str(), "rb");
-    gguf_mmap data_mmap(&file, 0, false);
-
-    const int n_tensors = gguf_get_n_tensors(ctx);
-
-    for (int i = 0; i < n_tensors; ++i) {
-        const char * name   = gguf_get_tensor_name(ctx, i);
-        const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
-
-        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
-
-        cur->data = static_cast<char *>(data_mmap.addr) + offset;
-
-        // print first 10 elements
-        const float * data = (const float *) cur->data;
-
-        printf("%s data[:10] : ", name);
-        for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
-            printf("%f ", data[j]);
-        }
-        printf("\n\n");
-    }
-
-    fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
-
-    ggml_free(ctx_data);
-    gguf_free(ctx);
-
-    return true;
-}
-
 int main(int argc, char ** argv) {
     if (argc < 3) {
         fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
@@ -427,7 +240,6 @@ int main(int argc, char ** argv) {
     } else if (mode == "r") {
         GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
         GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
-        GGML_ASSERT(gguf_ex_read_2(fname) && "failed to read gguf file");
     } else if (mode == "q") {
         llama_model_quantize_params params = llama_model_quantize_default_params();
         llama_model_quantize(fname.c_str(), "quant.gguf", &params);

View file

@@ -266,9 +266,6 @@ int main(int argc, char ** argv) {
         params.interactive = true;
     }

-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
     if (params.verbose_prompt) {
         fprintf(stderr, "\n");
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -778,8 +775,7 @@ int main(int argc, char ** argv) {
             if (grammar != NULL) {
                 llama_grammar_free(grammar);

-                std::vector<const llama_grammar_element *> grammar_rules(
-                    parsed_grammar.c_rules());
+                std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
                 grammar = llama_grammar_init(
                     grammar_rules.data(), grammar_rules.size(),
                     parsed_grammar.symbol_ids.at("root"));

View file

@@ -68,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 }

 // usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     fprintf(stderr, "\nAllowed quantization types:\n");
@@ -118,8 +118,8 @@ int main(int argc, char ** argv) {
         if (pos != std::string::npos) {
             fpath = fname_inp.substr(0, pos + 1);
         }
-        // export as [inp path]/ggml-model-[ftype].bin
-        fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
+        // export as [inp path]/ggml-model-[ftype].gguf
+        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
         arg_idx++;
     }
     else {

View file

@@ -26,7 +26,6 @@ int main(int argc, char ** argv) {
     auto lparams = llama_context_default_params();

     lparams.n_ctx     = params.n_ctx;
-    lparams.n_gqa     = params.n_gqa;
     lparams.seed      = params.seed;
     lparams.f16_kv    = params.memory_f16;
     lparams.use_mmap  = params.use_mmap;

View file

@@ -651,8 +651,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
     fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
     fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -773,23 +771,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
         params.n_ctx = std::stoi(argv[i]);
     }
-    else if (arg == "-gqa" || arg == "--gqa")
-    {
-        if (++i >= argc)
-        {
-            invalid_param = true;
-            break;
-        }
-        params.n_gqa = std::stoi(argv[i]);
-    }
-    else if (arg == "-eps" || arg == "--rms-norm-eps") {
-        if (++i >= argc)
-        {
-            invalid_param = true;
-            break;
-        }
-        params.rms_norm_eps = std::stof(argv[i]);
-    }
     else if (arg == "--rope-freq-base")
     {
         if (++i >= argc)

View file

@@ -36,16 +36,17 @@ int main(int argc, char ** argv) {

     llama_backend_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
+    llama_context_params ctx_params = llama_context_default_params();

-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);

     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }

+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
     // tokenize the prompt

     std::vector<llama_token> tokens_list;
@@ -54,7 +55,7 @@ int main(int argc, char ** argv) {
     const int max_context_size     = llama_n_ctx(ctx);
     const int max_tokens_list_size = max_context_size - 4;

-    if ((int)tokens_list.size() > max_tokens_list_size) {
+    if ((int) tokens_list.size() > max_tokens_list_size) {
         fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
         return 1;
     }
@@ -74,7 +75,9 @@ int main(int argc, char ** argv) {
     // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
     // example, we will just stop the loop once this cache is full or once an end of stream is detected.

-    while (llama_get_kv_cache_token_count( ctx ) < max_context_size) {
+    const int n_gen = std::min(32, max_context_size);
+
+    while (llama_get_kv_cache_token_count(ctx) < n_gen) {
         // evaluate the transformer

         if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
@@ -114,7 +117,6 @@ int main(int argc, char ** argv) {
         // push this new token for next evaluation
         tokens_list.push_back(new_token_id);
     }

     llama_free(ctx);
@@ -122,5 +124,7 @@ int main(int argc, char ** argv) {

     llama_backend_free();

+    fprintf(stderr, "\n\n");
+
     return 0;
 }

View file

@@ -17,7 +17,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+static const float rms_norm_eps = 1e-5f;

 struct random_normal_distribution {
     std::mt19937 gen;
@@ -2612,42 +2612,45 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
         return;
     }

-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
-    // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);
-    // write_vocab
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
-    // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output);
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
-    }
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+    // // write_magic
+    // file.write_u32(LLAMA_FILE_MAGIC);   // magic
+    // file.write_u32(LLAMA_FILE_VERSION); // version
+    // // write_hparams
+    // file.write_u32(model->hparams.n_vocab);
+    // file.write_u32(model->hparams.n_embd);
+    // file.write_u32(model->hparams.n_mult);
+    // file.write_u32(model->hparams.n_head);
+    // file.write_u32(model->hparams.n_layer);
+    // file.write_u32(model->hparams.n_rot);
+    // file.write_u32(LLAMA_FTYPE_ALL_F32);
+    // // write_vocab
+    // uint32_t n_vocab = model->hparams.n_vocab;
+    // for (uint32_t i = 0; i < n_vocab; i++) {
+    //     const auto & token_score = vocab->id_to_token.at(i);
+    //     file.write_u32((uint32_t) token_score.tok.size());
+    //     file.write_raw(token_score.tok.data(), token_score.tok.size());
+    //     file.write_raw(&token_score.score, sizeof(token_score.score));
+    // }
+    // // write tensors
+    // write_tensor(&file, model->tok_embeddings);
+    // write_tensor(&file, model->norm);
+    // write_tensor(&file, model->output);
+    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+    //     auto & layer = model->layers[i];
+    //
+    //     write_tensor(&file, layer.attention_norm);
+    //     write_tensor(&file, layer.wq);
+    //     write_tensor(&file, layer.wk);
+    //     write_tensor(&file, layer.wv);
+    //     write_tensor(&file, layer.wo);
+    //     write_tensor(&file, layer.ffn_norm);
+    //     write_tensor(&file, layer.w1);
+    //     write_tensor(&file, layer.w2);
+    //     write_tensor(&file, layer.w3);
+    // }
 }

 float cosine_decay(const int decay_steps, const float alpha, int step) {

604
ggml.c
View file

@@ -213,10 +213,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
                 error_desc = "insufficient memory";
                 break;
         }
-        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
-                __func__, error_desc, size/(1024.0*1024.0));
+        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
         return NULL;
     }
     return aligned_memory;
 }
 #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
@ -4109,7 +4109,11 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
    //
    // is enough, but just in case, adding the second part
    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
}

size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
    return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
}
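Editor's note (not part of the diff): ggml_nbytes() itself no longer rounds up; the GGML_PAD(..., GGML_MEM_ALIGN) rounding moves into the new ggml_nbytes_pad(). Assuming the usual 16-byte GGML_MEM_ALIGN, a 100-byte tensor now reports 100 bytes raw and 112 bytes padded. ggml_graph_export() below switches to the padded variant when sizing its evaluation buffer.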
size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
@ -9136,6 +9140,8 @@ static void ggml_compute_forward_mul(
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");

    switch (src0->type) {
        case GGML_TYPE_F32:
            {
@ -16899,7 +16905,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
    // compute size of intermediate results
    // TODO: does not take into account scratch buffers !!!!
    for (int i = 0; i < cgraph->n_nodes; ++i) {
        size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
    }

    // print
@ -18579,6 +18585,20 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
};
static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
[GGUF_TYPE_UINT8] = "u8",
[GGUF_TYPE_INT8] = "i8",
[GGUF_TYPE_UINT16] = "u16",
[GGUF_TYPE_INT16] = "i16",
[GGUF_TYPE_UINT32] = "u32",
[GGUF_TYPE_INT32] = "i32",
[GGUF_TYPE_FLOAT32] = "f32",
[GGUF_TYPE_BOOL] = "bool",
[GGUF_TYPE_STRING] = "str",
[GGUF_TYPE_ARRAY] = "arr",
};
static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
union gguf_value {
    uint8_t  uint8;
    int8_t   int8;
@ -18613,8 +18633,6 @@ struct gguf_header {
    uint32_t version;
    uint32_t n_tensors;
    uint32_t n_kv;
};

(the gguf_kv array is no longer stored inside gguf_header; it moves into gguf_context, below)

struct gguf_tensor_info {
@ -18622,44 +18640,69 @@ struct gguf_tensor_info {
(gguf_tensor_info drops the unused n_elms field and gains data/size members for the new writing API; gguf_context takes ownership of the kv array, renames size_data to size, and holds data as a void pointer)

    uint32_t n_dims;
    uint32_t ne[GGML_MAX_DIMS];

    enum ggml_type type;

    uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`

    // for writing API
    const void * data;
    size_t size;
};

struct gguf_context {
    struct gguf_header header;

    struct gguf_kv          * kv;
    struct gguf_tensor_info * infos;

    size_t alignment;
    size_t offset;    // offset of `data` from beginning of file
    size_t size;      // size of `data` in bytes

    //uint8_t * padding;
    void * data;
};
(the gguf_fread_* helpers now take the FILE * as their first argument; all call sites below change accordingly)

static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
    const size_t n = fread(dst, 1, size, file);
    *offset += n;
    return n == size;
}

static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
    p->n = 0;
    p->data = NULL;

    bool ok = true;

    // TODO: how to avoid mallocs for strings?
    ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
    ok = ok && gguf_fread_el(file,  p->data, p->n,      offset);

    return ok;
}
struct gguf_context * gguf_init_empty(void) {
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
ctx->header.magic = GGUF_MAGIC;
ctx->header.version = GGUF_VERSION;
ctx->header.n_tensors = 0;
ctx->header.n_kv = 0;
ctx->kv = NULL;
ctx->infos = NULL;
ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
ctx->offset = 0;
ctx->size = 0;
ctx->data = NULL;
return ctx;
}
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
    FILE * file = fopen(fname, "rb");
    if (!file) {
@ -18673,7 +18716,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    // check the magic before making allocations
    {
        gguf_fread_el(file, &magic, sizeof(magic), &offset);

        if (magic != GGUF_MAGIC) {
            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
@ -18689,14 +18732,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    // read the header
    {
        ctx->header.magic = magic;

        ctx->kv    = NULL;
        ctx->infos = NULL;
        ctx->data  = NULL;

        ok = ok && gguf_fread_el(file, &ctx->header.version,   sizeof(ctx->header.version),   &offset);
        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
        ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);

        if (!ok) {
            fprintf(stderr, "%s: failed to read header\n", __func__);
@ -18708,33 +18751,33 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    // read the kv pairs
    {
        ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));

        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
            struct gguf_kv * kv = &ctx->kv[i];

            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);

            ok = ok && gguf_fread_str(file, &kv->key, &offset);
            //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
            ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);

            //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);

            switch (kv->type) {
                case GGUF_TYPE_UINT8:   ok = ok && gguf_fread_el (file, &kv->value.uint8,   sizeof(kv->value.uint8),   &offset); break;
                case GGUF_TYPE_INT8:    ok = ok && gguf_fread_el (file, &kv->value.int8,    sizeof(kv->value.int8),    &offset); break;
                case GGUF_TYPE_UINT16:  ok = ok && gguf_fread_el (file, &kv->value.uint16,  sizeof(kv->value.uint16),  &offset); break;
                case GGUF_TYPE_INT16:   ok = ok && gguf_fread_el (file, &kv->value.int16,   sizeof(kv->value.int16),   &offset); break;
                case GGUF_TYPE_UINT32:  ok = ok && gguf_fread_el (file, &kv->value.uint32,  sizeof(kv->value.uint32),  &offset); break;
                case GGUF_TYPE_INT32:   ok = ok && gguf_fread_el (file, &kv->value.int32,   sizeof(kv->value.int32),   &offset); break;
                case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
                case GGUF_TYPE_BOOL:    ok = ok && gguf_fread_el (file, &kv->value.bool_,   sizeof(kv->value.bool_),   &offset); break;
                case GGUF_TYPE_STRING:  ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
                case GGUF_TYPE_ARRAY:
                    {
                        ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
                        ok = ok && gguf_fread_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n),    &offset);

                        switch (kv->value.arr.type) {
                            case GGUF_TYPE_UINT8:
@ -18747,17 +18790,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                            case GGUF_TYPE_BOOL:
                                {
                                    kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
                                } break;
                            case GGUF_TYPE_STRING:
                                {
                                    kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
                                    for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
                                        ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
                                    }
                                } break;
                            case GGUF_TYPE_ARRAY:
                            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
                        };
                    } break;
                case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
@ -18787,14 +18830,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            info->ne[j] = 1;
        }

        ok = ok && gguf_fread_str(file, &info->name,                          &offset);
        ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims),  &offset);
        for (uint32_t j = 0; j < info->n_dims; ++j) {
            ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
        }
        ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),    &offset);
        ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset),  &offset);

        if (!ok) {
            fprintf(stderr, "%s: failed to read tensor info\n", __func__);
@ -18827,8 +18869,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    // compute the total size of the data section, taking into account the alignment
    {
        ctx->size = 0;
        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
            struct gguf_tensor_info * info = &ctx->infos[i];
@ -18848,7 +18889,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);

            ctx->size += GGML_PAD(size_cur, ctx->alignment);
        }
    }
@ -18862,7 +18903,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        const size_t mem_size =
            params.no_alloc ?
            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;

        struct ggml_init_params pdata = {
            .mem_size   = mem_size,
@ -18877,12 +18918,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        struct ggml_tensor * data = NULL;

        if (params.no_alloc == false) {
            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);

            ok = ok && data != NULL;

            // read the binary blob with the tensor data
            ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);

            if (!ok) {
                fprintf(stderr, "%s: failed to read tensor data\n", __func__);
@ -18944,10 +18985,10 @@ void gguf_free(struct gguf_context * ctx) {
        return;
    }

    if (ctx->kv) {
        // free string memory - not great..
        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
            struct gguf_kv * kv = &ctx->kv[i];

            if (kv->key.data) {
                free(kv->key.data);
@ -18974,7 +19015,7 @@ void gguf_free(struct gguf_context * ctx) {
            }
        }

        GGML_ALIGNED_FREE(ctx->kv);
    }

    if (ctx->infos) {
@ -18992,6 +19033,10 @@ void gguf_free(struct gguf_context * ctx) {
    GGML_ALIGNED_FREE(ctx);
}
const char * gguf_type_name(enum gguf_type type) {
return GGUF_TYPE_NAME[type];
}
int gguf_get_version(struct gguf_context * ctx) {
    return ctx->header.version;
}
@ -19014,9 +19059,10 @@ int gguf_get_n_kv(struct gguf_context * ctx) {
int gguf_find_key(struct gguf_context * ctx, const char * key) {
    // return -1 if key not found
    int keyfound = -1;

    const int n_kv = gguf_get_n_kv(ctx);

    for (int i = 0; i < n_kv; ++i) {
        if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
            keyfound = i;
@ -19028,71 +19074,87 @@ int gguf_find_key(struct gguf_context * ctx, const char * key) {
}
(every getter below now indexes ctx->kv rather than ctx->header.kv; the old gguf_get_arr_f32() helper is dropped in favour of the typed gguf_get_arr_data())

const char * gguf_get_key(struct gguf_context * ctx, int i) {
    return ctx->kv[i].key.data;
}

enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
    return ctx->kv[i].type;
}

enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.arr.type;
}

const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.arr.data;
}

const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
    struct gguf_kv * kv = &ctx->kv[key_id];
    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
    return str->data;
}

int gguf_get_arr_n(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.arr.n;
}

uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.uint8;
}

int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.int8;
}

uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.uint16;
}

int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.int16;
}

uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.uint32;
}

int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.int32;
}

float gguf_get_val_f32(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.float32;
}

bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.bool_;
}

const char * gguf_get_val_str(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.str.data;
}

int gguf_get_n_tensors(struct gguf_context * ctx) {
    return ctx->header.n_tensors;
}
int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
// return -1 if tensor not found
int tensorfound = -1;
const int n_tensors = gguf_get_n_tensors(ctx);
for (int i = 0; i < n_tensors; ++i) {
if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
tensorfound = i;
break;
}
}
return tensorfound;
}
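Editor's illustration (not part of the diff) of how the read-side getters above compose; the tensor name queried is a placeholder and <stdio.h> is assumed:

    static void dump_gguf_sketch(const char * fname) {
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
            /*.no_alloc =*/ false,
            /*.ctx      =*/ &ctx_data,
        };

        struct gguf_context * gctx = gguf_init_from_file(fname, params);
        if (!gctx) {
            return;
        }

        printf("version: %d, alignment: %zu, data offset: %zu\n",
                gguf_get_version(gctx), gguf_get_alignment(gctx), gguf_get_data_offset(gctx));

        for (int i = 0; i < gguf_get_n_kv(gctx); ++i) {
            printf("kv[%d]: %s (%s)\n", i, gguf_get_key(gctx, i), gguf_type_name(gguf_get_kv_type(gctx, i)));
        }

        const int idx = gguf_find_tensor(gctx, "tok_embeddings.weight"); // placeholder name
        if (idx >= 0) {
            printf("%s @ data offset %zu\n", gguf_get_tensor_name(gctx, idx), gguf_get_tensor_offset(gctx, idx));
        }

        gguf_free(gctx);
        ggml_free(ctx_data);
    }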
size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
    return ctx->infos[i].offset;
}
@ -19101,6 +19163,406 @@ char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
    return ctx->infos[i].name.data;
}
// returns the index
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
const int idx = gguf_find_key(ctx, key);
if (idx >= 0) {
return idx;
}
const int n_kv = gguf_get_n_kv(ctx);
ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
ctx->kv[n_kv].key.n = strlen(key) + 1;
ctx->kv[n_kv].key.data = strdup(key);
ctx->header.n_kv++;
return n_kv;
}
void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_UINT8;
ctx->kv[idx].value.uint8 = val;
}
void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_INT8;
ctx->kv[idx].value.int8 = val;
}
void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_UINT16;
ctx->kv[idx].value.uint16 = val;
}
void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_INT16;
ctx->kv[idx].value.int16 = val;
}
void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_UINT32;
ctx->kv[idx].value.uint32 = val;
}
void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_INT32;
ctx->kv[idx].value.int32 = val;
}
void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_FLOAT32;
ctx->kv[idx].value.float32 = val;
}
void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_BOOL;
ctx->kv[idx].value.bool_ = val;
}
void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_STRING;
ctx->kv[idx].value.str.n = strlen(val) + 1;
ctx->kv[idx].value.str.data = strdup(val);
}
void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
ctx->kv[idx].value.arr.type = type;
ctx->kv[idx].value.arr.n = n;
ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
}
void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
ctx->kv[idx].value.arr.n = n;
ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
for (int i = 0; i < n; i++) {
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
str->n = strlen(data[i]) + 1;
str->data = strdup(data[i]);
}
}
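Editor's illustration (not part of the diff): given a gguf_context * gctx, array-valued keys combine gguf_set_arr_str() for strings with gguf_set_arr_data() for plain element types. The tokenizer key names and values here are placeholders:

    const char * tokens[] = { "<unk>", "<s>", "</s>" };
    const float  scores[] = { 0.0f, -1.0f, -2.0f };

    gguf_set_arr_str (gctx, "tokenizer.tokens", tokens, 3);
    gguf_set_arr_data(gctx, "tokenizer.scores", GGUF_TYPE_FLOAT32, scores, 3);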
// set or add KV pairs from another context
void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
for (uint32_t i = 0; i < src->header.n_kv; i++) {
switch (src->kv[i].type) {
case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break;
case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break;
case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break;
case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break;
case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
case GGUF_TYPE_ARRAY:
{
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
}
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
free(data);
} if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
GGML_ASSERT(false && "nested arrays not supported");
} else {
gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
}
} break;
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
}
}
}
void gguf_add_tensor(
struct gguf_context * ctx,
const struct ggml_tensor * tensor) {
const int idx = ctx->header.n_tensors;
ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
ctx->infos[idx].name.n = strlen(tensor->name) + 1;
ctx->infos[idx].name.data = strdup(tensor->name);
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
ctx->infos[idx].ne[i] = 1;
}
ctx->infos[idx].n_dims = tensor->n_dims;
for (int i = 0; i < tensor->n_dims; i++) {
ctx->infos[idx].ne[i] = tensor->ne[i];
}
ctx->infos[idx].type = tensor->type;
ctx->infos[idx].offset = 0;
ctx->infos[idx].data = tensor->data;
ctx->infos[idx].size = ggml_nbytes(tensor);
if (ctx->header.n_tensors > 0) {
ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
}
ctx->header.n_tensors++;
}
void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
const int idx = gguf_find_tensor(ctx, name);
if (idx < 0) {
GGML_ASSERT(false && "tensor not found");
}
ctx->infos[idx].type = type;
}
void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
const int idx = gguf_find_tensor(ctx, name);
if (idx < 0) {
GGML_ASSERT(false && "tensor not found");
}
ctx->infos[idx].data = data;
ctx->infos[idx].size = size;
// update offsets
for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
}
}
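Editor's illustration (not part of the diff): a minimal end-to-end writer built from gguf_init_empty(), the setters above, gguf_add_tensor() and gguf_write_to_file(); the key names, tensor name and sizes are arbitrary:

    static void write_gguf_sketch(const char * fname) {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
        ggml_set_name(t, "example.weight");
        ggml_set_f32(t, 1.0f); // fill with something deterministic

        struct gguf_context * gctx = gguf_init_empty();

        gguf_set_val_str(gctx, "general.name", "example");
        gguf_set_val_u32(gctx, "example.n_layers", 1); // placeholder key

        gguf_add_tensor(gctx, t); // records name, shape, type, data pointer and size

        gguf_write_to_file(gctx, fname, false); // single pass: meta data + tensor data

        gguf_free(gctx);
        ggml_free(ctx);
    }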
//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
// fwrite(&val->n, sizeof(val->n), 1, file);
// fwrite(val->data, sizeof(char), val->n, file);
//}
//
//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
// fwrite(val, sizeof(char), size, file);
//}
struct gguf_buf {
void * data;
size_t size;
size_t offset;
};
static struct gguf_buf gguf_buf_init(size_t size) {
struct gguf_buf buf = {
/*buf.data =*/ size == 0 ? NULL : malloc(size),
/*buf.size =*/ size,
/*buf.offset =*/ 0,
};
return buf;
}
static void gguf_buf_free(struct gguf_buf buf) {
if (buf.data) {
free(buf.data);
}
}
static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
if (buf->offset + size > buf->size) {
buf->size = 1.5*(buf->offset + size);
if (buf->data) {
buf->data = realloc(buf->data, buf->size);
}
}
}
static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
gguf_buf_grow(buf, sizeof(val->n) + val->n);
if (buf->data) {
memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
}
buf->offset += sizeof(val->n);
if (buf->data) {
memcpy((char *) buf->data + buf->offset, val->data, val->n);
}
buf->offset += val->n;
}
static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
gguf_buf_grow(buf, el_size);
if (buf->data) {
memcpy((char *) buf->data + buf->offset, val, el_size);
}
buf->offset += el_size;
}
static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
// write header
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv));
// write key-value pairs
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
gguf_bwrite_str(buf, &kv->key);
gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
switch (kv->type) {
case GGUF_TYPE_UINT8: gguf_bwrite_el( buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break;
case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break;
case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break;
case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break;
case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
case GGUF_TYPE_ARRAY:
{
gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) );
switch (kv->value.arr.type) {
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
case GGUF_TYPE_BOOL:
{
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
} break;
case GGUF_TYPE_STRING:
{
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
}
} break;
case GGUF_TYPE_ARRAY:
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
};
} break;
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
};
}
// write tensor infos
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
gguf_bwrite_str(buf, &info->name);
gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
for (uint32_t j = 0; j < info->n_dims; ++j) {
gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
}
gguf_bwrite_el(buf, &info->type, sizeof(info->type));
gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
}
// we require the data section to be aligned, so take into account any padding
{
const size_t offset = buf->offset;
const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
if (offset_pad != offset) {
uint8_t pad = 0;
for (size_t i = 0; i < offset_pad - offset; ++i) {
gguf_bwrite_el(buf, &pad, sizeof(pad));
}
}
}
if (only_meta) {
return;
}
size_t offset = 0;
// write tensor data
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
const size_t size = info->size;
const size_t size_pad = GGML_PAD(size, ctx->alignment);
gguf_bwrite_el(buf, info->data, size);
if (size_pad != size) {
uint8_t pad = 0;
for (size_t j = 0; j < size_pad - size; ++j) {
gguf_bwrite_el(buf, &pad, sizeof(pad));
}
}
GGML_ASSERT(offset == info->offset);
offset += size_pad;
}
}
void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
FILE * file = fopen(fname, "wb");
if (!file) {
GGML_ASSERT(false && "failed to open file for writing");
}
struct gguf_buf buf = gguf_buf_init(16*1024);
gguf_write_to_buf(ctx, &buf, only_meta);
fwrite(buf.data, 1, buf.offset, file);
gguf_buf_free(buf);
fclose(file);
}
size_t gguf_get_meta_size(struct gguf_context * ctx) {
// no allocs - only compute size
struct gguf_buf buf = gguf_buf_init(0);
gguf_write_to_buf(ctx, &buf, true);
return buf.offset;
}
void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
struct gguf_buf buf = gguf_buf_init(16*1024);
gguf_write_to_buf(ctx, &buf, true);
memcpy(data, buf.data, buf.offset);
gguf_buf_free(buf);
}
////////////////////////////////////////////////////////////////////////////////

int ggml_cpu_has_avx(void) {
ggml.h
@ -566,6 +566,7 @@ extern "C" {
    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

    GGML_API int     ggml_blck_size (enum ggml_type type);
@ -1498,7 +1499,6 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * tensor);

    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
@ -1711,7 +1711,6 @@ extern "C" {
    // gguf
    //

(the old "TODO: can be removed if the API is extended for writing" note is dropped, since a writing API is added below)

    enum gguf_type {
        GGUF_TYPE_UINT8  = 0,
        GGUF_TYPE_INT8   = 1,
@ -1735,10 +1734,14 @@ extern "C" {
        struct ggml_context ** ctx;
    };

    GGML_API struct gguf_context * gguf_init_empty(void);
    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
    //GGML_API struct gguf_context * gguf_init_from_buffer(..);

    GGML_API void gguf_free(struct gguf_context * ctx);

    GGML_API const char * gguf_type_name(enum gguf_type type);

    GGML_API int    gguf_get_version    (struct gguf_context * ctx);
    GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
    GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
@ -1747,13 +1750,11 @@ extern "C" {
    GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
    GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);

    GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);

(removed here: the old untyped gguf_get_val(ctx, i, void * val) and gguf_get_arr_f32(); the typed getters plus gguf_get_arr_data()/gguf_get_arr_str() below cover both)

    // results are undefined if the wrong type is used for the key
    GGML_API uint8_t  gguf_get_val_u8 (struct gguf_context * ctx, int i);
    GGML_API int8_t   gguf_get_val_i8 (struct gguf_context * ctx, int i);
    GGML_API uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i);
@ -1764,12 +1765,60 @@ extern "C" {
    GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
    GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
    GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
    GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
    GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);

    GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
    GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
    GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
// overrides existing values or adds a new one
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
// set or add KV pairs from another context
GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
// manage tensor info
GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
// writing gguf files can be done in 2 ways:
//
// - write the entire gguf_context to a binary file in a single pass:
//
// gguf_write_to_file(ctx, fname);
//
// - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
//
// FILE * f = fopen(fname, "wb");
// fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
// fwrite(f, ...);
// void * data = gguf_meta_get_meta_data(ctx);
// fseek(f, 0, SEEK_SET);
// fwrite(f, data, gguf_get_meta_size(ctx));
// free(data);
// fclose(f);
//
// write the entire context to a binary file
GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
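Editor's illustration (not part of the diff), spelling out the second, two-pass flow sketched in the comment above: given a populated gguf_context * ctx and a target path fname (both assumed, <stdio.h>/<stdlib.h> included, error handling omitted):

    FILE * f = fopen(fname, "wb");

    const size_t meta_size = gguf_get_meta_size(ctx); // header + kv pairs + tensor infos, padded

    // reserve room for the meta data, then stream the (alignment-padded) tensor data
    fseek(f, (long) meta_size, SEEK_SET);
    // ... fwrite() each tensor's data here, padded to gguf_get_alignment(ctx) ...

    // go back and fill in the meta data
    void * meta = malloc(meta_size);
    gguf_get_meta_data(ctx, meta);

    fseek(f, 0, SEEK_SET);
    fwrite(meta, 1, meta_size, f);

    free(meta);
    fclose(f);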
    //
    // system info
    //
(file diff suppressed because it is too large)
@ -1,505 +0,0 @@
#ifndef LLAMA_H
#define LLAMA_H
#include "ggml.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#else
#define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define LLAMA_API __declspec(dllexport)
# else
# define LLAMA_API __declspec(dllimport)
# endif
# else
# define LLAMA_API __attribute__ ((visibility ("default")))
# endif
#else
# define LLAMA_API
#endif
#ifdef __GNUC__
# define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
#elif defined(_MSC_VER)
# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
#else
# define DEPRECATED(func, hint) func
#endif
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
#ifdef __cplusplus
extern "C" {
#endif
//
// C interface
//
// TODO: show sample usage
//
struct llama_model;
struct llama_context;
typedef int llama_token;
typedef struct llama_token_data {
llama_token id; // token id
float logit; // log-odds of the token
float p; // probability of the token
} llama_token_data;
typedef struct llama_token_data_array {
llama_token_data * data;
size_t size;
bool sorted;
} llama_token_data_array;
typedef void (*llama_progress_callback)(float progress, void *ctx);
enum llama_log_level {
LLAMA_LOG_LEVEL_ERROR = 2,
LLAMA_LOG_LEVEL_WARN = 3,
LLAMA_LOG_LEVEL_INFO = 4
};
// Signature for logging events
// Note that text includes the new line character at the end for most events.
// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
// if it exists.
// It might not exist for progress report where '.' is output repeatedly.
typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random
int32_t n_ctx; // text context
int32_t n_batch; // prompt processing batch size
int32_t n_gpu_layers; // number of layers to store in VRAM
int32_t main_gpu; // the GPU that is used for scratch and small tensors
const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency
float rope_freq_scale; // RoPE frequency scaling factor
// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
// context pointer passed to the progress callback
void * progress_callback_user_data;
// Keep the booleans together to avoid misalignment during copy-by-value.
bool low_vram; // if true, reduce VRAM usage at the cost of performance
bool mul_mat_q; // if true, use experimental mul_mat_q kernels
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
bool use_mlock; // force system to keep model in RAM
bool embedding; // embedding mode only
};
// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
};
// model quantization parameters
typedef struct llama_model_quantize_params {
int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
} llama_model_quantize_params;
// grammar types
struct llama_grammar;
// grammar element type
enum llama_gretype {
// end of rule definition
LLAMA_GRETYPE_END = 0,
// start of alternate definition for rule
LLAMA_GRETYPE_ALT = 1,
// non-terminal element: reference to rule
LLAMA_GRETYPE_RULE_REF = 2,
// terminal element: character (code point)
LLAMA_GRETYPE_CHAR = 3,
// inverse char(s) ([^a], [^a-b] [^abc])
LLAMA_GRETYPE_CHAR_NOT = 4,
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
// be an inclusive range ([a-z])
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
// modifies a preceding LLAMA_GRETYPE_CHAR or
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
LLAMA_GRETYPE_CHAR_ALT = 6,
};
typedef struct llama_grammar_element {
enum llama_gretype type;
uint32_t value; // Unicode code point or rule ID
} llama_grammar_element;
// performance timing information
struct llama_timings {
double t_start_ms;
double t_end_ms;
double t_load_ms;
double t_sample_ms;
double t_p_eval_ms;
double t_eval_ms;
int32_t n_sample;
int32_t n_p_eval;
int32_t n_eval;
};
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
LLAMA_API int llama_max_devices();
LLAMA_API struct llama_context_params llama_context_default_params();
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
LLAMA_API bool llama_mmap_supported();
LLAMA_API bool llama_mlock_supported();
// TODO: not great API - very likely to change
// Initialize the llama + ggml backend
// If numa is true, use NUMA optimizations
// Call once at the start of the program
LLAMA_API void llama_backend_init(bool numa);
// Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free();
LLAMA_API int64_t llama_time_us();
LLAMA_API struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_context_params params);
LLAMA_API void llama_free_model(struct llama_model * model);
LLAMA_API struct llama_context * llama_new_context_with_model(
struct llama_model * model,
struct llama_context_params params);
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
// Returns 0 on success
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
const llama_model_quantize_params * params);
// Apply a LoRA adapter to a loaded model
// path_base_model is the path to a higher quality model to use as a base for
// the layers modified by the adapter. Can be NULL to use the current loaded model.
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
// will be applied on top of the previous one
// Returns 0 on success
LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
struct llama_context * ctx,
const char * path_lora,
const char * path_base_model,
int n_threads),
"please use llama_model_apply_lora_from_file instead");
LLAMA_API int llama_model_apply_lora_from_file(
const struct llama_model * model,
const char * path_lora,
const char * path_base_model,
int n_threads);
// Returns the number of tokens in the KV cache
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
// Sets the current rng seed.
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
// Returns the maximum size in bytes of the state (rng, logits, embedding
// and kv_cache) - will often be smaller after compacting tokens
LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
// Copies the state to the specified destination address.
// Destination needs to have allocated enough memory.
// Returns the number of bytes copied
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
// Set the state reading from the specified address
// Returns the number of bytes read
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
// Save/load session file
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
// n_past is the number of tokens to use from previous eval calls
// Returns 0 on success
LLAMA_API int llama_eval(
struct llama_context * ctx,
const llama_token * tokens,
int n_tokens,
int n_past,
int n_threads);
// Same as llama_eval, but use float matrix input directly.
LLAMA_API int llama_eval_embd(
struct llama_context * ctx,
const float * embd,
int n_tokens,
int n_past,
int n_threads);
// Export a static computation graph for context of 511 and batch size of 1
// NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
// parameters here to keep things simple
// IMPORTANT: do not use for anything else other than debugging and testing!
LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens
// Returns a negative number on failure - the number of tokens that would have been returned
// TODO: not sure if correct
LLAMA_API int llama_tokenize(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_tokenize_bpe(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_tokenize_with_model(
const struct llama_model * model,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
// Get the vocabulary as output parameters.
// Returns number of results.
LLAMA_API int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity);
LLAMA_API int llama_get_vocab_from_model(
const struct llama_model * model,
const char * * strings,
float * scores,
int capacity);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
// Rows: n_tokens
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API int llama_token_to_str(
const struct llama_context * ctx,
llama_token token,
char * str,
int length);
LLAMA_API int llama_token_to_str_bpe(
const struct llama_context * ctx,
llama_token token,
char * str,
int length);
LLAMA_API int llama_token_to_str_with_model(
const struct llama_model * model,
llama_token token,
char * str,
int length);
// Special tokens
LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(); // end-of-sentence
LLAMA_API llama_token llama_token_nl(); // next-line
// Grammar
//
LLAMA_API struct llama_grammar * llama_grammar_init(
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index);
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
// Sampling functions
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
/// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
/// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
LLAMA_API void llama_sample_classifier_free_guidance(
struct llama_context * ctx,
llama_token_data_array * candidates,
struct llama_context * guidance_ctx,
float scale);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
/// @details Apply constraints from grammar
LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
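// Illustrative usage sketch (not part of the API): mirostat keeps `mu` as per-stream state.
// Per the notes above it starts at 2 * tau and is then updated in place by every call, e.g.
//
//     float tau = 5.0f, eta = 0.1f;
//     float mu  = 2.0f * tau; // maximum cross-entropy, updated by the sampler
//     llama_token id = llama_sample_token_mirostat_v2(ctx, &candidates, tau, eta, &mu);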
/// @details Selects the token with the highest probability.
LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Randomly selects a token from the candidates based on their probabilities.
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
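// Illustrative usage sketch (not part of the header): the calls above compose into a typical
// sampling chain. Assumes a valid `ctx`, a recent-token history and <vector>; the penalty,
// top-k, top-p and temperature values are placeholders, not recommendations.
static llama_token example_sample_next(struct llama_context * ctx, const llama_token * last_tokens, size_t n_last) {
    float * logits    = llama_get_logits(ctx);
    const int n_vocab = llama_n_vocab(ctx);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; id++) {
        candidates.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array arr = { candidates.data(), candidates.size(), false };

    llama_sample_repetition_penalty(ctx, &arr, last_tokens, n_last, 1.1f);
    llama_sample_top_k      (ctx, &arr, 40,    1);
    llama_sample_top_p      (ctx, &arr, 0.95f, 1);
    llama_sample_temperature(ctx, &arr, 0.80f);

    return llama_sample_token(ctx, &arr);
}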
// Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
LLAMA_API void llama_print_timings(struct llama_context * ctx);
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
// Print system information
LLAMA_API const char * llama_print_system_info(void);
#ifdef __cplusplus
}
#endif
// C++ API, will be moving to common.h soon (TM)
#ifdef LLAMA_API_CPP
#include <vector>
#include <string>
//
// Vocab utils
//
std::vector<llama_token> llama_tokenize(
struct llama_context * ctx,
const std::string & text,
bool add_bos);
std::vector<llama_token> llama_tokenize_bpe(
struct llama_context * ctx,
const std::string & text,
bool add_bos);
std::string llama_token_to_str(
const struct llama_context * ctx,
llama_token token);
std::string llama_token_to_str_bpe(
const struct llama_context * ctx,
llama_token token);
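// Usage sketch (assumes a valid context): tokenize a prompt with a leading BOS token,
// then map each id back to its text piece with the helpers declared above.
static std::string example_roundtrip(struct llama_context * ctx, const std::string & text) {
    std::string out;
    for (llama_token tok : llama_tokenize(ctx, text, /*add_bos=*/true)) {
        out += llama_token_to_str(ctx, tok); // piece for a single token
    }
    return out;
}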
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
struct ggml_tensor;
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
#endif // LLAMA_API_INTERNAL
#endif // LLAMA_API_CPP
#endif // LLAMA_H


gguf-util.h

@@ -1,470 +0,0 @@
// GGUF counterpart of llama-util.h.
// we may consider making it a part of ggml.c once GGUF work is complete.
// this will require extra work to migrate this to pure C.
// Contains wrappers around OS interfaces.
#ifndef GGUF_UTIL_H
#define GGUF_UTIL_H
#include "ggml.h"
#include <cstdio>
#include <cstdint>
#include <cerrno>
#include <cstring>
#include <cstdarg>
#include <cstdlib>
#include <climits>
#include <string>
#include <sstream>
#include <vector>
#include <stdexcept>
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
#endif
#endif
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <io.h>
#include <stdio.h> // for _fseeki64
#endif
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
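// Usage sketch: format() is a printf-style std::string builder, used below mostly for
// exception messages, e.g.
//
//     throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));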
// TODO: can we merge this one and gguf_context?
struct gguf_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
gguf_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
}
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
GGML_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void seek(size_t offset, int whence) {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
GGML_ASSERT(ret == 0); // same
}
size_t write_str(const std::string & val) {
size_t total_written = 0;
const int32_t n = val.size();
fwrite((const char *) &n, sizeof(n), 1, fp);
total_written += sizeof(n);
fwrite(val.c_str(), n, 1, fp);
total_written += n;
return total_written;
}
size_t write_i32(int32_t val) {
fwrite((const char *) &val, sizeof(val), 1, fp);
return sizeof(val);
}
size_t write_u64(size_t val) {
fwrite((const char *) &val, sizeof(val), 1, fp);
return sizeof(val);
}
template<typename T>
void write_val(const std::string & key, enum gguf_type type, const T & val) {
write_str(key);
fwrite((const char *) &type, sizeof(type), 1, fp);
fwrite((const char *) &val, sizeof(val), 1, fp);
}
template<typename T>
void write_arr(const std::string & key, enum gguf_type type, const std::vector<T> & val) {
write_str(key);
{
const enum gguf_type tarr = GGUF_TYPE_ARRAY;
fwrite((const char *) &tarr, sizeof(tarr), 1, fp);
}
const int32_t n = val.size();
fwrite((const char *) &type, sizeof(type), 1, fp);
fwrite((const char *) &n, sizeof(n), 1, fp);
fwrite(val.data(), sizeof(T), n, fp);
}
void write_str(const std::string & key, enum gguf_type type, const std::string & val) {
write_str(key);
fwrite((const char *) &type, sizeof(type), 1, fp);
const int32_t n = val.size();
fwrite((const char *) &n, sizeof(n), 1, fp);
fwrite(val.c_str(), n, 1, fp);
}
void write_str(const std::string & key, enum gguf_type type, const std::vector<std::string> & val) {
write_str(key);
{
const enum gguf_type tarr = GGUF_TYPE_ARRAY;
fwrite((const char *) &tarr, sizeof(tarr), 1, fp);
}
const int32_t n = val.size();
fwrite((const char *) &type, sizeof(type), 1, fp);
fwrite((const char *) &n, sizeof(n), 1, fp);
for (int i = 0; i < n; ++i) {
const int32_t nstr = val[i].size();
fwrite((const char *) &nstr, sizeof(nstr), 1, fp);
fwrite(val[i].c_str(), nstr, 1, fp);
}
}
void write_zeros(size_t count) {
for (size_t i = 0; i < count; ++i) {
fputc(0, fp);
}
}
void read_raw(void * ptr, size_t len) const {
if (len == 0) {
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
throw std::runtime_error(std::string("unexpectedly reached end of file"));
}
}
void write_raw(const void * ptr, size_t len) const {
if (len == 0) {
return;
}
errno = 0;
size_t ret = std::fwrite(ptr, len, 1, fp);
if (ret != 1) {
throw std::runtime_error(format("write error: %s", strerror(errno)));
}
}
~gguf_file() {
if (fp) {
std::fclose(fp);
}
}
};
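// Usage sketch (hypothetical file name): writing typed key/value pairs with the helpers
// above; the GGUF_TYPE_* constants come from the gguf_type enum in ggml.h.
static void example_write_kv(const char * fname) {
    gguf_file file(fname, "wb");
    file.write_str("general.name", GGUF_TYPE_STRING, std::string("example"));
    file.write_val<uint32_t>("general.alignment", GGUF_TYPE_UINT32, 32u);
    file.write_arr<float>("tokenizer.ggml.scores", GGUF_TYPE_FLOAT32, { 0.0f, -1.0f });
}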
#if defined(_WIN32)
static std::string gguf_format_win_err(DWORD err) {
LPSTR buf;
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
if (!size) {
return "FormatMessageA failed";
}
std::string ret(buf, size);
LocalFree(buf);
return ret;
}
#endif
struct gguf_mmap {
void * addr;
size_t size;
gguf_mmap(const gguf_mmap &) = delete;
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
gguf_mmap(struct gguf_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
// prefetch/readahead impairs performance on NUMA systems
if (numa) { prefetch = 0; }
#ifdef __linux__
if (prefetch) { flags |= MAP_POPULATE; }
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
}
if (prefetch > 0) {
// Advise the kernel to preload the mapped memory
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
strerror(errno));
}
}
if (numa) {
// advise the kernel not to use readahead
// (because the next page might not belong on the same node)
if (madvise(addr, file->size, MADV_RANDOM)) {
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
strerror(errno));
}
}
}
~gguf_mmap() {
munmap(addr, size);
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
gguf_mmap(struct gguf_file * file, bool prefetch = true, bool numa = false) {
(void) numa;
size = file->size;
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
DWORD error = GetLastError();
if (hMapping == NULL) {
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
}
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
error = GetLastError();
CloseHandle(hMapping);
if (addr == NULL) {
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
}
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
if (prefetch) {
// Advise the kernel to preload the mapped memory
WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr;
range.NumberOfBytes = (SIZE_T)size;
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
gguf_format_win_err(GetLastError()).c_str());
}
}
#else
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
}
~gguf_mmap() {
if (!UnmapViewOfFile(addr)) {
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
gguf_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
gguf_mmap(struct gguf_file *, bool prefetch = true, bool numa = false) {
(void) prefetch;
(void) numa;
throw std::runtime_error(std::string("mmap not supported"));
}
#endif
};
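// Usage sketch (hypothetical path): map a model file read-only; the mapping is released
// automatically when the object goes out of scope.
static void example_mmap(const char * fname) {
    gguf_file file(fname, "rb");
    gguf_mmap  mapping(&file); // prefetches the whole file by default
    const uint8_t * data = (const uint8_t *) mapping.addr;
    fprintf(stdout, "mapped %zu bytes, first byte: 0x%02x\n", mapping.size, data[0]);
}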
// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
struct gguf_mlock {
void * addr = NULL;
size_t size = 0;
bool failed_already = false;
gguf_mlock() {}
gguf_mlock(const gguf_mlock &) = delete;
~gguf_mlock() {
if (size) {
raw_unlock(addr, size);
}
}
void init(void * ptr) {
GGML_ASSERT(addr == NULL && size == 0);
addr = ptr;
}
void grow_to(size_t target_size) {
GGML_ASSERT(addr);
if (failed_already) {
return;
}
size_t granularity = lock_granularity();
target_size = (target_size + granularity - 1) & ~(granularity - 1);
if (target_size > size) {
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
size = target_size;
} else {
failed_already = true;
}
}
}
#ifdef _POSIX_MEMLOCK_RANGE
static constexpr bool SUPPORTED = true;
size_t lock_granularity() {
return (size_t) sysconf(_SC_PAGESIZE);
}
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif
bool raw_lock(const void * addr, size_t size) {
if (!mlock(addr, size)) {
return true;
} else {
char* errmsg = std::strerror(errno);
bool suggest = (errno == ENOMEM);
// Check if the resource limit is fine after all
struct rlimit lock_limit;
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
suggest = false;
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
suggest = false;
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
return false;
}
}
#undef MLOCK_SUGGESTION
void raw_unlock(void * addr, size_t size) {
if (munlock(addr, size)) {
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
}
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
size_t lock_granularity() {
SYSTEM_INFO si;
GetSystemInfo(&si);
return (size_t) si.dwPageSize;
}
bool raw_lock(void * ptr, size_t len) {
for (int tries = 1; ; tries++) {
if (VirtualLock(ptr, len)) {
return true;
}
if (tries == 2) {
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
len, size, gguf_format_win_err(GetLastError()).c_str());
return false;
}
// It failed but this was only the first try; increase the working
// set size and try again.
SIZE_T min_ws_size, max_ws_size;
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
gguf_format_win_err(GetLastError()).c_str());
return false;
}
// Per MSDN: "The maximum number of pages that a process can lock
// is equal to the number of pages in its minimum working set minus
// a small overhead."
// Hopefully a megabyte is enough overhead:
size_t increment = len + 1048576;
// The minimum must be <= the maximum, so we need to increase both:
min_ws_size += increment;
max_ws_size += increment;
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
gguf_format_win_err(GetLastError()).c_str());
return false;
}
}
}
void raw_unlock(void * ptr, size_t len) {
if (!VirtualUnlock(ptr, len)) {
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
gguf_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
size_t lock_granularity() {
return (size_t) 65536;
}
bool raw_lock(const void * addr, size_t len) {
fprintf(stderr, "warning: mlock not supported on this system\n");
return false;
}
void raw_unlock(const void * addr, size_t len) {}
#endif
};
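// Usage sketch: pin an already-mapped region in RAM. grow_to() rounds the requested size
// up to the lock granularity (the page size on POSIX) and warns on failure instead of throwing.
static void example_mlock(gguf_mmap & mapping) {
    gguf_mlock lock;
    lock.init(mapping.addr);
    lock.grow_to(mapping.size); // best effort; sets failed_already on error
}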
#endif

gguf.py

@@ -4,14 +4,169 @@
3. After development is done, Convert it to a proper pip-installable Python package, and possibly move it to its own repo under ggml-org.
"""
-import sys
import struct
-import constants
+import numpy as np
from enum import IntEnum
from typing import Any, IO, List
-import numpy as np
-import sys
+#
+# constants
+#
GGUF_MAGIC = 0x47475546
GGUF_VERSION = 1
GGUF_DEFAULT_ALIGNMENT = 32
# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_ALIGNMENT = "general.alignment"
KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_FILE_TYPE = "general.file_type"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
# LLM
KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length"
KEY_LLM_BLOCK_COUNT = "{llm}.block_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout"
# attention
KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{llm}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS = "{llm}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{llm}.attention.layer_norm_rms_epsilon"
# RoPE
KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count"
KEY_ROPE_SCALE = "{llm}.rope.scale"
# tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
#
# recommended mapping of model tensor names for storage in gguf
#
def get_tensor_name_map(n_blocks : int):
tensor_map = {}
# Token embeddings
mapped_to = "token_embd"
tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
tensor_map["transformer.word_embeddings"] = mapped_to # falcon
tensor_map["model.embed_tokens"] = mapped_to # llama-hf
tensor_map["tok_embeddings"] = mapped_to # llama-pth
# Position embeddings
mapped_to = "pos_embd"
tensor_map["transformer.wpe"] = mapped_to # gpt2
# Output norm
mapped_to = "output_norm"
tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
tensor_map["transformer.norm_f"] = mapped_to # mpt
tensor_map["model.norm"] = mapped_to # llama-hf
tensor_map["norm"] = mapped_to # llama-pth
# Output
mapped_to = "output"
tensor_map["embed_out"] = mapped_to # gptneox
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
tensor_map["output"] = mapped_to # llama-pth
# Attention and feed-forward layer blocks
for i in range(0,n_blocks):
# Attention norm
mapped_to = "blk."+str(i)+".attn_norm"
tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
# Attention norm 2
mapped_to = "blk."+str(i)+".attn_norm_2"
tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
# Attention query-key-value
mapped_to = "blk."+str(i)+".attn_qkv"
tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
# Attention query
mapped_to = "blk."+str(i)+".attn_q"
tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
# Attention key
mapped_to = "blk."+str(i)+".attn_k"
tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
# Attention value
mapped_to = "blk."+str(i)+".attn_v"
tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
# Attention output
mapped_to = "blk."+str(i)+".attn_output"
tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
# Feed-forward norm
mapped_to = "blk."+str(i)+".ffn_norm"
tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
# Feed-forward up
mapped_to = "blk."+str(i)+".ffn_up"
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
# Feed-forward gate
mapped_to = "blk."+str(i)+".ffn_gate"
tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
# Feed-forward down
mapped_to = "blk."+str(i)+".ffn_down"
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth
return tensor_map
#
# implementation
#
class GGMLQuantizationType(IntEnum):
    F32 = 0
@@ -51,15 +206,15 @@ class GGUFWriter:
    def __init__(self, fout: IO):
        self.fout = fout
        self.offset_tensor = 0
-       self.data_alignment = constants.GGUF_DEFAULT_ALIGNMENT
+       self.data_alignment = GGUF_DEFAULT_ALIGNMENT
        self.kv_data = b""
        self.kv_data_count = 0
        self.ti_data = b""
        self.ti_data_count = 0
    def write_header_to_file(self):
-       self.fout.write(struct.pack("<I", constants.GGUF_MAGIC))
-       self.fout.write(struct.pack("<I", constants.GGUF_VERSION))
+       self.fout.write(struct.pack("<I", GGUF_MAGIC))
+       self.fout.write(struct.pack("<I", GGUF_VERSION))
        self.fout.write(struct.pack("<I", self.ti_data_count))
        self.fout.write(struct.pack("<I", self.kv_data_count))
        self.flush()
@@ -201,123 +356,125 @@ class GGUFWriter:
        self.fout.close()
    def add_architecture(self, architecture: str):
-       self.add_string(constants.KEY_GENERAL_ARCHITECTURE,
+       self.add_string(KEY_GENERAL_ARCHITECTURE,
                        architecture)
    def add_author(self, author: str):
-       self.add_string(constants.KEY_GENERAL_AUTHOR, author)
+       self.add_string(KEY_GENERAL_AUTHOR, author)
+   def add_tensor_data_layout(self, layout: str):
+       self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT , layout)
    def add_url(self, url: str):
-       self.add_string(constants.KEY_GENERAL_URL, url)
+       self.add_string(KEY_GENERAL_URL, url)
    def add_description(self, description: str):
-       self.add_string(constants.KEY_GENERAL_DESCRIPTION, description)
+       self.add_string(KEY_GENERAL_DESCRIPTION, description)
    def add_file_type(self, file_type: str):
-       self.add_string(constants.KEY_GENERAL_FILE_TYPE, file_type)
+       self.add_string(KEY_GENERAL_FILE_TYPE, file_type)
    def add_source_url(self, url: str):
-       self.add_string(constants.KEY_GENERAL_SOURCE_URL, url)
+       self.add_string(KEY_GENERAL_SOURCE_URL, url)
    def add_source_hf_repo(self, repo: str):
-       self.add_string(constants.KEY_GENERAL_SOURCE_HF_REPO, repo)
+       self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
    def add_name(self, name: str):
-       self.add_string(constants.KEY_GENERAL_NAME, name)
+       self.add_string(KEY_GENERAL_NAME, name)
    def add_quantization_version(self, quantization_version: GGMLQuantizationType):
        self.add_uint32(
-           constants.KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
+           KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
    def add_custom_alignment(self, alignment: int):
        self.data_alignment = alignment
-       self.add_uint32(constants.KEY_GENERAL_ALIGNMENT, alignment)
+       self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)
    def add_context_length(self, llm: str, length: int):
        self.add_uint32(
-           constants.KEY_LLM_CONTEXT_LENGTH.format(llm=llm), length)
+           KEY_LLM_CONTEXT_LENGTH.format(llm=llm), length)
    def add_embedding_length(self, llm: str, length: int):
        self.add_uint32(
-           constants.KEY_LLM_EMBEDDING_LENGTH.format(llm=llm), length)
+           KEY_LLM_EMBEDDING_LENGTH.format(llm=llm), length)
    def add_block_count(self, llm: str, length: int):
        self.add_uint32(
-           constants.KEY_LLM_BLOCK_COUNT.format(llm=llm), length)
+           KEY_LLM_BLOCK_COUNT.format(llm=llm), length)
    def add_feed_forward_length(self, llm: str, length: int):
        self.add_uint32(
-           constants.KEY_LLM_FEED_FORWARD_LENGTH.format(llm=llm), length)
+           KEY_LLM_FEED_FORWARD_LENGTH.format(llm=llm), length)
    def add_parallel_residual(self, llm: str, use: bool):
        self.add_bool(
-           constants.KEY_LLM_USE_PARALLEL_RESIDUAL.format(llm=llm), use)
+           KEY_LLM_USE_PARALLEL_RESIDUAL.format(llm=llm), use)
    def add_tensor_data_layout(self, llm: str, layout: str):
        self.add_string(
-           constants.KEY_LLM_TENSOR_DATA_LAYOUT.format(llm=llm), layout)
+           KEY_LLM_TENSOR_DATA_LAYOUT.format(llm=llm), layout)
    def add_head_count(self, llm: str, count: int):
        self.add_uint32(
-           constants.KEY_ATTENTION_HEAD_COUNT.format(llm=llm), count)
+           KEY_ATTENTION_HEAD_COUNT.format(llm=llm), count)
    def add_head_count_kv(self, llm: str, count: int):
        self.add_uint32(
-           constants.KEY_ATTENTION_HEAD_COUNT_KV.format(llm=llm), count)
+           KEY_ATTENTION_HEAD_COUNT_KV.format(llm=llm), count)
    def add_max_alibi_bias(self, llm: str, bias: float):
        self.add_float32(
-           constants.KEY_ATTENTION_MAX_ALIBI_BIAS.format(llm=llm), bias)
+           KEY_ATTENTION_MAX_ALIBI_BIAS.format(llm=llm), bias)
    def add_clamp_kqv(self, llm: str, value: float):
        self.add_float32(
-           constants.KEY_ATTENTION_CLAMP_KQV.format(llm=llm), value)
+           KEY_ATTENTION_CLAMP_KQV.format(llm=llm), value)
    def add_layer_norm_eps(self, llm: str, value: float):
        self.add_float32(
-           constants.KEY_ATTENTION_LAYERNORM_EPS.format(llm=llm), value)
+           KEY_ATTENTION_LAYERNORM_EPS.format(llm=llm), value)
    def add_layer_norm_rms_eps(self, llm: str, value: float):
        self.add_float32(
-           constants.KEY_ATTENTION_LAYERNORM_RMS_EPS.format(llm=llm), value)
+           KEY_ATTENTION_LAYERNORM_RMS_EPS.format(llm=llm), value)
    def add_rope_dimension_count(self, llm: str, count: int):
        self.add_uint32(
-           constants.KEY_ROPE_DIMENSION_COUNT.format(llm=llm), count)
+           KEY_ROPE_DIMENSION_COUNT.format(llm=llm), count)
    def add_rope_scale(self, llm: str, value: float):
-       self.add_float32(constants.KEY_ROPE_SCALE.format(llm=llm), value)
+       self.add_float32(KEY_ROPE_SCALE.format(llm=llm), value)
    def add_tokenizer_model(self, model: str):
-       self.add_string(constants.KEY_TOKENIZER_MODEL, model)
+       self.add_string(KEY_TOKENIZER_MODEL, model)
    def add_token_list(self, tokens: List):
-       self.add_array(constants.KEY_TOKENIZER_LIST, tokens)
+       self.add_array(KEY_TOKENIZER_LIST, tokens)
    def add_token_merges(self, merges: List):
-       self.add_array(constants.KEY_TOKENIZER_MERGES, merges)
+       self.add_array(KEY_TOKENIZER_MERGES, merges)
    def add_token_types(self, types: List[int]):
-       self.add_array(constants.KEY_TOKENIZER_TOKEN_TYPE, types)
+       self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
    def add_token_scores(self, scores: List[float]):
-       self.add_array(constants.KEY_TOKENIZER_SCORES, scores)
+       self.add_array(KEY_TOKENIZER_SCORES, scores)
    def add_bos_token_id(self, id: int):
-       self.add_uint32(constants.KEY_TOKENIZER_BOS_ID, id)
+       self.add_uint32(KEY_TOKENIZER_BOS_ID, id)
    def add_eos_token_id(self, id: int):
-       self.add_uint32(constants.KEY_TOKENIZER_EOS_ID, id)
+       self.add_uint32(KEY_TOKENIZER_EOS_ID, id)
    def add_unk_token_id(self, id: int):
-       self.add_uint32(constants.KEY_TOKENIZER_UNK_ID, id)
+       self.add_uint32(KEY_TOKENIZER_UNK_ID, id)
    def add_sep_token_id(self, id: int):
-       self.add_uint32(constants.KEY_TOKENIZER_SEP_ID, id)
+       self.add_uint32(KEY_TOKENIZER_SEP_ID, id)
    def add_pad_token_id(self, id: int):
-       self.add_uint32(constants.KEY_TOKENIZER_PAD_ID, id)
+       self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
# Example usage:
if __name__ == "__main__":


@@ -1,95 +0,0 @@
# Recommended mapping of model tensor names for storage in gguf
def get_tensor_namemap( n_blocks : int):
tensor_map = {}
# Token embeddings
mapped_to = "token_embd"
tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
tensor_map["transformer.word_embeddings"] = mapped_to # falcon
tensor_map["model.embed_tokens"] = mapped_to # llama-hf
tensor_map["tok_embeddings"] = mapped_to # llama-pth
# Position embeddings
mapped_to = "pos_embd"
tensor_map["transformer.wpe"] = mapped_to # gpt2
# Output norm
mapped_to = "output_norm"
tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
tensor_map["transformer.norm_f"] = mapped_to # mpt
tensor_map["model.norm"] = mapped_to # llama-hf
tensor_map["norm"] = mapped_to # llama-pth
# Output
mapped_to = "output"
tensor_map["embed_out"] = mapped_to # gptneox
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
tensor_map["output"] = mapped_to # llama-pth
# Attention and feed-forward layer blocks
for i in range(0,n_blocks):
# Attention norm
mapped_to = "blk."+str(i)+".attn_norm"
tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
# Attention norm 2
mapped_to = "blk."+str(i)+".attn_norm_2"
tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
# Attention query-key-value
mapped_to = "blk."+str(i)+".attn_qkv"
tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
# Attention query
mapped_to = "blk."+str(i)+".attn_q"
tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
# Attention key
mapped_to = "blk."+str(i)+".attn_k"
tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
# Attention value
mapped_to = "blk."+str(i)+".attn_v"
tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
# Attention output
mapped_to = "blk."+str(i)+".attn_output"
tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
# Feed-forward norm
mapped_to = "blk."+str(i)+".ffn_norm"
tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
# Feed-forward up
mapped_to = "blk."+str(i)+".ffn_up"
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
# Feed-forward gate
mapped_to = "blk."+str(i)+".ffn_gate"
tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
# Feed-forward down
mapped_to = "blk."+str(i)+".ffn_down"
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth
return tensor_map


@@ -381,6 +381,8 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
    if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
    keyidx = gguf_find_key(ggufctx, "general.file_type");
    if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+   keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
+   if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
    keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
    if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
    }


llama-util.h

@@ -1,553 +0,0 @@
// Internal header to be included only by llama.cpp.
// Contains wrappers around OS interfaces.
#ifndef LLAMA_UTIL_H
#define LLAMA_UTIL_H
#include <cstdio>
#include <cstdint>
#include <cerrno>
#include <cstring>
#include <cstdarg>
#include <cstdlib>
#include <climits>
#include <string>
#include <vector>
#include <stdexcept>
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
#endif
#endif
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <io.h>
#include <stdio.h> // for _fseeki64
#endif
#define LLAMA_ASSERT(x) \
do { \
if (!(x)) { \
fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
abort(); \
} \
} while (0)
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
LLAMA_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
LLAMA_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
}
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
LLAMA_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void seek(size_t offset, int whence) {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
LLAMA_ASSERT(ret == 0); // same
}
void read_raw(void * ptr, size_t len) const {
if (len == 0) {
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
throw std::runtime_error(std::string("unexpectedly reached end of file"));
}
}
std::uint32_t read_u32() {
std::uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::string read_string(std::uint32_t len) {
std::vector<char> chars(len);
read_raw(chars.data(), len);
return std::string(chars.data(), len);
}
void write_raw(const void * ptr, size_t len) const {
if (len == 0) {
return;
}
errno = 0;
size_t ret = std::fwrite(ptr, len, 1, fp);
if (ret != 1) {
throw std::runtime_error(format("write error: %s", strerror(errno)));
}
}
void write_u32(std::uint32_t val) {
write_raw(&val, sizeof(val));
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
};
// llama_context_data
struct llama_data_context {
virtual void write(const void * src, size_t size) = 0;
virtual size_t get_size_written() = 0;
virtual ~llama_data_context() = default;
};
struct llama_data_buffer_context : llama_data_context {
uint8_t* ptr;
size_t size_written = 0;
llama_data_buffer_context(uint8_t * p) : ptr(p) {}
void write(const void * src, size_t size) override {
memcpy(ptr, src, size);
ptr += size;
size_written += size;
}
size_t get_size_written() override {
return size_written;
}
};
struct llama_data_file_context : llama_data_context {
llama_file* file;
size_t size_written = 0;
llama_data_file_context(llama_file * f) : file(f) {}
void write(const void * src, size_t size) override {
file->write_raw(src, size);
size_written += size;
}
size_t get_size_written() override {
return size_written;
}
};
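// Usage sketch: both context types expose the same write() interface, so serialization
// code can target a caller-provided buffer or a llama_file without changes.
static size_t example_serialize(const void * src, size_t n, uint8_t * dst) {
    llama_data_buffer_context data_ctx(dst);
    data_ctx.write(src, n);
    return data_ctx.get_size_written(); // == n
}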
#if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) {
LPSTR buf;
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
if (!size) {
return "FormatMessageA failed";
}
std::string ret(buf, size);
LocalFree(buf);
return ret;
}
#endif
struct llama_mmap {
void * addr;
size_t size;
llama_mmap(const llama_mmap &) = delete;
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
// prefetch/readahead impairs performance on NUMA systems
if (numa) { prefetch = 0; }
#ifdef __linux__
if (prefetch >= file->size) { flags |= MAP_POPULATE; }
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
}
if (prefetch > 0) {
// Advise the kernel to preload the mapped memory
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
strerror(errno));
}
}
if (numa) {
// advise the kernel not to use readahead
// (because the next page might not belong on the same node)
if (madvise(addr, file->size, MADV_RANDOM)) {
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
strerror(errno));
}
}
}
~llama_mmap() {
munmap(addr, size);
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
(void) numa;
size = file->size;
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
DWORD error = GetLastError();
if (hMapping == NULL) {
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
}
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
error = GetLastError();
CloseHandle(hMapping);
if (addr == NULL) {
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
}
if (prefetch) {
// The PrefetchVirtualMemory API is only present on Windows 8 and above, so we
// will dynamically load it using GetProcAddress.
BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
HMODULE hKernel32;
// This call is guaranteed to succeed.
hKernel32 = GetModuleHandleW(L"kernel32.dll");
// This call may fail if on a pre-Win8 system.
pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
if (pPrefetchVirtualMemory) {
// Advise the kernel to preload the mapped memory.
WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr;
range.NumberOfBytes = (SIZE_T)size;
if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
}
}
~llama_mmap() {
if (!UnmapViewOfFile(addr)) {
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
(void) prefetch;
(void) numa;
throw std::runtime_error(std::string("mmap not supported"));
}
#endif
};
// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
struct llama_mlock {
void * addr = NULL;
size_t size = 0;
bool failed_already = false;
llama_mlock() {}
llama_mlock(const llama_mlock &) = delete;
~llama_mlock() {
if (size) {
raw_unlock(addr, size);
}
}
void init(void * ptr) {
LLAMA_ASSERT(addr == NULL && size == 0);
addr = ptr;
}
void grow_to(size_t target_size) {
LLAMA_ASSERT(addr);
if (failed_already) {
return;
}
size_t granularity = lock_granularity();
target_size = (target_size + granularity - 1) & ~(granularity - 1);
if (target_size > size) {
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
size = target_size;
} else {
failed_already = true;
}
}
}
#ifdef _POSIX_MEMLOCK_RANGE
static constexpr bool SUPPORTED = true;
size_t lock_granularity() {
return (size_t) sysconf(_SC_PAGESIZE);
}
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif
bool raw_lock(const void * addr, size_t size) {
if (!mlock(addr, size)) {
return true;
} else {
char* errmsg = std::strerror(errno);
bool suggest = (errno == ENOMEM);
// Check if the resource limit is fine after all
struct rlimit lock_limit;
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
suggest = false;
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
suggest = false;
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
return false;
}
}
#undef MLOCK_SUGGESTION
void raw_unlock(void * addr, size_t size) {
if (munlock(addr, size)) {
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
}
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
size_t lock_granularity() {
SYSTEM_INFO si;
GetSystemInfo(&si);
return (size_t) si.dwPageSize;
}
bool raw_lock(void * ptr, size_t len) {
for (int tries = 1; ; tries++) {
if (VirtualLock(ptr, len)) {
return true;
}
if (tries == 2) {
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
len, size, llama_format_win_err(GetLastError()).c_str());
return false;
}
// It failed but this was only the first try; increase the working
// set size and try again.
SIZE_T min_ws_size, max_ws_size;
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
return false;
}
// Per MSDN: "The maximum number of pages that a process can lock
// is equal to the number of pages in its minimum working set minus
// a small overhead."
// Hopefully a megabyte is enough overhead:
size_t increment = len + 1048576;
// The minimum must be <= the maximum, so we need to increase both:
min_ws_size += increment;
max_ws_size += increment;
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
return false;
}
}
}
void raw_unlock(void * ptr, size_t len) {
if (!VirtualUnlock(ptr, len)) {
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
size_t lock_granularity() {
return (size_t) 65536;
}
bool raw_lock(const void * addr, size_t len) {
fprintf(stderr, "warning: mlock not supported on this system\n");
return false;
}
void raw_unlock(const void * addr, size_t len) {}
#endif
};
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
struct llama_buffer {
uint8_t * addr = NULL;
size_t size = 0;
llama_buffer() = default;
void resize(size_t len) {
#ifdef GGML_USE_METAL
free(addr);
int result = posix_memalign((void **) &addr, getpagesize(), len);
if (result == 0) {
memset(addr, 0, len);
}
else {
addr = NULL;
}
#else
delete[] addr;
addr = new uint8_t[len];
#endif
size = len;
}
~llama_buffer() {
#ifdef GGML_USE_METAL
free(addr);
#else
delete[] addr;
#endif
addr = NULL;
}
// disable copy and move
llama_buffer(const llama_buffer&) = delete;
llama_buffer(llama_buffer&&) = delete;
llama_buffer& operator=(const llama_buffer&) = delete;
llama_buffer& operator=(llama_buffer&&) = delete;
};
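// Usage sketch: llama_buffer is an owning byte buffer that, unlike std::vector, leaves
// its contents uninitialized on resize (outside the Metal path above).
static void example_buffer() {
    llama_buffer buf;
    buf.resize(1024);              // allocate 1 KiB, contents indeterminate
    memset(buf.addr, 0, buf.size); // caller initializes as needed
}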
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
uint8_t * addr = NULL;
bool is_cuda;
size_t size = 0;
llama_ctx_buffer() = default;
void resize(size_t size) {
free();
addr = (uint8_t *) ggml_cuda_host_malloc(size);
if (addr) {
is_cuda = true;
}
else {
// fall back to pageable memory
addr = new uint8_t[size];
is_cuda = false;
}
this->size = size;
}
void free() {
if (addr) {
if (is_cuda) {
ggml_cuda_host_free(addr);
}
else {
delete[] addr;
}
}
addr = NULL;
}
~llama_ctx_buffer() {
free();
}
// disable copy and move
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
};
#else
typedef llama_buffer llama_ctx_buffer;
#endif
#endif

llama.cpp

File diff suppressed because it is too large

llama.h

@@ -34,29 +34,18 @@
# define DEPRECATED(func, hint) func
#endif
-#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
-#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
-#define LLAMA_FILE_VERSION 3
-#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 1
-#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
-#ifndef LLAMA_DEFAULT_RMS_EPS
-#define LLAMA_DEFAULT_RMS_EPS 5e-6f
-#endif
#ifdef __cplusplus
extern "C" {
#endif
@@ -103,8 +92,6 @@ extern "C" {
        uint32_t seed;         // RNG seed, -1 for random
        int32_t  n_ctx;        // text context
        int32_t  n_batch;      // prompt processing batch size
-       int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
-       float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
        int32_t  n_gpu_layers; // number of layers to store in VRAM
        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
@@ -129,6 +116,7 @@ extern "C" {
        bool use_mlock; // force system to keep model in RAM
        bool embedding; // embedding mode only
    };
    // model file types
    enum llama_ftype {
        LLAMA_FTYPE_ALL_F32 = 0,
@@ -208,17 +196,12 @@ extern "C" {
        int32_t n_eval;
    };
-   // Set callback for all future logging events.
-   // If this is not called, or NULL is supplied, everything is output on stderr.
-   LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
-   LLAMA_API int llama_max_devices();
-   LLAMA_API struct llama_context_params llama_context_default_params();
-   LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
-   LLAMA_API bool llama_mmap_supported();
-   LLAMA_API bool llama_mlock_supported();
+   LLAMA_API struct llama_context_params llama_context_default_params(void);
+   LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
+   LLAMA_API int llama_max_devices(void);
+   LLAMA_API bool llama_mmap_supported(void);
+   LLAMA_API bool llama_mlock_supported(void);
    // TODO: not great API - very likely to change
    // Initialize the llama + ggml backend
@@ -226,9 +209,9 @@ extern "C" {
    // Call once at the start of the program
    LLAMA_API void llama_backend_init(bool numa);
    // Call once at the end of the program - currently only used for MPI
-   LLAMA_API void llama_backend_free();
-   LLAMA_API int64_t llama_time_us();
+   LLAMA_API void llama_backend_free(void);
+   LLAMA_API int64_t llama_time_us(void);
    LLAMA_API struct llama_model * llama_load_model_from_file(
        const char * path_model,
@@ -240,13 +223,6 @@
        struct llama_model * model,
        struct llama_context_params params);
-   // Various functions for loading a ggml llama model.
-   // Allocate (almost) all memory needed for the model.
-   // Return NULL on failure
-   LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
-       const char * path_model,
-       struct llama_context_params params),
-       "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);
@@ -384,27 +360,28 @@
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
    // Token Id -> String. Uses the vocabulary in the provided context
+   // Does not write null terminator to the buffer
    LLAMA_API int llama_token_to_str(
        const struct llama_context * ctx,
        llama_token token,
-       char * str,
+       char * buf,
        int length);
    LLAMA_API int llama_token_to_str_bpe(
        const struct llama_context * ctx,
        llama_token token,
-       char * str,
+       char * buf,
        int length);
    LLAMA_API int llama_token_to_str_with_model(
        const struct llama_model * model,
        llama_token token,
-       char * str,
+       char * buf,
        int length);
    // Special tokens
-   LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
-   LLAMA_API llama_token llama_token_eos(); // end-of-sentence
-   LLAMA_API llama_token llama_token_nl();  // next-line
+   LLAMA_API llama_token llama_token_bos(void); // beginning-of-sentence
+   LLAMA_API llama_token llama_token_eos(void); // end-of-sentence
+   LLAMA_API llama_token llama_token_nl(void);  // next-line
    // Grammar
    //
@@ -484,6 +461,10 @@
    // Print system information
    LLAMA_API const char * llama_print_system_info(void);
+   // Set callback for all future logging events.
+   // If this is not called, or NULL is supplied, everything is output on stderr.
+   LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
#ifdef __cplusplus
}
#endif