Merge branch 'gguf' of https://github.com/ggerganov/llama.cpp into gguf

commit d864596e0a

29 changed files with 2506 additions and 8197 deletions
@@ -529,7 +529,6 @@ endif()
 add_library(llama
             llama.cpp
             llama.h
-            llama-util.h
             )

 target_include_directories(llama PUBLIC .)
Makefile (12 changed lines)

@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf gguf-llama-simple gptneox-main
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf gptneox-main

 # Binaries only useful for tests
 TEST_TARGETS = tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0

@@ -329,10 +329,7 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h

 OBJS += ggml-alloc.o

-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-gguf-llama.o: gguf-llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h gguf-llama.h gguf-util.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 common.o: examples/common.cpp examples/common.h

@@ -388,10 +385,7 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
 embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput

-gguf: examples/gguf/gguf.cpp build-info.h ggml.o gguf-llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-gguf-llama-simple: examples/gguf/gguf-llama-simple.cpp build-info.h ggml.o gguf-llama.o common.o $(OBJS)
+gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 gptneox-main: gptneox-main.cpp ggml.o $(OBJS)
constants.py (50 changed lines, file deleted)

@@ -1,50 +0,0 @@
-GGUF_MAGIC = 0x47475546
-GGUF_VERSION = 1
-GGUF_DEFAULT_ALIGNMENT = 32
-
-# general
-KEY_GENERAL_ARCHITECTURE = "general.architecture"
-KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
-KEY_GENERAL_ALIGNMENT = "general.alignment"
-KEY_GENERAL_NAME = "general.name"
-KEY_GENERAL_AUTHOR = "general.author"
-KEY_GENERAL_URL = "general.url"
-KEY_GENERAL_DESCRIPTION = "general.description"
-KEY_GENERAL_FILE_TYPE = "general.file_type"
-KEY_GENERAL_LICENSE = "general.license"
-KEY_GENERAL_SOURCE_URL = "general.source.url"
-KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
-
-# LLM
-KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length"
-KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length"
-KEY_LLM_BLOCK_COUNT = "{llm}.block_count"
-KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length"
-KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual"
-KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout"
-
-# attention
-KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count"
-KEY_ATTENTION_HEAD_COUNT_KV = "{llm}.attention.head_count_kv"
-KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias"
-KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv"
-KEY_ATTENTION_LAYERNORM_EPS = "{llm}.attention.layer_norm_epsilon"
-KEY_ATTENTION_LAYERNORM_RMS_EPS = "{llm}.attention.layer_norm_rms_epsilon"
-
-# RoPE
-KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count"
-KEY_ROPE_SCALE = "{llm}.rope.scale"
-
-# tokenization
-KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
-KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
-KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
-KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
-KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
-KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
-KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
-KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
-KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
-KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
-KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
-KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
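The keys above are the GGUF metadata names that the converters below emit through gguf_writer; the "{llm}" placeholder in the per-architecture keys is presumably filled in with the architecture name passed as llm_arch. A tiny sketch of that substitution (the "llama" value is an illustrative assumption, not taken from this diff):

    # hypothetical expansion of one templated key for a given architecture
    KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length"
    print(KEY_LLM_CONTEXT_LENGTH.format(llm="llama"))  # -> llama.context_length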
@@ -1,15 +1,15 @@
 # HF gptneox--> gguf conversion

 import gguf
-import gguf_namemap as tmap
 import os
 import sys
 import struct
 import json
 import numpy as np
+import torch

 from typing import Any, List
 from pathlib import Path
-import torch
 from transformers import AutoTokenizer

 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py

@@ -188,7 +188,7 @@ if Path(dir_model + "/tokenizer.json").is_file():

 # TENSORS

-tensor_map = tmap.get_tensor_namemap(block_count)
+tensor_map = gguf.get_tensor_name_map(block_count)

 # tensor info
 print("gguf: get tensor metadata")
@@ -3,18 +3,17 @@
 # HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model

 import gguf
-import gguf_namemap as tmap
 import os
 import sys
 import struct
 import json
 import numpy as np
 import torch

 from typing import Any, List
 from pathlib import Path
 from sentencepiece import SentencePieceProcessor


 #NDArray = np.ndarray[Any, Any]
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'

@@ -96,6 +95,7 @@ gguf_writer.add_architecture(llm_arch)
 gguf_writer.add_name(last_dir)
 gguf_writer.add_file_type( "All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
 gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout(llm_arch, "Meta AI original pth")
 gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"])
 gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"])
 gguf_writer.add_block_count(llm_arch, block_count)

@@ -188,7 +188,7 @@ if Path(dir_model + "/tokenizer.json").is_file():

 # TENSORS

-tensor_map = tmap.get_tensor_namemap(block_count)
+tensor_map = gguf.get_tensor_name_map(block_count)

 # tensor info
 print("gguf: get tensor metadata")

@@ -260,7 +260,6 @@ for part_name in part_names:
     for name in model_part.keys():
         data = model_part[name]

-
         old_dtype = data.dtype

         # we don't need these
@@ -1,8 +1,6 @@
 # HF llama --> gguf conversion

 import gguf
-import gguf_namemap as tmap
-
 import os
 import sys
 import struct

@@ -18,7 +16,9 @@ from sentencepiece import SentencePieceProcessor
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'

-def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+# reverse HF permute back to original pth layout
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
+def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
     if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
             .swapaxes(1, 2)
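The renamed reverse_hf_permute only reorders rows within each head-sized block of a q_proj/k_proj weight and leaves the shape unchanged. A small standalone sketch of that effect (the 8x3 matrix and n_head=2 are made-up values, not taken from the converter):

    import numpy as np

    def reverse_hf_permute(weights, n_head, n_kv_head=None):
        # same body as in the converter above
        if n_kv_head is not None and n_head != n_kv_head:
            n_head //= n_kv_head
        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                .swapaxes(1, 2)
                .reshape(weights.shape))

    w = np.arange(8)[:, None] * np.ones((1, 3))   # row i of this 8x3 matrix holds the value i
    print(reverse_hf_permute(w, n_head=2)[:, 0])  # rows come out as [0. 2. 1. 3. 4. 6. 5. 7.]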
@@ -93,11 +93,21 @@ if "_name_or_path" in hparams:
 else:
     hf_repo=""

+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+else:
+    print("gguf: can not find ctx length parameter.")
+    sys.exit()
+

 gguf_writer.add_architecture(llm_arch)
 gguf_writer.add_name(last_dir)
 gguf_writer.add_file_type("All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
 gguf_writer.add_source_hf_repo(hf_repo)
-gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"])
+gguf_writer.add_tensor_data_layout(llm_arch, "Meta AI original pth")
+gguf_writer.add_context_length(llm_arch, ctx_length)
 gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"])
 gguf_writer.add_block_count(llm_arch, block_count)
 gguf_writer.add_feed_forward_length(llm_arch, hparams["intermediate_size"])
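The new fallback above exists presumably because HF configs are not uniform: some carry max_sequence_length while others only have max_position_embeddings. A quick sketch with a hypothetical hparams dict showing which branch supplies ctx_length:

    hparams = {"max_position_embeddings": 4096, "hidden_size": 4096}  # made-up config values

    if "max_sequence_length" in hparams:
        ctx_length = hparams["max_sequence_length"]
    elif "max_position_embeddings" in hparams:
        ctx_length = hparams["max_position_embeddings"]
    else:
        raise SystemExit("gguf: can not find ctx length parameter.")  # stands in for print + sys.exit()

    print(ctx_length)  # -> 4096, taken from max_position_embeddings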
@@ -189,7 +199,7 @@ if Path(dir_model + "/tokenizer.json").is_file():

 # TENSORS

-tensor_map = tmap.get_tensor_namemap(block_count)
+tensor_map = gguf.get_tensor_name_map(block_count)

 # tensor info
 print("gguf: get tensor metadata")

@@ -218,9 +228,9 @@ for part_name in part_names:

     data = data.squeeze().numpy()

-    # permute these
+    # reverse permute these
     if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
-        data = permute(data, head_count, head_count_kv)
+        data = reverse_hf_permute(data, head_count, head_count_kv)

     # map tensor names
     if name.endswith(".weight") and name[:-7] in tensor_map:

@@ -287,9 +297,9 @@ for part_name in part_names:

     data = data.squeeze().numpy()

-    # permute these
+    # reverse permute these
     if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
-        data = permute(data, head_count, head_count_kv)
+        data = reverse_hf_permute(data, head_count, head_count_kv)

     # map tensor names
     if name.endswith(".weight") and name[:-7] in tensor_map:

@@ -315,7 +325,7 @@ for part_name in part_names:
     if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
         data = data.astype(np.float16)

-    print( name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+    print(name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype))

     gguf_writer.write_tensor_to_file(data)
@@ -104,7 +104,7 @@ TENSORS_SET = set(TENSORS_LIST)

 def find_n_mult(n_ff: int, n_embd: int) -> int:
     # hardcoded magic range
-    for n_mult in range(256, 1, -1):
+    for n_mult in range(8192, 1, -1):
         calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
         if calc_ff == n_ff:
             return n_mult
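Widening the search range lets find_n_mult recover multipliers larger than 256. A self-contained sketch using LLaMA-2 70B-sized values (n_embd = 8192, n_ff = 28672, quoted from memory rather than from this diff):

    def find_n_mult(n_ff: int, n_embd: int) -> int:
        # same search as in the converter, with the widened range
        for n_mult in range(8192, 1, -1):
            calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
            if calc_ff == n_ff:
                return n_mult
        raise ValueError(f"failed to find n_mult for n_ff={n_ff}, n_embd={n_embd}")

    print(find_n_mult(28672, 8192))  # -> 7168; no candidate <= 256 reproduces this n_ff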
@@ -118,6 +118,7 @@ class Params:
     n_mult: int
     n_head: int
     n_layer: int
+    n_kv_head: Optional[int] # This parameter is only used for Llama 2

     @staticmethod
     def guessed(model: 'LazyModel') -> 'Params':

@@ -144,6 +145,7 @@ class Params:
             n_mult = 256,
             n_head = n_head,
             n_layer = n_layer,
+            n_kv_head = None,
         )

     @staticmethod

@@ -155,6 +157,7 @@ class Params:
         n_head = config["num_attention_heads"];
         n_layer = config["num_hidden_layers"];
         n_ff = config["intermediate_size"];
+        n_kv_head = config.get("num_key_value_heads")

         n_mult = find_n_mult(n_ff, n_embd);

@@ -164,6 +167,7 @@ class Params:
             n_mult = n_mult,
             n_head = n_head,
             n_layer = n_layer,
+            n_kv_head = n_kv_head,
         )

     # LLaMA v2 70B params.json

@@ -187,6 +191,7 @@ class Params:
             n_mult = n_mult,
             n_head = n_head,
             n_layer = n_layer,
+            n_kv_head = None,
         )

     @staticmethod

@@ -293,7 +298,9 @@ class SentencePieceVocab:
 Vocab = Union[BpeVocab, SentencePieceVocab]


-def permute(weights: NDArray, n_head: int) -> NDArray:
+def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head //= n_kv_head
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
             .swapaxes(1, 2)
             .reshape(weights.shape))
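The extra n_kv_head argument is for grouped-query attention, where the key/value projections have fewer heads than the queries; dividing n_head by n_kv_head gives the number of head-sized blocks actually present in k_proj. A tiny sketch with assumed LLaMA-2 70B-like values (64 query heads, 8 KV heads), not taken from this diff:

    n_head, n_kv_head = 64, 8  # hypothetical GQA configuration

    # mirrors the guard added to permute() / reverse_hf_permute() above
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head

    print(n_head)  # -> 8: the k_proj weight is permuted as 8 head blocks, not 64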
@@ -305,7 +312,7 @@ class Tensor(metaclass=ABCMeta):
     @abstractmethod
     def astype(self, data_type: DataType) -> 'Tensor': ...
     @abstractmethod
-    def permute(self, n_head: int) -> 'Tensor': ...
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...
     @abstractmethod
     def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
     @abstractmethod

@@ -343,8 +350,8 @@ class UnquantizedTensor(Tensor):
         r = self.ndarray.shape[0] // 3
         return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])

-    def permute(self, n_head: int) -> 'UnquantizedTensor':
-        return UnquantizedTensor(permute(self.ndarray, n_head))
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
+        return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))


 def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:

@@ -367,18 +374,18 @@ GGMLCompatibleTensor = Union[UnquantizedTensor]


 class DeferredPermutedTensor(Tensor):
-    def __init__(self, base: Tensor, n_head: int) -> None:
+    def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
         self.base = base
         self.n_head = n_head
         self.data_type = self.base.data_type

     def astype(self, data_type: DataType) -> Tensor:
-        return self.base.astype(data_type).permute(self.n_head)
+        return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)

     def to_ggml(self) -> GGMLCompatibleTensor:
-        return self.base.to_ggml().permute(self.n_head)
+        return self.base.to_ggml().permute(self.n_head, self.n_kv_head)

-    def permute(self, n_head: int) -> Tensor:
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
         raise Exception("shouldn't permute twice")


@@ -474,10 +481,10 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
     return ModelPlus(model, paths, format, vocab)


-def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
+def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
     def load() -> Tensor:
-        return lazy_tensor.load().permute(n_head)
-    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
+        return lazy_tensor.load().permute(n_head, n_kv_head)
+    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description)

 def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
     def load() -> Tensor:

@@ -502,7 +509,7 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" in model:
             out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
-            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
+            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
             out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
             out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
@@ -170,18 +170,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "-gqa" || arg == "--gqa") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_gqa = std::stoi(argv[i]);
-        } else if (arg == "-eps" || arg == "--rms-norm-eps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rms_norm_eps = std::stof(argv[i]);
         } else if (arg == "--rope-freq-base") {
             if (++i >= argc) {
                 invalid_param = true;

@@ -546,8 +534,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  -n N, --n-predict N  number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     fprintf(stdout, "  -c N, --ctx-size N  size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, "  -gqa N, --gqa N  grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, "  -eps N, --rms-norm-eps N  rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
     fprintf(stdout, "  --top-k N  top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
     fprintf(stdout, "  --top-p N  top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
     fprintf(stdout, "  --tfs N  tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);

@@ -638,8 +624,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param

     lparams.n_ctx = params.n_ctx;
     lparams.n_batch = params.n_batch;
-    lparams.n_gqa = params.n_gqa;
-    lparams.rms_norm_eps = params.rms_norm_eps;
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.main_gpu = params.main_gpu;
     lparams.tensor_split = params.tensor_split;
@@ -23,14 +23,12 @@ struct gpt_params {
     int32_t n_predict = -1;  // new tokens to predict
     int32_t n_ctx = 512;     // context size
     int32_t n_batch = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_gqa = 1;       // grouped-query attention factor (TODO: move to hparams)
     int32_t n_keep = 0;      // number of tokens to keep from initial prompt
     int32_t n_chunks = -1;   // max number of chunks to process (-1 = unlimited)
     int32_t n_gpu_layers = 0; // number of layers to store in VRAM
     int32_t main_gpu = 0;    // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
     int32_t n_probs = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
     float rope_freq_base = 10000.0f; // RoPE base frequency
     float rope_freq_scale = 1.0f;    // RoPE frequency scaling factor
@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
+
 #include <unordered_map>
 #include <vector>
 #include <cassert>

@@ -502,7 +503,7 @@ bool is_ggml_file(const char *filename) {
         return false;
     }
     uint32_t magic = file.read_u32();
-    return magic == LLAMA_FILE_MAGIC;
+    return magic == GGUF_MAGIC;
 }

 void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
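The magic check now accepts GGUF files instead of the old llama file format. As a side note grounded in the deleted constants.py above, the GGUF magic 0x47475546 is just the ASCII string "GGUF"; a one-liner to confirm:

    print(bytes.fromhex("47475546").decode("ascii"))  # -> GGUF (magic 0x47475546 from constants.py)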
@@ -590,75 +591,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     if (file.fp == NULL) {
         return;
     }
-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
-    // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);
-
-    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
-
-    // stuff AK weights into GG weights one by one.
-    // w->token_embedding_table -> model->tok_embeddings
-    // float* -> struct ggml_tensor
-    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
-    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
-
-    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
-    //print_row(model->norm, 0);
-
-    // for rms-att-weight
-    int row_length = model->hparams.n_embd;
-    const auto & hparams = model->hparams;
-    //int n_ff = model->hparams.n_embd;
-    int n_ff = get_n_ff(&hparams);
-
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
-        auto & layer = model->layers[i];
-        // 1d
-        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
-
-        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
-        stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
-
-        stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
-        stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
-    }
-    // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output); // ?
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
-    }
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+    (void) w;
+    // // write_magic
+    // file.write_u32(LLAMA_FILE_MAGIC);   // magic
+    // file.write_u32(LLAMA_FILE_VERSION); // version
+    // // write_hparams
+    // file.write_u32(model->hparams.n_vocab);
+    // file.write_u32(model->hparams.n_embd);
+    // file.write_u32(model->hparams.n_mult);
+    // file.write_u32(model->hparams.n_head);
+    // file.write_u32(model->hparams.n_layer);
+    // file.write_u32(model->hparams.n_rot);
+    // file.write_u32(LLAMA_FTYPE_ALL_F32);
+    //
+    // // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+    // uint32_t n_vocab = model->hparams.n_vocab;
+    // for (uint32_t i = 0; i < n_vocab; i++) {
+    //     const auto & token_score = vocab->id_to_token.at(i);
+    //     file.write_u32((uint32_t) token_score.tok.size());
+    //     file.write_raw(token_score.tok.data(), token_score.tok.size());
+    //     file.write_raw(&token_score.score, sizeof(token_score.score));
+    // }
+    //
+    // // stuff AK weights into GG weights one by one.
+    // // w->token_embedding_table -> model->tok_embeddings
+    // // float* -> struct ggml_tensor
+    // stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    // stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
+    //
+    // stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+    // //print_row(model->norm, 0);
+    //
+    // // for rms-att-weight
+    // int row_length = model->hparams.n_embd;
+    // const auto & hparams = model->hparams;
+    // //int n_ff = model->hparams.n_embd;
+    // int n_ff = get_n_ff(&hparams);
+    //
+    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+    //     auto & layer = model->layers[i];
+    //     // 1d
+    //     stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
+    //
+    //     // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+    //     stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
+    //
+    //     stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
+    //     stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
+    // }
+    // // write tensors
+    // write_tensor(&file, model->tok_embeddings);
+    // write_tensor(&file, model->norm);
+    // write_tensor(&file, model->output); // ?
+    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+    //     auto & layer = model->layers[i];
+    //
+    //     write_tensor(&file, layer.attention_norm);
+    //     write_tensor(&file, layer.wq);
+    //     write_tensor(&file, layer.wk);
+    //     write_tensor(&file, layer.wv);
+    //     write_tensor(&file, layer.wo);
+    //     write_tensor(&file, layer.ffn_norm);
+    //     write_tensor(&file, layer.w1);
+    //     write_tensor(&file, layer.w2);
+    //     write_tensor(&file, layer.w3);
+    // }
 }

 struct train_params get_default_train_params() {
@@ -1,126 +0,0 @@
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
-#include "common.h"
-#include "gguf-llama.h"
-#include "build-info.h"
-
-#include <cmath>
-#include <cstdio>
-#include <string>
-#include <vector>
-
-int main(int argc, char ** argv) {
-    gpt_params params;
-
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
-        return 1 ;
-    }
-
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-
-    if (argc >= 3) {
-        params.prompt = argv[2];
-    }
-
-    if (params.prompt.empty()) {
-        params.prompt = "Hello my name is";
-    }
-
-    // init LLM
-
-    llama_backend_init(params.numa);
-
-    llama_context_params ctx_params = llama_context_default_params();
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
-
-    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
-        return 1;
-    }
-
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-
-    // tokenize the prompt
-
-    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
-
-    const int max_context_size = llama_n_ctx(ctx);
-    const int max_tokens_list_size = max_context_size - 4;
-
-    if ((int) tokens_list.size() > max_tokens_list_size) {
-        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
-        return 1;
-    }
-
-    fprintf(stderr, "\n\n");
-
-    for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
-    }
-
-    fflush(stderr);
-
-    // main loop
-
-    // The LLM keeps a contextual cache memory of previous token evaluation.
-    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
-    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
-    // example, we will just stop the loop once this cache is full or once an end of stream is detected.
-
-    while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
-        // evaluate the transformer
-
-        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return 1;
-        }
-
-        tokens_list.clear();
-
-        // sample the next token
-
-        llama_token new_token_id = 0;
-
-        auto logits = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(ctx);
-
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-        }
-
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
-
-        // is it an end of stream ?
-        if (new_token_id == llama_token_eos()) {
-            fprintf(stderr, " [end of text]\n");
-            break;
-        }
-
-        // print the new token :
-        printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
-        fflush(stdout);
-
-        // push this new token for next evaluation
-        tokens_list.push_back(new_token_id);
-    }
-
-    llama_free(ctx);
-    llama_free_model(model);
-
-    llama_backend_free();
-
-    return 0;
-}
@@ -1,6 +1,5 @@
 #include "ggml.h"
-#include "gguf-util.h"
-#include "gguf-llama.h"
+#include "llama.h"

 #include <cstdio>
 #include <cinttypes>

@@ -21,133 +20,22 @@ static std::string to_string(const T & val) {
     return ss.str();
 }

-void gguf_ex_write_str(std::ofstream & fout, const std::string & val) {
-    const int32_t n = val.size();
-    fout.write((const char *) &n, sizeof(n));
-    fout.write(val.c_str(), n);
-}
-
-void gguf_ex_write_i32(std::ofstream & fout, int32_t val) {
-    fout.write((const char *) &val, sizeof(val));
-}
-
-void gguf_ex_write_u64(std::ofstream & fout, size_t val) {
-    fout.write((const char *) &val, sizeof(val));
-}
-
-template<typename T>
-void gguf_ex_write_val(std::ofstream & fout, const std::string & key, enum gguf_type type, const T & val) {
-    gguf_ex_write_str(fout, key);
-    fout.write((const char *) &type, sizeof(type));
-    fout.write((const char *) &val, sizeof(val));
-
-    fprintf(stdout, "%s: write param: %s = %s\n", __func__, key.c_str(), to_string(val).c_str());
-}
-
-template<>
-void gguf_ex_write_val<std::string>(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::string & val) {
-    gguf_ex_write_str(fout, key);
-    fout.write((const char *) &type, sizeof(type));
-
-    const int32_t n = val.size();
-    fout.write((const char *) &n, sizeof(n));
-    fout.write(val.c_str(), n);
-
-    fprintf(stdout, "%s: write param: %s = %s\n", __func__, key.c_str(), val.c_str());
-}
-
-template<typename T>
-void gguf_ex_write_arr(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::vector<T> & val) {
-    gguf_ex_write_str(fout, key);
-    {
-        const enum gguf_type tarr = GGUF_TYPE_ARRAY;
-        fout.write((const char *) &tarr, sizeof(tarr));
-    }
-
-    const int32_t n = val.size();
-    fout.write((const char *) &type, sizeof(type));
-    fout.write((const char *) &n, sizeof(n));
-    fout.write((const char *) val.data(), n * sizeof(T));
-
-    fprintf(stdout, "%s: write param: %s = [", __func__, key.c_str());
-    for (int i = 0; i < n; ++i) {
-        fprintf(stdout, "%s", to_string(val[i]).c_str());
-        if (i < n - 1) {
-            fprintf(stdout, ", ");
-        }
-    }
-    fprintf(stdout, "]\n");
-}
-
-template<>
-void gguf_ex_write_arr<std::string>(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::vector<std::string> & val) {
-    gguf_ex_write_str(fout, key);
-    {
-        const enum gguf_type tarr = GGUF_TYPE_ARRAY;
-        fout.write((const char *) &tarr, sizeof(tarr));
-    }
-
-    const int32_t n = val.size();
-    fout.write((const char *) &type, sizeof(type));
-    fout.write((const char *) &n, sizeof(n));
-    for (int i = 0; i < n; ++i) {
-        const int32_t nstr = val[i].size();
-        fout.write((const char *) &nstr, sizeof(nstr));
-        fout.write(val[i].c_str(), nstr);
-    }
-
-    fprintf(stdout, "%s: write param: %s = [", __func__, key.c_str());
-    for (int i = 0; i < n; ++i) {
-        fprintf(stdout, "%s", val[i].c_str());
-        if (i < n - 1) {
-            fprintf(stdout, ", ");
-        }
-    }
-    fprintf(stdout, "]\n");
-}
-
 bool gguf_ex_write(const std::string & fname) {
-    std::ofstream fout(fname.c_str(), std::ios::binary);
-
-    {
-        const int32_t magic = GGUF_MAGIC;
-        fout.write((const char *) &magic, sizeof(magic));
-    }
-
-    {
-        const int32_t version = GGUF_VERSION;
-        fout.write((const char *) &version, sizeof(version));
-    }
-
-    // NOTE: these have to match the output below!
-    const int n_tensors = 10;
-    const int n_kv = 12;
-
-    fout.write((const char*) &n_tensors, sizeof(n_tensors));
-    fout.write((const char*) &n_kv, sizeof(n_kv));
-
-    fprintf(stdout, "%s: write header\n", __func__);
-
-    // kv data
-    {
-        gguf_ex_write_val< uint8_t>(fout, "some.parameter.uint8", GGUF_TYPE_UINT8, 0x12);
-        gguf_ex_write_val< int8_t>(fout, "some.parameter.int8", GGUF_TYPE_INT8, -0x13);
-        gguf_ex_write_val<uint16_t>(fout, "some.parameter.uint16", GGUF_TYPE_UINT16, 0x1234);
-        gguf_ex_write_val< int16_t>(fout, "some.parameter.int16", GGUF_TYPE_INT16, -0x1235);
-        gguf_ex_write_val<uint32_t>(fout, "some.parameter.uint32", GGUF_TYPE_UINT32, 0x12345678);
-        gguf_ex_write_val< int32_t>(fout, "some.parameter.int32", GGUF_TYPE_INT32, -0x12345679);
-
-        gguf_ex_write_val<float> (fout, "some.parameter.float32", GGUF_TYPE_FLOAT32, 0.123456789f);
-        gguf_ex_write_val<bool> (fout, "some.parameter.bool", GGUF_TYPE_BOOL, true);
-
-        gguf_ex_write_val<std::string>(fout, "some.parameter.string", GGUF_TYPE_STRING, "hello world");
-
-        gguf_ex_write_arr<int16_t> (fout, "some.parameter.arr.i16", GGUF_TYPE_INT16, { 1, 2, 3, 4, });
-        gguf_ex_write_arr<float> (fout, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, { 3.145f, 2.718f, 1.414f, });
-        gguf_ex_write_arr<std::string>(fout, "some.parameter.arr.str", GGUF_TYPE_STRING, { "hello", "world", "!" });
-    }
-
-    uint64_t offset_tensor = 0;
+    struct gguf_context * ctx = gguf_init_empty();
+
+    gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
+    gguf_set_val_i8 (ctx, "some.parameter.int8", -0x13);
+    gguf_set_val_u16 (ctx, "some.parameter.uint16", 0x1234);
+    gguf_set_val_i16 (ctx, "some.parameter.int16", -0x1235);
+    gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
+    gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
+    gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
+    gguf_set_val_bool(ctx, "some.parameter.bool", true);
+    gguf_set_val_str (ctx, "some.parameter.string", "hello world");
+
+    gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16, std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
+    gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
+    gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);

     struct ggml_init_params params = {
         /*.mem_size =*/ 128ull*1024ull*1024ull,

@@ -157,6 +45,8 @@ bool gguf_ex_write(const std::string & fname) {

     struct ggml_context * ctx_data = ggml_init(params);

+    const int n_tensors = 10;
+
     // tensor infos
     for (int i = 0; i < n_tensors; ++i) {
         const std::string name = "tensor_" + to_string(i);

@@ -178,58 +68,15 @@ bool gguf_ex_write(const std::string & fname) {
             }
         }

-        fprintf(stdout, "%s: tensor: %s, %d dims, ne = [", __func__, name.c_str(), n_dims);
-        for (int j = 0; j < 4; ++j) {
-            fprintf(stdout, "%s%3d", j == 0 ? "" : ", ", (int) cur->ne[j]);
-        }
-        fprintf(stdout, "], offset_tensor = %6" PRIu64 "\n", offset_tensor);
-
-        gguf_ex_write_str(fout, name);
-        gguf_ex_write_i32(fout, n_dims);
-        for (int j = 0; j < n_dims; ++j) {
-            gguf_ex_write_i32(fout, cur->ne[j]);
-        }
-        gguf_ex_write_i32(fout, cur->type);
-        gguf_ex_write_u64(fout, offset_tensor);
-
-        offset_tensor += GGML_PAD(ggml_nbytes(cur), GGUF_DEFAULT_ALIGNMENT);
+        gguf_add_tensor(ctx, cur);
     }

-    const uint64_t offset_data = GGML_PAD((uint64_t) fout.tellp(), GGUF_DEFAULT_ALIGNMENT);
-
-    fprintf(stdout, "%s: data offset = %" PRIu64 "\n", __func__, offset_data);
-
-    {
-        const size_t pad = offset_data - fout.tellp();
-
-        for (size_t j = 0; j < pad; ++j) {
-            fout.put(0);
-        }
-    }
-
-    for (int i = 0; i < n_tensors; ++i) {
-        fprintf(stdout, "%s: writing tensor %d data\n", __func__, i);
-
-        const std::string name = "tensor_" + to_string(i);
-
-        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str());
-
-        fout.write((const char *) cur->data, ggml_nbytes(cur));
-
-        {
-            const size_t pad = GGML_PAD(ggml_nbytes(cur), GGUF_DEFAULT_ALIGNMENT) - ggml_nbytes(cur);
-
-            for (size_t j = 0; j < pad; ++j) {
-                fout.put(0);
-            }
-        }
-    }
-
-    fout.close();
+    gguf_write_to_file(ctx, fname.c_str(), false);

     fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());

     ggml_free(ctx_data);
+    gguf_free(ctx);

     return true;
 }

@@ -345,8 +192,16 @@ bool gguf_ex_read_1(const std::string & fname) {

         struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

-        fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n",
-                __func__, i, cur->n_dims, cur->name, cur->data);
+        fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+
+        // print first 10 elements
+        const float * data = (const float *) cur->data;
+
+        printf("%s data[:10] : ", name);
+        for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
+            printf("%f ", data[j]);
+        }
+        printf("\n\n");

         // check data
         {

@@ -369,48 +224,6 @@ bool gguf_ex_read_1(const std::string & fname) {
     return true;
 }

-// read just the tensor info and mmap the data in user code
-bool gguf_ex_read_2(const std::string & fname) {
-    struct ggml_context * ctx_data = NULL;
-
-    struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx = */ &ctx_data,
-    };
-
-    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
-
-    struct gguf_file file(fname.c_str(), "rb");
-    gguf_mmap data_mmap(&file, 0, false);
-
-    const int n_tensors = gguf_get_n_tensors(ctx);
-
-    for (int i = 0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(ctx, i);
-        const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
-
-        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
-
-        cur->data = static_cast<char *>(data_mmap.addr) + offset;
-
-        // print first 10 elements
-        const float * data = (const float *) cur->data;
-
-        printf("%s data[:10] : ", name);
-        for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
-            printf("%f ", data[j]);
-        }
-        printf("\n\n");
-    }
-
-    fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
-
-    ggml_free(ctx_data);
-    gguf_free(ctx);
-
-    return true;
-}
-
 int main(int argc, char ** argv) {
     if (argc < 3) {
         fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);

@@ -427,7 +240,6 @@ int main(int argc, char ** argv) {
     } else if (mode == "r") {
         GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
         GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
-        GGML_ASSERT(gguf_ex_read_2(fname) && "failed to read gguf file");
     } else if (mode == "q") {
         llama_model_quantize_params params = llama_model_quantize_default_params();
         llama_model_quantize(fname.c_str(), "quant.gguf", &params);
@@ -266,9 +266,6 @@ int main(int argc, char ** argv) {
         params.interactive = true;
     }

-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
     if (params.verbose_prompt) {
         fprintf(stderr, "\n");
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());

@@ -778,8 +775,7 @@ int main(int argc, char ** argv) {
             if (grammar != NULL) {
                 llama_grammar_free(grammar);

-                std::vector<const llama_grammar_element *> grammar_rules(
-                    parsed_grammar.c_rules());
+                std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
                 grammar = llama_grammar_init(
                     grammar_rules.data(), grammar_rules.size(),
                     parsed_grammar.symbol_ids.at("root"));
@@ -68,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 }

 // usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     fprintf(stderr, "\nAllowed quantization types:\n");

@@ -118,8 +118,8 @@ int main(int argc, char ** argv) {
         if (pos != std::string::npos) {
             fpath = fname_inp.substr(0, pos + 1);
         }
-        // export as [inp path]/ggml-model-[ftype].bin
-        fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
+        // export as [inp path]/ggml-model-[ftype].gguf
+        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
         arg_idx++;
     }
     else {
@@ -26,7 +26,6 @@ int main(int argc, char ** argv) {
     auto lparams = llama_context_default_params();

     lparams.n_ctx = params.n_ctx;
-    lparams.n_gqa = params.n_gqa;
     lparams.seed = params.seed;
     lparams.f16_kv = params.memory_f16;
     lparams.use_mmap = params.use_mmap;
@@ -651,8 +651,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
     fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
     fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);

@@ -773,23 +771,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
         params.n_ctx = std::stoi(argv[i]);
     }
-    else if (arg == "-gqa" || arg == "--gqa")
-    {
-        if (++i >= argc)
-        {
-            invalid_param = true;
-            break;
-        }
-        params.n_gqa = std::stoi(argv[i]);
-    }
-    else if (arg == "-eps" || arg == "--rms-norm-eps") {
-        if (++i >= argc)
-        {
-            invalid_param = true;
-            break;
-        }
-        params.rms_norm_eps = std::stof(argv[i]);
-    }
     else if (arg == "--rope-freq-base")
     {
         if (++i >= argc)
@@ -36,16 +36,17 @@ int main(int argc, char ** argv) {

     llama_backend_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
+    llama_context_params ctx_params = llama_context_default_params();

-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);

     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }

+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
     // tokenize the prompt

     std::vector<llama_token> tokens_list;
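
The hunk above replaces the combined llama_init_from_gpt_params() call with an explicit two-step load: first the model, then a context bound to it. A hedged sketch of the same flow in plain C, using the llama.h calls visible in this diff plus llama_free_model and llama_backend_free (assumed to be available in llama.h); "model.gguf" is a placeholder path:

    #include <stdio.h>
    #include "llama.h"

    // sketch: load the model once, then create a context that shares its weights
    static int load_sketch(void) {
        llama_backend_init(0 /* numa */);

        struct llama_context_params ctx_params = llama_context_default_params();

        struct llama_model * model = llama_load_model_from_file("model.gguf", ctx_params);
        if (model == NULL) {
            fprintf(stderr, "unable to load model\n");
            return 1;
        }

        struct llama_context * ctx = llama_new_context_with_model(model, ctx_params);

        // ... tokenize, evaluate and sample here ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }
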
@@ -54,7 +55,7 @@ int main(int argc, char ** argv) {
     const int max_context_size = llama_n_ctx(ctx);
     const int max_tokens_list_size = max_context_size - 4;

-    if ((int)tokens_list.size() > max_tokens_list_size) {
+    if ((int) tokens_list.size() > max_tokens_list_size) {
         fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
         return 1;
     }

@@ -74,7 +75,9 @@ int main(int argc, char ** argv) {
     // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
     // example, we will just stop the loop once this cache is full or once an end of stream is detected.

-    while (llama_get_kv_cache_token_count( ctx ) < max_context_size) {
+    const int n_gen = std::min(32, max_context_size);
+
+    while (llama_get_kv_cache_token_count(ctx) < n_gen) {
         // evaluate the transformer

         if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {

@@ -114,7 +117,6 @@ int main(int argc, char ** argv) {

         // push this new token for next evaluation
         tokens_list.push_back(new_token_id);
-
     }

     llama_free(ctx);

@@ -122,5 +124,7 @@ int main(int argc, char ** argv) {

     llama_backend_free();

+    fprintf(stderr, "\n\n");
+
     return 0;
 }
@@ -17,7 +17,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+static const float rms_norm_eps = 1e-5f;

 struct random_normal_distribution {
     std::mt19937 gen;
@ -2612,42 +2612,45 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// write_magic
|
#pragma message("TODO: implement file saving using gguf")
|
||||||
file.write_u32(LLAMA_FILE_MAGIC); // magic
|
(void) vocab;
|
||||||
file.write_u32(LLAMA_FILE_VERSION); // version
|
(void) model;
|
||||||
// write_hparams
|
// // write_magic
|
||||||
file.write_u32(model->hparams.n_vocab);
|
// file.write_u32(LLAMA_FILE_MAGIC); // magic
|
||||||
file.write_u32(model->hparams.n_embd);
|
// file.write_u32(LLAMA_FILE_VERSION); // version
|
||||||
file.write_u32(model->hparams.n_mult);
|
// // write_hparams
|
||||||
file.write_u32(model->hparams.n_head);
|
// file.write_u32(model->hparams.n_vocab);
|
||||||
file.write_u32(model->hparams.n_layer);
|
// file.write_u32(model->hparams.n_embd);
|
||||||
file.write_u32(model->hparams.n_rot);
|
// file.write_u32(model->hparams.n_mult);
|
||||||
file.write_u32(LLAMA_FTYPE_ALL_F32);
|
// file.write_u32(model->hparams.n_head);
|
||||||
// write_vocab
|
// file.write_u32(model->hparams.n_layer);
|
||||||
uint32_t n_vocab = model->hparams.n_vocab;
|
// file.write_u32(model->hparams.n_rot);
|
||||||
for (uint32_t i = 0; i < n_vocab; i++) {
|
// file.write_u32(LLAMA_FTYPE_ALL_F32);
|
||||||
const auto & token_score = vocab->id_to_token.at(i);
|
// // write_vocab
|
||||||
file.write_u32((uint32_t) token_score.tok.size());
|
// uint32_t n_vocab = model->hparams.n_vocab;
|
||||||
file.write_raw(token_score.tok.data(), token_score.tok.size());
|
// for (uint32_t i = 0; i < n_vocab; i++) {
|
||||||
file.write_raw(&token_score.score, sizeof(token_score.score));
|
// const auto & token_score = vocab->id_to_token.at(i);
|
||||||
}
|
// file.write_u32((uint32_t) token_score.tok.size());
|
||||||
// write tensors
|
// file.write_raw(token_score.tok.data(), token_score.tok.size());
|
||||||
write_tensor(&file, model->tok_embeddings);
|
// file.write_raw(&token_score.score, sizeof(token_score.score));
|
||||||
write_tensor(&file, model->norm);
|
// }
|
||||||
write_tensor(&file, model->output);
|
// // write tensors
|
||||||
for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
|
// write_tensor(&file, model->tok_embeddings);
|
||||||
auto & layer = model->layers[i];
|
// write_tensor(&file, model->norm);
|
||||||
|
// write_tensor(&file, model->output);
|
||||||
write_tensor(&file, layer.attention_norm);
|
// for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
|
||||||
write_tensor(&file, layer.wq);
|
// auto & layer = model->layers[i];
|
||||||
write_tensor(&file, layer.wk);
|
//
|
||||||
write_tensor(&file, layer.wv);
|
// write_tensor(&file, layer.attention_norm);
|
||||||
write_tensor(&file, layer.wo);
|
// write_tensor(&file, layer.wq);
|
||||||
write_tensor(&file, layer.ffn_norm);
|
// write_tensor(&file, layer.wk);
|
||||||
write_tensor(&file, layer.w1);
|
// write_tensor(&file, layer.wv);
|
||||||
write_tensor(&file, layer.w2);
|
// write_tensor(&file, layer.wo);
|
||||||
write_tensor(&file, layer.w3);
|
// write_tensor(&file, layer.ffn_norm);
|
||||||
}
|
// write_tensor(&file, layer.w1);
|
||||||
|
// write_tensor(&file, layer.w2);
|
||||||
|
// write_tensor(&file, layer.w3);
|
||||||
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
float cosine_decay(const int decay_steps, const float alpha, int step) {
|
float cosine_decay(const int decay_steps, const float alpha, int step) {
|
||||||
|
|
604 ggml.c
|
@@ -213,10 +213,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
                 error_desc = "insufficient memory";
                 break;
         }
-        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
-                __func__, error_desc, size/(1024.0*1024.0));
+        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
         return NULL;
     }

     return aligned_memory;
 }
 #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
@@ -4109,7 +4109,11 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     //
     // is enough, but just in case, adding the second part

-    return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
+    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+}
+
+size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
+    return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }

 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
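
The new ggml_nbytes_pad() is just ggml_nbytes() rounded up to the next multiple of GGML_MEM_ALIGN, which ggml_graph_export() further down in this diff uses when sizing intermediate results. A small sketch of the rounding arithmetic (the helper name round_up is made up; GGML_PAD is assumed to behave like this for a positive alignment):

    #include <stddef.h>

    // round x up to the next multiple of align (align > 0)
    static size_t round_up(size_t x, size_t align) {
        return ((x + align - 1) / align) * align;
    }
    // e.g. round_up(6161, 16) == 6176 -- the padded size reserved for the tensor
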
@@ -9136,6 +9140,8 @@ static void ggml_compute_forward_mul(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
@@ -16899,7 +16905,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
         // compute size of intermediate results
         // TODO: does not take into account scratch buffers !!!!
         for (int i = 0; i < cgraph->n_nodes; ++i) {
-            size_eval += ggml_nbytes(cgraph->nodes[i]);
+            size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
         }

         // print
@@ -18579,6 +18585,20 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
 };
 static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");

+static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
+    [GGUF_TYPE_UINT8]   = "u8",
+    [GGUF_TYPE_INT8]    = "i8",
+    [GGUF_TYPE_UINT16]  = "u16",
+    [GGUF_TYPE_INT16]   = "i16",
+    [GGUF_TYPE_UINT32]  = "u32",
+    [GGUF_TYPE_INT32]   = "i32",
+    [GGUF_TYPE_FLOAT32] = "f32",
+    [GGUF_TYPE_BOOL]    = "bool",
+    [GGUF_TYPE_STRING]  = "str",
+    [GGUF_TYPE_ARRAY]   = "arr",
+};
+
+static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
+
 union gguf_value {
     uint8_t  uint8;
     int8_t   int8;
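
With gguf_type_name() and the KV accessors touched by this change, dumping the metadata of an existing file takes only a few calls. A hedged sketch (not taken from the diff) that assumes a valid .gguf file on disk and keeps error handling minimal:

    #include <stdio.h>
    #include "ggml.h"

    // sketch: print key, type name and (for strings) value of every KV pair
    static void gguf_dump_kv_sketch(const char * fname) {
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
            /*.no_alloc =*/ false,
            /*.ctx      =*/ &ctx_data,
        };

        struct gguf_context * ctx = gguf_init_from_file(fname, params);
        if (!ctx) {
            return;
        }

        const int n_kv = gguf_get_n_kv(ctx);
        for (int i = 0; i < n_kv; ++i) {
            const char     * key  = gguf_get_key(ctx, i);
            enum gguf_type   type = gguf_get_kv_type(ctx, i);

            if (type == GGUF_TYPE_STRING) {
                printf("%s (%s) = %s\n", key, gguf_type_name(type), gguf_get_val_str(ctx, i));
            } else {
                printf("%s (%s)\n", key, gguf_type_name(type));
            }
        }

        gguf_free(ctx);
        if (ctx_data) {
            ggml_free(ctx_data);
        }
    }
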
@@ -18613,8 +18633,6 @@ struct gguf_header {
     uint32_t version;
     uint32_t n_tensors;
     uint32_t n_kv;
-
-    struct gguf_kv * kv;
 };

 struct gguf_tensor_info {
@ -18622,44 +18640,69 @@ struct gguf_tensor_info {
|
||||||
|
|
||||||
uint32_t n_dims;
|
uint32_t n_dims;
|
||||||
uint32_t ne[GGML_MAX_DIMS];
|
uint32_t ne[GGML_MAX_DIMS];
|
||||||
uint32_t n_elms; // TODO: is this needed?
|
|
||||||
|
|
||||||
enum ggml_type type;
|
enum ggml_type type;
|
||||||
|
|
||||||
uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
|
uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
|
||||||
|
|
||||||
|
// for writing API
|
||||||
|
const void * data;
|
||||||
|
size_t size;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct gguf_context {
|
struct gguf_context {
|
||||||
struct gguf_header header;
|
struct gguf_header header;
|
||||||
|
|
||||||
|
struct gguf_kv * kv;
|
||||||
struct gguf_tensor_info * infos;
|
struct gguf_tensor_info * infos;
|
||||||
|
|
||||||
size_t alignment;
|
size_t alignment;
|
||||||
size_t offset; // offset of `data` from beginning of file
|
size_t offset; // offset of `data` from beginning of file
|
||||||
size_t size_data; // size of `data` in bytes
|
size_t size; // size of `data` in bytes
|
||||||
|
|
||||||
//uint8_t * padding;
|
//uint8_t * padding;
|
||||||
uint8_t * data;
|
void * data;
|
||||||
};
|
};
|
||||||
|
|
||||||
static bool gguf_fread_el(void * dst, size_t size, FILE * file, size_t * offset) {
|
static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
|
||||||
const size_t n = fread(dst, 1, size, file);
|
const size_t n = fread(dst, 1, size, file);
|
||||||
*offset += n;
|
*offset += n;
|
||||||
return n == size;
|
return n == size;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool gguf_fread_str(struct gguf_str * p, FILE * file, size_t * offset) {
|
static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
|
||||||
p->n = 0;
|
p->n = 0;
|
||||||
p->data = NULL;
|
p->data = NULL;
|
||||||
|
|
||||||
bool ok = true;
|
bool ok = true;
|
||||||
|
|
||||||
// TODO: how to avoid mallocs for strings?
|
// TODO: how to avoid mallocs for strings?
|
||||||
ok = ok && gguf_fread_el(&p->n, sizeof(p->n), file, offset); p->data = calloc(p->n + 1, 1);
|
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
|
||||||
ok = ok && gguf_fread_el( p->data, p->n, file, offset);
|
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
||||||
|
|
||||||
return ok;
|
return ok;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct gguf_context * gguf_init_empty(void) {
|
||||||
|
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
||||||
|
|
||||||
|
ctx->header.magic = GGUF_MAGIC;
|
||||||
|
ctx->header.version = GGUF_VERSION;
|
||||||
|
ctx->header.n_tensors = 0;
|
||||||
|
ctx->header.n_kv = 0;
|
||||||
|
|
||||||
|
ctx->kv = NULL;
|
||||||
|
ctx->infos = NULL;
|
||||||
|
|
||||||
|
ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
|
||||||
|
ctx->offset = 0;
|
||||||
|
ctx->size = 0;
|
||||||
|
|
||||||
|
ctx->data = NULL;
|
||||||
|
|
||||||
|
return ctx;
|
||||||
|
}
|
||||||
|
|
||||||
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
|
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
|
||||||
FILE * file = fopen(fname, "rb");
|
FILE * file = fopen(fname, "rb");
|
||||||
if (!file) {
|
if (!file) {
|
||||||
|
@ -18673,7 +18716,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
|
|
||||||
// check the magic before making allocations
|
// check the magic before making allocations
|
||||||
{
|
{
|
||||||
gguf_fread_el(&magic, sizeof(magic), file, &offset);
|
gguf_fread_el(file, &magic, sizeof(magic), &offset);
|
||||||
|
|
||||||
if (magic != GGUF_MAGIC) {
|
if (magic != GGUF_MAGIC) {
|
||||||
fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
|
fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
|
||||||
|
@ -18689,14 +18732,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
// read the header
|
// read the header
|
||||||
{
|
{
|
||||||
ctx->header.magic = magic;
|
ctx->header.magic = magic;
|
||||||
ctx->header.kv = NULL;
|
|
||||||
|
|
||||||
|
ctx->kv = NULL;
|
||||||
ctx->infos = NULL;
|
ctx->infos = NULL;
|
||||||
ctx->data = NULL;
|
ctx->data = NULL;
|
||||||
|
|
||||||
ok = ok && gguf_fread_el(&ctx->header.version, sizeof(ctx->header.version), file, &offset);
|
ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
|
||||||
ok = ok && gguf_fread_el(&ctx->header.n_tensors, sizeof(ctx->header.n_tensors), file, &offset);
|
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
|
||||||
ok = ok && gguf_fread_el(&ctx->header.n_kv, sizeof(ctx->header.n_kv), file, &offset);
|
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
|
||||||
|
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
fprintf(stderr, "%s: failed to read header\n", __func__);
|
fprintf(stderr, "%s: failed to read header\n", __func__);
|
||||||
|
@ -18708,33 +18751,33 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
|
|
||||||
// read the kv pairs
|
// read the kv pairs
|
||||||
{
|
{
|
||||||
ctx->header.kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||||
|
|
||||||
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
||||||
struct gguf_kv * kv = &ctx->header.kv[i];
|
struct gguf_kv * kv = &ctx->kv[i];
|
||||||
|
|
||||||
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
|
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
|
||||||
|
|
||||||
ok = ok && gguf_fread_str(&kv->key, file, &offset);
|
ok = ok && gguf_fread_str(file, &kv->key, &offset);
|
||||||
//ok = ok && gguf_fread_el (&kv->n_bytes, sizeof(kv->n_bytes), file, &offset);
|
//ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
|
||||||
ok = ok && gguf_fread_el (&kv->type, sizeof(kv->type), file, &offset);
|
ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
|
||||||
|
|
||||||
//fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
|
//fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
|
||||||
|
|
||||||
switch (kv->type) {
|
switch (kv->type) {
|
||||||
case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (&kv->value.uint8, sizeof(kv->value.uint8), file, &offset); break;
|
case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
|
||||||
case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (&kv->value.int8, sizeof(kv->value.int8), file, &offset); break;
|
case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
|
||||||
case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (&kv->value.uint16, sizeof(kv->value.uint16), file, &offset); break;
|
case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
|
||||||
case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (&kv->value.int16, sizeof(kv->value.int16), file, &offset); break;
|
case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
|
||||||
case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (&kv->value.uint32, sizeof(kv->value.uint32), file, &offset); break;
|
case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
|
||||||
case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (&kv->value.int32, sizeof(kv->value.int32), file, &offset); break;
|
case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
|
||||||
case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (&kv->value.float32, sizeof(kv->value.float32), file, &offset); break;
|
case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
|
||||||
case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (&kv->value.bool_, sizeof(kv->value.bool_), file, &offset); break;
|
case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
|
||||||
case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(&kv->value.str, file, &offset); break;
|
case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
|
||||||
case GGUF_TYPE_ARRAY:
|
case GGUF_TYPE_ARRAY:
|
||||||
{
|
{
|
||||||
ok = ok && gguf_fread_el(&kv->value.arr.type, sizeof(kv->value.arr.type), file, &offset);
|
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
|
||||||
ok = ok && gguf_fread_el(&kv->value.arr.n, sizeof(kv->value.arr.n), file, &offset);
|
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
||||||
|
|
||||||
switch (kv->value.arr.type) {
|
switch (kv->value.arr.type) {
|
||||||
case GGUF_TYPE_UINT8:
|
case GGUF_TYPE_UINT8:
|
||||||
|
@ -18747,17 +18790,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
case GGUF_TYPE_BOOL:
|
case GGUF_TYPE_BOOL:
|
||||||
{
|
{
|
||||||
kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
||||||
ok = ok && gguf_fread_el(kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], file, &offset);
|
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
|
||||||
} break;
|
} break;
|
||||||
case GGUF_TYPE_STRING:
|
case GGUF_TYPE_STRING:
|
||||||
{
|
{
|
||||||
kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
|
kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
|
||||||
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
|
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
|
||||||
ok = ok && gguf_fread_str(&((struct gguf_str *) kv->value.arr.data)[j], file, &offset);
|
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case GGUF_TYPE_ARRAY:
|
case GGUF_TYPE_ARRAY:
|
||||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||||
};
|
};
|
||||||
} break;
|
} break;
|
||||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
||||||
|
@ -18787,14 +18830,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
info->ne[j] = 1;
|
info->ne[j] = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
ok = ok && gguf_fread_str(&info->name, file, &offset);
|
ok = ok && gguf_fread_str(file, &info->name, &offset);
|
||||||
ok = ok && gguf_fread_el (&info->n_dims, sizeof(info->n_dims), file, &offset);
|
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
|
||||||
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
||||||
ok = ok && gguf_fread_el(&info->ne[j], sizeof(info->ne[j]), file, &offset);
|
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
||||||
}
|
}
|
||||||
//ok = ok && gguf_fread_el (&info->n_elms, sizeof(info->n_elms), file, &offset);
|
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
||||||
ok = ok && gguf_fread_el (&info->type, sizeof(info->type), file, &offset);
|
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
||||||
ok = ok && gguf_fread_el (&info->offset, sizeof(info->offset), file, &offset);
|
|
||||||
|
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
|
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
|
||||||
|
@ -18827,8 +18869,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
|
|
||||||
// compute the total size of the data section, taking into account the alignment
|
// compute the total size of the data section, taking into account the alignment
|
||||||
{
|
{
|
||||||
|
ctx->size = 0;
|
||||||
ctx->size_data = 0;
|
|
||||||
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||||
struct gguf_tensor_info * info = &ctx->infos[i];
|
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||||
|
|
||||||
|
@ -18848,7 +18889,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
|
|
||||||
const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
|
const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
|
||||||
|
|
||||||
ctx->size_data += GGML_PAD(size_cur, ctx->alignment);
|
ctx->size += GGML_PAD(size_cur, ctx->alignment);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18862,7 +18903,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
const size_t mem_size =
|
const size_t mem_size =
|
||||||
params.no_alloc ?
|
params.no_alloc ?
|
||||||
(ctx->header.n_tensors )*ggml_tensor_overhead() :
|
(ctx->header.n_tensors )*ggml_tensor_overhead() :
|
||||||
(ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size_data;
|
(ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
|
||||||
|
|
||||||
struct ggml_init_params pdata = {
|
struct ggml_init_params pdata = {
|
||||||
.mem_size = mem_size,
|
.mem_size = mem_size,
|
||||||
|
@ -18877,12 +18918,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
struct ggml_tensor * data = NULL;
|
struct ggml_tensor * data = NULL;
|
||||||
|
|
||||||
if (params.no_alloc == false) {
|
if (params.no_alloc == false) {
|
||||||
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size_data);
|
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
|
||||||
|
|
||||||
ok = ok && data != NULL;
|
ok = ok && data != NULL;
|
||||||
|
|
||||||
// read the binary blob with the tensor data
|
// read the binary blob with the tensor data
|
||||||
ok = ok && gguf_fread_el(data->data, ctx->size_data, file, &offset);
|
ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
|
||||||
|
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
fprintf(stderr, "%s: failed to read tensor data\n", __func__);
|
fprintf(stderr, "%s: failed to read tensor data\n", __func__);
|
||||||
|
@ -18944,10 +18985,10 @@ void gguf_free(struct gguf_context * ctx) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx->header.kv) {
|
if (ctx->kv) {
|
||||||
// free string memory - not great..
|
// free string memory - not great..
|
||||||
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
||||||
struct gguf_kv * kv = &ctx->header.kv[i];
|
struct gguf_kv * kv = &ctx->kv[i];
|
||||||
|
|
||||||
if (kv->key.data) {
|
if (kv->key.data) {
|
||||||
free(kv->key.data);
|
free(kv->key.data);
|
||||||
|
@ -18974,7 +19015,7 @@ void gguf_free(struct gguf_context * ctx) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_ALIGNED_FREE(ctx->header.kv);
|
GGML_ALIGNED_FREE(ctx->kv);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx->infos) {
|
if (ctx->infos) {
|
||||||
|
@ -18992,6 +19033,10 @@ void gguf_free(struct gguf_context * ctx) {
|
||||||
GGML_ALIGNED_FREE(ctx);
|
GGML_ALIGNED_FREE(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const char * gguf_type_name(enum gguf_type type) {
|
||||||
|
return GGUF_TYPE_NAME[type];
|
||||||
|
}
|
||||||
|
|
||||||
int gguf_get_version(struct gguf_context * ctx) {
|
int gguf_get_version(struct gguf_context * ctx) {
|
||||||
return ctx->header.version;
|
return ctx->header.version;
|
||||||
}
|
}
|
||||||
|
@ -19014,9 +19059,10 @@ int gguf_get_n_kv(struct gguf_context * ctx) {
|
||||||
|
|
||||||
int gguf_find_key(struct gguf_context * ctx, const char * key) {
|
int gguf_find_key(struct gguf_context * ctx, const char * key) {
|
||||||
// return -1 if key not found
|
// return -1 if key not found
|
||||||
const int n_kv = gguf_get_n_kv(ctx);
|
|
||||||
int keyfound = -1;
|
int keyfound = -1;
|
||||||
|
|
||||||
|
const int n_kv = gguf_get_n_kv(ctx);
|
||||||
|
|
||||||
for (int i = 0; i < n_kv; ++i) {
|
for (int i = 0; i < n_kv; ++i) {
|
||||||
if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
|
if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
|
||||||
keyfound = i;
|
keyfound = i;
|
||||||
|
@ -19028,71 +19074,87 @@ int gguf_find_key(struct gguf_context * ctx, const char * key) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * gguf_get_key(struct gguf_context * ctx, int i) {
|
const char * gguf_get_key(struct gguf_context * ctx, int i) {
|
||||||
return ctx->header.kv[i].key.data;
|
return ctx->kv[i].key.data;
|
||||||
}
|
}
|
||||||
|
|
||||||
enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
|
enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
|
||||||
return ctx->header.kv[i].type;
|
return ctx->kv[i].type;
|
||||||
}
|
}
|
||||||
|
|
||||||
enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
|
enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
|
||||||
return ctx->header.kv[i].value.arr.type;
|
return ctx->kv[i].value.arr.type;
|
||||||
|
}
|
||||||
|
|
||||||
|
const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
|
||||||
|
return ctx->kv[i].value.arr.data;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
|
const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
|
||||||
struct gguf_kv * kv = &ctx->header.kv[key_id];
|
struct gguf_kv * kv = &ctx->kv[key_id];
|
||||||
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
|
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
|
||||||
return str->data;
|
return str->data;
|
||||||
}
|
}
|
||||||
|
|
||||||
float gguf_get_arr_f32(struct gguf_context * ctx, int key_id, int i) {
|
|
||||||
return ((float *) ctx->header.kv[key_id].value.arr.data)[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
int gguf_get_arr_n(struct gguf_context * ctx, int i) {
|
int gguf_get_arr_n(struct gguf_context * ctx, int i) {
|
||||||
return ctx->header.kv[i].value.arr.n;
|
return ctx->kv[i].value.arr.n;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
|
uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
|
||||||
return ctx->header.kv[i].value.uint8;
|
return ctx->kv[i].value.uint8;
|
||||||
}
|
}
|
||||||
|
|
||||||
int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
|
int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
|
||||||
return ctx->header.kv[i].value.int8;
|
return ctx->kv[i].value.int8;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
|
uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
|
||||||
return ctx->header.kv[i].value.uint16;
|
return ctx->kv[i].value.uint16;
|
||||||
}
|
}
|
||||||
|
|
||||||
int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
|
int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
|
||||||
return ctx->header.kv[i].value.int16;
|
return ctx->kv[i].value.int16;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
|
uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
|
||||||
return ctx->header.kv[i].value.uint32;
|
return ctx->kv[i].value.uint32;
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
|
int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
|
||||||
return ctx->header.kv[i].value.int32;
|
return ctx->kv[i].value.int32;
|
||||||
}
|
}
|
||||||
|
|
||||||
float gguf_get_val_f32(struct gguf_context * ctx, int i) {
|
float gguf_get_val_f32(struct gguf_context * ctx, int i) {
|
||||||
return ctx->header.kv[i].value.float32;
|
return ctx->kv[i].value.float32;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
|
bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
|
||||||
return ctx->header.kv[i].value.bool_;
|
return ctx->kv[i].value.bool_;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
|
const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
|
||||||
return ctx->header.kv[i].value.str.data;
|
return ctx->kv[i].value.str.data;
|
||||||
}
|
}
|
||||||
|
|
||||||
int gguf_get_n_tensors(struct gguf_context * ctx) {
|
int gguf_get_n_tensors(struct gguf_context * ctx) {
|
||||||
return ctx->header.n_tensors;
|
return ctx->header.n_tensors;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
|
||||||
|
// return -1 if tensor not found
|
||||||
|
int tensorfound = -1;
|
||||||
|
|
||||||
|
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||||
|
|
||||||
|
for (int i = 0; i < n_tensors; ++i) {
|
||||||
|
if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
|
||||||
|
tensorfound = i;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return tensorfound;
|
||||||
|
}
|
||||||
|
|
||||||
size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
|
size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
|
||||||
return ctx->infos[i].offset;
|
return ctx->infos[i].offset;
|
||||||
}
|
}
|
||||||
|
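
The getter changes above also introduce gguf_find_tensor(), a linear name lookup over the tensor infos that mirrors gguf_find_key(). A small usage sketch (it assumes a context obtained from gguf_init_from_file as in the earlier sketches; the tensor name "output.weight" is only illustrative):

    #include <stdio.h>
    #include "ggml.h"

    // sketch: locate a tensor by name and report its offset in the data section
    static void gguf_find_tensor_sketch(struct gguf_context * ctx) {
        const int i = gguf_find_tensor(ctx, "output.weight");
        if (i < 0) {
            return; // not present in this file
        }
        printf("tensor #%d: %s at data offset %zu\n",
               i, gguf_get_tensor_name(ctx, i), gguf_get_tensor_offset(ctx, i));
    }
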
@ -19101,6 +19163,406 @@ char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
|
||||||
return ctx->infos[i].name.data;
|
return ctx->infos[i].name.data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// returns the index
|
||||||
|
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
|
||||||
|
const int idx = gguf_find_key(ctx, key);
|
||||||
|
if (idx >= 0) {
|
||||||
|
return idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int n_kv = gguf_get_n_kv(ctx);
|
||||||
|
|
||||||
|
ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
|
||||||
|
ctx->kv[n_kv].key.n = strlen(key) + 1;
|
||||||
|
ctx->kv[n_kv].key.data = strdup(key);
|
||||||
|
ctx->header.n_kv++;
|
||||||
|
|
||||||
|
return n_kv;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
|
||||||
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
ctx->kv[idx].type = GGUF_TYPE_UINT8;
|
||||||
|
ctx->kv[idx].value.uint8 = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
|
||||||
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
ctx->kv[idx].type = GGUF_TYPE_INT8;
|
||||||
|
ctx->kv[idx].value.int8 = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
|
||||||
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
ctx->kv[idx].type = GGUF_TYPE_UINT16;
|
||||||
|
ctx->kv[idx].value.uint16 = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
|
||||||
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
ctx->kv[idx].type = GGUF_TYPE_INT16;
|
||||||
|
ctx->kv[idx].value.int16 = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
|
||||||
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
ctx->kv[idx].type = GGUF_TYPE_UINT32;
|
||||||
|
ctx->kv[idx].value.uint32 = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
|
||||||
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
ctx->kv[idx].type = GGUF_TYPE_INT32;
|
||||||
|
ctx->kv[idx].value.int32 = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
|
||||||
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
ctx->kv[idx].type = GGUF_TYPE_FLOAT32;
|
||||||
|
ctx->kv[idx].value.float32 = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
|
||||||
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
ctx->kv[idx].type = GGUF_TYPE_BOOL;
|
||||||
|
ctx->kv[idx].value.bool_ = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
|
||||||
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
ctx->kv[idx].type = GGUF_TYPE_STRING;
|
||||||
|
ctx->kv[idx].value.str.n = strlen(val) + 1;
|
||||||
|
ctx->kv[idx].value.str.data = strdup(val);
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
|
||||||
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
|
||||||
|
ctx->kv[idx].value.arr.type = type;
|
||||||
|
ctx->kv[idx].value.arr.n = n;
|
||||||
|
ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
|
||||||
|
memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
|
||||||
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
|
||||||
|
ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
|
||||||
|
ctx->kv[idx].value.arr.n = n;
|
||||||
|
ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
|
||||||
|
str->n = strlen(data[i]) + 1;
|
||||||
|
str->data = strdup(data[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// set or add KV pairs from another context
|
||||||
|
void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
|
||||||
|
for (uint32_t i = 0; i < src->header.n_kv; i++) {
|
||||||
|
switch (src->kv[i].type) {
|
||||||
|
case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break;
|
||||||
|
case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break;
|
||||||
|
case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break;
|
||||||
|
case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break;
|
||||||
|
case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
|
||||||
|
case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
|
||||||
|
case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
|
||||||
|
case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
|
||||||
|
case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
|
||||||
|
case GGUF_TYPE_ARRAY:
|
||||||
|
{
|
||||||
|
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
|
||||||
|
const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
|
||||||
|
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
|
||||||
|
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
|
||||||
|
}
|
||||||
|
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
|
||||||
|
free(data);
|
||||||
|
} if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
|
||||||
|
GGML_ASSERT(false && "nested arrays not supported");
|
||||||
|
} else {
|
||||||
|
gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
|
||||||
|
}
|
||||||
|
} break;
|
||||||
|
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_add_tensor(
|
||||||
|
struct gguf_context * ctx,
|
||||||
|
const struct ggml_tensor * tensor) {
|
||||||
|
const int idx = ctx->header.n_tensors;
|
||||||
|
ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
|
||||||
|
|
||||||
|
ctx->infos[idx].name.n = strlen(tensor->name) + 1;
|
||||||
|
ctx->infos[idx].name.data = strdup(tensor->name);
|
||||||
|
|
||||||
|
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||||
|
ctx->infos[idx].ne[i] = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->infos[idx].n_dims = tensor->n_dims;
|
||||||
|
for (int i = 0; i < tensor->n_dims; i++) {
|
||||||
|
ctx->infos[idx].ne[i] = tensor->ne[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->infos[idx].type = tensor->type;
|
||||||
|
ctx->infos[idx].offset = 0;
|
||||||
|
ctx->infos[idx].data = tensor->data;
|
||||||
|
ctx->infos[idx].size = ggml_nbytes(tensor);
|
||||||
|
|
||||||
|
if (ctx->header.n_tensors > 0) {
|
||||||
|
ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->header.n_tensors++;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
|
||||||
|
const int idx = gguf_find_tensor(ctx, name);
|
||||||
|
if (idx < 0) {
|
||||||
|
GGML_ASSERT(false && "tensor not found");
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->infos[idx].type = type;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
|
||||||
|
const int idx = gguf_find_tensor(ctx, name);
|
||||||
|
if (idx < 0) {
|
||||||
|
GGML_ASSERT(false && "tensor not found");
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->infos[idx].data = data;
|
||||||
|
ctx->infos[idx].size = size;
|
||||||
|
|
||||||
|
// update offsets
|
||||||
|
for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
|
||||||
|
ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
|
||||||
|
// fwrite(&val->n, sizeof(val->n), 1, file);
|
||||||
|
// fwrite(val->data, sizeof(char), val->n, file);
|
||||||
|
//}
|
||||||
|
//
|
||||||
|
//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
|
||||||
|
// fwrite(val, sizeof(char), size, file);
|
||||||
|
//}
|
||||||
|
|
||||||
|
struct gguf_buf {
|
||||||
|
void * data;
|
||||||
|
size_t size;
|
||||||
|
size_t offset;
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct gguf_buf gguf_buf_init(size_t size) {
|
||||||
|
struct gguf_buf buf = {
|
||||||
|
/*buf.data =*/ size == 0 ? NULL : malloc(size),
|
||||||
|
/*buf.size =*/ size,
|
||||||
|
/*buf.offset =*/ 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
return buf;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void gguf_buf_free(struct gguf_buf buf) {
|
||||||
|
if (buf.data) {
|
||||||
|
free(buf.data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
|
||||||
|
if (buf->offset + size > buf->size) {
|
||||||
|
buf->size = 1.5*(buf->offset + size);
|
||||||
|
if (buf->data) {
|
||||||
|
buf->data = realloc(buf->data, buf->size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
|
||||||
|
gguf_buf_grow(buf, sizeof(val->n) + val->n);
|
||||||
|
|
||||||
|
if (buf->data) {
|
||||||
|
memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
|
||||||
|
}
|
||||||
|
buf->offset += sizeof(val->n);
|
||||||
|
|
||||||
|
if (buf->data) {
|
||||||
|
memcpy((char *) buf->data + buf->offset, val->data, val->n);
|
||||||
|
}
|
||||||
|
buf->offset += val->n;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
|
||||||
|
gguf_buf_grow(buf, el_size);
|
||||||
|
|
||||||
|
if (buf->data) {
|
||||||
|
memcpy((char *) buf->data + buf->offset, val, el_size);
|
||||||
|
}
|
||||||
|
buf->offset += el_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
|
||||||
|
// write header
|
||||||
|
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
|
||||||
|
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
|
||||||
|
gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
|
||||||
|
gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv));
|
||||||
|
|
||||||
|
// write key-value pairs
|
||||||
|
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
||||||
|
struct gguf_kv * kv = &ctx->kv[i];
|
||||||
|
|
||||||
|
gguf_bwrite_str(buf, &kv->key);
|
||||||
|
gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
|
||||||
|
|
||||||
|
switch (kv->type) {
|
||||||
|
case GGUF_TYPE_UINT8: gguf_bwrite_el( buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break;
|
||||||
|
case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break;
|
||||||
|
case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break;
|
||||||
|
case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break;
|
||||||
|
case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
|
||||||
|
case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
|
||||||
|
case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
|
||||||
|
case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
|
||||||
|
case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
|
||||||
|
case GGUF_TYPE_ARRAY:
|
||||||
|
{
|
||||||
|
gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
|
||||||
|
gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) );
|
||||||
|
|
||||||
|
switch (kv->value.arr.type) {
|
||||||
|
case GGUF_TYPE_UINT8:
|
||||||
|
case GGUF_TYPE_INT8:
|
||||||
|
case GGUF_TYPE_UINT16:
|
||||||
|
case GGUF_TYPE_INT16:
|
||||||
|
case GGUF_TYPE_UINT32:
|
||||||
|
case GGUF_TYPE_INT32:
|
||||||
|
case GGUF_TYPE_FLOAT32:
|
||||||
|
case GGUF_TYPE_BOOL:
|
||||||
|
{
|
||||||
|
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
||||||
|
} break;
|
||||||
|
case GGUF_TYPE_STRING:
|
||||||
|
{
|
||||||
|
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
|
||||||
|
gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
|
||||||
|
}
|
||||||
|
} break;
|
||||||
|
case GGUF_TYPE_ARRAY:
|
||||||
|
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||||
|
};
|
||||||
|
} break;
|
||||||
|
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// write tensor infos
|
||||||
|
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||||
|
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||||
|
|
||||||
|
gguf_bwrite_str(buf, &info->name);
|
||||||
|
gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
|
||||||
|
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
||||||
|
gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
|
||||||
|
}
|
||||||
|
gguf_bwrite_el(buf, &info->type, sizeof(info->type));
|
||||||
|
gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
|
||||||
|
}
|
||||||
|
|
||||||
|
// we require the data section to be aligned, so take into account any padding
|
||||||
|
{
|
||||||
|
const size_t offset = buf->offset;
|
||||||
|
const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
|
||||||
|
|
||||||
|
if (offset_pad != offset) {
|
||||||
|
uint8_t pad = 0;
|
||||||
|
for (size_t i = 0; i < offset_pad - offset; ++i) {
|
||||||
|
gguf_bwrite_el(buf, &pad, sizeof(pad));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (only_meta) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t offset = 0;
|
||||||
|
|
||||||
|
// write tensor data
|
||||||
|
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||||
|
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||||
|
|
||||||
|
const size_t size = info->size;
|
||||||
|
const size_t size_pad = GGML_PAD(size, ctx->alignment);
|
||||||
|
|
||||||
|
gguf_bwrite_el(buf, info->data, size);
|
||||||
|
|
||||||
|
if (size_pad != size) {
|
||||||
|
uint8_t pad = 0;
|
||||||
|
for (size_t j = 0; j < size_pad - size; ++j) {
|
||||||
|
gguf_bwrite_el(buf, &pad, sizeof(pad));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT(offset == info->offset);
|
||||||
|
|
||||||
|
offset += size_pad;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
|
||||||
|
FILE * file = fopen(fname, "wb");
|
||||||
|
if (!file) {
|
||||||
|
GGML_ASSERT(false && "failed to open file for writing");
|
||||||
|
}
|
||||||
|
|
||||||
|
struct gguf_buf buf = gguf_buf_init(16*1024);
|
||||||
|
|
||||||
|
gguf_write_to_buf(ctx, &buf, only_meta);
|
||||||
|
|
||||||
|
fwrite(buf.data, 1, buf.offset, file);
|
||||||
|
|
||||||
|
gguf_buf_free(buf);
|
||||||
|
|
||||||
|
fclose(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t gguf_get_meta_size(struct gguf_context * ctx) {
|
||||||
|
// no allocs - only compute size
|
||||||
|
struct gguf_buf buf = gguf_buf_init(0);
|
||||||
|
|
||||||
|
gguf_write_to_buf(ctx, &buf, true);
|
||||||
|
|
||||||
|
return buf.offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
|
||||||
|
struct gguf_buf buf = gguf_buf_init(16*1024);
|
||||||
|
|
||||||
|
gguf_write_to_buf(ctx, &buf, true);
|
||||||
|
|
||||||
|
memcpy(data, buf.data, buf.offset);
|
||||||
|
|
||||||
|
gguf_buf_free(buf);
|
||||||
|
}
|
||||||
|
|
||||||
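
Taken together, the writer API added above (gguf_set_val_*, gguf_add_tensor, gguf_write_to_buf/gguf_write_to_file) is enough to produce a complete GGUF file from ggml tensors. A hedged end-to-end sketch, not part of the diff; the tensor name "tensor_0" and key "example.note" are made up, and the ggml context size is chosen loosely:

    #include "ggml.h"

    // sketch: build one small f32 tensor and write it, plus a string KV pair, to a GGUF file
    static void gguf_write_tensor_sketch(const char * fname) {
        struct ggml_init_params iparams = {
            /*.mem_size   =*/ 1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx_ggml = ggml_init(iparams);

        struct ggml_tensor * t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, 16);
        ggml_set_name(t, "tensor_0");
        ggml_set_f32(t, 1.0f);                      // fill with a constant, just to have data

        struct gguf_context * ctx_gguf = gguf_init_empty();
        gguf_set_val_str(ctx_gguf, "example.note", "written by the sketch");
        gguf_add_tensor(ctx_gguf, t);               // records name, shape, type and data pointer

        gguf_write_to_file(ctx_gguf, fname, false); // false: also write the tensor data blob

        gguf_free(ctx_gguf);
        ggml_free(ctx_ggml);
    }
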
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
int ggml_cpu_has_avx(void) {
|
int ggml_cpu_has_avx(void) {
|
||||||
|
|
65 ggml.h
|
@@ -566,6 +566,7 @@ extern "C" {
     GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
     GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
     GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
     GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

     GGML_API int     ggml_blck_size (enum ggml_type type);
@@ -1498,7 +1499,6 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * tensor);

-
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
@@ -1711,7 +1711,6 @@ extern "C" {
     // gguf
     //

-    // TODO: can be removed if the API is extended for writing
     enum gguf_type {
         GGUF_TYPE_UINT8   = 0,
         GGUF_TYPE_INT8    = 1,
@@ -1735,10 +1734,14 @@ extern "C" {
         struct ggml_context ** ctx;
     };

+    GGML_API struct gguf_context * gguf_init_empty(void);
     GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
     //GGML_API struct gguf_context * gguf_init_from_buffer(..);

     GGML_API void gguf_free(struct gguf_context * ctx);

+    GGML_API const char * gguf_type_name(enum gguf_type type);
+
     GGML_API int    gguf_get_version    (struct gguf_context * ctx);
     GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
     GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
@@ -1747,13 +1750,11 @@ extern "C" {
     GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
     GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
     GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);

     GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
-    GGML_API enum gguf_type gguf_get_arr_type (struct gguf_context * ctx, int i);
-    GGML_API void gguf_get_val (struct gguf_context * ctx, int i, void * val);
-
-    GGML_API const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i);
-    GGML_API float gguf_get_arr_f32(struct gguf_context * ctx, int key_id, int i);
+    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);

+    // results are undefined if the wrong type is used for the key
     GGML_API uint8_t  gguf_get_val_u8 (struct gguf_context * ctx, int i);
     GGML_API int8_t   gguf_get_val_i8 (struct gguf_context * ctx, int i);
     GGML_API uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i);
@@ -1764,12 +1765,60 @@ extern "C" {
     GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
     GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
     GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
-    GGML_API void gguf_get_arr_data(struct gguf_context * ctx, int i, void * data);
+    GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);

     GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
+    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
     GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
     GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);

+    // overrides existing values or adds a new one
+    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
+    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
+    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
+    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
+    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
+    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
+    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
+    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
+
+    // set or add KV pairs from another context
+    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+
+    // manage tensor info
+    GGML_API void gguf_add_tensor     (struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+
+    // writing gguf files can be done in 2 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(f, ...);
+    //   void * data = gguf_meta_get_meta_data(ctx);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(f, data, gguf_get_meta_size(ctx));
+    //   free(data);
+    //   fclose(f);
+    //
+
+    // write the entire context to a binary file
+    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+    GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);

     //
     // system info
     //
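To make the new gguf read/write API above easier to review, here is a minimal illustrative sketch (editor's addition, not part of this commit) that writes a few KV pairs in a single pass and then reads one of them back. The key names mirror the gguf.py constants introduced later in this commit; the positional initializer for gguf_init_params assumes its usual no_alloc/ctx field order, and example.gguf is a placeholder file name.

#include "ggml.h"

#include <stdio.h>

int main(void) {
    // write: build an in-memory gguf context, add a few KV pairs, flush in one pass
    struct gguf_context * ctx_out = gguf_init_empty();

    gguf_set_val_str(ctx_out, "general.architecture", "llama");
    gguf_set_val_u32(ctx_out, "llama.context_length", 4096);
    gguf_set_val_f32(ctx_out, "llama.attention.layer_norm_rms_epsilon", 1e-5f);

    gguf_write_to_file(ctx_out, "example.gguf", /*only_meta =*/ true); // no tensor data in this sketch
    gguf_free(ctx_out);

    // read: load the metadata back and look up one key
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ NULL }; // assumed field order
    struct gguf_context * ctx_in = gguf_init_from_file("example.gguf", params);

    const int kid = gguf_find_key(ctx_in, "llama.context_length");
    if (kid >= 0) {
        printf("llama.context_length = %u\n", gguf_get_val_u32(ctx_in, kid));
    }

    gguf_free(ctx_in);
    return 0;
}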
4760  gguf-llama.cpp
File diff suppressed because it is too large

505  gguf-llama.h
@@ -1,505 +0,0 @@
||||||
#ifndef LLAMA_H
|
|
||||||
#define LLAMA_H
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#ifdef GGML_USE_CUBLAS
|
|
||||||
#include "ggml-cuda.h"
|
|
||||||
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
|
|
||||||
#else
|
|
||||||
#define LLAMA_MAX_DEVICES 1
|
|
||||||
#endif // GGML_USE_CUBLAS
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <stdbool.h>
|
|
||||||
|
|
||||||
#ifdef LLAMA_SHARED
|
|
||||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
|
||||||
# ifdef LLAMA_BUILD
|
|
||||||
# define LLAMA_API __declspec(dllexport)
|
|
||||||
# else
|
|
||||||
# define LLAMA_API __declspec(dllimport)
|
|
||||||
# endif
|
|
||||||
# else
|
|
||||||
# define LLAMA_API __attribute__ ((visibility ("default")))
|
|
||||||
# endif
|
|
||||||
#else
|
|
||||||
# define LLAMA_API
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __GNUC__
|
|
||||||
# define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
|
||||||
#elif defined(_MSC_VER)
|
|
||||||
# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
|
|
||||||
#else
|
|
||||||
# define DEPRECATED(func, hint) func
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
|
|
||||||
|
|
||||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
|
|
||||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
|
||||||
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//
|
|
||||||
// C interface
|
|
||||||
//
|
|
||||||
// TODO: show sample usage
|
|
||||||
//
|
|
||||||
|
|
||||||
struct llama_model;
|
|
||||||
struct llama_context;
|
|
||||||
|
|
||||||
typedef int llama_token;
|
|
||||||
|
|
||||||
typedef struct llama_token_data {
|
|
||||||
llama_token id; // token id
|
|
||||||
float logit; // log-odds of the token
|
|
||||||
float p; // probability of the token
|
|
||||||
} llama_token_data;
|
|
||||||
|
|
||||||
typedef struct llama_token_data_array {
|
|
||||||
llama_token_data * data;
|
|
||||||
size_t size;
|
|
||||||
bool sorted;
|
|
||||||
} llama_token_data_array;
|
|
||||||
|
|
||||||
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
|
||||||
|
|
||||||
enum llama_log_level {
|
|
||||||
LLAMA_LOG_LEVEL_ERROR = 2,
|
|
||||||
LLAMA_LOG_LEVEL_WARN = 3,
|
|
||||||
LLAMA_LOG_LEVEL_INFO = 4
|
|
||||||
};
|
|
||||||
|
|
||||||
// Signature for logging events
|
|
||||||
// Note that text includes the new line character at the end for most events.
|
|
||||||
// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
|
|
||||||
// if it exists.
|
|
||||||
// It might not exist for progress report where '.' is output repeatedly.
|
|
||||||
typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
|
|
||||||
|
|
||||||
struct llama_context_params {
|
|
||||||
uint32_t seed; // RNG seed, -1 for random
|
|
||||||
int32_t n_ctx; // text context
|
|
||||||
int32_t n_batch; // prompt processing batch size
|
|
||||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
|
||||||
int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
|
||||||
|
|
||||||
const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
|
|
||||||
|
|
||||||
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
|
||||||
float rope_freq_base; // RoPE base frequency
|
|
||||||
float rope_freq_scale; // RoPE frequency scaling factor
|
|
||||||
|
|
||||||
// called with a progress value between 0 and 1, pass NULL to disable
|
|
||||||
llama_progress_callback progress_callback;
|
|
||||||
// context pointer passed to the progress callback
|
|
||||||
void * progress_callback_user_data;
|
|
||||||
|
|
||||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
|
||||||
bool low_vram; // if true, reduce VRAM usage at the cost of performance
|
|
||||||
bool mul_mat_q; // if true, use experimental mul_mat_q kernels
|
|
||||||
bool f16_kv; // use fp16 for KV cache
|
|
||||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
|
||||||
bool vocab_only; // only load the vocabulary, no weights
|
|
||||||
bool use_mmap; // use mmap if possible
|
|
||||||
bool use_mlock; // force system to keep model in RAM
|
|
||||||
bool embedding; // embedding mode only
|
|
||||||
};
|
|
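// Editor's note: a hedged sketch (not part of this diff) of how the parameter block
// above is normally used together with the loader functions declared later in this
// header. The path argument and the override values are placeholders for illustration.

static struct llama_context * open_model(const char * path_model, struct llama_model ** out_model) {
    llama_backend_init(/*numa =*/ false);

    struct llama_context_params params = llama_context_default_params();
    params.n_ctx        = 2048;  // placeholder override: text context size
    params.n_gpu_layers = 0;     // keep all layers on the CPU in this sketch
    params.use_mmap     = true;  // map the model file instead of reading it
    params.seed         = 42;    // fixed RNG seed instead of LLAMA_DEFAULT_SEED

    *out_model = llama_load_model_from_file(path_model, params);
    if (*out_model == NULL) {
        return NULL;
    }

    // the caller later releases both handles with llama_free() and llama_free_model()
    return llama_new_context_with_model(*out_model, params);
}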
||||||
// model file types
|
|
||||||
enum llama_ftype {
|
|
||||||
LLAMA_FTYPE_ALL_F32 = 0,
|
|
||||||
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
|
||||||
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
|
||||||
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
|
|
||||||
LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
|
|
||||||
};
|
|
||||||
|
|
||||||
// model quantization parameters
|
|
||||||
typedef struct llama_model_quantize_params {
|
|
||||||
int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
|
||||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
|
||||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
|
||||||
bool quantize_output_tensor; // quantize output.weight
|
|
||||||
} llama_model_quantize_params;
|
|
||||||
|
|
||||||
// grammar types
|
|
||||||
struct llama_grammar;
|
|
||||||
|
|
||||||
// grammar element type
|
|
||||||
enum llama_gretype {
|
|
||||||
// end of rule definition
|
|
||||||
LLAMA_GRETYPE_END = 0,
|
|
||||||
|
|
||||||
// start of alternate definition for rule
|
|
||||||
LLAMA_GRETYPE_ALT = 1,
|
|
||||||
|
|
||||||
// non-terminal element: reference to rule
|
|
||||||
LLAMA_GRETYPE_RULE_REF = 2,
|
|
||||||
|
|
||||||
// terminal element: character (code point)
|
|
||||||
LLAMA_GRETYPE_CHAR = 3,
|
|
||||||
|
|
||||||
// inverse char(s) ([^a], [^a-b] [^abc])
|
|
||||||
LLAMA_GRETYPE_CHAR_NOT = 4,
|
|
||||||
|
|
||||||
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
|
|
||||||
// be an inclusive range ([a-z])
|
|
||||||
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
|
|
||||||
|
|
||||||
// modifies a preceding LLAMA_GRETYPE_CHAR or
|
|
||||||
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
|
|
||||||
LLAMA_GRETYPE_CHAR_ALT = 6,
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef struct llama_grammar_element {
|
|
||||||
enum llama_gretype type;
|
|
||||||
uint32_t value; // Unicode code point or rule ID
|
|
||||||
} llama_grammar_element;
|
|
||||||
|
|
||||||
// performance timing information
|
|
||||||
struct llama_timings {
|
|
||||||
double t_start_ms;
|
|
||||||
double t_end_ms;
|
|
||||||
double t_load_ms;
|
|
||||||
double t_sample_ms;
|
|
||||||
double t_p_eval_ms;
|
|
||||||
double t_eval_ms;
|
|
||||||
|
|
||||||
int32_t n_sample;
|
|
||||||
int32_t n_p_eval;
|
|
||||||
int32_t n_eval;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Set callback for all future logging events.
|
|
||||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
|
||||||
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
|
|
||||||
|
|
||||||
LLAMA_API int llama_max_devices();
|
|
||||||
|
|
||||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
|
||||||
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
|
|
||||||
|
|
||||||
LLAMA_API bool llama_mmap_supported();
|
|
||||||
LLAMA_API bool llama_mlock_supported();
|
|
||||||
|
|
||||||
// TODO: not great API - very likely to change
|
|
||||||
// Initialize the llama + ggml backend
|
|
||||||
// If numa is true, use NUMA optimizations
|
|
||||||
// Call once at the start of the program
|
|
||||||
LLAMA_API void llama_backend_init(bool numa);
|
|
||||||
// Call once at the end of the program - currently only used for MPI
|
|
||||||
LLAMA_API void llama_backend_free();
|
|
||||||
|
|
||||||
LLAMA_API int64_t llama_time_us();
|
|
||||||
|
|
||||||
LLAMA_API struct llama_model * llama_load_model_from_file(
|
|
||||||
const char * path_model,
|
|
||||||
struct llama_context_params params);
|
|
||||||
|
|
||||||
LLAMA_API void llama_free_model(struct llama_model * model);
|
|
||||||
|
|
||||||
LLAMA_API struct llama_context * llama_new_context_with_model(
|
|
||||||
struct llama_model * model,
|
|
||||||
struct llama_context_params params);
|
|
||||||
|
|
||||||
|
|
||||||
// Frees all allocated memory
|
|
||||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
|
||||||
|
|
||||||
// Returns 0 on success
|
|
||||||
LLAMA_API int llama_model_quantize(
|
|
||||||
const char * fname_inp,
|
|
||||||
const char * fname_out,
|
|
||||||
const llama_model_quantize_params * params);
|
|
||||||
|
|
||||||
// Apply a LoRA adapter to a loaded model
|
|
||||||
// path_base_model is the path to a higher quality model to use as a base for
|
|
||||||
// the layers modified by the adapter. Can be NULL to use the current loaded model.
|
|
||||||
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
|
||||||
// will be applied on top of the previous one
|
|
||||||
// Returns 0 on success
|
|
||||||
LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
const char * path_lora,
|
|
||||||
const char * path_base_model,
|
|
||||||
int n_threads),
|
|
||||||
"please use llama_model_apply_lora_from_file instead");
|
|
||||||
|
|
||||||
LLAMA_API int llama_model_apply_lora_from_file(
|
|
||||||
const struct llama_model * model,
|
|
||||||
const char * path_lora,
|
|
||||||
const char * path_base_model,
|
|
||||||
int n_threads);
|
|
||||||
|
|
||||||
// Returns the number of tokens in the KV cache
|
|
||||||
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
|
|
||||||
|
|
||||||
// Sets the current rng seed.
|
|
||||||
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
|
|
||||||
|
|
||||||
// Returns the maximum size in bytes of the state (rng, logits, embedding
|
|
||||||
// and kv_cache) - will often be smaller after compacting tokens
|
|
||||||
LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
|
|
||||||
|
|
||||||
// Copies the state to the specified destination address.
|
|
||||||
// Destination needs to have allocated enough memory.
|
|
||||||
// Returns the number of bytes copied
|
|
||||||
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
|
|
||||||
|
|
||||||
// Set the state reading from the specified address
|
|
||||||
// Returns the number of bytes read
|
|
||||||
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
|
|
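// Editor's note: a minimal sketch (not from this diff) of the snapshot/restore pattern
// described by the three state functions above; error handling and the surrounding
// model setup are omitted.

#include <stdint.h>
#include <stdlib.h>

static uint8_t * save_state(struct llama_context * ctx, size_t * n_out) {
    const size_t n_max = llama_get_state_size(ctx);   // upper bound on the state size
    uint8_t * buf = (uint8_t *) malloc(n_max);
    *n_out = llama_copy_state_data(ctx, buf);          // actual number of bytes written
    return buf;
}

static void restore_state(struct llama_context * ctx, const uint8_t * buf) {
    llama_set_state_data(ctx, (uint8_t *) buf);        // reads back the bytes written above
}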
||||||
|
|
||||||
// Save/load session file
|
|
||||||
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
|
|
||||||
LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
|
|
||||||
|
|
||||||
// Run the llama inference to obtain the logits and probabilities for the next token.
|
|
||||||
// tokens + n_tokens is the provided batch of new tokens to process
|
|
||||||
// n_past is the number of tokens to use from previous eval calls
|
|
||||||
// Returns 0 on success
|
|
||||||
LLAMA_API int llama_eval(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
const llama_token * tokens,
|
|
||||||
int n_tokens,
|
|
||||||
int n_past,
|
|
||||||
int n_threads);
|
|
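// Editor's note: a hedged sketch (not part of this diff) of how the llama_eval
// declaration above is typically driven: feed the prompt in fixed-size chunks while
// advancing n_past. The chunk size of 8 and thread count of 4 are arbitrary
// illustration values.

static int eval_prompt(struct llama_context * ctx, const llama_token * tokens, int n_tokens) {
    int n_past = 0;
    while (n_past < n_tokens) {
        int n_chunk = n_tokens - n_past;
        if (n_chunk > 8) {
            n_chunk = 8;
        }
        if (llama_eval(ctx, tokens + n_past, n_chunk, n_past, /*n_threads =*/ 4) != 0) {
            return 1; // llama_eval returns 0 on success
        }
        n_past += n_chunk;
    }
    return 0;
}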
||||||
|
|
||||||
// Same as llama_eval, but use float matrix input directly.
|
|
||||||
LLAMA_API int llama_eval_embd(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
const float * embd,
|
|
||||||
int n_tokens,
|
|
||||||
int n_past,
|
|
||||||
int n_threads);
|
|
||||||
|
|
||||||
// Export a static computation graph for context of 511 and batch size of 1
|
|
||||||
// NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
|
|
||||||
// parameters here to keep things simple
|
|
||||||
// IMPORTANT: do not use for anything else other than debugging and testing!
|
|
||||||
LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
|
|
||||||
|
|
||||||
// Convert the provided text into tokens.
|
|
||||||
// The tokens pointer must be large enough to hold the resulting tokens.
|
|
||||||
// Returns the number of tokens on success, no more than n_max_tokens
|
|
||||||
// Returns a negative number on failure - the number of tokens that would have been returned
|
|
||||||
// TODO: not sure if correct
|
|
||||||
LLAMA_API int llama_tokenize(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
const char * text,
|
|
||||||
llama_token * tokens,
|
|
||||||
int n_max_tokens,
|
|
||||||
bool add_bos);
|
|
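// Editor's note: a minimal sketch (not from this diff) of the grow-and-retry pattern
// suggested by the comment above, which says a negative return value reports the
// number of tokens that would have been produced.

#include <stdlib.h>

static llama_token * tokenize_text(struct llama_context * ctx, const char * text, int * n_out) {
    int cap = 64;                                                   // arbitrary initial guess
    llama_token * tokens = (llama_token *) malloc(cap * sizeof(llama_token));

    int n = llama_tokenize(ctx, text, tokens, cap, /*add_bos =*/ true);
    if (n < 0) {
        cap    = -n;                                                // exact size reported by the failure
        tokens = (llama_token *) realloc(tokens, cap * sizeof(llama_token));
        n      = llama_tokenize(ctx, text, tokens, cap, /*add_bos =*/ true);
    }

    *n_out = n;
    return tokens;
}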
||||||
|
|
||||||
LLAMA_API int llama_tokenize_bpe(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
const char * text,
|
|
||||||
llama_token * tokens,
|
|
||||||
int n_max_tokens,
|
|
||||||
bool add_bos);
|
|
||||||
|
|
||||||
LLAMA_API int llama_tokenize_with_model(
|
|
||||||
const struct llama_model * model,
|
|
||||||
const char * text,
|
|
||||||
llama_token * tokens,
|
|
||||||
int n_max_tokens,
|
|
||||||
bool add_bos);
|
|
||||||
|
|
||||||
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
|
|
||||||
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
|
|
||||||
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
|
|
||||||
|
|
||||||
LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
|
|
||||||
LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
|
|
||||||
LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
|
|
||||||
|
|
||||||
// Get the vocabulary as output parameters.
|
|
||||||
// Returns number of results.
|
|
||||||
LLAMA_API int llama_get_vocab(
|
|
||||||
const struct llama_context * ctx,
|
|
||||||
const char * * strings,
|
|
||||||
float * scores,
|
|
||||||
int capacity);
|
|
||||||
|
|
||||||
LLAMA_API int llama_get_vocab_from_model(
|
|
||||||
const struct llama_model * model,
|
|
||||||
const char * * strings,
|
|
||||||
float * scores,
|
|
||||||
int capacity);
|
|
||||||
|
|
||||||
// Token logits obtained from the last call to llama_eval()
|
|
||||||
// The logits for the last token are stored in the last row
|
|
||||||
// Can be mutated in order to change the probabilities of the next token
|
|
||||||
// Rows: n_tokens
|
|
||||||
// Cols: n_vocab
|
|
||||||
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
|
||||||
|
|
||||||
// Get the embeddings for the input
|
|
||||||
// shape: [n_embd] (1-dimensional)
|
|
||||||
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
|
||||||
|
|
||||||
// Token Id -> String. Uses the vocabulary in the provided context
|
|
||||||
LLAMA_API int llama_token_to_str(
|
|
||||||
const struct llama_context * ctx,
|
|
||||||
llama_token token,
|
|
||||||
char * str,
|
|
||||||
int length);
|
|
||||||
|
|
||||||
LLAMA_API int llama_token_to_str_bpe(
|
|
||||||
const struct llama_context * ctx,
|
|
||||||
llama_token token,
|
|
||||||
char * str,
|
|
||||||
int length);
|
|
||||||
|
|
||||||
LLAMA_API int llama_token_to_str_with_model(
|
|
||||||
const struct llama_model * model,
|
|
||||||
llama_token token,
|
|
||||||
char * str,
|
|
||||||
int length);
|
|
||||||
// Special tokens
|
|
||||||
LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
|
|
||||||
LLAMA_API llama_token llama_token_eos(); // end-of-sentence
|
|
||||||
LLAMA_API llama_token llama_token_nl(); // next-line
|
|
||||||
|
|
||||||
// Grammar
|
|
||||||
//
|
|
||||||
LLAMA_API struct llama_grammar * llama_grammar_init(
|
|
||||||
const llama_grammar_element ** rules,
|
|
||||||
size_t n_rules,
|
|
||||||
size_t start_rule_index);
|
|
||||||
|
|
||||||
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
|
|
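// Editor's note: a hedged sketch (not part of this diff) of building a one-rule
// grammar, root ::= "hi", with the element types defined above. It assumes each rule
// is an array of llama_grammar_element terminated by LLAMA_GRETYPE_END and that
// llama_grammar_init receives an array of rule pointers plus the start rule index.

static struct llama_grammar * make_hi_grammar(void) {
    static const llama_grammar_element rule_root[] = {
        { LLAMA_GRETYPE_CHAR, 'h' },   // terminal character
        { LLAMA_GRETYPE_CHAR, 'i' },   // terminal character
        { LLAMA_GRETYPE_END,  0   },   // end of rule definition
    };

    const llama_grammar_element * rules[] = { rule_root };

    return llama_grammar_init(rules, /*n_rules =*/ 1, /*start_rule_index =*/ 0);
}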
||||||
|
|
||||||
// Sampling functions
|
|
||||||
|
|
||||||
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
|
|
||||||
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
|
|
||||||
|
|
||||||
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
|
|
||||||
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
|
|
||||||
|
|
||||||
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
|
|
||||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
|
|
||||||
/// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
|
|
||||||
/// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
|
|
||||||
LLAMA_API void llama_sample_classifier_free_guidance(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
llama_token_data_array * candidates,
|
|
||||||
struct llama_context * guidance_ctx,
|
|
||||||
float scale);
|
|
||||||
|
|
||||||
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
|
||||||
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
|
|
||||||
|
|
||||||
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
|
||||||
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
|
|
||||||
|
|
||||||
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
|
||||||
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
|
|
||||||
|
|
||||||
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
|
||||||
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
|
|
||||||
|
|
||||||
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
|
|
||||||
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
|
|
||||||
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
|
|
||||||
|
|
||||||
/// @details Apply constraints from grammar
|
|
||||||
LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
|
|
||||||
|
|
||||||
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
|
||||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
|
||||||
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
|
||||||
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
|
||||||
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
|
|
||||||
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
|
||||||
LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
|
|
||||||
|
|
||||||
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
|
||||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
|
||||||
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
|
||||||
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
|
||||||
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
|
||||||
LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
|
|
||||||
|
|
||||||
/// @details Selects the token with the highest probability.
|
|
||||||
LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
|
|
||||||
|
|
||||||
/// @details Randomly selects a token from the candidates based on their probabilities.
|
|
||||||
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
|
|
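// Editor's note: a minimal sketch (not from this diff) tying the sampling functions
// above together: build a candidate array from the logits of the last evaluated
// position (assuming logits_all == false), filter it, and draw one token. The
// top-k/top-p/temperature values are arbitrary illustration settings.

#include <stdlib.h>

static llama_token sample_next(struct llama_context * ctx) {
    const int n_vocab = llama_n_vocab(ctx);
    float * logits    = llama_get_logits(ctx);   // logits for the next token

    llama_token_data * cand = (llama_token_data *) malloc(n_vocab * sizeof(llama_token_data));
    for (llama_token id = 0; id < n_vocab; ++id) {
        cand[id].id    = id;
        cand[id].logit = logits[id];
        cand[id].p     = 0.0f;
    }

    llama_token_data_array cand_arr = { cand, (size_t) n_vocab, false };

    llama_sample_top_k      (ctx, &cand_arr, 40,    1);   // keep the 40 most likely tokens
    llama_sample_top_p      (ctx, &cand_arr, 0.95f, 1);   // nucleus sampling
    llama_sample_temperature(ctx, &cand_arr, 0.8f);
    const llama_token tok = llama_sample_token(ctx, &cand_arr);

    free(cand);
    return tok;
}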
||||||
|
|
||||||
/// @details Accepts the sampled token into the grammar
|
|
||||||
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
|
|
||||||
|
|
||||||
// Performance information
|
|
||||||
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
|
|
||||||
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
|
||||||
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
|
|
||||||
|
|
||||||
// Print system information
|
|
||||||
LLAMA_API const char * llama_print_system_info(void);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// C++ API, will be moving to common.h soon (TM)
|
|
||||||
#ifdef LLAMA_API_CPP
|
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
//
|
|
||||||
// Vocab utils
|
|
||||||
//
|
|
||||||
|
|
||||||
std::vector<llama_token> llama_tokenize(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
const std::string & text,
|
|
||||||
bool add_bos);
|
|
||||||
|
|
||||||
std::vector<llama_token> llama_tokenize_bpe(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
const std::string & text,
|
|
||||||
bool add_bos);
|
|
||||||
|
|
||||||
std::string llama_token_to_str(
|
|
||||||
const struct llama_context * ctx,
|
|
||||||
llama_token token);
|
|
||||||
|
|
||||||
std::string llama_token_to_str_bpe(
|
|
||||||
const struct llama_context * ctx,
|
|
||||||
llama_token token);
|
|
||||||
|
|
||||||
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL

struct ggml_tensor;

const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);

#endif // LLAMA_API_INTERNAL

#endif // LLAMA_API_CPP

#endif // LLAMA_H
470  gguf-util.h
@@ -1,470 +0,0 @@
||||||
// GGUF counterpart of llama-util.h.
|
|
||||||
// we may consider making it a part of ggml.c once GGUF work is complete.
|
|
||||||
// this will require extra work to migrate this to pure C.
|
|
||||||
// Contains wrappers around OS interfaces.
|
|
||||||
|
|
||||||
#ifndef GGUF_UTIL_H
|
|
||||||
#define GGUF_UTIL_H
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdint>
|
|
||||||
#include <cerrno>
|
|
||||||
#include <cstring>
|
|
||||||
#include <cstdarg>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <climits>
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <sstream>
|
|
||||||
#include <vector>
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
#ifdef __has_include
|
|
||||||
#if __has_include(<unistd.h>)
|
|
||||||
#include <unistd.h>
|
|
||||||
#if defined(_POSIX_MAPPED_FILES)
|
|
||||||
#include <sys/mman.h>
|
|
||||||
#endif
|
|
||||||
#if defined(_POSIX_MEMLOCK_RANGE)
|
|
||||||
#include <sys/resource.h>
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(_WIN32)
|
|
||||||
#define WIN32_LEAN_AND_MEAN
|
|
||||||
#ifndef NOMINMAX
|
|
||||||
#define NOMINMAX
|
|
||||||
#endif
|
|
||||||
#include <windows.h>
|
|
||||||
#include <io.h>
|
|
||||||
#include <stdio.h> // for _fseeki64
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __GNUC__
|
|
||||||
#ifdef __MINGW32__
|
|
||||||
__attribute__((format(gnu_printf, 1, 2)))
|
|
||||||
#else
|
|
||||||
__attribute__((format(printf, 1, 2)))
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
static std::string format(const char * fmt, ...) {
|
|
||||||
va_list ap, ap2;
|
|
||||||
va_start(ap, fmt);
|
|
||||||
va_copy(ap2, ap);
|
|
||||||
int size = vsnprintf(NULL, 0, fmt, ap);
|
|
||||||
GGML_ASSERT(size >= 0 && size < INT_MAX);
|
|
||||||
std::vector<char> buf(size + 1);
|
|
||||||
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
|
||||||
GGML_ASSERT(size2 == size);
|
|
||||||
va_end(ap2);
|
|
||||||
va_end(ap);
|
|
||||||
return std::string(buf.data(), size);
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: can we merge this one and gguf_context?
|
|
||||||
struct gguf_file {
|
|
||||||
// use FILE * so we don't have to re-open the file to mmap
|
|
||||||
FILE * fp;
|
|
||||||
size_t size;
|
|
||||||
|
|
||||||
gguf_file(const char * fname, const char * mode) {
|
|
||||||
fp = std::fopen(fname, mode);
|
|
||||||
if (fp == NULL) {
|
|
||||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
|
||||||
}
|
|
||||||
seek(0, SEEK_END);
|
|
||||||
size = tell();
|
|
||||||
seek(0, SEEK_SET);
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t tell() const {
|
|
||||||
#ifdef _WIN32
|
|
||||||
__int64 ret = _ftelli64(fp);
|
|
||||||
#else
|
|
||||||
long ret = std::ftell(fp);
|
|
||||||
#endif
|
|
||||||
GGML_ASSERT(ret != -1); // this really shouldn't fail
|
|
||||||
return (size_t) ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
void seek(size_t offset, int whence) {
|
|
||||||
#ifdef _WIN32
|
|
||||||
int ret = _fseeki64(fp, (__int64) offset, whence);
|
|
||||||
#else
|
|
||||||
int ret = std::fseek(fp, (long) offset, whence);
|
|
||||||
#endif
|
|
||||||
GGML_ASSERT(ret == 0); // same
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t write_str(const std::string & val) {
|
|
||||||
size_t total_written = 0;
|
|
||||||
const int32_t n = val.size();
|
|
||||||
fwrite((const char *) &n, sizeof(n), 1, fp);
|
|
||||||
total_written += sizeof(n);
|
|
||||||
fwrite(val.c_str(), n, 1, fp);
|
|
||||||
total_written += n;
|
|
||||||
|
|
||||||
return total_written;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t write_i32(int32_t val) {
|
|
||||||
fwrite((const char *) &val, sizeof(val), 1, fp);
|
|
||||||
return sizeof(val);
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t write_u64(size_t val) {
|
|
||||||
fwrite((const char *) &val, sizeof(val), 1, fp);
|
|
||||||
return sizeof(val);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename T>
|
|
||||||
void write_val(const std::string & key, enum gguf_type type, const T & val) {
|
|
||||||
write_str(key);
|
|
||||||
fwrite((const char *) &type, sizeof(type), 1, fp);
|
|
||||||
fwrite((const char *) &val, sizeof(val), 1, fp);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename T>
|
|
||||||
void write_arr(const std::string & key, enum gguf_type type, const std::vector<T> & val) {
|
|
||||||
write_str(key);
|
|
||||||
{
|
|
||||||
const enum gguf_type tarr = GGUF_TYPE_ARRAY;
|
|
||||||
fwrite((const char *) &tarr, sizeof(tarr), 1, fp);
|
|
||||||
}
|
|
||||||
|
|
||||||
const int32_t n = val.size();
|
|
||||||
fwrite((const char *) &type, sizeof(type), 1, fp);
|
|
||||||
fwrite((const char *) &n, sizeof(n), 1, fp);
|
|
||||||
fwrite(val.data(), sizeof(T), n, fp);
|
|
||||||
}
|
|
||||||
|
|
||||||
void write_str(const std::string & key, enum gguf_type type, const std::string & val) {
|
|
||||||
write_str(key);
|
|
||||||
fwrite((const char *) &type, sizeof(type), 1, fp);
|
|
||||||
|
|
||||||
const int32_t n = val.size();
|
|
||||||
fwrite((const char *) &n, sizeof(n), 1, fp);
|
|
||||||
fwrite(val.c_str(), n, 1, fp);
|
|
||||||
}
|
|
||||||
|
|
||||||
void write_str(const std::string & key, enum gguf_type type, const std::vector<std::string> & val) {
|
|
||||||
write_str(key);
|
|
||||||
{
|
|
||||||
const enum gguf_type tarr = GGUF_TYPE_ARRAY;
|
|
||||||
fwrite((const char *) &tarr, sizeof(tarr), 1, fp);
|
|
||||||
}
|
|
||||||
|
|
||||||
const int32_t n = val.size();
|
|
||||||
fwrite((const char *) &type, sizeof(type), 1, fp);
|
|
||||||
fwrite((const char *) &n, sizeof(n), 1, fp);
|
|
||||||
for (int i = 0; i < n; ++i) {
|
|
||||||
const int32_t nstr = val[i].size();
|
|
||||||
fwrite((const char *) &nstr, sizeof(nstr), 1, fp);
|
|
||||||
fwrite(val[i].c_str(), nstr, 1, fp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void write_zeros(size_t count) {
|
|
||||||
for (size_t i = 0; i < count; ++i) {
|
|
||||||
fputc(0, fp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void read_raw(void * ptr, size_t len) const {
|
|
||||||
if (len == 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
errno = 0;
|
|
||||||
std::size_t ret = std::fread(ptr, len, 1, fp);
|
|
||||||
if (ferror(fp)) {
|
|
||||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
|
||||||
}
|
|
||||||
if (ret != 1) {
|
|
||||||
throw std::runtime_error(std::string("unexpectedly reached end of file"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void write_raw(const void * ptr, size_t len) const {
|
|
||||||
if (len == 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
errno = 0;
|
|
||||||
size_t ret = std::fwrite(ptr, len, 1, fp);
|
|
||||||
if (ret != 1) {
|
|
||||||
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
~gguf_file() {
|
|
||||||
if (fp) {
|
|
||||||
std::fclose(fp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#if defined(_WIN32)
|
|
||||||
static std::string gguf_format_win_err(DWORD err) {
|
|
||||||
LPSTR buf;
|
|
||||||
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
|
||||||
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
|
|
||||||
if (!size) {
|
|
||||||
return "FormatMessageA failed";
|
|
||||||
}
|
|
||||||
std::string ret(buf, size);
|
|
||||||
LocalFree(buf);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct gguf_mmap {
|
|
||||||
void * addr;
|
|
||||||
size_t size;
|
|
||||||
|
|
||||||
gguf_mmap(const gguf_mmap &) = delete;
|
|
||||||
|
|
||||||
#ifdef _POSIX_MAPPED_FILES
|
|
||||||
static constexpr bool SUPPORTED = true;
|
|
||||||
|
|
||||||
gguf_mmap(struct gguf_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
|
|
||||||
size = file->size;
|
|
||||||
int fd = fileno(file->fp);
|
|
||||||
int flags = MAP_SHARED;
|
|
||||||
// prefetch/readahead impairs performance on NUMA systems
|
|
||||||
if (numa) { prefetch = 0; }
|
|
||||||
#ifdef __linux__
|
|
||||||
if (prefetch) { flags |= MAP_POPULATE; }
|
|
||||||
#endif
|
|
||||||
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
|
||||||
if (addr == MAP_FAILED) {
|
|
||||||
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (prefetch > 0) {
|
|
||||||
// Advise the kernel to preload the mapped memory
|
|
||||||
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
|
|
||||||
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
|
||||||
strerror(errno));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (numa) {
|
|
||||||
// advise the kernel not to use readahead
|
|
||||||
// (because the next page might not belong on the same node)
|
|
||||||
if (madvise(addr, file->size, MADV_RANDOM)) {
|
|
||||||
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
|
|
||||||
strerror(errno));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
~gguf_mmap() {
|
|
||||||
munmap(addr, size);
|
|
||||||
}
|
|
||||||
#elif defined(_WIN32)
|
|
||||||
static constexpr bool SUPPORTED = true;
|
|
||||||
|
|
||||||
gguf_mmap(struct gguf_file * file, bool prefetch = true, bool numa = false) {
|
|
||||||
(void) numa;
|
|
||||||
|
|
||||||
size = file->size;
|
|
||||||
|
|
||||||
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
|
|
||||||
|
|
||||||
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
|
||||||
DWORD error = GetLastError();
|
|
||||||
|
|
||||||
if (hMapping == NULL) {
|
|
||||||
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
|
|
||||||
}
|
|
||||||
|
|
||||||
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
|
||||||
error = GetLastError();
|
|
||||||
CloseHandle(hMapping);
|
|
||||||
|
|
||||||
if (addr == NULL) {
|
|
||||||
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
|
|
||||||
}
|
|
||||||
|
|
||||||
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
|
||||||
if (prefetch) {
|
|
||||||
// Advise the kernel to preload the mapped memory
|
|
||||||
WIN32_MEMORY_RANGE_ENTRY range;
|
|
||||||
range.VirtualAddress = addr;
|
|
||||||
range.NumberOfBytes = (SIZE_T)size;
|
|
||||||
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
|
||||||
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
|
|
||||||
gguf_format_win_err(GetLastError()).c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
|
|
||||||
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
|
||||||
}
|
|
||||||
|
|
||||||
~gguf_mmap() {
|
|
||||||
if (!UnmapViewOfFile(addr)) {
|
|
||||||
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
|
|
||||||
gguf_format_win_err(GetLastError()).c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
static constexpr bool SUPPORTED = false;
|
|
||||||
|
|
||||||
gguf_mmap(struct gguf_file *, bool prefetch = true, bool numa = false) {
|
|
||||||
(void) prefetch;
|
|
||||||
(void) numa;
|
|
||||||
|
|
||||||
throw std::runtime_error(std::string("mmap not supported"));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
};
|
|
||||||
|
|
||||||
// Represents some region of memory being locked using mlock or VirtualLock;
|
|
||||||
// will automatically unlock on destruction.
|
|
||||||
struct gguf_mlock {
|
|
||||||
void * addr = NULL;
|
|
||||||
size_t size = 0;
|
|
||||||
bool failed_already = false;
|
|
||||||
|
|
||||||
gguf_mlock() {}
|
|
||||||
gguf_mlock(const gguf_mlock &) = delete;
|
|
||||||
|
|
||||||
~gguf_mlock() {
|
|
||||||
if (size) {
|
|
||||||
raw_unlock(addr, size);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void init(void * ptr) {
|
|
||||||
GGML_ASSERT(addr == NULL && size == 0);
|
|
||||||
addr = ptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void grow_to(size_t target_size) {
|
|
||||||
GGML_ASSERT(addr);
|
|
||||||
if (failed_already) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
size_t granularity = lock_granularity();
|
|
||||||
target_size = (target_size + granularity - 1) & ~(granularity - 1);
|
|
||||||
if (target_size > size) {
|
|
||||||
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
|
|
||||||
size = target_size;
|
|
||||||
} else {
|
|
||||||
failed_already = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef _POSIX_MEMLOCK_RANGE
|
|
||||||
static constexpr bool SUPPORTED = true;
|
|
||||||
|
|
||||||
size_t lock_granularity() {
|
|
||||||
return (size_t) sysconf(_SC_PAGESIZE);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef __APPLE__
|
|
||||||
#define MLOCK_SUGGESTION \
|
|
||||||
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
|
||||||
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
|
|
||||||
#else
|
|
||||||
#define MLOCK_SUGGESTION \
|
|
||||||
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
bool raw_lock(const void * addr, size_t size) {
|
|
||||||
if (!mlock(addr, size)) {
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
char* errmsg = std::strerror(errno);
|
|
||||||
bool suggest = (errno == ENOMEM);
|
|
||||||
|
|
||||||
// Check if the resource limit is fine after all
|
|
||||||
struct rlimit lock_limit;
|
|
||||||
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
|
|
||||||
suggest = false;
|
|
||||||
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
|
|
||||||
suggest = false;
|
|
||||||
|
|
||||||
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
|
|
||||||
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#undef MLOCK_SUGGESTION
|
|
||||||
|
|
||||||
void raw_unlock(void * addr, size_t size) {
|
|
||||||
if (munlock(addr, size)) {
|
|
||||||
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#elif defined(_WIN32)
|
|
||||||
static constexpr bool SUPPORTED = true;
|
|
||||||
|
|
||||||
size_t lock_granularity() {
|
|
||||||
SYSTEM_INFO si;
|
|
||||||
GetSystemInfo(&si);
|
|
||||||
return (size_t) si.dwPageSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool raw_lock(void * ptr, size_t len) {
|
|
||||||
for (int tries = 1; ; tries++) {
|
|
||||||
if (VirtualLock(ptr, len)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (tries == 2) {
|
|
||||||
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
|
||||||
len, size, gguf_format_win_err(GetLastError()).c_str());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// It failed but this was only the first try; increase the working
|
|
||||||
// set size and try again.
|
|
||||||
SIZE_T min_ws_size, max_ws_size;
|
|
||||||
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
|
|
||||||
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
|
|
||||||
gguf_format_win_err(GetLastError()).c_str());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
// Per MSDN: "The maximum number of pages that a process can lock
|
|
||||||
// is equal to the number of pages in its minimum working set minus
|
|
||||||
// a small overhead."
|
|
||||||
// Hopefully a megabyte is enough overhead:
|
|
||||||
size_t increment = len + 1048576;
|
|
||||||
// The minimum must be <= the maximum, so we need to increase both:
|
|
||||||
min_ws_size += increment;
|
|
||||||
max_ws_size += increment;
|
|
||||||
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
|
|
||||||
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
|
|
||||||
gguf_format_win_err(GetLastError()).c_str());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void raw_unlock(void * ptr, size_t len) {
|
|
||||||
if (!VirtualUnlock(ptr, len)) {
|
|
||||||
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
|
|
||||||
gguf_format_win_err(GetLastError()).c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
static constexpr bool SUPPORTED = false;
|
|
||||||
|
|
||||||
size_t lock_granularity() {
|
|
||||||
return (size_t) 65536;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool raw_lock(const void * addr, size_t len) {
|
|
||||||
fprintf(stderr, "warning: mlock not supported on this system\n");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
void raw_unlock(const void * addr, size_t len) {}
|
|
||||||
#endif
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif
|
|
239  gguf.py
@@ -4,14 +4,169 @@
 3. After development is done, Convert it to a proper pip-installable Python package, and possibly move it to its own repo under ggml-org.
 """

+import sys
 import struct
-import constants
+import numpy as np

 from enum import IntEnum
 from typing import Any, IO, List

-import numpy as np
-import sys
+#
+# constants
+#
+
+GGUF_MAGIC             = 0x47475546
+GGUF_VERSION           = 1
+GGUF_DEFAULT_ALIGNMENT = 32
+
+# general
+KEY_GENERAL_ARCHITECTURE         = "general.architecture"
+KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
+KEY_GENERAL_ALIGNMENT            = "general.alignment"
+KEY_GENERAL_NAME                 = "general.name"
+KEY_GENERAL_AUTHOR               = "general.author"
+KEY_GENERAL_URL                  = "general.url"
+KEY_GENERAL_DESCRIPTION          = "general.description"
+KEY_GENERAL_FILE_TYPE            = "general.file_type"
+KEY_GENERAL_LICENSE              = "general.license"
+KEY_GENERAL_SOURCE_URL           = "general.source.url"
+KEY_GENERAL_SOURCE_HF_REPO       = "general.source.hugginface.repository"
+
+# LLM
+KEY_LLM_CONTEXT_LENGTH        = "{llm}.context_length"
+KEY_LLM_EMBEDDING_LENGTH      = "{llm}.embedding_length"
+KEY_LLM_BLOCK_COUNT           = "{llm}.block_count"
+KEY_LLM_FEED_FORWARD_LENGTH   = "{llm}.feed_forward_length"
+KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual"
+KEY_LLM_TENSOR_DATA_LAYOUT    = "{llm}.tensor_data_layout"
+
+# attention
+KEY_ATTENTION_HEAD_COUNT        = "{llm}.attention.head_count"
+KEY_ATTENTION_HEAD_COUNT_KV     = "{llm}.attention.head_count_kv"
+KEY_ATTENTION_MAX_ALIBI_BIAS    = "{llm}.attention.max_alibi_bias"
+KEY_ATTENTION_CLAMP_KQV         = "{llm}.attention.clamp_kqv"
+KEY_ATTENTION_LAYERNORM_EPS     = "{llm}.attention.layer_norm_epsilon"
+KEY_ATTENTION_LAYERNORM_RMS_EPS = "{llm}.attention.layer_norm_rms_epsilon"
+
+# RoPE
+KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count"
+KEY_ROPE_SCALE           = "{llm}.rope.scale"
+
+# tokenization
+KEY_TOKENIZER_MODEL      = "tokenizer.ggml.model"
+KEY_TOKENIZER_LIST       = "tokenizer.ggml.tokens"
+KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
+KEY_TOKENIZER_SCORES     = "tokenizer.ggml.scores"
+KEY_TOKENIZER_MERGES     = "tokenizer.ggml.merges"
+KEY_TOKENIZER_BOS_ID     = "tokenizer.ggml.bos_token_id"
+KEY_TOKENIZER_EOS_ID     = "tokenizer.ggml.eos_token_id"
+KEY_TOKENIZER_UNK_ID     = "tokenizer.ggml.unknown_token_id"
+KEY_TOKENIZER_SEP_ID     = "tokenizer.ggml.seperator_token_id"
+KEY_TOKENIZER_PAD_ID     = "tokenizer.ggml.padding_token_id"
+KEY_TOKENIZER_HF_JSON    = "tokenizer.huggingface.json"
+KEY_TOKENIZER_RWKV       = "tokenizer.rwkv.world"
+
+#
+# recommended mapping of model tensor names for storage in gguf
+#
+
def get_tensor_name_map(n_blocks : int):
|
||||||
|
tensor_map = {}
|
||||||
|
# Token embeddings
|
||||||
|
mapped_to = "token_embd"
|
||||||
|
tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
|
||||||
|
tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
|
||||||
|
tensor_map["transformer.word_embeddings"] = mapped_to # falcon
|
||||||
|
tensor_map["model.embed_tokens"] = mapped_to # llama-hf
|
||||||
|
tensor_map["tok_embeddings"] = mapped_to # llama-pth
|
||||||
|
# Position embeddings
|
||||||
|
mapped_to = "pos_embd"
|
||||||
|
tensor_map["transformer.wpe"] = mapped_to # gpt2
|
||||||
|
# Output norm
|
||||||
|
mapped_to = "output_norm"
|
||||||
|
tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
|
||||||
|
tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
|
||||||
|
tensor_map["transformer.norm_f"] = mapped_to # mpt
|
||||||
|
tensor_map["model.norm"] = mapped_to # llama-hf
|
||||||
|
tensor_map["norm"] = mapped_to # llama-pth
|
||||||
|
# Output
|
||||||
|
mapped_to = "output"
|
||||||
|
tensor_map["embed_out"] = mapped_to # gptneox
|
||||||
|
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
|
||||||
|
tensor_map["output"] = mapped_to # llama-pth
|
||||||
|
# Attention and feed-forward layer blocks
|
||||||
|
for i in range(0,n_blocks):
|
||||||
|
# Attention norm
|
||||||
|
mapped_to = "blk."+str(i)+".attn_norm"
|
||||||
|
tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
|
||||||
|
tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
|
||||||
|
tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
|
||||||
|
tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
|
||||||
|
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
|
||||||
|
tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
|
||||||
|
tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
|
||||||
|
# Attention norm 2
|
||||||
|
mapped_to = "blk."+str(i)+".attn_norm_2"
|
||||||
|
tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
|
||||||
|
# Attention query-key-value
|
||||||
|
mapped_to = "blk."+str(i)+".attn_qkv"
|
||||||
|
tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
|
||||||
|
tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
|
||||||
|
tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
|
||||||
|
tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
|
||||||
|
# Attention query
|
||||||
|
mapped_to = "blk."+str(i)+".attn_q"
|
||||||
|
tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
|
||||||
|
tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
|
||||||
|
# Attention key
|
||||||
|
mapped_to = "blk."+str(i)+".attn_k"
|
||||||
|
tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
|
||||||
|
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
|
||||||
|
# Attention value
|
||||||
|
mapped_to = "blk."+str(i)+".attn_v"
|
||||||
|
tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
|
||||||
|
tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
|
||||||
|
# Attention output
|
||||||
|
mapped_to = "blk."+str(i)+".attn_output"
|
||||||
|
tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
|
||||||
|
tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
|
||||||
|
tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
|
||||||
|
tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
|
||||||
|
tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
|
||||||
|
tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
|
||||||
|
# Feed-forward norm
|
||||||
|
mapped_to = "blk."+str(i)+".ffn_norm"
|
||||||
|
tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
|
||||||
|
tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
|
||||||
|
tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
|
||||||
|
tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
|
||||||
|
tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
|
||||||
|
# Feed-forward up
|
||||||
|
mapped_to = "blk."+str(i)+".ffn_up"
|
||||||
|
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
|
||||||
|
tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
|
||||||
|
tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
|
||||||
|
tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
|
||||||
|
tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
|
||||||
|
tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
|
||||||
|
# Feed-forward gate
|
||||||
|
mapped_to = "blk."+str(i)+".ffn_gate"
|
||||||
|
tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
|
||||||
|
tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
|
||||||
|
# Feed-forward down
|
||||||
|
mapped_to = "blk."+str(i)+".ffn_down"
|
||||||
|
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
|
||||||
|
tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
|
||||||
|
tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
|
||||||
|
tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
|
||||||
|
tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
|
||||||
|
tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth
|
||||||
|
|
||||||
|
return tensor_map
|
||||||
|
|
||||||
|
#
|
||||||
|
# implementation
|
||||||
|
#
|
||||||
|
|
||||||
 class GGMLQuantizationType(IntEnum):
     F32 = 0
@@ -51,15 +206,15 @@ class GGUFWriter:
     def __init__(self, fout: IO):
         self.fout = fout
         self.offset_tensor = 0
-        self.data_alignment = constants.GGUF_DEFAULT_ALIGNMENT
+        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
         self.kv_data = b""
         self.kv_data_count = 0
         self.ti_data = b""
         self.ti_data_count = 0

     def write_header_to_file(self):
-        self.fout.write(struct.pack("<I", constants.GGUF_MAGIC))
+        self.fout.write(struct.pack("<I", GGUF_MAGIC))
-        self.fout.write(struct.pack("<I", constants.GGUF_VERSION))
+        self.fout.write(struct.pack("<I", GGUF_VERSION))
         self.fout.write(struct.pack("<I", self.ti_data_count))
         self.fout.write(struct.pack("<I", self.kv_data_count))
         self.flush()

@@ -201,123 +356,125 @@ class GGUFWriter:
         self.fout.close()

     def add_architecture(self, architecture: str):
-        self.add_string(constants.KEY_GENERAL_ARCHITECTURE,
+        self.add_string(KEY_GENERAL_ARCHITECTURE,
                         architecture)

     def add_author(self, author: str):
-        self.add_string(constants.KEY_GENERAL_AUTHOR, author)
+        self.add_string(KEY_GENERAL_AUTHOR, author)

+    def add_tensor_data_layout(self, layout: str):
+        self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT , layout)
+
     def add_url(self, url: str):
-        self.add_string(constants.KEY_GENERAL_URL, url)
+        self.add_string(KEY_GENERAL_URL, url)

     def add_description(self, description: str):
-        self.add_string(constants.KEY_GENERAL_DESCRIPTION, description)
+        self.add_string(KEY_GENERAL_DESCRIPTION, description)

     def add_file_type(self, file_type: str):
-        self.add_string(constants.KEY_GENERAL_FILE_TYPE, file_type)
+        self.add_string(KEY_GENERAL_FILE_TYPE, file_type)

     def add_source_url(self, url: str):
-        self.add_string(constants.KEY_GENERAL_SOURCE_URL, url)
+        self.add_string(KEY_GENERAL_SOURCE_URL, url)

     def add_source_hf_repo(self, repo: str):
-        self.add_string(constants.KEY_GENERAL_SOURCE_HF_REPO, repo)
+        self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)

     def add_name(self, name: str):
-        self.add_string(constants.KEY_GENERAL_NAME, name)
+        self.add_string(KEY_GENERAL_NAME, name)

     def add_quantization_version(self, quantization_version: GGMLQuantizationType):
         self.add_uint32(
-            constants.KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
+            KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)

     def add_custom_alignment(self, alignment: int):
         self.data_alignment = alignment
-        self.add_uint32(constants.KEY_GENERAL_ALIGNMENT, alignment)
+        self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)

     def add_context_length(self, llm: str, length: int):
         self.add_uint32(
-            constants.KEY_LLM_CONTEXT_LENGTH.format(llm=llm), length)
+            KEY_LLM_CONTEXT_LENGTH.format(llm=llm), length)

     def add_embedding_length(self, llm: str, length: int):
         self.add_uint32(
-            constants.KEY_LLM_EMBEDDING_LENGTH.format(llm=llm), length)
+            KEY_LLM_EMBEDDING_LENGTH.format(llm=llm), length)

     def add_block_count(self, llm: str, length: int):
         self.add_uint32(
-            constants.KEY_LLM_BLOCK_COUNT.format(llm=llm), length)
+            KEY_LLM_BLOCK_COUNT.format(llm=llm), length)

     def add_feed_forward_length(self, llm: str, length: int):
         self.add_uint32(
-            constants.KEY_LLM_FEED_FORWARD_LENGTH.format(llm=llm), length)
+            KEY_LLM_FEED_FORWARD_LENGTH.format(llm=llm), length)

     def add_parallel_residual(self, llm: str, use: bool):
         self.add_bool(
-            constants.KEY_LLM_USE_PARALLEL_RESIDUAL.format(llm=llm), use)
+            KEY_LLM_USE_PARALLEL_RESIDUAL.format(llm=llm), use)

     def add_tensor_data_layout(self, llm: str, layout: str):
         self.add_string(
-            constants.KEY_LLM_TENSOR_DATA_LAYOUT.format(llm=llm), layout)
+            KEY_LLM_TENSOR_DATA_LAYOUT.format(llm=llm), layout)

     def add_head_count(self, llm: str, count: int):
         self.add_uint32(
-            constants.KEY_ATTENTION_HEAD_COUNT.format(llm=llm), count)
+            KEY_ATTENTION_HEAD_COUNT.format(llm=llm), count)

     def add_head_count_kv(self, llm: str, count: int):
         self.add_uint32(
-            constants.KEY_ATTENTION_HEAD_COUNT_KV.format(llm=llm), count)
+            KEY_ATTENTION_HEAD_COUNT_KV.format(llm=llm), count)

     def add_max_alibi_bias(self, llm: str, bias: float):
         self.add_float32(
-            constants.KEY_ATTENTION_MAX_ALIBI_BIAS.format(llm=llm), bias)
+            KEY_ATTENTION_MAX_ALIBI_BIAS.format(llm=llm), bias)

     def add_clamp_kqv(self, llm: str, value: float):
         self.add_float32(
-            constants.KEY_ATTENTION_CLAMP_KQV.format(llm=llm), value)
+            KEY_ATTENTION_CLAMP_KQV.format(llm=llm), value)

     def add_layer_norm_eps(self, llm: str, value: float):
         self.add_float32(
-            constants.KEY_ATTENTION_LAYERNORM_EPS.format(llm=llm), value)
+            KEY_ATTENTION_LAYERNORM_EPS.format(llm=llm), value)

     def add_layer_norm_rms_eps(self, llm: str, value: float):
         self.add_float32(
-            constants.KEY_ATTENTION_LAYERNORM_RMS_EPS.format(llm=llm), value)
+            KEY_ATTENTION_LAYERNORM_RMS_EPS.format(llm=llm), value)

     def add_rope_dimension_count(self, llm: str, count: int):
         self.add_uint32(
-            constants.KEY_ROPE_DIMENSION_COUNT.format(llm=llm), count)
+            KEY_ROPE_DIMENSION_COUNT.format(llm=llm), count)

     def add_rope_scale(self, llm: str, value: float):
-        self.add_float32(constants.KEY_ROPE_SCALE.format(llm=llm), value)
+        self.add_float32(KEY_ROPE_SCALE.format(llm=llm), value)

     def add_tokenizer_model(self, model: str):
-        self.add_string(constants.KEY_TOKENIZER_MODEL, model)
+        self.add_string(KEY_TOKENIZER_MODEL, model)

     def add_token_list(self, tokens: List):
-        self.add_array(constants.KEY_TOKENIZER_LIST, tokens)
+        self.add_array(KEY_TOKENIZER_LIST, tokens)

     def add_token_merges(self, merges: List):
-        self.add_array(constants.KEY_TOKENIZER_MERGES, merges)
+        self.add_array(KEY_TOKENIZER_MERGES, merges)

     def add_token_types(self, types: List[int]):
-        self.add_array(constants.KEY_TOKENIZER_TOKEN_TYPE, types)
+        self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)

     def add_token_scores(self, scores: List[float]):
-        self.add_array(constants.KEY_TOKENIZER_SCORES, scores)
+        self.add_array(KEY_TOKENIZER_SCORES, scores)

     def add_bos_token_id(self, id: int):
-        self.add_uint32(constants.KEY_TOKENIZER_BOS_ID, id)
+        self.add_uint32(KEY_TOKENIZER_BOS_ID, id)

     def add_eos_token_id(self, id: int):
-        self.add_uint32(constants.KEY_TOKENIZER_EOS_ID, id)
+        self.add_uint32(KEY_TOKENIZER_EOS_ID, id)

     def add_unk_token_id(self, id: int):
-        self.add_uint32(constants.KEY_TOKENIZER_UNK_ID, id)
+        self.add_uint32(KEY_TOKENIZER_UNK_ID, id)

     def add_sep_token_id(self, id: int):
-        self.add_uint32(constants.KEY_TOKENIZER_SEP_ID, id)
+        self.add_uint32(KEY_TOKENIZER_SEP_ID, id)

     def add_pad_token_id(self, id: int):
-        self.add_uint32(constants.KEY_TOKENIZER_PAD_ID, id)
+        self.add_uint32(KEY_TOKENIZER_PAD_ID, id)


 # Example usage:
 if __name__ == "__main__":
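For orientation, the four struct.pack("<I", ...) writes in write_header_to_file() above are the entire fixed-size GGUF v1 header: magic, format version, tensor-info count and key/value count, all little-endian 32-bit. A minimal C++ sketch of reading that header back; the struct and function names are illustrative, not code from this tree:

// Hypothetical reader for the 16-byte header emitted by GGUFWriter.write_header_to_file().
#include <cstdint>
#include <cstdio>
#include <stdexcept>

struct gguf_header_v1 {
    uint32_t magic;      // expected 0x47475546 (GGUF_MAGIC)
    uint32_t version;    // expected 1
    uint32_t n_tensors;  // number of tensor-info records that follow
    uint32_t n_kv;       // number of key/value pairs that follow
};

static gguf_header_v1 read_gguf_header(const char * fname) {
    FILE * f = std::fopen(fname, "rb");
    if (!f) { throw std::runtime_error("failed to open file"); }
    gguf_header_v1 h;
    // assumes a little-endian host, matching struct.pack("<I", ...) on the writer side
    if (std::fread(&h, sizeof(h), 1, f) != 1) {
        std::fclose(f);
        throw std::runtime_error("failed to read GGUF header");
    }
    std::fclose(f);
    if (h.magic != 0x47475546u) { throw std::runtime_error("not a GGUF file"); }
    return h;
}

A single fread works here because the four fields are packed back-to-back with no padding; on a big-endian host the values would need byte-swapping afterwards.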
@@ -1,95 +0,0 @@
# Recommended mapping of model tensor names for storage in gguf

def get_tensor_namemap( n_blocks : int):
    tensor_map = {}
    # Token embeddings
    mapped_to = "token_embd"
    tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
    tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
    tensor_map["transformer.word_embeddings"] = mapped_to # falcon
    tensor_map["model.embed_tokens"] = mapped_to # llama-hf
    tensor_map["tok_embeddings"] = mapped_to # llama-pth
    # Position embeddings
    mapped_to = "pos_embd"
    tensor_map["transformer.wpe"] = mapped_to # gpt2
    # Output norm
    mapped_to = "output_norm"
    tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
    tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
    tensor_map["transformer.norm_f"] = mapped_to # mpt
    tensor_map["model.norm"] = mapped_to # llama-hf
    tensor_map["norm"] = mapped_to # llama-pth
    # Output
    mapped_to = "output"
    tensor_map["embed_out"] = mapped_to # gptneox
    tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
    tensor_map["output"] = mapped_to # llama-pth
    # Attention and fee-forward layer blocks
    for i in range(0,n_blocks):
        # Attention norm
        mapped_to = "blk."+str(i)+".attn_norm"
        tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
        tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
        tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
        tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
        tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
        tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
        tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
        # Attention norm 2
        mapped_to = "blk."+str(i)+".attn_norm_2"
        tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
        # Attention query-key-value
        mapped_to = "blk."+str(i)+".attn_qkv"
        tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
        tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
        tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
        tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
        # Attention query
        mapped_to = "blk."+str(i)+".attn_q"
        tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
        tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
        # Attention key
        mapped_to = "blk."+str(i)+".attn_k"
        tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
        tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
        # Attention value
        mapped_to = "blk."+str(i)+".attn_v"
        tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
        tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
        # Attention output
        mapped_to = "blk."+str(i)+".attn_output"
        tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
        tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
        tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
        tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
        tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
        tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
        # Feed-forward norm
        mapped_to = "blk."+str(i)+".ffn_norm"
        tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
        tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
        tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
        tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
        tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
        # Feed-forward up
        mapped_to = "blk."+str(i)+".ffn_up"
        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
        tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
        tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
        tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
        tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
        tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
        # Feed-forward gate
        mapped_to = "blk."+str(i)+".ffn_gate"
        tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
        tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
        # Feed-forward down
        mapped_to = "blk."+str(i)+".ffn_down"
        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
        tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
        tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
        tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
        tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
        tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth

    return tensor_map
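The helper deleted above is the same name mapping that the gguf.py hunk earlier in this diff adds: checkpoint tensor names from gptneox, gpt2, mpt, falcon, llama-hf and llama-pth, keyed to the canonical blk.N.* gguf names. As a rough C++ illustration of how a converter can use such a table — the function and the handful of entries below are mine, not code from this tree:

#include <cstdio>
#include <string>
#include <unordered_map>

// Illustrative only: mirrors a few entries of the Python map per block.
static std::unordered_map<std::string, std::string> make_tensor_name_map(int n_blocks) {
    std::unordered_map<std::string, std::string> m;
    m["model.embed_tokens"] = "token_embd"; // llama-hf
    m["lm_head"]            = "output";     // gpt2 mpt falcon llama-hf
    for (int i = 0; i < n_blocks; ++i) {
        const std::string blk = "blk." + std::to_string(i);
        m["model.layers." + std::to_string(i) + ".self_attn.q_proj"] = blk + ".attn_q";   // llama-hf
        m["model.layers." + std::to_string(i) + ".mlp.down_proj"]    = blk + ".ffn_down"; // llama-hf
    }
    return m;
}

int main() {
    const auto m  = make_tensor_name_map(32);
    const auto it = m.find("model.layers.0.self_attn.q_proj");
    if (it != m.end()) {
        std::printf("%s -> %s\n", it->first.c_str(), it->second.c_str()); // blk.0.attn_q
    }
    return 0;
}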
@@ -381,6 +381,8 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
        if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.file_type");
        if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
+       if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
        if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
    }
553 llama-util.h
@@ -1,553 +0,0 @@
// Internal header to be included only by llama.cpp.
// Contains wrappers around OS interfaces.

#ifndef LLAMA_UTIL_H
#define LLAMA_UTIL_H

#include <cstdio>
#include <cstdint>
#include <cerrno>
#include <cstring>
#include <cstdarg>
#include <cstdlib>
#include <climits>

#include <string>
#include <vector>
#include <stdexcept>

#ifdef __has_include
    #if __has_include(<unistd.h>)
        #include <unistd.h>
        #if defined(_POSIX_MAPPED_FILES)
            #include <sys/mman.h>
        #endif
        #if defined(_POSIX_MEMLOCK_RANGE)
            #include <sys/resource.h>
        #endif
    #endif
#endif

#if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
    #ifndef NOMINMAX
        #define NOMINMAX
    #endif
    #include <windows.h>
    #include <io.h>
    #include <stdio.h> // for _fseeki64
#endif

#define LLAMA_ASSERT(x) \
    do { \
        if (!(x)) { \
            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
            abort(); \
        } \
    } while (0)

#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
    va_list ap, ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    int size = vsnprintf(NULL, 0, fmt, ap);
    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
    std::vector<char> buf(size + 1);
    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
    LLAMA_ASSERT(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
}

struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
        }
        seek(0, SEEK_END);
        size = tell();
        seek(0, SEEK_SET);
    }

    size_t tell() const {
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
        return (size_t) ret;
    }

    void seek(size_t offset, int whence) {
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        LLAMA_ASSERT(ret == 0); // same
    }

    void read_raw(void * ptr, size_t len) const {
        if (len == 0) {
            return;
        }
        errno = 0;
        std::size_t ret = std::fread(ptr, len, 1, fp);
        if (ferror(fp)) {
            throw std::runtime_error(format("read error: %s", strerror(errno)));
        }
        if (ret != 1) {
            throw std::runtime_error(std::string("unexpectedly reached end of file"));
        }
    }

    std::uint32_t read_u32() {
        std::uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::string read_string(std::uint32_t len) {
        std::vector<char> chars(len);
        read_raw(chars.data(), len);
        return std::string(chars.data(), len);
    }

    void write_raw(const void * ptr, size_t len) const {
        if (len == 0) {
            return;
        }
        errno = 0;
        size_t ret = std::fwrite(ptr, len, 1, fp);
        if (ret != 1) {
            throw std::runtime_error(format("write error: %s", strerror(errno)));
        }
    }

    void write_u32(std::uint32_t val) {
        write_raw(&val, sizeof(val));
    }

    ~llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
};

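llama_file is a small RAII wrapper: the constructor opens the file and records its size, reads and writes throw std::runtime_error instead of returning codes, and the destructor closes the FILE *. A hedged usage sketch (the file name is made up):

// Illustrative use of llama_file from the header removed above.
#include <cstdint>
#include <cstdio>
#include <stdexcept>

static void print_first_two_fields(const char * fname) {
    try {
        llama_file f(fname, "rb");               // opens the file and records its size
        const uint32_t magic   = f.read_u32();   // first 4 bytes
        const uint32_t version = f.read_u32();   // next 4 bytes
        std::printf("size=%zu magic=%08x version=%u\n", f.size, (unsigned) magic, (unsigned) version);
    } catch (const std::runtime_error & err) {   // open/read failures surface as exceptions
        std::fprintf(stderr, "error: %s\n", err.what());
    }
    // the FILE * is closed by ~llama_file() when f goes out of scope
}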
// llama_context_data
struct llama_data_context {
    virtual void write(const void * src, size_t size) = 0;
    virtual size_t get_size_written() = 0;
    virtual ~llama_data_context() = default;
};

struct llama_data_buffer_context : llama_data_context {
    uint8_t* ptr;
    size_t size_written = 0;

    llama_data_buffer_context(uint8_t * p) : ptr(p) {}

    void write(const void * src, size_t size) override {
        memcpy(ptr, src, size);
        ptr += size;
        size_written += size;
    }

    size_t get_size_written() override {
        return size_written;
    }
};

struct llama_data_file_context : llama_data_context {
    llama_file* file;
    size_t size_written = 0;

    llama_data_file_context(llama_file * f) : file(f) {}

    void write(const void * src, size_t size) override {
        file->write_raw(src, size);
        size_written += size;
    }

    size_t get_size_written() override {
        return size_written;
    }
};

#if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) {
    LPSTR buf;
    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
    if (!size) {
        return "FormatMessageA failed";
    }
    std::string ret(buf, size);
    LocalFree(buf);
    return ret;
}
#endif

struct llama_mmap {
    void * addr;
    size_t size;

    llama_mmap(const llama_mmap &) = delete;

#ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;

    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
        size = file->size;
        int fd = fileno(file->fp);
        int flags = MAP_SHARED;
        // prefetch/readahead impairs performance on NUMA systems
        if (numa) { prefetch = 0; }
#ifdef __linux__
        if (prefetch >= file->size) { flags |= MAP_POPULATE; }
#endif
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
        if (addr == MAP_FAILED) {
            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
        }

        if (prefetch > 0) {
            // Advise the kernel to preload the mapped memory
            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                        strerror(errno));
            }
        }
        if (numa) {
            // advise the kernel not to use readahead
            // (because the next page might not belong on the same node)
            if (madvise(addr, file->size, MADV_RANDOM)) {
                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
                        strerror(errno));
            }
        }
    }

    ~llama_mmap() {
        munmap(addr, size);
    }
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
        (void) numa;

        size = file->size;

        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
        DWORD error = GetLastError();

        if (hMapping == NULL) {
            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
        }

        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
        error = GetLastError();
        CloseHandle(hMapping);

        if (addr == NULL) {
            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
        }

        if (prefetch) {
            // The PrefetchVirtualMemory API is only present on Windows 8 and above, so we
            // will dynamically load it using GetProcAddress.
            BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
            HMODULE hKernel32;

            // This call is guaranteed to succeed.
            hKernel32 = GetModuleHandleW(L"kernel32.dll");

            // This call may fail if on a pre-Win8 system.
            pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));

            if (pPrefetchVirtualMemory) {
                // Advise the kernel to preload the mapped memory.
                WIN32_MEMORY_RANGE_ENTRY range;
                range.VirtualAddress = addr;
                range.NumberOfBytes = (SIZE_T)size;
                if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
                    fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                            llama_format_win_err(GetLastError()).c_str());
                }
            }
        }
    }

    ~llama_mmap() {
        if (!UnmapViewOfFile(addr)) {
            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
#else
    static constexpr bool SUPPORTED = false;

    llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
        (void) prefetch;
        (void) numa;

        throw std::runtime_error(std::string("mmap not supported"));
    }
#endif
};

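On POSIX systems llama_mmap maps the whole file read-only, optionally asking the kernel to prefetch it (MADV_WILLNEED) or to skip readahead on NUMA machines (MADV_RANDOM). A minimal sketch of combining the two wrappers, using the POSIX constructor signature shown above (the file name is illustrative):

// Assumes the llama_file and llama_mmap definitions above.
#include <cstdint>
#include <cstdio>
#include <stdexcept>

static void map_model_example() {
    try {
        llama_file file("model.bin", "rb");
        // prefetch everything, non-NUMA host
        llama_mmap mapping(&file, /*prefetch =*/ (size_t) -1, /*numa =*/ false);
        const uint8_t * data = (const uint8_t *) mapping.addr; // read-only view of the file
        std::printf("mapped %zu bytes, first byte = 0x%02x\n", mapping.size, data[0]);
        // munmap happens automatically when `mapping` is destroyed
    } catch (const std::runtime_error & err) {
        std::fprintf(stderr, "error: %s\n", err.what());
    }
}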
// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
struct llama_mlock {
    void * addr = NULL;
    size_t size = 0;
    bool failed_already = false;

    llama_mlock() {}
    llama_mlock(const llama_mlock &) = delete;

    ~llama_mlock() {
        if (size) {
            raw_unlock(addr, size);
        }
    }

    void init(void * ptr) {
        LLAMA_ASSERT(addr == NULL && size == 0);
        addr = ptr;
    }

    void grow_to(size_t target_size) {
        LLAMA_ASSERT(addr);
        if (failed_already) {
            return;
        }
        size_t granularity = lock_granularity();
        target_size = (target_size + granularity - 1) & ~(granularity - 1);
        if (target_size > size) {
            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
                size = target_size;
            } else {
                failed_already = true;
            }
        }
    }

#ifdef _POSIX_MEMLOCK_RANGE
    static constexpr bool SUPPORTED = true;

    size_t lock_granularity() {
        return (size_t) sysconf(_SC_PAGESIZE);
    }

    #ifdef __APPLE__
        #define MLOCK_SUGGESTION \
            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
            "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
    #else
        #define MLOCK_SUGGESTION \
            "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
    #endif

    bool raw_lock(const void * addr, size_t size) {
        if (!mlock(addr, size)) {
            return true;
        } else {
            char* errmsg = std::strerror(errno);
            bool suggest = (errno == ENOMEM);

            // Check if the resource limit is fine after all
            struct rlimit lock_limit;
            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
                suggest = false;
            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
                suggest = false;

            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
            return false;
        }
    }

    #undef MLOCK_SUGGESTION

    void raw_unlock(void * addr, size_t size) {
        if (munlock(addr, size)) {
            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
        }
    }
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

    size_t lock_granularity() {
        SYSTEM_INFO si;
        GetSystemInfo(&si);
        return (size_t) si.dwPageSize;
    }

    bool raw_lock(void * ptr, size_t len) {
        for (int tries = 1; ; tries++) {
            if (VirtualLock(ptr, len)) {
                return true;
            }
            if (tries == 2) {
                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
                        len, size, llama_format_win_err(GetLastError()).c_str());
                return false;
            }

            // It failed but this was only the first try; increase the working
            // set size and try again.
            SIZE_T min_ws_size, max_ws_size;
            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
                return false;
            }
            // Per MSDN: "The maximum number of pages that a process can lock
            // is equal to the number of pages in its minimum working set minus
            // a small overhead."
            // Hopefully a megabyte is enough overhead:
            size_t increment = len + 1048576;
            // The minimum must be <= the maximum, so we need to increase both:
            min_ws_size += increment;
            max_ws_size += increment;
            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
                return false;
            }
        }
    }

    void raw_unlock(void * ptr, size_t len) {
        if (!VirtualUnlock(ptr, len)) {
            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
#else
    static constexpr bool SUPPORTED = false;

    size_t lock_granularity() {
        return (size_t) 65536;
    }

    bool raw_lock(const void * addr, size_t len) {
        fprintf(stderr, "warning: mlock not supported on this system\n");
        return false;
    }

    void raw_unlock(const void * addr, size_t len) {}
#endif
};
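grow_to() rounds the requested size up to the lock granularity with (target_size + granularity - 1) & ~(granularity - 1) — valid because the page size is a power of two — and then locks only the newly added tail. A small worked example of that rounding (the values are only an example):

#include <cstddef>
#include <cstdio>

// Round n up to a multiple of granularity (granularity must be a power of two),
// exactly as llama_mlock::grow_to() does before calling mlock/VirtualLock.
static size_t round_up_to_granularity(size_t n, size_t granularity) {
    return (n + granularity - 1) & ~(granularity - 1);
}

int main() {
    const size_t page = 4096;                                     // e.g. sysconf(_SC_PAGESIZE)
    std::printf("%zu\n", round_up_to_granularity(1, page));       // 4096
    std::printf("%zu\n", round_up_to_granularity(4096, page));    // 4096
    std::printf("%zu\n", round_up_to_granularity(4097, page));    // 8192
    return 0;
}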

// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
struct llama_buffer {
    uint8_t * addr = NULL;
    size_t size = 0;

    llama_buffer() = default;

    void resize(size_t len) {
#ifdef GGML_USE_METAL
        free(addr);
        int result = posix_memalign((void **) &addr, getpagesize(), len);
        if (result == 0) {
            memset(addr, 0, len);
        }
        else {
            addr = NULL;
        }
#else
        delete[] addr;
        addr = new uint8_t[len];
#endif
        size = len;
    }

    ~llama_buffer() {
#ifdef GGML_USE_METAL
        free(addr);
#else
        delete[] addr;
#endif
        addr = NULL;
    }

    // disable copy and move
    llama_buffer(const llama_buffer&) = delete;
    llama_buffer(llama_buffer&&) = delete;
    llama_buffer& operator=(const llama_buffer&) = delete;
    llama_buffer& operator=(llama_buffer&&) = delete;
};

#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
    uint8_t * addr = NULL;
    bool is_cuda;
    size_t size = 0;

    llama_ctx_buffer() = default;

    void resize(size_t size) {
        free();

        addr = (uint8_t *) ggml_cuda_host_malloc(size);
        if (addr) {
            is_cuda = true;
        }
        else {
            // fall back to pageable memory
            addr = new uint8_t[size];
            is_cuda = false;
        }
        this->size = size;
    }

    void free() {
        if (addr) {
            if (is_cuda) {
                ggml_cuda_host_free(addr);
            }
            else {
                delete[] addr;
            }
        }
        addr = NULL;
    }

    ~llama_ctx_buffer() {
        free();
    }

    // disable copy and move
    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
};
#else
typedef llama_buffer llama_ctx_buffer;
#endif

#endif
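With cuBLAS enabled, llama_ctx_buffer first tries pinned (page-locked) host memory via ggml_cuda_host_malloc and silently falls back to a plain heap allocation; without cuBLAS it is just an alias for llama_buffer. A short usage sketch under those assumptions:

// Assumes the llama_ctx_buffer definition above (or the llama_buffer alias).
static void scratch_buffer_example() {
    llama_ctx_buffer buf;
    buf.resize(16 * 1024 * 1024);   // 16 MiB scratch; pinned if the CUDA host alloc succeeded
    buf.addr[0] = 0;                // usable like a raw byte array
    buf.resize(32 * 1024 * 1024);   // resize frees the previous allocation first
    // memory is released in the destructor; copy and move are deliberately deleted
}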
61 llama.h
@@ -34,29 +34,18 @@
 # define DEPRECATED(func, hint) func
 #endif

-#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
-#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
+
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

-#define LLAMA_FILE_VERSION 3
-#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1

-#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
-
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif

-#ifndef LLAMA_DEFAULT_RMS_EPS
-#define LLAMA_DEFAULT_RMS_EPS 5e-6f
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -103,8 +92,6 @@ extern "C" {
         uint32_t seed; // RNG seed, -1 for random
         int32_t n_ctx; // text context
         int32_t n_batch; // prompt processing batch size
-        int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
-        float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu; // the GPU that is used for scratch and small tensors
@@ -129,6 +116,7 @@ extern "C" {
         bool use_mlock; // force system to keep model in RAM
         bool embedding; // embedding mode only
     };

     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
@@ -208,17 +196,12 @@ extern "C" {
         int32_t n_eval;
     };

-    // Set callback for all future logging events.
-    // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
-
-    LLAMA_API int llama_max_devices();
-
-    LLAMA_API struct llama_context_params llama_context_default_params();
-    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
-
-    LLAMA_API bool llama_mmap_supported();
-    LLAMA_API bool llama_mlock_supported();
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
+
+    LLAMA_API int llama_max_devices(void);
+    LLAMA_API bool llama_mmap_supported(void);
+    LLAMA_API bool llama_mlock_supported(void);

     // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
@@ -226,9 +209,9 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(bool numa);
     // Call once at the end of the program - currently only used for MPI
-    LLAMA_API void llama_backend_free();
+    LLAMA_API void llama_backend_free(void);

-    LLAMA_API int64_t llama_time_us();
+    LLAMA_API int64_t llama_time_us(void);

     LLAMA_API struct llama_model * llama_load_model_from_file(
             const char * path_model,
@@ -240,13 +223,6 @@ extern "C" {
             struct llama_model * model,
             struct llama_context_params params);

-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
-            const char * path_model,
-            struct llama_context_params params),
-            "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
-
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
@@ -384,27 +360,28 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

     // Token Id -> String. Uses the vocabulary in the provided context
+    // Does not write null terminator to the buffer
     LLAMA_API int llama_token_to_str(
             const struct llama_context * ctx,
             llama_token token,
-            char * str,
+            char * buf,
             int length);

     LLAMA_API int llama_token_to_str_bpe(
             const struct llama_context * ctx,
             llama_token token,
-            char * str,
+            char * buf,
             int length);

     LLAMA_API int llama_token_to_str_with_model(
             const struct llama_model * model,
             llama_token token,
-            char * str,
+            char * buf,
             int length);
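The added comment is the behavioural point here: llama_token_to_str() copies at most length bytes into buf and does not append a terminating NUL, so callers must use the returned count rather than strlen(). A hedged sketch — treating a non-positive return as "nothing usable" is my assumption, not something this header guarantees:

// Sketch only: ctx and token come from the usual llama.cpp setup calls.
#include <string>
#include <vector>

static std::string token_to_string(const struct llama_context * ctx, llama_token token) {
    std::vector<char> buf(64);                                     // scratch; the API does not NUL-terminate it
    const int n = llama_token_to_str(ctx, token, buf.data(), (int) buf.size());
    if (n <= 0) {
        return std::string();                                      // error / nothing written (assumed convention)
    }
    return std::string(buf.data(), n);                             // length-bounded copy, no strlen()
}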
     // Special tokens
-    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_bos(void); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
+    LLAMA_API llama_token llama_token_eos(void); // end-of-sentence
-    LLAMA_API llama_token llama_token_nl(); // next-line
+    LLAMA_API llama_token llama_token_nl(void); // next-line

     // Grammar
     //

@@ -484,6 +461,10 @@ extern "C" {
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);

+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
 #ifdef __cplusplus
 }
 #endif